Adding upstream version 18.2.2.upstream/18.2.2

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit: e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree: 64f88b554b444a49f656b6c656111a145cbbaa28 /src/tools/cephfs
parent: Initial commit. (diff)
download: ceph-upstream/18.2.2.tar.xz
ceph-upstream/18.2.2.zip
39 files changed, 11958 insertions, 0 deletions
diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt
new file mode 100644
index 000000000..5d40f8ffb
--- /dev/null
+++ b/src/tools/cephfs/CMakeLists.txt
@@ -0,0 +1,58 @@
+# Build rules for the CephFS offline tools in this directory.
+
+# cephfs-journal-tool executable
+set(cephfs_journal_tool_srcs
+  cephfs-journal-tool.cc
+  JournalTool.cc
+  JournalFilter.cc
+  JournalScanner.cc
+  EventOutput.cc
+  Dumper.cc
+  Resetter.cc
+  RoleSelector.cc
+  MDSUtility.cc)
+add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs})
+target_link_libraries(cephfs-journal-tool librados mds osdc global
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# cephfs-meta-injection executable.
+# NOTE(review): this target is built but is not listed in the install()
+# rule below -- confirm whether that is intentional.
+set(cephfs-meta-injection_srcs
+  cephfs-meta-injection.cc
+  MetaTool.cc
+  RoleSelector.cc
+  MDSUtility.cc)
+add_executable(cephfs-meta-injection ${cephfs-meta-injection_srcs})
+target_link_libraries(cephfs-meta-injection librados mds osdc global
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# cephfs-table-tool executable
+set(cephfs_table_tool_srcs
+  cephfs-table-tool.cc
+  TableTool.cc
+  RoleSelector.cc
+  MDSUtility.cc)
+add_executable(cephfs-table-tool ${cephfs_table_tool_srcs})
+target_link_libraries(cephfs-table-tool librados mds osdc global
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# cephfs-data-scan executable; unlike the tools above it additionally links
+# against cephfs and cls_cephfs_client.
+set(cephfs_data_scan_srcs
+  cephfs-data-scan.cc
+  DataScan.cc
+  RoleSelector.cc
+  PgFiles.cc
+  MDSUtility.cc)
+add_executable(cephfs-data-scan ${cephfs_data_scan_srcs})
+target_link_libraries(cephfs-data-scan librados cephfs mds osdc global
+  cls_cephfs_client
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# Executables that are installed into bin
+install(TARGETS
+  cephfs-journal-tool
+  cephfs-table-tool
+  cephfs-data-scan
+  DESTINATION bin)
+
+# Optional subdirectory: cephfs-shell (off by default)
+option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF)
+if(WITH_CEPHFS_SHELL)
+  add_subdirectory(shell)
+endif()
+
+# Optional subdirectory: cephfs-top (on by default)
+option(WITH_CEPHFS_TOP "install cephfs-top utility" ON)
+if(WITH_CEPHFS_TOP)
+  add_subdirectory(top)
+endif()
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
new file mode 100644
index 000000000..0ba56c515
--- /dev/null
+++ b/src/tools/cephfs/DataScan.cc
@@ -0,0 +1,2404 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include <fstream>
+#include "include/util.h"
+#include "include/ceph_fs.h"
+
+#include "mds/CDentry.h"
+#include "mds/CInode.h"
+#include "mds/CDentry.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+
+#include "PgFiles.h"
+#include "DataScan.h"
+#include "include/compat.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "datascan." << __func__ << ": "
+
+using namespace std;
+
+/**
+ * Print the cephfs-data-scan command synopsis and option summary to
+ * stdout, followed by the generic client usage text.
+ */
+void DataScan::usage()
+{
+  std::cout << "Usage: \n"
+    << "  cephfs-data-scan init [--force-init]\n"
+    << "  cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] [<data pool name> [<extra data pool name> ...]]\n"
+    << "  cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] [<data pool name>]\n"
+    << "  cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
+    << "  cephfs-data-scan scan_links\n"
+    << "\n"
+    << "    --force-corrupt: overwrite apparently corrupt structures\n"
+    << "    --force-init: write root inodes even if they exist\n"
+    << "    --force-pool: use data pool even if it is not in FSMap\n"
+    << "    --worker_m: Maximum number of workers\n"
+    << "    --worker_n: Worker number, range 0-(worker_m-1)\n"
+    << "\n"
+    << "  cephfs-data-scan scan_frags [--force-corrupt]\n"
+    << "  cephfs-data-scan cleanup [<data pool name>]\n"
+    << std::endl;
+
+  generic_client_usage();
+}
+
+/**
+ * Try to consume a "--key value" pair starting at position i.
+ *
+ * @param args  full argument vector
+ * @param i     iterator at the candidate key; the value is read from i + 1.
+ *              The iterator itself is not advanced here: the caller skips
+ *              the value when true is returned.
+ * @param r     set to a negative errno when the key was recognised but its
+ *              value was rejected; left untouched otherwise
+ * @return true if a known key and a valid value were consumed; false if the
+ *         key is unknown, there is no following value, or the value was
+ *         rejected (in which case *r is set)
+ */
+bool DataScan::parse_kwarg(
+    const std::vector<const char*> &args,
+    std::vector<const char *>::const_iterator &i,
+    int *r)
+{
+  // A keyword argument always needs a value after it
+  if (i + 1 == args.end()) {
+    return false;
+  }
+
+  const std::string arg(*i);
+  const std::string val(*(i + 1));
+
+  if (arg == "--output-dir") {
+    if (driver != nullptr) {
+      derr << "Unexpected --output-dir: output already selected!" << dendl;
+      *r = -EINVAL;
+      return false;
+    }
+    dout(4) << "Using local file output to '" << val << "'" << dendl;
+    driver = new LocalFileDriver(val, data_io);
+    return true;
+  } else if (arg == "--worker_n") {
+    std::string err;
+    const long long parsed = strict_strtoll(val.c_str(), 10, &err);
+    // Worker numbers start at 0; a negative value must not be stored
+    if (!err.empty() || parsed < 0) {
+      std::cerr << "Invalid worker number '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    n = parsed;
+    return true;
+  } else if (arg == "--worker_m") {
+    std::string err;
+    const long long parsed = strict_strtoll(val.c_str(), 10, &err);
+    // A worker count can never be negative
+    if (!err.empty() || parsed < 0) {
+      std::cerr << "Invalid worker count '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    m = parsed;
+    return true;
+  } else if (arg == "--filter-tag") {
+    filter_tag = val;
+    dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
+    return true;
+  } else if (arg == "--filesystem") {
+    std::shared_ptr<const Filesystem> fs;
+    *r = fsmap->parse_filesystem(val, &fs);
+    if (*r != 0) {
+      std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
+      return false;
+    }
+    fscid = fs->fscid;
+    return true;
+  } else if (arg == "--alternate-pool") {
+    metadata_pool_name = val;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+/**
+ * Try to consume a single value-less flag at position i.
+ *
+ * Recognised flags set the matching member (force_pool, force_corrupt,
+ * force_init).
+ *
+ * @return true if the argument was a recognised flag, false otherwise
+ */
+bool DataScan::parse_arg(
+    const std::vector<const char*> &args,
+    std::vector<const char *>::const_iterator &i)
+{
+  const std::string flag(*i);
+
+  if (flag == "--force-pool") {
+    force_pool = true;
+  } else if (flag == "--force-corrupt") {
+    force_corrupt = true;
+  } else if (flag == "--force-init") {
+    force_init = true;
+  } else {
+    // Not one of ours
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Entry point of the tool: parse the command line, connect to RADOS,
+ * open the pools the selected command needs, then dispatch the command.
+ *
+ * @param args  args[0] is the command name; the remaining entries are
+ *              "--key value" pairs, flags and trailing positional arguments
+ * @return 0 on success, negative errno on failure
+ */
+int DataScan::main(const std::vector<const char*> &args)
+{
+  // Parse args
+  // ==========
+  if (args.size() < 1) {
+    cerr << "missing position argument" << std::endl;
+    return -EINVAL;
+  }
+
+  // Common RADOS init: open metadata pool
+  // =====================================
+  librados::Rados rados;
+  int r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable" << dendl;
+    return r;
+  }
+
+  std::string const &command = args[0];
+  std::string data_pool_name;
+  std::set<std::string> extra_data_pool_names;
+
+  std::string pg_files_path;
+  std::set<pg_t> pg_files_pgs;
+
+  // Consume any known --key val or --flag arguments
+  for (std::vector<const char *>::const_iterator i = args.begin() + 1;
+       i != args.end(); ++i) {
+    if (parse_kwarg(args, i, &r)) {
+      // Skip the kwarg value field
+      ++i;
+      continue;
+    } else if (r) {
+      // Key was recognised but its value was rejected
+      return r;
+    }
+
+    if (parse_arg(args, i)) {
+      continue;
+    }
+
+    // Trailing positional arguments: the first names the main data pool,
+    // any further (distinct) names are extra data pools
+    if (command == "scan_extents") {
+      if (data_pool_name.empty()) {
+        data_pool_name = *i;
+      } else if (*i != data_pool_name) {
+        extra_data_pool_names.insert(*i);
+      }
+      continue;
+    }
+
+    // Trailing positional argument
+    if (i + 1 == args.end() &&
+        (command == "scan_inodes"
+         || command == "cleanup")) {
+      data_pool_name = *i;
+      continue;
+    }
+
+    if (command == "pg_files") {
+      if (i == args.begin() + 1) {
+        // First argument after the command is the path
+        pg_files_path = *i;
+        continue;
+      } else {
+        pg_t pg;
+        bool parsed = pg.parse(*i);
+        if (!parsed) {
+          std::cerr << "Invalid PG '" << *i << "'" << std::endl;
+          return -EINVAL;
+        } else {
+          pg_files_pgs.insert(pg);
+          continue;
+        }
+      }
+
+    }
+
+    // Fall through: unhandled
+    std::cerr << "Unknown argument '" << *i << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // If caller didn't specify a namespace, try to pick
+  // one if only one exists
+  if (fscid == FS_CLUSTER_ID_NONE) {
+    if (fsmap->filesystem_count() == 1) {
+      fscid = fsmap->get_filesystem()->fscid;
+    } else {
+      std::cerr << "Specify a filesystem with --filesystem" << std::endl;
+      return -EINVAL;
+    }
+  }
+  auto fs =  fsmap->get_filesystem(fscid);
+  ceph_assert(fs != nullptr);
+
+  // Default to output to metadata pool
+  if (driver == NULL) {
+    driver = new MetadataDriver();
+    driver->set_force_corrupt(force_corrupt);
+    driver->set_force_init(force_init);
+    dout(4) << "Using metadata pool output" << dendl;
+  }
+
+  dout(4) << "connecting to RADOS..." << dendl;
+  r = rados.connect();
+  if (r < 0) {
+    std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  r = driver->init(rados, metadata_pool_name, fsmap, fscid);
+  if (r < 0) {
+    return r;
+  }
+
+  if (command == "pg_files") {
+    auto pge = PgFiles(objecter, pg_files_pgs);
+    pge.init();
+    return pge.scan_path(pg_files_path);
+  }
+
+  bool autodetect_data_pools = false;
+
+  // Initialize data_io for those commands that need it
+  if (command == "scan_inodes" ||
+      command == "scan_extents" ||
+      command == "cleanup") {
+    data_pool_id = fs->mds_map.get_first_data_pool();
+
+    std::string pool_name;
+    r = rados.pool_reverse_lookup(data_pool_id, &pool_name);
+    if (r < 0) {
+      std::cerr << "Failed to resolve data pool: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+
+    if (data_pool_name.empty()) {
+      // No pool given on the command line: use the filesystem's first
+      // data pool (and, for scan_extents, all its other data pools)
+      autodetect_data_pools = true;
+      data_pool_name = pool_name;
+    } else if (data_pool_name != pool_name) {
+      std::cerr << "Warning: pool '" << data_pool_name << "' is not the "
+        "main CephFS data pool!" << std::endl;
+      if (!force_pool) {
+        std::cerr << "Use --force-pool to continue" << std::endl;
+        return -EINVAL;
+      }
+
+      data_pool_id = rados.pool_lookup(data_pool_name.c_str());
+      if (data_pool_id < 0) {
+        std::cerr << "Data pool '" << data_pool_name << "' not found!"
+                  << std::endl;
+        return -ENOENT;
+      }
+    }
+
+    dout(4) << "data pool '" << data_pool_name << "' has ID " << data_pool_id
+            << dendl;
+
+    dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
+    r = rados.ioctx_create(data_pool_name.c_str(), data_io);
+    if (r != 0) {
+      return r;
+    }
+  }
+
+  // Initialize extra data_ios for those commands that need it
+  if (command == "scan_extents") {
+    if (autodetect_data_pools) {
+      ceph_assert(extra_data_pool_names.empty());
+
+      for (auto &pool_id : fs->mds_map.get_data_pools()) {
+        if (pool_id == data_pool_id) {
+          continue;
+        }
+
+        std::string pool_name;
+        r = rados.pool_reverse_lookup(pool_id, &pool_name);
+        if (r < 0) {
+          std::cerr << "Failed to resolve data pool: " << cpp_strerror(r)
+                    << std::endl;
+          return r;
+        }
+        extra_data_pool_names.insert(pool_name);
+      }
+    }
+
+    for (auto &extra_pool_name : extra_data_pool_names) {
+      int64_t pool_id = rados.pool_lookup(extra_pool_name.c_str());
+      // Check the ID that was just looked up, not the main pool's ID
+      if (pool_id < 0) {
+        std::cerr << "Data pool '" << extra_pool_name << "' not found!" << std::endl;
+        return -ENOENT;
+      } else {
+        dout(4) << "data pool '" << extra_pool_name << "' has ID " << pool_id
+                << dendl;
+      }
+
+      if (!fs->mds_map.is_data_pool(pool_id)) {
+        std::cerr << "Warning: pool '" << extra_pool_name << "' is not a "
+          "CephFS data pool!" << std::endl;
+        if (!force_pool) {
+          std::cerr << "Use --force-pool to continue" << std::endl;
+          return -EINVAL;
+        }
+      }
+
+      dout(4) << "opening data pool '" << extra_pool_name << "'" << dendl;
+      extra_data_ios.push_back({});
+      r = rados.ioctx_create(extra_pool_name.c_str(), extra_data_ios.back());
+      if (r != 0) {
+        return r;
+      }
+    }
+  }
+
+  // Initialize metadata_io from MDSMap for scan_frags
+  if (command == "scan_frags" || command == "scan_links") {
+    const auto scan_fs = fsmap->get_filesystem(fscid);
+    if (scan_fs == nullptr) {
+      std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
+      return -ENOENT;
+    }
+    int64_t const metadata_pool_id = scan_fs->mds_map.get_metadata_pool();
+
+    dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+    r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+    if (r < 0) {
+      std::cerr << "Pool " << metadata_pool_id
+        << " identified in MDS map not found in RADOS!" << std::endl;
+      return r;
+    }
+
+    r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+    if (r != 0) {
+      return r;
+    }
+
+    data_pools = scan_fs->mds_map.get_data_pools();
+  }
+
+  // Finally, dispatch command
+  if (command == "scan_inodes") {
+    return scan_inodes();
+  } else if (command == "scan_extents") {
+    return scan_extents();
+  } else if (command == "scan_frags") {
+    return scan_frags();
+  } else if (command == "scan_links") {
+    return scan_links();
+  } else if (command == "cleanup") {
+    return cleanup();
+  } else if (command == "init") {
+    return driver->init_roots(fs->mds_map.get_first_data_pool());
+  } else {
+    std::cerr << "Unknown command '" << command << "'" << std::endl;
+    return -EINVAL;
+  }
+}
+
+/**
+ * Write a freshly composed ".inode" object for the given inode number
+ * into the metadata pool.
+ *
+ * If the object already exists it is left alone unless force_init is set.
+ *
+ * @param inono         inode number to create
+ * @param mode          mode bits, OR-ed with 0500
+ * @param data_pool_id  pool ID stored in the inode's layout
+ * @return 0 on success or when creation was skipped, otherwise the error
+ *         returned by the existence check or by the write
+ */
+int MetadataDriver::inject_unlinked_inode(
+    inodeno_t inono, int mode, int64_t data_pool_id)
+{
+  const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
+
+  // Skip if exists
+  bool present = false;
+  const int exists_r = root_exists(inono, &present);
+  if (exists_r != 0) {
+    return exists_r;
+  }
+  if (present && !force_init) {
+    std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
+               " exists, skipping create.  Use --force-init to overwrite"
+               " the existing object." << std::endl;
+    return 0;
+  }
+
+  // Compose
+  InodeStore store;
+  auto in = store.get_inode();
+  in->ino = inono;
+  in->version = 1;
+  in->xattr_version = 1;
+  in->mode = 0500 | mode;
+  in->nlink = 1;
+  in->uid = g_conf()->mds_root_ino_uid;
+  in->gid = g_conf()->mds_root_ino_gid;
+
+  // Pretend there is one file so the directory doesn't look empty
+  // (this is not the *correct* dirstat)
+  in->dirstat.nfiles = 1;
+
+  const auto now = ceph_clock_now();
+  in->mtime = now;
+  in->ctime = now;
+
+  in->truncate_size = -1ull;
+  in->truncate_seq = 1;
+
+  // Force layout to default: should we let users override this so that
+  // they don't have to mount the filesystem to correct it?
+  in->layout = file_layout_t::get_default();
+  in->layout.pool_id = data_pool_id;
+  in->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+  // Our stats are expected to be wrong, and existing dirfrags may be
+  // missing from the fragtree: flag all of that as damaged
+  store.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
+
+  // The root and MDS directories also get an initial snap blob
+  if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
+    sr_t srnode;
+    srnode.seq = 1;
+    encode(srnode, store.snap_blob);
+  }
+
+  // Serialize: on-disk magic first, then the inode itself
+  bufferlist payload;
+  encode(std::string(CEPH_FS_ONDISK_MAGIC), payload);
+  store.encode(payload, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+  // Write
+  const int write_r = metadata_io.write_full(oid.name, payload);
+  if (write_r != 0) {
+    derr << "Error writing '" << oid.name << "': " << cpp_strerror(write_r) << dendl;
+  }
+
+  return write_r;
+}
+
+/**
+ * Check whether the ".inode" object for the given inode exists in the
+ * metadata pool.
+ *
+ * @param ino     inode number to look for
+ * @param result  set to true if the object was stat'ed successfully,
+ *                false if stat reported -ENOENT; untouched on other errors
+ * @return 0 if *result was set, otherwise the negative error from stat
+ */
+int MetadataDriver::root_exists(inodeno_t ino, bool *result)
+{
+  object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+
+  // Only the return code is of interest; size and mtime are discarded
+  uint64_t ignored_size;
+  time_t ignored_mtime;
+  const int stat_r = metadata_io.stat(oid.name, &ignored_size, &ignored_mtime);
+
+  const bool missing = (stat_r == -ENOENT);
+  if (!missing && stat_r < 0) {
+    return stat_r;
+  }
+
+  *result = !missing;
+  return 0;
+}
+
+/**
+ * Create the root inode, the inode of MDS directory 0, and the root
+ * dirfrag of MDS directory 0.
+ *
+ * @param data_pool_id  pool ID passed through to the injected inodes
+ * @return 0 on success, otherwise the first non-zero result encountered
+ */
+int MetadataDriver::init_roots(int64_t data_pool_id)
+{
+  int r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id);
+  if (r != 0) {
+    return r;
+  }
+
+  r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
+  if (r != 0) {
+    return r;
+  }
+
+  // Whether the dirfrag was newly created is of no interest here
+  bool created = false;
+  return find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
+}
+
+/**
+ * Check that both the root inode and the inode of MDS directory 0 exist.
+ *
+ * @param result  set to true only if both objects exist
+ * @return 0 if the check completed, otherwise the error from root_exists
+ */
+int MetadataDriver::check_roots(bool *result)
+{
+  const int r = root_exists(CEPH_INO_ROOT, result);
+  if (r != 0 || !*result) {
+    // Either the lookup failed, or the root inode is already known
+    // to be missing and there is no need to look any further
+    return r;
+  }
+
+  return root_exists(MDS_INO_MDSDIR(0), result);
+}
+
+/**
+ * Stages:
+ *
+ * SERIAL init
+ *  0. Create root inodes if they don't exist
+ * PARALLEL scan_extents
+ *  1. Size and mtime recovery: scan ALL objects, and update 0th
+ *   objects with max size and max mtime seen.
+ * PARALLEL scan_inodes
+ *  2. Inode recovery: scan ONLY 0th objects, and inject metadata
+ *   into dirfrag OMAPs, creating blank dirfrags as needed.  No stats
+ *   or rstats at this stage.  Inodes without backtraces go into
+ *   lost+found
+ * TODO: SERIAL "recover stats"
+ *  3. Dirfrag statistics: depth first traverse into metadata tree,
+ *    rebuilding dir sizes.
+ * TODO PARALLEL "clean up"
+ *  4. Cleanup; go over all 0th objects (and dirfrags if we tagged
+ *   anything onto them) and remove any of the xattrs that we
+ *   used for accumulating.
+ */
+
+
+/**
+ * Split an object name of the form "<hex>.<hex>" into its two numbers.
+ *
+ * @param oid       object name to parse
+ * @param inode_no  receives the value parsed (base 16) from the text
+ *                  before the first '.'
+ * @param obj_id    receives the value parsed (base 16) from the text
+ *                  after the first '.'
+ * @return 0 on success, -EINVAL if there is no '.', nothing follows it,
+ *         or either part fails to parse
+ */
+int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
+{
+  const std::string::size_type dot = oid.find(".");
+  if (dot == std::string::npos || dot == oid.size() - 1) {
+    return -EINVAL;
+  }
+
+  const std::string ino_part = oid.substr(0, dot);
+  const std::string idx_part = oid.substr(dot + 1);
+
+  std::string err;
+  *inode_no = strict_strtoll(ino_part.c_str(), 16, &err);
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  *obj_id = strict_strtoll(idx_part.c_str(), 16, &err);
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+
+/**
+ * Visit every object in the main data pool and in each extra data pool,
+ * and accumulate its size, mtime and pool into the inode metadata kept
+ * via the main data pool (data_io).
+ *
+ * @return 0 on success, or the first negative result returned by
+ *         forall_objects for one of the pools
+ */
+int DataScan::scan_extents()
+{
+  // Main data pool first, then the extras
+  std::vector<librados::IoCtx *> pools;
+  pools.push_back(&data_io);
+  for (auto &extra : extra_data_ios) {
+    pools.push_back(&extra);
+  }
+
+  for (auto pool : pools) {
+    auto accumulate = [this, pool](
+        std::string const &oid,
+        uint64_t obj_name_ino,
+        uint64_t obj_name_offset) -> int
+    {
+      // Read size
+      uint64_t size;
+      time_t mtime;
+      int r = pool->stat(oid, &size, &mtime);
+      dout(10) << "handling object " << obj_name_ino
+               << "." << obj_name_offset << dendl;
+      if (r != 0) {
+        dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
+        return r;
+      }
+
+      // Objects in the main data pool are recorded with pool id -1
+      int64_t obj_pool_id = -1;
+      if (data_io.get_id() != pool->get_id()) {
+        obj_pool_id = pool->get_id();
+      }
+
+      // What needs tracking per inode:
+      //  * The highest object ID seen
+      //  * The size of the highest object ID seen
+      //  * The largest object seen
+      //  * The pool of the objects seen (if it is not the main data pool)
+      //
+      //  From those, the object chunking size, the offset of the last
+      //  object (chunk size * highest ID seen), the actual size (offset
+      //  of last object + size of highest ID seen) and the layout pool id
+      //  can be inferred later.
+      //
+      //  This logic doesn't take account of striping.
+      r = ClsCephFSClient::accumulate_inode_metadata(
+          data_io,
+          obj_name_ino,
+          obj_name_offset,
+          size,
+          obj_pool_id,
+          mtime);
+      if (r < 0) {
+        derr << "Failed to accumulate metadata data from '"
+             << oid << "': " << cpp_strerror(r) << dendl;
+      }
+
+      return r;
+    };
+
+    const int pool_r = forall_objects(*pool, false, accumulate);
+    if (pool_r < 0) {
+      return pool_r;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Probe whether a filtered object listing works on the given pool by
+ * issuing a one-object listing with a throwaway tag filter ("test").
+ *
+ * @param ioctx  pool to probe
+ * @return 1 if the filtered listing succeeded, 0 if it returned an error
+ */
+int DataScan::probe_filter(librados::IoCtx &ioctx)
+{
+  bufferlist filter_bl;
+  ClsCephFSClient::build_tag_filter("test", &filter_bl);
+
+  // The listing output is discarded; only the return code matters
+  std::vector<librados::ObjectItem> tmp_result;
+  librados::ObjectCursor tmp_next;
+  int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
+                            1, filter_bl, &tmp_result, &tmp_next);
+
+  return r >= 0;
+}
+
+/**
+ * Call handler for every object in this worker's slice (n of m) of the
+ * given pool whose name parses as "<ino hex>.<index hex>".
+ *
+ * A handler failure does not stop the scan: the first negative handler
+ * result is remembered and returned once the whole slice has been
+ * visited.  A listing error aborts immediately.
+ *
+ * @param ioctx          pool to iterate over
+ * @param untagged_only  if true, only pass on 0th objects that do not
+ *                       carry filter_tag in their "scrub_tag" xattr; the
+ *                       filtering is done by the OSD when probe_filter
+ *                       succeeds, and locally otherwise
+ * @param handler        called as handler(oid, ino, index)
+ * @return 0 on success, a negative listing error, or the first negative
+ *         value returned by handler
+ */
+int DataScan::forall_objects(
+    librados::IoCtx &ioctx,
+    bool untagged_only,
+    std::function<int(std::string, uint64_t, uint64_t)> handler
+    )
+{
+  librados::ObjectCursor range_i;
+  librados::ObjectCursor range_end;
+  ioctx.object_list_slice(
+      ioctx.object_list_begin(),
+      ioctx.object_list_end(),
+      n,
+      m,
+      &range_i,
+      &range_end);
+
+
+  bufferlist filter_bl;
+
+  bool legacy_filtering = false;
+  if (untagged_only) {
+    // probe to deal with older OSDs that don't support
+    // the cephfs pgls filtering mode
+    legacy_filtering = !probe_filter(ioctx);
+    if (!legacy_filtering) {
+      ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
+    }
+  }
+
+  // First handler error seen.  This is kept separate from the per-call
+  // result codes below so that it is not lost when they are overwritten.
+  int r = 0;
+  while(range_i < range_end) {
+    std::vector<librados::ObjectItem> result;
+    const int list_r = ioctx.object_list(range_i, range_end, 1,
+                                         filter_bl, &result, &range_i);
+    if (list_r < 0) {
+      derr << "Unexpected error listing objects: " << cpp_strerror(list_r) << dendl;
+      return list_r;
+    }
+
+    for (const auto &i : result) {
+      const std::string &oid = i.oid;
+      uint64_t obj_name_ino = 0;
+      uint64_t obj_name_offset = 0;
+      const int parse_r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+      if (parse_r != 0) {
+        dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+        continue;
+      }
+
+      if (untagged_only && legacy_filtering) {
+        dout(20) << "Applying filter to " << oid << dendl;
+
+        // We are only interested in 0th objects during this phase: we touched
+        // the other objects during scan_extents
+        if (obj_name_offset != 0) {
+          dout(20) << "Non-zeroth object" << dendl;
+          continue;
+        }
+
+        bufferlist scrub_tag_bl;
+        const int xattr_r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
+        if (xattr_r >= 0) {
+          std::string read_tag;
+          auto q = scrub_tag_bl.cbegin();
+          try {
+            decode(read_tag, q);
+            if (read_tag == filter_tag) {
+              dout(20) << "skipping " << oid << " because it has the filter_tag"
+                       << dendl;
+              continue;
+            }
+          } catch (const buffer::error &err) {
+            // An undecodable tag is treated like a non-matching one
+          }
+          dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
+        } else {
+          dout(20) << "no tag read (" << xattr_r << ")" << dendl;
+        }
+
+      } else if (untagged_only) {
+        ceph_assert(obj_name_offset == 0);
+        dout(20) << "OSD matched oid " << oid << dendl;
+      }
+
+      // Keep going after a failure, but remember the first error
+      const int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
+      if (r == 0 && this_oid_r < 0) {
+        r = this_oid_r;
+      }
+    }
+  }
+
+  return r;
+}
+
+int DataScan::scan_inodes()
+{
+  bool roots_present;
+  int r = driver->check_roots(&roots_present);
+  if (r != 0) {
+    derr << "Unexpected error checking roots: '"
+      << cpp_strerror(r) << "'" << dendl;
+    return r;
+  }
+
+  if (!roots_present) {
+    std::cerr << "Some or all system inodes are absent.  Run 'init' from "
+      "one node before running 'scan_inodes'" << std::endl;
+    return -EIO;
+  }
+
+  return forall_objects(data_io, true, [this](
+        std::string const &oid,
+        uint64_t obj_name_ino,
+        uint64_t obj_name_offset) -> int
+  {
+    int r = 0;
+
+    dout(10) << "handling object "
+	     << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
+	     << dendl;
+
+    AccumulateResult accum_res;
+    inode_backtrace_t backtrace;
+    file_layout_t loaded_layout = file_layout_t::get_default();
+    std::string symlink;
+    r = ClsCephFSClient::fetch_inode_accumulate_result(
+        data_io, oid, &backtrace, &loaded_layout, &symlink, &accum_res);
+
+    if (r == -EINVAL) {
+      dout(4) << "Accumulated metadata missing from '"
+              << oid << ", did you run scan_extents?" << dendl;
+      return r;
+    } else if (r < 0) {
+      dout(4) << "Unexpected error loading accumulated metadata from '"
+              << oid << "': " << cpp_strerror(r) << dendl;
+      // FIXME: this creates situation where if a client has a corrupt
+      // backtrace/layout, we will fail to inject it.  We should (optionally)
+      // proceed if the backtrace/layout is corrupt but we have valid
+      // accumulated metadata.
+      return r;
+    }
+
+    const time_t file_mtime = accum_res.max_mtime;
+    uint64_t file_size = 0;
+    bool have_backtrace = !(backtrace.ancestors.empty());
+
+    // This is the layout we will use for injection, populated either
+    // from loaded_layout or from best guesses
+    file_layout_t guessed_layout;
+    if (accum_res.obj_pool_id == -1) {
+      guessed_layout.pool_id = data_pool_id;
+    } else {
+      guessed_layout.pool_id = accum_res.obj_pool_id;
+
+      librados::IoCtx ioctx;
+      r = librados::Rados(data_io).ioctx_create2(guessed_layout.pool_id, ioctx);
+      if (r != 0) {
+	derr << "Unexpected error opening file data pool id="
+	     << guessed_layout.pool_id << ": " << cpp_strerror(r) << dendl;
+	return r;
+      }
+
+      bufferlist bl;
+      int r = ioctx.getxattr(oid, "layout", bl);
+      if (r < 0) {
+	if (r != -ENODATA) {
+	  derr << "Unexpected error reading layout for " << oid << ": "
+	       << cpp_strerror(r) << dendl;
+	  return r;
+	}
+      } else {
+	try {
+	  auto q = bl.cbegin();
+	  decode(loaded_layout, q);
+	} catch (ceph::buffer::error &e) {
+	  derr << "Unexpected error decoding layout for " << oid << dendl;
+	  return -EINVAL;
+	}
+      }
+    }
+
+    // Calculate file_size, guess the layout
+    if (accum_res.ceiling_obj_index > 0) {
+      uint32_t chunk_size = file_layout_t::get_default().object_size;
+      // When there are multiple objects, the largest object probably
+      // indicates the chunk size.  But not necessarily, because files
+      // can be sparse.  Only make this assumption if size seen
+      // is a power of two, as chunk sizes typically are.
+      if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
+        chunk_size = accum_res.max_obj_size;
+      }
+
+      if (loaded_layout.pool_id == -1) {
+        // If no stashed layout was found, guess it
+        guessed_layout.object_size = chunk_size;
+        guessed_layout.stripe_unit = chunk_size;
+        guessed_layout.stripe_count = 1;
+      } else if (!loaded_layout.is_valid() ||
+          loaded_layout.object_size < accum_res.max_obj_size) {
+        // If the max size seen exceeds what the stashed layout claims, then
+        // disbelieve it.  Guess instead.  Same for invalid layouts on disk.
+        dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
+                << std::dec << ", ignoring in favour of best guess" << dendl;
+        guessed_layout.object_size = chunk_size;
+        guessed_layout.stripe_unit = chunk_size;
+        guessed_layout.stripe_count = 1;
+      } else {
+        // We have a stashed layout that we can't disprove, so apply it
+        guessed_layout = loaded_layout;
+        dout(20) << "loaded layout from xattr:"
+          << " pi: " << guessed_layout.pool_id
+          << " os: " << guessed_layout.object_size
+          << " sc: " << guessed_layout.stripe_count
+          << " su: " << guessed_layout.stripe_unit
+          << dendl;
+        // User might have transplanted files from a pool with a different
+        // ID, so if the pool from loaded_layout is not found in the list of
+        // the data pools, we'll force the injected layout to point to the
+        // pool we read from.
+	if (!fsmap->get_filesystem(fscid)->mds_map.is_data_pool(
+	      guessed_layout.pool_id)) {
+	  dout(20) << "overwriting layout pool_id " << data_pool_id << dendl;
+	  guessed_layout.pool_id = data_pool_id;
+	}
+      }
+
+      if (guessed_layout.stripe_count == 1) {
+        // Unstriped file: simple chunking
+        file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
+                    + accum_res.ceiling_obj_size;
+      } else {
+        // Striped file: need to examine the last stripe_count objects
+        // in the file to determine the size.
+
+	librados::IoCtx ioctx;
+	if (guessed_layout.pool_id == data_io.get_id()) {
+	  ioctx.dup(data_io);
+	} else {
+	  r = librados::Rados(data_io).ioctx_create2(guessed_layout.pool_id,
+						     ioctx);
+	  if (r != 0) {
+	    derr << "Unexpected error opening file data pool id="
+		 << guessed_layout.pool_id << ": " << cpp_strerror(r) << dendl;
+	    return r;
+	  }
+	}
+
+        // How many complete (i.e. not last stripe) objects?
+        uint64_t complete_objs = 0;
+        if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
+          complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
+        } else {
+          complete_objs = 0;
+        }
+
+        // How many potentially-short objects (i.e. last stripe set) objects?
+        uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
+
+        dout(10) << "calculating striped size from complete objs: "
+                 << complete_objs << ", partial objs: " << partial_objs
+                 << dendl;
+
+        // Maximum amount of data that may be in the incomplete objects
+        uint64_t incomplete_size = 0;
+
+        // For each short object, calculate the max file size within it
+        // and accumulate the maximum
+        for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
+          char buf[60];
+          snprintf(buf, sizeof(buf), "%llx.%08llx",
+              (long long unsigned)obj_name_ino, (long long unsigned)i);
+
+          uint64_t osize(0);
+          time_t omtime(0);
+          r = ioctx.stat(std::string(buf), &osize, &omtime);
+          if (r == 0) {
+            if (osize > 0) {
+              // Upper bound within this object
+              uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
+                * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
+                + (i % guessed_layout.stripe_count)
+                * guessed_layout.stripe_unit + (osize - 1)
+                % guessed_layout.stripe_unit + 1;
+              incomplete_size = std::max(incomplete_size, upper_size);
+            }
+          } else if (r == -ENOENT) {
+            // Absent object, treat as size 0 and ignore.
+          } else {
+            // Unexpected error, carry r to outer scope for handling.
+            break;
+          }
+        }
+        if (r != 0 && r != -ENOENT) {
+          derr << "Unexpected error checking size of ino 0x" << std::hex
+               << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
+          return r;
+        }
+        file_size = complete_objs * guessed_layout.object_size
+                    + incomplete_size;
+      }
+    } else {
+      file_size = accum_res.ceiling_obj_size;
+      if (loaded_layout.pool_id < 0
+          || loaded_layout.object_size < accum_res.max_obj_size) {
+        // No layout loaded, or inconsistent layout, use default
+        guessed_layout = file_layout_t::get_default();
+	guessed_layout.pool_id = accum_res.obj_pool_id != -1 ?
+	  accum_res.obj_pool_id : data_pool_id;
+      } else {
+        guessed_layout = loaded_layout;
+      }
+    }
+
+    // Sanity checking backtrace ino against object name
+    if (have_backtrace && backtrace.ino != obj_name_ino) {
+      dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+        << " doesn't match object name ino 0x" << obj_name_ino
+        << std::dec << dendl;
+      have_backtrace = false;
+    }
+
+    InodeStore dentry;
+    build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry, symlink);
+
+    // Inject inode to the metadata pool
+    if (have_backtrace) {
+      inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+      if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+        /* Special case for strays: even if we have a good backtrace,
+         * don't put it in the stray dir, because while that would technically
+         * give it linkage it would still be invisible to the user */
+        r = driver->inject_lost_and_found(obj_name_ino, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      } else {
+        /* Happy case: we will inject a named dentry for this inode */
+        r = driver->inject_with_backtrace(backtrace, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      }
+    } else {
+      /* Backtrace-less case: we will inject a lost+found dentry */
+      r = driver->inject_lost_and_found(
+          obj_name_ino, dentry);
+      if (r < 0) {
+        dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+          << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+        if (r == -EINVAL) {
+          dout(4) << "Use --force-corrupt to overwrite structures that "
+                     "appear to be corrupt" << dendl;
+        }
+      }
+    }
+
+    return r;
+  });
+}
+
+int DataScan::cleanup()
+{
+  // Per-object handler: ask the cephfs object class to delete the
+  // accumulated scan metadata stored on this object.  The ino/offset parsed
+  // from the object name are not needed here.
+  auto drop_accumulated = [this](
+      std::string const &oid,
+      uint64_t obj_name_ino,
+      uint64_t obj_name_offset) -> int
+  {
+    const int rc = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
+    if (rc < 0) {
+      dout(4) << "Error deleting accumulated metadata from '"
+              << oid << "': " << cpp_strerror(rc) << dendl;
+    }
+    return rc;
+  };
+
+  // We are looking for only zeroth object
+  return forall_objects(data_io, true, drop_accumulated);
+}
+
+bool DataScan::valid_ino(inodeno_t ino) const
+{
+  // Anything at or above 2^40 is accepted outright.
+  if (ino >= inodeno_t((1ull << 40))) {
+    return true;
+  }
+
+  // Below that threshold only the stray dirs, the mdsdirs and the three
+  // named system inodes are accepted.
+  if (MDS_INO_IS_STRAY(ino) || MDS_INO_IS_MDSDIR(ino)) {
+    return true;
+  }
+
+  return ino == CEPH_INO_ROOT
+      || ino == CEPH_INO_CEPH
+      || ino == CEPH_INO_LOST_AND_FOUND;
+}
+
+/**
+ * Walk every dirfrag object in the metadata pool and repair linkage-level
+ * inconsistencies.
+ *
+ * The pool is iterated twice:
+ *  - SCAN_INOS records which inode numbers have a primary dentry (noting
+ *    any ino that is seen more than once) and counts remote links per ino.
+ *  - CHECK_LINK gathers the candidates for each duplicated primary, compares
+ *    every inode's nlink with the counted links, collects snapshot info,
+ *    flags remote dentries whose target ino was never seen, and remembers
+ *    head dentries whose `first` is CEPH_NOSNAP.
+ *
+ * Afterwards the function removes duplicate/dangling dentries, rewrites
+ * dentries with a corrected nlink or `first`, and force-updates the per-rank
+ * InoTable and the rank-0 SnapServer table.
+ *
+ * @return 0 on success; -EINVAL if the driver is not a MetadataDriver or a
+ *         dentry cannot be decoded; otherwise the first error returned by a
+ *         RADOS or driver call.
+ */
+int DataScan::scan_links()
+{
+  MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
+  if (!metadata_driver) {
+    derr << "Unexpected --output-dir option for scan_links" << dendl;
+    return -EINVAL;
+  }
+
+  // State accumulated across both passes
+  interval_set<uint64_t> used_inos;      // inos that own a primary dentry
+  map<inodeno_t, int> remote_links;      // ino -> number of remote dentries
+  map<snapid_t, SnapInfo> snaps;         // snapshots found in kept inodes
+  snapid_t last_snap = 1;
+  snapid_t snaprealm_v2_since = 2;
+
+  // Where a primary dentry lives, plus the inode fields needed to decide
+  // which duplicate to keep and whether nlink needs fixing.
+  struct link_info_t {
+    inodeno_t dirino;
+    frag_t frag;
+    string name;
+    version_t version;
+    int nlink;
+    bool is_dir;
+    map<snapid_t, SnapInfo> snaps;
+    link_info_t() : version(0), nlink(0), is_dir(false) {}
+    link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) :
+      dirino(di), frag(df), name(n),
+      version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {}
+    dirfrag_t dirfrag() const {
+      return dirfrag_t(dirino, frag);
+    }
+  };
+  map<inodeno_t, list<link_info_t> > dup_primaries;
+  map<inodeno_t, link_info_t> bad_nlink_inos;
+  map<inodeno_t, link_info_t> injected_inos;
+
+  // dirfrag -> omap keys that will be deleted from it
+  map<dirfrag_t, set<string> > to_remove;
+
+  enum {
+    SCAN_INOS = 1,
+    CHECK_LINK,
+  };
+
+  for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
+    const librados::NObjectIterator it_end = metadata_io.nobjects_end();
+    for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
+      const std::string oid = it->get_oid();
+
+      dout(10) << "step " << step << ": handling object " << oid << dendl;
+
+      uint64_t dir_ino = 0;
+      uint64_t frag_id = 0;
+      int r = parse_oid(oid, &dir_ino, &frag_id);
+      if (r == -EINVAL) {
+        dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
+        continue;
+      } else {
+        // parse_oid can only do 0 or -EINVAL
+        ceph_assert(r == 0);
+      }
+
+      if (!valid_ino(dir_ino)) {
+        dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
+        continue;
+      }
+
+      std::map<std::string, bufferlist> items;
+      r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
+      if (r < 0) {
+        derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      for (auto& p : items) {
+        auto q = p.second.cbegin();
+        string dname;
+        snapid_t last;
+        dentry_key_t::decode_helper(p.first, dname, last);
+
+        // Snapshotted dentries only contribute to the last_snap estimate
+        if (last != CEPH_NOSNAP) {
+          if (last > last_snap)
+            last_snap = last;
+          continue;
+        }
+
+        try {
+          snapid_t dnfirst;
+          decode(dnfirst, q);
+          if (dnfirst == CEPH_NOSNAP) {
+            dout(20) << "injected ino detected" << dendl;
+          } else if (dnfirst <= CEPH_MAXSNAP) {
+            if (dnfirst - 1 > last_snap)
+              last_snap = dnfirst - 1;
+          }
+          char dentry_type;
+          decode(dentry_type, q);
+          mempool::mds_co::string alternate_name;
+          if (dentry_type == 'I' || dentry_type == 'i') {
+            // Primary dentry: the inode is embedded in the value
+            InodeStore inode;
+            if (dentry_type == 'i') {
+              DECODE_START(2, q);
+              if (struct_v >= 2)
+                decode(alternate_name, q);
+              inode.decode(q);
+              DECODE_FINISH(q);
+            } else {
+              inode.decode_bare(q);
+            }
+
+            inodeno_t ino = inode.inode->ino;
+
+            if (step == SCAN_INOS) {
+              if (used_inos.contains(ino, 1)) {
+                // Seen before: create an (empty) candidate list, filled in
+                // during the CHECK_LINK pass.
+                dup_primaries.emplace(std::piecewise_construct,
+                                      std::forward_as_tuple(ino),
+                                      std::forward_as_tuple());
+              } else {
+                used_inos.insert(ino);
+              }
+            } else if (step == CHECK_LINK) {
+              sr_t srnode;
+              if (inode.snap_blob.length()) {
+                auto blob_it = inode.snap_blob.cbegin();
+                decode(srnode, blob_it);
+                // Keep only snaps that belong to this inode and whose key
+                // agrees with the recorded snapid.
+                for (auto snap_it = srnode.snaps.begin();
+                     snap_it != srnode.snaps.end(); ) {
+                  if (snap_it->second.ino != ino ||
+                      snap_it->second.snapid != snap_it->first) {
+                    srnode.snaps.erase(snap_it++);
+                  } else {
+                    ++snap_it;
+                  }
+                }
+                if (!srnode.past_parents.empty()) {
+                  snapid_t last_parent = srnode.past_parents.rbegin()->first;
+                  if (last_parent + 1 > snaprealm_v2_since)
+                    snaprealm_v2_since = last_parent + 1;
+                }
+              }
+              if (inode.old_inodes && !inode.old_inodes->empty()) {
+                auto _last_snap = inode.old_inodes->rbegin()->first;
+                if (_last_snap > last_snap)
+                  last_snap = _last_snap;
+              }
+              auto dup = dup_primaries.find(ino);
+              if (dup != dup_primaries.end()) {
+                // Duplicated primary: defer the nlink check until we know
+                // which copy is kept.
+                dup->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
+                dup->second.back().snaps.swap(srnode.snaps);
+              } else {
+                // Expected nlink = remote links + 1 for the primary dentry,
+                // unless the primary sits in a stray directory.
+                int nlink = 0;
+                auto rl = remote_links.find(ino);
+                if (rl != remote_links.end())
+                  nlink = rl->second;
+                if (!MDS_INO_IS_STRAY(dir_ino))
+                  nlink++;
+                if (inode.inode->nlink != nlink) {
+                  derr << "Bad nlink on " << ino << " expected " << nlink
+                       << " has " << inode.inode->nlink << dendl;
+                  bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+                  bad_nlink_inos[ino].nlink = nlink;
+                }
+                snaps.insert(make_move_iterator(begin(srnode.snaps)),
+                             make_move_iterator(end(srnode.snaps)));
+              }
+              if (dnfirst == CEPH_NOSNAP) {
+                injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+                dout(20) << "adding " << ino << " for future processing to fix dnfirst" << dendl;
+              }
+            }
+          } else if (dentry_type == 'L' || dentry_type == 'l') {
+            // Remote dentry: only references an ino
+            inodeno_t ino;
+            unsigned char d_type;
+            CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q);
+
+            if (step == SCAN_INOS) {
+              remote_links[ino]++;
+            } else if (step == CHECK_LINK) {
+              if (!used_inos.contains(ino, 1)) {
+                derr << "Bad remote link dentry 0x" << std::hex << dir_ino
+                     << std::dec << "/" << dname
+                     << ", ino " << ino << " not found" << dendl;
+                std::string key;
+                dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+                dn_key.encode(key);
+                to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
+              }
+            }
+          } else {
+            // dir_ino follows a "0x" prefix, so it has to be printed in hex
+            // (as the neighbouring messages do).
+            derr << "Invalid tag char '" << dentry_type << "' dentry 0x"
+                 << std::hex << dir_ino
+                 << std::dec << "/" << dname << dendl;
+            return -EINVAL;
+          }
+        } catch (const buffer::error &err) {
+          derr << "Error decoding dentry 0x" << std::hex << dir_ino
+               << std::dec << "/" << dname << dendl;
+          return -EINVAL;
+        }
+      }
+    }
+  }
+
+  // Work out the highest used ino per rank; the rank is derived from the
+  // bits above bit 40 of the ino, minus one.
+  map<unsigned, uint64_t> max_ino_map;
+  {
+    auto prev_max_ino = (uint64_t)1 << 40;
+    for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
+      auto cur_max = p.get_start() + p.get_len() - 1;
+      if (cur_max < prev_max_ino)
+        continue; // system inodes
+
+      if ((prev_max_ino >> 40)  != (cur_max >> 40)) {
+        // Moved on to another rank's range: record the previous rank's max
+        unsigned rank = (prev_max_ino >> 40) - 1;
+        max_ino_map[rank] = prev_max_ino;
+      } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
+        // This interval itself straddles a rank boundary
+        unsigned rank = (p.get_start() >> 40) - 1;
+        max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
+      }
+      prev_max_ino = cur_max;
+    }
+    unsigned rank = (prev_max_ino >> 40) - 1;
+    max_ino_map[rank] = prev_max_ino;
+  }
+
+  used_inos.clear();
+
+  dout(10) << "processing " << dup_primaries.size() << " dup_primaries, "
+           << remote_links.size() << " remote_links" << dendl;
+
+  for (auto& p : dup_primaries) {
+
+    dout(10) << "handling dup " << p.first << dendl;
+
+    // Keep the copy with the highest version; on a tie prefer a copy that is
+    // not in a stray directory.
+    link_info_t newest;
+    for (auto& q : p.second) {
+      if (q.version > newest.version) {
+        newest = q;
+      } else if (q.version == newest.version &&
+                 !MDS_INO_IS_STRAY(q.dirino) &&
+                 MDS_INO_IS_STRAY(newest.dirino)) {
+        newest = q;
+      }
+    }
+
+    for (auto& q : p.second) {
+      // in the middle of dir fragmentation?
+      if (newest.dirino == q.dirino && newest.name == q.name) {
+        snaps.insert(make_move_iterator(begin(q.snaps)),
+                     make_move_iterator(end(q.snaps)));
+        continue;
+      }
+
+      std::string key;
+      dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
+      dn_key.encode(key);
+      to_remove[q.dirfrag()].insert(key);
+      derr << "Remove duplicated ino 0x" << p.first << " from "
+           << q.dirfrag() << "/" << q.name << dendl;
+    }
+
+    int nlink = 0;
+    auto q = remote_links.find(p.first);
+    if (q != remote_links.end())
+      nlink = q->second;
+    if (!MDS_INO_IS_STRAY(newest.dirino))
+      nlink++;
+
+    if (nlink != newest.nlink) {
+      derr << "Bad nlink on " << p.first << " expected " << nlink
+           << " has " << newest.nlink << dendl;
+      bad_nlink_inos[p.first] = newest;
+      bad_nlink_inos[p.first].nlink = nlink;
+    }
+  }
+  dup_primaries.clear();
+  remote_links.clear();
+
+  {
+    // last_snap must also cover the data pools' snap_seq and every snapshot
+    // collected from the inodes.
+    objecter->with_osdmap([&](const OSDMap& o) {
+      for (auto p : data_pools) {
+        const pg_pool_t *pi = o.get_pg_pool(p);
+        if (!pi)
+          continue;
+        if (pi->snap_seq > last_snap)
+          last_snap = pi->snap_seq;
+      }
+    });
+
+    if (!snaps.empty()) {
+      if (snaps.rbegin()->first > last_snap)
+        last_snap = snaps.rbegin()->first;
+    }
+  }
+
+  dout(10) << "removing dup dentries from " << to_remove.size() << " objects"
+           << dendl;
+
+  for (auto& p : to_remove) {
+    object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
+
+    dout(10) << "removing dup dentries from " << p.first << dendl;
+
+    int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
+    if (r != 0) {
+      derr << "Error removing duplicated dentries from " << p.first << dendl;
+      return r;
+    }
+  }
+  to_remove.clear();
+
+  dout(10) << "processing " << bad_nlink_inos.size() << " bad_nlink_inos"
+           << dendl;
+
+  for (auto &p : bad_nlink_inos) {
+    dout(10) << "handling bad_nlink_ino " << p.first << dendl;
+
+    InodeStore inode;
+    snapid_t first;
+    int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+    if (r < 0) {
+      derr << "Unexpected error reading dentry "
+           << p.second.dirfrag() << "/" << p.second.name
+           << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    // Skip if the dentry no longer matches what was seen during the scan
+    if (inode.inode->ino != p.first || inode.inode->version != p.second.version)
+      continue;
+
+    inode.get_inode()->nlink = p.second.nlink;
+    r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+    if (r < 0)
+      return r;
+  }
+
+  dout(10) << "processing " << injected_inos.size() << " injected_inos"
+           << dendl;
+
+  for (auto &p : injected_inos) {
+    dout(10) << "handling injected_ino " << p.first << dendl;
+
+    InodeStore inode;
+    snapid_t first;
+    dout(20) << " fixing linkage (dnfirst) of " << p.second.dirino << ":" << p.second.name << dendl;
+    int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+    if (r < 0) {
+      derr << "Unexpected error reading dentry "
+           << p.second.dirfrag() << "/" << p.second.name
+           << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if (first != CEPH_NOSNAP) {
+      dout(20) << " ????" << dendl;
+      continue;
+    }
+
+    // Replace the CEPH_NOSNAP placeholder with the first snapid after the
+    // newest snapshot we know about.
+    first = last_snap + 1;
+    dout(20) << " first is now " << first << dendl;
+    r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+    if (r < 0)
+      return r;
+  }
+
+  dout(10) << "updating inotable" << dendl;
+
+  for (auto& p : max_ino_map) {
+    InoTable inotable(nullptr);
+    inotable.set_rank(p.first);
+    bool dirty = false;
+    int r = metadata_driver->load_table(&inotable);
+    if (r < 0) {
+      // Table could not be loaded: start from a freshly reset one
+      inotable.reset_state();
+      dirty = true;
+    }
+    if (inotable.force_consume_to(p.second))
+      dirty = true;
+    if (dirty) {
+      r = metadata_driver->save_table(&inotable);
+      if (r < 0)
+        return r;
+    }
+  }
+
+  dout(10) << "updating snaptable" << dendl;
+
+  {
+    SnapServer snaptable;
+    snaptable.set_rank(0);
+    bool dirty = false;
+    int r = metadata_driver->load_table(&snaptable);
+    if (r < 0) {
+      // Table could not be loaded: start from a freshly reset one
+      snaptable.reset_state();
+      dirty = true;
+    }
+    if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
+      dirty = true;
+    if (dirty) {
+      r = metadata_driver->save_table(&snaptable);
+      if (r < 0)
+        return r;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Rebuild directory dentries from the dirfrag objects in the metadata pool.
+ *
+ * For every object visited by forall_objects() whose name parses to a
+ * non-system ino, the "parent" (backtrace) and "layout" xattrs and the fnode
+ * are read, a directory dentry is built from them, and that dentry is
+ * injected either at the location named by the backtrace or into
+ * lost+found.
+ *
+ * @return -EIO if the system inodes are missing, the error from
+ *         check_roots() if that fails, otherwise whatever forall_objects()
+ *         returns.
+ */
+int DataScan::scan_frags()
+{
+  bool roots_present;
+  int r = driver->check_roots(&roots_present);
+  if (r != 0) {
+    derr << "Unexpected error checking roots: '"
+      << cpp_strerror(r) << "'" << dendl;
+    return r;
+  }
+
+  if (!roots_present) {
+    std::cerr << "Some or all system inodes are absent.  Run 'init' from "
+      "one node before running 'scan_frags'" << std::endl;
+    return -EIO;
+  }
+
+  return forall_objects(metadata_io, true, [this](
+        std::string const &oid,
+        uint64_t obj_name_ino,
+        uint64_t obj_name_offset) -> int
+  {
+    int r = 0;
+    r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+    if (r != 0) {
+      dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+      return r;
+    }
+
+    if (obj_name_ino < (1ULL << 40)) {
+      // FIXME: we're skipping stray dirs here: if they're
+      // orphaned then we should be resetting them some other
+      // way
+      dout(10) << "Skipping system ino " << obj_name_ino << dendl;
+      return 0;
+    }
+
+    inode_backtrace_t backtrace;
+
+    // Default to inherit layout (i.e. no explicit layout on dir) which is
+    // expressed as a zeroed layout struct (see inode_t::has_layout)
+    file_layout_t loaded_layout;
+
+    int parent_r = 0;
+    bufferlist parent_bl;
+    int layout_r = 0;
+    bufferlist layout_bl;
+    bufferlist op_bl;
+
+    // Fetch both xattrs in a single compound read; each sub-op reports its
+    // own result through parent_r / layout_r.
+    librados::ObjectReadOperation op;
+    op.getxattr("parent", &parent_bl, &parent_r);
+    op.getxattr("layout", &layout_bl, &layout_r);
+    r = metadata_io.operate(oid, &op, &op_bl);
+    if (r != 0 && r != -ENODATA) {
+      // Report the code that actually failed: parent_r may still be 0 when
+      // the compound operation as a whole fails.
+      derr << "Unexpected error reading backtrace: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if (parent_r != -ENODATA) {
+      try {
+        auto q = parent_bl.cbegin();
+        backtrace.decode(q);
+      } catch (buffer::error &e) {
+        dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl;
+        if (!force_corrupt) {
+          return -EINVAL;
+        } else {
+          // Treat backtrace as absent: we'll inject into lost+found
+          backtrace = inode_backtrace_t();
+        }
+      }
+    }
+
+    if (layout_r != -ENODATA) {
+      try {
+        auto q = layout_bl.cbegin();
+        decode(loaded_layout, q);
+      } catch (buffer::error &e) {
+        dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl;
+        if (!force_corrupt) {
+          return -EINVAL;
+        }
+      }
+    }
+
+    bool have_backtrace = !(backtrace.ancestors.empty());
+
+    // Sanity checking backtrace ino against object name
+    if (have_backtrace && backtrace.ino != obj_name_ino) {
+      dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+        << " doesn't match object name ino 0x" << obj_name_ino
+        << std::dec << dendl;
+      have_backtrace = false;
+    }
+
+    // NOTE(review): only -EINVAL from read_fnode() is handled; any other
+    // error leaves `fnode` default-constructed and is overwritten by the
+    // inject result below -- confirm this is intended.
+    uint64_t fnode_version = 0;
+    fnode_t fnode;
+    r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
+    if (r == -EINVAL) {
+      derr << "Corrupt fnode on " << oid << dendl;
+      if (force_corrupt) {
+        // Substitute placeholder stats so a dentry can still be built
+        fnode.fragstat.mtime = 0;
+        fnode.fragstat.nfiles = 1;
+        fnode.fragstat.nsubdirs = 0;
+        fnode.accounted_fragstat = fnode.fragstat;
+      } else {
+        return r;
+      }
+    }
+
+    InodeStore dentry;
+    build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
+                loaded_layout, &dentry);
+
+    // Inject inode to the metadata pool
+    if (have_backtrace) {
+      inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+      if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+        /* Special case for strays: even if we have a good backtrace,
+         * don't put it in the stray dir, because while that would technically
+         * give it linkage it would still be invisible to the user */
+        r = driver->inject_lost_and_found(obj_name_ino, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      } else {
+        /* Happy case: we will inject a named dentry for this inode */
+        r = driver->inject_with_backtrace(backtrace, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      }
+    } else {
+      /* Backtrace-less case: we will inject a lost+found dentry */
+      r = driver->inject_lost_and_found(
+          obj_name_ino, dentry);
+      if (r < 0) {
+        dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+          << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+        if (r == -EINVAL) {
+          dout(4) << "Use --force-corrupt to overwrite structures that "
+                     "appear to be corrupt" << dendl;
+        }
+      }
+    }
+
+    return r;
+  });
+}
+
+/**
+ * Read and decode the fnode stored in the omap header of a dirfrag object.
+ *
+ * @param ino           directory inode number
+ * @param frag          fragment of the directory to read
+ * @param fnode         output: decoded fnode (must not be null)
+ * @param last_version  output: object version reported by the IoCtx after
+ *                      the header read; may be null if the caller does not
+ *                      need it
+ * @return 0 on success, the negative error from the header read, or -EINVAL
+ *         if the header cannot be decoded
+ */
+int MetadataTool::read_fnode(
+    inodeno_t ino, frag_t frag, fnode_t *fnode,
+    uint64_t *last_version)
+{
+  ceph_assert(fnode != nullptr);
+
+  object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
+  bufferlist fnode_bl;
+  int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
+  // The version is reported even when the read failed, as before; the
+  // out-param is optional so it is not dereferenced blindly.
+  if (last_version != nullptr) {
+    *last_version = metadata_io.get_last_version();
+  }
+  if (r < 0) {
+    return r;
+  }
+
+  auto old_fnode_iter = fnode_bl.cbegin();
+  try {
+    (*fnode).decode(old_fnode_iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+/**
+ * Read the head (CEPH_NOSNAP) dentry `dname` from a dirfrag object and
+ * decode the inode embedded in it.
+ *
+ * @param parent_ino  inode number of the containing directory
+ * @param frag        fragment of the directory to look in
+ * @param dname       dentry name
+ * @param inode       output: decoded inode (must not be null)
+ * @param dnfirst     optional output: the dentry's `first` snapid; only
+ *                    written on success
+ * @return 0 on success; -ENOENT if the key is absent; -EINVAL if the dentry
+ *         is not a primary ('I'/'i') dentry or cannot be decoded; otherwise
+ *         the negative error from the omap read
+ */
+int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
+                const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
+{
+  ceph_assert(inode != nullptr);
+
+  std::string key;
+  dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+  dn_key.encode(key);
+
+  std::set<std::string> keys;
+  keys.insert(key);
+  std::map<std::string, bufferlist> vals;
+  object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
+  int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
+  dout(20) << "oid=" << frag_oid.name
+           << " dname=" << dname
+           << " frag=" << frag
+           << ", r=" << r << dendl;
+  if (r < 0) {
+    return r;
+  }
+
+  // Look the key up once and reuse the iterator for decoding
+  const auto found = vals.find(key);
+  if (found == vals.end()) {
+    dout(20) << key << " not found in result" << dendl;
+    return -ENOENT;
+  }
+
+  try {
+    auto q = found->second.cbegin();
+    snapid_t first;
+    decode(first, q);
+    char dentry_type;
+    decode(dentry_type, q);
+    if (dentry_type == 'I' || dentry_type == 'i') {
+      if (dentry_type == 'i') {
+        // Versioned encoding; v2+ carries an alternate name before the inode
+        mempool::mds_co::string alternate_name;
+
+        DECODE_START(2, q);
+        if (struct_v >= 2)
+          decode(alternate_name, q);
+        inode->decode(q);
+        DECODE_FINISH(q);
+      } else {
+        inode->decode_bare(q);
+      }
+    } else {
+      // Note the space after "cannot": the two literals are concatenated
+      dout(20) << "dentry type '" << dentry_type << "': cannot "
+                  "read an inode out of that" << dendl;
+      return -EINVAL;
+    }
+    if (dnfirst)
+      *dnfirst = first;
+  } catch (const buffer::error &err) {
+    dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
+             << std::dec << "/" << dname << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+/**
+ * Load an MDS table from its object in the metadata pool.
+ *
+ * The object content is decoded as a version followed by the table state;
+ * the version is then applied with force_replay_version().
+ *
+ * @return 0 on success, the negative read error, or -EIO if the content
+ *         cannot be decoded
+ */
+int MetadataDriver::load_table(MDSTable *table)
+{
+  const object_t table_oid = table->get_object_name();
+
+  bufferlist raw;
+  // NOTE(review): len=0/off=0 presumably requests the whole object -- confirm
+  const int rc = metadata_io.read(table_oid.name, raw, 0, 0);
+  if (rc < 0) {
+    derr << "unable to read mds table '" << table_oid.name << "': "
+      << cpp_strerror(rc) << dendl;
+    return rc;
+  }
+
+  try {
+    auto cursor = raw.cbegin();
+    version_t stored_version;
+    decode(stored_version, cursor);
+    table->decode_state(cursor);
+    table->force_replay_version(stored_version);
+  } catch (const buffer::error &decode_err) {
+    derr << "unable to decode mds table '" << table_oid.name << "': "
+      << decode_err.what() << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+/**
+ * Write an MDS table back to its object in the metadata pool, replacing the
+ * object's content with the table version followed by the encoded state.
+ *
+ * @return 0 on success, otherwise the result of the write
+ */
+int MetadataDriver::save_table(MDSTable *table)
+{
+  const object_t table_oid = table->get_object_name();
+
+  // Same layout that load_table() decodes: version first, then the state
+  bufferlist payload;
+  encode(table->get_version(), payload);
+  table->encode_state(payload);
+
+  const int rc = metadata_io.write_full(table_oid.name, payload);
+  if (rc == 0) {
+    return 0;
+  }
+
+  derr << "error updating mds table " << table_oid.name
+    << ": " << cpp_strerror(rc) << dendl;
+  return rc;
+}
+
+/**
+ * Link `dentry` into the lost+found directory, creating lost+found (its
+ * dentry in the root directory and its dirfrag) when it does not exist yet.
+ *
+ * @param ino     inode number being injected; used to derive the dentry name
+ * @param dentry  inode to store in the new dentry
+ * @return 0 on success; -EINVAL if the existing lost+found dentry is corrupt
+ *         (and force_corrupt is not set) or is not a directory; otherwise
+ *         the negative error from the failing read/create/inject call
+ */
+int MetadataDriver::inject_lost_and_found(
+    inodeno_t ino, const InodeStore &dentry)
+{
+  // Create lost+found if doesn't exist
+  bool created = false;
+  int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
+  if (r < 0) {
+    return r;
+  }
+  InodeStore lf_ino;
+  r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
+  if (r == -ENOENT || r == -EINVAL) {
+    if (r == -EINVAL && !force_corrupt) {
+      return r;
+    }
+
+    // To have a directory not specify a layout, give it zeros (see
+    // inode_t::has_layout)
+    file_layout_t inherit_layout;
+
+    // Construct LF inode
+    frag_info_t fragstat;
+    fragstat.nfiles = 1;
+    build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
+
+    // Inject link to LF inode in the root dir
+    r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
+    if (r < 0) {
+      return r;
+    }
+  } else if (r < 0) {
+    // Any other read failure: lf_ino was not decoded, so do not inspect it
+    // and do not misreport the failure as "not a directory".
+    derr << "Unexpected error reading lost+found dentry: "
+         << cpp_strerror(r) << dendl;
+    return r;
+  } else {
+    if (!(lf_ino.inode->mode & S_IFDIR)) {
+      derr << "lost+found exists but is not a directory!" << dendl;
+      // In this case we error out, and the user should do something about
+      // this problem.
+      return -EINVAL;
+    }
+  }
+
+  r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
+  if (r < 0) {
+    return r;
+  }
+
+  const std::string dname = lost_found_dname(ino);
+
+  // Write dentry into lost+found dirfrag
+  return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry);
+}
+
+
+/**
+ * Work out which fragment of directory `dirino` the name `target_dname`
+ * belongs to.
+ *
+ * The fragtree of `dirino` lives in its primary dentry, so the function
+ * reads the backtrace ("parent" xattr) from dirino's root dirfrag object to
+ * learn the parent directory and name, loads that dentry, and asks it to
+ * pick the fragment.  The dentry is first looked up in fragment zero of the
+ * parent; only if it is absent there does the function recurse to resolve
+ * the parent's own fragmentation.
+ *
+ * @param dirino        directory whose fragment is wanted
+ * @param target_dname  name to place within that directory
+ * @param result_ft     output: the selected fragment (written on success)
+ * @return 0 on success; -ENOENT if the fragment cannot be determined
+ *         (missing/corrupt backtrace or dentry, or dentry pointing at a
+ *         different inode); otherwise a negative error from the reads
+ */
+int MetadataDriver::get_frag_of(
+    inodeno_t dirino,
+    const std::string &target_dname,
+    frag_t *result_ft)
+{
+  object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
+
+  dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
+
+  // Find and load fragtree if existing dirfrag
+  // ==========================================
+  bool have_backtrace = false;
+  bufferlist parent_bl;
+  int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
+  if (r == -ENODATA) {
+    dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
+  } else if (r < 0) {
+    dout(4) << "Unexpected error on '" << root_frag_oid << "': "
+      << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // Deserialize backtrace
+  inode_backtrace_t backtrace;
+  if (parent_bl.length()) {
+    try {
+      auto q = parent_bl.cbegin();
+      backtrace.decode(q);
+      have_backtrace = true;
+    } catch (buffer::error &e) {
+      dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': "
+              << e.what() << dendl;
+    }
+  }
+
+  if (!(have_backtrace && backtrace.ancestors.size())) {
+    // Can't work out fragtree without a backtrace
+    dout(4) << "No backtrace on '" << root_frag_oid
+            << "': cannot determine fragtree" << dendl;
+    return -ENOENT;
+  }
+
+  // The parentage of dirino
+  const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
+
+  // The inode of dirino's parent
+  const inodeno_t parent_ino = bp.dirino;
+
+  // The dname of dirino in its parent.
+  const std::string &parent_dname = bp.dname;
+
+  dout(20) << "got backtrace parent " << parent_ino << "/"
+           << parent_dname << dendl;
+
+  // The primary dentry for dirino
+  InodeStore existing_dentry;
+
+  // See if we can find ourselves in dirfrag zero of the parent: this
+  // is a fast path that avoids needing to go further up the tree
+  // if the parent isn't fragmented (worst case we would have to
+  // go all the way to the root)
+  r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
+  if (r >= 0) {
+    // Great, fast path: return the fragtree from here
+    if (existing_dentry.inode->ino != dirino) {
+      dout(4) << "Unexpected inode in dentry! 0x" << std::hex
+              << existing_dentry.inode->ino
+              << " vs expected 0x" << dirino << std::dec << dendl;
+      return -ENOENT;
+    }
+    dout(20) << "fast path, fragtree is "
+             << existing_dentry.dirfragtree << dendl;
+    *result_ft = existing_dentry.pick_dirfrag(target_dname);
+    dout(20) << "frag is " << *result_ft << dendl;
+    return 0;
+  } else if (r == -ENOENT) {
+    // Dentry not present in 0th frag, must read parent's fragtree.
+    // (This used to test `r != -ENOENT`, which skipped the slow path in
+    // exactly this case and left the -EINVAL branch below unreachable.)
+    frag_t parent_frag;
+    r = get_frag_of(parent_ino, parent_dname, &parent_frag);
+    if (r == 0) {
+      // We have the parent fragtree, so try again to load our dentry
+      r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
+      if (r >= 0) {
+        // Got it!
+        *result_ft = existing_dentry.pick_dirfrag(target_dname);
+        dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
+        return 0;
+      } else {
+        if (r == -EINVAL || r == -ENOENT) {
+          return -ENOENT;  // dentry missing or corrupt, so frag is missing
+        } else {
+          return r;
+        }
+      }
+    } else {
+      // Couldn't resolve parent fragtree, so can't find ours.
+      return r;
+    }
+  } else if (r == -EINVAL) {
+    // Unreadable dentry, can't know the fragtree.
+    return -ENOENT;
+  } else {
+    // Unexpected error, raise it
+    return r;
+  }
+}
+
+
+int MetadataDriver::inject_with_backtrace(
+    const inode_backtrace_t &backtrace, const InodeStore &dentry)
+    
+{
+
+  // On dirfrags
+  // ===========
+  // In order to insert something into a directory, we first (ideally)
+  // need to know the fragtree for the directory.  Sometimes we can't
+  // get that, in which case we just go ahead and insert it into
+  // fragment zero for a good chance of that being the right thing
+  // anyway (most moderate-sized dirs aren't fragmented!)
+
+  // On ancestry
+  // ===========
+  // My immediate ancestry should be correct, so if we can find that
+  // directory's dirfrag then go inject it there.  This works well
+  // in the case that this inode's dentry was somehow lost and we
+  // are recreating it, because the rest of the hierarchy
+  // will probably still exist.
+  //
+  // It's more of a "better than nothing" approach when rebuilding
+  // a whole tree, as backtraces will in general not be up to date
+  // beyond the first parent, if anything in the trace was ever
+  // moved after the file was created.
+
+  // On inode numbers
+  // ================
+  // The backtrace tells us inodes for each of the parents.  If we are
+  // creating those parent dirfrags, then there is a risk that somehow
+  // the inode indicated here was also used for data (not a dirfrag) at
+  // some stage.  That would be a zany situation, and we don't check
+  // for it here, because to do so would require extra IOs for everything
+  // we inject, and anyway wouldn't guarantee that the inode number
+  // wasn't in use in some dentry elsewhere in the metadata tree that
+  // just happened not to have any data objects.
+
+  // On multiple workers touching the same traces
+  // ============================================
+  // When creating linkage for a directory, *only* create it if we are
+  // also creating the object.  That way, we might not manage to get the
+  // *right* linkage for a directory, but at least we won't multiply link
+  // it.  We assume that if a root dirfrag exists for a directory, then
+  // it is linked somewhere (i.e. that the metadata pool is not already
+  // inconsistent).
+  //
+  // Making sure *that* is true is someone else's job!  Probably someone
+  // who is not going to run in parallel, so that they can self-consistently
+  // look at versions and move things around as they go.
+  // Note this isn't 100% safe: if we die immediately after creating dirfrag
+  // object, next run will fail to create linkage for the dirfrag object
+  // and leave it orphaned.
+
+  inodeno_t ino = backtrace.ino;
+  dout(10) << "  inode: 0x" << std::hex << ino << std::dec << dendl;
+  for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
+      i != backtrace.ancestors.end(); ++i) {
+    const inode_backpointer_t &backptr = *i;
+    dout(10) << "  backptr: 0x" << std::hex << backptr.dirino << std::dec
+      << "/" << backptr.dname << dendl;
+
+    // Examine root dirfrag for parent
+    const inodeno_t parent_ino = backptr.dirino;
+    const std::string dname = backptr.dname;
+
+    frag_t fragment;
+    int r = get_frag_of(parent_ino, dname, &fragment);
+    if (r == -ENOENT) {
+      // Don't know fragment, fall back to assuming root
+      dout(20) << "don't know fragment for 0x" << std::hex <<
+        parent_ino << std::dec << "/" << dname << ", will insert to root"
+        << dendl;
+    }
+
+    // Find or create dirfrag
+    // ======================
+    bool created_dirfrag;
+    r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
+    if (r < 0) {
+      return r;
+    }
+
+    // Check if dentry already exists
+    // ==============================
+    InodeStore existing_dentry;
+    r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
+    bool write_dentry = false;
+    if (r == -ENOENT || r == -EINVAL) {
+      if (r == -EINVAL && !force_corrupt) {
+        return r;
+      }
+      // Missing or corrupt dentry
+      write_dentry = true;
+    } else if (r < 0) {
+      derr << "Unexpected error reading dentry 0x" << std::hex
+        << parent_ino << std::dec << "/"
+        << dname << ": " << cpp_strerror(r) << dendl;
+      break;
+    } else {
+      // Dentry already present, does it link to me?
+      if (existing_dentry.inode->ino == ino) {
+        dout(20) << "Dentry 0x" << std::hex
+          << parent_ino << std::dec << "/"
+          << dname << " already exists and points to me" << dendl;
+      } else {
+        derr << "Dentry 0x" << std::hex
+          << parent_ino << std::dec << "/"
+          << dname << " already exists but points to 0x"
+          << std::hex << existing_dentry.inode->ino << std::dec << dendl;
+        // Fall back to lost+found!
+        return inject_lost_and_found(backtrace.ino, dentry);
+      }
+    }
+
+    // Inject linkage
+    // ==============
+
+    if (write_dentry) {
+      if (i == backtrace.ancestors.begin()) {
+        // This is the linkage for the file of interest
+        dout(10) << "Linking inode 0x" << std::hex << ino
+          << " at 0x" << parent_ino << "/" << dname << std::dec
+          << " with size=" << dentry.inode->size << " bytes" << dendl;
+
+        /* NOTE: dnfirst fixed in scan_links */
+        r = inject_linkage(parent_ino, dname, fragment, dentry);
+      } else {
+        // This is the linkage for an ancestor directory
+        dout(10) << "Linking ancestor directory of inode 0x" << std::hex << ino
+                 << " at 0x" << std::hex << parent_ino
+                 << ":" << dname << dendl;
+
+        InodeStore ancestor_dentry;
+        auto inode = ancestor_dentry.get_inode();
+        inode->mode = 0755 | S_IFDIR;
+
+        // Set nfiles to something non-zero, to fool any other code
+        // that tries to ignore 'empty' directories.  This won't be
+        // accurate, but it should avoid functional issues.
+
+        inode->dirstat.nfiles = 1;
+        inode->dir_layout.dl_dir_hash =
+                               g_conf()->mds_default_dir_hash;
+
+        inode->nlink = 1;
+        inode->ino = ino;
+        inode->uid = g_conf()->mds_root_ino_uid;
+        inode->gid = g_conf()->mds_root_ino_gid;
+        inode->version = 1;
+        inode->backtrace_version = 1;
+        /* NOTE: dnfirst fixed in scan_links */
+        r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
+      }
+
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    if (!created_dirfrag) {
+      // If the parent dirfrag already existed, then stop traversing the
+      // backtrace: assume that the other ancestors already exist too.  This
+      // is an assumption rather than a truth, but it's a convenient way
+      // to avoid the risk of creating multiply-linked directories while
+      // injecting data.  If there are in fact missing ancestors, this
+      // should be fixed up using a separate tool scanning the metadata
+      // pool.
+      break;
+    } else {
+      // Proceed up the backtrace, creating parents
+      ino = parent_ino;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Ensure that the dirfrag object for (ino, fragment) exists in the
+ * metadata pool, writing a placeholder fnode header if it is missing
+ * (or unreadable, when force_corrupt is set).
+ *
+ * @param ino      inode number of the directory
+ * @param fragment fragment of that directory
+ * @param created  out: true only if this call wrote the dirfrag header
+ * @returns 0 on success (including when another writer won the race),
+ *          else the negative error code from the read or the write
+ */
+int MetadataDriver::find_or_create_dirfrag(
+    inodeno_t ino,
+    frag_t fragment,
+    bool *created)
+{
+  ceph_assert(created != NULL);
+  *created = false;
+
+  fnode_t existing_fnode;
+  uint64_t read_version = 0;
+  int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
+  dout(10) << "read_version = " << read_version << dendl;
+
+  const bool fnode_missing = (r == -ENOENT);
+  const bool fnode_corrupt = (r == -EINVAL);
+
+  // Anything other than "missing" or "corrupt" is either a hard read
+  // error or a perfectly good existing dirfrag: nothing to create.
+  if (!fnode_missing && !fnode_corrupt) {
+    if (r < 0) {
+      derr << "Unexpected error reading dirfrag 0x" << std::hex
+        << ino << std::dec << " : " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    dout(20) << "Dirfrag already exists: 0x" << std::hex
+      << ino << " " << fragment << std::dec << dendl;
+    return 0;
+  }
+
+  // Only overwrite an undecodable fnode when explicitly asked to
+  if (fnode_corrupt && !force_corrupt) {
+    return r;
+  }
+
+  // Build a fresh fnode.  It is flagged non-empty and its stats are
+  // marked as damaged.
+  fnode_t blank_fnode;
+  blank_fnode.version = 1;
+  blank_fnode.fragstat.nfiles = 1;
+  blank_fnode.accounted_fragstat = blank_fnode.fragstat;
+  blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
+
+  bufferlist fnode_bl;
+  blank_fnode.encode(fnode_bl);
+
+  librados::ObjectWriteOperation op;
+  if (read_version) {
+    // Case A: the object exists but was unreadable.  Guard the write
+    // with the version we observed, so that two data-scan processes
+    // cannot both believe they (re)created the frag.
+    ceph_assert(r == -EINVAL);
+    op.assert_version(read_version);
+  } else {
+    // Case B: the object was absent when we read.  An exclusive create
+    // lets us populate *created correctly: either we made the object,
+    // or somebody else beat us to it.
+    ceph_assert(r == -ENOENT);
+    op.create(true);
+  }
+  op.omap_set_header(fnode_bl);
+
+  object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
+  r = metadata_io.operate(frag_oid.name, &op);
+
+  if (r == -EOVERFLOW || r == -EEXIST) {
+    // Lost the race: version assertion (case A) or exclusive create
+    // (case B) failed because someone else wrote the object.
+    dout(10) << "Dirfrag creation race: 0x" << std::hex
+      << ino << " " << fragment << std::dec << dendl;
+    *created = false;
+    return 0;
+  }
+
+  if (r < 0) {
+    // Could neither create nor overwrite the object
+    derr << "Failed to create dirfrag 0x" << std::hex
+      << ino << std::dec << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // The dirfrag object now exists and carries a valid header
+  dout(10) << "Created dirfrag: 0x" << std::hex
+    << ino << std::dec << dendl;
+  *created = true;
+  return 0;
+}
+
+/**
+ * Write a primary dentry for `inode` named `dname` into the omap of the
+ * dirfrag object identified by (dir_ino, fragment).
+ *
+ * The omap value is the encoding of dnfirst, then the character 'I',
+ * then the bare-encoded inode.
+ *
+ * @returns 0 on success, else the error returned by the omap write
+ */
+int MetadataDriver::inject_linkage(
+    inodeno_t dir_ino, const std::string &dname,
+    const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
+{
+  // Value: dnfirst + 'I' + bare inode
+  bufferlist dentry_bl;
+  encode(dnfirst, dentry_bl);
+  encode('I', dentry_bl);
+  inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+  // Key: the dentry name at CEPH_NOSNAP
+  dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+  std::string key;
+  dn_key.encode(key);
+
+  std::map<std::string, bufferlist> vals;
+  vals[key] = dentry_bl;
+
+  // Write out to the dirfrag object
+  object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
+  const int r = metadata_io.omap_set(frag_oid.name, vals);
+  if (r == 0) {
+    dout(20) << "Injected dentry 0x" << std::hex
+      << dir_ino << "/" << dname << " pointing to 0x"
+      << inode.inode->ino << std::dec << dendl;
+    return 0;
+  }
+
+  derr << "Error writing dentry 0x" << std::hex
+    << dir_ino << std::dec << "/"
+    << dname << ": " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+
+/**
+ * Open an IoCtx on the metadata pool.
+ *
+ * @param metadata_pool_name in/out: if non-empty it is used as given;
+ *        if empty it is filled in by resolving the metadata pool ID of
+ *        filesystem `fscid` from `fsmap`.
+ * @returns result of creating the IoCtx, or the pool lookup error
+ */
+int MetadataDriver::init(
+  librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+  fs_cluster_id_t fscid)
+{
+  if (!metadata_pool_name.empty()) {
+    // Caller named the pool explicitly
+    dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
+  } else {
+    // Look the pool up via the filesystem's MDS map
+    auto fs =  fsmap->get_filesystem(fscid);
+    ceph_assert(fs != nullptr);
+    int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+    dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+    const int lookup_r =
+      rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+    if (lookup_r < 0) {
+      derr << "Pool " << metadata_pool_id
+           << " identified in MDS map not found in RADOS!" << dendl;
+      return lookup_r;
+    }
+    dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
+  }
+
+  return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+}
+
+// RecoveryDriver::init implementation for the local-file driver.
+// All arguments are ignored and 0 is always returned.
+int LocalFileDriver::init(
+  librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+  fs_cluster_id_t fscid)
+{
+  return 0;
+}
+
+/**
+ * Copy the contents of a file out of the data pool into a local file.
+ *
+ * Objects are named "<ino hex>.<index as 8 hex digits>" and each one is
+ * read from offset 0 for up to chunk_size bytes, then written to the
+ * local file at byte offset index * chunk_size.  Objects that do not
+ * exist (-ENOENT) are skipped.
+ *
+ * @param file_path  local path to create/truncate and write to
+ * @param size       logical file size; objects covering [0, size) are read
+ * @param chunk_size object size used to split the file; must be non-zero
+ * @param ino        inode number used to build the object names
+ * @returns 0 on success, -EINVAL if chunk_size is zero, -EIO if the
+ *          local file cannot be opened or written, else the negative
+ *          error from the object read
+ */
+int LocalFileDriver::inject_data(
+    const std::string &file_path,
+    uint64_t size,
+    uint32_t chunk_size,
+    inodeno_t ino)
+{
+  // A zero chunk size would divide by zero below and never advance
+  if (chunk_size == 0) {
+    derr << "invalid zero object size for inode 0x" << std::hex
+      << ino << std::dec << dendl;
+    return -EINVAL;
+  }
+
+  // Scrape the file contents out of the data pool and into the
+  // local filesystem
+  std::fstream f;
+  f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
+  if (!f.is_open()) {
+    derr << "error opening '" << file_path << "' for writing" << dendl;
+    return -EIO;
+  }
+
+  for (uint64_t offset = 0; offset < size; offset += chunk_size) {
+    bufferlist bl;
+
+    // Up to 16 hex digits + '.' + up to 16 hex digits + NUL
+    char buf[40];
+    snprintf(buf, sizeof(buf),
+        "%llx.%08llx",
+        (unsigned long long)ino,
+        (unsigned long long)(offset / chunk_size));
+    std::string oid(buf);
+
+    int r = data_io.read(oid, bl, chunk_size, 0);
+    if (r == -ENOENT) {
+      // No object for this chunk: leave a hole in the output
+      continue;
+    }
+    if (r < 0) {
+      derr << "error reading data object '" << oid << "': "
+        << cpp_strerror(r) << dendl;
+      f.close();
+      return r;
+    }
+
+    // r >= 0: a zero-length read is not an error, it just writes nothing
+    f.seekp(offset);
+    bl.write_stream(f);
+    if (!f) {
+      derr << "error writing '" << file_path << "'" << dendl;
+      f.close();
+      return -EIO;
+    }
+  }
+  f.close();
+
+  return 0;
+}
+
+
+/**
+ * Recreate a file on the local filesystem under `path`, using the
+ * names recorded in its backtrace.
+ *
+ * The ancestors are walked in reverse order, appending each dname to
+ * the path.  Every component except the last is created as a
+ * directory; the last one is the file itself, whose data is pulled
+ * from the data pool.
+ *
+ * @returns 0 on success, -errno if a directory cannot be created (an
+ *          already-existing directory is not an error), or the error
+ *          from inject_data
+ */
+int LocalFileDriver::inject_with_backtrace(
+    const inode_backtrace_t &bt,
+    const InodeStore &dentry)
+{
+  std::string path_builder = path;
+
+  // Iterate through backtrace creating directory parents
+  std::vector<inode_backpointer_t>::const_reverse_iterator i;
+  for (i = bt.ancestors.rbegin();
+      i != bt.ancestors.rend(); ++i) {
+
+    const inode_backpointer_t &backptr = *i;
+    path_builder += "/";
+    path_builder += backptr.dname;
+
+    // Last entry is the filename itself
+    bool is_file = (i + 1 == bt.ancestors.rend());
+    if (is_file) {
+      // FIXME: inject_data won't cope with interesting (i.e. striped)
+      // layouts (need a librados-compatible Filer to read these)
+      int r = inject_data(path_builder, dentry.inode->size,
+                          dentry.inode->layout.object_size, bt.ino);
+      if (r < 0) {
+        return r;
+      }
+    } else {
+      // mkdir() reports failure as -1 with the reason in errno.  A
+      // directory left over from an earlier file is expected.
+      if (mkdir(path_builder.c_str(), 0755) != 0 && errno != EEXIST) {
+        const int err = errno;
+        derr << "error creating directory: '" << path_builder << "': "
+          << cpp_strerror(err) << dendl;
+        return -err;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Recreate a file under "<path>/lost+found", named after its inode
+ * number, when no usable location is known for it.
+ *
+ * @returns 0 on success, -errno if lost+found cannot be created (it
+ *          already existing is not an error), or the error from
+ *          inject_data
+ */
+int LocalFileDriver::inject_lost_and_found(
+    inodeno_t ino,
+    const InodeStore &dentry)
+{
+  std::string lf_path = path + "/lost+found";
+
+  // mkdir() reports failure as -1 with the reason in errno.  The
+  // directory will already exist after the first lost file.
+  if (mkdir(lf_path.c_str(), 0755) != 0 && errno != EEXIST) {
+    const int err = errno;
+    derr << "error creating directory: '" << lf_path << "': "
+      << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  std::string file_path = lf_path + "/" + lost_found_dname(ino);
+  return inject_data(file_path, dentry.inode->size,
+                     dentry.inode->layout.object_size, ino);
+}
+
+/**
+ * Make sure the output directory `path` exists, creating it if
+ * check_roots reports that it does not.
+ *
+ * @param data_pool_id unused by this driver
+ * @returns 0 on success, else the check_roots error or -errno from
+ *          the failed mkdir
+ */
+int LocalFileDriver::init_roots(int64_t data_pool_id)
+{
+  // Ensure that the path exists and is a directory
+  bool exists;
+  int r = check_roots(&exists);
+  if (r != 0) {
+    return r;
+  }
+
+  if (exists) {
+    return 0;
+  }
+
+  // mkdir() returns -1 on failure; translate that into -errno so the
+  // caller does not mistake it for -EPERM.
+  if (::mkdir(path.c_str(), 0755) != 0) {
+    const int err = errno;
+    derr << "error creating directory: '" << path << "': "
+      << cpp_strerror(err) << dendl;
+    return -err;
+  }
+  return 0;
+}
+
+/**
+ * Report whether `path` can be opened as a directory.
+ *
+ * @param result out: true if opendir and the following closedir both
+ *               succeed, else false
+ * @returns always 0
+ */
+int LocalFileDriver::check_roots(bool *result)
+{
+  DIR *dir = ::opendir(path.c_str());
+
+  // A closedir failure is weird, but maybe possible with e.g. a stale
+  // FD on an NFS mount; treat it the same as "not there".
+  *result = (dir != NULL) && (closedir(dir) == 0);
+
+  return 0;
+}
+
+/**
+ * Fill `out` with a synthetic inode describing a regular file, or a
+ * symlink when `symlink` is non-empty.
+ *
+ * Size, timestamps and layout come from the arguments; ownership comes
+ * from the mds_root_ino_uid/gid config values.
+ */
+void MetadataTool::build_file_dentry(
+    inodeno_t ino, uint64_t file_size, time_t file_mtime,
+    const file_layout_t &layout, InodeStore *out, std::string symlink)
+{
+  ceph_assert(out != NULL);
+
+  auto inode = out->get_inode();
+
+  // Identity and versioning
+  inode->ino = ino;
+  inode->nlink = 1;
+  inode->version = 1;
+  inode->backtrace_version = 1;
+
+  // Type and permissions
+  if (symlink.empty()) {
+    inode->mode = 0500 | S_IFREG;
+  } else {
+    inode->mode = 0777 | S_IFLNK;
+    out->symlink = symlink;
+  }
+  inode->uid = g_conf()->mds_root_ino_uid;
+  inode->gid = g_conf()->mds_root_ino_gid;
+
+  // Size and data placement
+  inode->size = file_size;
+  inode->max_size_ever = file_size;
+  inode->layout = layout;
+  inode->truncate_seq = 1;
+  inode->truncate_size = -1ull;
+  inode->inline_data.version = CEPH_INLINE_NONE;
+
+  // All three timestamps are set from the recovered mtime
+  inode->mtime.tv.tv_sec = file_mtime;
+  inode->atime.tv.tv_sec = file_mtime;
+  inode->ctime.tv.tv_sec = file_mtime;
+}
+
+/**
+ * Fill `out` with a synthetic inode describing a directory.
+ *
+ * dirstat and the timestamps come from `fragstat`; ownership and the
+ * directory hash come from config values.
+ */
+void MetadataTool::build_dir_dentry(
+    inodeno_t ino, const frag_info_t &fragstat,
+    const file_layout_t &layout, InodeStore *out)
+{
+  ceph_assert(out != NULL);
+
+  auto inode = out->get_inode();
+
+  // Identity and versioning
+  inode->ino = ino;
+  inode->nlink = 1;
+  inode->version = 1;
+  inode->backtrace_version = 1;
+
+  // Type, permissions and ownership
+  inode->mode = 0755 | S_IFDIR;
+  inode->uid = g_conf()->mds_root_ino_uid;
+  inode->gid = g_conf()->mds_root_ino_gid;
+
+  // Directory statistics; all timestamps follow the fragstat mtime
+  inode->dirstat = fragstat;
+  inode->mtime.tv.tv_sec = fragstat.mtime;
+  inode->atime.tv.tv_sec = fragstat.mtime;
+  inode->ctime.tv.tv_sec = fragstat.mtime;
+
+  // Layouts
+  inode->layout = layout;
+  inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+  inode->truncate_seq = 1;
+  inode->truncate_size = -1ull;
+  inode->inline_data.version = CEPH_INLINE_NONE;
+}
+
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
new file mode 100644
index 000000000..4b8f34bf6
--- /dev/null
+++ b/src/tools/cephfs/DataScan.h
@@ -0,0 +1,344 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include "MDSUtility.h"
+#include "include/rados/librados.hpp"
+
+class InodeStore;
+class MDSTable;
+
+/**
+ * Abstract interface for a destination that recovered inodes and
+ * dentries are injected into.
+ */
+class RecoveryDriver {
+  protected:
+    // When set, structures that fail to decode are overwritten.
+    bool force_corrupt = false;
+
+    // When set, init_roots overwrites root objects even if they
+    // already exist.
+    bool force_init = false;
+
+  public:
+    RecoveryDriver() = default;
+
+    virtual ~RecoveryDriver() = default;
+
+    virtual int init(
+        librados::Rados &rados,
+        std::string &metadata_pool_name,
+        const FSMap *fsmap,
+        fs_cluster_id_t fscid) = 0;
+
+    void set_force_corrupt(const bool val) { force_corrupt = val; }
+
+    void set_force_init(const bool val) { force_init = val; }
+
+    /**
+     * Inject an inode, plus the dentries of its parents, using a
+     * backtrace that was recovered from the data pool.
+     */
+    virtual int inject_with_backtrace(
+        const inode_backtrace_t &bt,
+        const InodeStore &dentry) = 0;
+
+    /**
+     * Inject an inode and a dentry for it into lost+found, for the
+     * case where the inode number is all that is known about a file.
+     */
+    virtual int inject_lost_and_found(
+        inodeno_t ino,
+        const InodeStore &dentry) = 0;
+
+    /**
+     * Create whichever roots are missing (i.e. mydir, strays, root
+     * inode).
+     */
+    virtual int init_roots(int64_t data_pool_id) = 0;
+
+    /**
+     * Check, before injecting anything, that all the roots exist in
+     * the metadata pool.  This keeps parallel workers from interfering
+     * with one another: the user is cued to run 'init' on a single
+     * node before starting a parallel scan.
+     *
+     * @param result set to true if the roots are present, else false
+     * @returns 0 unless an unexpected error occurred, in which case an
+     *          error code.  Missing objects do not count as an
+     *          unexpected error: inspect *result for that.
+     */
+    virtual int check_roots(bool *result) = 0;
+
+    /**
+     * Compose the dname used to link an inode into lost+found: the
+     * inode number in lowercase hexadecimal.
+     */
+    std::string lost_found_dname(inodeno_t ino)
+    {
+      char name[20];
+      snprintf(name, sizeof(name), "%llx", (unsigned long long)ino);
+      return std::string(name);
+    }
+};
+
+/**
+ * RecoveryDriver that writes recovered files into a directory on the
+ * local filesystem, reading their contents through a data pool IoCtx.
+ */
+class LocalFileDriver : public RecoveryDriver
+{
+  protected:
+    // Root of the local output tree
+    const std::string path;
+    // IoCtx that file data objects are read from
+    librados::IoCtx &data_io;
+
+    // Copy the data objects of inode `ino` into the file `file_path`
+    int inject_data(const std::string &file_path, uint64_t size,
+                    uint32_t chunk_size, inodeno_t ino);
+
+  public:
+    LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_)
+      : RecoveryDriver(),
+        path(path_),
+        data_io(data_io_)
+    {
+    }
+
+    // RecoveryDriver interface
+    int init(librados::Rados &rados, std::string &metadata_pool_name,
+             const FSMap *fsmap, fs_cluster_id_t fscid) override;
+
+    int inject_with_backtrace(const inode_backtrace_t &bt,
+                              const InodeStore &dentry) override;
+
+    int inject_lost_and_found(inodeno_t ino,
+                              const InodeStore &dentry) override;
+
+    int init_roots(int64_t data_pool_id) override;
+
+    int check_roots(bool *result) override;
+};
+
+/**
+ * Helper base class for code that reads and synthesizes objects in a
+ * CephFS metadata pool.
+ */
+class MetadataTool
+{
+  protected:
+
+  // IoCtx on the metadata pool
+  librados::IoCtx metadata_io;
+
+  /**
+   * Build a synthetic InodeStore describing a normal file (or a
+   * symlink, per `symlink`).
+   */
+  void build_file_dentry(inodeno_t ino, uint64_t file_size,
+                         time_t file_mtime, const file_layout_t &layout,
+                         InodeStore *out, std::string symlink);
+
+  /**
+   * Build a synthetic InodeStore describing a directory.
+   */
+  void build_dir_dentry(inodeno_t ino, const frag_info_t &fragstat,
+                        const file_layout_t &layout, InodeStore *out);
+
+  /**
+   * Attempt to read the fnode of a dirfrag.
+   */
+  int read_fnode(inodeno_t ino, frag_t frag,
+                 fnode_t *fnode, uint64_t *read_version);
+
+  /**
+   * Attempt to read one dentry out of a dirfrag.
+   */
+  int read_dentry(inodeno_t parent_ino, frag_t frag,
+                  const std::string &dname, InodeStore *inode,
+                  snapid_t *dnfirst=nullptr);
+};
+
+/**
+ * RecoveryDriver that injects recovered metadata directly into a
+ * CephFS metadata pool.
+ */
+class MetadataDriver : public RecoveryDriver, public MetadataTool
+{
+  protected:
+    /**
+     * Create a .inode object, i.e. root or mydir
+     */
+    int inject_unlinked_inode(inodeno_t inono, int mode,
+                              int64_t data_pool_id);
+
+    /**
+     * Test whether a .inode object exists; used before going ahead
+     * with metadata injection.
+     */
+    int root_exists(inodeno_t ino, bool *result);
+
+    int find_or_create_dirfrag(inodeno_t ino, frag_t fragment,
+                               bool *created);
+
+    /**
+     * Determine which fragment of a directory a named dentry belongs
+     * in, recursing up the trace as needed to fetch fragtrees.
+     */
+    int get_frag_of(inodeno_t dirino, const std::string &dname,
+                    frag_t *result_ft);
+
+  public:
+
+    // RecoveryDriver interface
+    int init(librados::Rados &rados, std::string &metadata_pool_name,
+             const FSMap *fsmap, fs_cluster_id_t fscid) override;
+
+    int inject_linkage(inodeno_t dir_ino, const std::string &dname,
+                       const frag_t fragment, const InodeStore &inode,
+                       snapid_t dnfirst=CEPH_NOSNAP);
+
+    int inject_with_backtrace(const inode_backtrace_t &bt,
+                              const InodeStore &dentry) override;
+
+    int inject_lost_and_found(inodeno_t ino,
+                              const InodeStore &dentry) override;
+
+    int init_roots(int64_t data_pool_id) override;
+
+    int check_roots(bool *result) override;
+
+    int load_table(MDSTable *table);
+    int save_table(MDSTable *table);
+};
+
+/**
+ * Command-line tool state for scanning CephFS pools and injecting
+ * recovered metadata through a RecoveryDriver.
+ */
+class DataScan : public MDSUtility, public MetadataTool
+{
+  protected:
+    // Destination driver; deleted in the destructor
+    RecoveryDriver *driver;
+    fs_cluster_id_t fscid;
+
+    std::string metadata_pool_name;
+    std::vector<int64_t> data_pools;
+
+    // Data pool IoCtx (file backtraces are scraped from here)
+    librados::IoCtx data_io;
+    // Data pool ID, remembered for use in layouts
+    int64_t data_pool_id;
+    // IoCtxs for any extra data pools
+    std::vector<librados::IoCtx> extra_data_ios;
+
+    // NOTE(review): presumably this worker's index (n) and the total
+    // worker count (m) -- confirm against the argument parsing.
+    uint32_t n;
+    uint32_t m;
+
+    /**
+     * Scan the data pool for backtraces and inject the inodes into
+     * the metadata pool
+     */
+    int scan_inodes();
+
+    /**
+     * Scan the data pool for file sizes and mtimes
+     */
+    int scan_extents();
+
+    /**
+     * Scan the metadata pool for 0th dirfrags, to link orphaned
+     * directory inodes.
+     */
+    int scan_frags();
+
+    /**
+     * Clean up xattrs from the data pool
+     */
+    int cleanup();
+
+    /**
+     * Test whether an inode number lies in the permitted ranges
+     */
+    bool valid_ino(inodeno_t ino) const;
+
+    int scan_links();
+
+    // Accept pools that do not appear in the FSMap
+    bool force_pool;
+    // On decode errors, overwrite the offending structure
+    bool force_corrupt;
+    // Overwrite root objects even when they already exist
+    bool force_init;
+    // Only scan inodes that do not carry this scrub tag
+    std::string filter_tag;
+
+    /**
+     * @param r set to an error when a valid key has an invalid value
+     * @return true if the argument was consumed, else false
+     */
+    bool parse_kwarg(const std::vector<const char*> &args,
+                     std::vector<const char *>::const_iterator &i,
+                     int *r);
+
+    /**
+     * @return true if the argument was consumed, else false
+     */
+    bool parse_arg(const std::vector<const char*> &arg,
+                   std::vector<const char *>::const_iterator &i);
+
+    int probe_filter(librados::IoCtx &ioctx);
+
+    /**
+     * Call `handler` for every object in the ioctx's pool.  With
+     * `untagged_only`, only objects with a 00000000 offset and no
+     * matching scrub tag are visited.
+     */
+    int forall_objects(
+        librados::IoCtx &ioctx,
+        bool untagged_only,
+        std::function<int(std::string, uint64_t, uint64_t)> handler);
+
+  public:
+    static void usage();
+    int main(const std::vector<const char *> &args);
+
+    DataScan()
+      : driver(NULL),
+        fscid(FS_CLUSTER_ID_NONE),
+        data_pool_id(-1),
+        n(0),
+        m(1),
+        force_pool(false),
+        force_corrupt(false),
+        force_init(false)
+    {
+    }
+
+    ~DataScan() override
+    {
+      delete driver;
+    }
+};
+
diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc
new file mode 100644
index 000000000..68a190182
--- /dev/null
+++ b/src/tools/cephfs/Dumper.cc
@@ -0,0 +1,433 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H   // make gcc 4.3 shut up about hash_*
+#endif
+
+#include "include/compat.h"
+#include "include/fs_types.h"
+#include "common/entity_name.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/JournalPointer.h"
+#include "osdc/Journaler.h"
+#include "mon/MonClient.h"
+
+#include "Dumper.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+#define HEADER_LEN 4096
+
+using namespace std;
+
+/**
+ * Initialise the underlying MDSUtility and work out which journal
+ * inode to operate on for the given role.
+ *
+ * @param role_ filesystem/rank whose journal is wanted
+ * @param type  "mdlog" (inode taken from the rank's JournalPointer) or
+ *              "purge_queue"; anything else aborts
+ * @returns 0 on success, else the MDSUtility::init or journal pointer
+ *          load error
+ */
+int Dumper::init(mds_role_t role_, const std::string &type)
+{
+  role = role_;
+
+  const int init_r = MDSUtility::init();
+  if (init_r < 0) {
+    return init_r;
+  }
+
+  auto fs =  fsmap->get_filesystem(role.fscid);
+  ceph_assert(fs != nullptr);
+
+  if (type == "purge_queue") {
+    ino = MDS_INO_PURGE_QUEUE + role.rank;
+  } else if (type == "mdlog") {
+    JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+    const int jp_load_result = jp.load(objecter);
+    if (jp_load_result != 0) {
+      std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl;
+      return jp_load_result;
+    }
+    ino = jp.front;
+  } else {
+    ceph_abort(); // should not get here 
+  }
+
+  return 0;
+}
+
+
+/**
+ * Start recovery on `journaler` while holding `lock`, then wait for it
+ * to complete.
+ *
+ * @returns 0 if recovery finished with a non-negative result, else
+ *          the negative result
+ */
+int Dumper::recover_journal(Journaler *journaler)
+{
+  C_SaferCond cond;
+
+  // Only the submission happens under the lock; the wait does not
+  lock.lock();
+  journaler->recover(&cond);
+  lock.unlock();
+
+  const int r = cond.wait();
+  if (r >= 0) {
+    dout(10) << "completed journal recovery" << dendl;
+    return 0;
+  }
+
+  // Error
+  derr << "error on recovery: " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+
+/**
+ * Dump the journal selected by init() into a local file.
+ *
+ * The file starts with a HEADER_LEN-byte, zero-padded text header
+ * describing the journal (offsets, layout, fsid).  The journal bytes
+ * between the recovered read and write positions are then written at
+ * the same offset in the file as they have in the journal, which is
+ * why the result is reported to the user as a sparse file.
+ *
+ * @param dump_file path of the file to create (truncated if present)
+ * @returns 0 on success, otherwise the error from the failing step.
+ *          NOTE(review): the lseek64/close/open failure paths return
+ *          errno unchanged (a positive value), while the other paths
+ *          return r as produced by the failing call -- confirm that
+ *          callers only test for non-zero.
+ */
+int Dumper::dump(const char *dump_file)
+{
+  int r = 0;
+
+  auto fs =  fsmap->get_filesystem(role.fscid);
+  ceph_assert(fs != nullptr);
+
+  Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(),
+                      CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+                      &finisher);
+  r = recover_journal(&journaler);
+  if (r) {
+    return r;
+  }
+  // The region to dump is [read_pos, write_pos)
+  uint64_t start = journaler.get_read_pos();
+  uint64_t end = journaler.get_write_pos();
+  uint64_t len = end-start;
+
+  Filer filer(objecter, &finisher);
+
+  cout << "journal is " << start << "~" << len << std::endl;
+
+  int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644);
+  if (fd >= 0) {
+    // include an informative header
+    uuid_d fsid = monc->get_fsid();
+    char fsid_str[40];
+    fsid.print(fsid_str);
+    char buf[HEADER_LEN];
+    memset(buf, 0, sizeof(buf));
+    // The text ends with character code 4 (the final %c argument); the
+    // rest of the buffer stays zero-filled.
+    snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\
+       length %llu (0x%llx)\n    write_pos %llu (0x%llx)\n    format %llu\n\
+       trimmed_pos %llu (0x%llx)\n    stripe_unit %lu (0x%lx)\n    stripe_count %lu (0x%lx)\n\
+       object_size %lu (0x%lx)\n    fsid %s\n%c",
+	    role.rank, 
+	    (unsigned long long)start, (unsigned long long)start,
+	    (unsigned long long)len, (unsigned long long)len,
+	    (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos,
+	    (unsigned long long)journaler.last_committed.stream_format,
+	    (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos,
+            (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit,
+            (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count,
+            (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size,
+	    fsid_str,
+	    4);
+    // The whole HEADER_LEN buffer is written, not just the text
+    r = safe_write(fd, buf, sizeof(buf));
+    if (r) {
+      derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl;
+      ::close(fd);
+      return r;
+    }
+
+    // write the data
+    // Position the file at the journal's own start offset, so file
+    // offsets match journal offsets.
+    off64_t seeked = ::lseek64(fd, start, SEEK_SET);
+    if (seeked == (off64_t)-1) {
+      r = errno;
+      derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl;
+      ::close(fd);
+      return r;
+    }
+
+
+    // Read and write 32MB chunks.  Slower than it could be because we're not
+    // streaming, but that's okay because this is just a debug/disaster tool.
+    const uint32_t chunk_size = 32 * 1024 * 1024;
+
+    for (uint64_t pos = start; pos < start + len; pos += chunk_size) {
+      bufferlist bl;
+      dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl;
+
+      // The final chunk is clipped to the journal's write position
+      const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos);
+
+      // Submit the read under the lock, then wait for it outside
+      C_SaferCond cond;
+      lock.lock();
+      filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP,
+                 pos, read_size, &bl, 0, &cond);
+      lock.unlock();
+      r = cond.wait();
+      if (r < 0) {
+        derr << "Error " << r << " (" << cpp_strerror(r) << ") reading "
+                "journal at offset 0x" << std::hex << pos << std::dec << dendl;
+        ::close(fd);
+        return r;
+      }
+      dout(10) << "Got 0x" << std::hex << bl.length() << std::dec
+               << " bytes" << dendl;
+
+      r = bl.write_fd(fd);
+      if (r) {
+        derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl;
+        ::close(fd);
+        return r;
+      }
+    }
+
+    r = ::close(fd);
+    if (r) {
+      r = errno;
+      derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl;
+      return r;
+    }
+
+    cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n"
+	 << "NOTE: this is a _sparse_ file; you can\n"
+	 << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n"
+	 << "      to efficiently compress it while preserving sparseness." << std::endl;
+    return 0;
+  } else {
+    // open() failed
+    int err = errno;
+    derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl;
+    return err;
+  }
+}
+
+int Dumper::undump(const char *dump_file, bool force)
+{
+  cout << "undump " << dump_file << std::endl;
+  
+  auto fs =  fsmap->get_filesystem(role.fscid);
+  ceph_assert(fs != nullptr);
+
+  int r = 0;
+  // try get layout info from cluster
+  Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(),
+                      CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+                      &finisher);
+  int recovered = recover_journal(&journaler);
+  if (recovered != 0) {
+    derr << "recover_journal failed, try to get header from dump file " << dendl;
+  }
+
+  int fd = ::open(dump_file, O_RDONLY|O_BINARY);
+  if (fd < 0) {
+    r = errno;
+    derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // Ceph mds0 journal dump
+  //  start offset 232401996 (0xdda2c4c)
+  //        length 1097504 (0x10bf20)
+
+  char buf[HEADER_LEN];
+  r = safe_read(fd, buf, sizeof(buf));
+  if (r < 0) {
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return r;
+  }
+
+  long long unsigned start, len, write_pos, format, trimmed_pos;
+  long unsigned stripe_unit, stripe_count, object_size;
+  sscanf(strstr(buf, "start offset"), "start offset %llu", &start);
+  sscanf(strstr(buf, "length"), "length %llu", &len);
+  sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos);
+  sscanf(strstr(buf, "format"), "format %llu", &format);
+
+  if (!force) {
+    // need to check if fsid match onlien cluster fsid
+    if (strstr(buf, "fsid")) {
+      uuid_d fsid;
+      char fsid_str[40];
+      sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str);
+      r = fsid.parse(fsid_str);
+      if (!r) {
+	derr  << "Invalid fsid" << dendl;
+	::close(fd);
+	return -EINVAL;
+      }
+
+      if (fsid != monc->get_fsid()) {
+	derr << "Imported journal fsid does not match online cluster fsid" << dendl;
+	derr << "Use --force to skip fsid check" << dendl;
+	::close(fd);
+	return -EINVAL;
+      }
+    } else {
+      derr  << "Invalid header, no fsid embeded" << dendl;
+      ::close(fd);
+      return -EINVAL;
+    }
+  }
+
+  if (recovered == 0) {
+    stripe_unit = journaler.last_committed.layout.stripe_unit;
+    stripe_count = journaler.last_committed.layout.stripe_count;
+    object_size = journaler.last_committed.layout.object_size;
+  } else {
+    // try to get layout from dump file header, if failed set layout to default
+    if (strstr(buf, "stripe_unit")) {
+      sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit);
+    } else {
+      stripe_unit = file_layout_t::get_default().stripe_unit;
+    }
+    if (strstr(buf, "stripe_count")) {
+      sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count);
+    } else {
+      stripe_count = file_layout_t::get_default().stripe_count;
+    }
+    if (strstr(buf, "object_size")) {
+      sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size);
+    } else {
+      object_size = file_layout_t::get_default().object_size;
+    }
+  }
+
+  if (strstr(buf, "trimmed_pos")) {
+    sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos);
+  } else {
+    // Old format dump, any untrimmed objects before expire_pos will
+    // be discarded as trash.
+    trimmed_pos = start - (start % object_size);
+  }
+
+  if (trimmed_pos > start) {
+    derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos
+      << " > expire 0x" << start << std::dec << dendl;
+    ::close(fd);
+    return -EINVAL;
+  }
+
+  if (start > write_pos) {
+    derr << std::hex << "Invalid header (expire 0x" << start
+      << " > write 0x" << write_pos << std::dec << dendl;
+    ::close(fd);
+    return -EINVAL;
+  }
+
+  cout << "start " << start <<
+    " len " << len <<
+    " write_pos " << write_pos <<
+    " format " << format <<
+    " trimmed_pos " << trimmed_pos <<
+    " stripe_unit " << stripe_unit <<
+    " stripe_count " << stripe_count <<
+    " object_size " << object_size << std::endl;
+  
+  Journaler::Header h;
+  h.trimmed_pos = trimmed_pos;
+  h.expire_pos = start;
+  h.write_pos = write_pos;
+  h.stream_format = format;
+  h.magic = CEPH_FS_ONDISK_MAGIC;
+
+  h.layout.stripe_unit = stripe_unit;
+  h.layout.stripe_count = stripe_count;
+  h.layout.object_size = object_size;
+  h.layout.pool_id = fs->mds_map.get_metadata_pool();
+  
+  bufferlist hbl;
+  encode(h, hbl);
+
+  object_t oid = file_object_t(ino, 0);
+  object_locator_t oloc(fs->mds_map.get_metadata_pool());
+  SnapContext snapc;
+
+  cout << "writing header " << oid << std::endl;
+  C_SaferCond header_cond;
+  lock.lock();
+  objecter->write_full(oid, oloc, snapc, hbl,
+		       ceph::real_clock::now(), 0,
+		       &header_cond);
+  lock.unlock();
+
+  r = header_cond.wait();
+  if (r != 0) {
+    derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+    ::close(fd);
+    return r;
+  }
+
+  Filer filer(objecter, &finisher);
+
+  /* Erase any objects at the end of the region to which we shall write
+   * the new log data.  This is to avoid leaving trailing junk after
+   * the newly written data.  Any junk more than one object ahead
+   * will be taken care of during normal operation by Journaler's
+   * prezeroing behaviour */
+  {
+    uint32_t const object_size = h.layout.object_size;
+    ceph_assert(object_size > 0);
+    uint64_t last_obj = h.write_pos / object_size;
+    uint64_t purge_count = 2;
+    /* When the length is zero, the last_obj should be zeroed 
+     * from the offset determined by the new write_pos instead of being purged.
+     */
+    if (!len) {
+        purge_count = 1;
+        ++last_obj;
+    }
+    C_SaferCond purge_cond;
+    cout << "Purging " << purge_count << " objects from " << last_obj << std::endl;
+    lock.lock();
+    filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count,
+		      ceph::real_clock::now(), 0, &purge_cond);
+    lock.unlock();
+    purge_cond.wait();
+  }
+  /* When the length is zero, zero the last object 
+   * from the offset determined by the new write_pos.
+   */
+  if (!len) {
+    uint64_t offset_in_obj = h.write_pos % h.layout.object_size;
+    uint64_t len           = h.layout.object_size - offset_in_obj;
+    C_SaferCond zero_cond;
+    cout << "Zeroing " << len << " bytes in the last object." << std::endl;
+    
+    lock.lock();
+    filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond);
+    lock.unlock();
+    zero_cond.wait();
+  }
+
+  // Stream from `fd` to `filer`
+  uint64_t pos = start;
+  uint64_t left = len;
+  while (left > 0) {
+    // Read
+    bufferlist j;
+    lseek64(fd, pos, SEEK_SET);
+    uint64_t l = std::min<uint64_t>(left, 1024*1024);
+    j.read_fd(fd, l);
+
+    // Write
+    cout << " writing " << pos << "~" << l << std::endl;
+    C_SaferCond write_cond;
+    lock.lock();
+    filer.write(ino, &h.layout, snapc, pos, l, j,
+		ceph::real_clock::now(), 0, &write_cond);
+    lock.unlock();
+
+    r = write_cond.wait();
+    if (r != 0) {
+      derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+      ::close(fd);
+      return r;
+    }
+      
+    // Advance
+    pos += l;
+    left -= l;
+  }
+
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  cout << "done." << std::endl;
+  return 0;
+}
+
diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h
new file mode 100644
index 000000000..758f3cdea
--- /dev/null
+++ b/src/tools/cephfs/Dumper.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef JOURNAL_DUMPER_H_
+#define JOURNAL_DUMPER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you dump out an mds journal for troubleshooting or whatever.
+ *
+ * It was built to work with cmds so some of the design choices are random.
+ * To use, create a Dumper, call init(), and then call dump() with the name
+ * of the file to dump to.
+ */
+
+class Dumper : public MDSUtility {
+private:
+  mds_role_t role;  // role whose journal is handled; undump() uses role.fscid to pick the filesystem
+  inodeno_t ino;  // inode number of the journal; undump() names journal objects from it
+
+public:
+  Dumper() : ino(-1)  // NOTE(review): -1 looks like a "not set yet" placeholder until init() runs -- confirm
+  {}
+
+  int init(mds_role_t role_, const std::string &type);  // NOTE(review): presumably records the role and resolves `ino` for the journal type -- confirm in Dumper.cc
+  int recover_journal(Journaler *journaler);  // undump() treats a return value of 0 as success
+  int dump(const char *dumpfile);  // dumpfile: name of the file to dump the journal to
+  int undump(const char *dumpfile, bool force);  // re-import a dump file; force skips the fsid check against the cluster
+};
+
+#endif /* JOURNAL_DUMPER_H_ */
diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc
new file mode 100644
index 000000000..8cb235a82
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.cc
@@ -0,0 +1,153 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+
+#include <iostream>
+#include <fstream>
+
+#include "common/errno.h"
+#include "mds/mdstypes.h"
+#include "mds/events/EUpdate.h"
+#include "mds/LogEvent.h"
+#include "JournalScanner.h"
+
+#include "EventOutput.h"
+
+
+int EventOutput::binary() const
+{
+  // Create the output directory; one that already exists is reused as-is
+  if (::mkdir(path.c_str(), 0755) != 0) {
+    int const err = -errno;
+    if (err != -EEXIST) {
+      std::cerr << "Error creating output directory: " << cpp_strerror(err) << std::endl;
+      return err;
+    }
+  }
+
+  // One file per scanned entry, named "0x<offset>_<type>.bin"
+  for (const auto& [offset, record] : scan.events) {
+    bufferlist payload;
+    std::stringstream name;
+    if (auto& le = record.log_event; le) {
+      le->encode(payload, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      name << "0x" << std::hex << offset << std::dec << "_" << le->get_type_str() << ".bin";
+    } else if (auto& pi = record.pi; pi) {
+      pi->encode(payload);
+      name << "0x" << std::hex << offset << std::dec << "_" << pi->get_type_str() << ".bin";
+    }
+
+    std::string const target = path + "/" + name.str();
+    std::ofstream sink(target.c_str(), std::ofstream::out | std::ofstream::binary);
+    payload.write_stream(sink);
+    sink.close();
+    if (sink.fail()) {
+      return -EIO;  // the stream recorded a failure while opening, writing or closing
+    }
+  }
+  std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl;
+
+  return 0;
+}
+
+int EventOutput::json() const
+{
+  JSONFormatter fmt(true);
+  std::ofstream sink(path.c_str(), std::ofstream::out);
+  // Top-level array "journal" holding one object per scanned entry
+  fmt.open_array_section("journal");
+  for (const auto& entry : scan.events) {
+    const auto& record = entry.second;
+    if (auto& le = record.log_event; le) {
+      fmt.open_object_section("log_event");
+      le->dump(&fmt);
+      fmt.close_section();  // log_event
+    } else if (auto& pi = record.pi; pi) {
+      fmt.open_object_section("purge_action");
+      pi->dump(&fmt);
+      fmt.close_section();  // purge_action
+    }
+  }
+  fmt.close_section();  // journal
+
+  // Emit the finished document, then check the stream state once
+  fmt.flush(sink);
+  sink.close();
+  if (sink.fail()) {
+    return -EIO;
+  }
+  std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl;
+  return 0;
+}
+
+void EventOutput::list() const
+{
+  // One stdout line per entry (stamp, offset, type); log events also list their metablob paths
+  for (const auto& [offset, record] : scan.events) {
+    if (auto& le = record.log_event; le) {
+      std::vector<std::string> ev_paths;
+      if (EMetaBlob const *emb = le->get_metablob(); emb) {
+        emb->get_paths(ev_paths);
+      }
+      std::string detail;
+      if (le->get_type() == EVENT_UPDATE) {
+        // The type tag identifies the event as an EUpdate, so this is a plain
+        // base-to-derived downcast: static_cast, not reinterpret_cast.
+        detail = static_cast<const EUpdate&>(*le).type;
+      }
+
+      std::cout << le->get_stamp() << " 0x"
+        << std::hex << offset << std::dec << " "
+        << le->get_type_str() << ": "
+        << " (" << detail << ")" << std::endl;
+      for (const auto& ev_path : ev_paths) {
+        std::cout << "  " << ev_path << std::endl;
+      }
+    } else if (auto& pi = record.pi; pi) {
+      std::cout << pi->stamp << " 0x"
+        << std::hex << offset << std::dec << " "
+        << pi->get_type_str() << std::endl;
+    }
+  }
+}
+
+void EventOutput::summary() const
+{
+  // Tally entries by type string; an entry with neither payload counts under ""
+  std::map<std::string, int> tally;
+  for (const auto& entry : scan.events) {
+    const auto& record = entry.second;
+    std::string label;
+    if (auto& le = record.log_event; le) {
+      label = le->get_type_str();
+    } else if (auto& pi = record.pi; pi) {
+      label = pi->get_type_str();
+    }
+    // operator[] value-initialises a missing counter to zero
+    ++tally[label];
+  }
+
+  // Per-type totals, in the map's sorted key order
+  std::cout << "Events by type:" << std::endl;
+  for (const auto& [label, count] : tally) {
+    std::cout << "  " << label << ": " << count << std::endl;
+  }
+
+  // Errors recorded by the scan; an empty map prints just the count line
+  std::cout << "Errors: " << scan.errors.size() << std::endl;
+  for (const auto& [offset, error] : scan.errors) {
+    std::cout << "  0x" << std::hex << offset << std::dec
+              << ": " << error.r << " "
+              << error.description << std::endl;
+  }
+}
diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h
new file mode 100644
index 000000000..65d968409
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.h
@@ -0,0 +1,42 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+
+#ifndef EVENT_OUTPUT_H
+#define EVENT_OUTPUT_H
+
+#include <string>
+
+class JournalScanner;
+
+/**
+ * Different output formats for the results of a journal scan
+ */
+class EventOutput
+{
+  private:
+    JournalScanner const &scan;  // scan results to render; held by reference, so the scanner must outlive this object
+    std::string const path;  // output location: a file for json(), a directory for binary()
+
+  public:
+    EventOutput(JournalScanner const &scan_, std::string const &path_)
+      : scan(scan_), path(path_) {}
+
+    void summary() const;  // per-type entry counts and the scan's error list, to stdout
+    void list() const;  // one line per entry (plus metablob paths for log events), to stdout
+    int json() const;  // write all entries as a JSON array to `path`; returns 0 or -EIO
+    int binary() const;  // write one encoded file per entry under directory `path`; returns 0 or a negative errno
+};
+
+#endif // EVENT_OUTPUT_H
+
diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc
new file mode 100644
index 000000000..3a5e781a2
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.cc
@@ -0,0 +1,316 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+
+#include "JournalFilter.h"
+
+#include "common/ceph_argparse.h"
+
+#include "mds/events/ESession.h"
+#include "mds/events/EUpdate.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+using namespace std;
+
+const string JournalFilter::range_separator("..");
+
+bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const
+{
+  // The offset window is half-open: [range_start, range_end)
+  const bool in_range = (pos >= range_start && pos < range_end);
+  if (!in_range) {
+    return false;
+  }
+
+  // An action filter of NONE means "do not filter by action"
+  const bool action_ok = (purge_action == PurgeItem::NONE
+                          || pi.action == purge_action);
+
+  // An inode filter of zero means "do not filter by inode"
+  const bool inode_ok = (!inode || inode == pi.ino);
+
+  // Conditions are ANDed together
+  return action_ok && inode_ok;
+}
+
+/**
+ * Return whether a LogEvent is to be included or excluded.
+ *
+ * The filter parameters are applied on an AND basis: if any
+ * condition is not met, the event is excluded.  The cheapest
+ * checks (offset range, event type) are done first.
+ *
+ * A filter left at its unset value (zero inode, zero event type,
+ * zero client number, zero dirfrag inode, empty string) is skipped.
+ * Apart from range and type, every filter reads the event's
+ * metablob; an event without one fails any such filter that is
+ * active, except that the client filter can also match an ESession
+ * event through its client instance.
+ *
+ * Matching rules:
+ *  - inode:  the metablob's inode set contains the inode
+ *  - frag:   the metablob's dentry map has the dirfrag as a key
+ *  - dname:  some dirfrag in that map lists the dentry name
+ *  - path:   some metablob path contains path_expr as a substring
+ *  - client: the metablob's client name equals client_name
+ *
+ * @param pos  journal offset of the event, tested against the
+ *             half-open interval [range_start, range_end)
+ * @param le   the event to test
+ * @return true if the event passes every active filter
+ */
+bool JournalFilter::apply(uint64_t pos, LogEvent &le) const
+{
+  /* Filtering by journal offset range */
+  if (pos < range_start || pos >= range_end) {
+    return false;
+  }
+
+  /* Filtering by event type */
+  if (event_type != 0 && le.get_type() != event_type) {
+    return false;
+  }
+
+  /* Everything below inspects the metablob, so look it up once.
+   * It is null for events that do not carry one. */
+  EMetaBlob const *metablob = le.get_metablob();
+
+  /* Filtering by client */
+  if (client_name.num()) {
+    if (metablob) {
+      if (metablob->get_client_name() != client_name) {
+        return false;
+      }
+    } else if (le.get_type() == EVENT_SESSION) {
+      /* The type tag was checked on the line above, so this is a plain
+       * base-to-derived downcast; static_cast expresses exactly that,
+       * whereas reinterpret_cast performs no pointer adjustment at all. */
+      ESession &es = static_cast<ESession&>(le);
+      if (es.get_client_inst().name != client_name) {
+        return false;
+      }
+    } else {
+      return false;
+    }
+  }
+
+  /* Filtering by inode */
+  if (inode) {
+    if (!metablob) {
+      return false;
+    }
+    std::set<inodeno_t> inodes;
+    metablob->get_inodes(inodes);
+    /* Ordered-set lookup instead of a linear walk */
+    if (inodes.count(inode) == 0) {
+      return false;
+    }
+  }
+
+  /* Filtering by frag and dentry */
+  if (!frag_dentry.empty() || frag.ino) {
+    if (!metablob) {
+      return false;
+    }
+    std::map<dirfrag_t, std::set<std::string> > dentries;
+    metablob->get_dentries(dentries);
+
+    /* The requested dirfrag must be one of the map's keys */
+    if (frag.ino && dentries.count(frag) == 0) {
+      return false;
+    }
+
+    /* The dentry name may sit under any dirfrag of the event.
+     * NOTE(review): when a dirfrag filter is set as well, the name is still
+     * not required to be inside that particular dirfrag -- confirm intent. */
+    if (!frag_dentry.empty()) {
+      bool match_any = false;
+      for (auto const &entry : dentries) {
+        if (entry.second.count(frag_dentry) != 0) {
+          match_any = true;
+          break;
+        }
+      }
+      if (!match_any) {
+        return false;
+      }
+    }
+  }
+
+  /* Filtering by file path: substring match against any path */
+  if (!path_expr.empty()) {
+    if (!metablob) {
+      return false;
+    }
+    std::vector<std::string> paths;
+    metablob->get_paths(paths);
+    bool match_any = false;
+    for (auto const &candidate : paths) {
+      if (candidate.find(path_expr) != std::string::npos) {
+        match_any = true;
+        break;
+      }
+    }
+    if (!match_any) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+
+int JournalFilter::parse_args(  // consume the leading filter options; returns 0, or -EINVAL on a bad value
+  std::vector<const char*> &argv,  // the argument vector being parsed
+  std::vector<const char*>::iterator &arg)  // current position; NOTE(review): presumably advanced by ceph_argparse_witharg on a match -- confirm
+{
+  while(arg != argv.end()) {  // leaves the loop early at the first argument that is not a filter option
+    std::string arg_str;
+    if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) {
+      size_t sep_loc = arg_str.find(JournalFilter::range_separator);  // syntax is "<start>..<end>"; either side may be omitted
+      if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) {  // no separator, or nothing but the separator
+        derr << "Invalid range '" << arg_str << "'" << dendl;
+        return -EINVAL;
+      }
+
+      // Text precedes the separator: we have a lower bound
+      if (sep_loc > 0) {
+        std::string range_start_str = arg_str.substr(0, sep_loc); 
+        std::string parse_err;
+        range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err);  // NOTE(review): signed parse stored in a uint64_t; a negative input would wrap -- confirm
+        if (!parse_err.empty()) {
+          derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl;
+          return -EINVAL;
+        }
+      }
+
+      if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) {  // text follows the separator: we have an upper bound
+        std::string range_end_str = arg_str.substr(sep_loc + range_separator.size()); 
+        std::string parse_err;
+        range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err);  // NOTE(review): same signed-to-unsigned concern as the lower bound
+        if (!parse_err.empty()) {
+          derr << "Invalid upper bound '" << range_end_str << "': " << parse_err << dendl;
+          return -EINVAL;
+        }
+      }
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {  // compare() returns 0 on equality, i.e. "the journal type is purge_queue"
+	derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl;
+	return -EINVAL;
+      }
+      dout(4) << "Filtering by path '" << arg_str << "'" << dendl;
+      path_expr = arg_str;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) {  // accepted for both journal types
+      dout(4) << "Filtering by inode '" << arg_str << "'" << dendl;
+      std::string parse_err;
+      inode = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl;
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) {
+      try {  // the value is interpreted per journal type; any other journal type leaves both filters untouched
+	if (!type.compare("mdlog")) {
+	  event_type = LogEvent::str_to_type(arg_str);
+	} else if (!type.compare("purge_queue")) {
+	  purge_action = PurgeItem::str_to_type(arg_str);
+	}
+      } catch (const std::out_of_range&) {  // NOTE(review): presumably thrown by str_to_type() for an unknown name -- confirm
+	 derr << "Invalid event type '" << arg_str << "'" << dendl;
+	 return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {
+	derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." << dendl;
+	return -EINVAL;
+      }
+      std::string const frag_sep = ".";  // syntax is "<ino>[.<frag>]"
+      size_t sep_loc = arg_str.find(frag_sep);
+      std::string inode_str;
+      std::string frag_str;
+      if (sep_loc != std::string::npos) {
+        inode_str = arg_str.substr(0, sep_loc);
+        frag_str = arg_str.substr(sep_loc + 1);
+      } else {
+        inode_str = arg_str;
+        frag_str = "0";  // no frag part given
+      }
+
+      std::string parse_err;
+      inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl;
+        return -EINVAL;
+      }
+
+      uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err);  // NOTE(review): long long narrowed to uint32_t without a range check
+      if (!parse_err.empty()) {
+        derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl;
+        return -EINVAL;
+      }
+
+      frag = dirfrag_t(frag_ino, frag_t(frag_enc));
+      dout(4) << "dirfrag filter: '" << frag << "'" << dendl;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {
+	derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl;
+	return -EINVAL;
+      }
+      frag_dentry = arg_str;
+      dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {
+	derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl;
+	return -EINVAL;
+      }
+
+      std::string parse_err;
+      int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid client number " << arg_str << dendl;
+        return -EINVAL;
+      }
+      client_name = entity_name_t::CLIENT(client_num);  // the filter is stored as a client entity name
+    } else {
+      // We're done with args the filter understands
+      break;
+    }
+  }
+
+  return 0;  // every filter option seen was valid
+}
+
+/**
+ * Report whether the filter is a pure offset range.
+ *
+ * Returns true, filling in start and end, only when a range was given and
+ * no per-event condition is active.  Every condition apply() can test is
+ * checked here, so a narrower filter is never mistaken for a whole range.
+ */
+bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const
+{
+  bool const per_event = !path_expr.empty()
+      || inode != 0
+      || event_type != 0 || purge_action != PurgeItem::NONE
+      || frag.ino != 0 || !frag_dentry.empty()
+      || client_name.num() != 0;
+  bool const unbounded = (range_start == 0 && range_end == static_cast<uint64_t>(-1));
+  if (per_event || unbounded) {
+    return false;
+  }
+  start = range_start;
+  end = range_end;
+  return true;
+}
diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h
new file mode 100644
index 000000000..f7a2db614
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.h
@@ -0,0 +1,73 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+
+#ifndef JOURNAL_FILTER_H
+#define JOURNAL_FILTER_H
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/PurgeQueue.h"
+
+/**
+ * A set of conditions for narrowing down a search through the journal
+ */
+class JournalFilter
+{
+  private:
+
+  /* Filtering by journal offset range: half-open [range_start, range_end) */
+  uint64_t range_start;
+  uint64_t range_end;
+  static const std::string range_separator;  // ".." (defined in JournalFilter.cc)
+
+  /* Filtering by file (sub) path; empty means no path filter */
+  std::string path_expr;
+
+  /* Filtering by inode; zero means no inode filter */
+  inodeno_t inode;
+
+  /* Filtering by LogEvent type; zero means no type filter */
+  LogEvent::EventType event_type;
+
+  std::string type;  // journal type the filter is used with; parse_args() tests for "mdlog" and "purge_queue"
+
+  /* Filtering by PurgeItem::Action; PurgeItem::NONE means no action filter */
+  PurgeItem::Action purge_action;
+
+  /* Filtering by dirfrag; a zero frag.ino means no dirfrag filter */
+  dirfrag_t frag;
+  std::string frag_dentry;  //< optional dentry name; apply() accepts it under any dirfrag of the event
+
+  /* Filtering by metablob client name; a zero client number means no client filter */
+  entity_name_t client_name;
+
+  public:
+  JournalFilter(std::string t) :  // t: journal type, stored in `type`
+    range_start(0),
+    range_end(-1),  // converts to the largest uint64_t; get_range() treats 0..max as "no range given"
+    inode(0),
+    event_type(0),
+    type(t),
+    purge_action(PurgeItem::NONE) {}
+
+  bool get_range(uint64_t &start, uint64_t &end) const;  // true (and outputs set) only when the filter is a pure offset range
+  bool apply(uint64_t pos, LogEvent &le) const;  // true when the log event at offset pos passes every active filter
+  bool apply(uint64_t pos, PurgeItem &pi) const;  // same for a purge queue item (range, action and inode only)
+  int parse_args(  // read filter options starting at arg; returns 0 or -EINVAL
+    std::vector<const char*> &argv, 
+    std::vector<const char*>::iterator &arg);
+};
+
+#endif // JOURNAL_FILTER_H
+
diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc
new file mode 100644
index 000000000..e72542fd4
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.cc
@@ -0,0 +1,438 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+
+#include "include/rados/librados.hpp"
+#include "mds/JournalPointer.h"
+
+#include "mds/events/ESubtreeMap.h"
+#include "mds/PurgeQueue.h"
+
+#include "JournalScanner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+/**
+ * Resolve the journal inode, read the header and, if `full`, scan the events.
+ *
+ * Return 0 on success, else error code.  Note that success has the special meaning
+ * that we were able to apply our checks, it does *not* mean that the journal is
+ * healthy.
+ */
+int JournalScanner::scan(bool const full)
+{
+  int r = set_journal_ino();
+  if (r < 0) {
+    return r;
+  }
+  // A purge queue needs no pointer object; an mdlog header is only read if its pointer exists
+  if (!is_mdlog || pointer_present) {
+    r = scan_header();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // scan_header() keeps header_present set but nulls `header` when decoding
+  // fails; scan_events() dereferences `header`, so it must be skipped then.
+  if (full && header_present && header != nullptr) {
+    r = scan_events();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+
+int JournalScanner::set_journal_ino()
+{
+  if (type == "purge_queue") {
+    // The purge queue inode follows directly from the rank
+    ino = MDS_INO_PURGE_QUEUE + rank;
+    return 0;
+  }
+  if (type != "mdlog") {
+    ceph_abort(); // should not get here
+  }
+  // The MDS log inode is looked up through the journal pointer object
+  int const result = scan_pointer();
+  is_mdlog = true;
+  return result;
+}
+
+int JournalScanner::scan_pointer()
+{
+  // Read the pointer object that belongs to this rank
+  std::string const oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0);
+  bufferlist raw;
+  int const rc = io.read(oid, raw, INT_MAX, 0);
+  if (rc == -ENOENT) {
+    // A missing pointer is a finding, not a failure of the scan
+    derr << "Pointer " << oid << " is absent" << dendl;
+    return 0;
+  }
+  if (rc < 0) {
+    // Any other error means the pointer could not be examined at all
+    derr << "Pointer " << oid << " is unreadable" << dendl;
+    return rc;
+  }
+  dout(4) << "Pointer " << oid << " is readable" << dendl;
+  pointer_present = true;
+
+  // Present but undecodable: pointer_valid stays unset, the scan still succeeds
+  JournalPointer jp;
+  try {
+    auto it = raw.cbegin();
+    jp.decode(it);
+  } catch (buffer::error &e) {
+    derr << "Pointer " << oid << " is corrupt: " << e.what() << dendl;
+    return 0;
+  }
+  pointer_valid = true;
+  ino = jp.front;
+  return 0;
+}
+
+
+int JournalScanner::scan_header()
+{
+  // obj_name(0) names the object the header is read from
+  std::string const oid = obj_name(0);
+  dout(4) << "JournalScanner::scan: reading header object '" << oid << "'" << dendl;
+  bufferlist raw;
+  int const rc = io.read(oid, raw, INT_MAX, 0);
+  if (rc < 0) {
+    derr << "Header " << oid << " is unreadable" << dendl;
+    return 0;  // "Successfully" found an error
+  }
+  header_present = true;
+
+  // Decode into a fresh Header; on failure it is freed and the pointer
+  // reset, while header_present stays set.
+  header = new Journaler::Header();
+  try {
+    auto it = raw.cbegin();
+    header->decode(it);
+  } catch (buffer::error &e) {
+    derr << "Header is corrupt (" << e.what() << ")" << dendl;
+    delete header;
+    header = nullptr;
+    return 0;  // "Successfully" found an error
+  }
+
+  // A decodable header still has to carry the expected magic ...
+  if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) {
+    derr << "Header is corrupt (bad magic)" << dendl;
+    return 0;  // "Successfully" found an error
+  }
+  // ... and offsets ordered as trimmed_pos <= expire_pos <= write_pos
+  bool const ordered = header->trimmed_pos <= header->expire_pos
+                       && header->expire_pos <= header->write_pos;
+  if (!ordered) {
+    derr << "Header is invalid (inconsistent offsets)" << dendl;
+    return 0;  // "Successfully" found an error
+  }
+  header_valid = true;
+  return 0;
+}
+
+
+int JournalScanner::scan_events()
+{
+  uint64_t object_size = g_conf()->mds_log_segment_size;
+  if (object_size == 0) {
+    // Default layout object size
+    object_size = file_layout_t::get_default().object_size;
+  }
+
+  uint64_t read_offset = header->expire_pos;
+  dout(10) << std::hex << "Header 0x"
+    << header->trimmed_pos << " 0x"
+    << header->expire_pos << " 0x"
+    << header->write_pos << std::dec << dendl;
+  dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl;
+
+  // TODO also check for extraneous objects before the trimmed pos or after the write pos,
+  // which would indicate a bogus header.
+
+  bufferlist read_buf;
+  bool gap = false;
+  uint64_t gap_start = -1;
+  for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) {
+    uint64_t offset_in_obj = 0;
+    if (obj_offset * object_size < header->expire_pos) {
+      // Skip up to expire_pos from start of the object
+      // (happens for the first object we read)
+      offset_in_obj = header->expire_pos - obj_offset * object_size;
+    }
+
+    // Read this journal segment
+    bufferlist this_object;
+    std::string const oid = obj_name(obj_offset);
+    int r = io.read(oid, this_object, INT_MAX, offset_in_obj);
+
+    // Handle absent journal segments
+    if (r < 0) {
+      if (obj_offset > (header->write_pos / object_size)) {
+        dout(4) << "Reached end of journal objects" << dendl;
+        break;
+      } else {
+        derr << "Missing object " << oid << dendl;
+      }
+
+      objects_missing.push_back(obj_offset);
+      if (!gap) {
+        gap_start = read_offset;
+        gap = true;
+      }
+      if (read_buf.length() > 0) {
+        read_offset += read_buf.length();
+        read_buf.clear();
+      }
+      read_offset += object_size - offset_in_obj;
+      continue;
+    } else {
+      dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec
+              << " bytes from " << oid << " gap=" << gap << dendl;
+      objects_valid.push_back(oid);
+      this_object.begin().copy(this_object.length(), read_buf);
+    }
+
+    if (gap) {
+      // No valid data at the current read offset, scan forward until we find something valid looking
+      // or have to drop out to load another object.
+      dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset
+              << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl;
+
+      do {
+        auto p = read_buf.cbegin();
+        uint64_t candidate_sentinel;
+        decode(candidate_sentinel, p);
+
+        dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl;
+
+        if (candidate_sentinel == JournalStream::sentinel) {
+          dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl;
+          ranges_invalid.push_back(Range(gap_start, read_offset));
+          gap = false;
+          break;
+        } else {
+          // No sentinel, discard this byte
+          read_buf.splice(0, 1);
+          read_offset += 1;
+        }
+      } while (read_buf.length() >= sizeof(JournalStream::sentinel));
+      dout(4) << "read_buf size is " << read_buf.length() << dendl;
+    } 
+    {
+      dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl;
+      while(true) {
+        // TODO: detect and handle legacy format journals: can do many things
+        // on them but on read errors have to give up instead of searching
+        // for sentinels.
+        JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT);
+        bool readable = false;
+        try {
+          uint64_t need;
+          readable = journal_stream.readable(read_buf, &need);
+        } catch (buffer::error &e) {
+          readable = false;
+          dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl;
+          gap = true;
+          gap_start = read_offset;
+          read_buf.splice(0, 1);
+          read_offset += 1;
+          break;
+        }
+
+        if (!readable) {
+          // Out of data, continue to read next object
+          break;
+        }
+
+        bufferlist le_bl;  //< Serialized LogEvent blob
+        dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl;
+        // This cannot fail to decode because we pre-checked that a serialized entry
+        // blob would be readable.
+        uint64_t start_ptr = 0;
+        uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr);
+        dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl;
+        if (start_ptr != read_offset) {
+          derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x"
+              << read_offset << std::dec << dendl;
+          gap = true;
+          gap_start = read_offset;
+          // FIXME: given that entry was invalid, should we be skipping over it?
+          // maybe push bytes back onto start of read_buf and just advance one byte
+          // to start scanning instead.  e.g. if a bogus size value is found it can
+          // cause us to consume and thus skip a bunch of following valid events.
+          read_offset += consumed;
+          break;
+        }
+        bool valid_entry = true;
+        if (is_mdlog) {
+          auto le = LogEvent::decode_event(le_bl.cbegin());
+
+          if (le) {
+            dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+
+            if (le->get_type() == EVENT_SUBTREEMAP
+                || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+              auto&& sle = dynamic_cast<ESubtreeMap&>(*le);
+              if (sle.expire_pos > read_offset) {
+                errors.insert(std::make_pair(
+                      read_offset, EventError(
+                        -ERANGE,
+                        "ESubtreeMap has expire_pos ahead of its own position")));
+              }
+            }
+
+            if (filter.apply(read_offset, *le)) {
+              events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed));
+            }
+          } else {
+            valid_entry = false;
+          }
+        } else if (type == "purge_queue"){
+           auto pi = std::make_unique<PurgeItem>();
+           try {
+             auto q = le_bl.cbegin();
+             pi->decode(q);
+	     if (filter.apply(read_offset, *pi)) {
+	       events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed));
+	     }
+           } catch (const buffer::error &err) {
+             valid_entry = false;
+           }
+        } else {
+          ceph_abort(); // should not get here
+        }
+        if (!valid_entry) {
+          dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+          gap = true;
+          gap_start = read_offset;
+          read_offset += consumed;
+          break;
+        } else {
+          events_valid.push_back(read_offset);
+          read_offset += consumed;
+        }
+      }
+    }
+  }
+
+  if (gap) {
+    // Ended on a gap, assume it ran to end
+    ranges_invalid.push_back(Range(gap_start, -1));
+  }
+
+  dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl;
+  dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl;
+  dout(4) << "Found " << events_valid.size() << " valid events" << dendl;
+  dout(4) << "Selected " << events.size() << " events events for processing" << dendl;
+
+  return 0;
+}
+
+
+/**
+ * Free the header object held through the raw `header` pointer and
+ * drop every event collected by the scan.
+ */
+JournalScanner::~JournalScanner()
+{
+  // `delete` on a null pointer is a no-op, so no guard is needed
+  delete header;
+  header = nullptr;
+
+  dout(4) << events.size() << " events" << dendl;
+  events.clear();
+}
+
+
+/**
+ * Whether the scan found nothing wrong with the journal.
+ *
+ * For an mdlog journal the pointer must be present and valid; for any
+ * journal the header must be present and valid, and neither invalid
+ * ranges nor missing objects may have been recorded.
+ */
+bool JournalScanner::is_healthy() const
+{
+  // The pointer flags are only consulted for mdlog journals
+  if (is_mdlog && !(pointer_present && pointer_valid)) {
+    return false;
+  }
+  if (!header_present || !header_valid) {
+    return false;
+  }
+  return ranges_invalid.empty() && objects_missing.empty();
+}
+
+
+/**
+ * Whether the journal data can be read from RADOS: the header was
+ * found and is valid, and no journal object was recorded as missing.
+ */
+bool JournalScanner::is_readable() const
+{
+  const bool header_ok = header_present && header_valid;
+  return header_ok && objects_missing.empty();
+}
+
+
+/**
+ * Build the name of a journal object: the inode number in hex, a dot,
+ * then `offset` in hex zero-padded to at least eight digits.
+ */
+std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const
+{
+  char buf[60];
+  const auto ino_val = static_cast<unsigned long long>(ino);
+  const auto off_val = static_cast<unsigned long long>(offset);
+  snprintf(buf, sizeof(buf), "%llx.%08llx", ino_val, off_val);
+  return buf;
+}
+
+
+/**
+ * Convenience overload: build the object name for `offset` using this
+ * scanner's own `ino` member.
+ */
+std::string JournalScanner::obj_name(uint64_t offset) const
+{
+  return obj_name(this->ino, offset);
+}
+
+
+/*
+ * Print a human readable summary of the scan results to `out`:
+ * the overall verdict, pointer/header problems, the list of missing
+ * objects and the list of corrupt regions.
+ */
+void JournalScanner::report(std::ostream &out) const
+{
+  const char *verdict = is_healthy() ? "OK" : "DAMAGED";
+  out << "Overall journal integrity: " << verdict << std::endl;
+
+  // Pointer problems are only reported for mdlog journals
+  if (is_mdlog && !pointer_present) {
+    out << "Pointer not found" << std::endl;
+  } else if (is_mdlog && !pointer_valid) {
+    out << "Pointer could not be decoded" << std::endl;
+  }
+
+  if (!header_present) {
+    out << "Header not found" << std::endl;
+  } else if (!header_valid) {
+    out << "Header could not be decoded" << std::endl;
+  }
+
+  if (!objects_missing.empty()) {
+    out << "Objects missing:" << std::endl;
+    for (const uint64_t missing : objects_missing) {
+      out << "  0x" << std::hex << missing << std::dec << std::endl;
+    }
+  }
+
+  if (!ranges_invalid.empty()) {
+    out << "Corrupt regions:" << std::endl;
+    for (const Range &region : ranges_invalid) {
+      // Both ends are printed in hex; only the start carries a "0x" prefix
+      out << "  0x" << std::hex << region.first << "-" << region.second << std::dec << std::endl;
+    }
+  }
+}
+
diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h
new file mode 100644
index 000000000..9197b5596
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.h
@@ -0,0 +1,133 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+#ifndef JOURNAL_SCANNER_H
+#define JOURNAL_SCANNER_H
+
+#include "include/rados/librados_fwd.hpp"
+
+// For Journaler::Header, can't forward-declare nested classes
+#include <osdc/Journaler.h>
+
+#include "JournalFilter.h"
+
+/**
+ * A plain sequential reader for metadata journals.  In contrast to
+ * the MDS Journaler class, it is written to detect and record
+ * corruption and missing objects and to keep reading past them.
+ * It trades efficiency for straightforward code.
+ */
+class JournalScanner
+{
+  private:
+  librados::IoCtx &io;
+
+  // Constraints supplied by the caller
+  const int rank;
+  std::string type;
+  const JournalFilter filter;
+
+  void gap_advance();
+
+  public:
+  // Construct a scanner that applies the caller-supplied filter
+  JournalScanner(
+      librados::IoCtx &io_,
+      int rank_,
+      const std::string &type_,
+      const JournalFilter &filter_)
+    : io(io_),
+      rank(rank_),
+      type(type_),
+      filter(filter_),
+      is_mdlog(false),
+      pointer_present(false),
+      pointer_valid(false),
+      header_present(false),
+      header_valid(false),
+      header(nullptr)
+  {}
+
+  // Construct a scanner whose filter is built from the journal type alone
+  JournalScanner(
+      librados::IoCtx &io_,
+      int rank_,
+      const std::string &type_)
+    : io(io_),
+      rank(rank_),
+      type(type_),
+      filter(type_),
+      is_mdlog(false),
+      pointer_present(false),
+      pointer_valid(false),
+      header_present(false),
+      header_valid(false),
+      header(nullptr)
+  {}
+
+  ~JournalScanner();
+
+  int set_journal_ino();
+  int scan(const bool full = true);
+  int scan_pointer();
+  int scan_header();
+  int scan_events();
+  void report(std::ostream &out) const;
+
+  std::string obj_name(uint64_t offset) const;
+  std::string obj_name(inodeno_t ino, uint64_t offset) const;
+
+  // ---- Results of the scan ----
+
+  inodeno_t ino;  // Journal inode number, which depends on the journal type
+
+  // One decoded journal entry: a LogEvent or a PurgeItem, depending on
+  // which constructor was used, plus its encoded size.
+  struct EventRecord {
+    EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {}
+    EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {}
+    std::unique_ptr<LogEvent> log_event;
+    std::unique_ptr<PurgeItem> pi;
+    uint32_t raw_size = 0;  //< Size from start offset including all encoding overhead
+  };
+
+  // An error code with a textual description, attached to an event
+  class EventError {
+    public:
+    int r;
+    std::string description;
+    EventError(int r_, const std::string &desc_)
+      : r(r_), description(desc_) {}
+  };
+
+  using EventMap = std::map<uint64_t, EventRecord>;  // journal offset -> record
+  using ErrorMap = std::map<uint64_t, EventError>;   // journal offset -> error
+  using Range = std::pair<uint64_t, uint64_t>;       // (start, end) offsets
+  bool is_mdlog;
+  bool pointer_present; //mdlog specific
+  bool pointer_valid;   //mdlog specific
+  bool header_present;
+  bool header_valid;
+  Journaler::Header *header;  // freed in the destructor
+
+  bool is_healthy() const;
+  bool is_readable() const;
+  std::vector<std::string> objects_valid;   // names of objects read successfully
+  std::vector<uint64_t> objects_missing;    // offsets of objects that were not found
+  std::vector<Range> ranges_invalid;        // regions without valid data
+  std::vector<uint64_t> events_valid;       // offsets of entries that decoded
+  EventMap events;                          // decoded entries accepted by the filter
+
+  // For entries held in ::events (i.e. scanned successfully), any
+  // errors hit later while handling them (e.g. replaying)
+  ErrorMap errors;
+
+
+  private:
+  // Copy construction is forbidden because of the pointer members
+  JournalScanner(const JournalScanner &rhs);
+};
+
+#endif // JOURNAL_SCANNER_H
+
diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc
new file mode 100644
index 000000000..6bca9bb08
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.cc
@@ -0,0 +1,1266 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation.  see file copying.
+ */
+
+
+#include <sstream>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "JournalScanner.h"
+#include "EventOutput.h"
+#include "Dumper.h"
+#include "Resetter.h"
+
+#include "JournalTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+using namespace std;
+
+/**
+ * Print the command line help for cephfs-journal-tool to stdout,
+ * followed by the generic client options.
+ */
+void JournalTool::usage()
+{
+  std::cout << "Usage: \n"
+    << "  cephfs-journal-tool [options] journal <command>\n"
+    << "    <command>:\n"
+    << "      inspect\n"
+    << "      import <path> [--force]\n"
+    << "      export <path>\n"
+    << "      reset [--force]\n"
+    << "  cephfs-journal-tool [options] header <get|set> <field> <value>\n"
+    << "    <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
+    << "  cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
+    << "    <selector>:\n"
+    << "      --range=<start>..<end>\n"
+    << "      --path=<substring>\n"
+    << "      --inode=<integer>\n"
+    // Fixed: this line used to end with a stray '<' after the closing '>'
+    << "      --type=<UPDATE|OPEN|SESSION...>\n"
+    << "      --frag=<ino>.<frag> [--dname=<dentry string>]\n"
+    << "      --client=<session id integer>\n"
+    << "    <effect>: [get|recover_dentries|splice]\n"
+    << "    <output>: [summary|list|binary|json] [--path <path>]\n"
+    << "\n"
+    << "General options:\n"
+    << "  --rank=filesystem:{mds-rank|all} journal rank or \"all\" ranks (mandatory)\n"
+    << "  --journal=<mdlog|purge_queue>  Journal type (purge_queue means\n"
+    << "                                 this journal is used to queue for purge operation,\n"
+    << "                                 default is mdlog, and only mdlog support event mode)\n"
+    << "\n"
+    << "Special options\n"
+    << "  --alternate-pool <name>     Alternative metadata pool to target\n"
+    << "                              when using recover_dentries.\n";
+
+  generic_client_usage();
+}
+
+
+/**
+ * Handle arguments and hand off to journal/header/event mode
+ *
+ * Parses the common options (--rank, --journal), resolves the selected
+ * roles, connects to RADOS, opens the metadata pool and then runs the
+ * requested mode once per selected rank.
+ *
+ * @param argv remaining command line arguments; consumed options and the
+ *             mode word are erased from it
+ * @returns 0 on success, else the first non-zero result encountered
+ */
+int JournalTool::main(std::vector<const char*> &argv)
+{
+  int r;
+
+  dout(10) << "JournalTool::main " << dendl;
+  // Common arg parsing
+  // ==================
+  if (argv.empty()) {
+    cerr << "missing positional argument" << std::endl;
+    return -EINVAL;
+  }
+
+  std::vector<const char*>::iterator arg = argv.begin();
+
+  std::string rank_str;
+  if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+    derr << "missing mandatory \"--rank\" argument" << dendl;
+    return -EINVAL;
+  }
+
+  if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) {
+    // Default is mdlog
+    type = "mdlog";
+  }
+
+  r = validate_type(type);
+  if (r != 0) {
+    derr << "journal type is not correct." << dendl;
+    return r;
+  }
+
+  r = role_selector.parse(*fsmap, rank_str, false);
+  if (r != 0) {
+    derr << "Couldn't determine MDS rank." << dendl;
+    return r;
+  }
+
+  std::string mode;
+  if (arg == argv.end()) {
+    derr << "Missing mode [journal|header|event]" << dendl;
+    return -EINVAL;
+  }
+  mode = std::string(*arg);
+  arg = argv.erase(arg);
+
+  // RADOS init
+  // ==========
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+    return r;
+  }
+
+  dout(4) << "JournalTool: connecting to RADOS..." << dendl;
+  r = rados.connect();
+  if (r < 0) {
+    derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  auto fs = fsmap->get_filesystem(role_selector.get_ns());
+  ceph_assert(fs != nullptr);
+  int64_t const pool_id = fs->mds_map.get_metadata_pool();
+  dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
+  std::string pool_name;
+  r = rados.pool_reverse_lookup(pool_id, &pool_name);
+  if (r < 0) {
+    derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
+    return r;
+  }
+
+  dout(4) << "JournalTool: creating IoCtx.." << dendl;
+  r = rados.ioctx_create(pool_name.c_str(), input);
+  ceph_assert(r == 0);
+  // Reads and writes go to the same pool unless a mode redirects `output`
+  output.dup(input);
+
+  // Execution
+  // =========
+  // journal and header are general journal mode
+  // event mode is only specific for mdlog
+  auto roles = role_selector.get_roles();
+  if (roles.size() > 1) {
+    // Once the mode word has been erased, argv may be empty (e.g. a bare
+    // "journal" with no command).  Never index argv[0] in that case: use
+    // an empty command here and let the per-mode handler report the
+    // missing argument.
+    const std::string command =
+      argv.empty() ? std::string() : std::string(argv[0]);
+    bool allowed = can_execute_for_all_ranks(mode, command);
+    if (!allowed) {
+      derr << "operation not allowed for all ranks" << dendl;
+      return -EINVAL;
+    }
+
+    all_ranks = true;
+  }
+  for (auto role : roles) {
+    rank = role.rank;
+    // Each rank gets its own copy because the mode handlers consume argv
+    std::vector<const char *> rank_argv(argv);
+    dout(4) << "Executing for rank " << rank << dendl;
+    if (mode == std::string("journal")) {
+      r = main_journal(rank_argv);
+    } else if (mode == std::string("header")) {
+      r = main_header(rank_argv);
+    } else if (mode == std::string("event")) {
+      r = main_event(rank_argv);
+    } else {
+      cerr << "Bad command '" << mode << "'" << std::endl;
+      return -EINVAL;
+    }
+
+    // Stop at the first rank that fails
+    if (r != 0) {
+      return r;
+    }
+  }
+
+  return r;
+}
+
+/**
+ * Check that `type` names a supported journal.
+ *
+ * @returns 0 for "mdlog" or "purge_queue", -1 for anything else
+ */
+int JournalTool::validate_type(const std::string &type)
+{
+  const bool known = (type == "mdlog") || (type == "purge_queue");
+  return known ? 0 : -1;
+}
+
+/**
+ * Derive the dump file path for the current rank: `prefix` itself
+ * normally, or `prefix.<rank>` when running for all ranks.
+ */
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+  if (all_ranks) {
+    return prefix + "." + std::to_string(rank);
+  }
+
+  return prefix;
+}
+
+/**
+ * Whether `command` in `mode` may be run against several ranks at once.
+ * Everything is permitted except "journal import".
+ */
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+                                            const std::string &command) {
+  const bool is_journal_import = (mode == "journal") && (command == "import");
+  return !is_journal_import;
+}
+
+/**
+ * Handle arguments for 'journal' mode
+ *
+ * This is for operations that act on the journal as a whole:
+ *   inspect
+ *   export <path> | import <path> [--force]
+ *   reset [--force]
+ *
+ * @param argv arguments following the mode word; argv[0] is the command
+ * @returns the command's result, or -EINVAL on a malformed command line
+ */
+int JournalTool::main_journal(std::vector<const char*> &argv)
+{
+  if (argv.empty()) {
+    derr << "Missing journal command, please see help" << dendl;
+    return -EINVAL;
+  }
+
+  std::string command = argv[0];
+  if (command == "inspect") {
+    return journal_inspect();
+  } else if (command == "export" || command == "import") {
+    bool force = false;
+    if (argv.size() < 2) {
+      derr << "Missing path" << dendl;
+      return -EINVAL;
+    }
+    if (argv.size() > 3) {
+      // Previously extra arguments were ignored, which also silently
+      // dropped a "--force" given as argv[2]; reject them like 'reset' does.
+      std::cerr << "Too many arguments!" << std::endl;
+      return -EINVAL;
+    }
+
+    std::string const path = argv[1];
+    if (argv.size() == 3) {
+      if (std::string(argv[2]) == "--force") {
+        force = true;
+      } else {
+        // Report the argument that was actually rejected (argv[2]),
+        // not the path in argv[1]
+        std::cerr << "Unknown argument " << argv[2] << std::endl;
+        return -EINVAL;
+      }
+    }
+    return journal_export(path, command == "import", force);
+  } else if (command == "reset") {
+    bool force = false;
+    if (argv.size() == 2) {
+      if (std::string(argv[1]) == "--force") {
+        force = true;
+      } else {
+        std::cerr << "Unknown argument " << argv[1] << std::endl;
+        return -EINVAL;
+      }
+    } else if (argv.size() > 2) {
+      std::cerr << "Too many arguments!" << std::endl;
+      return -EINVAL;
+    }
+    return journal_reset(force);
+  } else {
+    derr << "Bad journal command '" << command << "'" << dendl;
+    return -EINVAL;
+  }
+}
+
+
+/**
+ * Parse arguments and execute for 'header' mode
+ *
+ * This is for operations that act on the header only:
+ *   get                  dump the header as JSON on stdout
+ *   set <field> <value>  overwrite one header field and write it back
+ *
+ * @param argv arguments following the mode word; consumed as they are parsed
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::main_header(std::vector<const char*> &argv)
+{
+  JournalFilter filter(type);
+  JournalScanner js(input, rank, type, filter);
+  int r = js.scan(false);
+  if (r < 0) {
+    std::cerr << "Unable to scan journal" << std::endl;
+    return r;
+  }
+
+  if (!js.header_present) {
+    std::cerr << "Header object not found!" << std::endl;
+    return -ENOENT;
+  } else if (!js.header_valid && js.header == NULL) {
+    // Can't do a read or a single-field write without a copy of the original
+    derr << "Header could not be read!" << dendl;
+    return -ENOENT;
+  } else {
+    ceph_assert(js.header != NULL);
+  }
+
+  if (argv.empty()) {
+    derr << "Missing header command, must be [get|set]" << dendl;
+    return -EINVAL;
+  }
+  std::vector<const char *>::iterator arg = argv.begin();
+  std::string const command = *arg;
+  arg = argv.erase(arg);
+
+  if (command == std::string("get")) {
+    // Write JSON journal dump to stdout
+    JSONFormatter jf(true);
+    js.header->dump(&jf);
+    jf.flush(std::cout);
+    std::cout << std::endl;
+  } else if (command == std::string("set")) {
+    // Need two more args <key> <val>
+    if (argv.size() != 2) {
+      derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos|pool_id> <value>" << dendl;
+      return -EINVAL;
+    }
+
+    std::string const field_name = *arg;
+    arg = argv.erase(arg);
+
+    std::string const value_str = *arg;
+    arg = argv.erase(arg);
+    ceph_assert(argv.empty());
+
+    std::string parse_err;
+    // The parsed value is stored into an unsigned 64-bit field below
+    uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
+    if (!parse_err.empty()) {
+      derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
+      return -EINVAL;
+    }
+
+    uint64_t *field = NULL;
+    if (field_name == "trimmed_pos") {
+      field = &(js.header->trimmed_pos);
+    } else if (field_name == "expire_pos") {
+      field = &(js.header->expire_pos);
+    } else if (field_name == "write_pos") {
+      field = &(js.header->write_pos);
+    } else if (field_name == "pool_id") {
+      // NOTE(review): pool_id is accessed through a uint64_t pointer; this
+      // presumes the member is a 64-bit integer - confirm against the
+      // layout type's declaration.
+      field = (uint64_t*)(&(js.header->layout.pool_id));
+    } else {
+      derr << "Invalid field '" << field_name << "'" << dendl;
+      return -EINVAL;
+    }
+
+    std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
+    *field = new_val;
+
+    dout(4) << "Writing object..." << dendl;
+    bufferlist header_bl;
+    encode(*(js.header), header_bl);
+    // Do not claim success unless the header object was actually written
+    r = output.write_full(js.obj_name(0), header_bl);
+    if (r < 0) {
+      derr << "Failed to write header object: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    dout(4) << "Write complete." << dendl;
+    std::cout << "Successfully updated header." << std::endl;
+  } else {
+    derr << "Bad header command '" << command << "'" << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+
+/**
+ * Parse arguments and execute for 'event' mode
+ *
+ * This is for operations that act on LogEvents within the log.  The
+ * command line is: <effect> <selector options> <output> [special options],
+ * where <effect> is one of get, recover_dentries or splice.
+ *
+ * Note that only the result of writing the output is returned; errors
+ * hit while recovering individual events are logged and recorded in the
+ * scanner's error map instead.
+ *
+ * @param argv arguments following the mode word
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::main_event(std::vector<const char*> &argv)
+{
+  int r;
+
+  if (argv.empty()) {
+    derr << "Missing event command, please see help" << dendl;
+    return -EINVAL;
+  }
+
+  std::vector<const char*>::iterator arg = argv.begin();
+  bool dry_run = false;
+
+  std::string command = *(arg++);
+  if (command != "get" && command != "splice" && command != "recover_dentries") {
+    derr << "Unknown argument '" << command << "'" << dendl;
+    return -EINVAL;
+  }
+
+  if (command == "recover_dentries") {
+    if (type != "mdlog") {
+      derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl;
+      return -EINVAL;
+    } else {
+      if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
+        dry_run = true;
+      }
+    }
+  }
+
+  if (arg == argv.end()) {
+    derr << "Incomplete command line" << dendl;
+    return -EINVAL;
+  }
+
+  // Parse filter options
+  // ====================
+  JournalFilter filter(type);
+  r = filter.parse_args(argv, arg);
+  if (r) {
+    return r;
+  }
+
+  // Parse output options
+  // ====================
+  if (arg == argv.end()) {
+    cerr << "Missing output command" << std::endl;
+    return -EINVAL;
+  }
+  std::string output_style = *(arg++);
+  if (output_style != "binary" && output_style != "json" &&
+      output_style != "summary" && output_style != "list") {
+    cerr << "Unknown argument: '" << output_style << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  std::string output_path = "dump";
+  while(arg != argv.end()) {
+    std::string arg_str;
+    if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+      output_path = arg_str;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
+                                     nullptr)) {
+      dout(1) << "Using alternate pool " << arg_str << dendl;
+      // Redirect writes to the alternate pool; reads still use `input`
+      int r = rados.ioctx_create(arg_str.c_str(), output);
+      ceph_assert(r == 0);
+      other_pool = true;
+    } else {
+      cerr << "Unknown argument: '" << *arg << "'" << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  const std::string dump_path = gen_dump_file_path(output_path);
+
+  // Execute command
+  // ===============
+  JournalScanner js(input, rank, type, filter);
+  if (command == "get") {
+    r = js.scan();
+    if (r) {
+      derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+      return r;
+    }
+  } else if (command == "recover_dentries") {
+    r = js.scan();
+    if (r) {
+      derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+      return r;
+    }
+
+    /**
+     * Iterate over log entries, attempting to scavenge from each one
+     */
+    std::set<inodeno_t> consumed_inos;
+    for (JournalScanner::EventMap::iterator i = js.events.begin();
+         i != js.events.end(); ++i) {
+      auto& le = i->second.log_event;
+      EMetaBlob const *mb = le->get_metablob();
+      if (mb) {
+        int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
+        if (scav_r) {
+          dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
+                  << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
+          if (r == 0) {
+            r = scav_r;
+          }
+          // Our goal is to read all we can, so don't stop on errors, but
+          // do record them for possible later output.  The description must
+          // come from this event's own error (scav_r), not from `r`, which
+          // only remembers the first failure.
+          js.errors.insert(std::make_pair(i->first,
+                JournalScanner::EventError(scav_r, cpp_strerror(scav_r))));
+        }
+      }
+    }
+
+    /**
+     * Update InoTable to reflect any inode numbers consumed during scavenge
+     */
+    dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
+    if (consumed_inos.size() && !dry_run) {
+      int consume_r = consume_inos(consumed_inos);
+      if (consume_r) {
+        dout(1) << "Error updating InoTable for " << consumed_inos.size()
+                << " consume inos: " << cpp_strerror(consume_r) << dendl;
+        if (r == 0) {
+          r = consume_r;
+        }
+      }
+    }
+
+    // Remove consumed dentries from lost+found.
+    if (other_pool && !dry_run) {
+      std::set<std::string> found;
+
+      for (auto i : consumed_inos) {
+        // Up to 16 hex digits + "_head" + NUL needs 22 bytes; the previous
+        // 20-byte buffer could truncate keys of very large inode numbers.
+        char s[32];
+
+        snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
+        dout(20) << "removing " << s << dendl;
+        found.insert(std::string(s));
+      }
+
+      object_t frag_oid;
+      frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
+                                             frag_t(), "");
+      output.omap_rm_keys(frag_oid.name, found);
+    }
+  } else if (command == "splice") {
+    r = js.scan();
+    if (r) {
+      derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+      return r;
+    }
+
+    uint64_t start, end;
+    if (filter.get_range(start, end)) {
+      // Special case for range filter: erase a numeric range in the log
+      uint64_t range = end - start;
+      int r = erase_region(js, start, range);
+      if (r) {
+        derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
+             << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else {
+      // General case: erase a collection of individual entries in the log
+      for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
+        dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
+
+        int r = erase_region(js, i->first, i->second.raw_size);
+        if (r) {
+          derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
+               << ": " << cpp_strerror(r) << dendl;
+          return r;
+        }
+      }
+    }
+
+
+  } else {
+    cerr << "Unknown argument '" << command << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // Generate output
+  // ===============
+  // Note: this local deliberately shadows the `output` IoCtx member
+  EventOutput output(js, dump_path);
+  int output_result = 0;
+  if (output_style == "binary") {
+      output_result = output.binary();
+  } else if (output_style == "json") {
+      output_result = output.json();
+  } else if (output_style == "summary") {
+      output.summary();
+  } else if (output_style == "list") {
+      output.list();
+  } else {
+    std::cerr << "Bad output command '" << output_style << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  if (output_result != 0) {
+    std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
+  }
+
+  return output_result;
+}
+
+/**
+ * Scan the whole journal and print a report on its condition to
+ * stdout, in particular any missing objects and corrupt regions that
+ * were found.
+ *
+ * @returns 0 on success, or the scan's non-zero result
+ */
+int JournalTool::journal_inspect()
+{
+  JournalFilter filter(type);
+  JournalScanner js(input, rank, type, filter);
+
+  const int scan_r = js.scan();
+  if (scan_r != 0) {
+    std::cerr << "Failed to scan journal (" << cpp_strerror(scan_r) << ")" << std::endl;
+    return scan_r;
+  }
+
+  js.report(std::cout);
+  return 0;
+}
+
+
+/**
+ * Export the journal to a binary dump file, or import one.
+ *
+ * An export may fail when the header is malformed or objects are
+ * inaccessible; the user then has to fall back to listing the RADOS
+ * objects and extracting them by hand with the ``rados`` CLI.
+ *
+ * @param path dump file to write (export) or read (import)
+ * @param import true to import from `path`, false to export
+ * @param force passed through to the import
+ * @returns result of the dump/undump, or a negative error code
+ */
+int JournalTool::journal_export(std::string const &path, bool import, bool force)
+{
+  JournalScanner js(input, rank, type);
+
+  if (!import) {
+    // Before exporting, make sure the header is valid and that no
+    // objects are missing
+    const int scan_r = js.scan();
+    if (scan_r < 0) {
+      derr << "Unable to scan journal, assuming badly damaged" << dendl;
+      return scan_r;
+    }
+    if (!js.is_readable()) {
+      derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
+      return -EIO;
+    }
+  }
+
+  // The journal is assumed to be cleanly readable from here on
+  Dumper dumper;
+  const int init_r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type);
+  if (init_r < 0) {
+    derr << "dumper::init failed: " << cpp_strerror(init_r) << dendl;
+    return init_r;
+  }
+
+  if (import) {
+    return dumper.undump(path.c_str(), force);
+  }
+
+  // Exports get a per-rank file name when running for all ranks
+  const std::string ex_path = gen_dump_file_path(path);
+  return dumper.dump(ex_path.c_str());
+}
+
+
+/**
+ * Reset the journal of the current rank through a Resetter.
+ *
+ * @param hard when true run Resetter::reset_hard(), otherwise
+ *             Resetter::reset()
+ * @returns the reset result, or the negative error from Resetter::init
+ */
+int JournalTool::journal_reset(bool hard)
+{
+  Resetter resetter;
+  const int init_r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard);
+  if (init_r < 0) {
+    derr << "resetter::init failed: " << cpp_strerror(init_r) << dendl;
+    return init_r;
+  }
+
+  if (hard) {
+    return resetter.reset_hard();
+  }
+  return resetter.reset();
+}
+
+
+/**
+ * Selective offline replay which only reads out dentries and writes
+ * them to the backing store iff their version is > what is currently
+ * in the backing store (when other_pool is set they are always written).
+ *
+ * In order to write dentries we may create the enclosing dirfrag objects.
+ * Root inodes are rewritten too when missing, older, or bad-magic.
+ *
+ * Test this by running scavenge on an unflushed journal, then nuking
+ * it offline, then starting an MDS and seeing that the dentries are
+ * visible.
+ *
+ * @param metablob an EMetaBlob retrieved from the journal
+ * @param dry_run if true, do no writes (or key removals) to RADOS
+ * @param consumed_inos output (must be non-NULL), gains every ino written
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::recover_dentries(
+    EMetaBlob const &metablob,
+    bool const dry_run,
+    std::set<inodeno_t> *consumed_inos)
+{
+  ceph_assert(consumed_inos != NULL);
+
+  int r = 0;
+
+  // Replay each dirlump: fnode first, then full, remote and null dentries
+  for (const auto& frag : metablob.lump_order) {
+    EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
+    lump._decode_bits();
+    object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
+
+    dout(4) << "inspecting lump " << frag_oid.name << dendl;
+
+
+    // We will record old fnode version for use in hard link handling
+    // If we don't read an old fnode, take version as zero and write in
+    // all hardlinks we find.
+    version_t old_fnode_version = 0;
+
+    // Update fnode in omap header of dirfrag object
+    bool write_fnode = false;
+    bufferlist old_fnode_bl;
+    r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
+    if (r == -ENOENT) {
+      // Creating dirfrag from scratch
+      dout(4) << "failed to read OMAP header from directory fragment "
+        << frag_oid.name << " " << cpp_strerror(r) << dendl;
+      write_fnode = true;
+      // Note: creating the dirfrag *without* a backtrace, relying on
+      // MDS to regenerate backtraces on read or in FSCK
+    } else if (r == 0) {
+      // Conditionally update existing omap header
+      fnode_t old_fnode;
+      auto old_fnode_iter = old_fnode_bl.cbegin();
+      try {
+        old_fnode.decode(old_fnode_iter);
+        dout(4) << "frag " << frag_oid.name << " fnode old v" <<
+          old_fnode.version << " vs new v" << lump.fnode->version << dendl;
+        old_fnode_version = old_fnode.version;
+        write_fnode = old_fnode_version < lump.fnode->version;
+      } catch (const buffer::error &err) {
+        dout(1) << "frag " << frag_oid.name
+                << " is corrupt, overwriting" << dendl;
+        write_fnode = true;
+      }
+    } else {
+      // Unexpected error
+      dout(4) << "failed to read OMAP header from directory fragment "
+        << frag_oid.name << " " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if ((other_pool || write_fnode) && !dry_run) {
+      dout(4) << "writing fnode to omap header" << dendl;
+      bufferlist fnode_bl;
+      lump.fnode->encode(fnode_bl);
+      if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
+	r = output.omap_set_header(frag_oid.name, fnode_bl);
+      }
+      if (r != 0) {
+        derr << "Failed to write fnode for frag object "
+             << frag_oid.name << dendl;
+        return r;
+      }
+    }
+
+    std::set<std::string> read_keys;
+
+    // Compose list of potentially-existing dentries we would like to fetch
+    for (const auto& fb : lump.get_dfull()) {
+      // Get a key like "foobar_head"
+      std::string key;
+      dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+      dn_key.encode(key);
+      read_keys.insert(key);
+    }
+
+    for(const auto& rb : lump.get_dremote()) {
+      // Get a key like "foobar_head"
+      std::string key;
+      dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+      dn_key.encode(key);
+      read_keys.insert(key);
+    }
+
+    for (const auto& nb : lump.get_dnull()) {
+      // Get a key like "foobar_head"
+      std::string key;
+      dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+      dn_key.encode(key);
+      read_keys.insert(key);
+    }
+
+    // Perform bulk read of existing dentries
+    std::map<std::string, bufferlist> read_vals;
+    r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+    if (r == -ENOENT && other_pool) {
+      r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+    }
+    if (r != 0) {
+      derr << "unexpected error reading fragment object "
+           << frag_oid.name << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    // Compose list of dentries we will write back
+    std::map<std::string, bufferlist> write_vals;
+    for (const auto& fb : lump.get_dfull()) {
+      // Get a key like "foobar_head"
+      std::string key;
+      dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+      dn_key.encode(key);
+
+      dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
+        << dendl;
+      bool write_dentry = false;
+      if (read_vals.find(key) == read_vals.end()) {
+        dout(4) << "dentry did not already exist, will create" << dendl;
+        write_dentry = true;
+      } else {
+        dout(4) << "dentry " << key << " existed already" << dendl;
+        dout(4) << "dentry exists, checking versions..." << dendl;
+        bufferlist &old_dentry = read_vals[key];
+        // Decode dentry+inode
+        auto q = old_dentry.cbegin();
+
+        snapid_t dnfirst;
+        decode(dnfirst, q);
+        char dentry_type;
+        decode(dentry_type, q);
+
+        if (dentry_type == 'L' || dentry_type == 'l') {
+          // A hardlink dentry carries no inode version to compare
+          // against, so fall back to the dirfrag's fnode version to
+          // decide whether this fullbit may replace it
+          dout(10) << "Existing remote inode in slot to be (maybe) written "
+               << "by a full inode from the journal dn '" << fb.dn.c_str()
+               << "' with lump fnode version " << lump.fnode->version
+               << "vs existing fnode version " << old_fnode_version << dendl;
+          write_dentry = old_fnode_version < lump.fnode->version;
+        } else if (dentry_type == 'I' || dentry_type == 'i') {
+          // Read out inode version to compare with backing store
+          InodeStore inode;
+          if (dentry_type == 'i') {
+            mempool::mds_co::string alternate_name;
+
+            DECODE_START(2, q);
+            if (struct_v >= 2)
+              decode(alternate_name, q);
+            inode.decode(q);
+            DECODE_FINISH(q);
+	  } else {
+            inode.decode_bare(q);
+	  }
+          dout(4) << "decoded embedded inode version "
+            << inode.inode->version << " vs fullbit version "
+            << fb.inode->version << dendl;
+          if (inode.inode->version < fb.inode->version) {
+            write_dentry = true;
+          }
+        } else {
+          dout(4) << "corrupt dentry in backing store, overwriting from "
+            "journal" << dendl;
+          write_dentry = true;
+        }
+      }
+
+      if ((other_pool || write_dentry) && !dry_run) {
+        dout(4) << "writing I dentry " << key << " into frag "
+          << frag_oid.name << dendl;
+
+        // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+        bufferlist dentry_bl;
+        encode(fb.dnfirst, dentry_bl);
+        encode('I', dentry_bl);
+        encode_fullbit_as_inode(fb, true, &dentry_bl);
+
+        // Record for writing to RADOS
+        write_vals[key] = dentry_bl;
+        consumed_inos->insert(fb.inode->ino);
+      }
+    }
+
+    for(const auto& rb : lump.get_dremote()) {
+      // Get a key like "foobar_head"
+      std::string key;
+      dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+      dn_key.encode(key);
+
+      dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
+        << dendl;
+      bool write_dentry = false;
+      if (read_vals.find(key) == read_vals.end()) {
+        dout(4) << "dentry did not already exist, will create" << dendl;
+        write_dentry = true;
+      } else {
+        dout(4) << "dentry " << key << " existed already" << dendl;
+        dout(4) << "dentry exists, checking versions..." << dendl;
+        bufferlist &old_dentry = read_vals[key];
+        // Decode dentry+inode
+        auto q = old_dentry.cbegin();
+
+        snapid_t dnfirst;
+        decode(dnfirst, q);
+        char dentry_type;
+        decode(dentry_type, q);
+
+        if (dentry_type == 'L' || dentry_type == 'l') {
+          dout(10) << "Existing hardlink inode in slot to be (maybe) written "
+               << "by a remote inode from the journal dn '" << rb.dn.c_str()
+               << "' with lump fnode version " << lump.fnode->version
+               << "vs existing fnode version " << old_fnode_version << dendl;
+          write_dentry = old_fnode_version < lump.fnode->version;
+        } else if (dentry_type == 'I' || dentry_type == 'i') {
+          dout(10) << "Existing full inode in slot to be (maybe) written "
+               << "by a remote inode from the journal dn '" << rb.dn.c_str()
+               << "' with lump fnode version " << lump.fnode->version
+               << "vs existing fnode version " << old_fnode_version << dendl;
+          write_dentry = old_fnode_version < lump.fnode->version;
+        } else {
+          dout(4) << "corrupt dentry in backing store, overwriting from "
+            "journal" << dendl;
+          write_dentry = true;
+        }
+      }
+
+      if ((other_pool || write_dentry) && !dry_run) {
+        dout(4) << "writing L dentry " << key << " into frag "
+          << frag_oid.name << dendl;
+
+        // Compose: remote dentry format is dnfirst, 'L', ino, d_type
+        bufferlist dentry_bl;
+        encode(rb.dnfirst, dentry_bl);
+        encode('L', dentry_bl);
+        encode(rb.ino, dentry_bl);
+        encode(rb.d_type, dentry_bl);
+
+        // Record for writing to RADOS
+        write_vals[key] = dentry_bl;
+        consumed_inos->insert(rb.ino);
+      }
+    }
+
+    std::set<std::string> null_vals;
+    for (const auto& nb : lump.get_dnull()) {
+      std::string key;
+      dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+      dn_key.encode(key);
+
+      dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
+	<< dendl;
+
+      auto it = read_vals.find(key);
+      if (it != read_vals.end()) {
+	dout(4) << "dentry exists, will remove" << dendl;
+
+	auto q = it->second.cbegin();
+	snapid_t dnfirst;
+	decode(dnfirst, q);
+	char dentry_type;
+	decode(dentry_type, q);
+
+	bool remove_dentry = false;
+	if (dentry_type == 'L' || dentry_type == 'l') {
+	  dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
+	    << "by null journal dn '" << nb.dn.c_str()
+	    << "' with lump fnode version " << lump.fnode->version
+	    << "vs existing fnode version " << old_fnode_version << dendl;
+	  remove_dentry = old_fnode_version < lump.fnode->version;
+	} else if (dentry_type == 'I' || dentry_type == 'i') {
+	  dout(10) << "Existing full inode in slot to be (maybe) removed "
+	    << "by null journal dn '" << nb.dn.c_str()
+	    << "' with lump fnode version " << lump.fnode->version
+	    << "vs existing fnode version " << old_fnode_version << dendl;
+	  remove_dentry = old_fnode_version < lump.fnode->version;
+	} else {
+	  dout(4) << "corrupt dentry in backing store, will remove" << dendl;
+	  remove_dentry = true;
+	}
+	// Honour dry_run: every key queued here is removed from RADOS below
+	if (remove_dentry && !dry_run)
+	  null_vals.insert(key);
+      }
+    }
+
+    // Write back any new/changed dentries
+    if (!write_vals.empty()) {
+      r = output.omap_set(frag_oid.name, write_vals);
+      if (r != 0) {
+	derr << "error writing dentries to " << frag_oid.name
+	     << ": " << cpp_strerror(r) << dendl;
+	return r;
+      }
+    }
+
+    // remove any null dentries
+    if (!null_vals.empty()) {
+      r = output.omap_rm_keys(frag_oid.name, null_vals);
+      if (r != 0) {
+	derr << "error removing dentries from " << frag_oid.name
+	  << ": " << cpp_strerror(r) << dendl;
+	return r;
+      }
+    }
+  }
+
+  /* Now that we've looked at the dirlumps, we finally pay attention to
+   * the roots (i.e. inodes without ancestry).  This is necessary in order
+   * to pick up dirstat updates on ROOT_INO.  dirstat updates are functionally
+   * important because clients use them to infer completeness
+   * of directories
+   */
+  for (const auto& fb : metablob.roots) {
+    inodeno_t ino = fb.inode->ino;
+    dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
+
+    object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+    dout(4) << "object id " << root_oid.name << dendl;
+
+    bool write_root_ino = false;
+    bufferlist old_root_ino_bl;
+    r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
+    if (r == -ENOENT) {
+      dout(4) << "root does not exist, will create" << dendl;
+      write_root_ino = true;
+    } else if (r >= 0) {
+      r = 0;  // a successful read is non-negative; reset so we can return 0
+      InodeStore old_inode;
+      dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
+        << ")" << dendl;
+      auto inode_bl_iter = old_root_ino_bl.cbegin();
+      std::string magic;
+      decode(magic, inode_bl_iter);
+      if (magic == CEPH_FS_ONDISK_MAGIC) {
+        dout(4) << "magic ok" << dendl;
+        old_inode.decode(inode_bl_iter);
+
+        if (old_inode.inode->version < fb.inode->version) {
+          write_root_ino = true;
+        }
+      } else {
+        dout(4) << "magic bad: '" << magic << "'" << dendl;
+        write_root_ino = true;
+      }
+    } else {
+      derr << "error reading root inode object " << root_oid.name
+            << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if (write_root_ino && !dry_run) {
+      dout(4) << "writing root ino " << root_oid.name
+               << " version " << fb.inode->version << dendl;
+
+      // Compose: root ino format is magic,InodeStore(bare=false)
+      bufferlist new_root_ino_bl;
+      encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
+      encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
+
+      // Write to RADOS
+      r = output.write_full(root_oid.name, new_root_ino_bl);
+      if (r != 0) {
+        derr << "error writing inode object " << root_oid.name
+              << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    }
+  }
+
+  return r;
+}
+
+
+/**
+ * Erase a region of the log by overwriting it with a single padding entry:
+ * an ENoOp for "mdlog", a padded PurgeItem for "purge_queue".
+ */
+int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
+{
+  const bool is_mdlog = (type == "mdlog");
+  const bool is_purge_queue = !is_mdlog && (type == "purge_queue");
+
+  // Encode an unpadded entry first: its size, plus our preamble and our
+  // trailing start ptr, tells us how much padding fills the region exactly.
+  bufferlist probe;
+  if (is_mdlog) {
+    ENoOp empty_noop(0);
+    empty_noop.encode_with_header(probe, CEPH_FEATURES_SUPPORTED_DEFAULT);
+  } else if (is_purge_queue) {
+    PurgeItem empty_item;
+    empty_item.encode(probe);
+  }
+
+  dout(4) << "erase_region " << pos << " len=" << length << dendl;
+
+  // FIXME: get the preamble/postamble length via JournalStream
+  int32_t padding = length - probe.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
+  dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
+
+  if (padding < 0) {
+    derr << "Erase region " << length << " too short" << dendl;
+    return -EINVAL;
+  }
+
+  // Build the real entry, now carrying the computed amount of padding
+  bufferlist entry;
+  if (is_mdlog) {
+    ENoOp padded_noop(padding);
+    padded_noop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
+  } else if (is_purge_queue) {
+    PurgeItem padded_item;
+    padded_item.pad_size = padding;
+    padded_item.encode(entry);
+  }
+
+  // Serialize the entry as a region of log stream starting at pos
+  JournalStream stream(JOURNAL_FORMAT_RESILIENT);
+  bufferlist log_data;
+  stream.write(entry, &log_data, pos);
+
+  dout(4) << "erase_region data length " << log_data.length() << dendl;
+  ceph_assert(log_data.length() == length);
+
+  // Write log stream region to RADOS
+  // FIXME: get object size somewhere common to scan_events
+  uint32_t object_size = g_conf()->mds_log_segment_size;
+  if (object_size == 0) {
+    // Default layout object size
+    object_size = file_layout_t::get_default().object_size;
+  }
+
+  // Walk the objects the region spans, writing one slice into each
+  uint64_t write_offset = pos;
+  for (uint64_t obj_index = pos / object_size; log_data.length() > 0; ++obj_index) {
+    std::string const oid = js.obj_name(obj_index);
+    uint32_t const offset_in_obj = write_offset % object_size;
+    uint32_t const room_in_obj = object_size - offset_in_obj;
+    uint32_t const write_len =
+      log_data.length() < room_in_obj ? log_data.length() : room_in_obj;
+
+    int const r = output.write(oid, log_data, write_len, offset_in_obj);
+    if (r < 0) {
+      return r;
+    }
+    dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
+    log_data.splice(0, write_len);
+    write_offset += write_len;
+  }
+
+  return 0;
+}
+
+/**
+ * Serialize the inode carried by an EMetaBlob::fullbit in the format used
+ * by InodeStore (i.e. the backing store format), encoding it into out_bl.
+ *
+ * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
+ * on an offline InodeStore instance.  It's way simpler, because we are just
+ * uncritically hauling the data between structs.
+ *
+ * @param fb a fullbit extracted from a journal entry
+ * @param bare if true, leave out [EN|DE]CODE_START decoration
+ * @param out_bl output (asserted non-null), the serialized inode is
+ *               encoded into this bufferlist
+ */
+void JournalTool::encode_fullbit_as_inode(
+  const EMetaBlob::fullbit &fb,
+  const bool bare,
+  bufferlist *out_bl)
+{
+  ceph_assert(out_bl != nullptr);
+
+  // Carry each field of the fullbit over into a scratch InodeStore
+  InodeStore store;
+  store.inode = fb.inode;
+  store.symlink = fb.symlink;
+  store.xattrs = fb.xattrs;
+  store.old_inodes = fb.old_inodes;
+  store.dirfragtree = fb.dirfragtree;
+  store.snap_blob = fb.snapbl;
+
+  // Pick the encoder matching the requested form
+  if (!bare) {
+    store.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    return;
+  }
+  store.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+}
+
+/**
+ * Given a list of inode numbers known to be in use by
+ * inodes in the backing store, ensure that none of these
+ * numbers are listed as free in the InoTables in the
+ * backing store.
+ *
+ * Used after injecting inodes into the backing store, to
+ * ensure that the same inode numbers are not subsequently
+ * used for new files during ordinary operation.
+ *
+ * Every assigned rank's table is attempted even if an earlier one fails;
+ * the first error seen is the one returned.
+ *
+ * @param inos inode numbers to be removed from free lists in InoTables
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
+{
+  int r = 0;
+
+  // InoTable is a per-MDS structure, so iterate over assigned ranks
+  auto fs = fsmap->get_filesystem(role_selector.get_ns());
+  std::set<mds_rank_t> in_ranks;
+  fs->mds_map.get_mds_set(in_ranks);
+
+  for (const auto& rank : in_ranks) {
+    // Compose object name
+    std::ostringstream oss;
+    oss << "mds" << rank << "_inotable";
+    object_t inotable_oid = object_t(oss.str());
+
+    // Read object
+    bufferlist inotable_bl;
+    int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
+    if (read_r < 0) {
+      // Things are really bad if we can't read inotable.  Beyond our powers.
+      derr << "unable to read inotable '" << inotable_oid.name << "': "
+        << cpp_strerror(read_r) << dendl;
+      r = r ? r : read_r;
+      continue;
+    }
+
+    // Deserialize InoTable
+    version_t inotable_ver;
+    auto q = inotable_bl.cbegin();
+    decode(inotable_ver, q);
+    InoTable ino_table(NULL);
+    ino_table.decode(q);
+
+    // Update InoTable in memory; force_consume() returning true is taken
+    // to mean the table changed and must be written back
+    bool inotable_modified = false;
+    for (const inodeno_t ino : inos) {
+      if (ino_table.force_consume(ino)) {
+        dout(4) << "Used ino 0x" << std::hex << ino << std::dec
+          << " requires inotable update" << dendl;
+        inotable_modified = true;
+      }
+    }
+
+    // Serialize and write InoTable
+    if (inotable_modified) {
+      inotable_ver += 1;
+      dout(4) << "writing modified inotable version " << inotable_ver << dendl;
+      bufferlist inotable_new_bl;
+      encode(inotable_ver, inotable_new_bl);
+      ino_table.encode_state(inotable_new_bl);
+      int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
+      if (write_r != 0) {
+        derr << "error writing modified inotable " << inotable_oid.name
+          << ": " << cpp_strerror(write_r) << dendl;
+        // Record the write failure itself: read_r is non-negative on this
+        // path, so propagating it would never report an error code.
+        r = r ? r : write_r;
+        continue;
+      }
+    }
+  }
+
+  return r;
+}
+
diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h
new file mode 100644
index 000000000..8d610a866
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+
+#include "JournalFilter.h"
+
+class JournalScanner;
+
+
+/**
+ * Command line tool for investigating and repairing filesystems
+ * with damaged metadata logs
+ */
+class JournalTool : public MDSUtility
+{
+  private:
+    MDSRoleSelector role_selector;
+    // Bit hacky, use this `rank` member to control behaviour of the
+    // various main_ functions.
+    mds_rank_t rank;
+    // when set, generate per rank dump file path
+    bool all_ranks = false;
+   
+    std::string type;  // journal kind; compared against "mdlog" and "purge_queue"
+
+    // Entry points
+    int main_journal(std::vector<const char*> &argv);
+    int main_header(std::vector<const char*> &argv);
+    int main_event(std::vector<const char*> &argv);
+
+    // Shared functionality
+    int recover_journal();
+
+    // Journal operations
+    int journal_inspect();
+    int journal_export(std::string const &path, bool import, bool force);
+    int journal_reset(bool hard);  // hard selects reset_hard() over reset()
+
+    // Header operations
+    int header_set();
+
+    // I/O handles
+    librados::Rados rados;
+    librados::IoCtx input;   // the implementation reads existing objects through this
+    librados::IoCtx output;  // the implementation writes and removes through this
+
+    bool other_pool;  // when true, recover_dentries() writes regardless of versions
+
+    // Metadata backing store manipulation
+    int read_lost_found(std::set<std::string> &lost);
+    int recover_dentries(
+        EMetaBlob const &metablob,
+        bool const dry_run,
+        std::set<inodeno_t> *consumed_inos);  // output; must not be NULL
+
+    // Splicing: overwrite `length` bytes of log at `pos` with a padding entry
+    int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length);
+
+    // Backing store helpers
+    void encode_fullbit_as_inode(
+        const EMetaBlob::fullbit &fb,
+        const bool bare,
+        bufferlist *out_bl);
+    int consume_inos(const std::set<inodeno_t> &inos);
+
+    // validate type
+    int validate_type(const std::string &type);
+
+    // generate output file path for dump/export
+    std::string gen_dump_file_path(const std::string &prefix);
+
+    // check if an operation (mode, command) is safe to be
+    // executed on all ranks.
+    bool can_execute_for_all_ranks(const std::string &mode,
+                                   const std::string &command);
+  public:
+    static void usage();
+    JournalTool() :
+      rank(0), other_pool(false) {}
+    int main(std::vector<const char*> &argv);
+};
+
diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc
new file mode 100644
index 000000000..54386d219
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.cc
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "mon/MonClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+MDSUtility::MDSUtility() :
+  Dispatcher(g_ceph_context),
+  objecter(NULL),  // the real Objecter is allocated in the body below
+  finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"),
+  waiting_for_mds_map(NULL),  // set by init() while it waits for the FSMap
+  inited(false)  // becomes true at the end of a successful init()
+{
+  monc = new MonClient(g_ceph_context, poolctx);  // deleted in ~MDSUtility()
+  messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
+  fsmap = new FSMap();
+  objecter = new Objecter(g_ceph_context, messenger, monc, poolctx);  // needs both of the above
+}
+
+
+MDSUtility::~MDSUtility()
+{
+  if (inited) {  // only a successful init() sets this flag
+    shutdown();
+  }
+  delete objecter;  // allocated in the constructor, as are the three below
+  delete monc;
+  delete messenger;
+  delete fsmap;
+  ceph_assert(waiting_for_mds_map == NULL);  // handle_fs_map() clears it once the map arrives
+}
+
+
+int MDSUtility::init()
+{
+  // Start the I/O context pool and the Messenger
+  poolctx.start(1);
+  messenger->start();
+
+  objecter->set_client_incarnation(0);
+  objecter->init();
+
+  // Connect dispatchers before starting objecter
+  messenger->add_dispatcher_tail(objecter);
+  messenger->add_dispatcher_tail(this);
+
+  // Initialize MonClient
+  if (monc->build_initial_monmap() < 0) {
+    objecter->shutdown();
+    messenger->shutdown();
+    messenger->wait();
+    return -1;  // NOTE(review): generic -1, the code from build_initial_monmap() is dropped
+  }
+
+  monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS);
+  monc->set_messenger(messenger);
+  monc->init();
+  int r = monc->authenticate();
+  if (r < 0) {
+    derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl;
+    monc->shutdown();
+    objecter->shutdown();
+    messenger->shutdown();
+    messenger->wait();
+    return r;
+  }
+
+  client_t whoami = monc->get_global_id();
+  messenger->set_myname(entity_name_t::CLIENT(whoami.v));
+
+  // Start Objecter and wait for OSD map
+  objecter->start();
+  objecter->wait_for_osd_map();
+
+  // Prepare to receive MDS map and request it
+  ceph::mutex init_lock = ceph::make_mutex("MDSUtility:init");
+  ceph::condition_variable cond;
+  bool done = false;  // the wait below blocks until this becomes true
+  ceph_assert(!fsmap->get_epoch());
+  lock.lock();  // ms_dispatch() holds this same lock while it calls handle_fs_map()
+  waiting_for_mds_map = new C_SafeCond(init_lock, cond, &done, NULL);  // completed by handle_fs_map()
+  lock.unlock();
+  monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME);
+  monc->renew_subs();
+
+  // Wait for MDS map
+  dout(4) << "waiting for MDS map..." << dendl;
+  {
+    std::unique_lock locker{init_lock};
+    cond.wait(locker, [&done] { return done; });
+  }
+  dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl;
+
+  finisher.start();
+
+  inited = true;  // lets the destructor know shutdown() is required
+  return 0;
+}
+
+
+void MDSUtility::shutdown()
+{
+  finisher.stop();  // started at the end of init()
+
+  lock.lock();  // the objecter is shut down while holding the dispatch lock
+  objecter->shutdown();
+  lock.unlock();
+  monc->shutdown();
+  messenger->shutdown();
+  messenger->wait();
+  poolctx.finish();  // counterpart of poolctx.start(1) in init()
+}
+
+
+bool MDSUtility::ms_dispatch(Message *m)
+{
+  std::lock_guard locker{lock};
+  const auto msg_type = m->get_type();
+  if (msg_type == CEPH_MSG_FS_MAP) {
+    handle_fs_map((MFSMap*)m);
+  } else if (msg_type != CEPH_MSG_OSD_MAP) {
+    // Unhandled type: report it as not dispatched, leaving the ref alone
+    return false;
+  }
+  // FS maps were handled above and OSD maps are deliberately ignored;
+  // in both cases the message is consumed here, so drop our reference.
+  m->put();
+  return true;
+}
+
+
+void MDSUtility::handle_fs_map(MFSMap* m)
+{
+  *fsmap = m->get_fsmap();  // keep our own copy of the received map
+  if (waiting_for_mds_map == nullptr)
+    return;  // no completion is registered for the map
+  waiting_for_mds_map->complete(0);
+  waiting_for_mds_map = nullptr;
+}
+
+
diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h
new file mode 100644
index 000000000..09f1918ba
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MDS_UTILITY_H_
+#define MDS_UTILITY_H_
+
+#include "osdc/Objecter.h"
+#include "mds/FSMap.h"
+#include "messages/MFSMap.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "auth/Auth.h"
+#include "common/async/context_pool.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+
+/// MDS Utility
+/**
+ * This class is the parent for MDS utilities, i.e. classes that
+ * need access to the objects belonging to the MDS without actually
+ * acting as an MDS daemon themselves.
+ */
+class MDSUtility : public Dispatcher {
+protected:
+  Objecter *objecter;    // the implementation allocates these four in the
+  FSMap *fsmap;          // constructor and deletes them in the destructor
+  Messenger *messenger;
+  MonClient *monc;
+
+  ceph::mutex lock = ceph::make_mutex("MDSUtility::lock");  // the implementation holds this in ms_dispatch()
+  Finisher finisher;
+  ceph::async::io_context_pool poolctx;
+
+  Context *waiting_for_mds_map;  // completion the implementation fires when an FSMap arrives
+
+  bool inited;  // true once init() has succeeded
+public:
+  MDSUtility();
+  ~MDSUtility() override;
+
+  void handle_fs_map(MFSMap* m);
+  bool ms_dispatch(Message *m) override;
+  bool ms_handle_reset(Connection *con) override { return false; }
+  void ms_handle_remote_reset(Connection *con) override {}
+  bool ms_handle_refused(Connection *con) override { return false; }
+  int init();       // returns 0 on success, negative on failure
+  void shutdown();  // the implementation's destructor calls this when inited is set
+};
+
+#endif /* MDS_UTILITY_H_ */
diff --git a/src/tools/cephfs/MetaTool.cc b/src/tools/cephfs/MetaTool.cc
new file mode 100644
index 000000000..baa0d498a
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.cc
@@ -0,0 +1,1000 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#include <string.h>
+#include <exception>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <sstream>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+#include "mds/CDentry.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "mds/JournalPointer.h"
+// #include "JournalScanner.h"
+// #include "EventOutput.h"
+// #include "Dumper.h"
+// #include "Resetter.h"
+
+// #include "JournalTool.h"
+#include "MetaTool.h"
+#include "type_helper.hpp"
+#include "include/object.h"
+
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(unsigned char)
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+using namespace std;
+
+/**
+ * Free everything this meta_op holds: every inode_meta_t stored in
+ * `inodes` and every sub_op still on the `sub_ops` stack.
+ *
+ * The inode map is emptied after its values are deleted so that no
+ * dangling pointers remain and calling release() a second time is
+ * harmless.
+ */
+void MetaTool::meta_op::release()
+{
+  for (const auto& entry : inodes) {
+    delete entry.second;
+  }
+  // the pointers above are dead now; do not leave them reachable
+  inodes.clear();
+
+  while (!sub_ops.empty()) {
+    delete sub_ops.top();
+    sub_ops.pop();
+  }
+}
+
+/**
+ * Fill this object from a parsed JSON document.
+ *
+ * Reads the fields "snapid_t", "itype" and "store" from @p obj. The
+ * InodeStore is allocated on first use if this object does not hold
+ * one yet.
+ *
+ * @param obj JSON object to read from
+ */
+void MetaTool::inode_meta_t::decode_json(JSONObj *obj)
+{
+  // scratch integer shared by the two numeric fields
+  unsigned long long raw;
+
+  JSONDecoder::decode_json("snapid_t", raw, obj, true);
+  _f.val = raw;
+
+  JSONDecoder::decode_json("itype", raw, obj, true);
+  _t = raw;
+
+  if (!_i) {
+    _i = new InodeStore;
+  }
+  JSONDecoder::decode_json("store", *_i, obj, true);
+}
+
+/**
+ * Print usage information; this simply forwards to
+ * generic_client_usage().
+ */
+void MetaTool::usage()
+{
+  generic_client_usage();
+}
+
+/**
+ * Connect to RADOS, open the metadata and data pools and run the
+ * requested operation.
+ *
+ * Two ways of locating the pools are supported:
+ *  - normal mode: the role(s) described by @p rank_str are resolved
+ *    through the FSMap and the operation is run once per rank;
+ *  - manual mode: @p minfo carries metadata pool, data pool and rank
+ *    (split by string_split()); it is used only when all three parts
+ *    are non-empty.
+ *
+ * @param mode     operation name, forwarded to process()
+ * @param rank_str role specification for normal mode
+ * @param minfo    manual pool/rank description, may be empty
+ * @param ino      inode number as text, forwarded to process()
+ * @param out      output file name, forwarded to process()
+ * @param in       input file name, forwarded to process()
+ * @param confirm  forwarded to process()
+ * @return 0 on success, a negative value on failure; in normal mode
+ *         the result of the last processed rank is returned
+ */
+int MetaTool::main(string& mode,
+                   string& rank_str,
+                   string& minfo,
+                   string& ino,
+                   string& out,
+                   string& in,
+                   bool confirm
+                   )
+{
+  int r = 0;
+
+  std::string manual_meta_pool;
+  std::string manual_data_pool;
+  std::string manual_rank_num;
+  bool manual_mode = false;
+  if (minfo != "") {
+    vector<string> v;
+    string_split(minfo, v);
+    manual_meta_pool = v.size() >= 1 ? v[0] : "";
+    manual_data_pool = v.size() >= 2 ? v[1] : "";
+    manual_rank_num = v.size() >= 3 ? v[2] : "";
+    std::cout << "("<< minfo<< ")=>"
+              << " mpool: " << manual_meta_pool
+              << " dpool: " << manual_data_pool
+              << " rank: " << manual_rank_num
+              << std::endl;
+    if (!manual_meta_pool.empty() && !manual_data_pool.empty() && !manual_rank_num.empty()) {
+      std::cout << "you specify rank: " << manual_rank_num
+                << " mpool: " << manual_meta_pool
+                << " dpool: " << manual_data_pool
+                << "\nstart manual mode!!"<< std::endl;
+      manual_mode = true;
+    }
+  }
+
+  // RADOS init
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    cerr << "RADOS unavailable" << std::endl;
+    return r;
+  }
+
+  if (_debug)
+    cout << "MetaTool: connecting to RADOS..." << std::endl;
+  r = rados.connect();
+  if (r < 0) {
+    cerr << "couldn't connect to cluster: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (!manual_mode) {
+    r = role_selector.parse(*fsmap, rank_str);
+    if (r != 0) {
+      cerr << "Couldn't determine MDS rank." << std::endl;
+      return r;
+    }
+
+    auto fs = fsmap->get_filesystem(role_selector.get_ns());
+    // report a missing filesystem instead of relying on assert(),
+    // which is compiled out when NDEBUG is defined
+    if (fs == nullptr) {
+      cerr << "Couldn't find the selected filesystem in the FSMap." << std::endl;
+      return -ENOENT;
+    }
+
+    // prepare io for meta pool
+    int64_t const pool_id = fs->mds_map.get_metadata_pool();
+    features = fs->mds_map.get_up_features();
+    if (features == 0)
+      features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+    else if (features != CEPH_FEATURES_SUPPORTED_DEFAULT) {
+      cout << "I think we need to check the feature! : " << features << std::endl;
+      return -1;
+    }
+
+    std::string pool_name;
+    r = rados.pool_reverse_lookup(pool_id, &pool_name);
+    if (r < 0) {
+      cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl;
+      return r;
+    }
+
+    if (_debug)
+      cout << "MetaTool: creating IoCtx.." << std::endl;
+    r = rados.ioctx_create(pool_name.c_str(), io_meta);
+    if (r < 0) {
+      cerr << "couldn't create IoCtx for pool " << pool_name
+           << ": " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    output.dup(io_meta);
+
+    // prepare io for data pool
+    for (const auto p : fs->mds_map.get_data_pools()) {
+      r = rados.pool_reverse_lookup(p, &pool_name);
+      if (r < 0) {
+        // report the data pool that failed, not the metadata pool
+        cerr << "Pool " << p << " named in MDS map not found in RADOS!" << std::endl;
+        return r;
+      }
+      librados::IoCtx* io_data = new librados::IoCtx;
+      r = rados.ioctx_create(pool_name.c_str(), *io_data);
+      if (r < 0) {
+        cerr << "couldn't create IoCtx for pool " << pool_name
+             << ": " << cpp_strerror(r) << std::endl;
+        delete io_data;
+        return r;
+      }
+      io_data_v.push_back(io_data);
+    }
+
+    for (auto role : role_selector.get_roles()) {
+      rank = role.rank;
+
+      r =  process(mode, ino, out, in, confirm);
+      cout << "executing for rank " << rank << " op[" <<mode<< "] ret : " << r << std::endl;
+    }
+
+  } else {
+    features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+    r = rados.ioctx_create(manual_meta_pool.c_str(), io_meta);
+    if (r < 0) {
+      cerr << "couldn't create IoCtx for pool " << manual_meta_pool
+           << ": " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+
+    librados::IoCtx* io_data = new librados::IoCtx;
+    r = rados.ioctx_create(manual_data_pool.c_str(), *io_data);
+    if (r < 0) {
+      cerr << "couldn't create IoCtx for pool " << manual_data_pool
+           << ": " << cpp_strerror(r) << std::endl;
+      delete io_data;
+      return r;
+    }
+    io_data_v.push_back(io_data);
+
+
+    rank = conv_t<int>(manual_rank_num);
+    r = process(mode, ino, out, in, confirm);
+    cout << "op[" << mode << "] ret : " << r << std::endl;
+  }
+  return r;
+}
+
+/**
+ * Dispatch on the operation name.
+ *
+ * Recognised values of @p mode are "showm", "showfn", "listc",
+ * "amend" and "amendfn"; anything else is reported on stderr.
+ *
+ * @return the result of the selected operation, or -EINVAL for an
+ *         unrecognised mode
+ */
+int MetaTool::process(string& mode, string& ino, string out, string in, bool confirm)
+{
+  if (mode == "showm")
+    return show_meta_info(ino, out);
+  if (mode == "showfn")
+    return show_fnode(ino, out);
+  if (mode == "listc")
+    return list_meta_info(ino, out);
+  if (mode == "amend")
+    return amend_meta_info(ino, in, confirm);
+  if (mode == "amendfn")
+    return amend_fnode(in, confirm);
+
+  cerr << "bad command '" << mode << "'" << std::endl;
+  return -EINVAL;
+}
+/**
+ * Show the fnode header(s) of the directory with inode number @p ino.
+ *
+ * @param ino inode number as text (base auto-detected by stoull);
+ *            "0" and unparsable text are rejected
+ * @param out output file name handed to the meta_op
+ * @return result of op_process(), or -EINVAL when @p ino is rejected
+ */
+int MetaTool::show_fnode(string& ino, string& out)
+{
+  unsigned long long parsed = 0;
+  bool valid = (ino != "0");
+  if (valid) {
+    try {
+      parsed = std::stoull(ino, nullptr, 0);
+    } catch (const std::exception&) {
+      // not a number, or out of range
+      valid = false;
+    }
+  }
+  if (!valid) {
+    cerr << "parameter error? : ino = " << ino << std::endl;
+    // a rejected parameter must not be reported as success
+    return -EINVAL;
+  }
+
+  inodeno_t i_ino = parsed;
+  meta_op op(_debug, out);
+  meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+  nsop->sub_op_t = meta_op::OP_SHOW_FN;
+  nsop->sub_ino_t = meta_op::INO_DIR;
+  nsop->ino = i_ino;
+  op.push_op(nsop);
+  return op_process(op);
+}
+/**
+ * Queue a single OP_AMEND_FN sub-operation that reads its input from
+ * @p in, then run it.
+ *
+ * @param in      input file name handed to the meta_op
+ * @param confirm confirmation flag handed to the meta_op
+ * @return result of op_process()
+ */
+int MetaTool::amend_fnode(string& in, bool confirm)
+{
+  meta_op op(_debug, "", in, confirm);
+
+  meta_op::sub_op* request = new meta_op::sub_op(&op);
+  request->sub_op_t = meta_op::OP_AMEND_FN;
+  request->sub_ino_t = meta_op::INO_DIR;
+  // the target objects are named in the input file, so no inode is set
+  request->ino = 0;
+  op.push_op(request);
+
+  return op_process(op);
+}
+/**
+ * Amend the stored metadata of inode @p ino with the content of the
+ * input file @p in.
+ *
+ * @param ino     inode number as text (base auto-detected by stoull);
+ *                "0" and unparsable text are rejected
+ * @param in      input file name; must not be empty
+ * @param confirm confirmation flag handed to the meta_op
+ * @return result of op_process(), or -EINVAL when a parameter is
+ *         rejected
+ */
+int MetaTool::amend_meta_info(string& ino, string& in, bool confirm)
+{
+  unsigned long long parsed = 0;
+  bool valid = (ino != "0" && in != "");
+  if (valid) {
+    try {
+      parsed = std::stoull(ino, nullptr, 0);
+    } catch (const std::exception&) {
+      // not a number, or out of range
+      valid = false;
+    }
+  }
+  if (!valid) {
+    cerr << "parameter error? : ino = " << ino << std::endl;
+    // a rejected parameter must not be reported as success
+    return -EINVAL;
+  }
+
+  inodeno_t i_ino = parsed;
+  meta_op op(_debug, "", in, confirm);
+  meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+  nsop->sub_op_t = meta_op::OP_AMEND;
+  nsop->sub_ino_t = meta_op::INO_DIR;
+  nsop->ino = i_ino;
+  op.push_op(nsop);
+  return op_process(op);
+}
+/**
+ * List the entries of the directory with inode number @p ino.
+ *
+ * @param ino inode number as text (base auto-detected by stoull);
+ *            "0" and unparsable text are rejected
+ * @param out output file name handed to the meta_op
+ * @return result of op_process(), or -EINVAL when @p ino is rejected
+ */
+int MetaTool::list_meta_info(string& ino, string& out)
+{
+  unsigned long long parsed = 0;
+  bool valid = (ino != "0");
+  if (valid) {
+    try {
+      parsed = std::stoull(ino, nullptr, 0);
+    } catch (const std::exception&) {
+      // not a number, or out of range
+      valid = false;
+    }
+  }
+  if (!valid) {
+    cerr << "parameter error? : ino = " << ino << std::endl;
+    // a rejected parameter must not be reported as success
+    return -EINVAL;
+  }
+
+  inodeno_t i_ino = parsed;
+  meta_op op(_debug, out);
+  meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+  nsop->sub_op_t = meta_op::OP_LIST;
+  nsop->sub_ino_t = meta_op::INO_DIR;
+  nsop->ino = i_ino;
+  op.push_op(nsop);
+  return op_process(op);
+}
+/**
+ * Show the stored metadata of inode @p ino.
+ *
+ * @param ino inode number as text (base auto-detected by stoull);
+ *            "0" and unparsable text are rejected
+ * @param out output file name handed to the meta_op
+ * @return result of op_process(), or -EINVAL when @p ino is rejected
+ */
+int MetaTool::show_meta_info(string& ino, string& out)
+{
+  unsigned long long parsed = 0;
+  bool valid = (ino != "0");
+  if (valid) {
+    try {
+      parsed = std::stoull(ino, nullptr, 0);
+    } catch (const std::exception&) {
+      // not a number, or out of range
+      valid = false;
+    }
+  }
+  if (!valid) {
+    cerr << "parameter error? : ino = " << ino << std::endl;
+    // a rejected parameter must not be reported as success
+    return -EINVAL;
+  }
+
+  inodeno_t i_ino = parsed;
+  meta_op op(_debug, out);
+
+  meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+  nsop->sub_op_t = meta_op::OP_SHOW;
+  nsop->sub_ino_t = meta_op::INO_DIR;
+  nsop->ino = i_ino;
+  op.push_op(nsop);
+  return op_process(op);
+}
+
+/**
+ * Run the sub-operations queued on @p op until none are left.
+ *
+ * Each handler is called for the sub-operation on top of the stack
+ * and its result decides what happens next:
+ *  - 0   : the sub-operation is finished and is popped;
+ *  - < 0 : failure, all remaining sub-operations are dropped;
+ *  - > 0 : the stack is left alone; the handlers in this file return
+ *          1 after pushing a prerequisite sub-operation or switching
+ *          the current one to another pool type, so the next round
+ *          does different work.
+ *
+ * @return result of the last handler that ran
+ */
+int MetaTool::op_process(meta_op& op)
+{
+  int r = 0;
+  while (!op.no_sops()) {
+    if (_debug)
+      std::cout << "process : " << op.top_op()->detail() << std::endl;
+    switch(op.top_op()->sub_op_t) {
+    case meta_op::OP_LIST:
+      r = list_meta(op);
+      break;
+    case meta_op::OP_LTRACE:
+      r = file_meta(op);
+      break;
+    case meta_op::OP_SHOW:
+      r = show_meta(op);
+      break;
+    case meta_op::OP_AMEND:
+      r = amend_meta(op);
+      break;
+    case meta_op::OP_SHOW_FN:
+      r = show_fn(op);
+      break;
+    case meta_op::OP_AMEND_FN:
+      r = amend_fn(op);
+      break;
+    default:
+      cerr << "unknow op" << std::endl;
+      // fail explicitly: keeping a stale positive r would neither pop
+      // nor clear the stack and this loop would never terminate
+      r = -EINVAL;
+      break;
+    }
+    if (r == 0)
+      op.pop_op();
+    else if (r < 0)
+      op.clear_sops();
+  }
+  op.release();
+  return r;
+}
+
+/**
+ * Handler for OP_AMEND.
+ *
+ * When both the inode and its omap key are already cached in @p op
+ * the amendment is written via _amend_meta(). When nothing has been
+ * cached at all, an OP_LIST sub-operation looking for the inode is
+ * pushed first.
+ *
+ * @return 0 when done, 1 after pushing a sub-operation, -1 on failure
+ */
+int MetaTool::amend_meta(meta_op &op)
+{
+  meta_op::sub_op* cur = op.top_op();
+  auto inode_it = op.inodes.find(cur->ino);
+  auto key_it = op.okeys.find(cur->ino);
+
+  const bool cached = (inode_it != op.inodes.end()) && (key_it != op.okeys.end());
+  if (cached) {
+    const int rc = _amend_meta(key_it->second, *(inode_it->second), op.infile(), op);
+    return rc < 0 ? -1 : 0;
+  }
+
+  // some inodes were loaded but not the wanted one: give up
+  if (!op.inodes.empty()) {
+    return -1;
+  }
+
+  meta_op::sub_op* lookup = new meta_op::sub_op(&op);
+  lookup->sub_op_t = meta_op::OP_LIST;
+  lookup->sub_ino_t = meta_op::INO_DIR;
+  lookup->trace_level = 0;
+  lookup->ino_c = cur->ino;
+  op.push_op(lookup);
+  return 1;
+}
+
+/**
+ * Append this object to @p bl: first the snapid (_f), then the type
+ * marker (_t), then the inode store through encode_bare().
+ *
+ * NOTE(review): _i is dereferenced without a check, so it must be
+ * non-null when this is called.
+ *
+ * @param bl       buffer to append to
+ * @param features feature bits forwarded to InodeStore::encode_bare()
+ */
+void MetaTool::inode_meta_t::encode(::ceph::bufferlist& bl, uint64_t features)
+{
+    ::encode(_f, bl);
+    ::encode(_t, bl);
+    _i->encode_bare(bl, features);
+}
+/**
+ * Replace the stored dentry value of one inode with the content of a
+ * JSON file.
+ *
+ * The JSON in @p fn is decoded into @p inode_meta, re-encoded and
+ * written under omap key @p k of the parent directory fragment
+ * object. Nothing is written unless the operation was confirmed.
+ *
+ * @param k          omap key of the dentry to overwrite
+ * @param inode_meta inode metadata; overwritten with the JSON content
+ * @param fn         name of the JSON input file
+ * @param op         operation context (confirmation flag, debug flag,
+ *                   cached inodes, current sub-operation)
+ * @return result of omap_set() on success, -1 on any failure or when
+ *         the change was not confirmed
+ */
+int MetaTool::_amend_meta(string& k, inode_meta_t& inode_meta, const string& fn, meta_op& op)
+{
+  JSONParser parser;
+  if (!parser.parse(fn.c_str())) {
+    // name the file that failed to parse
+    cout << "Error parsing JSON input file : " << fn << std::endl;
+    return -1;
+  }
+
+  try {
+    inode_meta.decode_json(&parser);
+  } catch (JSONDecoder::err& e) {
+    cout << "failed to decode JSON input: " << e.what() << std::endl;
+    return -1;
+  }
+
+  if (!op.confirm_chg() || op.is_debug()) {
+    cout << "you will amend info of inode ==>: " << std::endl;
+    _show_meta(inode_meta, "");
+  }
+
+  if (!op.confirm_chg()) {
+    cout << "warning: this operation is irreversibl!!!\n"
+         << "         You must confirm that all logs of mds have been flushed!!!\n"
+         << "         if you want amend it, please add --yes-i-really-really-mean-it!!!"
+         << std::endl;
+    return -1;
+  }
+
+  bufferlist bl;
+  inode_meta.encode(bl, features);
+  map<string, bufferlist> to_set;
+  to_set[k].swap(bl);
+
+  inode_backpointer_t bp;
+  if (!op.top_op()->get_ancestor(bp))
+    return -1;
+
+  // pick the fragment of the parent directory that holds the dentry;
+  // without a cached parent the default fragment is used
+  frag_t frag;
+  auto item = op.inodes.find(bp.dirino);
+  if (item != op.inodes.end()) {
+    InodeStore* parent = item->second->get_meta();
+    if (parent == nullptr) {
+      // get_meta() may return NULL; refuse to write rather than crash
+      cerr << __func__ << " : no inode data for parent (ino:" << bp.dirino << ")" << std::endl;
+      return -1;
+    }
+    frag = parent->pick_dirfrag(bp.dname);
+  }
+  string oid = obj_name(bp.dirino, frag);
+  return io_meta.omap_set(oid, to_set);
+}
+/**
+ * Handler for OP_SHOW_FN.
+ *
+ * Prints the fnode headers through _show_fn() once the inode is
+ * cached in @p op; when nothing is cached yet an OP_LIST
+ * sub-operation looking for the inode is pushed first.
+ *
+ * @return 0 when done, 1 after pushing a sub-operation, -1 on failure
+ */
+int MetaTool::show_fn(meta_op &op)
+{
+  meta_op::sub_op* cur = op.top_op();
+  auto found = op.inodes.find(cur->ino);
+
+  if (found != op.inodes.end()) {
+    return _show_fn(*(found->second), op.outfile()) < 0 ? -1 : 0;
+  }
+
+  // other inodes are cached but not the wanted one: give up
+  if (!op.inodes.empty()) {
+    return -1;
+  }
+
+  meta_op::sub_op* lookup = new meta_op::sub_op(&op);
+  lookup->sub_op_t = meta_op::OP_LIST;
+  lookup->sub_ino_t = meta_op::INO_DIR;
+  lookup->trace_level = 0;
+  lookup->ino_c = cur->ino;
+  op.push_op(lookup);
+  return 1;
+}
+/**
+ * Dump the fnode header of every leaf fragment of a directory as
+ * JSON.
+ *
+ * The output holds one section per fragment object, named after the
+ * object, plus an "oids" string listing the object names separated
+ * by commas. It is written to @p fn, or to stdout when @p fn is
+ * empty or cannot be opened.
+ *
+ * @param inode_meta directory inode whose fragments are dumped
+ * @param fn         output file name, may be empty
+ * @return 0 on success, -1 when a header cannot be read or decoded
+ */
+int MetaTool::_show_fn(inode_meta_t& inode_meta, const string& fn)
+{
+  InodeStore* store = inode_meta.get_meta();
+  if (store == nullptr) {
+    // get_meta() may return NULL; fail instead of dereferencing it
+    std::cerr << __func__ << " : no inode data available" << std::endl;
+    return -1;
+  }
+
+  std::list<frag_t> frags;
+  store->dirfragtree.get_leaves(frags);
+  std::stringstream ds;
+  std::string format = "json";
+  std::string oids;
+  // owned here so that every return path releases the formatter
+  std::unique_ptr<Formatter> f(Formatter::create(format));
+  f->enable_line_break();
+  f->open_object_section("fnodes");
+  for (const auto &frag : frags) {
+    bufferlist hbl;
+    string oid = obj_name(store->inode->ino, frag);
+    int ret = io_meta.omap_get_header(oid, &hbl);
+    if (ret < 0) {
+      std::cerr << __func__ << " : can't find oid("<< oid << ")" << std::endl;
+      return -1;
+    }
+
+    fnode_t got_fnode;
+    try {
+      auto p = hbl.cbegin();
+      ::decode(got_fnode, p);
+    } catch (const buffer::error &err) {
+      cerr << "corrupt fnode header in " << oid
+           << ": " << err.what() << std::endl;
+      return -1;
+    }
+    if (!oids.empty())
+      oids += ",";
+    oids += oid;
+    f->open_object_section(oid.c_str());
+    got_fnode.dump(f.get());
+    f->close_section();
+  }
+  f->dump_string("oids", oids.c_str());
+  f->close_section();
+  f->flush(ds);
+  if (fn != "") {
+    ofstream o;
+    o.open(fn);
+    if (o) {
+      o << ds.str();
+      o.close();
+    } else {
+      cout << "out to file (" << fn << ") failed" << std::endl;
+      cout << ds.str() << std::endl;
+    }
+  } else
+    std::cout << ds.str() << std::endl;
+  return 0;
+}
+/**
+ * Handler for OP_AMEND_FN: hands the input file and the confirmation
+ * flag of @p op to _amend_fn().
+ *
+ * @return 0 on success, -1 when _amend_fn() reports a failure
+ */
+int MetaTool::amend_fn(meta_op &op)
+{
+  const int rc = _amend_fn(op.infile(), op.confirm_chg());
+  return rc < 0 ? -1 : 0;
+}
+/**
+ * Overwrite fnode headers from a JSON file.
+ *
+ * The file must contain an "oids" string (object names separated by
+ * commas) and, for each listed name, a section of that name holding
+ * the fnode to store. Each fnode is encoded and written as the omap
+ * header of the object it is named after.
+ *
+ * @param fn      name of the JSON input file
+ * @param confirm nothing is written unless this is true
+ * @return 0 on success, -1 on parse/decode failure or when not
+ *         confirmed, otherwise the negative result of
+ *         omap_set_header()
+ */
+int MetaTool::_amend_fn(const string& fn, bool confirm)
+{
+  JSONParser parser;
+  if (!parser.parse(fn.c_str())) {
+    cout << "Error parsing JSON input file : " << fn << std::endl;
+    return -1;
+  }
+  if (!confirm) {
+    cout << "warning: this operation is irreversibl!!!\n"
+         << "         You must confirm that all logs of mds have been flushed!!!\n"
+         << "         if you want amend it, please add --yes-i-really-really-mean-it!!!"
+         << std::endl;
+    return -1;
+  }
+  try {
+    string tmp;
+    JSONDecoder::decode_json("oids", tmp, &parser, true);
+
+    // split the comma separated list; a trailing empty piece is dropped
+    vector<string> v;
+    const string c = ",";
+    string::size_type pos1 = 0;
+    string::size_type pos2 = tmp.find(c);
+    while (string::npos != pos2) {
+      v.push_back(tmp.substr(pos1, pos2-pos1));
+      pos1 = pos2 + c.size();
+      pos2 = tmp.find(c, pos1);
+    }
+    if (pos1 != tmp.length())
+      v.push_back(tmp.substr(pos1));
+
+    for (const auto& oid : v) {
+      cout << "amend frag : " << oid << "..." << std::endl;
+      fnode_t fnode;
+      JSONDecoder::decode_json(oid.c_str(), fnode, &parser, true);
+      bufferlist bl;
+      fnode.encode(bl);
+      const int ret = io_meta.omap_set_header(oid, bl);
+      if (ret < 0)
+        return ret;
+    }
+  } catch (JSONDecoder::err& e) {
+    cout << "failed to decode JSON input: " << e.what() << std::endl;
+    return -1;
+  }
+  return 0;
+}
+/**
+ * Handler for OP_SHOW.
+ *
+ * Prints the inode through _show_meta() once it is cached in @p op;
+ * when nothing is cached yet an OP_LIST sub-operation looking for
+ * the inode is pushed first.
+ *
+ * @return 0 when done, 1 after pushing a sub-operation, -1 on failure
+ */
+int MetaTool::show_meta(meta_op &op)
+{
+  meta_op::sub_op* cur = op.top_op();
+  auto found = op.inodes.find(cur->ino);
+
+  if (found != op.inodes.end()) {
+    return _show_meta(*(found->second), op.outfile()) < 0 ? -1 : 0;
+  }
+
+  // other inodes are cached but not the wanted one: give up
+  if (!op.inodes.empty()) {
+    return -1;
+  }
+
+  meta_op::sub_op* lookup = new meta_op::sub_op(&op);
+  lookup->sub_op_t = meta_op::OP_LIST;
+  lookup->sub_ino_t = meta_op::INO_DIR;
+  lookup->trace_level = 0;
+  lookup->ino_c = cur->ino;
+  op.push_op(lookup);
+  return 1;
+}
+/**
+ * Dump one inode as JSON.
+ *
+ * The "meta" object holds "snapid_t", "itype" and a "store" section
+ * with the inode store; a non-empty snap_blob is decoded and added
+ * as "snap_blob". The result is written to @p fn, or to stdout when
+ * @p fn is empty or cannot be opened.
+ *
+ * @param inode_meta inode to dump
+ * @param fn         output file name, may be empty
+ * @return 0 on success, -1 when no inode data is available or the
+ *         snap_blob cannot be decoded
+ */
+int MetaTool::_show_meta(inode_meta_t& inode_meta, const string& fn)
+{
+  InodeStore* store = inode_meta.get_meta();
+  if (store == nullptr) {
+    // get_meta() may return NULL; fail instead of dereferencing it
+    cerr << __func__ << " : no inode data available" << std::endl;
+    return -1;
+  }
+
+  std::stringstream ds;
+  std::string format = "json";
+  // owned here so that every return path releases the formatter
+  std::unique_ptr<Formatter> f(Formatter::create(format));
+  f->enable_line_break();
+  f->open_object_section("meta");
+  f->dump_unsigned("snapid_t", inode_meta.get_snapid());
+  f->dump_unsigned("itype", inode_meta.get_type());
+  f->open_object_section("store");
+  store->dump(f.get());
+  try {
+    if (store->snap_blob.length()) {
+      sr_t srnode;
+      auto p = store->snap_blob.cbegin();
+      decode(srnode, p);
+      f->open_object_section("snap_blob");
+      srnode.dump(f.get());
+      f->close_section();
+    }
+  } catch (const buffer::error &err) {
+    cerr << "corrupt decode in snap_blob"
+         << ": " << err.what() << std::endl;
+    return -1;
+  }
+
+  f->close_section();
+  f->close_section();
+  f->flush(ds);
+
+  if (fn != "") {
+    ofstream o;
+    o.open(fn);
+    if (o) {
+      o << ds.str();
+      o.close();
+    } else {
+      cout << "out to file (" << fn << ") failed" << std::endl;
+      cout << ds.str() << std::endl;
+    }
+
+  } else
+    std::cout << ds.str() << std::endl;
+  return 0;
+}
+/**
+ * Handler for OP_LIST.
+ *
+ * Works in one of two modes, selected by the current sub-operation:
+ *  - ino_c == 0 ("list all"): print every dentry of the directory
+ *    fragment object of `ino`;
+ *  - ino_c != 0 ("info"): search the parent directory of `ino_c` for
+ *    that inode and cache it in @p op (done inside show_child()).
+ *
+ * When required information is missing (no backtrace yet, directory
+ * split into fragments, parent not cached) a further sub-operation
+ * is pushed and 1 is returned so that op_process() runs it first.
+ *
+ * The omap is read in pages of `max_vals` entries; each page resumes
+ * after the last key of the previous one.
+ *
+ * @return 0 when finished, 1 after pushing sub-operation(s), -1 on
+ *         failure
+ */
+int MetaTool::list_meta(meta_op &op)
+{
+  meta_op::sub_op* sop = op.top_op();
+
+  bool list_all = false;
+  string oid;
+  inodeno_t ino = sop->ino_c;
+  frag_t frag = sop->frag;
+
+  if (sop->ino_c == 0) {
+    list_all = true;
+    oid = obj_name(sop->ino, frag);
+  } else {
+    if (_debug)
+      std::cout << __func__ << " : " << sop->trace_level << " " << op.ancestors.size() << std::endl;
+    inode_backpointer_t bp;
+    if (sop->get_c_ancestor(bp)) {
+      auto item = op.inodes.find(bp.dirino);
+      if (item != op.inodes.end()) {
+        InodeStore* parent = item->second->get_meta();
+        if (parent == nullptr) {
+          // get_meta() may return NULL; fail instead of dereferencing it
+          cerr << __func__ << " : no inode data for parent (ino:" << bp.dirino << ")" << std::endl;
+          return -1;
+        }
+        frag = parent->pick_dirfrag(bp.dname);
+      }
+      oid = obj_name(bp.dirino, frag);
+    } else {
+      // no backtrace known yet: load it first
+      meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+      nsop->ino = sop->ino_c;
+      nsop->sub_op_t = meta_op::OP_LTRACE;
+      nsop->sub_ino_t = meta_op::INO_DIR;
+      op.push_op(nsop);
+      return 1;
+    }
+  }
+  if (_debug)
+    std::cout << __func__ << " : " << string(list_all?"listall ":"info ") << oid << " "<< ino << std::endl;
+  bufferlist hbl;
+  int ret = io_meta.omap_get_header(oid, &hbl);
+  if (ret < 0) {
+    std::cerr << __func__ << " : can't find it, maybe it (ino:"<< sop->ino<< ")isn't a normal dir!" << std::endl;
+    return -1;
+  }
+
+  if (hbl.length() == 0) {   // empty header: the directory object has been split
+    if (list_all) {
+      if (frag == frag_t()) {
+        auto item = op.inodes.find(sop->ino);
+        if (item != op.inodes.end()) {
+          InodeStore* dir_inode = item->second->get_meta();
+          if (dir_inode == nullptr) {
+            cerr << __func__ << " : no inode data for (ino:" << sop->ino << ")" << std::endl;
+            return -1;
+          }
+          // replace this sub-operation by one listing per leaf fragment
+          inodeno_t tmp = sop->ino;
+          op.pop_op();
+          std::list<frag_t> frags;
+          dir_inode->dirfragtree.get_leaves(frags);
+          for (const auto &leaf : frags) {
+            meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+            nsop->ino = tmp;
+            nsop->sub_op_t = meta_op::OP_LIST;
+            nsop->sub_ino_t = meta_op::INO_DIR;
+            nsop->frag = leaf;
+            op.push_op(nsop);
+          }
+        } else {
+          // the directory inode itself must be loaded first
+          meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+          nsop->ino_c = sop->ino;
+          nsop->sub_op_t = meta_op::OP_LIST;
+          nsop->sub_ino_t = meta_op::INO_DIR;
+          op.push_op(nsop);
+        }
+        return 1;
+      } else {
+        cerr << __func__ << " missing some data (" << oid << ")???" << std::endl;
+        return -1;
+      }
+    } else {
+      if (frag == frag_t()) {
+        inode_backpointer_t bp;
+        if (sop->get_c_ancestor(bp)) {
+          // load the parent directory inode one level further up
+          meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+          nsop->ino_c = bp.dirino;
+          nsop->sub_op_t = meta_op::OP_LIST;
+          nsop->sub_ino_t = meta_op::INO_DIR;
+          nsop->trace_level = sop->trace_level + 1;
+          op.push_op(nsop);
+          return 1;
+        } else {
+          cerr << __func__ << "can't find obj(" << oid << ") ,miss ancestors or miss some objs??? " << std::endl;
+          return -1;
+        }
+      } else {
+        cerr << __func__ << "missing some objs(" << oid << ")??? " << std::endl;
+        return -1;
+      }
+    }
+  }
+
+  fnode_t got_fnode;
+  try {
+    auto p = hbl.cbegin();
+    ::decode(got_fnode, p);
+  } catch (const buffer::error &err) {
+    cerr << "corrupt fnode header in " << oid
+         << ": " << err.what() << std::endl;
+    return -1;
+  }
+
+  if (_debug) {
+    std::string format = "json";
+    // owned here so the formatter is released when the block ends
+    std::unique_ptr<Formatter> f(Formatter::create(format));
+    f->enable_line_break();
+    f->dump_string("type", "--fnode--");
+    f->open_object_section("fnode");
+    got_fnode.dump(f.get());
+    f->close_section();
+    f->flush(std::cout);
+    std::cout << std::endl;
+  }
+
+  // print children, one omap page at a time
+  std::map<string, bufferlist> out_vals;
+  const int max_vals = 5;
+  std::string start_after;   // omap key after which the next page starts
+
+  bool force_dirty = false;
+  const set<snapid_t> *snaps = NULL;
+  do {
+    out_vals.clear();
+    ret = io_meta.omap_get_vals(oid, start_after, max_vals, &out_vals);
+    if (ret < 0) {
+      cerr << __func__ << " : can't read entries of " << oid
+           << ": " << cpp_strerror(ret) << std::endl;
+      return -1;
+    }
+
+    unsigned pos = out_vals.size() - 1;
+    for (map<string, bufferlist>::iterator p = out_vals.begin();
+         p != out_vals.end();
+         ++p, --pos) {
+      string dname;
+      snapid_t last;
+      dentry_key_t::decode_helper(p->first, dname, last);
+      try {
+        if (!list_all) {
+          if (show_child(p->first, dname, last, p->second, pos, snaps,
+                         &force_dirty, ino, &op) == 1) {
+            return 0;
+          }
+        } else {
+          cout << "dname : " << dname << " " << last << std::endl;
+          if (show_child(p->first, dname, last, p->second, pos, snaps,
+                         &force_dirty) == 1)
+            return 0;
+        }
+      } catch (const buffer::error &err) {
+        derr << "Corrupt dentry '" << dname << "' : "
+             << err.what() << "(" << "" << ")" << dendl;
+        return -1;
+      }
+    }
+
+    // resume after the last key of this page so no entry is re-read
+    if (!out_vals.empty())
+      start_after = out_vals.rbegin()->first;
+  } while (out_vals.size() == (size_t)max_vals);
+
+  if (!list_all) {
+    cerr << __func__ << "miss obj(ino:" << ino << ")??? " << std::endl;
+    return -1;
+  }
+  return 0;
+}
+
+/**
+ * Handler for OP_LTRACE: load the backtrace of the inode named by
+ * the current sub-operation.
+ *
+ * With INO_DIR the object is looked up in the metadata pool; with
+ * INO_F every data pool is tried in turn until one yields data. A
+ * failed INO_DIR attempt switches the sub-operation to INO_F and
+ * returns 1 so that it is run again.
+ *
+ * @return 0 when an ancestor was recorded, 1 to retry with the data
+ *         pools, -1 on failure
+ */
+int MetaTool::file_meta(meta_op &op)
+{
+  meta_op::sub_op* top = op.top_op();
+  int found = 0;
+
+  if (top->sub_ino_t == meta_op::INO_DIR) {
+    found = _file_meta(op, io_meta);
+  } else if (top->sub_ino_t == meta_op::INO_F) {
+    for (auto& io : io_data_v) {
+      found = _file_meta(op, *io);
+      if (found == 1)
+        break;
+    }
+  }
+
+  if (found == 1) {
+    inode_backpointer_t bp;
+    if (top->get_ancestor(bp))
+      return 0;
+    std::cerr << "no trace for obj (ino:" << top->ino <<")??" << std::endl;
+    return -1;
+  }
+
+  if (top->sub_ino_t == meta_op::INO_DIR) {
+    // nothing in the metadata pool: retry against the data pools
+    std::cerr << "\tmaybe it's a file(ino:" << top->ino << ")" << std::endl;
+    top->sub_ino_t = meta_op::INO_F;
+    return 1;
+  }
+
+  std::cerr << "can't get (ino:" << top->ino <<")trace??" << std::endl;
+  return -1;
+}
+
+/**
+ * Read the "parent" (backtrace) and "layout" xattrs of the first
+ * object of the inode named by the current sub-operation.
+ *
+ * The first ancestor of a decoded backtrace is stored in
+ * op.ancestors. The decoded data is formatted as JSON and printed
+ * only in debug mode.
+ *
+ * @param op operation context
+ * @param io pool to read the object from
+ * @return 1 when a backtrace was decoded, 0 when the xattrs could not
+ *         be read, -1 when "parent" is missing or an xattr cannot be
+ *         decoded
+ */
+int MetaTool::_file_meta(meta_op &op, librados::IoCtx& io)
+{
+  inodeno_t ino = op.top_op()->ino;
+  std::string oid = obj_name(ino);
+  std::map<std::string, bufferlist> attrset;
+  int r = 0;
+  bool have_data = false;
+  r = io.getxattrs (oid.c_str(), attrset);
+  if (0 == r) {
+    std::stringstream ds;
+    std::string format = "json";
+    // owned here so that every return path releases the formatter
+    std::unique_ptr<Formatter> f(Formatter::create(format));
+    auto item = attrset.find("parent");
+    if (item != attrset.end()) {
+      inode_backtrace_t i_bt;
+      try {
+        bufferlist::const_iterator q = item->second.cbegin();
+        i_bt.decode(q);
+        f->open_array_section("info");
+        have_data = true;
+        if (i_bt.ancestors.size() > 0)
+          op.ancestors[ino] = i_bt.ancestors[0];
+        f->dump_string("type", "--i_bt--");
+        f->open_object_section("parent");
+        i_bt.dump(f.get());
+        f->close_section();
+      } catch (buffer::error &e) {
+        cerr << "failed to decode parent of " << oid << std::endl;
+        return -1;
+      }
+    } else {
+      cerr << oid << " in " << io.get_pool_name()  << " , but no parent" << std::endl;
+      return -1;
+    }
+
+    item = attrset.find("layout");
+    if (item != attrset.end()) {
+      file_layout_t layout;
+      try {
+        auto q = item->second.cbegin();
+        layout.decode(q);
+        f->dump_string("type", "--layout--");
+        f->open_object_section("layout");
+        layout.dump(f.get());
+        f->close_section();
+
+      } catch (buffer::error &e) {
+        cerr << "failed to decode layout of " << oid << std::endl;
+        return -1;
+      }
+    } else {
+      // a missing layout is only reported, it is not an error
+      cerr << oid << " in " << io.get_pool_name()  << " , but no layout" << std::endl;
+    }
+    if (have_data) {
+      f->close_section();
+      f->flush(ds);
+      if (_debug)
+        cout << ino << " : "<< ds.str() << std::endl;
+      return 1;
+    }
+  }
+  return 0;
+}
+/**
+ * Build an object name of the form "<ino hex>.<offset as 8 hex
+ * digits><suffix>".
+ *
+ * @param ino    inode number
+ * @param offset number printed after the dot
+ * @param suffix optional text appended at the end; NULL means none
+ */
+std::string MetaTool::obj_name(inodeno_t ino, uint64_t offset, const char *suffix) const
+{
+  const char* tail = suffix ? suffix : "";
+  char buf[60];
+  snprintf(buf, sizeof(buf), "%llx.%08llx%s",
+           static_cast<unsigned long long>(ino),
+           static_cast<unsigned long long>(offset),
+           tail);
+  return std::string(buf);
+}
+/**
+ * Build an object name of the form "<ino hex>.<frag as 8 hex
+ * digits><suffix>".
+ *
+ * @param ino    inode number
+ * @param fg     directory fragment, printed as its integer value
+ * @param suffix optional text appended at the end; NULL means none
+ */
+std::string MetaTool::obj_name(inodeno_t ino, frag_t fg, const char *suffix) const
+{
+  const char* tail = suffix ? suffix : "";
+  char buf[60];
+  snprintf(buf, sizeof(buf), "%llx.%08llx%s",
+           static_cast<unsigned long long>(ino),
+           static_cast<unsigned long long>(fg),
+           tail);
+  return std::string(buf);
+}
+
+/**
+ * Build a lower-cased object name of the form
+ * "<ino>.<offset as 8 hex digits><suffix>" from an inode number that
+ * is already text.
+ *
+ * @param ino    inode number as text; copied verbatim before the dot
+ * @param offset number printed after the dot
+ * @param suffix optional text appended at the end; NULL means none
+ */
+std::string MetaTool::obj_name(const char* ino, uint64_t offset, const char *suffix) const
+{
+  char name[60];
+  snprintf(name, sizeof(name), "%s.%08llx%s", ino, (long long unsigned)offset, suffix ? suffix : "");
+  std::string out = name;
+  // tolower() is only defined for values representable as unsigned
+  // char (or EOF), so convert before calling it
+  transform(out.begin(), out.end(), out.begin(),
+            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+  return out;
+}
+
+/**
+ * Decode one dentry value of a directory object and either print it
+ * or, when it is the inode being searched for, cache it in @p op.
+ *
+ * Decode errors are not caught here; buffer::error propagates to the
+ * caller.
+ *
+ * @param key         omap key of the dentry
+ * @param dname       dentry name decoded from the key
+ * @param last        last snapid decoded from the key
+ * @param bl          omap value of the dentry
+ * @param pos         position counter, used for debug output only
+ * @param snaps       when non-NULL, only dentries whose @p last is
+ *                    CEPH_NOSNAP are accepted
+ * @param force_dirty not used by this function
+ * @param sp_ino      inode searched for; values > 0 enable searching
+ * @param op          operation that receives a found inode, may be
+ *                    NULL
+ * @return 1 when @p sp_ino was found, 0 otherwise, -1 when a snapshot
+ *         dentry is rejected
+ */
+int MetaTool::show_child(std::string_view key,
+                         std::string_view dname,
+                         const snapid_t last,
+                         bufferlist &bl,
+                         const int pos,
+                         const std::set<snapid_t> *snaps,
+                         bool *force_dirty,
+                         inodeno_t sp_ino,
+                         meta_op* op)
+{
+  bufferlist::const_iterator q = bl.cbegin();
+
+  snapid_t first;
+  ::decode(first, q);
+
+  // marker
+  char type;
+  ::decode(type, q);
+
+  if (_debug)
+    std::cout << pos << " type '" << type << "' dname '" << dname
+              << " [" << first << "," << last << "]"
+              << std::endl;
+  // bool stale = false;
+  if (snaps && last != CEPH_NOSNAP) {
+    derr << "!!!! erro !!!!" << dendl;
+    return -1;
+  }
+
+  // CDentry *dn = NULL;
+  // look for existing dentry for _last_ snap, can't process snap of obj
+  //if *(stale)
+  //    dn = lookup_exact_snap(dname, last);
+  //else
+  //    dn = lookup(dname, last);
+  if (type == 'L' || type == 'l') {
+    // hard link
+    inodeno_t ino;
+    unsigned char d_type;
+    mempool::mds_co::string alternate_name;
+
+    CDentry::decode_remote(type, ino, d_type, alternate_name, q);
+
+    if (sp_ino > 0) {
+      if (sp_ino == ino) {
+        std::cout << "find hard link : " << ino << "," << d_type << std::endl;
+        return 1;
+      }
+    }
+
+    std::cout << "hard link : " << ino << "," << d_type << std::endl;
+  } else if (type == 'I' || type == 'i') {
+    // inode
+    // held by a unique_ptr so it is freed on every path, including
+    // when one of the decode calls below throws
+    auto inode_data = std::make_unique<InodeStore>();
+    if (type == 'i') {
+      mempool::mds_co::string alternate_name;
+
+      DECODE_START(2, q);
+      if (struct_v >= 2)
+        decode(alternate_name, q);
+      inode_data->decode(q);
+      DECODE_FINISH(q);
+    } else {
+      inode_data->decode_bare(q);
+    }
+
+    std::stringstream ds;
+    std::string format = "json";
+    std::unique_ptr<Formatter> f(Formatter::create(format));
+    f->enable_line_break();
+    f->open_object_section("meta");
+    f->dump_unsigned("snapid_t", first);
+    f->dump_unsigned("itype", type);
+    f->open_object_section("store");
+    inode_data->dump(f.get());
+    try {
+      if (inode_data->snap_blob.length()) {
+        sr_t srnode;
+        auto p = inode_data->snap_blob.cbegin();
+        srnode.decode(p);
+        f->open_object_section("snap_blob");
+        srnode.dump(f.get());
+        f->close_section();
+      }
+    } catch (const buffer::error &err) {
+      // a broken snap_blob is reported but does not abort the dump
+      cerr << "corrupt decode in snap_blob"
+           << ": " << err.what() << std::endl;
+    }
+    f->close_section();
+    f->close_section();
+    f->flush(ds);
+
+    if (sp_ino > 0 && op != NULL && sp_ino == inode_data->inode->ino) {
+      // found the inode we were looking for: hand it over to the op
+      const inodeno_t found = inode_data->inode->ino;
+      inode_meta_t* tmp = new inode_meta_t(first, type, inode_data.release());
+      op->inodes[found] = tmp;
+      // copy the whole view; data() alone is not guaranteed to be
+      // NUL-terminated
+      op->okeys[found] = std::string(key);
+      return 1;
+    }
+
+    if (sp_ino == 0) {
+      cout << ds.str() << std::endl;
+    }
+  } else {
+    std::cerr << __func__ << "unknow type : " << dname << "," << type << std::endl;
+  }
+  return 0;
+}
diff --git a/src/tools/cephfs/MetaTool.h b/src/tools/cephfs/MetaTool.h
new file mode 100644
index 000000000..d36f7bba2
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef METATOOL_H__
+#define METATOOL_H__
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+#include <stack>
+using std::stack;
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_json.h"
+
+using ::ceph::bufferlist;
+// MetaTool drives the operations enumerated in meta_op::op_type
+// (list dir, load trace, show/amend info, show/amend fnode) on top of
+// the MDSUtility base class.
+class MetaTool : public MDSUtility
+{
+public:
+  // Bundles a snapid, a one-character type tag and an InodeStore pointer.
+  // NOTE(review): there is no destructor here, so _i is not freed by this
+  // class; presumably meta_op::release() takes care of it -- confirm.
+  class inode_meta_t {
+  public:
+    inode_meta_t(snapid_t f = CEPH_NOSNAP, char t = char(255), InodeStore* i = NULL):
+        _f(f),_t(t),_i(i) {
+    };
+    snapid_t get_snapid() const { 
+      return _f;
+    }
+    // Only records tagged 'I' expose their InodeStore; any other tag
+    // yields NULL.
+    InodeStore* get_meta() const {
+      if (_t == 'I')
+        return _i;
+      else
+        return NULL;
+    }
+    // Returns the type tag widened to int.
+    int get_type() const {
+      return _t;
+    }
+    void decode_json(JSONObj *obj);
+    void encode(::ceph::bufferlist& bl, uint64_t features);
+  private:
+    snapid_t _f;      // snapid passed at construction
+    char _t;          // type tag passed at construction
+    InodeStore* _i;   // inode payload, may be NULL
+  };
+private:
+  // State of one top-level operation: its options (debug, output file,
+  // input file, confirm flag), the lookup maps filled while it runs, and
+  // a stack of pending sub_op steps.
+  class meta_op {
+  public:
+    meta_op(bool debug = false, std::string out = "", std::string in = "", bool confirm = false):
+        _debug(debug),
+        _out(out),
+        _in(in),
+        _confirm(confirm)
+      {}
+    void release();
+    // Kind of work a sub_op performs; see op_type_name() for the
+    // human-readable labels.
+    typedef enum {
+      OP_LIST = 0,
+      OP_LTRACE,
+      OP_SHOW,
+      OP_AMEND,
+      OP_SHOW_FN,
+      OP_AMEND_FN,
+      OP_NO
+    } op_type;
+
+    // Whether a sub_op targets a directory or a file inode.
+    typedef enum {
+      INO_DIR = 0,
+      INO_F
+    } ino_type;
+
+    // Maps an op_type to the label used in debug output.
+    static std::string op_type_name(op_type& t) {
+      std::string name;
+      switch (t) {
+      case OP_LIST:
+        name = "list dir";
+        break;
+      case OP_LTRACE:
+        name = "load trace";
+        break;
+      case OP_SHOW:
+        name = "show info";
+        break;
+      case OP_AMEND:
+        name = "amend info";
+        break;
+      case OP_SHOW_FN:
+        name = "show fnode";
+        break;
+      case OP_AMEND_FN:
+        name = "amend fnode";
+        break;
+      case OP_NO:
+        name = "noop";
+        break;
+      default:
+        name = "unknow op type";
+      }
+      return name;
+    }
+    // Maps an ino_type to the label used in debug output.
+    static std::string ino_type_name(ino_type& t) {
+      std::string name;
+      switch (t) {
+      case INO_DIR:
+        name = "dir";
+        break;
+      case INO_F:
+        name = "file";
+        break;
+      default:
+        name = "unknow file type";
+      }
+      return name;
+    }
+    // One step queued on the owning meta_op's stack.  The constructor
+    // only sets trace_level, _proc and _mop; sub_op_t and sub_ino_t must
+    // be assigned by the creator before detail()/print() is used.
+    class sub_op {
+    public:
+      sub_op(meta_op* mop):
+          trace_level(0),
+          _proc(false),
+          _mop(mop)
+        {}
+      void print() {
+        std::cout << detail() << std::endl;
+      }
+      // Renders all fields as a single '|'-separated line.
+      std::string detail() {
+        std::stringstream ds;
+        ds << " [sub_op]" << op_type_name(sub_op_t) << "|"
+           << ino_type_name(sub_ino_t) << "|"
+           << ino << "|"
+           << frag << "|"
+           << ino_c << "|"
+           << trace_level << "|"
+           << name;
+        return ds.str();
+      }
+      // Copies the backpointer recorded for ino_c in the owning meta_op's
+      // ancestors map into bp.  Returns false when there is no owning op,
+      // ino_c is zero, or no entry exists.
+      bool get_c_ancestor(inode_backpointer_t& bp) {
+        if (!_mop || !ino_c)
+          return false;
+        auto item = _mop->ancestors.find(ino_c);
+        if (item != _mop->ancestors.end()) {
+          bp = item->second;
+          return true;
+        } else
+          return false;
+      }
+      // Same lookup as get_c_ancestor(), keyed by ino instead of ino_c.
+      bool get_ancestor(inode_backpointer_t& bp) {
+        if (!_mop || !ino)
+          return false;
+        auto item = _mop->ancestors.find(ino);
+        if (item != _mop->ancestors.end()) {
+          bp = item->second;
+          return true;
+        } else
+          return false;
+      }
+      op_type sub_op_t;
+      ino_type sub_ino_t;
+      inodeno_t ino;
+      frag_t frag;
+      inodeno_t ino_c;
+      unsigned trace_level;
+      std::string name;
+      bool _proc;
+      meta_op* _mop;   // owning operation, not owned by the sub_op
+    };
+      
+    // Lookup tables keyed by inode number.  show_child() stores a freshly
+    // allocated inode_meta_t in `inodes` and the matching key in `okeys`
+    // when it finds the requested inode.
+    std::map<inodeno_t, inode_backpointer_t > ancestors;
+    std::map<inodeno_t, inode_meta_t* > inodes;
+    std::map<inodeno_t, std::string > okeys;
+      
+    // Pops (and thereby deletes) every pending sub_op.
+    void clear_sops() {
+      while(!no_sops())
+        pop_op();
+    }
+    bool no_sops() {
+      return sub_ops.empty();
+    }
+    // Pushes sop; pop_op() later deletes it, so the stack takes ownership.
+    void push_op(sub_op* sop) {
+      if (_debug)
+        std::cout << "<<====" << sop->detail() << std::endl;
+      sub_ops.push(sop);
+    }
+    // Callers must check no_sops() first: top() on an empty stack is
+    // undefined.
+    sub_op* top_op() {
+      return sub_ops.top();
+    }
+    // Deletes the top sub_op and removes it from the stack.  The stack
+    // must not be empty.
+    void pop_op() {
+      sub_op* sop = sub_ops.top();
+      if (_debug)
+        std::cout << "====>>" << sop->detail() << std::endl;;
+      delete sop;
+      sub_ops.pop();
+    }
+    std::string outfile() {
+      return _out;
+    }
+    std::string infile() {
+      return _in;
+    }
+    bool is_debug() {
+      return _debug;
+    }
+    bool confirm_chg() {
+      return _confirm;
+    }
+  private:
+    stack<sub_op*> sub_ops;   // pending steps, owned (deleted in pop_op)
+    bool _debug;
+    std::string _out;
+    std::string _in;
+    bool _confirm;
+  };
+  MDSRoleSelector role_selector;
+  // NOTE(review): rank and features are not set by the constructor below;
+  // presumably main() assigns them before use -- confirm in MetaTool.cc.
+  mds_rank_t rank;
+    
+  // I/O handles
+  librados::Rados rados;
+  librados::IoCtx io_meta;
+  std::vector<librados::IoCtx*> io_data_v;
+  librados::IoCtx output;
+  bool _debug;
+  uint64_t features;
+
+  // Object-name builders; the three overloads differ in how the inode is
+  // given (inodeno_t + frag, inodeno_t + offset, or a C string + offset).
+  std::string obj_name(inodeno_t ino, frag_t fg = frag_t(), const char *suffix = NULL) const;
+  std::string obj_name(inodeno_t ino, uint64_t offset, const char *suffix = NULL) const;
+  std::string obj_name(const char* ino, uint64_t offset, const char *suffix = NULL) const;
+
+  // Decodes one entry from bl.
+  // Returns 0 when the caller should keep searching, and 1 once the inode
+  // requested through sp_ino has been found and recorded in *op.
+  int show_child(std::string_view key,
+                 std::string_view dname,
+                 const snapid_t last,
+                 bufferlist &bl,
+                 const int pos,
+                 const std::set<snapid_t> *snaps,
+                 bool *force_dirty,
+                 inodeno_t sp_ino = 0,
+                 meta_op* op = NULL
+                 );
+
+  int process(std::string& mode, std::string& ino, std::string out, std::string in, bool confirm);
+  int show_meta_info(std::string& ino, std::string& out);
+  int list_meta_info(std::string& ino, std::string& out);
+  int amend_meta_info(std::string& ino, std::string& in, bool confirm);
+  int show_fnode(std::string& ino, std::string& out);
+  int amend_fnode(std::string& in, bool confirm);
+  int op_process(meta_op &op);
+  int list_meta(meta_op &op);
+  int file_meta(meta_op &op);
+  int show_meta(meta_op &op);
+  int amend_meta(meta_op &op);
+  int show_fn(meta_op &op);
+  int amend_fn(meta_op &op);
+  public:
+  int _file_meta(meta_op &op, librados::IoCtx& io);
+  int _show_meta(inode_meta_t& i, const std::string& fn);
+  int _amend_meta(std::string &k, inode_meta_t& i, const std::string& fn, meta_op& op);
+  int _show_fn(inode_meta_t& i, const std::string& fn);
+  int _amend_fn(const std::string& fn, bool confirm);
+  void usage();
+  MetaTool(bool debug=false):
+      _debug(debug) {}
+  ~MetaTool() {}
+
+  // Tool entry point; takes the mode, rank string, inode and the
+  // output/input file names parsed from the command line.
+  int main(std::string& mode,
+           std::string& rank_str,
+           std::string& minfo,
+           std::string&ino,
+           std::string& out,
+           std::string& in,
+           bool confirm = false
+           );
+};
+#endif // METATOOL_H__
diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc
new file mode 100644
index 000000000..2abca7223
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.cc
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include "PgFiles.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "pgeffects." << __func__ << ": "
+
+/**
+ * Create the libcephfs handle from the global context and initialise it.
+ *
+ * @return the error from ceph_create_with_context() if creation failed,
+ *         otherwise whatever ceph_init() returns.
+ */
+int PgFiles::init()
+{
+  const int create_rc = ceph_create_with_context(&cmount, g_ceph_context);
+  return create_rc == 0 ? ceph_init(cmount) : create_rc;
+}
+
+/**
+ * Remember the objecter and the PGs of interest, and derive the set of
+ * pools those PGs belong to so hit_file() can cheaply skip files whose
+ * layout points at any other pool.
+ */
+PgFiles::PgFiles(Objecter *o, const std::set<pg_t> &pgs_)
+  : objecter(o),
+    pgs(pgs_)
+{
+  for (auto pg_it = pgs.begin(); pg_it != pgs.end(); ++pg_it) {
+    pools.insert(pg_it->m_pool);
+  }
+}
+
+/**
+ * Release the libcephfs handle, if one was ever created.
+ */
+PgFiles::~PgFiles()
+{
+  // cmount is still null when init() was never called; ceph_release()
+  // expects a valid handle, so only call it when we actually have one.
+  if (cmount != nullptr) {
+    ceph_release(cmount);
+  }
+}
+
+/**
+ * Recursively walk the directory at `path`: regular files are handed to
+ * hit_file(), sub-directories are descended into, and every other file
+ * type is skipped.
+ *
+ * Errors are logged and are never fatal for the overall scan: a
+ * directory that cannot be opened or read is abandoned, an entry that
+ * cannot be stat'ed is skipped.
+ *
+ * NOTE(review): ceph_statx() is called with flags == 0, which presumably
+ * follows symlinks; a symlink pointing at an ancestor directory would
+ * then recurse without bound -- confirm and consider AT_SYMLINK_NOFOLLOW.
+ */
+void PgFiles::hit_dir(std::string const &path)
+{
+  dout(10) << "entering " << path << dendl;
+
+  ceph_dir_result *dr = nullptr;
+  int r = ceph_opendir(cmount, path.c_str(), &dr);
+  if (r != 0) {
+    // Name the directory: during a recursive scan the bare errno does
+    // not tell the operator which path failed.
+    derr << "Failed to open path " << path << ": " << cpp_strerror(r)
+         << dendl;
+    return;
+  }
+
+  struct dirent de;
+  // The loop ends when ceph_readdir_r() returns 0; negative values are
+  // errors.
+  while ((r = ceph_readdir_r(cmount, dr, &de)) != 0) {
+    if (r < 0) {
+      derr << "Error reading path " << path << ": " << cpp_strerror(r)
+           << dendl;
+      ceph_closedir(cmount, dr); // best effort, ignore r
+      return;
+    }
+
+    const std::string name(de.d_name);
+    if (name == "." || name == "..") {
+      continue;
+    }
+
+    struct ceph_statx stx;
+    const std::string de_path = path + "/" + name;
+    r = ceph_statx(cmount, de_path.c_str(), &stx,
+                   CEPH_STATX_INO|CEPH_STATX_SIZE, 0);
+    if (r != 0) {
+      derr << "Failed to stat path " << de_path << ": "
+            << cpp_strerror(r) << dendl;
+      // Don't hold up the whole process for one bad inode
+      continue;
+    }
+
+    if (S_ISREG(stx.stx_mode)) {
+      hit_file(de_path, stx);
+    } else if (S_ISDIR(stx.stx_mode)) {
+      hit_dir(de_path);
+    } else {
+      dout(20) << "Skipping non reg/dir file: " << de_path << dendl;
+    }
+  }
+
+  r = ceph_closedir(cmount, dr);
+  if (r != 0) {
+    derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl;
+    return;
+  }
+}
+
+/**
+ * Check whether any object of the regular file at `path` maps to one of
+ * the target PGs; if so, print the path on stdout (once) and stop.
+ *
+ * @param path  path of the file inside the mounted filesystem
+ * @param stx   statx result for the file; stx_ino and stx_size are used
+ */
+void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx)
+{
+  ceph_assert(S_ISREG(stx.stx_mode));
+
+  dout(20) << "Hitting file '" << path << "'" << dendl;
+
+  // libcephfs hands the layout back as four plain ints.
+  int su = 0;
+  int sc = 0;
+  int osize = 0;
+  int pool = 0;
+  const int layout_rc = ceph_get_path_layout(cmount, path.c_str(),
+                                             &su, &sc, &osize, &pool);
+  if (layout_rc != 0) {
+    derr << "Error reading layout on " << path << ": "
+         << cpp_strerror(layout_rc) << dendl;
+    return;
+  }
+
+  struct file_layout_t layout;
+  layout.stripe_unit = su;
+  layout.stripe_count = sc;
+  layout.object_size = osize;
+  layout.pool_id = pool;
+
+  // Avoid calculating PG if the layout targeted a completely different pool
+  if (pools.find(layout.pool_id) == pools.end()) {
+    dout(20) << "Fast check missed: pool " << layout.pool_id << " not in "
+                "target set" << dendl;
+    return;
+  }
+
+  const auto object_count = Striper::get_num_objects(layout, stx.stx_size);
+
+  for (uint64_t objno = 0; objno < object_count; ++objno) {
+    // Object names have the form <ino hex>.<object number, 8 hex digits>
+    char obj_name[32];
+    snprintf(obj_name, sizeof(obj_name), "%llx.%08llx",
+             (long long unsigned)stx.stx_ino,
+             (long long unsigned int)objno);
+    dout(20) << "  object " << std::string(obj_name) << dendl;
+
+    pg_t target;
+    object_t oid;
+    object_locator_t loc;
+    loc.pool = layout.pool_id;
+    loc.key = std::string(obj_name);
+
+    unsigned pg_num_mask = 0;
+    unsigned pg_num = 0;
+    int map_rc = 0;
+
+    objecter->with_osdmap([&](const OSDMap &osd_map) {
+      map_rc = osd_map.object_locator_to_pg(oid, loc, target);
+      if (map_rc != 0) {
+        return;
+      }
+      auto pg_pool = osd_map.get_pg_pool(loc.pool);
+      pg_num_mask = pg_pool->get_pg_num_mask();
+      pg_num = pg_pool->get_pg_num();
+    });
+    if (map_rc != 0) {
+      // Can happen if layout pointed to pool not in osdmap, for example
+      continue;
+    }
+
+    // Fold the raw placement seed into the pool's actual PG count
+    target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask);
+
+    dout(20) << "  target " << target << dendl;
+
+    if (pgs.count(target) != 0) {
+      // One hit is enough to report the file
+      std::cout << path << std::endl;
+      return;
+    }
+  }
+}
+
+/**
+ * Mount the filesystem root, walk everything below `path` with
+ * hit_dir(), then unmount again.
+ *
+ * @return the mount error if mounting failed, otherwise the result of
+ *         ceph_unmount().  Failures inside the walk itself are only
+ *         logged by hit_dir().
+ */
+int PgFiles::scan_path(std::string const &path)
+{
+  const int mount_rc = ceph_mount(cmount, "/");
+  if (mount_rc != 0) {
+    derr << "Failed to mount: " << cpp_strerror(mount_rc) << dendl;
+    return mount_rc;
+  }
+
+  hit_dir(path);
+
+  const int unmount_rc = ceph_unmount(cmount);
+  if (unmount_rc != 0) {
+    derr << "Failed to unmount: " << cpp_strerror(unmount_rc) << dendl;
+  }
+  return unmount_rc;
+}
+
diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h
new file mode 100644
index 000000000..1ba4b3d28
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef PG_EFFECTS_H_
+#define PG_EFFECTS_H_
+
+#include "include/cephfs/libcephfs.h"
+#include "osd/osd_types.h"
+#include <set>
+#include "osdc/Objecter.h"
+
+/**
+ * This utility scans the files (via an online MDS) and works out
+ * which ones rely on named PGs.  For use when someone has
+ * some bad/damaged PGs and wants to see which files might be
+ * affected.
+ */
+class PgFiles
+{
+private:
+  // Used to consult the OSDMap when mapping objects to PGs; not owned.
+  Objecter *objecter;
+  // libcephfs handle created by init() and released by the destructor.
+  struct ceph_mount_info *cmount = nullptr;
+
+  // PGs the caller is interested in
+  std::set<pg_t> pgs;
+  // Pools those PGs belong to (derived from pgs in the constructor)
+  std::set<uint64_t> pools;
+
+  // Print `path` if any of the file's objects maps to one of `pgs`.
+  void hit_file(std::string const &path, const struct ceph_statx &stx);
+  // Recursively visit the directory at `path`.
+  void hit_dir(std::string const &path);
+
+
+public:
+  PgFiles(Objecter *o, const std::set<pg_t> &pgs_);
+  ~PgFiles();
+
+  // The destructor releases cmount, so a copy would release the same
+  // handle a second time.
+  PgFiles(const PgFiles &) = delete;
+  PgFiles &operator=(const PgFiles &) = delete;
+
+  // Create and initialise the libcephfs handle; call before scan_path().
+  int init();
+  // Mount, scan everything below `path`, and unmount.
+  int scan_path(std::string const &path);
+};
+
+#endif
+
diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc
new file mode 100644
index 000000000..7c0aa30ab
--- /dev/null
+++ b/src/tools/cephfs/Resetter.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#include <memory>
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/JournalPointer.h"
+
+#include "mds/mdstypes.h"
+#include "mds/MDCache.h"
+#include "mon/MonClient.h"
+#include "mds/events/EResetJournal.h"
+
+#include "Resetter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+using namespace std;
+
+/**
+ * Initialise the underlying MDSUtility and work out which journal inode
+ * is going to be reset.
+ *
+ * @param role_  filesystem/rank whose journal is targeted
+ * @param type   "mdlog" or "purge_queue"; anything else aborts
+ * @param hard   for "mdlog": write a fresh journal pointer instead of
+ *               loading the existing one
+ * @return 0 on success, otherwise the error from MDSUtility::init() or
+ *         from saving/loading the journal pointer
+ */
+int Resetter::init(mds_role_t role_, const std::string &type, bool hard)
+{
+  role = role_;
+  const int init_rc = MDSUtility::init();
+  if (init_rc < 0) {
+    return init_rc;
+  }
+
+  auto fs = fsmap->get_filesystem(role.fscid);
+  ceph_assert(nullptr != fs);
+
+  is_mdlog = false;
+  if (type == "purge_queue") {
+    ino = MDS_INO_PURGE_QUEUE + role.rank;
+  } else if (type == "mdlog") {
+    JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+    if (hard) {
+      // Overwrite the pointer so it refers to the rank's default log inode
+      jp.front = role.rank + MDS_INO_LOG_OFFSET;
+      jp.back = 0;
+      const int save_rc = jp.save(objecter);
+      if (save_rc != 0) {
+        derr << "Error writing journal pointer:  " << cpp_strerror(save_rc) << dendl;
+        return save_rc;
+      }
+    } else {
+      const int load_rc = jp.load(objecter);
+      if (load_rc != 0) {
+        std::cerr << "Error loading journal: " << cpp_strerror(load_rc) <<
+        ", pass --force to forcibly reset this journal" << std::endl;
+        return load_rc;
+      }
+    }
+    // only need to reset ino for mdlog
+    ino = jp.front;
+    is_mdlog = true;
+  } else {
+    ceph_abort(); // should not get here
+  }
+  return 0; 
+}
+
+/**
+ * Reset an existing journal: recover its header, move all positions to
+ * the first layout-period boundary past the old write position, write
+ * the new header and, for the mdlog journal only, append an
+ * EResetJournal event.
+ *
+ * @return 0 on success, otherwise the error reported by the Journaler
+ *         (-ENOENT when the journal does not exist on disk).
+ */
+int Resetter::reset()
+{
+  ceph::mutex mylock = ceph::make_mutex("Resetter::reset::lock");
+  ceph::condition_variable cond;
+  bool done = false;
+  int r = 0;
+
+  auto fs =  fsmap->get_filesystem(role.fscid);
+  ceph_assert(fs != nullptr);
+
+  Journaler journaler("resetter", ino,
+      fs->mds_map.get_metadata_pool(),
+      CEPH_FS_ONDISK_MAGIC,
+      objecter, 0, 0, &finisher);
+  {
+    std::lock_guard locker{lock};
+    journaler.recover(new C_SafeCond(mylock, cond, &done, &r));
+  }
+  {
+    std::unique_lock locker{mylock};
+    cond.wait(locker, [&done] { return done; });
+  }
+  if (r != 0) {
+    if (r == -ENOENT) {
+      cerr << "journal does not exist on-disk. Did you set a bad rank?"
+           << std::endl;
+      std::cerr << "Error loading journal: " << cpp_strerror(r) <<
+        ", pass --force to forcibly reset this journal" << std::endl;
+    } else {
+      // Note the space before "from": without it the errno and the word
+      // run together in the output.
+      cerr << "got error " << r << " from Journaler, failing" << std::endl;
+    }
+    return r;
+  }
+
+  {
+    // Scoped guard instead of a manual lock()/unlock() pair so the lock
+    // is released even if something in here throws.
+    std::lock_guard locker{lock};
+    const uint64_t old_start = journaler.get_read_pos();
+    const uint64_t old_end = journaler.get_write_pos();
+    const uint64_t old_len = old_end - old_start;
+    cout << "old journal was " << old_start << "~" << old_len << std::endl;
+
+    const uint64_t new_start =
+      round_up_to(old_end+1, journaler.get_layout_period());
+    cout << "new journal start will be " << new_start
+         << " (" << (new_start - old_end) << " bytes past old end)" << std::endl;
+
+    journaler.set_read_pos(new_start);
+    journaler.set_write_pos(new_start);
+    journaler.set_expire_pos(new_start);
+    journaler.set_trimmed_pos(new_start);
+    journaler.set_writeable();
+
+    cout << "writing journal head" << std::endl;
+    journaler.write_head(new C_SafeCond(mylock, cond, &done, &r));
+  }
+  {
+    std::unique_lock locker{mylock};
+    cond.wait(locker, [&done] { return done; });
+  }
+  std::lock_guard l{lock};
+  if (r != 0) {
+    return r;
+  }
+ 
+  if (is_mdlog) {
+    // the reset event is specific to the mdlog journal
+    r = _write_reset_event(&journaler);
+    if (r != 0) {
+      return r;
+    }
+  }
+  cout << "done" << std::endl;
+
+  return 0;
+}
+
+/**
+ * Create a brand-new journal from scratch: build a Journaler with the
+ * default log layout, write its header and, for the mdlog journal only,
+ * append an EResetJournal event.
+ *
+ * @return 0 on success, otherwise the error from writing the header or
+ *         the reset event.
+ */
+int Resetter::reset_hard()
+{
+  auto fs =  fsmap->get_filesystem(role.fscid);
+  // Same guard as reset(): fail loudly instead of dereferencing null.
+  ceph_assert(fs != nullptr);
+  
+  Journaler journaler("resetter", ino,
+    fs->mds_map.get_metadata_pool(),
+    CEPH_FS_ONDISK_MAGIC,
+    objecter, 0, 0, &finisher);
+  journaler.set_writeable();
+
+  // Reuse the filesystem looked up above rather than querying the map again
+  file_layout_t default_log_layout = MDCache::gen_default_log_layout(
+      fs->mds_map);
+  journaler.create(&default_log_layout, g_conf()->mds_journal_format);
+
+  C_SaferCond cond;
+  {
+    std::lock_guard l{lock};
+    journaler.write_head(&cond);
+  }
+  
+  int r = cond.wait();
+  if (r != 0) {
+    derr << "Error writing journal header: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  
+  if (is_mdlog) // reset event is specific for mdlog journal
+  {
+    std::lock_guard l{lock};
+    r = _write_reset_event(&journaler);
+    if (r != 0) {
+      derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+  
+  if (is_mdlog) {
+    dout(4) << "Successfully wrote new journal pointer and header for rank "
+      << role << dendl;
+  } else {
+    dout(4) << "Successfully wrote header for rank " << role << dendl;
+  }
+  return 0;
+}
+
+/**
+ * Append an EResetJournal entry to `journaler`, flush it, and then wait
+ * for outstanding prezero operations.
+ *
+ * @return the flush result: negative on failure (in which case the
+ *         prezero wait is skipped).  The result of the prezero wait
+ *         itself is not reported.
+ */
+int Resetter::_write_reset_event(Journaler *journaler)
+{
+  ceph_assert(journaler != NULL);
+
+  bufferlist encoded;
+  {
+    auto reset_event = std::make_unique<EResetJournal>();
+    reset_event->encode_with_header(encoded, CEPH_FEATURES_SUPPORTED_DEFAULT);
+  }
+
+  cout << "writing EResetJournal entry" << std::endl;
+  journaler->append_entry(encoded);
+
+  int flush_rc;
+  {
+    C_SaferCond flushed;
+    journaler->flush(&flushed);
+    flush_rc = flushed.wait();
+  }
+  if (flush_rc < 0) {
+    return flush_rc;
+  }
+
+  {
+    // wait until all journal prezero ops are done
+    C_SaferCond prezeroed;
+    journaler->wait_for_prezero(&prezeroed);
+    prezeroed.wait();
+  }
+
+  return flush_rc;
+}
+
diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h
new file mode 100644
index 000000000..6998e4598
--- /dev/null
+++ b/src/tools/cephfs/Resetter.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef JOURNAL_RESETTER_H_
+#define JOURNAL_RESETTER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you reset an mds journal for troubleshooting or whatever.
+ *
+ * To use, create a Resetter, call init() with the role, the journal type
+ * ("mdlog" or "purge_queue") and whether a hard reset is wanted, and then
+ * call reset() -- or reset_hard() when no usable journal header/pointer
+ * exists.  Neither reset function takes arguments.
+ */
+class Resetter : public MDSUtility {
+private:
+  // Role (filesystem + rank) whose journal is being reset; set by init()
+  mds_role_t role;
+  // Inode number of the journal to reset; set by init()
+  inodeno_t ino;
+  // True when the target is the mdlog journal, which additionally gets an
+  // EResetJournal event.  Initialised here so the flag has a defined
+  // value even before init() runs.
+  bool is_mdlog = false;
+
+protected:
+  // Append an EResetJournal entry to the journal and flush it.
+  int _write_reset_event(Journaler *journaler);
+
+public:
+  Resetter() {}
+  ~Resetter() {}
+
+  int init(mds_role_t role_, const std::string &type, bool hard);
+  /**
+   * For use when no journal header/pointer was present: write one
+   * out from scratch.
+   */
+  int reset_hard();
+  /**
+   * Reset an existing journal: its header is recovered first, then all
+   * positions are moved past the old write position.
+   */
+  int reset();
+};
+
+#endif /* JOURNAL_RESETTER_H_ */
diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc
new file mode 100644
index 000000000..e2d53b86e
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.cc
@@ -0,0 +1,59 @@
+
+#include "RoleSelector.h"
+
+// Parse the rank part of a role string for the filesystem already chosen
+// in `fscid` and append the resulting role(s) to `roles`.
+//
+// "all" and "*" select every rank returned by MDSMap::get_mds_set();
+// anything else must be a decimal rank number.
+//
+// Returns 0 on success, -EINVAL if the string is not a number, -ENOENT
+// if the rank does not exist in the MDS map.
+int MDSRoleSelector::parse_rank(
+    const FSMap &fsmap,
+    std::string const &str)
+{
+  const bool select_all = (str == "all" || str == "*");
+
+  if (!select_all) {
+    std::string parse_err;
+    const mds_rank_t single = strict_strtol(str.c_str(), 10, &parse_err);
+    if (!parse_err.empty()) {
+      return -EINVAL;
+    }
+    if (fsmap.get_filesystem(fscid)->mds_map.is_dne(single)) {
+      return -ENOENT;
+    }
+    roles.push_back(mds_role_t(fscid, single));
+    return 0;
+  }
+
+  std::set<mds_rank_t> in_ranks;
+  fsmap.get_filesystem(fscid)->mds_map.get_mds_set(in_ranks);
+  for (auto it = in_ranks.begin(); it != in_ranks.end(); ++it) {
+    roles.push_back(mds_role_t(fscid, *it));
+  }
+  return 0;
+}
+
+// Parse a role selector of the form "<rank>" or "<filesystem>:<rank>".
+//
+// The unqualified form is accepted only when allow_unqualified_rank is
+// set and the FSMap holds exactly one filesystem.  A colon in first or
+// last position is rejected.
+//
+// Returns 0 on success, -EINVAL for a malformed selector, or the error
+// from FSMap::parse_filesystem() / parse_rank().
+int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str,
+                           bool allow_unqualified_rank)
+{
+  const auto sep = str.find(":");
+
+  if (sep == std::string::npos) {
+    // An unqualified rank.  Only valid if there is only one
+    // namespace.
+    const bool unambiguous = (fsmap.filesystem_count() == 1);
+    if (!(unambiguous && allow_unqualified_rank)) {
+      return -EINVAL;
+    }
+    fscid = fsmap.get_filesystem()->fscid;
+    return parse_rank(fsmap, str);
+  }
+
+  // Either side of the colon being empty is an error
+  if (sep == 0 || sep == str.size() - 1) {
+    return -EINVAL;
+  }
+
+  const std::string fs_part = str.substr(0, sep);
+  const std::string rank_part = str.substr(sep + 1);
+
+  std::shared_ptr<const Filesystem> resolved;
+  const int lookup_rc = fsmap.parse_filesystem(fs_part, &resolved);
+  if (lookup_rc != 0) {
+    return lookup_rc;
+  }
+
+  fscid = resolved->fscid;
+  return parse_rank(fsmap, rank_part);
+}
+
diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h
new file mode 100644
index 000000000..9090b7200
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.h
@@ -0,0 +1,36 @@
+
+#ifndef ROLE_SELECTOR_H_
+#define ROLE_SELECTOR_H_
+
+#include <string>
+#include <vector>
+#include "mds/mdstypes.h"
+#include "mds/FSMap.h"
+
+/**
+ * When you want to let the user act on a single rank in a namespace,
+ * or all of them.
+ */
+class MDSRoleSelector
+{
+  public:
+    // Starts out with no filesystem selected and no roles.
+    MDSRoleSelector()
+      : fscid(FS_CLUSTER_ID_NONE)
+    {}
+
+    // Parse "<rank>" or "<filesystem>:<rank>" (rank may be "all" or "*")
+    // and record the selected roles.  Returns 0 or a negative errno.
+    int parse(const FSMap &fsmap, std::string const &str,
+            bool allow_unqualified_rank=true);
+
+    // Roles collected by parse()
+    const std::vector<mds_role_t> &get_roles() const
+    {
+      return roles;
+    }
+
+    // Filesystem chosen by parse(), FS_CLUSTER_ID_NONE before that
+    fs_cluster_id_t get_ns() const
+    {
+      return fscid;
+    }
+
+  protected:
+    // Parse only the rank part, for the filesystem already in `fscid`
+    int parse_rank(
+        const FSMap &fsmap,
+        std::string const &str);
+
+    std::vector<mds_role_t> roles;
+    fs_cluster_id_t fscid;
+};
+
+#endif // ROLE_SELECTOR_H_
+
diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc
new file mode 100644
index 000000000..dcd35a624
--- /dev/null
+++ b/src/tools/cephfs/TableTool.cc
@@ -0,0 +1,419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "mds/SessionMap.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+
+#include "TableTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+using namespace std;
+
+/**
+ * Print the command-line synopsis followed by the generic client options.
+ */
+void TableTool::usage()
+{
+  // Each synopsis line needs its own newline; without it the two command
+  // forms are printed run together on a single line.
+  std::cout << "Usage: \n"
+    << "  cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>\n"
+    << "  cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>"
+    << std::endl;
+
+  generic_client_usage();
+}
+
+
+/**
+ * For a function that takes an MDS role as an argument and
+ * returns an error code, execute it on the roles specified
+ * by `role_selector`.
+ */
+int TableTool::apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f)
+{
+  ceph_assert(f != NULL);
+
+  // First non-zero per-rank result; later failures do not overwrite it
+  int first_error = 0;
+
+  f->open_object_section("ranks");
+
+  const std::vector<mds_role_t> &selected = role_selector.get_roles();
+  for (auto it = selected.begin(); it != selected.end(); ++it) {
+    const mds_role_t role = *it;
+
+    // One section per rank, named after the rank number
+    std::ostringstream section_name;
+    section_name << role.rank;
+    f->open_object_section(section_name.str().c_str());
+
+    // The callback's own output goes under "data" ...
+    f->open_object_section("data");
+    const int rank_rc = fptr(role, f);
+    f->close_section();
+
+    if (first_error == 0) {
+      first_error = rank_rc;
+    }
+
+    // ... and its return code next to it under "result"
+    f->dump_int("result", rank_rc);
+    f->close_section();
+  }
+
+  f->close_section();
+
+  return first_error;
+}
+
+
+/**
+ * This class wraps an MDS table class (SessionMap, SnapServer, InoTable)
+ * with offline load/store code such that we can do offline dumps and resets
+ * on those tables.
+ */
+template <typename A>
+class TableHandler
+{
+protected:
+  // Name of the RADOS object holding the table: "mds<rank>_<name>", or
+  // "mds_<name>" when the role is NONE
+  std::string object_name;
+
+  // The role in question (may be NONE)
+  mds_role_t role;
+
+  // True when the stored table starts with a version field that has to
+  // be decoded/encoded ahead of the table body
+  bool mds_table;
+
+public:
+  TableHandler(mds_role_t r, std::string const &name, bool mds_table_)
+    : role(r), mds_table(mds_table_)
+  {
+    std::ostringstream composed;
+    composed << "mds";
+    if (!role.is_none()) {
+      composed << role.rank;
+    }
+    composed << "_" << name;
+    object_name = composed.str();
+  }
+
+  // Read the table object, decode it and dump it to `f`.
+  // Returns 0, the (negative) read error, or -EIO if decoding fails.
+  int load_and_dump(librados::IoCtx *io, Formatter *f)
+  {
+    ceph_assert(io != NULL);
+    ceph_assert(f != NULL);
+
+    bufferlist raw;
+    const int read_rc = io->read(object_name, raw, 0, 0);
+    if (read_rc < 0) {
+      derr << "error reading table object " << object_name
+        << ": " << cpp_strerror(read_rc) << dendl;
+      return read_rc;
+    }
+
+    auto cursor = raw.cbegin();
+    try {
+      if (mds_table) {
+        version_t version;
+        decode(version, cursor);
+        f->dump_int("version", version);
+      }
+      A decoded;
+      decoded.set_rank(role.rank);
+      decoded.decode(cursor);
+      decoded.dump(f);
+    } catch (const buffer::error &e) {
+      derr << "table " << object_name << " is corrupt" << dendl;
+      return -EIO;
+    }
+    return 0;
+  }
+
+  // Replace the stored table with a freshly reset one.
+  int reset(librados::IoCtx *io)
+  {
+    A blank;
+    blank.set_rank(role.rank);
+    blank.reset_state();
+    return write(blank, io);
+  }
+
+protected:
+
+  // Encode `table_inst` (preceded by version 1 when mds_table is set) and
+  // overwrite the table object with it.  Returns the write_full() result.
+  int write(const A &table_inst, librados::IoCtx *io)
+  {
+    bufferlist encoded;
+    if (mds_table) {
+      version_t version = 1;
+      encode(version, encoded);
+    }
+    table_inst.encode_state(encoded);
+
+    const int write_rc = io->write_full(object_name, encoded);
+    if (write_rc != 0) {
+      derr << "error writing table object " << object_name
+        << ": " << cpp_strerror(write_rc) << dendl;
+    }
+    return write_rc;
+  }
+};
+
+// Counterpart of TableHandler for tables stored as an OMAP header plus
+// OMAP key/value pairs instead of object data.
+template <typename A>
+class TableHandlerOmap
+{
+private:
+  // Name of the RADOS object holding the table: "mds<rank>_<name>", or
+  // "mds_<name>" when the role is NONE
+  std::string object_name;
+
+  // The role (rank may be NONE)
+  mds_role_t role;
+
+  // Stored from the constructor argument; not read by the code in this class
+  bool mds_table;
+
+public:
+  TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_)
+    : role(r), mds_table(mds_table_)
+  {
+    std::ostringstream composed;
+    composed << "mds";
+    if (!role.is_none()) {
+      composed << role.rank;
+    }
+    composed << "_" << name;
+    object_name = composed.str();
+  }
+
+  // Read header and all OMAP values, decode them and dump the table to `f`.
+  // Returns 0, the librados error, or -EIO if decoding fails.
+  int load_and_dump(librados::IoCtx *io, Formatter *f)
+  {
+    ceph_assert(io != NULL);
+    ceph_assert(f != NULL);
+
+    // Header first
+    bufferlist header_bl;
+    int rc = io->omap_get_header(object_name, &header_bl);
+    if (rc != 0) {
+      derr << "error reading header on '" << object_name << "': "
+           << cpp_strerror(rc) << dendl;
+      return rc;
+    }
+
+    A table_inst;
+    table_inst.set_rank(role.rank);
+    try {
+      table_inst.decode_header(header_bl);
+    } catch (const buffer::error &e) {
+      derr << "table " << object_name << " is corrupt" << dendl;
+      return -EIO;
+    }
+
+    // Then the values, one chunk at a time; each request resumes after
+    // the last key of the previous chunk and an empty chunk ends the loop
+    std::string resume_after = "";
+    for (;;) {
+      std::map<std::string, bufferlist> chunk;
+      rc = io->omap_get_vals(object_name, resume_after,
+          g_conf()->mds_sessionmap_keys_per_op, &chunk);
+      if (rc != 0) {
+        derr << "error reading values: " << cpp_strerror(rc) << dendl;
+        return rc;
+      }
+      if (chunk.empty()) {
+        break;
+      }
+
+      try {
+        table_inst.decode_values(chunk);
+      } catch (const buffer::error &e) {
+        derr << "table " << object_name << " is corrupt" << dendl;
+        return -EIO;
+      }
+      resume_after = chunk.rbegin()->first;
+    }
+
+    table_inst.dump(f);
+
+    return 0;
+  }
+
+  // Clear all OMAP keys and write the header of a freshly reset table in
+  // a single write operation.
+  int reset(librados::IoCtx *io)
+  {
+    A blank;
+    blank.set_rank(role.rank);
+    blank.reset_state();
+
+    bufferlist header_bl;
+    blank.encode_header(&header_bl);
+
+    librados::ObjectWriteOperation txn;
+    txn.omap_clear();
+    txn.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
+    txn.omap_set_header(header_bl);
+
+    return io->operate(object_name, &txn);
+  }
+};
+
+// TableHandler for the "inotable" object, extended with take_inos.
+class InoTableHandler : public TableHandler<InoTable>
+{
+  public:
+  explicit InoTableHandler(mds_role_t r)
+    : TableHandler(r, "inotable", true)
+  {}
+
+  // Start from a freshly reset InoTable, call force_consume_to(max) on
+  // it, and write the table out only if that call reports a change.
+  // The version and the table are dumped to `f` either way.
+  // Returns 0 when nothing was written, otherwise the write result.
+  int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f)
+  {
+    InoTable fresh;
+    fresh.set_rank(role.rank);
+    fresh.reset_state();
+
+    const int rc = fresh.force_consume_to(max) ? write(fresh, io) : 0;
+
+    f->dump_int("version", fresh.get_version());
+    fresh.dump(f);
+
+    return rc;
+  }
+};
+
+
+/**
+ * Entry point of the table tool.
+ *
+ * Expects at least three arguments: <rank selector> <mode> <arg>, where
+ * mode is "reset" or "show" (arg names the table: session, inode or
+ * snap) or "take_inos" (arg is the maximum inode number).
+ *
+ * Connects to RADOS, opens the metadata pool of the selected filesystem,
+ * runs the requested operation and prints the JSON result on stdout.
+ *
+ * @return 0 on success, -EINVAL for bad arguments, otherwise the error
+ *         of the failing step.
+ */
+int TableTool::main(std::vector<const char*> &argv)
+{
+  int r;
+
+  dout(10) << __func__ << dendl;
+
+  // RADOS init
+  // ==========
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+    return r;
+  }
+
+  dout(4) << "connecting to RADOS..." << dendl;
+  r = rados.connect();
+  if (r < 0) {
+    derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // Require at least 3 args <rank> <mode> <arg> [args...]
+  if (argv.size() < 3) {
+    cerr << "missing required 3 arguments" << std::endl;
+    return -EINVAL;
+  }
+
+  const std::string role_str = std::string(argv[0]);
+  const std::string mode = std::string(argv[1]);
+  // Third argument: the table name for reset/show, the inode number for
+  // take_inos.
+  const std::string table = std::string(argv[2]);
+
+  r = role_selector.parse(*fsmap, role_str);
+  if (r < 0) {
+    // Quote the selector on both sides (the opening quote was missing)
+    derr << "Bad rank selection: '" << role_str << "'" << dendl;
+    return r;
+  }
+
+  auto fs =  fsmap->get_filesystem(role_selector.get_ns());
+  ceph_assert(fs != nullptr);
+  int64_t const pool_id = fs->mds_map.get_metadata_pool();
+  dout(4) << "resolving pool " << pool_id << dendl;
+  std::string pool_name;
+  r = rados.pool_reverse_lookup(pool_id, &pool_name);
+  if (r < 0) {
+    derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!"
+         << dendl;
+    return r;
+  }
+
+  dout(4) << "creating IoCtx.." << dendl;
+  r = rados.ioctx_create(pool_name.c_str(), io);
+  if (r != 0) {
+    return r;
+  }
+
+  JSONFormatter jf(true);
+  if (mode == "reset") {
+    if (table == "session") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+            return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
+      }, &jf);
+    } else if (table == "inode") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+            return TableHandler<InoTable>(rank, "inotable", true).reset(&io);
+      }, &jf);
+    } else if (table == "snap") {
+      // The snap table is handled once with a default (NONE) role rather
+      // than per selected rank
+      r = TableHandler<SnapServer>(mds_role_t(), "snaptable", true).reset(&io);
+      jf.open_object_section("reset_snap_status");
+      jf.dump_int("result", r);
+      jf.close_section();
+    } else {
+      cerr << "Invalid table '" << table << "'" << std::endl;
+      return -EINVAL;
+    }
+  } else if (mode == "show") {
+    if (table == "session") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
+      }, &jf);
+    } else if (table == "inode") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);
+      }, &jf);
+    } else if (table == "snap") {
+      jf.open_object_section("show_snap_table");
+      {
+        r = TableHandler<SnapServer>(
+            mds_role_t(), "snaptable", true).load_and_dump(&io, &jf);
+        jf.dump_int("result", r);
+      }
+      jf.close_section();
+    } else {
+      cerr << "Invalid table '" << table << "'" << std::endl;
+      return -EINVAL;
+    }
+  } else if (mode == "take_inos") {
+    const std::string &ino_str = table;
+    std::string ino_err;
+    inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err);
+    if (!ino_err.empty()) {
+      derr << "Bad ino '" << ino_str << "'" << dendl;
+      return -EINVAL;
+    }
+    r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int {
+      return InoTableHandler(rank).take_inos(&io, ino, f);
+    }, &jf);
+  } else {
+    cerr << "Invalid mode '" << mode << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // Subcommand should have written to formatter, flush it
+  jf.flush(std::cout);
+  std::cout << std::endl;
+  return r;
+}
+
diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h
new file mode 100644
index 000000000..bf9b95c12
--- /dev/null
+++ b/src/tools/cephfs/TableTool.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+
+#include "include/rados/librados.hpp"
+
+/**
+ * Command line tool for debugging the backing store of
+ * MDSTable instances.
+ */
+class TableTool : public MDSUtility
+{
+  private:
+    // NOTE(review): presumably identifies which MDS role(s) a command is
+    // applied to -- confirm against RoleSelector.h.
+    MDSRoleSelector role_selector;
+
+    // I/O handles
+    librados::Rados rados;
+    librados::IoCtx io;
+
+    /**
+     * Run a per-role operation.
+     *
+     * @param fptr callable that receives an MDS role and a Formatter and
+     *             returns an int result code
+     * @param f    formatter handed on to fptr
+     * @return int result code (callers store it as the command's result)
+     */
+    int apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f);
+
+  public:
+    // Print command line usage for the tool.
+    static void usage();
+
+    /**
+     * Execute the sub-command described by argv.
+     *
+     * @param argv command line arguments of the tool
+     * @return int result code of the executed sub-command
+     */
+    int main(std::vector<const char*> &argv);
+
+};
+
diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc
new file mode 100644
index 000000000..3e1b75cb6
--- /dev/null
+++ b/src/tools/cephfs/cephfs-data-scan.cc
@@ -0,0 +1,46 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "DataScan.h"
+
+using namespace std;
+
+/**
+ * Entry point of cephfs-data-scan: parse the command line, initialise the
+ * global Ceph context and hand the arguments to DataScan.
+ *
+ * @return 0 on success, otherwise the non-zero code reported by
+ *         DataScan::init() or DataScan::main()
+ */
+int main(int argc, const char **argv)
+{
+  auto args = argv_to_vec(argc, argv);
+
+  // Nothing to do without arguments: point the user at the help flag.
+  if (args.empty()) {
+    std::cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    DataScan::usage();
+    exit(0);
+  }
+
+  // NOTE(review): cct is not referenced again; presumably it only keeps the
+  // object returned by global_init() alive until main() returns -- confirm.
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  DataScan tool;
+
+  // Connect to mon cluster, download MDS map etc
+  const int init_rc = tool.init();
+  if (init_rc != 0) {
+    std::cerr << "Error in initialization: " << cpp_strerror(init_rc) << std::endl;
+    return init_rc;
+  }
+
+  // Finally, execute the user's commands
+  const int run_rc = tool.main(args);
+  if (run_rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(run_rc) << ")" << std::endl;
+  }
+
+  return run_rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc
new file mode 100644
index 000000000..f95e7e265
--- /dev/null
+++ b/src/tools/cephfs/cephfs-journal-tool.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "JournalTool.h"
+
+
+/**
+ * Entry point of cephfs-journal-tool: parse the command line, initialise the
+ * global Ceph context and hand the arguments to JournalTool.
+ *
+ * @return 0 on success, otherwise the non-zero code reported by
+ *         JournalTool::init() or JournalTool::main()
+ */
+int main(int argc, const char **argv)
+{
+  auto args = argv_to_vec(argc, argv);
+
+  // Nothing to do without arguments: point the user at the help flag.
+  if (args.empty()) {
+    std::cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    JournalTool::usage();
+    exit(0);
+  }
+
+  // NOTE(review): cct is not referenced again; presumably it only keeps the
+  // object returned by global_init() alive until main() returns -- confirm.
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  JournalTool tool;
+
+  // Connect to mon cluster, download MDS map etc
+  const int init_rc = tool.init();
+  if (init_rc != 0) {
+    std::cerr << "Error in initialization: " << cpp_strerror(init_rc) << std::endl;
+    return init_rc;
+  }
+
+  // Finally, execute the user's commands
+  const int run_rc = tool.main(args);
+  if (run_rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(run_rc) << ")" << std::endl;
+  }
+
+  return run_rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-meta-injection.cc b/src/tools/cephfs/cephfs-meta-injection.cc
new file mode 100644
index 000000000..48a913469
--- /dev/null
+++ b/src/tools/cephfs/cephfs-meta-injection.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#include <include/types.h>
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "MetaTool.h"
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+using std::string;
+using namespace std;
+static string version = "cephfs-meta-injection v1.1";
+
+/**
+ * Entry point of cephfs-meta-injection.
+ *
+ * Initialises the global Ceph context, parses the tool's own options with
+ * boost::program_options (the mode is the single positional argument) and
+ * forwards them to MetaTool::main().
+ *
+ * @return 0 on success or after printing the help text, -1 when the command
+ *         line cannot be parsed or MetaTool::main() fails, and the code
+ *         returned by MetaTool::init() when initialisation fails
+ */
+int main(int argc, const char **argv)
+{
+  auto args = argv_to_vec(argc, argv);
+  env_to_vec(args);
+  // NOTE(review): cct is not referenced again; presumably it only keeps the
+  // object returned by global_init() alive until main() returns -- confirm.
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  string rank_str, minfo, ino, out, in;
+  po::options_description general("general options");
+  general.add_options()
+      ("help,h", "produce help message")
+      ("debug", "show debug info")
+      ("rank,r", po::value<string>(&rank_str), "the rank of cephfs, default(0) (e.g. -r cephfs_a:0)")
+      ("minfo", po::value<string>(&minfo), "specify metapool, datapools and rank (e.g. cephfs_metadata_a:cephfs_data_a:0)")
+      ("ino,i", po::value<string>(&ino), "specify inode. e.g. 1099511627776 or 0x10000000000, you can find it with cmd, 'ls -i'")
+      ("out,o", po::value<string>(&out), "output file")
+      ("in", po::value<string>(&in), "input file")
+      ("yes-i-really-really-mean-it", "need by amend info")
+      ;
+
+  string mode;
+  po::options_description modeoptions("mode options");
+  modeoptions.add_options()
+      ("mode", po::value<string>(&mode),
+       "\tlistc : list all obj of dir\n"
+       "\tshowm : show the info of ino\n"
+       "\tshowfn : show the fnode of dir\n"
+       "\tamend : amend part of the meta data\n"
+       "\tamendfn : amend fnode from file\n"
+       );
+
+  // The first positional argument selects the mode.
+  po::positional_options_description p;
+  p.add("mode", 1);
+
+  po::options_description all("all options");
+  all.add(modeoptions).add(general);
+  po::variables_map vm;
+  try {
+    // Unregistered options are tolerated because the raw argv is parsed here,
+    // not the args vector that global_init() was given.
+    po::store(po::command_line_parser(argc, argv).options(all).positional(p).allow_unregistered().run(), vm);
+    // notify() may throw as well, so it is kept inside the same handler.
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::cerr << "error : " << e.what() << std::endl;
+    return -1;
+  } catch (...) {
+    // A parse failure is an error: report it on stderr with a non-zero code.
+    std::cerr << "param error" << std::endl;
+    return -1;
+  }
+
+  if (vm.count("help")) {
+    std::cout << version << std::endl;
+    std::cout << "usage : \n"
+              << "  cephfs-meta-injection <listc|showm|showfn|amend|amendfn> -r <fsname:rank> -i <ino>"
+              << std::endl;
+    std::cout << "example : \n"
+              << "  amend info of inode(1099531628828)\n"
+              << "    cephfs-meta-injection showm -r cephfs_a:0 -i 1099531628828 -o out\n"
+              << "    alter file\n"
+              << "    cephfs-meta-injection amend -r cephfs_a:0 -i 1099531628828 --in out --yes-i-really-really-mean-it"
+              << std::endl;
+    std::cout << all << std::endl;
+    return 0;
+  }
+
+  MetaTool mt(vm.count("debug"));
+  int rc = mt.init();
+  if (rc != 0) {
+    std::cerr << "error in initialization: " << cpp_strerror(rc) << std::endl;
+    return rc;
+  }
+  rc = mt.main(mode, rank_str, minfo, ino, out, in, vm.count("yes-i-really-really-mean-it"));
+  if (rc != 0) {
+    std::cerr << "error (" << cpp_strerror(rc) << ")" << std::endl;
+    return -1;
+  }
+  return rc;
+}
diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc
new file mode 100644
index 000000000..4b57080d6
--- /dev/null
+++ b/src/tools/cephfs/cephfs-table-tool.cc
@@ -0,0 +1,46 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "TableTool.h"
+
+using namespace std;
+
+/**
+ * Entry point of cephfs-table-tool: parse the command line, initialise the
+ * global Ceph context and hand the arguments to TableTool.
+ *
+ * @return 0 on success, otherwise the non-zero code reported by
+ *         TableTool::init() or TableTool::main()
+ */
+int main(int argc, const char **argv)
+{
+  auto args = argv_to_vec(argc, argv);
+
+  // Nothing to do without arguments: point the user at the help flag.
+  if (args.empty()) {
+    std::cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    TableTool::usage();
+    exit(0);
+  }
+
+  // NOTE(review): cct is not referenced again; presumably it only keeps the
+  // object returned by global_init() alive until main() returns -- confirm.
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  TableTool tool;
+
+  // Connect to mon cluster, download MDS map etc
+  const int init_rc = tool.init();
+  if (init_rc != 0) {
+    std::cerr << "Error in initialization: " << cpp_strerror(init_rc) << std::endl;
+    return init_rc;
+  }
+
+  // Finally, execute the user's commands
+  const int run_rc = tool.main(args);
+  if (run_rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(run_rc) << ")" << std::endl;
+  }
+
+  return run_rc;
+}
+
+
diff --git a/src/tools/cephfs/first-damage.py b/src/tools/cephfs/first-damage.py
new file mode 100644
index 000000000..0479dc8cb
--- /dev/null
+++ b/src/tools/cephfs/first-damage.py
@@ -0,0 +1,156 @@
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2022 Red Hat, Inc.
+#
+# This is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1, as published by the Free Software
+# Foundation.  See file COPYING.
+
+# Suggested recovery sequence (for single MDS cluster):
+#
+# 1) Unmount all clients.
+#
+# 2) Flush the journal (if possible):
+#
+#    ceph tell mds.<fs_name>:0 flush journal
+#
+# 3) Fail the file system:
+#
+#    ceph fs fail <fs_name>
+#
+# 4a) Recover dentries from the journal. This will be a no-op if the MDS flushed the journal successfully:
+#
+#    cephfs-journal-tool --rank=<fs_name>:0 event recover_dentries summary
+#
+# 4b) If all good so far, reset the journal:
+#
+#    cephfs-journal-tool --rank=<fs_name>:0 journal reset
+#
+# 5) Run this tool to see list of damaged dentries:
+#
+#    python3 first-damage.py --memo run.1 <pool>
+#
+# 6) Optionally, remove them:
+#
+#    python3 first-damage.py --memo run.2 --remove <pool>
+#
+# Note: use --memo to specify a different file to save objects that have
+# already been traversed, for independent runs.
+#
+# This has the effect of removing that dentry from the snapshot or HEAD
+# (current hierarchy).  Note: the inode's linkage will be lost. The inode may
+# be recoverable in lost+found during a future data scan recovery.
+
+import argparse
+import logging
+import os
+import rados
+import re
+import sys
+import struct
+
+log = logging.getLogger("first-damage-traverse")
+
+MEMO = None
+REMOVE = False
+POOL = None
+NEXT_SNAP = None
+CONF = os.environ.get('CEPH_CONF')
+REPAIR_NOSNAP = None
+
+CEPH_NOSNAP = 0xfffffffe # int32 -2
+
+DIR_PATTERN = re.compile(r'[0-9a-fA-F]{8,}\.[0-9a-fA-F]+')
+
+CACHE = set()
+
+def traverse(MEMO, ioctx):
+    """
+    Scan the omap entries of every directory-like object in the pool and
+    report (optionally repair or remove) entries whose `first` value is
+    greater than NEXT_SNAP.
+
+    :param MEMO: writable text file; the name of each fully examined object
+                 is appended to it, one per line
+    :param ioctx: rados I/O context used to list, read and modify objects
+
+    Reads the module-level NEXT_SNAP, REMOVE, REPAIR_NOSNAP and CACHE values.
+    """
+    for o in ioctx.list_objects():
+        # Only object names that fully match DIR_PATTERN are examined.
+        if not DIR_PATTERN.fullmatch(o.key):
+            log.debug("skipping %s", o.key)
+            continue
+        elif o.key in CACHE:
+            log.debug("skipping previously examined object %s", o.key)
+            continue
+        log.info("examining: %s", o.key)
+
+        with rados.ReadOpCtx() as rctx:
+            nkey = None
+            while True:
+                # Fetch the next batch of at most 100 omap entries.
+                # NOTE(review): nkey is presumably a "start after" key -- confirm
+                # against the rados bindings.
+                it = ioctx.get_omap_vals(rctx, nkey, None, 100, omap_key_type=bytes)[0]
+                ioctx.operate_read_op(rctx, o.key)
+                # Stays None when the batch is empty, which ends the loop below.
+                nkey = None
+                for (dnk, val) in it:
+                    log.debug(f'\t{dnk}: val size {len(val)}')
+                    # The first four bytes of the value, little-endian unsigned.
+                    (first,) = struct.unpack('<I', val[:4])
+                    if first > NEXT_SNAP:
+                        log.warning(f"found {o.key}:{dnk} first (0x{first:x}) > NEXT_SNAP (0x{NEXT_SNAP:x})")
+                        if REPAIR_NOSNAP and dnk.endswith(b"_head") and first == CEPH_NOSNAP:
+                            log.warning(f"repairing first==CEPH_NOSNAP damage, setting to NEXT_SNAP (0x{NEXT_SNAP:x})")
+                            # This local is not read again in this iteration.
+                            first = NEXT_SNAP
+                            # Overwrite the first four bytes in a copy and store it back.
+                            nval = bytearray(val)
+                            struct.pack_into("<I", nval, 0, NEXT_SNAP)
+                            with rados.WriteOpCtx() as wctx:
+                                ioctx.set_omap(wctx, (dnk,), (bytes(nval),))
+                                ioctx.operate_write_op(wctx, o.key)
+                        elif REMOVE:
+                            log.warning(f"removing {o.key}:{dnk}")
+                            with rados.WriteOpCtx() as wctx:
+                                ioctx.remove_omap_keys(wctx, [dnk])
+                                ioctx.operate_write_op(wctx, o.key)
+                    # Remember the last key seen so the next batch continues from it.
+                    nkey = dnk
+                if nkey is None:
+                    break
+        # Record the object only once all of its entries have been processed.
+        MEMO.write(f"{o.key}\n")
+
+if __name__ == '__main__':
+    # Default log and memo files live in the home directory and are named
+    # after this script.
+    outpath = os.path.join(os.path.expanduser('~'), os.path.basename(sys.argv[0]))
+    P = argparse.ArgumentParser(description="remove CephFS metadata dentries with invalid first snapshot")
+    P.add_argument('--conf', action='store', help='Ceph conf file', type=str, default=CONF)
+    P.add_argument('--debug', action='store', help='debug file', type=str, default=outpath+'.log')
+    P.add_argument('--memo', action='store', help='db for traversed dirs', default=outpath+'.memo')
+    P.add_argument('--next-snap', action='store', help='force next-snap (dev)', type=int)
+    P.add_argument('--remove', action='store_true', help='remove bad dentries', default=False)
+    P.add_argument('--repair-nosnap', action='store_true', help='repair first=CEPH_NOSNAP damage', default=False)
+    P.add_argument('pool', action='store', help='metadata pool', type=str)
+    NS = P.parse_args()
+
+    # All log output goes to the --debug file.
+    logging.basicConfig(filename=NS.debug, level=logging.DEBUG)
+
+    # Publish the parsed options through the module-level names that
+    # traverse() reads.
+    MEMO = NS.memo
+    REMOVE = NS.remove
+    POOL = NS.pool
+    NEXT_SNAP = NS.next_snap
+    CONF = NS.conf
+    REPAIR_NOSNAP = NS.repair_nosnap
+
+    log.info("running as pid %d", os.getpid())
+
+    # Seed CACHE with the object names recorded in an existing memo file so
+    # they are skipped; a missing memo file simply means nothing is cached.
+    try:
+        with open(MEMO) as f:
+            for line in f.readlines():
+                CACHE.add(line.rstrip())
+    except FileNotFoundError:
+        pass
+
+    R = rados.Rados(conffile=CONF)
+    R.connect()
+    ioctx = R.open_ioctx(POOL)
+
+    # Without --next-snap, derive the value from the "mds_snaptable" object.
+    if NEXT_SNAP is None:
+        data = ioctx.read("mds_snaptable")
+        # skip "version" of MDSTable payload
+        # V=$(dd if="$SNAPTABLE" bs=1 count=1 skip=8 | od --endian=little -An -t u1)
+        V = struct.unpack('<b', data[8:9])[0]
+        log.debug("version is %d", V)
+        if V != 5:
+            raise RuntimeError("incompatible snaptable")
+        # skip version,struct_v,compat_v,length
+        # NEXT_SNAP=$((1 + $(dd if="$SNAPTABLE" bs=1 count=8 skip=14 | od --endian=little -An -t u8)))
+        NEXT_SNAP = 1 + struct.unpack('<Q', data[14:22])[0]
+        log.debug("NEXT_SNAP = %d", NEXT_SNAP)
+
+    # Append mode keeps the entries written by earlier runs.
+    with open(MEMO, 'a') as f:
+        log.info("saving traversed keys to %s to allow resuming", MEMO)
+        traverse(f, ioctx)
diff --git a/src/tools/cephfs/shell/CMakeLists.txt b/src/tools/cephfs/shell/CMakeLists.txt
new file mode 100644
index 000000000..5a1f6ad80
--- /dev/null
+++ b/src/tools/cephfs/shell/CMakeLists.txt
@@ -0,0 +1,7 @@
+include(Distutils)
+distutils_install_module(cephfs-shell)
+
+if(WITH_TESTS)
+  include(AddCephTest)
+  add_tox_test(cephfs-shell)
+endif()
diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell
new file mode 100755
index 000000000..58884a275
--- /dev/null
+++ b/src/tools/cephfs/shell/cephfs-shell
@@ -0,0 +1,1854 @@
+#!/usr/bin/python3
+# coding = utf-8
+
+import argparse
+import os
+import os.path
+import sys
+import cephfs as libcephfs
+import shutil
+import traceback
+import colorama
+import fnmatch
+import math
+import re
+import shlex
+import stat
+import errno
+
+from distutils.version import LooseVersion
+
+from cmd2 import Cmd
+from cmd2 import __version__ as cmd2_version
+# XXX: In cmd2 versions < 1.0.1, we'll get SystemExit(2) instead of
+# Cmd2ArgparseError
+if LooseVersion(cmd2_version) >= LooseVersion("1.0.1"):
+    from cmd2.exceptions import Cmd2ArgparseError
+else:
+    # HACK: so that we don't have check for version everywhere
+    # Cmd2ArgparseError is used.
+    # The placeholder has no behaviour of its own; it only gives the name a
+    # definition so that isinstance() checks against it keep working.
+    class Cmd2ArgparseError:
+        pass
+
+if sys.version_info.major < 3:
+    raise RuntimeError("cephfs-shell is only compatible with python3")
+
+# Use cmd2's own decorator when it is importable, otherwise fall back to the
+# local implementation below.
+try:
+    from cmd2 import with_argparser
+except ImportError:
+    def with_argparser(argparser):
+        """
+        Return a decorator that parses a command's argument line with
+        `argparser` and passes the resulting namespace to the command.
+        """
+        import functools
+
+        def argparser_decorator(func):
+            @functools.wraps(func)
+            def wrapper(thiz, cmdline):
+                if isinstance(cmdline, list):
+                    arglist = cmdline
+                else:
+                    # do not split if it's already a list
+                    arglist = shlex.split(cmdline, posix=False)
+                    # in case user quotes the command args
+                    arglist = [arg.strip('\'""') for arg in arglist]
+                try:
+                    args = argparser.parse_args(arglist)
+                except SystemExit:
+                    shell.exit_code = 1
+                    # argparse exits at seeing bad arguments
+                    return
+                else:
+                    return func(thiz, args)
+            # Drop the first three characters of the function name to get the
+            # program name (command handlers are named 'do_<command>').
+            argparser.prog = func.__name__[3:]
+            # Reuse the handler's docstring when the parser has no description.
+            if argparser.description is None and func.__doc__:
+                argparser.description = func.__doc__
+
+            return wrapper
+
+        return argparser_decorator
+
+
+cephfs = None   # holds CephFS Python bindings
+shell = None    # holds instance of class CephFSShell
+# Maps an error key ('Misc', 'KeyboardInterrupt' or an errno value) to the
+# value that set_exit_code_msg() stores in shell.exit_code.
+exit_codes = {'Misc': 1,
+              'KeyboardInterrupt': 2,
+              errno.EPERM: 3,
+              errno.EACCES: 4,
+              errno.ENOENT: 5,
+              errno.EIO: 6,
+              errno.ENOSPC: 7,
+              errno.EEXIST: 8,
+              errno.ENODATA: 9,
+              errno.EINVAL: 10,
+              errno.EOPNOTSUPP: 11,
+              errno.ERANGE: 12,
+              errno.EWOULDBLOCK: 13,
+              errno.ENOTEMPTY: 14,
+              errno.ENOTDIR: 15,
+              errno.EDQUOT: 16,
+              errno.EPIPE: 17,
+              errno.ESHUTDOWN: 18,
+              errno.ECONNABORTED: 19,
+              errno.ECONNREFUSED: 20,
+              errno.ECONNRESET: 21,
+              errno.EINTR: 22,
+              errno.EISDIR: 23}
+
+
+#########################################################################
+#
+# The following functions are generically useful throughout class CephFSShell
+#
+#######################################################################
+
+
+def poutput(s, end='\n'):
+    """
+    Print `s` through the global shell instance, followed by `end`.
+    """
+    shell.poutput(s, end=end)
+
+
+def perror(msg, **kwargs):
+    """
+    Report `msg` as an error through the global shell instance; keyword
+    arguments are forwarded unchanged.
+    """
+    shell.perror(msg, **kwargs)
+
+
+def set_exit_code_msg(errcode='Misc', msg=''):
+    """
+    Set exit code and print error message
+    """
+    if isinstance(msg, libcephfs.Error):
+        shell.exit_code = exit_codes[msg.get_error_code()]
+    else:
+        shell.exit_code = exit_codes[errcode]
+    if msg:
+        perror(msg)
+
+
+def mode_notation(mode):
+    """
+    """
+    permission_bits = {'0': '---',
+                       '1': '--x',
+                       '2': '-w-',
+                       '3': '-wx',
+                       '4': 'r--',
+                       '5': 'r-x',
+                       '6': 'rw-',
+                       '7': 'rwx'}
+    mode = str(oct(mode))
+    notation = '-'
+    if mode[2] == '4':
+        notation = 'd'
+    elif mode[2:4] == '12':
+        notation = 'l'
+    for i in mode[-3:]:
+        notation += permission_bits[i]
+    return notation
+
+
+def get_chunks(file_size):
+    chunk_start = 0
+    chunk_size = 0x20000  # 131072 bytes, default max ssl buffer size
+    while chunk_start + chunk_size < file_size:
+        yield chunk_start, chunk_size
+        chunk_start += chunk_size
+    final_chunk_size = file_size - chunk_start
+    yield chunk_start, final_chunk_size
+
+
+def to_bytes(param):
+    # don't convert as follows as it can lead unusable results like converting
+    # [1, 2, 3, 4] to '[1, 2, 3, 4]' -
+    # str(param).encode('utf-8')
+    if isinstance(param, bytes):
+        return param
+    elif isinstance(param, str):
+        return bytes(param, encoding='utf-8')
+    elif isinstance(param, list):
+        return [i.encode('utf-8') if isinstance(i, str) else to_bytes(i) for
+                i in param]
+    elif isinstance(param, int) or isinstance(param, float):
+        return str(param).encode('utf-8')
+    elif param is None:
+        return None
+
+
+def ls(path, opts=''):
+    # opts tries to be like /bin/ls opts
+    almost_all = 'A' in opts
+    try:
+        with cephfs.opendir(path) as d:
+            while True:
+                dent = cephfs.readdir(d)
+                if dent is None:
+                    return
+                elif almost_all and dent.d_name in (b'.', b'..'):
+                    continue
+                yield dent
+    except libcephfs.ObjectNotFound as e:
+        set_exit_code_msg(msg=e)
+
+
+def glob(path, pattern):
+    paths = []
+    parent_dir = os.path.dirname(path)
+    if parent_dir == b'':
+        parent_dir = b'/'
+    if path == b'/' or is_dir_exists(os.path.basename(path), parent_dir):
+        for i in ls(path, opts='A'):
+            if fnmatch.fnmatch(i.d_name, pattern):
+                paths.append(os.path.join(path, i.d_name))
+    return paths
+
+
+def locate_file(name, case_sensitive=True):
+    dir_list = sorted(set(dirwalk(cephfs.getcwd())))
+    if not case_sensitive:
+        return [dname for dname in dir_list if name.lower() in dname.lower()]
+    else:
+        return [dname for dname in dir_list if name in dname]
+
+
+def get_all_possible_paths(pattern):
+    complete_pattern = pattern[:]
+    paths = []
+    is_rel_path = not os.path.isabs(pattern)
+    if is_rel_path:
+        dir_ = cephfs.getcwd()
+    else:
+        dir_ = b'/'
+        pattern = pattern[1:]
+    patterns = pattern.split(b'/')
+    paths.extend(glob(dir_, patterns[0]))
+    patterns.pop(0)
+    for pattern in patterns:
+        for path in paths:
+            paths.extend(glob(path, pattern))
+    if is_rel_path:
+        complete_pattern = os.path.join(cephfs.getcwd(), complete_pattern)
+    return [path for path in paths if fnmatch.fnmatch(path, complete_pattern)]
+
+
+suffixes = ['B', 'K', 'M', 'G', 'T', 'P']
+
+
+def humansize(nbytes):
+    i = 0
+    while nbytes >= 1024 and i < len(suffixes) - 1:
+        nbytes /= 1024.
+        i += 1
+    nbytes = math.ceil(nbytes)
+    f = ('%d' % nbytes).rstrip('.')
+    return '%s%s' % (f, suffixes[i])
+
+
+def style_listing(path, is_dir, is_symlink, ls_long=False):
+    if not (is_dir or is_symlink):
+        return path
+    pretty = colorama.Style.BRIGHT
+    if is_symlink:
+        pretty += colorama.Fore.CYAN + path
+        if ls_long:
+            # Add target path
+            pretty += ' -> ' + cephfs.readlink(path, size=255).decode('utf-8')
+    elif is_dir:
+        pretty += colorama.Fore.BLUE + path + '/'
+    pretty += colorama.Style.RESET_ALL
+    return pretty
+
+
+def print_long(path, is_dir, is_symlink, human_readable):
+    info = cephfs.stat(path, follow_symlink=(not is_symlink))
+    pretty = style_listing(os.path.basename(path.decode('utf-8')), is_dir, is_symlink, True)
+    if human_readable:
+        sizefmt = '\t {:10s}'.format(humansize(info.st_size))
+    else:
+        sizefmt = '{:12d}'.format(info.st_size)
+    poutput(f'{mode_notation(info.st_mode)} {sizefmt} {info.st_uid} {info.st_gid} {info.st_mtime}'
+            f' {pretty}')
+
+
+def word_len(word):
+    """
+    Returns the word length, minus any color codes.
+    """
+    if word[0] == '\x1b':
+        return len(word) - 9
+    return len(word)
+
+
+def is_dir_exists(path, dir_=b''):
+    path_to_stat = os.path.join(dir_, path)
+    try:
+        return ((cephfs.stat(path_to_stat).st_mode & 0o0040000) != 0)
+    except libcephfs.Error:
+        return False
+
+
+def is_file_exists(path, dir_=b''):
+    try:
+        # if its not a directory, then its a file
+        return ((cephfs.stat(os.path.join(dir_, path)).st_mode & 0o0040000) == 0)
+    except libcephfs.Error:
+        return False
+
+
+def print_list(words, termwidth=79):
+    if not words:
+        return
+    words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in words]
+    width = max([word_len(word) for word in words]) + 2
+    nwords = len(words)
+    ncols = max(1, (termwidth + 1) // (width + 1))
+    nrows = (nwords + ncols - 1) // ncols
+    for row in range(nrows):
+        for i in range(row, nwords, nrows):
+            word = words[i]
+            print_width = width
+            if word[0] == '\x1b':
+                print_width = print_width + 10
+
+            poutput('%-*s' % (print_width, words[i]),
+                    end='\n' if i + nrows >= nwords else '')
+
+
+def copy_from_local(local_path, remote_path):
+    stdin = -1
+    file_ = None
+    fd = None
+    convert_to_bytes = False
+    if local_path == b'-':
+        file_ = sys.stdin
+        convert_to_bytes = True
+    else:
+        try:
+            file_ = open(local_path, 'rb')
+        except PermissionError as e:
+            set_exit_code_msg(e.errno, 'error: no permission to read local file {}'.format(
+                local_path.decode('utf-8')))
+            return
+        stdin = 1
+    try:
+        fd = cephfs.open(remote_path, 'w', 0o666)
+    except libcephfs.Error as e:
+        set_exit_code_msg(msg=e)
+        return
+    progress = 0
+    while True:
+        data = file_.read(65536)
+        if not data or len(data) == 0:
+            break
+        if convert_to_bytes:
+            data = to_bytes(data)
+        wrote = cephfs.write(fd, data, progress)
+        if wrote < 0:
+            break
+        progress += wrote
+    cephfs.close(fd)
+    if stdin > 0:
+        file_.close()
+    poutput('')
+
+
+def copy_to_local(remote_path, local_path):
+    fd = None
+    if local_path != b'-':
+        local_dir = os.path.dirname(local_path)
+        dir_list = remote_path.rsplit(b'/', 1)
+        if not os.path.exists(local_dir):
+            os.makedirs(local_dir)
+        if len(dir_list) > 2 and dir_list[1] == b'':
+            return
+        fd = open(local_path, 'wb+')
+    file_ = cephfs.open(remote_path, 'r')
+    file_size = cephfs.stat(remote_path).st_size
+    if file_size <= 0:
+        return
+    progress = 0
+    for chunk_start, chunk_size in get_chunks(file_size):
+        file_chunk = cephfs.read(file_, chunk_start, chunk_size)
+        progress += len(file_chunk)
+        if fd:
+            fd.write(file_chunk)
+        else:
+            poutput(file_chunk.decode('utf-8'))
+    cephfs.close(file_)
+    if fd:
+        fd.close()
+
+
+def dirwalk(path):
+    """
+    walk a directory tree, using a generator
+    """
+    path = os.path.normpath(path)
+    for item in ls(path, opts='A'):
+        fullpath = os.path.join(path, item.d_name)
+        src_path = fullpath.rsplit(b'/', 1)[0]
+
+        yield os.path.normpath(fullpath)
+        if is_dir_exists(item.d_name, src_path):
+            for x in dirwalk(fullpath):
+                yield x
+
+
+##################################################################
+#
+# The following methods implement the CephFS Shell commands
+#
+#################################################################
+
+class CephFSShell(Cmd):
+
+    def __init__(self):
+        """
+        Initialise the shell: record the current CephFS working directory,
+        build the prompt from it and set the default state.
+        """
+        super().__init__()
+        self.working_dir = cephfs.getcwd().decode('utf-8')
+        # set_prompt() reads self.working_dir, so it must come after it.
+        self.set_prompt()
+        self.interactive = False
+        self.umask = '2'
+
+    def default(self, line):
+        """
+        Set exit code 127 and report the error.
+        NOTE(review): presumably invoked by cmd2 for unknown commands --
+        confirm against the cmd2 documentation.
+        """
+        self.exit_code = 127
+        perror('Unrecognized command')
+
+    def set_prompt(self):
+        """
+        Rebuild the colored prompt so that it shows self.working_dir.
+        """
+        self.prompt = ('\033[01;33mCephFS:~' + colorama.Fore.LIGHTCYAN_EX
+                       + self.working_dir + colorama.Style.RESET_ALL
+                       + '\033[01;33m>>>\033[00m ')
+
+    def create_argparser(self, command):
+        try:
+            argparse_args = getattr(self, 'argparse_' + command)
+        except AttributeError:
+            set_exit_code_msg()
+            return None
+        doc_lines = getattr(
+            self, 'do_' + command).__doc__.expandtabs().splitlines()
+        if '' in doc_lines:
+            blank_idx = doc_lines.index('')
+            usage = doc_lines[:blank_idx]
+            description = doc_lines[blank_idx + 1:]
+        else:
+            usage = doc_lines
+            description = []
+        parser = argparse.ArgumentParser(
+            prog=command,
+            usage='\n'.join(usage),
+            description='\n'.join(description),
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter
+        )
+        for args, kwargs in argparse_args:
+            parser.add_argument(*args, **kwargs)
+        return parser
+
+    def complete_filenames(self, text, line, begidx, endidx):
+        """
+        Return completion candidates for the file name fragment `text`.
+
+        Directory candidates carry a trailing '/'. With an empty `text` all
+        entries of '.' are returned directly; otherwise the candidates are
+        passed through self.delimiter_complete() with '/' as delimiter.
+        """
+        if not text:
+            completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir())
+                           for x in ls(b".", opts='A')]
+        else:
+            if text.count('/') > 0:
+                # List the directory part of `text` (prefixed with '/') and
+                # keep the entries that start with the part after the last '/'.
+                completions = [text.rsplit('/', 1)[0] + '/'
+                               + x.d_name.decode('utf-8') + '/'
+                               * int(x.is_dir()) for x in ls('/'
+                               + text.rsplit('/', 1)[0], opts='A')
+                               if x.d_name.decode('utf-8').startswith(
+                                   text.rsplit('/', 1)[1])]
+            else:
+                completions = [x.d_name.decode('utf-8') + '/'
+                               * int(x.is_dir()) for x in ls(b".", opts='A')
+                               if x.d_name.decode('utf-8').startswith(text)]
+            # A single directory match: also offer the entries inside it.
+            if len(completions) == 1 and completions[0][-1] == '/':
+                dir_, file_ = completions[0].rsplit('/', 1)
+                completions.extend([dir_ + '/' + x.d_name.decode('utf-8')
+                                    + '/' * int(x.is_dir()) for x in
+                                    ls('/' + dir_, opts='A')
+                                    if x.d_name.decode('utf-8').startswith(file_)])
+            return self.delimiter_complete(text, line, begidx, endidx, completions, '/')
+        return completions
+
+    def onecmd(self, line, **kwargs):
+        """
+        Global error catcher
+
+        Runs the command through Cmd.onecmd() and returns its result. Errors
+        raised by the command are turned into an exit code (and usually a
+        message) instead of propagating; SystemExit is re-raised.
+        """
+        try:
+            res = Cmd.onecmd(self, line, **kwargs)
+            if self.interactive:
+                self.set_prompt()
+            return res
+        except ConnectionError as e:
+            set_exit_code_msg(e.errno, f'***\n{e}')
+        except KeyboardInterrupt:
+            set_exit_code_msg('KeyboardInterrupt', 'Command aborted')
+        except (libcephfs.Error, Exception) as e:
+            # With debugging enabled, show where the error came from.
+            if shell.debug:
+                traceback.print_exc(file=sys.stdout)
+            if isinstance(e, Cmd2ArgparseError):
+                # NOTE: In case of Cmd2ArgparseError the error message is
+                # already printed beforehand (plus Cmd2ArgparseError
+                # instances have empty error message), so let's just set the
+                # exit code.
+                set_exit_code_msg(msg=None)
+            else:
+                set_exit_code_msg(msg=f'{type(e).__name__}: {e}')
+        # In cmd2 versions < 1.1.0 we'll get SystemExit(2) instead of
+        # Cmd2ArgparseError
+        except SystemExit:
+            raise
+
+    class path_to_bytes(argparse.Action):
+        """
+        argparse action that stores the argument value converted with
+        to_bytes().
+        """
+        def __call__(self, parser, namespace, values, option_string=None):
+            values = to_bytes(values)
+            setattr(namespace, self.dest, values)
+
+    # TODO: move the necessary contents from here to `class path_to_bytes`.
+    class get_list_of_bytes_path(argparse.Action):
+        """
+        argparse action that converts the value(s) with ``to_bytes`` and
+        substitutes every ``b'.'`` with the current CephFS working directory
+        (``cephfs.getcwd()``) before storing the result.
+        """
+        def __call__(self, parser, namespace, values, option_string=None):
+            converted = to_bytes(values)
+
+            if converted == b'.':
+                converted = cephfs.getcwd()
+            else:
+                for pos, entry in enumerate(converted):
+                    if entry == b'.':
+                        converted[pos] = cephfs.getcwd()
+
+            setattr(namespace, self.dest, converted)
+
+    def complete_mkdir(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``mkdir`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    class ModeAction(argparse.Action):
+        """
+        argparse action that parses a permission mode.
+
+        Two spellings are accepted:
+
+        * a numeric octal literal, e.g. ``755``
+        * a symbolic assignment ``((u?g?o?)|(a?))(=)(r?w?x?)``, e.g.
+          ``ug=rw`` (only ``=`` is supported, not ``+`` or ``-``)
+
+        The result is stored on the namespace as the string returned by
+        ``oct()`` (e.g. ``'0o755'``). When the argument was declared with
+        ``nargs='?'`` and no value was supplied, the default is stored
+        untouched so callers can detect "no mode given".
+        """
+        def __init__(self, option_strings, dest, nargs=None, **kwargs):
+            if nargs is not None and nargs != '?':
+                raise ValueError("more than one modes not allowed")
+            # nargs has to be forwarded explicitly: it is captured by the
+            # named parameter above and would otherwise be lost, turning an
+            # optional (nargs='?') argument into a mandatory one.
+            super().__init__(option_strings, dest, nargs=nargs, **kwargs)
+
+        @staticmethod
+        def _symbolic_to_mode(parser, values):
+            """Translate a symbolic mode such as ``go=rx`` into an int."""
+            # fullmatch() rejects trailing garbage such as 'u=rwxjunk',
+            # which a plain match() would silently accept.
+            res = re.fullmatch('((u?g?o?)|(a?))(=)(r?w?x?)', values)
+            if res is None:
+                parser.error(f"invalid mode: {values}\n"
+                             "mode must be a numeric octal literal\n"
+                             "or   ((u?g?o?)|(a?))(=)(r?w?x?)")
+
+            # eg.
+            # >>> res = re.fullmatch('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=')
+            # >>> res.groups()
+            # ('go', 'go', None, '=', '')
+            perms = res.group(5)
+            if perms == '':
+                parser.error(f"invalid mode: {values}\n"
+                             "mode must be combination of: r | w | x")
+
+            users = res.group(3) if res.group(3) is not None else res.group(2)
+            if users == 'a':
+                users = 'ugo'
+
+            t_mode = 0
+            if 'r' in perms:
+                t_mode |= 4
+            if 'w' in perms:
+                t_mode |= 2
+            if 'x' in perms:
+                t_mode |= 1
+
+            o_mode = 0
+            if 'u' in users:
+                o_mode |= (t_mode << 6)
+            if 'g' in users:
+                o_mode |= (t_mode << 3)
+            if 'o' in users:
+                o_mode |= t_mode
+            return o_mode
+
+        def __call__(self, parser, namespace, values, option_string=None):
+            # With nargs='?' argparse invokes the action with the default
+            # (or const) when nothing was typed; keep that value as-is.
+            if self.nargs == '?' and (values is None
+                                      or values == self.default):
+                setattr(namespace, self.dest, values)
+                return
+
+            try:
+                o_mode = int(values, base=8)
+            except ValueError:
+                o_mode = self._symbolic_to_mode(parser, values)
+
+            if o_mode < 0:
+                parser.error(f"invalid mode: {values}\n"
+                             "mode cannot be negative")
+            if o_mode > 0o7777:
+                parser.error(f"invalid mode: {values}\n"
+                             "mode cannot be greater than octal 07777")
+
+            setattr(namespace, self.dest, str(oct(o_mode)))
+
+    # Command-line definition for ``mkdir``: one or more directory names,
+    # an optional mode (parsed by ModeAction) and -p/--parent.
+    mkdir_parser = argparse.ArgumentParser(
+        description='Create the directory(ies), if they do not already exist.')
+    mkdir_parser.add_argument('dirs', type=str,
+                              action=path_to_bytes,
+                              metavar='DIR_NAME',
+                              help='Name of new_directory.',
+                              nargs='+')
+    mkdir_parser.add_argument('-m', '--mode', type=str,
+                              action=ModeAction,
+                              help='Sets the access mode for the new directory.')
+    # The adjacent literals are concatenated, so the separating space has
+    # to be part of the string ("... no error is reported ...").
+    mkdir_parser.add_argument('-p', '--parent', action='store_true',
+                              help='Create parent directories as necessary. '
+                                   'When this option is specified, no error is '
+                                   'reported if a directory already exists.')
+
+    @with_argparser(mkdir_parser)
+    def do_mkdir(self, args):
+        """
+        Create directory.
+
+        Every entry of ``args.dirs`` is created with ``cephfs.mkdirs`` when
+        ``-p/--parent`` was given and with ``cephfs.mkdir`` otherwise. The
+        permission is ``args.mode`` interpreted as octal, or 0o777 when no
+        mode was supplied. A ``libcephfs.Error`` for one path is reported
+        through ``set_exit_code_msg`` and the remaining paths are still
+        processed.
+        """
+        # The mode does not depend on the path, so compute it once.
+        permission = int(args.mode, 8) if args.mode else 0o777
+        create = cephfs.mkdirs if args.parent else cephfs.mkdir
+        for path in args.dirs:
+            try:
+                create(path, permission)
+            except libcephfs.Error as e:
+                # pass the exception as ``msg`` like the other commands do
+                set_exit_code_msg(msg=e)
+
+    def complete_put(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``put`` command.
+
+        Argument index 1 is completed with ``self.path_complete``
+        (presumably local filesystem paths -- confirm against cmd2); the
+        dispatch is done by ``self.index_based_complete``.
+        """
+        index_dict = {1: self.path_complete}
+        return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+    # Command-line definition for ``put``: local source, remote destination
+    # and an overwrite switch.
+    put_parser = argparse.ArgumentParser(
+        description='Copy a file/directory to Ceph File System from Local File System.')
+    put_parser.add_argument(
+        'local_path',
+        action=path_to_bytes,
+        type=str,
+        help='Path of the file in the local system')
+    put_parser.add_argument(
+        'remote_path',
+        action=path_to_bytes,
+        type=str,
+        help='Path of the file in the remote system')
+    put_parser.add_argument(
+        '-f', '--force',
+        action='store_true',
+        help='Overwrites the destination if it already exists.')
+
+    @with_argparser(put_parser)
+    def do_put(self, args):
+        """
+        Copy a local file/directory to CephFS.
+
+        ``args.local_path`` may be ``-``; it is then handed to
+        ``copy_from_local`` as ``b'-'`` (presumably meaning stdin -- see
+        ``copy_from_local``). An already existing remote path is refused
+        unless ``--force`` is given. Directories are copied by walking the
+        local tree with ``os.walk`` and recreating it below the
+        destination.
+        """
+        # The local source must exist unless it is the special name '-'.
+        if args.local_path != b'-' and not os.path.isfile(args.local_path) \
+                and not os.path.isdir(args.local_path):
+            set_exit_code_msg(errno.ENOENT,
+                              msg=f"error: "
+                                  f"{args.local_path.decode('utf-8')}: "
+                                  f"No such file or directory")
+            return
+
+        if (is_file_exists(args.remote_path) or is_dir_exists(
+                args.remote_path)) and not args.force:
+            set_exit_code_msg(msg=f"error: file/directory "
+                                  f"{args.remote_path.decode('utf-8')} "
+                                  f"exists, use --force to overwrite")
+            return
+
+        # Turn '.', a bare name or a './'-prefixed path into a path below
+        # the local working directory; other paths are used as given.
+        root_src_dir = args.local_path
+        root_dst_dir = args.remote_path
+        if args.local_path == b'.' or args.local_path == b'./':
+            root_src_dir = os.getcwdb()
+        elif len(args.local_path.rsplit(b'/', 1)) < 2:
+            root_src_dir = os.path.join(os.getcwdb(), args.local_path)
+        else:
+            p = args.local_path.split(b'/')
+            if p[0] == b'.':
+                root_src_dir = os.getcwdb()
+                p.pop(0)
+                while len(p) > 0:
+                    root_src_dir += b'/' + p.pop(0)
+
+        # A destination of '.' means "use the last component of the source
+        # path"; that is impossible when the source is '-'.
+        if root_dst_dir == b'.':
+            if args.local_path != b'-':
+                root_dst_dir = root_src_dir.rsplit(b'/', 1)[1]
+                if root_dst_dir == b'':
+                    root_dst_dir = root_src_dir.rsplit(b'/', 1)[0]
+                    a = root_dst_dir.rsplit(b'/', 1)
+                    if len(a) > 1:
+                        root_dst_dir = a[1]
+                    else:
+                        root_dst_dir = a[0]
+            else:
+                set_exit_code_msg(errno.EINVAL, 'error: no filename specified '
+                                  'for destination')
+                return
+
+        # Indexing bytes yields an int, so ``root_dst_dir[-1] != b'/'`` was
+        # always true and raised IndexError for an empty destination;
+        # endswith() performs the intended check.
+        if not root_dst_dir.endswith(b'/'):
+            root_dst_dir += b'/'
+
+        if args.local_path == b'-' or os.path.isfile(root_src_dir):
+            if args.local_path == b'-':
+                root_src_dir = b'-'
+            copy_from_local(root_src_dir, root_dst_dir)
+        else:
+            for src_dir, dirs, files in os.walk(root_src_dir):
+                if isinstance(src_dir, str):
+                    src_dir = to_bytes(src_dir)
+                # Map the local directory onto the destination tree and
+                # collapse runs of '/' in the resulting remote path.
+                dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1)
+                dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd()
+                                 + dst_dir)
+                if args.force and dst_dir != b'/' and not is_dir_exists(
+                        dst_dir[:-1]) and not locate_file(dst_dir):
+                    try:
+                        cephfs.mkdirs(dst_dir, 0o777)
+                    except libcephfs.Error:
+                        pass
+                if (not args.force) and dst_dir != b'/' and not is_dir_exists(
+                        dst_dir) and not os.path.isfile(root_src_dir):
+                    try:
+                        cephfs.mkdirs(dst_dir, 0o777)
+                    except libcephfs.Error:
+                        # TODO: perhaps, set retval to 1?
+                        pass
+
+                for dir_ in dirs:
+                    dir_name = os.path.join(dst_dir, dir_)
+                    if not is_dir_exists(dir_name):
+                        try:
+                            cephfs.mkdirs(dir_name, 0o777)
+                        except libcephfs.Error:
+                            # TODO: perhaps, set retval to 1?
+                            pass
+
+                for file_ in files:
+                    src_file = os.path.join(src_dir, file_)
+                    dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_)
+                    # Without --force the whole copy stops silently at the
+                    # first file that already exists remotely.
+                    if (not args.force) and is_file_exists(dst_file):
+                        return
+                    copy_from_local(src_file, os.path.join(cephfs.getcwd(),
+                                    dst_file))
+
+    def complete_get(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``get`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``get``: remote source, local destination
+    # and an overwrite switch.
+    get_parser = argparse.ArgumentParser(
+        description='Copy a file from Ceph File System to Local Directory.')
+    get_parser.add_argument(
+        'remote_path',
+        action=path_to_bytes,
+        type=str,
+        help='Path of the file in the remote system')
+    get_parser.add_argument(
+        'local_path',
+        action=path_to_bytes,
+        type=str,
+        help='Path of the file in the local system')
+    get_parser.add_argument(
+        '-f', '--force',
+        action='store_true',
+        help='Overwrites the destination if it already exists.')
+
+    @with_argparser(get_parser)
+    def do_get(self, args):
+        """
+        Copy a file/directory from CephFS to given path.
+
+        Reports ENOENT when ``args.remote_path`` is neither an existing
+        file nor an existing directory, and refuses an already existing
+        local path unless ``--force`` is given. A local path of ``-`` is
+        passed to ``copy_to_local`` as the destination (presumably meaning
+        stdout -- see ``copy_to_local``).
+
+        Returns 0 once the copy branch has run; the early error exits
+        return None.
+        """
+        if not is_file_exists(args.remote_path) and not \
+                is_dir_exists(args.remote_path):
+            set_exit_code_msg(errno.ENOENT, "error: no file/directory"
+                                            " found at specified remote "
+                                            "path")
+            return
+        if (os.path.isfile(args.local_path) or os.path.isdir(
+                args.local_path)) and not args.force:
+            set_exit_code_msg(msg=f"error: file/directory "
+                                  f"{args.local_path.decode('utf-8')}"
+                                  f" already exists, use --force to "
+                                  f"overwrite")
+            return
+        root_src_dir = args.remote_path
+        root_dst_dir = args.local_path
+        # [directory, name] of the remote path; it only has two elements
+        # when the path contains a '/', which is checked before fname[1]
+        # is used below.
+        fname = root_src_dir.rsplit(b'/', 1)
+        # '.' stands for the respective (local / CephFS) working directory.
+        if args.local_path == b'.':
+            root_dst_dir = os.getcwdb()
+        if args.remote_path == b'.':
+            root_src_dir = cephfs.getcwd()
+        if args.local_path == b'-':
+            if args.remote_path == b'.' or args.remote_path == b'./':
+                set_exit_code_msg(errno.EINVAL, 'error: no remote file name specified')
+                return
+            copy_to_local(root_src_dir, b'-')
+        elif is_file_exists(args.remote_path):
+            copy_to_local(root_src_dir, root_dst_dir)
+        elif b'/' in root_src_dir and is_file_exists(fname[1], fname[0]):
+            copy_to_local(root_src_dir, root_dst_dir)
+        else:
+            # Not a single file: walk the remote tree in reverse sorted
+            # order.
+            files = list(reversed(sorted(dirwalk(root_src_dir))))
+            for file_ in files:
+                dst_dirpath, dst_file = file_.rsplit(b'/', 1)
+                # The parent of the current entry is dropped from the
+                # remaining work list (the list is modified while it is
+                # being iterated).
+                if dst_dirpath in files:
+                    files.remove(dst_dirpath)
+                dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file)
+                dst_path = os.path.normpath(dst_path)
+                if is_dir_exists(file_):
+                    # best effort: an OSError from makedirs is ignored
+                    try:
+                        os.makedirs(dst_path)
+                    except OSError:
+                        pass
+                else:
+                    copy_to_local(file_, dst_path)
+
+        return 0
+
+    def complete_ln(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``ln`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``ln``: target, optional link name and the
+    # -s/-v/-f switches.
+    ln_parser = argparse.ArgumentParser(
+        description='Add a hard link to an existing file or create a symbolic '
+                    'link to an existing file or directory.')
+    ln_parser.add_argument(
+        'target',
+        action=path_to_bytes,
+        type=str,
+        help='File/Directory of which link is '
+             'to be created')
+    ln_parser.add_argument(
+        'link_name',
+        action=path_to_bytes,
+        type=str,
+        nargs='?',
+        help='Link to target with the name link_name')
+    ln_parser.add_argument(
+        '-s', '--symbolic',
+        action='store_true',
+        help='Create symbolic link')
+    ln_parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='Print name of each linked file')
+    ln_parser.add_argument(
+        '-f', '--force',
+        action='store_true',
+        help='Force create link/symbolic link')
+
+    @with_argparser(ln_parser)
+    def do_ln(self, args):
+        """
+        Create a hard link (default) or, with ``-s``, a symbolic link to
+        ``args.target``.
+
+        * The target must be an existing file or directory (ENOENT
+          otherwise); hard links to directories are rejected with EPERM.
+        * Without ``link_name`` the link is created in the CephFS working
+          directory under the target's base name.
+        * When ``link_name`` is an existing directory the link is created
+          inside it under the target's base name.
+        * With ``--force`` an existing link name is unlinked first (not for
+          a directory target unless ``-s`` is given).
+        * ``--verbose`` prints ``link_name -> target`` after success.
+        """
+        if not is_file_exists(args.target) \
+                and not is_dir_exists(args.target):
+            set_exit_code_msg(errno.ENOENT,
+                              msg=f"ln: failed to access "
+                                  f"'{args.target.decode('utf-8')}"
+                                  f"': No such file or directory")
+            return
+
+        is_a_dir = is_dir_exists(args.target)
+
+        # Test the raw bytes: indexing the decoded string with the byte
+        # length raises IndexError for names with multi-byte characters.
+        target_last_char_slash = args.target.endswith(b'/')
+
+        link_name = ''
+
+        if args.link_name is None:
+            if target_last_char_slash and not is_dir_exists(args.target):
+                set_exit_code_msg(errno.ENOTDIR,
+                                  f"ln: failed to access "
+                                  f"'{args.target.decode('utf-8')}': "
+                                  f"Not a directory")
+                return
+            link_name = os.path.join(cephfs.getcwd(),
+                                     os.path.basename(
+                                         os.path.normpath(args.target)))
+            if (is_file_exists(link_name) or is_dir_exists(
+                    link_name)) and not args.force:
+                # the link name is already taken, hence EEXIST (not ENOENT)
+                set_exit_code_msg(errno.EEXIST,
+                                  msg=f"ln: failed to create link "
+                                      f"{link_name.decode('utf-8')}: "
+                                      f"exists")
+                return
+        elif is_dir_exists(args.link_name):
+            # link into the existing directory, named after the target
+            if is_file_exists(args.target) and target_last_char_slash:
+                set_exit_code_msg(errno.ENOTDIR,
+                                  "ln: failed to access "
+                                  f"'{args.target.decode('utf-8')}': "
+                                  "Not a directory")
+                return
+            dest_first_half = args.link_name.rstrip(b'/') + b'/'
+            link_name = dest_first_half + os.path.basename(
+                os.path.normpath(args.target))
+        else:
+            # if the destination is not a file or a dir then:
+            # accept it as file so the end part of path cannot have
+            #    a `/` succeeding it.
+            if args.link_name.endswith(b'/'):
+                set_exit_code_msg(errno.ENOENT,
+                                  f"'{args.link_name.decode('utf-8')}': "
+                                  f"No such file or "
+                                  f"directory")
+                return
+            link_name = args.link_name
+
+        if args.force:
+            try:
+                # lstat raises ObjectNotFound when there is nothing to
+                # replace
+                cephfs.lstat(os.path.join(b'', link_name))
+                if not is_a_dir or args.symbolic:
+                    cephfs.unlink(link_name)
+            except libcephfs.ObjectNotFound:
+                pass
+
+        try:
+            if args.symbolic:
+                cephfs.symlink(args.target, link_name)
+            else:
+                if is_a_dir:
+                    set_exit_code_msg(errno.EPERM,
+                                      f"ln: {args.target.decode('utf-8')}: "
+                                      "hard link not allowed for directory")
+                    return
+                cephfs.link(args.target, link_name)
+        except libcephfs.Error as e:
+            set_exit_code_msg(msg=str(e))
+            return
+
+        if args.verbose:
+            poutput(f"{link_name.decode('utf-8')} ->"
+                    f" {args.target.decode('utf-8')}")
+
+    def complete_ls(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``ls`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``ls``. The description used to be a
+    # copy of the ``get`` text; it now describes what ``ls`` does.
+    ls_parser = argparse.ArgumentParser(
+        description='List the contents of directories in Ceph File System.')
+    ls_parser.add_argument('-l', '--long', action='store_true',
+                           help='Detailed list of items in the directory.')
+    ls_parser.add_argument('-r', '--reverse', action='store_true',
+                           help='Reverse order of listing items in the directory.')
+    ls_parser.add_argument('-H', action='store_true', help='Human Readable')
+    ls_parser.add_argument('-a', '--all', action='store_true',
+                           help='Do not Ignore entries starting with .')
+    ls_parser.add_argument('-S', action='store_true', help='Sort by file_size')
+    ls_parser.add_argument('paths', help='Name of Directories',
+                           action=path_to_bytes, nargs='*', default=['.'])
+
+    @with_argparser(ls_parser)
+    def do_ls(self, args):
+        """
+        List all the files and directories in the current working directory
+
+        Every entry of ``args.paths`` is listed in turn. A path containing
+        ``*`` is expanded with ``get_all_possible_paths``; matching
+        directories are appended to the list being iterated so they are
+        listed later in the same loop. Entries are sorted by name,
+        optionally filtered (no ``-a``: names starting with ``.`` are
+        dropped), sorted by size (``-S``) and reversed (``-r``). With
+        ``-l`` each entry is printed by ``print_long``, otherwise the names
+        are collected and printed by ``print_list``. ``libcephfs.Error`` is
+        reported through ``set_exit_code_msg``.
+        """
+        paths = args.paths
+        for path in paths:
+            values = []
+            items = []
+            try:
+                if path.count(b'*') > 0:
+                    all_items = get_all_possible_paths(path)
+                    if len(all_items) == 0:
+                        continue
+                    # directory part of the first match; used as the
+                    # directory to list for all matches
+                    path = all_items[0].rsplit(b'/', 1)[0]
+                    if path == b'':
+                        path = b'/'
+                    dirs = []
+                    for i in all_items:
+                        for item in ls(path):
+                            d_name = item.d_name
+                            if os.path.basename(i) == d_name:
+                                if item.is_dir():
+                                    dirs.append(os.path.join(path, d_name))
+                                else:
+                                    items.append(item)
+                    if dirs:
+                        # extends the list this loop iterates over
+                        paths.extend(dirs)
+                    else:
+                        poutput(path.decode('utf-8'), end=':\n')
+                    items = sorted(items, key=lambda item: item.d_name)
+                else:
+                    # print a "<path>:" header only when several paths are
+                    # listed
+                    if path != b'' and path != cephfs.getcwd() and len(paths) > 1:
+                        poutput(path.decode('utf-8'), end=':\n')
+                    items = sorted(ls(path), key=lambda item: item.d_name)
+                if not args.all:
+                    items = [i for i in items if not i.d_name.startswith(b'.')]
+                if args.S:
+                    items = sorted(items, key=lambda item: cephfs.stat(
+                        path + b'/' + item.d_name, follow_symlink=(
+                            not item.is_symbol_file())).st_size)
+                if args.reverse:
+                    items = reversed(items)
+                for item in items:
+                    filepath = item.d_name
+                    is_dir = item.is_dir()
+                    is_sym_lnk = item.is_symbol_file()
+                    try:
+                        if args.long and args.H:
+                            print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir,
+                                       is_sym_lnk, True)
+                        elif args.long:
+                            print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir,
+                                       is_sym_lnk, False)
+                        elif is_sym_lnk or is_dir:
+                            # styled names are str, plain names stay bytes
+                            values.append(style_listing(filepath.decode('utf-8'), is_dir,
+                                          is_sym_lnk))
+                        else:
+                            values.append(filepath)
+                    except libcephfs.Error as e:
+                        set_exit_code_msg(msg=e)
+                if not args.long:
+                    print_list(values, shutil.get_terminal_size().columns)
+                    # blank line between listings, none after the last path
+                    if path != paths[-1]:
+                        poutput('')
+            except libcephfs.Error as e:
+                set_exit_code_msg(msg=e)
+
+    def complete_rmdir(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``rmdir`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``rmdir``: one or more paths plus
+    # -p/--parent.
+    rmdir_parser = argparse.ArgumentParser(
+        description='Remove the directory(ies), if they are empty.')
+    rmdir_parser.add_argument(
+        'paths',
+        action=path_to_bytes,
+        nargs='+',
+        help='Directory Path(s)')
+    rmdir_parser.add_argument(
+        '-p', '--parent',
+        action='store_true',
+        help="remove directory and its ancestors; "
+             "e.g., 'rmdir -p a/b/c' is similar to "
+             "'rmdir a/b/c a/b a'")
+
+    @with_argparser(rmdir_parser)
+    def do_rmdir(self, args):
+        # Command entry point for ``rmdir``; the work is done by
+        # do_rmdir_helper(), which receives the parsed arguments unchanged.
+        self.do_rmdir_helper(args)
+
+    def do_rmdir_helper(self, args):
+        """
+        Remove a specific Directory
+
+        Iterates over ``args.paths``. A path containing ``*`` is expanded
+        with ``get_all_possible_paths`` and the matching directories are
+        appended to the list being iterated. With ``args.parent`` the tree
+        returned by ``dirwalk`` below the first path component is removed
+        first. Failures of the final ``cephfs.rmdir`` are reported through
+        ``set_exit_code_msg``.
+        """
+        is_pattern = False
+        paths = args.paths
+        for path in paths:
+            if path.count(b'*') > 0:
+                is_pattern = True
+                all_items = get_all_possible_paths(path)
+                if len(all_items) > 0:
+                    path = all_items[0].rsplit(b'/', 1)[0]
+                if path == b'':
+                    path = b'/'
+                dirs = []
+                for i in all_items:
+                    for item in ls(path):
+                        d_name = item.d_name
+                        if os.path.basename(i) == d_name:
+                            if item.is_dir():
+                                dirs.append(os.path.join(path, d_name))
+                # the matches are handled by later iterations of this loop
+                paths.extend(dirs)
+                continue
+            else:
+                is_pattern = False
+
+            if args.parent:
+                # rsplit() without maxsplit splits at every '/', so [0] is
+                # the first component of the given path
+                path = os.path.join(cephfs.getcwd(), path.rsplit(b'/')[0])
+                files = list(sorted(set(dirwalk(path)), reverse=True))
+                # b'.' serves as "skip the final rmdir" marker: it equals
+                # os.path.normpath(b''), which is what is tested below
+                if not files:
+                    path = b'.'
+                for filepath in files:
+                    try:
+                        cephfs.rmdir(os.path.normpath(filepath))
+                    except libcephfs.Error as e:
+                        perror(e)
+                        path = b'.'
+                        break
+            else:
+                path = os.path.normpath(os.path.join(cephfs.getcwd(), path))
+            if not is_pattern and path != os.path.normpath(b''):
+                try:
+                    cephfs.rmdir(path)
+                except libcephfs.Error as e:
+                    # 2, 20 and 39 are the Linux errno values of ENOENT,
+                    # ENOTDIR and ENOTEMPTY, matching the messages below
+                    if e.get_error_code() == 2:
+                        set_exit_code_msg(e.get_error_code(),
+                                          "rmdir: failed to remove "
+                                          f"{path.decode('utf-8')}: "
+                                          "No such file or directory")
+                    elif e.get_error_code() == 20:
+                        set_exit_code_msg(e.get_error_code(),
+                                          "rmdir: failed to remove "
+                                          f"{path.decode('utf-8')}: "
+                                          "Not a directory")
+                    elif e.get_error_code() == 39:
+                        set_exit_code_msg(e.get_error_code(),
+                                          "rmdir: failed to remove "
+                                          f"{path.decode('utf-8')}: "
+                                          "Directory not empty")
+                    else:
+                        set_exit_code_msg(msg=e)
+
+    def complete_rm(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``rm`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``rm``: one or more file paths.
+    rm_parser = argparse.ArgumentParser(description='Remove File.')
+    rm_parser.add_argument(
+        'paths',
+        action=path_to_bytes,
+        nargs='+',
+        help='File Path.')
+
+    @with_argparser(rm_parser)
+    def do_rm(self, args):
+        """
+        Remove the given file(s).
+
+        A path containing ``*`` is expanded with ``get_all_possible_paths``;
+        the matches that are existing files are queued behind the paths
+        still to be processed. All other paths are unlinked, and failures
+        are reported through ``set_exit_code_msg``.
+        """
+        # error codes that get a dedicated message
+        # NOTE: perhaps we need a better msg here
+        reasons = {2: "No such file or directory",
+                   21: "Is a directory"}
+
+        pending = args.paths
+        for path in pending:
+            if b'*' in path:
+                matches = get_all_possible_paths(path)
+                pending.extend([m for m in matches if is_file_exists(m)])
+                continue
+
+            try:
+                cephfs.unlink(path)
+            except libcephfs.Error as e:
+                code = e.get_error_code()
+                if code in reasons:
+                    set_exit_code_msg(
+                        code,
+                        f"rm: failed to remove {path.decode('utf-8')}: "
+                        f"{reasons[code]}")
+                else:
+                    set_exit_code_msg(msg=e)
+
+    def complete_mv(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``mv`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``mv``: source and destination path.
+    mv_parser = argparse.ArgumentParser(description='Move File.')
+    mv_parser.add_argument(
+        'src_path',
+        action=path_to_bytes,
+        type=str,
+        help='Source File Path.')
+    mv_parser.add_argument(
+        'dest_path',
+        action=path_to_bytes,
+        type=str,
+        help='Destination File Path.')
+
+    @with_argparser(mv_parser)
+    def do_mv(self, args):
+        """
+        Rename a file or Move a file from source path to the destination
+
+        Calls ``cephfs.rename(args.src_path, args.dest_path)``. No error is
+        handled here, so any exception propagates to the caller.
+        """
+        cephfs.rename(args.src_path, args.dest_path)
+
+    def complete_cd(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``cd`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``cd``: an optional directory, '/' when
+    # omitted.
+    cd_parser = argparse.ArgumentParser(description='Change working directory')
+    cd_parser.add_argument(
+        'path',
+        action=path_to_bytes,
+        type=str,
+        nargs='?',
+        default='/',
+        help='Name of the directory.')
+
+    @with_argparser(cd_parser)
+    def do_cd(self, args):
+        """
+        Change working directory
+
+        Calls ``cephfs.chdir(args.path)``, stores the decoded result of
+        ``cephfs.getcwd()`` in ``self.working_dir`` and refreshes the
+        prompt via ``self.set_prompt()``. Errors are not handled here.
+        """
+        cephfs.chdir(args.path)
+        self.working_dir = cephfs.getcwd().decode('utf-8')
+        self.set_prompt()
+
+    def do_cwd(self, arglist):
+        """
+        Get current working directory.
+
+        Prints ``cephfs.getcwd()`` decoded as UTF-8; ``arglist`` is not
+        used.
+        """
+        poutput(cephfs.getcwd().decode('utf-8'))
+
+    def complete_chmod(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``chmod`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``chmod``: a mode (parsed by ModeAction)
+    # followed by one or more paths.
+    chmod_parser = argparse.ArgumentParser(description='Change permission of a file/directory.')
+    chmod_parser.add_argument(
+        'mode',
+        action=ModeAction,
+        type=str,
+        help='Mode')
+    chmod_parser.add_argument(
+        'paths',
+        action=path_to_bytes,
+        type=str,
+        nargs='+',
+        help='Path of the file/directory')
+
+    @with_argparser(chmod_parser)
+    def do_chmod(self, args):
+        """
+        Apply ``args.mode`` (an octal string) to every path in
+        ``args.paths`` with ``cephfs.chmod``. A ``libcephfs.Error`` for one
+        path is reported through ``set_exit_code_msg`` and the remaining
+        paths are still processed.
+        """
+        numeric_mode = int(args.mode, base=8)
+        for target in args.paths:
+            try:
+                cephfs.chmod(target, numeric_mode)
+            except libcephfs.Error as err:
+                set_exit_code_msg(msg=err)
+
+    def complete_cat(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``cat`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``cat``. The description was empty, which
+    # left the command's help output without any explanation.
+    cat_parser = argparse.ArgumentParser(
+        description='Print the contents of the given file(s).')
+    cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes,
+                            nargs='+')
+
+    @with_argparser(cat_parser)
+    def do_cat(self, args):
+        """
+        Print the contents of each file in ``args.paths``.
+
+        Existing files are handed to ``copy_to_local`` with ``b'-'`` as the
+        destination; for anything else ENOENT is reported through
+        ``set_exit_code_msg`` and processing continues with the next path.
+        """
+        for path in args.paths:
+            if not is_file_exists(path):
+                missing = '{}: no such file'.format(path.decode('utf-8'))
+                set_exit_code_msg(errno.ENOENT, missing)
+                continue
+            copy_to_local(path, b'-')
+
+    # Command-line definition for ``umask``: an optional mode, '' when
+    # omitted.
+    umask_parser = argparse.ArgumentParser(description='Set umask value.')
+    umask_parser.add_argument(
+        'mode',
+        action=ModeAction,
+        type=str,
+        nargs='?',
+        default='',
+        help='Mode')
+
+    @with_argparser(umask_parser)
+    def do_umask(self, args):
+        """
+        Set Umask value.
+
+        With an empty ``args.mode`` the stored ``self.umask`` is printed,
+        zero-filled to four characters. Otherwise ``args.mode`` is read as
+        octal, passed to ``cephfs.umask`` and the octal digits of its
+        return value are stored in ``self.umask``.
+        """
+        if args.mode == '':
+            poutput(self.umask.zfill(4))
+        else:
+            mode = int(args.mode, 8)
+            # oct() yields e.g. '0o22'; [2:] strips the '0o' prefix.
+            # NOTE(review): if cephfs.umask() returns the previous mask (as
+            # os.umask() does) this stores the old value -- confirm.
+            self.umask = str(oct(cephfs.umask(mode))[2:])
+
+    def complete_write(self, text, line, begidx, endidx):
+        """
+        Completion hook for the ``write`` command.
+
+        Passes all four arguments unchanged to ``self.complete_filenames``
+        and returns its result.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    # Command-line definition for ``write``: the file to write to.
+    write_parser = argparse.ArgumentParser(description='Writes data into a file')
+    write_parser.add_argument(
+        'path',
+        action=path_to_bytes,
+        type=str,
+        help='Name of File')
+
+    @with_argparser(write_parser)
+    def do_write(self, args):
+        """
+        Write data into a file.
+
+        Calls ``copy_from_local`` with ``b'-'`` as the source (presumably
+        meaning stdin -- see ``copy_from_local``) and ``args.path`` as the
+        destination.
+        """
+
+        copy_from_local(b'-', args.path)
+
+    def complete_lcd(self, text, line, begidx, endidx):
+        """
+        Completion for 'lcd': index 1 is mapped to self.path_complete.
+        """
+        index_dict = {1: self.path_complete}
+        return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+    lcd_parser = argparse.ArgumentParser(description='')
+    lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path')
+
+    @with_argparser(lcd_parser)
+    def do_lcd(self, args):
+        """
+        Change the local working directory to the given path ('~' is expanded)
+        """
+        try:
+            os.chdir(os.path.expanduser(args.path))
+        except OSError as err:
+            where = err.filename.decode('utf-8')
+            set_exit_code_msg(err.errno, f"Cannot change to {where}: {err.strerror}")
+
+    def complete_lls(self, text, line, begidx, endidx):
+        """
+        Completion for 'lls': index 1 is mapped to self.path_complete.
+        """
+        index_dict = {1: self.path_complete}
+        return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+    lls_parser = argparse.ArgumentParser(
+        description='List files in local system.')
+    lls_parser.add_argument('paths', help='Paths', action=path_to_bytes,
+                            nargs='*')
+
+    @with_argparser(lls_parser)
+    def do_lls(self, args):
+        """
+        List the given local paths, or the current local directory if none given
+        """
+        if args.paths:
+            for path in args.paths:
+                try:
+                    entries = os.listdir(path)
+                    poutput("{}:".format(path.decode('utf-8')))
+                    print_list(entries)
+                except OSError as err:
+                    failed = err.filename.decode('utf-8')
+                    set_exit_code_msg(err.errno, f"{failed}: {err.strerror}")
+        else:
+            print_list(os.listdir(os.getcwdb()))
+        # Arguments to the with_argparser decorator function are sticky:
+        # the items in args.paths are not overwritten by subsequent calls.
+        # They would still be in args.paths after this function exits, so the
+        # list is emptied here to make the next call start from a clean state.
+        args.paths.clear()
+
+    def do_lpwd(self, arglist):
+        """
+        Print the absolute path of the current local directory
+        """
+        poutput(os.getcwd())  # arglist is not used
+
+    def complete_df(self, text, line, begidx, endidx):
+        """
+        Completion for 'df': returns whatever self.complete_filenames() yields.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    df_parser = argparse.ArgumentParser(description='Show information about\
+                the amount of available disk space')
+    df_parser.add_argument('file', help='Name of the file', nargs='*',
+                           default=['.'], action=path_to_bytes)
+
+    @with_argparser(df_parser)
+    def do_df(self, arglist):
+        """
+        Display 1K-blocks, Used, Available and Use% for each given path
+        """
+        header = True    # True until the column header has been printed
+        if b'.' == arglist.file[0]:
+            arglist.file = ls(b'.')
+        # entries may be DirEntry objects (their d_name is used); '.'/'..' are skipped
+        for file in arglist.file:
+            if isinstance(file, libcephfs.DirEntry):
+                file = file.d_name
+            if file == b'.' or file == b'..':
+                continue
+            try:
+                statfs = cephfs.statfs(file)
+                stat = cephfs.stat(file)
+                block_size = (statfs['f_blocks'] * statfs['f_bsize']) // 1024
+                available = block_size - stat.st_size
+                use = 0
+                # NOTE(review): block_size is divided by 1024, st_size is not; confirm units
+                if block_size > 0:
+                    use = (stat.st_size * 100) // block_size
+
+                if header:
+                    header = False
+                    poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format(
+                            "1K-blocks", "Used", "Available", "Use%",
+                            "Stored on"))
+
+                poutput('{:d}\t{:18d}\t{:8d}\t{:10s} {}'.format(block_size,
+                        stat.st_size, available, str(int(use)) + '%',
+                        file.decode('utf-8')))
+            except libcephfs.OSError as e:
+                set_exit_code_msg(e.get_error_code(), "could not statfs {}: {}".format(
+                    file.decode('utf-8'), e.strerror))
+
+    locate_parser = argparse.ArgumentParser(
+        description='Find file within file system')
+    locate_parser.add_argument('name', help='name', type=str,
+                               action=path_to_bytes)
+    locate_parser.add_argument('-c', '--count', action='store_true',
+                               help='Count list of items located.')
+    locate_parser.add_argument(
+        '-i', '--ignorecase', action='store_true', help='Ignore case')
+
+    @with_argparser(locate_parser)
+    def do_locate(self, args):
+        """
+        Find a file within the File System; one leading/trailing '*' is allowed
+        """
+        if args.name.count(b'*') == 1:
+            if args.name.startswith(b'*'):  # name[0] is an int, never == b'*'
+                args.name += b'/'
+            elif args.name.endswith(b'*'):  # bytes-safe test of the last byte
+                args.name = b'/' + args.name
+        args.name = args.name.replace(b'*', b'')
+        if args.ignorecase:
+            locations = locate_file(args.name, False)
+        else:
+            locations = locate_file(args.name)
+        if args.count:
+            poutput(len(locations))
+        else:
+            poutput((b'\n'.join(locations)).decode('utf-8'))
+
+    def complete_du(self, text, line, begidx, endidx):
+        """
+        Completion for 'du': returns whatever self.complete_filenames() yields.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    du_parser = argparse.ArgumentParser(
+        description='Disk Usage of a Directory')
+    du_parser.add_argument('paths', type=str, action=get_list_of_bytes_path,
+                           help='Name of the directory.', nargs='*',
+                           default=[b'.'])
+    du_parser.add_argument('-r', action='store_true',
+                           help='Recursive Disk usage of all directories.')
+
+    @with_argparser(du_parser)
+    def do_du(self, args):
+        """
+        Print disk usage of the given path(s); -r also covers dirwalk(path).
+        """
+        def print_disk_usage(files):
+            if isinstance(files, bytes):
+                files = (files, )
+
+            for f in files:
+                try:
+                    st = cephfs.lstat(f)
+
+                    if stat.S_ISDIR(st.st_mode):
+                        dusage = int(cephfs.getxattr(f,
+                                     'ceph.dir.rbytes').decode('utf-8'))
+                    else:
+                        dusage = st.st_size
+
+                    # print path in local context: a normalized path that
+                    # starts with '/' is shown below '.', e.g. b'/a' -> b'./a'
+                    f = os.path.normpath(f)
+                    if f[:1] == b'/':
+                        f = b'.' + f
+                    poutput('{:10s} {}'.format(humansize(dusage),
+                            f.decode('utf-8')))
+                except libcephfs.Error as e:
+                    set_exit_code_msg(msg=e)
+
+        for path in args.paths:
+            if args.r:
+                print_disk_usage(sorted(set(dirwalk(path)).union({path})))
+            else:
+                print_disk_usage(path)
+
+    quota_parser = argparse.ArgumentParser(
+        description='Quota management for a Directory')
+    quota_parser.add_argument('op', choices=['get', 'set'],
+                              help='Quota operation type.')
+    quota_parser.add_argument('path', type=str, action=path_to_bytes,
+                              help='Name of the directory.')
+    quota_parser.add_argument('--max_bytes', type=int, default=-1, nargs='?',
+                              help='Max cumulative size of the data under '
+                                   'this directory.')
+    quota_parser.add_argument('--max_files', type=int, default=-1, nargs='?',
+                              help='Total number of files under this '
+                                   'directory tree.')
+
+    @with_argparser(quota_parser)
+    def do_quota(self, args):
+        """
+        Quota management: get or set max_bytes/max_files of a directory.
+        """
+        if not is_dir_exists(args.path):
+            set_exit_code_msg(errno.ENOENT, 'error: no such directory {}'.format(
+                args.path.decode('utf-8')))
+            return
+
+        if args.op == 'set':
+            # an option given without a value (nargs='?') parses to None
+            new_max_bytes = -1 if args.max_bytes is None else args.max_bytes
+            new_max_files = -1 if args.max_files is None else args.max_files
+            if (new_max_bytes == -1) and (new_max_files == -1):
+                set_exit_code_msg(errno.EINVAL, 'please specify either '
+                                  '--max_bytes or --max_files or both')
+                return
+            if new_max_bytes >= 0:
+                max_bytes = to_bytes(str(new_max_bytes))
+                try:
+                    cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
+                                    max_bytes, os.XATTR_CREATE)
+                    poutput('max_bytes set to %d' % new_max_bytes)
+                except libcephfs.Error as e:
+                    cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
+                                    max_bytes, os.XATTR_REPLACE)
+                    set_exit_code_msg(e.get_error_code(), 'max_bytes reset to '
+                                      f'{new_max_bytes}')
+
+            if new_max_files >= 0:
+                max_files = to_bytes(str(new_max_files))
+                try:
+                    cephfs.setxattr(args.path, 'ceph.quota.max_files',
+                                    max_files, os.XATTR_CREATE)
+                    poutput('max_files set to %d' % new_max_files)
+                except libcephfs.Error as e:
+                    cephfs.setxattr(args.path, 'ceph.quota.max_files',
+                                    max_files, os.XATTR_REPLACE)
+                    set_exit_code_msg(e.get_error_code(), 'max_files reset to '
+                                      f'{new_max_files}')
+        elif args.op == 'get':
+            try:
+                max_bytes = cephfs.getxattr(args.path, 'ceph.quota.max_bytes')
+                poutput('max_bytes: {}'.format(max_bytes.decode('utf-8')))
+            except libcephfs.Error as e:
+                set_exit_code_msg(e.get_error_code(), 'max_bytes is not set')
+
+            try:
+                max_files = cephfs.getxattr(args.path, 'ceph.quota.max_files')
+                poutput('max_files: {}'.format(max_files.decode('utf-8')))
+            except libcephfs.Error as e:
+                set_exit_code_msg(e.get_error_code(), 'max_files is not set')
+
+    snap_parser = argparse.ArgumentParser(description='Snapshot Management')
+    snap_parser.add_argument('op', type=str,
+                             help='Snapshot operation: create or delete')
+    snap_parser.add_argument('name', type=str, action=path_to_bytes,
+                             help='Name of snapshot')
+    snap_parser.add_argument('dir', type=str, action=path_to_bytes,
+                             help='Directory for which snapshot '
+                                  'needs to be created or deleted')
+
+    @with_argparser(snap_parser)
+    def do_snap(self, args):
+        """
+        Create or delete the snapshot 'name' of directory 'dir'
+        """
+        # colorizing is turned off by setting self.colors to None, so that
+        # perror emits plain text
+        self.colors = None
+
+        conf_snapdir = cephfs.conf_get('client_snapdir')
+        snapdir = to_bytes('.snap' if conf_snapdir is None else conf_snapdir)
+        if args.op not in ('create', 'delete'):
+            set_exit_code_msg(errno.EINVAL, "snapshot can only be created or "
+                              "deleted; check - help snap")
+            return
+
+        # a snapshot is addressed as <dir>/<snapdir>/<name>
+        snap_path = os.path.join(args.dir, snapdir, args.name)
+        if args.op == 'create':
+            try:
+                if not is_dir_exists(args.dir):
+                    set_exit_code_msg(errno.ENOENT, "'{}': no such directory".format(
+                                      args.dir.decode('utf-8')))
+                else:
+                    cephfs.mkdir(snap_path, 0o755)
+            except libcephfs.Error as err:
+                set_exit_code_msg(err.get_error_code(),
+                                  "snapshot '{}' already exists".format(
+                                  args.name.decode('utf-8')))
+        else:
+            try:
+                if is_dir_exists(snap_path):
+                    newargs = argparse.Namespace(paths=[snap_path], parent=False)
+                    self.do_rmdir_helper(newargs)
+                else:
+                    set_exit_code_msg(errno.ENOENT, "'{}': no such snapshot".format(
+                        args.name.decode('utf-8')))
+            except libcephfs.Error as err:
+                set_exit_code_msg(err.get_error_code(), "error while deleting "
+                                  "'{}'".format(snap_path.decode('utf-8')))
+
+    def do_help(self, line):
+        """
+        Get details about a command.
+            Usage: help <cmd> - for a specific command
+                   help all - for all the commands
+        """
+        if line != 'all':
+            parser = self.create_argparser(line)
+            if parser:
+                parser.print_help()
+            else:
+                super().do_help(line)
+            return
+        for attr in dir(self):
+            if attr.startswith('do_'):
+                poutput('-' * 80)
+                super().do_help(attr[3:])
+
+    def complete_stat(self, text, line, begidx, endidx):
+        """
+        Completion for 'stat': returns whatever self.complete_filenames() yields.
+        """
+        return self.complete_filenames(text, line, begidx, endidx)
+
+    stat_parser = argparse.ArgumentParser(
+        description='Display file or file system status')
+    stat_parser.add_argument('paths', type=str, help='file paths',
+                             action=path_to_bytes, nargs='+')
+
+    @with_argparser(stat_parser)
+    def do_stat(self, args):
+        """
+        Display file or file system status
+        """
+        for path in args.paths:
+            try:
+                st = cephfs.stat(path)
+                atime = st.st_atime.isoformat(' ')
+                mtime = st.st_mtime.isoformat(' ')
+                # "Change:" reports st_ctime; st_mtime is already shown as "Modify:"
+                ctime = st.st_ctime.isoformat(' ')
+                poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n"
+                        "Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: "
+                        "{:o}/{}\tUid: {:d}\tGid: {:d}\nAccess: {}\nModify: "
+                        "{}\nChange: {}".format(path.decode('utf-8'),
+                                                st.st_size, st.st_blocks,
+                                                st.st_blksize, st.st_dev,
+                                                st.st_ino, st.st_nlink,
+                                                st.st_mode,
+                                                mode_notation(st.st_mode),
+                                                st.st_uid, st.st_gid, atime,
+                                                mtime, ctime))
+            except libcephfs.Error as e:
+                set_exit_code_msg(msg=e)
+
+    setxattr_parser = argparse.ArgumentParser(
+        description='Set extended attribute for a file')
+    setxattr_parser.add_argument('path', type=str, action=path_to_bytes, help='Name of the file')
+    setxattr_parser.add_argument('name', type=str, help='Extended attribute name')
+    setxattr_parser.add_argument('value', type=str, help='Extended attribute value')
+
+    @with_argparser(setxattr_parser)
+    def do_setxattr(self, args):
+        """
+        Create the extended attribute on a file, or replace it if it exists
+        """
+        raw_value = to_bytes(args.value)
+        raw_name = to_bytes(args.name)
+        try:
+            cephfs.setxattr(args.path, raw_name, raw_value, os.XATTR_CREATE)
+            poutput(f'{args.name} is successfully set to {args.value}')
+        except libcephfs.ObjectExists:
+            cephfs.setxattr(args.path, raw_name, raw_value, os.XATTR_REPLACE)
+            poutput(f'{args.name} is successfully reset to {args.value}')
+        except libcephfs.Error as err:
+            set_exit_code_msg(msg=err)
+
+    getxattr_parser = argparse.ArgumentParser(
+        description='Get extended attribute set for a file')
+    getxattr_parser.add_argument('path', type=str, action=path_to_bytes,
+                                 help='Name of the file')
+    getxattr_parser.add_argument('name', type=str, help='Extended attribute name')
+
+    @with_argparser(getxattr_parser)
+    def do_getxattr(self, args):
+        """
+        Print the value of one extended attribute of a file
+        """
+        try:
+            raw_value = cephfs.getxattr(args.path, to_bytes(args.name))
+            poutput('{}'.format(raw_value.decode('utf-8')))
+        except libcephfs.Error as err:
+            set_exit_code_msg(msg=err)
+
+    listxattr_parser = argparse.ArgumentParser(
+        description='List extended attributes set for a file')
+    listxattr_parser.add_argument('path', type=str, action=path_to_bytes,
+                                  help='Name of the file')
+
+    @with_argparser(listxattr_parser)
+    def do_listxattr(self, args):
+        """List extended attributes for a file"""
+        try:
+            size, raw_names = cephfs.listxattr(args.path)
+            if not size > 0:
+                poutput('No extended attribute is set')
+            else:
+                # NUL bytes in the returned list are shown as spaces
+                names = raw_names.replace(b'\x00', b' ').decode('utf-8')
+                poutput('{}'.format(names))
+        except libcephfs.Error as err:
+            set_exit_code_msg(msg=err)
+
+
+#######################################################
+#
+# Following are methods that get cephfs-shell started.
+#
+#####################################################
+
+def setup_cephfs(args):
+    """
+    Create the global LibCephFS handle and mount args.fs; exit on failure.
+    """
+    global cephfs
+    try:
+        cephfs = libcephfs.LibCephFS(conffile='')
+        cephfs.mount(filesystem_name=args.fs)
+    except libcephfs.ObjectNotFound as e:
+        print('couldn\'t find ceph configuration')
+        sys.exit(e.get_error_code())
+    except libcephfs.Error as e:
+        print(e)
+        sys.exit(e.get_error_code())
+
+
+def str_to_bool(val):
+    """
+    Map 'true'/'yes' to True and 'false'/'no' to False, ignoring case.
+    Non-string values are returned unchanged; any other string is returned
+    with its newlines removed.
+    """
+    if isinstance(val, str):
+        val = val.replace('\n', '')
+        lowered = val.lower()
+        if lowered in ('true', 'yes'):
+            return True
+        if lowered in ('false', 'no'):
+            return False
+    return val
+
+
+def read_shell_conf(shell, shell_conf_file):
+    """
+    Apply options from the 'cephfs-shell' section of shell_conf_file to shell.
+    """
+    import configparser
+    sec = 'cephfs-shell'
+    version = LooseVersion(cmd2_version)
+    if version >= LooseVersion("0.10.0"):
+        opts = list(shell.settables.keys())
+    elif version <= LooseVersion("0.9.13"):
+        # hardcoding options for 0.7.9 because -
+        # 1. we use cmd2 v0.7.9 with teuthology and
+        # 2. there's no way distinguish between a shell setting and shell
+        #    object attribute until v0.10.0
+        opts = ['abbrev', 'autorun_on_edit', 'colors',
+                'continuation_prompt', 'debug', 'echo', 'editor',
+                'feedback_to_output', 'locals_in_py', 'prompt', 'quiet',
+                'timing']
+    elif version >= LooseVersion("0.9.23"):
+        opts = ['allow_style']
+    else:
+        # no equivalent option was defined by cmd2.
+        opts = []
+
+    # default and only section in our conf file.
+    parser = configparser.ConfigParser(default_section=sec, strict=False)
+    parser.read(shell_conf_file)
+    for opt in opts:
+        if parser.has_option(sec, opt):
+            setattr(shell, opt, str_to_bool(parser.get(sec, opt)))
+
+
+def get_shell_conffile_path(arg_conf=''):
+    """
+    Return the first path that is an existing file among, in this order:
+    arg_conf, the CEPHFS_SHELL_CONF environment variable and
+    ~/.cephfs-shell.conf; return '' when none of them is.
+    """
+    candidates = (
+        arg_conf or '',
+        os.environ.get('CEPHFS_SHELL_CONF', ''),
+        os.path.expanduser('~/.' + 'cephfs-shell.conf'),
+    )
+    # here's the priority by which conf gets read.
+    found = (path for path in candidates if os.path.isfile(path))
+    return next(found, '')
+
+
+def manage_args():
+    """Parse the command line into args and hand them to manage_sys_argv()."""
+    main_parser = argparse.ArgumentParser(description='')
+    main_parser.add_argument('-b', '--batch', action='store',
+                             help='Path to CephFS shell script/batch file '
+                                  'containing CephFS shell commands',
+                             type=str)
+    main_parser.add_argument('-c', '--config', action='store',
+                             help='Path to Ceph configuration file.',
+                             type=str)
+    main_parser.add_argument('-f', '--fs', action='store',
+                             help='Name of filesystem to mount.',
+                             type=str)
+    main_parser.add_argument('-t', '--test', action='store',
+                             help='Test against transcript(s) in FILE',
+                             nargs='+')
+    main_parser.add_argument('commands', nargs='*', help='Comma delimited '
+                             'commands. The shell executes the given command '
+                             'and quits immediately with the return value of '
+                             'command. In case no commands are provided, the '
+                             'shell is launched.', default=[])
+
+    args = main_parser.parse_args()
+    args.exe_and_quit = False    # Execute and quit, don't launch the shell.
+
+    if args.batch:
+        if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"):
+            args.commands = ['load ' + args.batch, ',quit']
+        else:
+            args.commands = ['run_script ' + args.batch, ',quit']
+    if args.test:
+        args.commands.extend(['-t,'] + [arg + ',' for arg in args.test])
+    if not args.batch and len(args.commands) > 0:
+        args.exe_and_quit = True
+
+    manage_sys_argv(args)
+    return args
+
+
+def manage_sys_argv(args):
+    """Rebuild sys.argv from args.commands (split on ','), then set up cephfs."""
+    joined = ' '.join(args.commands)
+    sys.argv[:] = [sys.argv[0]] + [piece.strip() for piece in joined.split(',')]
+
+    # mounting is triggered from here, as part of argument handling
+    setup_cephfs(args)
+
+
+def execute_cmd_args(args):
+    """
+    When commands were passed as arguments (args.exe_and_quit), execute them
+    and return the value produced by execute_cmds_and_quit(), i.e. the
+    (last) command's return value. Otherwise launch a shell session and
+    return the result of shell.cmdloop().
+    """
+    if args.exe_and_quit:
+        return execute_cmds_and_quit(args)
+    return shell.cmdloop()
+
+
+def execute_cmds_and_quit(args):
+    """
+    Feed comma-separated commands to onecmd() one at a time; stop at the first
+    failing one and return its result, else return the last command's result.
+    """
+    # do_* methods triggered by cephfs-shell commands return None when they
+    # complete running successfully. Until 0.9.6, shell.onecmd() returned this
+    # value to indicate whether the execution of the commands should stop, but
+    # since 0.9.7 it returns the return value of do_* methods only if it's
+    # not None. When it is None it returns False instead of None.
+    if LooseVersion(cmd2_version) <= LooseVersion("0.9.6"):
+        stop_exec_val = None
+    else:
+        stop_exec_val = False
+
+    args_to_onecmd = ''
+    if len(args.commands) <= 1:
+        args.commands = args.commands[0].split(' ')  # one argument: split into words
+    for cmdarg in args.commands:
+        if ',' in cmdarg:
+            args_to_onecmd += ' ' + cmdarg[0:-1]  # drops the last char (assumed ',')
+            onecmd_retval = shell.onecmd(args_to_onecmd)
+            # if the current command failed, let's abort the execution of
+            # series of commands passed.
+            if onecmd_retval is not stop_exec_val:
+                return onecmd_retval
+            if shell.exit_code != 0:
+                return shell.exit_code
+
+            args_to_onecmd = ''
+            continue
+
+        args_to_onecmd += ' ' + cmdarg
+    return shell.onecmd(args_to_onecmd)
+
+
+if __name__ == '__main__':
+    args = manage_args()
+
+    shell = CephFSShell()
+    # TODO: perhaps, we should add an option to pass ceph.conf?
+    read_shell_conf(shell, get_shell_conffile_path(args.config))
+    # XXX: shell.exit_code starts at zero; if no method or function of
+    # cephfs-shell sets it and the return value of shell.cmdloop() or
+    # shell.onecmd() is falsy, zero becomes the exit status of cephfs-shell
+    # (see the sys.exit() call below).
+    shell.exit_code = 0
+
+    retval = execute_cmd_args(args)
+    sys.exit(retval if retval else shell.exit_code)
diff --git a/src/tools/cephfs/shell/setup.py b/src/tools/cephfs/shell/setup.py
new file mode 100644
index 000000000..8cf7f28f7
--- /dev/null
+++ b/src/tools/cephfs/shell/setup.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+__version__ = '0.0.1'
+# Packaging metadata; installs the 'cephfs-shell' script (see scripts= below).
+setup(
+    name='cephfs-shell',
+    version=__version__,
+    description='Interactive shell for Ceph file system',
+    keywords='cephfs, shell',
+    scripts=['cephfs-shell'],
+    install_requires=[
+        'cephfs',
+        'cmd2',
+        'colorama',
+    ],
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Environment :: Console',
+        'Intended Audience :: System Administrators',
+        'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+        'Operating System :: POSIX :: Linux',
+        'Programming Language :: Python :: 3'
+    ],
+    license='LGPLv2+',
+)
diff --git a/src/tools/cephfs/shell/tox.ini b/src/tools/cephfs/shell/tox.ini
new file mode 100644
index 000000000..c1cbff051
--- /dev/null
+++ b/src/tools/cephfs/shell/tox.ini
@@ -0,0 +1,7 @@
+[tox]
+envlist = py3
+skipsdist = true
+# py3 env: run flake8 on the cephfs-shell script (W503 ignored, 100-column limit)
+[testenv:py3]
+deps = flake8
+commands = flake8 --ignore=W503 --max-line-length=100 cephfs-shell
diff --git a/src/tools/cephfs/top/CMakeLists.txt b/src/tools/cephfs/top/CMakeLists.txt
new file mode 100644
index 000000000..8f9df0187
--- /dev/null
+++ b/src/tools/cephfs/top/CMakeLists.txt
@@ -0,0 +1,11 @@
+include(Distutils)
+distutils_install_module(cephfs-top)
+# The tox test for cephfs-top is added only when WITH_TESTS is set.
+if(WITH_TESTS)
+  include(AddCephTest)
+  add_tox_test(cephfs-top)
+endif()
+# A Python 3 interpreter of at least this version is required.
+set(MINIMUM_COMPATIBLE_VERSION 3.6.0)
+find_package(Python3 ${MINIMUM_COMPATIBLE_VERSION} REQUIRED
+  COMPONENTS Interpreter)
diff --git a/src/tools/cephfs/top/cephfs-top b/src/tools/cephfs/top/cephfs-top
new file mode 100755
index 000000000..b39e815fa
--- /dev/null
+++ b/src/tools/cephfs/top/cephfs-top
@@ -0,0 +1,1227 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+import curses
+import errno
+import json
+import signal
+import time
+import math
+import threading
+
+from collections import OrderedDict
+from datetime import datetime
+from enum import Enum, unique
+from curses import ascii
+
+import rados
+
+
+class FSTopException(Exception):
+    """Error raised by cephfs-top; carries a human-readable message."""
+
+    def __init__(self, msg=''):
+        # Hand the message to Exception as well so that str(exc) and
+        # tracebacks show it instead of an empty string.
+        super().__init__(msg)
+        self.error_msg = msg
+
+    def get_error_msg(self):
+        """Return the message this exception was created with."""
+        return self.error_msg
+
+
+@unique
+class MetricType(Enum):
+    """Kind of a perf counter; decides how its raw value is converted and labelled."""
+    METRIC_TYPE_NONE = 0  # raw value, no unit suffix
+    METRIC_TYPE_PERCENTAGE = 1  # labelled "(%)"
+    METRIC_TYPE_LATENCY = 2  # labelled "(ms)"
+    METRIC_TYPE_SIZE = 3  # labelled "(MB)"
+    METRIC_TYPE_STDEV = 4  # labelled "(ms)"
+
+
+FS_TOP_PROG_STR = 'cephfs-top'
+FS_TOP_ALL_FS_APP = 'ALL_FS_APP'
+FS_TOP_FS_SELECTED_APP = 'SELECTED_FS_APP'
+
+# version match b/w fstop and stats emitted by mgr/stats
+FS_TOP_SUPPORTED_VER = 2
+
+ITEMS_PAD_LEN = 3
+ITEMS_PAD = " " * ITEMS_PAD_LEN
+DEFAULT_REFRESH_INTERVAL = 1
+
+# metadata provided by mgr/stats
+FS_TOP_MAIN_WINDOW_COL_CLIENT_ID = "client_id"
+FS_TOP_MAIN_WINDOW_COL_MNT_ROOT = "mount_root"
+FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR = "mount_point@host/addr"
+
+MAIN_WINDOW_TOP_LINE_ITEMS_START = [ITEMS_PAD,
+                                    FS_TOP_MAIN_WINDOW_COL_CLIENT_ID,
+                                    FS_TOP_MAIN_WINDOW_COL_MNT_ROOT]
+MAIN_WINDOW_TOP_LINE_ITEMS_END = [FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR]
+
+MAIN_WINDOW_TOP_LINE_METRICS_LEGACY = ["READ_LATENCY",
+                                       "WRITE_LATENCY",
+                                       "METADATA_LATENCY"
+                                       ]
+
+# adjust this map according to stats version and maintain order
+# as emitted by mgr/stats
+MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([
+    ("CAP_HIT", MetricType.METRIC_TYPE_PERCENTAGE),
+    ("READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+    ("WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+    ("METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+    ("DENTRY_LEASE", MetricType.METRIC_TYPE_PERCENTAGE),
+    ("OPENED_FILES", MetricType.METRIC_TYPE_NONE),
+    ("PINNED_ICAPS", MetricType.METRIC_TYPE_NONE),
+    ("OPENED_INODES", MetricType.METRIC_TYPE_NONE),
+    ("READ_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
+    ("WRITE_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
+    ("AVG_READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+    ("STDEV_READ_LATENCY", MetricType.METRIC_TYPE_STDEV),
+    ("AVG_WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+    ("STDEV_WRITE_LATENCY", MetricType.METRIC_TYPE_STDEV),
+    ("AVG_METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+    ("STDEV_METADATA_LATENCY", MetricType.METRIC_TYPE_STDEV),
+])
+MGR_STATS_COUNTERS = list(MAIN_WINDOW_TOP_LINE_METRICS.keys())
+
+FS_TOP_VERSION_HEADER_FMT = '{prog_name} - {now}'
+FS_TOP_CLIENT_HEADER_FMT = 'Total Client(s): {num_clients} - '\
+    '{num_mounts} FUSE, {num_kclients} kclient, {num_libs} libcephfs'
+FS_TOP_NAME_TOPL_FMT = 'Filesystem: {fs_name} - {client_count} client(s)'
+
+CLIENT_METADATA_KEY = "client_metadata"
+CLIENT_METADATA_MOUNT_POINT_KEY = "mount_point"
+CLIENT_METADATA_MOUNT_ROOT_KEY = "root"
+CLIENT_METADATA_IP_KEY = "IP"
+CLIENT_METADATA_HOSTNAME_KEY = "hostname"
+CLIENT_METADATA_VALID_METRICS_KEY = "valid_metrics"
+
+GLOBAL_METRICS_KEY = "global_metrics"
+GLOBAL_COUNTERS_KEY = "global_counters"
+
+fs_list = []
+# store the current states of cephfs-top
+# last_fs    : last filesystem visited
+# last_field : last field selected for sorting
+# limit      : last limit value
+current_states = {"last_fs": "", "last_field": 'chit', "limit": None}
+metrics_dict = {}
+
+
+def calc_perc(c):
+    """Return c[0] as a percentage of c[0] + c[1], rounded to 2 decimals.
+
+    A pair of two zeros yields 0.0 instead of dividing by zero.
+    """
+    part, rest = c[0], c[1]
+    both_zero = part == 0 and rest == 0
+    if both_zero:
+        return 0.0
+    ratio = part / (part + rest)
+    return round(ratio * 100, 2)
+
+
+def calc_lat(c):
+    """Return c[0] * 1000 + c[1] / 1000000, rounded to 2 decimals.
+
+    NOTE(review): presumably c is a (seconds, nanoseconds) pair turned into
+    milliseconds, matching the "(ms)" label of latency columns - confirm.
+    """
+    from_first = c[0] * 1000
+    from_second = c[1] / 1000000
+    return round(from_first + from_second, 2)
+
+
+def calc_stdev(c):
+    """Return sqrt(c[0] / (c[1] - 1)) / 1000000, rounded to 2 decimals.
+
+    The result is 0.0 unless c[1] is greater than 1.
+    NOTE(review): presumably c is (sum of squared deviations, sample count)
+    - confirm against mgr/stats.
+    """
+    if c[1] > 1:
+        return round(math.sqrt(c[0] / (c[1] - 1)) / 1000000, 2)
+    return 0.0
+
+
+# in MB
+def calc_size(c):
+    """Return c[1] divided by 1024 * 1024, rounded to 2 decimals."""
+    divisor = 1024 * 1024
+    return round(c[1] / divisor, 2)
+
+
+# in MB
+def calc_avg_size(c):
+    """Return c[1] / (c[0] * 1024 * 1024), rounded to 2 decimals.
+
+    A c[0] of zero yields 0.0 instead of dividing by zero.
+    """
+    count = c[0]
+    if count != 0:
+        return round(c[1] / (count * 1024 * 1024), 2)
+    return 0.0
+
+
+# in MB/s
+def calc_speed(size, duration):
+    """Return size / (duration * 1024 * 1024), rounded to 2 decimals.
+
+    A duration of zero yields 0.0 instead of dividing by zero.
+    """
+    return 0.0 if duration == 0 else round(size / (duration * 1024 * 1024), 2)
+
+
+def wrap(s, sl):
+    """Fit ``s`` into ``sl`` columns.
+
+    A string shorter than ``sl`` is returned untouched; anything else is cut
+    to its first ``sl - 1`` characters and suffixed with '+'.
+    """
+    if len(s) >= sl:
+        return f'{s[:sl - 1]}+'
+    return s
+
+
+class FSTopBase(object):
+    """State and helpers shared by the cephfs-top front ends.
+
+    Holds the bookkeeping needed to derive IO speeds between two samples and
+    implements the JSON dump (--dump / --dumpfs) of per-client metrics.
+    Subclasses are expected to provide get_fs_names(), perf_stats_query(),
+    items(), avg_items() and speed_items(), which are called here but not
+    defined in this class.
+    """
+
+    def __init__(self):
+        self.last_time = time.time()  # timestamp of the previous sample
+        self.last_read_size = {}  # client_id -> last seen READ_IO_SIZES value
+        self.last_write_size = {}  # client_id -> last seen WRITE_IO_SIZES value
+        self.dump_json = {}  # fs name -> client -> metrics, filled by the dump
+
+    @staticmethod
+    def has_metric(metadata, metrics_key):
+        """Return True if ``metrics_key`` is present in ``metadata``."""
+        return metrics_key in metadata
+
+    @staticmethod
+    def has_metrics(metadata, metrics_keys):
+        """Return True only if every key of ``metrics_keys`` is in ``metadata``."""
+        return all(FSTopBase.has_metric(metadata, key) for key in metrics_keys)
+
+    def __build_clients(self, fs, duration=None):
+        """Fill self.dump_json[fs] with one entry per client of ``fs``.
+
+        Reads self.stats_json, which dump_metrics_to_stdout() sets beforehand.
+        ``duration`` is the sampling interval in seconds used for the IO speed
+        columns; when omitted it is measured here, once for all clients.
+        """
+        if duration is None:
+            cur_time = time.time()
+            duration = cur_time - self.last_time
+            self.last_time = cur_time
+        fs_meta = self.dump_json.setdefault(fs, {})
+        fs_key = self.stats_json[GLOBAL_METRICS_KEY].get(fs, {})
+        for client_id, metrics in fs_key.items():
+            client_meta = self.stats_json[CLIENT_METADATA_KEY].get(fs, {}).get(client_id, {})
+            for item in MAIN_WINDOW_TOP_LINE_ITEMS_START[1:]:
+                if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID:
+                    # clients are keyed by the part after the first '.'
+                    client_id_meta = fs_meta.setdefault(client_id.split('.')[1], {})
+                elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT:
+                    # the mount root may be missing from the metadata; report
+                    # "N/A" as create_client() does instead of raising KeyError
+                    client_id_meta.update(
+                        {item: client_meta.get(CLIENT_METADATA_MOUNT_ROOT_KEY, "N/A")})
+            counters = [m.upper() for m in self.stats_json[GLOBAL_COUNTERS_KEY]]
+            # cidx indexes both the client's metric tuples and MGR_STATS_COUNTERS,
+            # so it has to advance for skipped legacy counters as well
+            for cidx, item in enumerate(counters):
+                if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
+                    continue
+                m = metrics[cidx]
+                key = MGR_STATS_COUNTERS[cidx]
+                typ = MAIN_WINDOW_TOP_LINE_METRICS[key]
+                if item.lower() in client_meta.get(
+                        CLIENT_METADATA_VALID_METRICS_KEY, []):
+                    key_name = self.items(item)
+                    if typ == MetricType.METRIC_TYPE_PERCENTAGE:
+                        client_id_meta.update({f'{key_name}': calc_perc(m)})
+                    elif typ == MetricType.METRIC_TYPE_LATENCY:
+                        client_id_meta.update({f'{key_name}': calc_lat(m)})
+                    elif typ == MetricType.METRIC_TYPE_STDEV:
+                        client_id_meta.update({f'{key_name}': calc_stdev(m)})
+                    elif typ == MetricType.METRIC_TYPE_SIZE:
+                        client_id_meta.update({f'{key_name}': calc_size(m)})
+                        # average io sizes
+                        client_id_meta.update({f'{self.avg_items(item)}':
+                                               calc_avg_size(m)})
+                        # io speeds: difference to the value seen for this
+                        # client last time, divided by the sampling interval
+                        size = 0
+                        if key == "READ_IO_SIZES":
+                            if m[1] > 0:
+                                last_size = self.last_read_size.get(client_id, 0)
+                                size = m[1] - last_size
+                                self.last_read_size[client_id] = m[1]
+                        if key == "WRITE_IO_SIZES":
+                            if m[1] > 0:
+                                last_size = self.last_write_size.get(client_id, 0)
+                                size = m[1] - last_size
+                                self.last_write_size[client_id] = m[1]
+                        client_id_meta.update({f'{self.speed_items(item)}':
+                                               calc_speed(abs(size), duration)})
+                    else:
+                        # display 0th element from metric tuple
+                        client_id_meta.update({f'{key_name}': f'{m[0]}'})
+                else:
+                    client_id_meta.update({f'{self.items(item)}': "N/A"})
+
+            for item in MAIN_WINDOW_TOP_LINE_ITEMS_END:
+                if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR:
+                    if FSTopBase.has_metrics(client_meta,
+                                             [CLIENT_METADATA_MOUNT_POINT_KEY,
+                                              CLIENT_METADATA_HOSTNAME_KEY,
+                                              CLIENT_METADATA_IP_KEY]):
+                        mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}'\
+                            f'@{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\
+                            f'{client_meta[CLIENT_METADATA_IP_KEY]}'
+                        client_id_meta.update({item: mount_point})
+                    else:
+                        client_id_meta.update({item: "N/A"})
+
+    def dump_metrics_to_stdout(self, fs_name=None):
+        """Write the metrics of one or all filesystems to stdout as JSON.
+
+        With ``fs_name`` only that filesystem is dumped (--dumpfs); otherwise
+        every filesystem returned by get_fs_names() is dumped (--dump).
+        A plain message is written instead when there is no filesystem at all
+        or when the requested one is not available.
+        """
+        fs_list = self.get_fs_names()
+        if not fs_list:
+            sys.stdout.write("No filesystem available\n")
+            return
+        self.stats_json = self.perf_stats_query()
+        # One sampling interval for the whole dump: measuring it per client
+        # would leave every client after the first with a near-zero duration
+        # and therefore a wildly inflated IO speed.
+        cur_time = time.time()
+        duration = cur_time - self.last_time
+        self.last_time = cur_time
+        if fs_name:  # --dumpfs
+            if fs_name not in fs_list:
+                sys.stdout.write(f"Filesystem {fs_name} not available\n")
+                return
+            self.__build_clients(fs_name, duration)
+        else:  # --dump
+            for fs in fs_list:
+                self.__build_clients(fs, duration)
+        sys.stdout.write(json.dumps(self.dump_json))
+        sys.stdout.write("\n")
+
+
+class FSTop(FSTopBase):
+    def __init__(self, args):
+        """Store the parsed command line options; no connection is made here.
+
+        ``args`` must provide ``id``, ``cluster``, ``conffile`` and ``delay``.
+        """
+        super(FSTop, self).__init__()
+        # options taken over from ``args``
+        self.client_name = args.id
+        self.cluster_name = args.cluster
+        self.conffile = args.conffile
+        self.refresh_interval_secs = args.delay
+        # runtime state
+        self.rados = None
+        self.stdscr = None  # curses instance
+        self.active_screen = ""
+        self.exit_ev = threading.Event()
+        # dimensions of the fstop_pad
+        self.PAD_HEIGHT = 10000
+        self.PAD_WIDTH = 300
+
+    def handle_signal(self, signum, _):
+        """Signal handler (installed by init() for SIGTERM and SIGINT): set exit_ev."""
+        self.exit_ev.set()
+
+    def init(self):
+        """Connect to the cluster and install the signal handlers.
+
+        Raises FSTopException if connecting fails or if the check done by
+        verify_perf_stats_support() fails.
+        """
+        rados_args = {'rados_id': self.client_name,
+                      'clustername': self.cluster_name}
+        if self.conffile:
+            rados_args['conffile'] = self.conffile
+        try:
+            cluster = rados.Rados(**rados_args)
+            cluster.conf_read_file()
+            cluster.connect()
+            self.rados = cluster
+        except rados.Error as e:
+            if e.errno != errno.ENOENT:
+                raise FSTopException(f'error connecting to cluster: {e}')
+            raise FSTopException(f'cluster {self.cluster_name}'
+                                 ' does not exist')
+        self.verify_perf_stats_support()
+        for signum in (signal.SIGTERM, signal.SIGINT):
+            signal.signal(signum, self.handle_signal)
+
+    def fini(self):
+        """Shut down the cluster handle, if there is one, and forget it."""
+        if not self.rados:
+            return
+        self.rados.shutdown()
+        self.rados = None
+
+    def selftest(self):
+        """Check that the stats emitted by mgr/stats can be displayed.
+
+        Raises FSTopException if the reported version differs from
+        FS_TOP_SUPPORTED_VER or if a reported counter is not listed in
+        MGR_STATS_COUNTERS.
+        """
+        stats_json = self.perf_stats_query()
+        if stats_json['version'] != FS_TOP_SUPPORTED_VER:
+            raise FSTopException('perf stats version mismatch!')
+        missing = [m for m in stats_json[GLOBAL_COUNTERS_KEY]
+                   if m.upper() not in MGR_STATS_COUNTERS]
+        if missing:
+            # the two literals are concatenated, hence the space after "from"
+            raise FSTopException('Cannot handle unknown metrics from '
+                                 f'\'ceph fs perf stats\': {missing}')
+
+    def get_fs_names(self):
+        """Refresh the module-level fs_list from 'fs ls' and return it.
+
+        The list object is updated in place. Raises FSTopException if the
+        monitor command cannot be sent or returns a non-zero status.
+        """
+        global fs_list
+        mon_cmd = {'prefix': 'fs ls', 'format': 'json'}
+        try:
+            ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'')
+        except Exception as e:
+            raise FSTopException(f'Error in fs ls: {e}')
+        # Check the status like the other mon/mgr queries of this class do,
+        # rather than trying to parse the output of a failed command.
+        if ret != 0:
+            raise FSTopException(f'Error in fs ls: {out}')
+        fs_map = json.loads(buf.decode('utf-8'))
+        fs_list.clear()
+        fs_list.extend(filesystem['name'] for filesystem in fs_map)
+        return fs_list
+
+    def setup_curses(self, win):
+        """Prepare the curses screen ``win`` and hand over to run_all_display().
+
+        Stores ``win`` as self.stdscr, enables keypad mode and colors, hides
+        the cursor where possible and creates the PAD_HEIGHT x PAD_WIDTH pad.
+        """
+        self.stdscr = win
+        self.stdscr.keypad(True)
+        curses.use_default_colors()
+        curses.start_color()
+        try:
+            curses.curs_set(0)
+        except curses.error:
+            # If the terminal does not support the requested
+            # visibility it will raise an exception
+            pass
+        self.fstop_pad = curses.newpad(self.PAD_HEIGHT, self.PAD_WIDTH)
+        self.run_all_display()
+
+    def display_fs_menu(self, stdscr, selected_row_idx):
+        """Draw the filesystem menu with row ``selected_row_idx`` highlighted.
+
+        The names come from the module-level fs_list; titles and names are
+        centred horizontally, the list is centred vertically as well.
+        """
+        stdscr.clear()
+        h, w = stdscr.getmaxyx()
+        heading = 'Filesystems'
+        hint = 'Press "q" to go back to the previous screen'
+        heading_x = w // 2 - len(heading) // 2
+        hint_x = w // 2 - len(hint) // 2
+        stdscr.addstr(1, heading_x, heading, curses.A_STANDOUT | curses.A_BOLD)
+        stdscr.addstr(3, hint_x, hint, curses.A_DIM)
+        first_row = h // 2 - len(fs_list) // 2
+        for index, name in enumerate(fs_list):
+            x = w // 2 - len(name) // 2
+            y = first_row + index
+            selected = index == selected_row_idx
+            if selected:
+                stdscr.attron(curses.color_pair(1))
+            stdscr.addstr(y, x, name)
+            if selected:
+                stdscr.attroff(curses.color_pair(1))
+        stdscr.refresh()
+
+    def display_sort_menu(self, stdscr, selected_row_idx, field_menu):
+        """Draw the left-aligned list of sort fields.
+
+        The entries of ``field_menu`` start on row 5; the entry at index
+        ``selected_row_idx`` is drawn with color pair 1.
+        """
+        stdscr.clear()
+        stdscr.addstr(1, 0, 'Fields', curses.A_STANDOUT | curses.A_BOLD)
+        stdscr.addstr(3, 0, 'Press "q" to go back to the previous screen', curses.A_DIM)
+        for index, name in enumerate(field_menu):
+            row = 5 + index
+            if index != selected_row_idx:
+                stdscr.addstr(row, 0, name)
+                continue
+            stdscr.attron(curses.color_pair(1))
+            stdscr.addstr(row, 0, name)
+            stdscr.attroff(curses.color_pair(1))
+        stdscr.refresh()
+
+    def display_menu(self, stdscr):
+        """Draw the centred 'No filesystem available' notice on ``stdscr``."""
+        stdscr.clear()
+        h, w = stdscr.getmaxyx()
+        rows = (
+            (1, 'No filesystem available', curses.A_STANDOUT | curses.A_BOLD),
+            (3, 'Press "q" to go back to home (All Filesystem Info) screen',
+             curses.A_DIM),
+        )
+        for y, text, attr in rows:
+            stdscr.addstr(y, w // 2 - len(text) // 2, text, attr)
+        stdscr.refresh()
+
+    def set_key(self, stdscr):
+        """Run the filesystem selection menu on ``stdscr``.
+
+        Up/Down move the highlight, Enter stores the highlighted name in
+        current_states['last_fs'] and calls run_display(), "q" goes back to
+        run_display() or run_all_display() depending on self.active_screen.
+        The filesystem list is re-read on every pass of the loop.
+        """
+        curses.curs_set(0)
+        curses.init_pair(1, curses.COLOR_MAGENTA, curses.COLOR_WHITE)
+        curr_row = 0
+        key = 0
+        endmenu = False
+        while not endmenu:
+            global fs_list
+            fs_list = self.get_fs_names()
+
+            if key == curses.KEY_UP and curr_row > 0:
+                curr_row -= 1
+            elif key == curses.KEY_DOWN and curr_row < len(fs_list) - 1:
+                curr_row += 1
+            # 10 (LF) and 13 (CR) are accepted as Enter besides KEY_ENTER
+            elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list:
+                self.stdscr.erase()
+                current_states['last_fs'] = fs_list[curr_row]
+                self.run_display()
+                endmenu = True
+            elif key == ord('q'):
+                self.stdscr.erase()
+                if fs_list and self.active_screen == FS_TOP_FS_SELECTED_APP:
+                    self.run_display()
+                else:
+                    self.run_all_display()
+                endmenu = True
+
+            try:
+                if not fs_list:
+                    self.display_menu(stdscr)
+                else:
+                    self.display_fs_menu(stdscr, curr_row)
+            except curses.error:
+                # drawing errors are deliberately ignored
+                pass
+            # NOTE(review): curses.halfdelay() takes tenths of a second -
+            # confirm that passing a value named *_secs is intended.
+            curses.halfdelay(self.refresh_interval_secs)
+            key = stdscr.getch()
+
+    def choose_field(self, stdscr):
+        """Run the sort field menu on ``stdscr``.
+
+        Up/Down move the highlight. Enter stores the short name of the
+        highlighted entry (the text before '=') in
+        current_states["last_field"], or 'chit' for the last entry "Default",
+        and redraws the active screen. "q" goes back without changing the
+        field.
+        """
+        curses.curs_set(0)
+        curses.init_pair(1, curses.COLOR_BLACK, curses.COLOR_WHITE)
+        # entries have the form "<short name>= <counter name>"
+        field_menu = ["chit= CAP_HIT", "dlease= DENTRY_LEASE", "ofiles= OPENED_FILES",
+                      "oicaps= PINNED_ICAPS", "oinodes= OPENED_INODES",
+                      "rtio= READ_IO_SIZES", "raio= READ_AVG_IO_SIZES",
+                      "rsp= READ_IO_SPEED", "wtio= WRITE_IO_SIZES",
+                      "waio= WRITE_AVG_IO_SIZES", "wsp= WRITE_IO_SPEED",
+                      "rlatavg= AVG_READ_LATENCY", "rlatsd= STDEV_READ_LATENCY",
+                      "wlatavg= AVG_WRITE_LATENCY", "wlatsd= STDEV_WRITE_LATENCY",
+                      "mlatavg= AVG_METADATA_LATENCY", "mlatsd= STDEV_METADATA_LATENCY",
+                      "Default"]
+        curr_row1 = 0
+        key = 0
+        endwhile = False
+        while not endwhile:
+            global current_states, fs_list
+            fs_list = self.get_fs_names()
+
+            if key == curses.KEY_UP and curr_row1 > 0:
+                curr_row1 -= 1
+            elif key == curses.KEY_DOWN and curr_row1 < len(field_menu) - 1:
+                curr_row1 += 1
+            # 10 (LF) and 13 (CR) are accepted as Enter besides KEY_ENTER
+            elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list:
+                self.stdscr.erase()
+                if curr_row1 != len(field_menu) - 1:
+                    current_states["last_field"] = (field_menu[curr_row1].split('='))[0]
+                else:
+                    # last entry ("Default")
+                    current_states["last_field"] = 'chit'
+                self.header.erase()  # erase the previous text
+                if self.active_screen == FS_TOP_ALL_FS_APP:
+                    self.run_all_display()
+                else:
+                    self.run_display()
+                endwhile = True
+            elif key == ord('q'):
+                self.stdscr.erase()
+                if fs_list and self.active_screen == FS_TOP_FS_SELECTED_APP:
+                    self.run_display()
+                else:
+                    self.run_all_display()
+                endwhile = True
+
+            try:
+                if not fs_list:
+                    self.display_menu(stdscr)
+                else:
+                    self.display_sort_menu(stdscr, curr_row1, field_menu)
+            except curses.error:
+                # drawing errors are deliberately ignored
+                pass
+            # NOTE(review): curses.halfdelay() takes tenths of a second -
+            # confirm that passing a value named *_secs is intended.
+            curses.halfdelay(self.refresh_interval_secs)
+            key = stdscr.getch()
+
+    def set_limit(self, stdscr):
+        """Prompt on ``stdscr`` for the value of current_states["limit"].
+
+        A number of up to four digits confirmed with ENTER becomes the new
+        limit (kept as a string; a value of 0 is ignored), "d" resets the
+        limit to None and "q" leaves the prompt. After any accepted key the
+        active screen is redrawn through run_all_display() or run_display().
+        """
+        key = ''
+        endwhile = False
+        while not endwhile:
+            stdscr.clear()
+            h, w = stdscr.getmaxyx()
+            title = 'Enter the limit you want to set (number) and press ENTER,'\
+                    ' press "d" for default, "q" to go back to previous screen '
+            pos_x1 = w // 2 - len(title) // 2
+            try:
+                stdscr.addstr(1, pos_x1, title, curses.A_STANDOUT | curses.A_BOLD)
+            except curses.error:
+                pass
+            # NOTE(review): curses.halfdelay() takes tenths of a second -
+            # confirm that passing a value named *_secs is intended.
+            curses.halfdelay(self.refresh_interval_secs)
+            inp = stdscr.getch()
+            # any other input, including a getch() timeout, is ignored
+            if inp in [ord('d'), ord('q')] or ascii.isdigit(inp):
+                # NOTE(review): key is never cleared; if the run_*display()
+                # call below returns, the next accepted character is appended
+                # to the old value - confirm this cannot happen.
+                key = key + chr(inp)
+                if key == 'd':
+                    current_states["limit"] = None
+                elif key == 'q':
+                    endwhile = True
+                elif (key).isnumeric():
+                    # first digit typed: read the remaining characters here;
+                    # i is the number of characters collected in key so far
+                    i = 1
+                    length = 4
+                    while i <= length:
+                        pos = w // 2 - len(key) // 2
+                        try:
+                            stdscr.move(3, 0)
+                            stdscr.clrtoeol()
+                            stdscr.addstr(3, pos, key, curses.A_BOLD)
+                        except curses.error:
+                            pass
+                        # ENTER was the last character typed: input complete
+                        if key[i - 1] == '\n':
+                            break
+                        inp = stdscr.getch()
+                        if inp == ord('q'):
+                            # abort the edit and fall back to the current
+                            # limit; the extra character appended here is
+                            # stripped again by key[:-1] below
+                            if current_states['limit'] is None:
+                                key = current_states["limit"]
+                            else:
+                                key = current_states['limit'] + " "
+                            break
+                        if inp == curses.KEY_RESIZE:
+                            # redraw the title for the new window width
+                            stdscr.clear()
+                            windowsize = stdscr.getmaxyx()
+                            wd = windowsize[1] - 1
+                            pos_x1 = wd // 2 - len(title) // 2
+                            try:
+                                stdscr.addstr(1, pos_x1, title, curses.A_STANDOUT | curses.A_BOLD)
+                            except curses.error:
+                                pass
+                        if inp == curses.KEY_BACKSPACE or inp == curses.KEY_DC or inp == 127:
+                            if i > 1:
+                                key = key[:-1]
+                                i = i - 1
+                                # clear the "max length" hint on row 4
+                                stdscr.move(4, 0)
+                                stdscr.clrtoeol()
+                            elif i == 1:
+                                # only one character left: start the prompt over
+                                curses.wrapper(self.set_limit)
+                        elif i == length:
+                            # at the maximum length only ENTER is accepted
+                            if inp == ord('\n'):
+                                key = key + chr(inp)
+                                i = i + 1
+                            else:
+                                info = "Max length is reached, press Backspace" \
+                                    " to edit or Enter to set the limit!"
+                                pos = w // 2 - len(info) // 2
+                                try:
+                                    stdscr.addstr(4, pos, info, curses.A_BOLD)
+                                except curses.error:
+                                    pass
+                        elif ascii.isdigit(inp) or inp == ord('\n'):
+                            key = key + chr(inp)
+                            i = i + 1
+                    if key is None:
+                        current_states["limit"] = key
+                    elif int(key) != 0:
+                        # drop the trailing ENTER (or the padding added on "q")
+                        current_states["limit"] = key[:-1]
+                self.stdscr.erase()
+                self.header.erase()  # erase the previous text
+                if self.active_screen == FS_TOP_ALL_FS_APP:
+                    self.run_all_display()
+                else:
+                    self.run_display()
+
+    def set_option_all_fs(self, opt):
+        """Handle the key ``opt`` pressed on the 'All Filesystem Info' screen.
+
+        "m", "s" and "l" open the filesystem, sort field and limit menus,
+        "r" resets the sort field and the limit, "q" exits the program.
+        Returns False for "r" and for a menu key pressed while fs_list is
+        empty, True in every other case.
+        """
+        # sets the options for 'All Filesystem Info' screen
+        menus = {ord('m'): self.set_key,
+                 ord('s'): self.choose_field,
+                 ord('l'): self.set_limit}
+        if opt in menus:
+            if not fs_list:
+                return False
+            curses.wrapper(menus[opt])
+        elif opt == ord('r'):
+            if fs_list:
+                current_states['last_field'] = 'chit'
+                current_states["limit"] = None
+            return False  # We are already in run_all_display()
+        elif opt == ord('q'):
+            # sys.exit() rather than quit(): the latter is injected by the
+            # site module for interactive use and is not meant for programs
+            sys.exit()
+        return True
+
+    def set_option_sel_fs(self, opt, selected_fs):
+        """Handle the key ``opt`` pressed on the 'Selected Filesystem Info' screen.
+
+        "m", "s" and "l" open the filesystem, sort field and limit menus,
+        "r" resets the sort field and the limit, "q" calls run_all_display().
+        Returns False for "r" and for a menu key pressed while ``selected_fs``
+        is not in fs_list, True in every other case.
+        """
+        # sets the options for 'Selected Filesystem Info' screen
+        if opt == ord('q'):
+            self.run_all_display()
+            return True
+        if opt == ord('r'):
+            if selected_fs in fs_list:
+                current_states['last_field'] = 'chit'
+                current_states["limit"] = None
+            return False  # we are already in run_display()
+        menu = {ord('m'): self.set_key,
+                ord('s'): self.choose_field,
+                ord('l'): self.set_limit}.get(opt)
+        if menu is None:
+            return True
+        if selected_fs not in fs_list:
+            return False
+        curses.wrapper(menu)
+        return True
+
+    def verify_perf_stats_support(self):
+        """Make sure the 'stats' manager module is enabled.
+
+        Raises FSTopException if 'mgr module ls' cannot be sent, returns a
+        non-zero status, or does not list 'stats' under 'enabled_modules'.
+        """
+        mon_cmd = {'prefix': 'mgr module ls', 'format': 'json'}
+        try:
+            ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'')
+        except Exception as e:
+            raise FSTopException(f'error checking \'stats\' module: {e}')
+        if ret != 0:
+            raise FSTopException(f'error checking \'stats\' module: {out}')
+        if 'stats' not in json.loads(buf.decode('utf-8'))['enabled_modules']:
+            # the two literals are concatenated, hence the space after "Use"
+            raise FSTopException('\'stats\' module not enabled. Use '
+                                 '\'ceph mgr module enable stats\' to enable')
+
+    def perf_stats_query(self):
+        """Run 'fs perf stats' through the manager and return the decoded JSON.
+
+        Raises FSTopException if the command cannot be sent or returns a
+        non-zero status.
+        """
+        request = json.dumps({'prefix': 'fs perf stats', 'format': 'json'})
+        try:
+            status, payload, errs = self.rados.mgr_command(request, b'')
+        except Exception as e:
+            raise FSTopException(f'error in \'perf stats\' query: {e}')
+        if status != 0:
+            raise FSTopException(f'error in \'perf stats\' query: {errs}')
+        return json.loads(payload.decode('utf-8'))
+
+    def items(self, item):
+        """Map a counter name to its short column name.
+
+        Names without a short form map to the empty string.
+        """
+        short_names = {
+            "CAP_HIT": "chit",
+            "READ_LATENCY": "rlat",
+            "WRITE_LATENCY": "wlat",
+            "METADATA_LATENCY": "mlat",
+            "DENTRY_LEASE": "dlease",
+            "OPENED_FILES": "ofiles",
+            "PINNED_ICAPS": "oicaps",
+            "OPENED_INODES": "oinodes",
+            "READ_IO_SIZES": "rtio",
+            "WRITE_IO_SIZES": "wtio",
+            'AVG_READ_LATENCY': 'rlatavg',
+            'STDEV_READ_LATENCY': 'rlatsd',
+            'AVG_WRITE_LATENCY': 'wlatavg',
+            'STDEV_WRITE_LATENCY': 'wlatsd',
+            'AVG_METADATA_LATENCY': 'mlatavg',
+            'STDEV_METADATA_LATENCY': 'mlatsd',
+        }
+        # return empty string for none type
+        return short_names.get(item, '')
+
+    def mtype(self, typ):
+        """Return the unit suffix shown in the column title for ``typ``."""
+        if typ == MetricType.METRIC_TYPE_PERCENTAGE:
+            return "(%)"
+        if typ in (MetricType.METRIC_TYPE_LATENCY, MetricType.METRIC_TYPE_STDEV):
+            return "(ms)"
+        if typ == MetricType.METRIC_TYPE_SIZE:
+            return "(MB)"
+        # return empty string for none type
+        return ''
+
+    def avg_items(self, item):
+        """Return the short name of the average IO size column for ``item``."""
+        # return empty string for none type
+        return {"READ_IO_SIZES": "raio", "WRITE_IO_SIZES": "waio"}.get(item, '')
+
+    def speed_items(self, item):
+        """Return the short name of the IO speed column for ``item``."""
+        # return empty string for none type
+        return {"READ_IO_SIZES": "rsp", "WRITE_IO_SIZES": "wsp"}.get(item, '')
+
+    def speed_mtype(self, typ):
+        """Return the unit suffix of the IO speed column for ``typ``."""
+        # return empty string for none type
+        return "(MB/s)" if typ == MetricType.METRIC_TYPE_SIZE else ''
+
+    def create_table_header(self):  # formerly named as top_line
+        """Write the column titles on row self.tablehead_y of self.fsstats.
+
+        The titles are the leading fixed columns, one column per non-legacy
+        metric (the IO size metrics additionally get an average and a speed
+        column) and the trailing fixed columns, joined by ITEMS_PAD.
+        """
+        heading = list(MAIN_WINDOW_TOP_LINE_ITEMS_START)
+        for item, typ in MAIN_WINDOW_TOP_LINE_METRICS.items():
+            if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
+                continue
+            heading.append(f'{self.items(item)}{self.mtype(typ)}')
+            if item in ("READ_IO_SIZES", "WRITE_IO_SIZES"):
+                # average io sizes
+                heading.append(f'{self.avg_items(item)}{self.mtype(typ)}')
+                # io speeds
+                heading.append(f'{self.speed_items(item)}{self.speed_mtype(typ)}')
+        heading.extend(MAIN_WINDOW_TOP_LINE_ITEMS_END)
+        self.fsstats.addstr(self.tablehead_y, 0, ITEMS_PAD.join(heading),
+                            curses.A_STANDOUT | curses.A_BOLD)
+
+    def create_client(self, fs_name, client_id, metrics, counters,
+                      client_meta, y_coord):
+        metrics_dict.setdefault(fs_name, {})
+        metrics_dict[fs_name].setdefault(client_id, {})
+        cur_time = time.time()
+        duration = cur_time - self.last_time
+        self.last_time = cur_time
+        xp = 0  # xp is incremented after each addstr to position the next incoming metrics.
+        for item in MAIN_WINDOW_TOP_LINE_ITEMS_START:  # note: the first item is ITEMS_PAD
+            hlen = len(item) + ITEMS_PAD_LEN
+            if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID:
+                self.fsstats.addstr(y_coord, xp,
+                                    wrap(client_id.split('.')[1], hlen), curses.A_DIM)
+            elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT:
+                if FSTop.has_metric(client_meta,
+                                    CLIENT_METADATA_MOUNT_ROOT_KEY):
+                    hlen = len(item) + ITEMS_PAD_LEN
+                    self.fsstats.addstr(
+                        y_coord, xp,
+                        wrap(client_meta[CLIENT_METADATA_MOUNT_ROOT_KEY], hlen), curses.A_DIM)
+                else:
+                    self.fsstats.addstr(y_coord, xp, "N/A", curses.A_DIM)
+            xp += hlen
+
+        cidx = 0
+        for item in counters:
+            if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
+                cidx += 1
+                continue
+            m = metrics[cidx]
+            key = MGR_STATS_COUNTERS[cidx]
+            typ = MAIN_WINDOW_TOP_LINE_METRICS[key]
+            if item.lower() in client_meta.get(
+                    CLIENT_METADATA_VALID_METRICS_KEY, []):
+                if typ == MetricType.METRIC_TYPE_PERCENTAGE:
+                    perc = calc_perc(m)
+                    metrics_dict[fs_name][client_id][self.items(item)] = perc
+                    self.fsstats.addstr(y_coord, xp,
+                                        f'{perc}', curses.A_DIM)
+                    xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN
+                elif typ == MetricType.METRIC_TYPE_LATENCY:
+                    lat = calc_lat(m)
+                    metrics_dict[fs_name][client_id][self.items(item)] = lat
+                    self.fsstats.addstr(y_coord, xp,
+                                        f'{lat}', curses.A_DIM)
+                    xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN
+                elif typ == MetricType.METRIC_TYPE_STDEV:
+                    stdev = calc_stdev(m)
+                    metrics_dict[fs_name][client_id][self.items(item)] = stdev
+                    self.fsstats.addstr(y_coord, xp,
+                                        f'{stdev}', curses.A_DIM)
+                    xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN
+                elif typ == MetricType.METRIC_TYPE_SIZE:
+                    size = calc_size(m)
+                    metrics_dict[fs_name][client_id][self.items(item)] = size
+                    self.fsstats.addstr(y_coord, xp,
+                                        f'{size}', curses.A_DIM)
+                    xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN
+
+                    # average io sizes
+                    avg_size = calc_avg_size(m)
+                    metrics_dict[fs_name][client_id][self.avg_items(key)] = avg_size
+                    self.fsstats.addstr(y_coord, xp,
+                                        f'{avg_size}', curses.A_DIM)
+                    xp += len(f'{self.avg_items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN
+
+                    # io speeds
+                    size = 0
+                    if key == "READ_IO_SIZES":
+                        if m[1] > 0:
+                            last_size = self.last_read_size.get(client_id, 0)
+                            size = m[1] - last_size
+                            self.last_read_size[client_id] = m[1]
+                    if key == "WRITE_IO_SIZES":
+                        if m[1] > 0:
+                            last_size = self.last_write_size.get(client_id, 0)
+                            size = m[1] - last_size
+                            self.last_write_size[client_id] = m[1]
+                    speed = calc_speed(abs(size), duration)
+                    metrics_dict[fs_name][client_id][self.speed_items(key)] = speed
+                    self.fsstats.addstr(y_coord, xp,
+                                        f'{speed}', curses.A_DIM)
+                    xp += len(f'{self.speed_items(item)}{self.speed_mtype(typ)}') + ITEMS_PAD_LEN
+                else:
+                    # display 0th element from metric tuple
+                    metrics_dict[fs_name][client_id][self.items(item)] = m[0]
+                    self.fsstats.addstr(y_coord, xp, f'{m[0]}', curses.A_DIM)
+                    xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN
+            else:
+                self.fsstats.addstr(y_coord, xp, "N/A", curses.A_DIM)
+                xp += len(self.items(item)) + ITEMS_PAD_LEN
+            cidx += 1
+
+        for item in MAIN_WINDOW_TOP_LINE_ITEMS_END:
+            wrapLen = self.PAD_WIDTH - xp
+            if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR:
+                if FSTop.has_metrics(client_meta,
+                                     [CLIENT_METADATA_MOUNT_POINT_KEY,
+                                      CLIENT_METADATA_HOSTNAME_KEY,
+                                      CLIENT_METADATA_IP_KEY]):
+                    mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}@'\
+                        f'{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\
+                        f'{client_meta[CLIENT_METADATA_IP_KEY]}'
+                    self.fsstats.addstr(
+                        y_coord, xp,
+                        wrap(mount_point, wrapLen), curses.A_DIM)
+                else:
+                    self.fsstats.addstr(y_coord, xp, "N/A", curses.A_DIM)
+                xp += len(self.items(item)) + ITEMS_PAD_LEN
+
+    def create_clients(self, stats_json, fs_name):
+        """
+        Render the section of the fsstats pad that belongs to one filesystem.
+
+        Writes the "<fs name> / client count" title line, prunes cached
+        entries in metrics_dict that no longer match the current filesystem
+        or client set, then draws one row per client via create_client().
+        Rows are ordered by the currently selected sort field (descending)
+        once every client has a cached entry, and by client id otherwise;
+        the optional client limit truncates the ordered list.
+        self.tablehead_y is advanced past everything that was written.
+        """
+        global metrics_dict, current_states
+        counters = [name.upper() for name in stats_json[GLOBAL_COUNTERS_KEY]]
+        self.tablehead_y += 2
+        fs_metrics = stats_json[GLOBAL_METRICS_KEY].get(fs_name, {})
+        client_cnt = len(fs_metrics)
+        title = FS_TOP_NAME_TOPL_FMT.format(fs_name=fs_name, client_count=client_cnt)
+        self.fsstats.addstr(self.tablehead_y, 0, title, curses.A_BOLD)
+        self.tablehead_y += 2
+
+        # Drop cached metrics for filesystems / clients that have gone away.
+        cached_clients = metrics_dict.get(fs_name, {})
+        if len(metrics_dict) > len(fs_list):
+            for gone_fs in set(metrics_dict) - set(fs_list):
+                del metrics_dict[gone_fs]
+        if len(cached_clients) > client_cnt:
+            for gone_client in set(cached_clients) - set(fs_metrics):
+                del cached_clients[gone_client]
+
+        if not client_cnt:
+            return
+
+        if len(cached_clients) != client_cnt:
+            # the cache does not cover every client yet: plain id order
+            ordered = sorted(fs_metrics)
+        else:
+            sort_field = current_states['last_field']
+
+            def sort_value(cid):
+                return metrics_dict[fs_name].get(cid, {}).get(sort_field, 0)
+
+            ordered = sorted(fs_metrics, key=sort_value, reverse=True)
+
+        limit = current_states['limit']
+        if limit is not None and int(limit) < client_cnt:
+            ordered = ordered[:int(limit)]
+
+        for client_id in ordered:
+            client_meta = stats_json[CLIENT_METADATA_KEY].get(fs_name, {}).get(client_id, {})
+            self.create_client(fs_name, client_id, fs_metrics.get(client_id, {}),
+                               counters, client_meta, self.tablehead_y)
+            self.tablehead_y += 1
+
+    def create_header(self, stats_json, help, screen_title="", color_id=0):
+        """
+        Draw the header pad: program/time line, screen title, client totals,
+        the active sort/limit filters and the help line.
+
+        Returns False (after writing a mismatch notice on the header's first
+        row) when stats_json['version'] differs from FS_TOP_SUPPORTED_VER,
+        and True once the header has been written.
+        """
+        global fs_list
+        if stats_json['version'] != FS_TOP_SUPPORTED_VER:
+            self.header.addstr(0, 0, 'perf stats version mismatch!', curses.A_BOLD)
+            return False
+
+        num_clients = num_mounts = num_kclients = 0
+        for fs_name in fs_list:
+            fs_clients = stats_json[CLIENT_METADATA_KEY].get(fs_name, {})
+            if not len(fs_clients):
+                continue
+            num_clients += len(fs_clients)
+            # clients that report a real (non 'N/A') mount point
+            num_mounts += sum(
+                1 for meta in fs_clients.values()
+                if CLIENT_METADATA_MOUNT_POINT_KEY in meta
+                and meta[CLIENT_METADATA_MOUNT_POINT_KEY] != 'N/A')
+            # clients whose metadata carries a kernel version
+            num_kclients += sum(
+                1 for meta in fs_clients.values() if "kernel_version" in meta)
+        # everything that is counted neither as a mount nor as a kernel client
+        num_libs = num_clients - (num_mounts + num_kclients)
+
+        title_line = FS_TOP_VERSION_HEADER_FMT.format(prog_name=FS_TOP_PROG_STR,
+                                                      now=datetime.now().ctime())
+        self.header.addstr(0, 0, title_line, curses.A_BOLD)
+        self.header.addstr(2, 0, screen_title, curses.color_pair(color_id) | curses.A_BOLD)
+        totals_line = FS_TOP_CLIENT_HEADER_FMT.format(num_clients=num_clients,
+                                                      num_mounts=num_mounts,
+                                                      num_kclients=num_kclients,
+                                                      num_libs=num_libs)
+        self.header.addstr(3, 0, totals_line, curses.A_DIM)
+        filters_line = (f"Filters: Sort - {current_states['last_field']}, "
+                        f"Limit - {current_states['limit']}")
+        self.header.addstr(4, 0, filters_line, curses.A_DIM)
+        self.header.addstr(5, 0, help, curses.A_DIM)
+        return True
+
+    def run_display(self):
+        """
+        Event loop of the "Selected Filesystem Info" screen.
+
+        Marks this screen as active, (re)creates the header and fsstats
+        sub-windows on self.fstop_pad and then, until self.exit_ev is set:
+        re-reads the filesystem list, hands option keys (m/s/l/r/q) to
+        set_option_sel_fs(), queries the perf stats, redraws the header and
+        the client table of current_states["last_fs"], applies the scroll
+        keys and refreshes the visible part of the pad.
+        """
+        # clear the pads to have a smooth refresh
+        self.header.erase()
+        self.fsstats.erase()
+
+        self.active_screen = FS_TOP_FS_SELECTED_APP
+        screen_title = "Selected Filesystem Info"
+        help_commands = "m - select a filesystem | s - sort menu | l - limit number of clients"\
+                        " | r - reset to default | q - home (All Filesystem Info) screen"
+        # colour pair 3 (magenta on the default background) is used for this screen's title
+        curses.init_pair(3, curses.COLOR_MAGENTA, -1)
+
+        top, left = 0, 0  # where to place pad
+        vscrollOffset, hscrollOffset = 0, 0  # scroll offsets
+
+        # calculate the initial viewport height and width
+        windowsize = self.stdscr.getmaxyx()
+        self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+
+        # create header subpad
+        self.header_height = 7
+        self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0)
+
+        # create fsstats subpad
+        fsstats_begin_y = self.header_height
+        fsstats_height = self.PAD_HEIGHT - self.header_height
+        self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0)
+
+        # halfdelay() takes tenths of a second: the first key read waits at most 0.1s
+        curses.halfdelay(1)
+        cmd = self.stdscr.getch()
+        global fs_list, current_states
+        while not self.exit_ev.is_set():
+            fs_list = self.get_fs_names()
+            fs = current_states["last_fs"]
+            if cmd in [ord('m'), ord('s'), ord('l'), ord('r'), ord('q')]:
+                # a truthy result from the option handler ends this screen's loop
+                if self.set_option_sel_fs(cmd, fs):
+                    self.exit_ev.set()
+
+            stats_json = self.perf_stats_query()
+            # number of client rows that can be scrolled through (set below)
+            vscrollEnd = 0
+            if fs not in fs_list:
+                help = f"Error: The selected filesystem '{fs}' is not available now. " \
+                    "[Press 'q' to go back to home (All Filesystem Info) screen]"
+                # reset the sort/limit settings if fs_list is empty, otherwise continue the
+                # settings for the other filesystems.
+                if not fs_list:
+                    current_states["last_field"] = 'chit'
+                    current_states["limit"] = None
+                self.header.erase()  # erase previous text
+                self.fsstats.erase()
+                self.create_header(stats_json, help, screen_title, 3)
+            else:
+                self.tablehead_y = 0
+                help = "COMMANDS: " + help_commands
+                self.fsstats.erase()  # erase previous text
+
+                # despite its name this is the per-client metrics map of `fs`;
+                # only its length is used, to size the scrollable area
+                client_metadata = stats_json[GLOBAL_METRICS_KEY].get(fs, {})
+                if current_states['limit'] is not None and \
+                   int(current_states['limit']) < len(client_metadata):
+                    num_client = int(current_states['limit'])
+                else:
+                    num_client = len(client_metadata)
+                vscrollEnd += num_client
+                # the table is only drawn when the header reports a supported stats version
+                if self.create_header(stats_json, help, screen_title, 3):
+                    self.create_table_header()
+                    self.create_clients(stats_json, fs)
+
+            # scroll and refresh
+            if cmd == curses.KEY_DOWN:
+                # one row down while more than one row remains, else pin to the end
+                if (vscrollEnd - vscrollOffset) > 1:
+                    vscrollOffset += 1
+                else:
+                    vscrollOffset = vscrollEnd
+            elif cmd == curses.KEY_UP:
+                if vscrollOffset > 0:
+                    vscrollOffset -= 1
+            elif cmd == curses.KEY_NPAGE:
+                # page keys move the vertical offset in steps of 20 rows
+                if (vscrollEnd - vscrollOffset) / 20 > 1:
+                    vscrollOffset += 20
+                else:
+                    vscrollOffset = vscrollEnd
+            elif cmd == curses.KEY_PPAGE:
+                if vscrollOffset / 20 >= 1:
+                    vscrollOffset -= 20
+                else:
+                    vscrollOffset = 0
+            elif cmd == curses.KEY_RIGHT:
+                if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1:
+                    hscrollOffset += 1
+            elif cmd == curses.KEY_LEFT:
+                if hscrollOffset > 0:
+                    hscrollOffset -= 1
+            elif cmd == curses.KEY_HOME:
+                hscrollOffset = 0
+            elif cmd == curses.KEY_END:
+                hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1
+            elif cmd == curses.KEY_RESIZE:
+                # terminal resize event. Update the viewport dimensions
+                windowsize = self.stdscr.getmaxyx()
+                self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+
+            # NOTE(review): getch() presumably yields -1 (truthy) when the half-delay
+            # expires without a key, so this branch would also run on idle ticks - confirm.
+            if cmd:
+                try:
+                    # refresh the viewport for the header portion
+                    if cmd not in [curses.KEY_DOWN,
+                                   curses.KEY_UP,
+                                   curses.KEY_NPAGE,
+                                   curses.KEY_PPAGE,
+                                   curses.KEY_RIGHT,
+                                   curses.KEY_LEFT]:
+                        self.fstop_pad.refresh(0, 0,
+                                               top, left,
+                                               top + self.header_height, left + self.viewportWidth)
+                    # refresh the viewport for the current table header portion in the fsstats pad
+                    # (the literal 7 below is the last screen row and equals header_height)
+                    if cmd not in [curses.KEY_DOWN,
+                                   curses.KEY_UP,
+                                   curses.KEY_NPAGE,
+                                   curses.KEY_PPAGE]:
+                        self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset,
+                                               top + fsstats_begin_y, left,
+                                               7, left + self.viewportWidth)
+                    # refresh the viewport for the current client records portion in the fsstats pad
+                    self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset,
+                                           top + fsstats_begin_y + 2, left,
+                                           top + self.viewportHeight, left + self.viewportWidth)
+                except curses.error:
+                    # This happens when the user switches to a terminal of different zoom size.
+                    # just retry it.
+                    pass
+            # End scroll and refresh
+
+            # wait for the next key for at most refresh_interval_secs seconds
+            # (halfdelay() takes tenths of a second)
+            curses.halfdelay(self.refresh_interval_secs * 10)
+            cmd = self.stdscr.getch()
+
+    def run_all_display(self):
+        """
+        Event loop of the "All Filesystem Info" (home) screen.
+
+        Marks this screen as active, (re)creates the header and fsstats
+        sub-windows on self.fstop_pad and then, until self.exit_ev is set:
+        hands option keys (m/s/l/r/q) to set_option_all_fs(), re-reads the
+        filesystem list, queries the perf stats, redraws the header and one
+        client section per filesystem, applies the scroll keys and refreshes
+        the visible part of the pad.
+        """
+        # clear text from the previous screen
+        if self.active_screen == FS_TOP_FS_SELECTED_APP:
+            self.header.erase()
+
+        self.active_screen = FS_TOP_ALL_FS_APP
+        screen_title = "All Filesystem Info"
+        # colour pair 2 (cyan on the default background) is used for this screen's title
+        curses.init_pair(2, curses.COLOR_CYAN, -1)
+
+        top, left = 0, 0  # where to place pad
+        vscrollOffset, hscrollOffset = 0, 0  # scroll offsets
+
+        # calculate the initial viewport height and width
+        windowsize = self.stdscr.getmaxyx()
+        self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+
+        # create header subpad
+        self.header_height = 7
+        self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0)
+
+        # create fsstats subpad
+        fsstats_begin_y = self.header_height
+        fsstats_height = self.PAD_HEIGHT - self.header_height
+        self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0)
+
+        # halfdelay() takes tenths of a second: the first key read waits at most 0.1s
+        curses.halfdelay(1)
+        cmd = self.stdscr.getch()
+        while not self.exit_ev.is_set():
+            if cmd in [ord('m'), ord('s'), ord('l'), ord('r'), ord('q')]:
+                # a truthy result from the option handler ends this screen's loop
+                if self.set_option_all_fs(cmd):
+                    self.exit_ev.set()
+
+            # header display
+            global fs_list, current_states
+            fs_list = self.get_fs_names()
+            # on this screen "last_fs" holds the whole list of filesystem names
+            current_states["last_fs"] = fs_list
+            stats_json = self.perf_stats_query()
+            # total number of client rows that can be scrolled through (summed below)
+            vscrollEnd = 0
+            if not fs_list:
+                help = "INFO: No filesystem is available [Press 'q' to quit]"
+                # reset the sort/limit settings
+                current_states["last_field"] = 'chit'
+                current_states["limit"] = None
+                self.header.erase()  # erase previous text
+                self.fsstats.erase()
+                self.create_header(stats_json, help, screen_title, 2)
+            else:
+                self.tablehead_y = 0
+                num_client = 0
+                help = "COMMANDS: m - select a filesystem | s - sort menu |"\
+                    " l - limit number of clients | r - reset to default | q - quit"
+                self.fsstats.erase()  # erase previous text
+                for index, fs in enumerate(fs_list):
+                    #  Get the vscrollEnd in advance
+                    # (despite its name, client_metadata is the per-client metrics map of `fs`;
+                    # the limit is applied per filesystem)
+                    client_metadata = stats_json[GLOBAL_METRICS_KEY].get(fs, {})
+                    if current_states['limit'] is not None and \
+                       int(current_states['limit']) < len(client_metadata):
+                        num_client = int(current_states['limit'])
+                    else:
+                        num_client = len(client_metadata)
+                    vscrollEnd += num_client
+                    # the header is rewritten on every iteration; client rows are only
+                    # drawn when it reports a supported stats version
+                    if self.create_header(stats_json, help, screen_title, 2):
+                        if not index:  # do it only for the first fs
+                            self.create_table_header()
+                        self.create_clients(stats_json, fs)
+
+            # scroll and refresh
+            if cmd == curses.KEY_DOWN:
+                # one row down while more than one row remains, else pin to the end
+                if (vscrollEnd - vscrollOffset) > 1:
+                    vscrollOffset += 1
+                else:
+                    vscrollOffset = vscrollEnd
+            elif cmd == curses.KEY_UP:
+                if vscrollOffset > 0:
+                    vscrollOffset -= 1
+            elif cmd == curses.KEY_NPAGE:
+                # page keys move the vertical offset in steps of 20 rows
+                if (vscrollEnd - vscrollOffset) / 20 > 1:
+                    vscrollOffset += 20
+                else:
+                    vscrollOffset = vscrollEnd
+            elif cmd == curses.KEY_PPAGE:
+                if vscrollOffset / 20 >= 1:
+                    vscrollOffset -= 20
+                else:
+                    vscrollOffset = 0
+            elif cmd == curses.KEY_RIGHT:
+                if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1:
+                    hscrollOffset += 1
+            elif cmd == curses.KEY_LEFT:
+                if hscrollOffset > 0:
+                    hscrollOffset -= 1
+            elif cmd == curses.KEY_HOME:
+                hscrollOffset = 0
+            elif cmd == curses.KEY_END:
+                hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1
+            elif cmd == curses.KEY_RESIZE:
+                # terminal resize event. Update the viewport dimensions
+                windowsize = self.stdscr.getmaxyx()
+                self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+            # NOTE(review): getch() presumably yields -1 (truthy) when the half-delay
+            # expires without a key, so this branch would also run on idle ticks - confirm.
+            if cmd:
+                try:
+                    # refresh the viewport for the header portion
+                    if cmd not in [curses.KEY_DOWN,
+                                   curses.KEY_UP,
+                                   curses.KEY_NPAGE,
+                                   curses.KEY_PPAGE,
+                                   curses.KEY_RIGHT,
+                                   curses.KEY_LEFT]:
+                        self.fstop_pad.refresh(0, 0,
+                                               top, left,
+                                               top + self.header_height, left + self.viewportWidth)
+                    # refresh the viewport for the current table header portion in the fsstats pad
+                    # (the literal 7 below is the last screen row and equals header_height)
+                    if cmd not in [curses.KEY_DOWN,
+                                   curses.KEY_UP,
+                                   curses.KEY_NPAGE,
+                                   curses.KEY_PPAGE]:
+                        self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset,
+                                               top + fsstats_begin_y, left,
+                                               7, left + self.viewportWidth)
+                    # refresh the viewport for the current client records portion in the fsstats pad
+                    self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset,
+                                           top + fsstats_begin_y + 2, left,
+                                           top + self.viewportHeight, left + self.viewportWidth)
+                except curses.error:
+                    # This happens when the user switches to a terminal of different zoom size.
+                    # just retry it.
+                    pass
+            # End scroll and refresh
+
+            # wait for the next key for at most refresh_interval_secs seconds
+            # (halfdelay() takes tenths of a second)
+            curses.halfdelay(self.refresh_interval_secs * 10)
+            cmd = self.stdscr.getch()
+# End class FSTop
+
+
+# Script entry point: parse the command line, initialise FSTop and run exactly
+# one mode (selftest, metrics dump, or the interactive curses UI).
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Ceph Filesystem top utility')
+    # nargs='?' with const == default: a bare "--cluster" and an omitted one both give 'ceph'
+    parser.add_argument('--cluster', nargs='?', const='ceph', default='ceph',
+                        help='Ceph cluster to connect (default: ceph)')
+    parser.add_argument('--id', nargs='?', const='fstop', default='fstop',
+                        help='Ceph user to use to connection (default: fstop)')
+    parser.add_argument('--conffile', nargs='?', default=None,
+                        help='Path to cluster configuration file')
+    parser.add_argument('--selftest', dest='selftest', action='store_true',
+                        help='Run in selftest mode')
+    # range(1, 26) restricts the refresh interval to whole seconds from 1 to 25
+    parser.add_argument('-d', '--delay', metavar='DELAY', dest='delay', choices=range(1, 26),
+                        default=DEFAULT_REFRESH_INTERVAL,
+                        type=int,
+                        help='Refresh interval in seconds '
+                        f'(default: {DEFAULT_REFRESH_INTERVAL}, range: 1 - 25)')
+    parser.add_argument('--dump', dest='dump', action='store_true',
+                        help='Dump the metrics to stdout')
+    # action='append' collects repeated --dumpfs values; only the first one is used below
+    parser.add_argument('--dumpfs', action='append',
+                        help='Dump the metrics of the given fs to stdout')
+
+    args = parser.parse_args()
+    err = False
+    ft = FSTop(args)
+    try:
+        ft.init()
+        # the modes are mutually exclusive, checked in this order of precedence
+        if args.selftest:
+            ft.selftest()
+            sys.stdout.write("selftest ok\n")
+        elif args.dump:
+            ft.dump_metrics_to_stdout()
+        elif args.dumpfs:
+            ft.dump_metrics_to_stdout(args.dumpfs[0])
+        else:
+            # interactive mode: curses.wrapper() sets up the terminal, calls
+            # ft.setup_curses and restores the terminal afterwards
+            curses.wrapper(ft.setup_curses)
+    except FSTopException as fst:
+        err = True
+        sys.stderr.write(f'{fst.get_error_msg()}\n')
+    except Exception as e:
+        err = True
+        sys.stderr.write(f'exception: {e}\n')
+    finally:
+        # always runs, whether or not an error occurred
+        ft.fini()
+    # NOTE(review): a status of -1 is typically reported as 255 by POSIX shells
+    sys.exit(0 if not err else -1)
diff --git a/src/tools/cephfs/top/setup.py b/src/tools/cephfs/top/setup.py
new file mode 100644
index 000000000..92fbd964c
--- /dev/null
+++ b/src/tools/cephfs/top/setup.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+# Version string of the cephfs-top package; passed to setup() below.
+__version__ = '0.0.1'
+
+# Trove classifiers describing maturity, audience, license and platform.
+_CLASSIFIERS = [
+    'Development Status :: 3 - Alpha',
+    'Environment :: Console',
+    'Intended Audience :: System Administrators',
+    'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+    'Operating System :: POSIX :: Linux',
+    'Programming Language :: Python :: 3',
+]
+
+# Package metadata; the cephfs-top script is installed as-is and requires
+# the rados module.
+_METADATA = {
+    'name': 'cephfs-top',
+    'version': __version__,
+    'description': 'top(1) like utility for Ceph Filesystem',
+    'keywords': 'cephfs, top',
+    'scripts': ['cephfs-top'],
+    'install_requires': ['rados'],
+    'classifiers': _CLASSIFIERS,
+    'license': 'LGPLv2+',
+}
+
+setup(**_METADATA)
diff --git a/src/tools/cephfs/top/tox.ini b/src/tools/cephfs/top/tox.ini
new file mode 100644
index 000000000..b125c0bc8
--- /dev/null
+++ b/src/tools/cephfs/top/tox.ini
@@ -0,0 +1,7 @@
+# tox configuration: a single py3 environment that lints the cephfs-top script.
+[tox]
+envlist = py3
+# no source distribution is built for this environment
+skipsdist = true
+
+[testenv:py3]
+deps = flake8
+# lint cephfs-top with a 100-column limit; W503 (line break before a binary
+# operator) is ignored
+commands = flake8 --ignore=W503 --max-line-length=100 cephfs-top
diff --git a/src/tools/cephfs/type_helper.hpp b/src/tools/cephfs/type_helper.hpp
new file mode 100644
index 000000000..2ec77c25c
--- /dev/null
+++ b/src/tools/cephfs/type_helper.hpp
@@ -0,0 +1,28 @@
+#ifndef TYPE_HELPER_HPP__
+#define TYPE_HELPER_HPP__
+
+// Convert `s` to type T1 by writing it to a std::stringstream and reading it
+// back with operator>>.
+//
+// T1 must be default-constructible and extractable from a stream; T2 must be
+// insertable into a stream. When nothing can be extracted (for example `s`
+// formats to an empty string) the value-initialised T1 is returned, i.e. 0
+// for arithmetic types and an empty string for std::string.
+//
+// The includer is expected to provide <sstream>.
+template<typename T1, typename T2>
+T1 conv_t(T2 s){
+    // Value-initialise: if the extraction below never runs (stream already at
+    // EOF) `target` is left untouched, and returning an uninitialised scalar
+    // would be an indeterminate value.
+    T1 target{};
+    std::stringstream conv;
+    conv << s;
+    conv >> target;
+    return target;
+}
+
+// Split `str` on every occurrence of the delimiter `split` and append the
+// pieces to `out` (existing elements of `out` are kept).
+//
+//   "a:b:c" -> "a", "b", "c"
+//   "a::b"  -> "a", "", "b"     (empty tokens between delimiters are kept)
+//   "a:b:"  -> "a", "b"         (no empty token after a trailing delimiter)
+//   "abc"   -> "abc"            (no delimiter: the whole string is one token)
+//
+// An empty delimiter cannot split anything, so `str` is appended unchanged.
+//
+// Declared `inline` because the function is defined in a header; without it,
+// including this header from two translation units is a multiple-definition
+// error. The function does not write to stdout.
+//
+// The includer is expected to provide <string> and <vector>.
+inline void string_split(std::string str, std::vector<std::string>& out, std::string split = ":") {
+    if (split.empty()) {
+        // find("") always matches at offset 0, so scanning would never advance
+        out.push_back(str);
+        return;
+    }
+    // Scan by index rather than re-copying the remaining text for every token.
+    std::string::size_type start = 0;
+    std::string::size_type pos = str.find(split, start);
+    while (pos != std::string::npos) {
+        out.push_back(str.substr(start, pos - start));
+        start = pos + split.size();
+        if (start >= str.size()) {
+            // the input ended with a delimiter: no trailing empty token
+            return;
+        }
+        pos = str.find(split, start);
+    }
+    out.push_back(str.substr(start));
+}
+#endif // TYPE_HELPER_HPP__
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
commit	e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree	64f88b554b444a49f656b6c656111a145cbbaa28 /src/tools/cephfs
parent	Initial commit. (diff)
download	ceph-upstream/18.2.2.tar.xz ceph-upstream/18.2.2.zip