summaryrefslogtreecommitdiffstats
path: root/src/tools/cephfs/JournalTool.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/tools/cephfs/JournalTool.cc')
-rw-r--r--src/tools/cephfs/JournalTool.cc1266
1 files changed, 1266 insertions, 0 deletions
diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc
new file mode 100644
index 000000000..6bca9bb08
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.cc
@@ -0,0 +1,1266 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include <sstream>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "JournalScanner.h"
+#include "EventOutput.h"
+#include "Dumper.h"
+#include "Resetter.h"
+
+#include "JournalTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+using namespace std;
+
+void JournalTool::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-journal-tool [options] journal <command>\n"
+ << " <command>:\n"
+ << " inspect\n"
+ << " import <path> [--force]\n"
+ << " export <path>\n"
+ << " reset [--force]\n"
+ << " cephfs-journal-tool [options] header <get|set> <field> <value>\n"
+ << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
+ << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
+ << " <selector>:\n"
+ << " --range=<start>..<end>\n"
+ << " --path=<substring>\n"
+ << " --inode=<integer>\n"
+ << " --type=<UPDATE|OPEN|SESSION...><\n"
+ << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
+ << " --client=<session id integer>\n"
+ << " <effect>: [get|recover_dentries|splice]\n"
+ << " <output>: [summary|list|binary|json] [--path <path>]\n"
+ << "\n"
+ << "General options:\n"
+ << " --rank=filesystem:{mds-rank|all} journal rank or \"all\" ranks (mandatory)\n"
+ << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n"
+ << " this journal is used to queue for purge operation,\n"
+ << " default is mdlog, and only mdlog support event mode)\n"
+ << "\n"
+ << "Special options\n"
+ << " --alternate-pool <name> Alternative metadata pool to target\n"
+ << " when using recover_dentries.\n";
+
+ generic_client_usage();
+}
+
+
+/**
+ * Handle arguments and hand off to journal/header/event mode
+ */
+int JournalTool::main(std::vector<const char*> &argv)
+{
+ int r;
+
+ dout(10) << "JournalTool::main " << dendl;
+ // Common arg parsing
+ // ==================
+ if (argv.empty()) {
+ cerr << "missing positional argument" << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+
+ std::string rank_str;
+ if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+ derr << "missing mandatory \"--rank\" argument" << dendl;
+ return -EINVAL;
+ }
+
+ if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) {
+ // Default is mdlog
+ type = "mdlog";
+ }
+
+ r = validate_type(type);
+ if (r != 0) {
+ derr << "journal type is not correct." << dendl;
+ return r;
+ }
+
+ r = role_selector.parse(*fsmap, rank_str, false);
+ if (r != 0) {
+ derr << "Couldn't determine MDS rank." << dendl;
+ return r;
+ }
+
+ std::string mode;
+ if (arg == argv.end()) {
+ derr << "Missing mode [journal|header|event]" << dendl;
+ return -EINVAL;
+ }
+ mode = std::string(*arg);
+ arg = argv.erase(arg);
+
+ // RADOS init
+ // ==========
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ ceph_assert(fs != nullptr);
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: creating IoCtx.." << dendl;
+ r = rados.ioctx_create(pool_name.c_str(), input);
+ ceph_assert(r == 0);
+ output.dup(input);
+
+ // Execution
+ // =========
+ // journal and header are general journal mode
+ // event mode is only specific for mdlog
+ auto roles = role_selector.get_roles();
+ if (roles.size() > 1) {
+ const std::string &command = argv[0];
+ bool allowed = can_execute_for_all_ranks(mode, command);
+ if (!allowed) {
+ derr << "operation not allowed for all ranks" << dendl;
+ return -EINVAL;
+ }
+
+ all_ranks = true;
+ }
+ for (auto role : roles) {
+ rank = role.rank;
+ std::vector<const char *> rank_argv(argv);
+ dout(4) << "Executing for rank " << rank << dendl;
+ if (mode == std::string("journal")) {
+ r = main_journal(rank_argv);
+ } else if (mode == std::string("header")) {
+ r = main_header(rank_argv);
+ } else if (mode == std::string("event")) {
+ r = main_event(rank_argv);
+ } else {
+ cerr << "Bad command '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+int JournalTool::validate_type(const std::string &type)
+{
+ if (type == "mdlog" || type == "purge_queue") {
+ return 0;
+ }
+ return -1;
+}
+
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+ if (!all_ranks) {
+ return prefix;
+ }
+
+ return prefix + "." + std::to_string(rank);
+}
+
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command) {
+ if (mode == "journal" && command == "import") {
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * Handle arguments for 'journal' mode
+ *
+ * This is for operations that act on the journal as a whole.
+ */
+int JournalTool::main_journal(std::vector<const char*> &argv)
+{
+ if (argv.empty()) {
+ derr << "Missing journal command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::string command = argv[0];
+ if (command == "inspect") {
+ return journal_inspect();
+ } else if (command == "export" || command == "import") {
+ bool force = false;
+ if (argv.size() >= 2) {
+ std::string const path = argv[1];
+ if (argv.size() == 3) {
+ if (std::string(argv[2]) == "--force") {
+ force = true;
+ } else {
+ std::cerr << "Unknown argument " << argv[1] << std::endl;
+ return -EINVAL;
+ }
+ }
+ return journal_export(path, command == "import", force);
+ } else {
+ derr << "Missing path" << dendl;
+ return -EINVAL;
+ }
+ } else if (command == "reset") {
+ bool force = false;
+ if (argv.size() == 2) {
+ if (std::string(argv[1]) == "--force") {
+ force = true;
+ } else {
+ std::cerr << "Unknown argument " << argv[1] << std::endl;
+ return -EINVAL;
+ }
+ } else if (argv.size() > 2) {
+ std::cerr << "Too many arguments!" << std::endl;
+ return -EINVAL;
+ }
+ return journal_reset(force);
+ } else {
+ derr << "Bad journal command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+}
+
+
+/**
+ * Parse arguments and execute for 'header' mode
+ *
+ * This is for operations that act on the header only.
+ */
+int JournalTool::main_header(std::vector<const char*> &argv)
+{
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ int r = js.scan(false);
+ if (r < 0) {
+ std::cerr << "Unable to scan journal" << std::endl;
+ return r;
+ }
+
+ if (!js.header_present) {
+ std::cerr << "Header object not found!" << std::endl;
+ return -ENOENT;
+ } else if (!js.header_valid && js.header == NULL) {
+ // Can't do a read or a single-field write without a copy of the original
+ derr << "Header could not be read!" << dendl;
+ return -ENOENT;
+ } else {
+ ceph_assert(js.header != NULL);
+ }
+
+ if (argv.empty()) {
+ derr << "Missing header command, must be [get|set]" << dendl;
+ return -EINVAL;
+ }
+ std::vector<const char *>::iterator arg = argv.begin();
+ std::string const command = *arg;
+ arg = argv.erase(arg);
+
+ if (command == std::string("get")) {
+ // Write JSON journal dump to stdout
+ JSONFormatter jf(true);
+ js.header->dump(&jf);
+ jf.flush(std::cout);
+ std::cout << std::endl;
+ } else if (command == std::string("set")) {
+ // Need two more args <key> <val>
+ if (argv.size() != 2) {
+ derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
+ return -EINVAL;
+ }
+
+ std::string const field_name = *arg;
+ arg = argv.erase(arg);
+
+ std::string const value_str = *arg;
+ arg = argv.erase(arg);
+ ceph_assert(argv.empty());
+
+ std::string parse_err;
+ uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t *field = NULL;
+ if (field_name == "trimmed_pos") {
+ field = &(js.header->trimmed_pos);
+ } else if (field_name == "expire_pos") {
+ field = &(js.header->expire_pos);
+ } else if (field_name == "write_pos") {
+ field = &(js.header->write_pos);
+ } else if (field_name == "pool_id") {
+ field = (uint64_t*)(&(js.header->layout.pool_id));
+ } else {
+ derr << "Invalid field '" << field_name << "'" << dendl;
+ return -EINVAL;
+ }
+
+ std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
+ *field = new_val;
+
+ dout(4) << "Writing object..." << dendl;
+ bufferlist header_bl;
+ encode(*(js.header), header_bl);
+ output.write_full(js.obj_name(0), header_bl);
+ dout(4) << "Write complete." << dendl;
+ std::cout << "Successfully updated header." << std::endl;
+ } else {
+ derr << "Bad header command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Parse arguments and execute for 'event' mode
+ *
+ * This is for operations that act on LogEvents within the log
+ */
+int JournalTool::main_event(std::vector<const char*> &argv)
+{
+ int r;
+
+ if (argv.empty()) {
+ derr << "Missing event command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+ bool dry_run = false;
+
+ std::string command = *(arg++);
+ if (command != "get" && command != "splice" && command != "recover_dentries") {
+ derr << "Unknown argument '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ if (command == "recover_dentries") {
+ if (type != "mdlog") {
+ derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl;
+ return -EINVAL;
+ } else {
+ if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
+ dry_run = true;
+ }
+ }
+ }
+
+ if (arg == argv.end()) {
+ derr << "Incomplete command line" << dendl;
+ return -EINVAL;
+ }
+
+ // Parse filter options
+ // ====================
+ JournalFilter filter(type);
+ r = filter.parse_args(argv, arg);
+ if (r) {
+ return r;
+ }
+
+ // Parse output options
+ // ====================
+ if (arg == argv.end()) {
+ cerr << "Missing output command" << std::endl;
+ return -EINVAL;
+ }
+ std::string output_style = *(arg++);
+ if (output_style != "binary" && output_style != "json" &&
+ output_style != "summary" && output_style != "list") {
+ cerr << "Unknown argument: '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string output_path = "dump";
+ while(arg != argv.end()) {
+ std::string arg_str;
+ if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+ output_path = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
+ nullptr)) {
+ dout(1) << "Using alternate pool " << arg_str << dendl;
+ int r = rados.ioctx_create(arg_str.c_str(), output);
+ ceph_assert(r == 0);
+ other_pool = true;
+ } else {
+ cerr << "Unknown argument: '" << *arg << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ const std::string dump_path = gen_dump_file_path(output_path);
+
+ // Execute command
+ // ===============
+ JournalScanner js(input, rank, type, filter);
+ if (command == "get") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+ } else if (command == "recover_dentries") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ /**
+ * Iterate over log entries, attempting to scavenge from each one
+ */
+ std::set<inodeno_t> consumed_inos;
+ for (JournalScanner::EventMap::iterator i = js.events.begin();
+ i != js.events.end(); ++i) {
+ auto& le = i->second.log_event;
+ EMetaBlob const *mb = le->get_metablob();
+ if (mb) {
+ int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
+ if (scav_r) {
+ dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
+ if (r == 0) {
+ r = scav_r;
+ }
+ // Our goal is to read all we can, so don't stop on errors, but
+ // do record them for possible later output
+ js.errors.insert(std::make_pair(i->first,
+ JournalScanner::EventError(scav_r, cpp_strerror(r))));
+ }
+ }
+ }
+
+ /**
+ * Update InoTable to reflect any inode numbers consumed during scavenge
+ */
+ dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
+ if (consumed_inos.size() && !dry_run) {
+ int consume_r = consume_inos(consumed_inos);
+ if (consume_r) {
+ dout(1) << "Error updating InoTable for " << consumed_inos.size()
+ << " consume inos: " << cpp_strerror(consume_r) << dendl;
+ if (r == 0) {
+ r = consume_r;
+ }
+ }
+ }
+
+ // Remove consumed dentries from lost+found.
+ if (other_pool && !dry_run) {
+ std::set<std::string> found;
+
+ for (auto i : consumed_inos) {
+ char s[20];
+
+ snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
+ dout(20) << "removing " << s << dendl;
+ found.insert(std::string(s));
+ }
+
+ object_t frag_oid;
+ frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
+ frag_t(), "");
+ output.omap_rm_keys(frag_oid.name, found);
+ }
+ } else if (command == "splice") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ uint64_t start, end;
+ if (filter.get_range(start, end)) {
+ // Special case for range filter: erase a numeric range in the log
+ uint64_t range = end - start;
+ int r = erase_region(js, start, range);
+ if (r) {
+ derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else {
+ // General case: erase a collection of individual entries in the log
+ for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
+ dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
+
+ int r = erase_region(js, i->first, i->second.raw_size);
+ if (r) {
+ derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+
+ } else {
+ cerr << "Unknown argument '" << command << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // Generate output
+ // ===============
+ EventOutput output(js, dump_path);
+ int output_result = 0;
+ if (output_style == "binary") {
+ output_result = output.binary();
+ } else if (output_style == "json") {
+ output_result = output.json();
+ } else if (output_style == "summary") {
+ output.summary();
+ } else if (output_style == "list") {
+ output.list();
+ } else {
+ std::cerr << "Bad output command '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (output_result != 0) {
+ std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
+ }
+
+ return output_result;
+}
+
+/**
+ * Provide the user with information about the condition of the journal,
+ * especially indicating what range of log events is available and where
+ * any gaps or corruptions in the journal are.
+ */
+int JournalTool::journal_inspect()
+{
+ int r;
+
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ r = js.scan();
+ if (r) {
+ std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
+ return r;
+ }
+
+ js.report(std::cout);
+
+ return 0;
+}
+
+
+/**
+ * Attempt to export a binary dump of the journal.
+ *
+ * This is allowed to fail if the header is malformed or there are
+ * objects inaccessible, in which case the user would have to fall
+ * back to manually listing RADOS objects and extracting them, which
+ * they can do with the ``rados`` CLI.
+ */
+int JournalTool::journal_export(std::string const &path, bool import, bool force)
+{
+ int r = 0;
+ JournalScanner js(input, rank, type);
+
+ if (!import) {
+ /*
+ * If doing an export, first check that the header is valid and
+ * no objects are missing before trying to dump
+ */
+ r = js.scan();
+ if (r < 0) {
+ derr << "Unable to scan journal, assuming badly damaged" << dendl;
+ return r;
+ }
+ if (!js.is_readable()) {
+ derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
+ return -EIO;
+ }
+ }
+
+ /*
+ * Assuming we can cleanly read the journal data, dump it out to a file
+ */
+ {
+ Dumper dumper;
+ r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type);
+ if (r < 0) {
+ derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (import) {
+ r = dumper.undump(path.c_str(), force);
+ } else {
+ const std::string ex_path = gen_dump_file_path(path);
+ r = dumper.dump(ex_path.c_str());
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Truncate journal and insert EResetJournal
+ */
+int JournalTool::journal_reset(bool hard)
+{
+ int r = 0;
+ Resetter resetter;
+ r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard);
+ if (r < 0) {
+ derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (hard) {
+ r = resetter.reset_hard();
+ } else {
+ r = resetter.reset();
+ }
+
+ return r;
+}
+
+
+/**
+ * Selective offline replay which only reads out dentries and writes
+ * them to the backing store iff their version is > what is currently
+ * in the backing store.
+ *
+ * In order to write dentries to the backing store, we may create the
+ * required enclosing dirfrag objects.
+ *
+ * Test this by running scavenge on an unflushed journal, then nuking
+ * it offline, then starting an MDS and seeing that the dentries are
+ * visible.
+ *
+ * @param metablob an EMetaBlob retrieved from the journal
+ * @param dry_run if true, do no writes to RADOS
+ * @param consumed_inos output, populated with any inos inserted
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos)
+{
+ ceph_assert(consumed_inos != NULL);
+
+ int r = 0;
+
+ // Replay fullbits (dentry+inode)
+ for (const auto& frag : metablob.lump_order) {
+ EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
+ lump._decode_bits();
+ object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
+
+ dout(4) << "inspecting lump " << frag_oid.name << dendl;
+
+
+ // We will record old fnode version for use in hard link handling
+ // If we don't read an old fnode, take version as zero and write in
+ // all hardlinks we find.
+ version_t old_fnode_version = 0;
+
+ // Update fnode in omap header of dirfrag object
+ bool write_fnode = false;
+ bufferlist old_fnode_bl;
+ r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
+ if (r == -ENOENT) {
+ // Creating dirfrag from scratch
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ write_fnode = true;
+ // Note: creating the dirfrag *without* a backtrace, relying on
+ // MDS to regenerate backtraces on read or in FSCK
+ } else if (r == 0) {
+ // Conditionally update existing omap header
+ fnode_t old_fnode;
+ auto old_fnode_iter = old_fnode_bl.cbegin();
+ try {
+ old_fnode.decode(old_fnode_iter);
+ dout(4) << "frag " << frag_oid.name << " fnode old v" <<
+ old_fnode.version << " vs new v" << lump.fnode->version << dendl;
+ old_fnode_version = old_fnode.version;
+ write_fnode = old_fnode_version < lump.fnode->version;
+ } catch (const buffer::error &err) {
+ dout(1) << "frag " << frag_oid.name
+ << " is corrupt, overwriting" << dendl;
+ write_fnode = true;
+ }
+ } else {
+ // Unexpected error
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if ((other_pool || write_fnode) && !dry_run) {
+ dout(4) << "writing fnode to omap header" << dendl;
+ bufferlist fnode_bl;
+ lump.fnode->encode(fnode_bl);
+ if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
+ r = output.omap_set_header(frag_oid.name, fnode_bl);
+ }
+ if (r != 0) {
+ derr << "Failed to write fnode for frag object "
+ << frag_oid.name << dendl;
+ return r;
+ }
+ }
+
+ std::set<std::string> read_keys;
+
+ // Compose list of potentially-existing dentries we would like to fetch
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for (const auto& nb : lump.get_dnull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ // Perform bulk read of existing dentries
+ std::map<std::string, bufferlist> read_vals;
+ r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ if (r == -ENOENT && other_pool) {
+ r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ }
+ if (r != 0) {
+ derr << "unexpected error reading fragment object "
+ << frag_oid.name << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Compose list of dentries we will write back
+ std::map<std::string, bufferlist> write_vals;
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ // leave write_dentry false, we have no version to
+ // compare with in a hardlink, so it's not safe to
+ // squash over it with what's in this fullbit
+ dout(10) << "Existing remote inode in slot to be (maybe) written "
+ << "by a full inode from the journal dn '" << fb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ // Read out inode version to compare with backing store
+ InodeStore inode;
+ if (dentry_type == 'i') {
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode.decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode.decode_bare(q);
+ }
+ dout(4) << "decoded embedded inode version "
+ << inode.inode->version << " vs fullbit version "
+ << fb.inode->version << dendl;
+ if (inode.inode->version < fb.inode->version) {
+ write_dentry = true;
+ }
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing I dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(fb.dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ encode_fullbit_as_inode(fb, true, &dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(fb.inode->ino);
+ }
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ dout(10) << "Existing full inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing L dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(rb.dnfirst, dentry_bl);
+ encode('L', dentry_bl);
+ encode(rb.ino, dentry_bl);
+ encode(rb.d_type, dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(rb.ino);
+ }
+ }
+
+ std::set<std::string> null_vals;
+ for (const auto& nb : lump.get_dnull()) {
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
+ << dendl;
+
+ auto it = read_vals.find(key);
+ if (it != read_vals.end()) {
+ dout(4) << "dentry exists, will remove" << dendl;
+
+ auto q = it->second.cbegin();
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ bool remove_dentry = false;
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ dout(10) << "Existing full inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode->version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, will remove" << dendl;
+ remove_dentry = true;
+ }
+
+ if (remove_dentry)
+ null_vals.insert(key);
+ }
+ }
+
+ // Write back any new/changed dentries
+ if (!write_vals.empty()) {
+ r = output.omap_set(frag_oid.name, write_vals);
+ if (r != 0) {
+ derr << "error writing dentries to " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // remove any null dentries
+ if (!null_vals.empty()) {
+ r = output.omap_rm_keys(frag_oid.name, null_vals);
+ if (r != 0) {
+ derr << "error removing dentries from " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ /* Now that we've looked at the dirlumps, we finally pay attention to
+ * the roots (i.e. inodes without ancestry). This is necessary in order
+ * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
+ * important because clients use them to infer completeness
+ * of directories
+ */
+ for (const auto& fb : metablob.roots) {
+ inodeno_t ino = fb.inode->ino;
+ dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
+
+ object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+ dout(4) << "object id " << root_oid.name << dendl;
+
+ bool write_root_ino = false;
+ bufferlist old_root_ino_bl;
+ r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
+ if (r == -ENOENT) {
+ dout(4) << "root does not exist, will create" << dendl;
+ write_root_ino = true;
+ } else if (r >= 0) {
+ r = 0;
+ InodeStore old_inode;
+ dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
+ << ")" << dendl;
+ auto inode_bl_iter = old_root_ino_bl.cbegin();
+ std::string magic;
+ decode(magic, inode_bl_iter);
+ if (magic == CEPH_FS_ONDISK_MAGIC) {
+ dout(4) << "magic ok" << dendl;
+ old_inode.decode(inode_bl_iter);
+
+ if (old_inode.inode->version < fb.inode->version) {
+ write_root_ino = true;
+ }
+ } else {
+ dout(4) << "magic bad: '" << magic << "'" << dendl;
+ write_root_ino = true;
+ }
+ } else {
+ derr << "error reading root inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (write_root_ino && !dry_run) {
+ dout(4) << "writing root ino " << root_oid.name
+ << " version " << fb.inode->version << dendl;
+
+ // Compose: root ino format is magic,InodeStore(bare=false)
+ bufferlist new_root_ino_bl;
+ encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
+ encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
+
+ // Write to RADOS
+ r = output.write_full(root_oid.name, new_root_ino_bl);
+ if (r != 0) {
+ derr << "error writing inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Erase a region of the log by overwriting it with ENoOp
+ *
+ */
+int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
+{
+ // To erase this region, we use our preamble, the encoding overhead
+ // of an ENoOp, and our trailing start ptr. Calculate how much padding
+ // is needed inside the ENoOp to make up the difference.
+ bufferlist tmp;
+ if (type == "mdlog") {
+ ENoOp enoop(0);
+ enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.encode(tmp);
+ }
+
+ dout(4) << "erase_region " << pos << " len=" << length << dendl;
+
+ // FIXME: get the preamble/postamble length via JournalStream
+ int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
+ dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
+
+ if (padding < 0) {
+ derr << "Erase region " << length << " too short" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist entry;
+ if (type == "mdlog") {
+ // Serialize an ENoOp with the correct amount of padding
+ ENoOp enoop(padding);
+ enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.pad_size = padding;
+ pi.encode(entry);
+ }
+ JournalStream stream(JOURNAL_FORMAT_RESILIENT);
+ // Serialize region of log stream
+ bufferlist log_data;
+ stream.write(entry, &log_data, pos);
+
+ dout(4) << "erase_region data length " << log_data.length() << dendl;
+ ceph_assert(log_data.length() == length);
+
+ // Write log stream region to RADOS
+ // FIXME: get object size somewhere common to scan_events
+ uint32_t object_size = g_conf()->mds_log_segment_size;
+ if (object_size == 0) {
+ // Default layout object size
+ object_size = file_layout_t::get_default().object_size;
+ }
+
+ uint64_t write_offset = pos;
+ uint64_t obj_offset = (pos / object_size);
+ int r = 0;
+ while(log_data.length()) {
+ std::string const oid = js.obj_name(obj_offset);
+ uint32_t offset_in_obj = write_offset % object_size;
+ uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
+
+ r = output.write(oid, log_data, write_len, offset_in_obj);
+ if (r < 0) {
+ return r;
+ } else {
+ dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
+ r = 0;
+ }
+
+ log_data.splice(0, write_len);
+ write_offset += write_len;
+ obj_offset++;
+ }
+
+ return r;
+}
+
+/**
+ * Given an EMetaBlob::fullbit containing an inode, write out
+ * the encoded inode in the format used by InodeStore (i.e. the
+ * backing store format)
+ *
+ * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
+ * on an offline InodeStore instance. It's way simpler, because we are just
+ * uncritically hauling the data between structs.
+ *
+ * @param fb a fullbit extracted from a journal entry
+ * @param bare if true, leave out [EN|DE]CODE_START decoration
+ * @param out_bl output, write serialized inode to this bufferlist
+ */
+void JournalTool::encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl)
+{
+ ceph_assert(out_bl != NULL);
+
+ // Compose InodeStore
+ InodeStore new_inode;
+ new_inode.inode = fb.inode;
+ new_inode.xattrs = fb.xattrs;
+ new_inode.dirfragtree = fb.dirfragtree;
+ new_inode.snap_blob = fb.snapbl;
+ new_inode.symlink = fb.symlink;
+ new_inode.old_inodes = fb.old_inodes;
+
+ // Serialize InodeStore
+ if (bare) {
+ new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else {
+ new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+}
+
+/**
+ * Given a list of inode numbers known to be in use by
+ * inodes in the backing store, ensure that none of these
+ * numbers are listed as free in the InoTables in the
+ * backing store.
+ *
+ * Used after injecting inodes into the backing store, to
+ * ensure that the same inode numbers are not subsequently
+ * used for new files during ordinary operation.
+ *
+ * @param inos list of inode numbers to be removed from
+ * free lists in InoTables
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
+{
+ int r = 0;
+
+ // InoTable is a per-MDS structure, so iterate over assigned ranks
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ std::set<mds_rank_t> in_ranks;
+ fs->mds_map.get_mds_set(in_ranks);
+
+ for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
+ rank_i != in_ranks.end(); ++rank_i)
+ {
+ // Compose object name
+ std::ostringstream oss;
+ oss << "mds" << *rank_i << "_inotable";
+ object_t inotable_oid = object_t(oss.str());
+
+ // Read object
+ bufferlist inotable_bl;
+ int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
+ if (read_r < 0) {
+ // Things are really bad if we can't read inotable. Beyond our powers.
+ derr << "unable to read inotable '" << inotable_oid.name << "': "
+ << cpp_strerror(read_r) << dendl;
+ r = r ? r : read_r;
+ continue;
+ }
+
+ // Deserialize InoTable
+ version_t inotable_ver;
+ auto q = inotable_bl.cbegin();
+ decode(inotable_ver, q);
+ InoTable ino_table(NULL);
+ ino_table.decode(q);
+
+ // Update InoTable in memory
+ bool inotable_modified = false;
+ for (std::set<inodeno_t>::iterator i = inos.begin();
+ i != inos.end(); ++i)
+ {
+ const inodeno_t ino = *i;
+ if (ino_table.force_consume(ino)) {
+ dout(4) << "Used ino 0x" << std::hex << ino << std::dec
+ << " requires inotable update" << dendl;
+ inotable_modified = true;
+ }
+ }
+
+ // Serialize and write InoTable
+ if (inotable_modified) {
+ inotable_ver += 1;
+ dout(4) << "writing modified inotable version " << inotable_ver << dendl;
+ bufferlist inotable_new_bl;
+ encode(inotable_ver, inotable_new_bl);
+ ino_table.encode_state(inotable_new_bl);
+ int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
+ if (write_r != 0) {
+ derr << "error writing modified inotable " << inotable_oid.name
+ << ": " << cpp_strerror(write_r) << dendl;
+ r = r ? r : read_r;
+ continue;
+ }
+ }
+ }
+
+ return r;
+}
+