diff options
Diffstat (limited to 'src/tools/cephfs')
38 files changed, 11117 insertions, 0 deletions
diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt new file mode 100644 index 000000000..5d40f8ffb --- /dev/null +++ b/src/tools/cephfs/CMakeLists.txt @@ -0,0 +1,58 @@ +set(cephfs_journal_tool_srcs + cephfs-journal-tool.cc + JournalTool.cc + JournalFilter.cc + JournalScanner.cc + EventOutput.cc + Dumper.cc + Resetter.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs}) +target_link_libraries(cephfs-journal-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs-meta-injection_srcs + cephfs-meta-injection.cc + MetaTool.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-meta-injection ${cephfs-meta-injection_srcs}) +target_link_libraries(cephfs-meta-injection librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs_table_tool_srcs + cephfs-table-tool.cc + TableTool.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-table-tool ${cephfs_table_tool_srcs}) +target_link_libraries(cephfs-table-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs_data_scan_srcs + cephfs-data-scan.cc + DataScan.cc + RoleSelector.cc + PgFiles.cc + MDSUtility.cc) +add_executable(cephfs-data-scan ${cephfs_data_scan_srcs}) +target_link_libraries(cephfs-data-scan librados cephfs mds osdc global + cls_cephfs_client + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +install(TARGETS + cephfs-journal-tool + cephfs-table-tool + cephfs-data-scan + DESTINATION bin) + +option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF) +if(WITH_CEPHFS_SHELL) + add_subdirectory(shell) +endif() + +option(WITH_CEPHFS_TOP "install cephfs-top utility" ON) +if(WITH_CEPHFS_TOP) + add_subdirectory(top) +endif() diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc new file mode 100644 index 000000000..9f942964d --- /dev/null +++ b/src/tools/cephfs/DataScan.cc @@ -0,0 +1,2239 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include <fstream> +#include "include/util.h" +#include "include/ceph_fs.h" + +#include "mds/CDentry.h" +#include "mds/CInode.h" +#include "mds/CDentry.h" +#include "mds/InoTable.h" +#include "mds/SnapServer.h" +#include "cls/cephfs/cls_cephfs_client.h" + +#include "PgFiles.h" +#include "DataScan.h" +#include "include/compat.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "datascan." << __func__ << ": " + +void DataScan::usage() +{ + std::cout << "Usage: \n" + << " cephfs-data-scan init [--force-init]\n" + << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n" + << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n" + << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n" + << " cephfs-data-scan scan_links\n" + << "\n" + << " --force-corrupt: overrite apparently corrupt structures\n" + << " --force-init: write root inodes even if they exist\n" + << " --force-pool: use data pool even if it is not in FSMap\n" + << " --worker_m: Maximum number of workers\n" + << " --worker_n: Worker number, range 0-(worker_m-1)\n" + << "\n" + << " cephfs-data-scan scan_frags [--force-corrupt]\n" + << " cephfs-data-scan cleanup <data pool name>\n" + << std::endl; + + generic_client_usage(); +} + +bool DataScan::parse_kwarg( + const std::vector<const char*> &args, + std::vector<const char *>::const_iterator &i, + int *r) +{ + if (i + 1 == args.end()) { + return false; + } + + const std::string arg(*i); + const std::string val(*(i + 1)); + + if (arg == std::string("--output-dir")) { + if (driver != NULL) { + derr << "Unexpected --output-dir: output already selected!" << dendl; + *r = -EINVAL; + return false; + } + dout(4) << "Using local file output to '" << val << "'" << dendl; + driver = new LocalFileDriver(val, data_io); + return true; + } else if (arg == std::string("--worker_n")) { + std::string err; + n = strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + std::cerr << "Invalid worker number '" << val << "'" << std::endl; + *r = -EINVAL; + return false; + } + return true; + } else if (arg == std::string("--worker_m")) { + std::string err; + m = strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + std::cerr << "Invalid worker count '" << val << "'" << std::endl; + *r = -EINVAL; + return false; + } + return true; + } else if (arg == std::string("--filter-tag")) { + filter_tag = val; + dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl; + return true; + } else if (arg == std::string("--filesystem")) { + std::shared_ptr<const Filesystem> fs; + *r = fsmap->parse_filesystem(val, &fs); + if (*r != 0) { + std::cerr << "Invalid filesystem '" << val << "'" << std::endl; + return false; + } + fscid = fs->fscid; + return true; + } else if (arg == std::string("--alternate-pool")) { + metadata_pool_name = val; + return true; + } else { + return false; + } +} + +bool DataScan::parse_arg( + const std::vector<const char*> &args, + std::vector<const char *>::const_iterator &i) +{ + const std::string arg(*i); + if (arg == "--force-pool") { + force_pool = true; + return true; + } else if (arg == "--force-corrupt") { + force_corrupt = true; + return true; + } else if (arg == "--force-init") { + force_init = true; + return true; + } else { + return false; + } +} + +int DataScan::main(const std::vector<const char*> &args) +{ + // Parse args + // ========== + if (args.size() < 1) { + cerr << "missing position argument" << std::endl; + return -EINVAL; + } + + // Common RADOS init: open metadata pool + // ===================================== + librados::Rados rados; + int r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable" << dendl; + return r; + } + + std::string const &command = args[0]; + std::string data_pool_name; + + std::string pg_files_path; + std::set<pg_t> pg_files_pgs; + + // Consume any known --key val or --flag arguments + for (std::vector<const char *>::const_iterator i = args.begin() + 1; + i != args.end(); ++i) { + if (parse_kwarg(args, i, &r)) { + // Skip the kwarg value field + ++i; + continue; + } else if (r) { + return r; + } + + if (parse_arg(args, i)) { + continue; + } + + // Trailing positional argument + if (i + 1 == args.end() && + (command == "scan_inodes" + || command == "scan_extents" + || command == "cleanup")) { + data_pool_name = *i; + continue; + } + + if (command == "pg_files") { + if (i == args.begin() + 1) { + pg_files_path = *i; + continue; + } else { + pg_t pg; + bool parsed = pg.parse(*i); + if (!parsed) { + std::cerr << "Invalid PG '" << *i << "'" << std::endl; + return -EINVAL; + } else { + pg_files_pgs.insert(pg); + continue; + } + } + + } + + // Fall through: unhandled + std::cerr << "Unknown argument '" << *i << "'" << std::endl; + return -EINVAL; + } + + // If caller didn't specify a namespace, try to pick + // one if only one exists + if (fscid == FS_CLUSTER_ID_NONE) { + if (fsmap->filesystem_count() == 1) { + fscid = fsmap->get_filesystem()->fscid; + } else { + std::cerr << "Specify a filesystem with --filesystem" << std::endl; + return -EINVAL; + } + } + auto fs = fsmap->get_filesystem(fscid); + ceph_assert(fs != nullptr); + + // Default to output to metadata pool + if (driver == NULL) { + driver = new MetadataDriver(); + driver->set_force_corrupt(force_corrupt); + driver->set_force_init(force_init); + dout(4) << "Using metadata pool output" << dendl; + } + + dout(4) << "connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + std::cerr << "couldn't connect to cluster: " << cpp_strerror(r) + << std::endl; + return r; + } + + r = driver->init(rados, metadata_pool_name, fsmap, fscid); + if (r < 0) { + return r; + } + + if (command == "pg_files") { + auto pge = PgFiles(objecter, pg_files_pgs); + pge.init(); + return pge.scan_path(pg_files_path); + } + + // Initialize data_io for those commands that need it + if (command == "scan_inodes" || + command == "scan_extents" || + command == "cleanup") { + if (data_pool_name.empty()) { + std::cerr << "Data pool not specified" << std::endl; + return -EINVAL; + } + + data_pool_id = rados.pool_lookup(data_pool_name.c_str()); + if (data_pool_id < 0) { + std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl; + return -ENOENT; + } else { + dout(4) << "data pool '" << data_pool_name + << "' has ID " << data_pool_id << dendl; + } + + if (!fs->mds_map.is_data_pool(data_pool_id)) { + std::cerr << "Warning: pool '" << data_pool_name << "' is not a " + "CephFS data pool!" << std::endl; + if (!force_pool) { + std::cerr << "Use --force-pool to continue" << std::endl; + return -EINVAL; + } + } + + dout(4) << "opening data pool '" << data_pool_name << "'" << dendl; + r = rados.ioctx_create(data_pool_name.c_str(), data_io); + if (r != 0) { + return r; + } + } + + // Initialize metadata_io from MDSMap for scan_frags + if (command == "scan_frags" || command == "scan_links") { + const auto fs = fsmap->get_filesystem(fscid); + if (fs == nullptr) { + std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl; + return -ENOENT; + } + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + std::cerr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" << std::endl; + return r; + } + + r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); + if (r != 0) { + return r; + } + + data_pools = fs->mds_map.get_data_pools(); + } + + // Finally, dispatch command + if (command == "scan_inodes") { + return scan_inodes(); + } else if (command == "scan_extents") { + return scan_extents(); + } else if (command == "scan_frags") { + return scan_frags(); + } else if (command == "scan_links") { + return scan_links(); + } else if (command == "cleanup") { + return cleanup(); + } else if (command == "init") { + return driver->init_roots(fs->mds_map.get_first_data_pool()); + } else { + std::cerr << "Unknown command '" << command << "'" << std::endl; + return -EINVAL; + } +} + +int MetadataDriver::inject_unlinked_inode( + inodeno_t inono, int mode, int64_t data_pool_id) +{ + const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode"); + + // Skip if exists + bool already_exists = false; + int r = root_exists(inono, &already_exists); + if (r) { + return r; + } + if (already_exists && !force_init) { + std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already" + " exists, skipping create. Use --force-init to overwrite" + " the existing object." << std::endl; + return 0; + } + + // Compose + InodeStore inode_data; + auto inode = inode_data.get_inode(); + inode->ino = inono; + inode->version = 1; + inode->xattr_version = 1; + inode->mode = 0500 | mode; + // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty + // (we won't actually give the *correct* dirstat here though) + inode->dirstat.nfiles = 1; + + inode->ctime = inode->mtime = ceph_clock_now(); + inode->nlink = 1; + inode->truncate_size = -1ull; + inode->truncate_seq = 1; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; + + // Force layout to default: should we let users override this so that + // they don't have to mount the filesystem to correct it? + inode->layout = file_layout_t::get_default(); + inode->layout.pool_id = data_pool_id; + inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + // Assume that we will get our stats wrong, and that we may + // be ignoring dirfrags that exist + inode_data.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE); + + if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) { + sr_t srnode; + srnode.seq = 1; + encode(srnode, inode_data.snap_blob); + } + + // Serialize + bufferlist inode_bl; + encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl); + inode_data.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + // Write + r = metadata_io.write_full(oid.name, inode_bl); + if (r != 0) { + derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl; + return r; + } + + return r; +} + +int MetadataDriver::root_exists(inodeno_t ino, bool *result) +{ + object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); + uint64_t size; + time_t mtime; + int r = metadata_io.stat(oid.name, &size, &mtime); + if (r == -ENOENT) { + *result = false; + return 0; + } else if (r < 0) { + return r; + } + + *result = true; + return 0; +} + +int MetadataDriver::init_roots(int64_t data_pool_id) +{ + int r = 0; + r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id); + if (r != 0) { + return r; + } + r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id); + if (r != 0) { + return r; + } + bool created = false; + r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created); + if (r != 0) { + return r; + } + + return 0; +} + +int MetadataDriver::check_roots(bool *result) +{ + int r; + r = root_exists(CEPH_INO_ROOT, result); + if (r != 0) { + return r; + } + if (!*result) { + return 0; + } + + r = root_exists(MDS_INO_MDSDIR(0), result); + if (r != 0) { + return r; + } + if (!*result) { + return 0; + } + + return 0; +} + +/** + * Stages: + * + * SERIAL init + * 0. Create root inodes if don't exist + * PARALLEL scan_extents + * 1. Size and mtime recovery: scan ALL objects, and update 0th + * objects with max size and max mtime seen. + * PARALLEL scan_inodes + * 2. Inode recovery: scan ONLY 0th objects, and inject metadata + * into dirfrag OMAPs, creating blank dirfrags as needed. No stats + * or rstats at this stage. Inodes without backtraces go into + * lost+found + * TODO: SERIAL "recover stats" + * 3. Dirfrag statistics: depth first traverse into metadata tree, + * rebuilding dir sizes. + * TODO PARALLEL "clean up" + * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged + * anything onto them) and remove any of the xattrs that we + * used for accumulating. + */ + + +int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id) +{ + if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) { + return -EINVAL; + } + + std::string err; + std::string inode_str = oid.substr(0, oid.find(".")); + *inode_no = strict_strtoll(inode_str.c_str(), 16, &err); + if (!err.empty()) { + return -EINVAL; + } + + std::string pos_string = oid.substr(oid.find(".") + 1); + *obj_id = strict_strtoll(pos_string.c_str(), 16, &err); + if (!err.empty()) { + return -EINVAL; + } + + return 0; +} + + +int DataScan::scan_extents() +{ + return forall_objects(data_io, false, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + // Read size + uint64_t size; + time_t mtime; + int r = data_io.stat(oid, &size, &mtime); + dout(10) << "handling object " << obj_name_ino + << "." << obj_name_offset << dendl; + if (r != 0) { + dout(4) << "Cannot stat '" << oid << "': skipping" << dendl; + return r; + } + + // I need to keep track of + // * The highest object ID seen + // * The size of the highest object ID seen + // * The largest object seen + // + // Given those things, I can later infer the object chunking + // size, the offset of the last object (chunk size * highest ID seen) + // and the actual size (offset of last object + size of highest ID seen) + // + // This logic doesn't take account of striping. + r = ClsCephFSClient::accumulate_inode_metadata( + data_io, + obj_name_ino, + obj_name_offset, + size, + mtime); + if (r < 0) { + derr << "Failed to accumulate metadata data from '" + << oid << "': " << cpp_strerror(r) << dendl; + return r; + } + + return r; + }); +} + +int DataScan::probe_filter(librados::IoCtx &ioctx) +{ + bufferlist filter_bl; + ClsCephFSClient::build_tag_filter("test", &filter_bl); + librados::ObjectCursor range_i; + librados::ObjectCursor range_end; + + std::vector<librados::ObjectItem> tmp_result; + librados::ObjectCursor tmp_next; + int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(), + 1, filter_bl, &tmp_result, &tmp_next); + + return r >= 0; +} + +int DataScan::forall_objects( + librados::IoCtx &ioctx, + bool untagged_only, + std::function<int(std::string, uint64_t, uint64_t)> handler + ) +{ + librados::ObjectCursor range_i; + librados::ObjectCursor range_end; + ioctx.object_list_slice( + ioctx.object_list_begin(), + ioctx.object_list_end(), + n, + m, + &range_i, + &range_end); + + + bufferlist filter_bl; + + bool legacy_filtering = false; + if (untagged_only) { + // probe to deal with older OSDs that don't support + // the cephfs pgls filtering mode + legacy_filtering = !probe_filter(ioctx); + if (!legacy_filtering) { + ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl); + } + } + + int r = 0; + while(range_i < range_end) { + std::vector<librados::ObjectItem> result; + int r = ioctx.object_list(range_i, range_end, 1, + filter_bl, &result, &range_i); + if (r < 0) { + derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto &i : result) { + const std::string &oid = i.oid; + uint64_t obj_name_ino = 0; + uint64_t obj_name_offset = 0; + r = parse_oid(oid, &obj_name_ino, &obj_name_offset); + if (r != 0) { + dout(4) << "Bad object name '" << oid << "', skipping" << dendl; + continue; + } + + if (untagged_only && legacy_filtering) { + dout(20) << "Applying filter to " << oid << dendl; + + // We are only interested in 0th objects during this phase: we touched + // the other objects during scan_extents + if (obj_name_offset != 0) { + dout(20) << "Non-zeroth object" << dendl; + continue; + } + + bufferlist scrub_tag_bl; + int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl); + if (r >= 0) { + std::string read_tag; + auto q = scrub_tag_bl.cbegin(); + try { + decode(read_tag, q); + if (read_tag == filter_tag) { + dout(20) << "skipping " << oid << " because it has the filter_tag" + << dendl; + continue; + } + } catch (const buffer::error &err) { + } + dout(20) << "read non-matching tag '" << read_tag << "'" << dendl; + } else { + dout(20) << "no tag read (" << r << ")" << dendl; + } + + } else if (untagged_only) { + ceph_assert(obj_name_offset == 0); + dout(20) << "OSD matched oid " << oid << dendl; + } + + int this_oid_r = handler(oid, obj_name_ino, obj_name_offset); + if (r == 0 && this_oid_r < 0) { + r = this_oid_r; + } + } + } + + return r; +} + +int DataScan::scan_inodes() +{ + bool roots_present; + int r = driver->check_roots(&roots_present); + if (r != 0) { + derr << "Unexpected error checking roots: '" + << cpp_strerror(r) << "'" << dendl; + return r; + } + + if (!roots_present) { + std::cerr << "Some or all system inodes are absent. Run 'init' from " + "one node before running 'scan_inodes'" << std::endl; + return -EIO; + } + + return forall_objects(data_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + + dout(10) << "handling object " + << std::hex << obj_name_ino << "." << obj_name_offset << std::dec + << dendl; + + AccumulateResult accum_res; + inode_backtrace_t backtrace; + file_layout_t loaded_layout = file_layout_t::get_default(); + r = ClsCephFSClient::fetch_inode_accumulate_result( + data_io, oid, &backtrace, &loaded_layout, &accum_res); + + if (r == -EINVAL) { + dout(4) << "Accumulated metadata missing from '" + << oid << ", did you run scan_extents?" << dendl; + return r; + } else if (r < 0) { + dout(4) << "Unexpected error loading accumulated metadata from '" + << oid << "': " << cpp_strerror(r) << dendl; + // FIXME: this creates situation where if a client has a corrupt + // backtrace/layout, we will fail to inject it. We should (optionally) + // proceed if the backtrace/layout is corrupt but we have valid + // accumulated metadata. + return r; + } + + const time_t file_mtime = accum_res.max_mtime; + uint64_t file_size = 0; + bool have_backtrace = !(backtrace.ancestors.empty()); + + // This is the layout we will use for injection, populated either + // from loaded_layout or from best guesses + file_layout_t guessed_layout; + guessed_layout.pool_id = data_pool_id; + + // Calculate file_size, guess the layout + if (accum_res.ceiling_obj_index > 0) { + uint32_t chunk_size = file_layout_t::get_default().object_size; + // When there are multiple objects, the largest object probably + // indicates the chunk size. But not necessarily, because files + // can be sparse. Only make this assumption if size seen + // is a power of two, as chunk sizes typically are. + if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) { + chunk_size = accum_res.max_obj_size; + } + + if (loaded_layout.pool_id == -1) { + // If no stashed layout was found, guess it + guessed_layout.object_size = chunk_size; + guessed_layout.stripe_unit = chunk_size; + guessed_layout.stripe_count = 1; + } else if (!loaded_layout.is_valid() || + loaded_layout.object_size < accum_res.max_obj_size) { + // If the max size seen exceeds what the stashed layout claims, then + // disbelieve it. Guess instead. Same for invalid layouts on disk. + dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino + << std::dec << ", ignoring in favour of best guess" << dendl; + guessed_layout.object_size = chunk_size; + guessed_layout.stripe_unit = chunk_size; + guessed_layout.stripe_count = 1; + } else { + // We have a stashed layout that we can't disprove, so apply it + guessed_layout = loaded_layout; + dout(20) << "loaded layout from xattr:" + << " os: " << guessed_layout.object_size + << " sc: " << guessed_layout.stripe_count + << " su: " << guessed_layout.stripe_unit + << dendl; + // User might have transplanted files from a pool with a different + // ID, so whatever the loaded_layout says, we'll force the injected + // layout to point to the pool we really read from + guessed_layout.pool_id = data_pool_id; + } + + if (guessed_layout.stripe_count == 1) { + // Unstriped file: simple chunking + file_size = guessed_layout.object_size * accum_res.ceiling_obj_index + + accum_res.ceiling_obj_size; + } else { + // Striped file: need to examine the last stripe_count objects + // in the file to determine the size. + + // How many complete (i.e. not last stripe) objects? + uint64_t complete_objs = 0; + if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) { + complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count; + } else { + complete_objs = 0; + } + + // How many potentially-short objects (i.e. last stripe set) objects? + uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs; + + dout(10) << "calculating striped size from complete objs: " + << complete_objs << ", partial objs: " << partial_objs + << dendl; + + // Maximum amount of data that may be in the incomplete objects + uint64_t incomplete_size = 0; + + // For each short object, calculate the max file size within it + // and accumulate the maximum + for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) { + char buf[60]; + snprintf(buf, sizeof(buf), "%llx.%08llx", + (long long unsigned)obj_name_ino, (long long unsigned)i); + + uint64_t osize(0); + time_t omtime(0); + r = data_io.stat(std::string(buf), &osize, &omtime); + if (r == 0) { + if (osize > 0) { + // Upper bound within this object + uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit + * (guessed_layout.stripe_unit * guessed_layout.stripe_count) + + (i % guessed_layout.stripe_count) + * guessed_layout.stripe_unit + (osize - 1) + % guessed_layout.stripe_unit + 1; + incomplete_size = std::max(incomplete_size, upper_size); + } + } else if (r == -ENOENT) { + // Absent object, treat as size 0 and ignore. + } else { + // Unexpected error, carry r to outer scope for handling. + break; + } + } + if (r != 0 && r != -ENOENT) { + derr << "Unexpected error checking size of ino 0x" << std::hex + << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl; + return r; + } + file_size = complete_objs * guessed_layout.object_size + + incomplete_size; + } + } else { + file_size = accum_res.ceiling_obj_size; + if (loaded_layout.pool_id < 0 + || loaded_layout.object_size < accum_res.max_obj_size) { + // No layout loaded, or inconsistent layout, use default + guessed_layout = file_layout_t::get_default(); + guessed_layout.pool_id = data_pool_id; + } else { + guessed_layout = loaded_layout; + } + } + + // Santity checking backtrace ino against object name + if (have_backtrace && backtrace.ino != obj_name_ino) { + dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino + << " doesn't match object name ino 0x" << obj_name_ino + << std::dec << dendl; + have_backtrace = false; + } + + InodeStore dentry; + build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry); + + // Inject inode to the metadata pool + if (have_backtrace) { + inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin()); + if (MDS_INO_IS_MDSDIR(root_bp.dirino)) { + /* Special case for strays: even if we have a good backtrace, + * don't put it in the stray dir, because while that would technically + * give it linkage it would still be invisible to the user */ + r = driver->inject_lost_and_found(obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } else { + /* Happy case: we will inject a named dentry for this inode */ + r = driver->inject_with_backtrace(backtrace, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " with backtrace: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + } else { + /* Backtrace-less case: we will inject a lost+found dentry */ + r = driver->inject_lost_and_found( + obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << obj_name_ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + + return r; + }); +} + +int DataScan::cleanup() +{ + // We are looking for only zeroth object + // + return forall_objects(data_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid); + if (r < 0) { + dout(4) << "Error deleting accumulated metadata from '" + << oid << "': " << cpp_strerror(r) << dendl; + } + return r; + }); +} + +bool DataScan::valid_ino(inodeno_t ino) const +{ + return (ino >= inodeno_t((1ull << 40))) + || (MDS_INO_IS_STRAY(ino)) + || (MDS_INO_IS_MDSDIR(ino)) + || ino == CEPH_INO_ROOT + || ino == CEPH_INO_CEPH; +} + +int DataScan::scan_links() +{ + MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver); + if (!metadata_driver) { + derr << "Unexpected --output-dir option for scan_links" << dendl; + return -EINVAL; + } + + interval_set<uint64_t> used_inos; + map<inodeno_t, int> remote_links; + map<snapid_t, SnapInfo> snaps; + snapid_t last_snap = 1; + snapid_t snaprealm_v2_since = 2; + + struct link_info_t { + inodeno_t dirino; + frag_t frag; + string name; + version_t version; + int nlink; + bool is_dir; + map<snapid_t, SnapInfo> snaps; + link_info_t() : version(0), nlink(0), is_dir(false) {} + link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) : + dirino(di), frag(df), name(n), + version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {} + dirfrag_t dirfrag() const { + return dirfrag_t(dirino, frag); + } + }; + map<inodeno_t, list<link_info_t> > dup_primaries; + map<inodeno_t, link_info_t> bad_nlink_inos; + map<inodeno_t, link_info_t> injected_inos; + + map<dirfrag_t, set<string> > to_remove; + + enum { + SCAN_INOS = 1, + CHECK_LINK, + }; + + for (int step = SCAN_INOS; step <= CHECK_LINK; step++) { + const librados::NObjectIterator it_end = metadata_io.nobjects_end(); + for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) { + const std::string oid = it->get_oid(); + + dout(10) << "step " << step << ": handling object " << oid << dendl; + + uint64_t dir_ino = 0; + uint64_t frag_id = 0; + int r = parse_oid(oid, &dir_ino, &frag_id); + if (r == -EINVAL) { + dout(10) << "Not a dirfrag: '" << oid << "'" << dendl; + continue; + } else { + // parse_oid can only do 0 or -EINVAL + ceph_assert(r == 0); + } + + if (!valid_ino(dir_ino)) { + dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl; + continue; + } + + std::map<std::string, bufferlist> items; + r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items); + if (r < 0) { + derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& p : items) { + auto q = p.second.cbegin(); + string dname; + snapid_t last; + dentry_key_t::decode_helper(p.first, dname, last); + + if (last != CEPH_NOSNAP) { + if (last > last_snap) + last_snap = last; + continue; + } + + try { + snapid_t dnfirst; + decode(dnfirst, q); + if (dnfirst <= CEPH_MAXSNAP) { + if (dnfirst - 1 > last_snap) + last_snap = dnfirst - 1; + } + char dentry_type; + decode(dentry_type, q); + mempool::mds_co::string alternate_name; + if (dentry_type == 'I' || dentry_type == 'i') { + InodeStore inode; + if (dentry_type == 'i') { + DECODE_START(2, q); + if (struct_v >= 2) + decode(alternate_name, q); + inode.decode(q); + DECODE_FINISH(q); + } else { + inode.decode_bare(q); + } + + inodeno_t ino = inode.inode->ino; + + if (step == SCAN_INOS) { + if (used_inos.contains(ino, 1)) { + dup_primaries[ino].size(); + } else { + used_inos.insert(ino); + } + } else if (step == CHECK_LINK) { + sr_t srnode; + if (inode.snap_blob.length()) { + auto p = inode.snap_blob.cbegin(); + decode(srnode, p); + for (auto it = srnode.snaps.begin(); + it != srnode.snaps.end(); ) { + if (it->second.ino != ino || + it->second.snapid != it->first) { + srnode.snaps.erase(it++); + } else { + ++it; + } + } + if (!srnode.past_parents.empty()) { + snapid_t last = srnode.past_parents.rbegin()->first; + if (last + 1 > snaprealm_v2_since) + snaprealm_v2_since = last + 1; + } + } + if (inode.old_inodes && !inode.old_inodes->empty()) { + auto _last_snap = inode.old_inodes->rbegin()->first; + if (_last_snap > last_snap) + last_snap = _last_snap; + } + auto q = dup_primaries.find(ino); + if (q != dup_primaries.end()) { + q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode)); + q->second.back().snaps.swap(srnode.snaps); + } else { + int nlink = 0; + auto r = remote_links.find(ino); + if (r != remote_links.end()) + nlink = r->second; + if (!MDS_INO_IS_STRAY(dir_ino)) + nlink++; + if (inode.inode->nlink != nlink) { + derr << "Bad nlink on " << ino << " expected " << nlink + << " has " << inode.inode->nlink << dendl; + bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode); + bad_nlink_inos[ino].nlink = nlink; + } + snaps.insert(make_move_iterator(begin(srnode.snaps)), + make_move_iterator(end(srnode.snaps))); + } + if (dnfirst == CEPH_NOSNAP) + injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode); + } + } else if (dentry_type == 'L' || dentry_type == 'l') { + inodeno_t ino; + unsigned char d_type; + CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q); + + if (step == SCAN_INOS) { + remote_links[ino]++; + } else if (step == CHECK_LINK) { + if (!used_inos.contains(ino, 1)) { + derr << "Bad remote link dentry 0x" << std::hex << dir_ino + << std::dec << "/" << dname + << ", ino " << ino << " not found" << dendl; + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + to_remove[dirfrag_t(dir_ino, frag_id)].insert(key); + } + } + } else { + derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + } catch (const buffer::error &err) { + derr << "Error decoding dentry 0x" << std::hex << dir_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + } + } + } + + map<unsigned, uint64_t> max_ino_map; + { + auto prev_max_ino = (uint64_t)1 << 40; + for (auto p = used_inos.begin(); p != used_inos.end(); ++p) { + auto cur_max = p.get_start() + p.get_len() - 1; + if (cur_max < prev_max_ino) + continue; // system inodes + + if ((prev_max_ino >> 40) != (cur_max >> 40)) { + unsigned rank = (prev_max_ino >> 40) - 1; + max_ino_map[rank] = prev_max_ino; + } else if ((p.get_start() >> 40) != (cur_max >> 40)) { + unsigned rank = (p.get_start() >> 40) - 1; + max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1; + } + prev_max_ino = cur_max; + } + unsigned rank = (prev_max_ino >> 40) - 1; + max_ino_map[rank] = prev_max_ino; + } + + used_inos.clear(); + + dout(10) << "processing " << dup_primaries.size() << " dup_primaries, " + << remote_links.size() << " remote_links" << dendl; + + for (auto& p : dup_primaries) { + + dout(10) << "handling dup " << p.first << dendl; + + link_info_t newest; + for (auto& q : p.second) { + if (q.version > newest.version) { + newest = q; + } else if (q.version == newest.version && + !MDS_INO_IS_STRAY(q.dirino) && + MDS_INO_IS_STRAY(newest.dirino)) { + newest = q; + } + } + + for (auto& q : p.second) { + // in the middle of dir fragmentation? + if (newest.dirino == q.dirino && newest.name == q.name) { + snaps.insert(make_move_iterator(begin(q.snaps)), + make_move_iterator(end(q.snaps))); + continue; + } + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str()); + dn_key.encode(key); + to_remove[q.dirfrag()].insert(key); + derr << "Remove duplicated ino 0x" << p.first << " from " + << q.dirfrag() << "/" << q.name << dendl; + } + + int nlink = 0; + auto q = remote_links.find(p.first); + if (q != remote_links.end()) + nlink = q->second; + if (!MDS_INO_IS_STRAY(newest.dirino)) + nlink++; + + if (nlink != newest.nlink) { + derr << "Bad nlink on " << p.first << " expected " << nlink + << " has " << newest.nlink << dendl; + bad_nlink_inos[p.first] = newest; + bad_nlink_inos[p.first].nlink = nlink; + } + } + dup_primaries.clear(); + remote_links.clear(); + + { + objecter->with_osdmap([&](const OSDMap& o) { + for (auto p : data_pools) { + const pg_pool_t *pi = o.get_pg_pool(p); + if (!pi) + continue; + if (pi->snap_seq > last_snap) + last_snap = pi->snap_seq; + } + }); + + if (!snaps.empty()) { + if (snaps.rbegin()->first > last_snap) + last_snap = snaps.rbegin()->first; + } + } + + dout(10) << "removing dup dentries from " << to_remove.size() << " objects" + << dendl; + + for (auto& p : to_remove) { + object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, ""); + + dout(10) << "removing dup dentries from " << p.first << dendl; + + int r = metadata_io.omap_rm_keys(frag_oid.name, p.second); + if (r != 0) { + derr << "Error removing duplicated dentries from " << p.first << dendl; + return r; + } + } + to_remove.clear(); + + dout(10) << "processing " << bad_nlink_inos.size() << " bad_nlink_inos" + << dendl; + + for (auto &p : bad_nlink_inos) { + dout(10) << "handling bad_nlink_ino " << p.first << dendl; + + InodeStore inode; + snapid_t first; + int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first); + if (r < 0) { + derr << "Unexpected error reading dentry " + << p.second.dirfrag() << "/" << p.second.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (inode.inode->ino != p.first || inode.inode->version != p.second.version) + continue; + + inode.get_inode()->nlink = p.second.nlink; + r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first); + if (r < 0) + return r; + } + + dout(10) << "processing " << injected_inos.size() << " injected_inos" + << dendl; + + for (auto &p : injected_inos) { + dout(10) << "handling injected_ino " << p.first << dendl; + + InodeStore inode; + snapid_t first; + int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first); + if (r < 0) { + derr << "Unexpected error reading dentry " + << p.second.dirfrag() << "/" << p.second.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (first != CEPH_NOSNAP) + continue; + + first = last_snap + 1; + r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first); + if (r < 0) + return r; + } + + dout(10) << "updating inotable" << dendl; + + for (auto& p : max_ino_map) { + InoTable inotable(nullptr); + inotable.set_rank(p.first); + bool dirty = false; + int r = metadata_driver->load_table(&inotable); + if (r < 0) { + inotable.reset_state(); + dirty = true; + } + if (inotable.force_consume_to(p.second)) + dirty = true; + if (dirty) { + r = metadata_driver->save_table(&inotable); + if (r < 0) + return r; + } + } + + dout(10) << "updating snaptable" << dendl; + + { + SnapServer snaptable; + snaptable.set_rank(0); + bool dirty = false; + int r = metadata_driver->load_table(&snaptable); + if (r < 0) { + snaptable.reset_state(); + dirty = true; + } + if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps)) + dirty = true; + if (dirty) { + r = metadata_driver->save_table(&snaptable); + if (r < 0) + return r; + } + } + return 0; +} + +int DataScan::scan_frags() +{ + bool roots_present; + int r = driver->check_roots(&roots_present); + if (r != 0) { + derr << "Unexpected error checking roots: '" + << cpp_strerror(r) << "'" << dendl; + return r; + } + + if (!roots_present) { + std::cerr << "Some or all system inodes are absent. Run 'init' from " + "one node before running 'scan_inodes'" << std::endl; + return -EIO; + } + + return forall_objects(metadata_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + r = parse_oid(oid, &obj_name_ino, &obj_name_offset); + if (r != 0) { + dout(4) << "Bad object name '" << oid << "', skipping" << dendl; + return r; + } + + if (obj_name_ino < (1ULL << 40)) { + // FIXME: we're skipping stray dirs here: if they're + // orphaned then we should be resetting them some other + // way + dout(10) << "Skipping system ino " << obj_name_ino << dendl; + return 0; + } + + AccumulateResult accum_res; + inode_backtrace_t backtrace; + + // Default to inherit layout (i.e. no explicit layout on dir) which is + // expressed as a zeroed layout struct (see inode_t::has_layout) + file_layout_t loaded_layout; + + int parent_r = 0; + bufferlist parent_bl; + int layout_r = 0; + bufferlist layout_bl; + bufferlist op_bl; + + librados::ObjectReadOperation op; + op.getxattr("parent", &parent_bl, &parent_r); + op.getxattr("layout", &layout_bl, &layout_r); + r = metadata_io.operate(oid, &op, &op_bl); + if (r != 0 && r != -ENODATA) { + derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl; + return r; + } + + if (parent_r != -ENODATA) { + try { + auto q = parent_bl.cbegin(); + backtrace.decode(q); + } catch (buffer::error &e) { + dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl; + if (!force_corrupt) { + return -EINVAL; + } else { + // Treat backtrace as absent: we'll inject into lost+found + backtrace = inode_backtrace_t(); + } + } + } + + if (layout_r != -ENODATA) { + try { + auto q = layout_bl.cbegin(); + decode(loaded_layout, q); + } catch (buffer::error &e) { + dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl; + if (!force_corrupt) { + return -EINVAL; + } + } + } + + bool have_backtrace = !(backtrace.ancestors.empty()); + + // Santity checking backtrace ino against object name + if (have_backtrace && backtrace.ino != obj_name_ino) { + dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino + << " doesn't match object name ino 0x" << obj_name_ino + << std::dec << dendl; + have_backtrace = false; + } + + uint64_t fnode_version = 0; + fnode_t fnode; + r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version); + if (r == -EINVAL) { + derr << "Corrupt fnode on " << oid << dendl; + if (force_corrupt) { + fnode.fragstat.mtime = 0; + fnode.fragstat.nfiles = 1; + fnode.fragstat.nsubdirs = 0; + fnode.accounted_fragstat = fnode.fragstat; + } else { + return r; + } + } + + InodeStore dentry; + build_dir_dentry(obj_name_ino, fnode.accounted_fragstat, + loaded_layout, &dentry); + + // Inject inode to the metadata pool + if (have_backtrace) { + inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin()); + if (MDS_INO_IS_MDSDIR(root_bp.dirino)) { + /* Special case for strays: even if we have a good backtrace, + * don't put it in the stray dir, because while that would technically + * give it linkage it would still be invisible to the user */ + r = driver->inject_lost_and_found(obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } else { + /* Happy case: we will inject a named dentry for this inode */ + r = driver->inject_with_backtrace(backtrace, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " with backtrace: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + } else { + /* Backtrace-less case: we will inject a lost+found dentry */ + r = driver->inject_lost_and_found( + obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << obj_name_ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + + return r; + }); +} + +int MetadataTool::read_fnode( + inodeno_t ino, frag_t frag, fnode_t *fnode, + uint64_t *last_version) +{ + ceph_assert(fnode != NULL); + + object_t frag_oid = InodeStore::get_object_name(ino, frag, ""); + bufferlist fnode_bl; + int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl); + *last_version = metadata_io.get_last_version(); + if (r < 0) { + return r; + } + + auto old_fnode_iter = fnode_bl.cbegin(); + try { + (*fnode).decode(old_fnode_iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + return 0; +} + +int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag, + const std::string &dname, InodeStore *inode, snapid_t *dnfirst) +{ + ceph_assert(inode != NULL); + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + + std::set<std::string> keys; + keys.insert(key); + std::map<std::string, bufferlist> vals; + object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, ""); + int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals); + dout(20) << "oid=" << frag_oid.name + << " dname=" << dname + << " frag=" << frag + << ", r=" << r << dendl; + if (r < 0) { + return r; + } + + if (vals.find(key) == vals.end()) { + dout(20) << key << " not found in result" << dendl; + return -ENOENT; + } + + try { + auto q = vals[key].cbegin(); + snapid_t first; + decode(first, q); + char dentry_type; + decode(dentry_type, q); + if (dentry_type == 'I' || dentry_type == 'i') { + if (dentry_type == 'i') { + mempool::mds_co::string alternate_name; + + DECODE_START(2, q); + if (struct_v >= 2) + decode(alternate_name, q); + inode->decode(q); + DECODE_FINISH(q); + } else { + inode->decode_bare(q); + } + } else { + dout(20) << "dentry type '" << dentry_type << "': cannot" + "read an inode out of that" << dendl; + return -EINVAL; + } + if (dnfirst) + *dnfirst = first; + } catch (const buffer::error &err) { + dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + + return 0; +} + +int MetadataDriver::load_table(MDSTable *table) +{ + object_t table_oid = table->get_object_name(); + + bufferlist table_bl; + int r = metadata_io.read(table_oid.name, table_bl, 0, 0); + if (r < 0) { + derr << "unable to read mds table '" << table_oid.name << "': " + << cpp_strerror(r) << dendl; + return r; + } + + try { + version_t table_ver; + auto p = table_bl.cbegin(); + decode(table_ver, p); + table->decode_state(p); + table->force_replay_version(table_ver); + } catch (const buffer::error &err) { + derr << "unable to decode mds table '" << table_oid.name << "': " + << err.what() << dendl; + return -EIO; + } + return 0; +} + +int MetadataDriver::save_table(MDSTable *table) +{ + object_t table_oid = table->get_object_name(); + + bufferlist table_bl; + encode(table->get_version(), table_bl); + table->encode_state(table_bl); + int r = metadata_io.write_full(table_oid.name, table_bl); + if (r != 0) { + derr << "error updating mds table " << table_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +int MetadataDriver::inject_lost_and_found( + inodeno_t ino, const InodeStore &dentry) +{ + // Create lost+found if doesn't exist + bool created = false; + int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created); + if (r < 0) { + return r; + } + InodeStore lf_ino; + r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino); + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + + // To have a directory not specify a layout, give it zeros (see + // inode_t::has_layout) + file_layout_t inherit_layout; + + // Construct LF inode + frag_info_t fragstat; + fragstat.nfiles = 1, + build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino); + + // Inject link to LF inode in the root dir + r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino); + if (r < 0) { + return r; + } + } else { + if (!(lf_ino.inode->mode & S_IFDIR)) { + derr << "lost+found exists but is not a directory!" << dendl; + // In this case we error out, and the user should do something about + // this problem. + return -EINVAL; + } + } + + r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created); + if (r < 0) { + return r; + } + + const std::string dname = lost_found_dname(ino); + + // Write dentry into lost+found dirfrag + return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry); +} + + +int MetadataDriver::get_frag_of( + inodeno_t dirino, + const std::string &target_dname, + frag_t *result_ft) +{ + object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), ""); + + dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl; + + // Find and load fragtree if existing dirfrag + // ========================================== + bool have_backtrace = false; + bufferlist parent_bl; + int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl); + if (r == -ENODATA) { + dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl; + } else if (r < 0) { + dout(4) << "Unexpected error on '" << root_frag_oid << "': " + << cpp_strerror(r) << dendl; + return r; + } + + // Deserialize backtrace + inode_backtrace_t backtrace; + if (parent_bl.length()) { + try { + auto q = parent_bl.cbegin(); + backtrace.decode(q); + have_backtrace = true; + } catch (buffer::error &e) { + dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " + << e.what() << dendl; + } + } + + if (!(have_backtrace && backtrace.ancestors.size())) { + // Can't work out fragtree without a backtrace + dout(4) << "No backtrace on '" << root_frag_oid + << "': cannot determine fragtree" << dendl; + return -ENOENT; + } + + // The parentage of dirino + const inode_backpointer_t &bp = *(backtrace.ancestors.begin()); + + // The inode of dirino's parent + const inodeno_t parent_ino = bp.dirino; + + // The dname of dirino in its parent. + const std::string &parent_dname = bp.dname; + + dout(20) << "got backtrace parent " << parent_ino << "/" + << parent_dname << dendl; + + // The primary dentry for dirino + InodeStore existing_dentry; + + // See if we can find ourselves in dirfrag zero of the parent: this + // is a fast path that avoids needing to go further up the tree + // if the parent isn't fragmented (worst case we would have to + // go all the way to the root) + r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry); + if (r >= 0) { + // Great, fast path: return the fragtree from here + if (existing_dentry.inode->ino != dirino) { + dout(4) << "Unexpected inode in dentry! 0x" << std::hex + << existing_dentry.inode->ino + << " vs expected 0x" << dirino << std::dec << dendl; + return -ENOENT; + } + dout(20) << "fast path, fragtree is " + << existing_dentry.dirfragtree << dendl; + *result_ft = existing_dentry.pick_dirfrag(target_dname); + dout(20) << "frag is " << *result_ft << dendl; + return 0; + } else if (r != -ENOENT) { + // Dentry not present in 0th frag, must read parent's fragtree + frag_t parent_frag; + r = get_frag_of(parent_ino, parent_dname, &parent_frag); + if (r == 0) { + // We have the parent fragtree, so try again to load our dentry + r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry); + if (r >= 0) { + // Got it! + *result_ft = existing_dentry.pick_dirfrag(target_dname); + dout(20) << "resolved via parent, frag is " << *result_ft << dendl; + return 0; + } else { + if (r == -EINVAL || r == -ENOENT) { + return -ENOENT; // dentry missing or corrupt, so frag is missing + } else { + return r; + } + } + } else { + // Couldn't resolve parent fragtree, so can't find ours. + return r; + } + } else if (r == -EINVAL) { + // Unreadable dentry, can't know the fragtree. + return -ENOENT; + } else { + // Unexpected error, raise it + return r; + } +} + + +int MetadataDriver::inject_with_backtrace( + const inode_backtrace_t &backtrace, const InodeStore &dentry) + +{ + + // On dirfrags + // =========== + // In order to insert something into a directory, we first (ideally) + // need to know the fragtree for the directory. Sometimes we can't + // get that, in which case we just go ahead and insert it into + // fragment zero for a good chance of that being the right thing + // anyway (most moderate-sized dirs aren't fragmented!) + + // On ancestry + // =========== + // My immediate ancestry should be correct, so if we can find that + // directory's dirfrag then go inject it there. This works well + // in the case that this inode's dentry was somehow lost and we + // are recreating it, because the rest of the hierarchy + // will probably still exist. + // + // It's more of a "better than nothing" approach when rebuilding + // a whole tree, as backtraces will in general not be up to date + // beyond the first parent, if anything in the trace was ever + // moved after the file was created. + + // On inode numbers + // ================ + // The backtrace tells us inodes for each of the parents. If we are + // creating those parent dirfrags, then there is a risk that somehow + // the inode indicated here was also used for data (not a dirfrag) at + // some stage. That would be a zany situation, and we don't check + // for it here, because to do so would require extra IOs for everything + // we inject, and anyway wouldn't guarantee that the inode number + // wasn't in use in some dentry elsewhere in the metadata tree that + // just happened not to have any data objects. + + // On multiple workers touching the same traces + // ============================================ + // When creating linkage for a directory, *only* create it if we are + // also creating the object. That way, we might not manage to get the + // *right* linkage for a directory, but at least we won't multiply link + // it. We assume that if a root dirfrag exists for a directory, then + // it is linked somewhere (i.e. that the metadata pool is not already + // inconsistent). + // + // Making sure *that* is true is someone else's job! Probably someone + // who is not going to run in parallel, so that they can self-consistently + // look at versions and move things around as they go. + // Note this isn't 100% safe: if we die immediately after creating dirfrag + // object, next run will fail to create linkage for the dirfrag object + // and leave it orphaned. + + inodeno_t ino = backtrace.ino; + dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl; + for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin(); + i != backtrace.ancestors.end(); ++i) { + const inode_backpointer_t &backptr = *i; + dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec + << "/" << backptr.dname << dendl; + + // Examine root dirfrag for parent + const inodeno_t parent_ino = backptr.dirino; + const std::string dname = backptr.dname; + + frag_t fragment; + int r = get_frag_of(parent_ino, dname, &fragment); + if (r == -ENOENT) { + // Don't know fragment, fall back to assuming root + dout(20) << "don't know fragment for 0x" << std::hex << + parent_ino << std::dec << "/" << dname << ", will insert to root" + << dendl; + } + + // Find or create dirfrag + // ====================== + bool created_dirfrag; + r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag); + if (r < 0) { + return r; + } + + // Check if dentry already exists + // ============================== + InodeStore existing_dentry; + r = read_dentry(parent_ino, fragment, dname, &existing_dentry); + bool write_dentry = false; + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + // Missing or corrupt dentry + write_dentry = true; + } else if (r < 0) { + derr << "Unexpected error reading dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << ": " << cpp_strerror(r) << dendl; + break; + } else { + // Dentry already present, does it link to me? + if (existing_dentry.inode->ino == ino) { + dout(20) << "Dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << " already exists and points to me" << dendl; + } else { + derr << "Dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << " already exists but points to 0x" + << std::hex << existing_dentry.inode->ino << std::dec << dendl; + // Fall back to lost+found! + return inject_lost_and_found(backtrace.ino, dentry); + } + } + + // Inject linkage + // ============== + + if (write_dentry) { + if (i == backtrace.ancestors.begin()) { + // This is the linkage for the file of interest + dout(10) << "Linking inode 0x" << std::hex << ino + << " at 0x" << parent_ino << "/" << dname << std::dec + << " with size=" << dentry.inode->size << " bytes" << dendl; + + r = inject_linkage(parent_ino, dname, fragment, dentry); + } else { + // This is the linkage for an ancestor directory + InodeStore ancestor_dentry; + auto inode = ancestor_dentry.get_inode(); + inode->mode = 0755 | S_IFDIR; + + // Set nfiles to something non-zero, to fool any other code + // that tries to ignore 'empty' directories. This won't be + // accurate, but it should avoid functional issues. + + inode->dirstat.nfiles = 1; + inode->dir_layout.dl_dir_hash = + g_conf()->mds_default_dir_hash; + + inode->nlink = 1; + inode->ino = ino; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; + inode->version = 1; + inode->backtrace_version = 1; + r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry); + } + + if (r < 0) { + return r; + } + } + + if (!created_dirfrag) { + // If the parent dirfrag already existed, then stop traversing the + // backtrace: assume that the other ancestors already exist too. This + // is an assumption rather than a truth, but it's a convenient way + // to avoid the risk of creating multiply-linked directories while + // injecting data. If there are in fact missing ancestors, this + // should be fixed up using a separate tool scanning the metadata + // pool. + break; + } else { + // Proceed up the backtrace, creating parents + ino = parent_ino; + } + } + + return 0; +} + +int MetadataDriver::find_or_create_dirfrag( + inodeno_t ino, + frag_t fragment, + bool *created) +{ + ceph_assert(created != NULL); + + fnode_t existing_fnode; + *created = false; + + uint64_t read_version = 0; + int r = read_fnode(ino, fragment, &existing_fnode, &read_version); + dout(10) << "read_version = " << read_version << dendl; + + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + + // Missing or corrupt fnode, create afresh + bufferlist fnode_bl; + fnode_t blank_fnode; + blank_fnode.version = 1; + // mark it as non-empty + blank_fnode.fragstat.nfiles = 1; + blank_fnode.accounted_fragstat = blank_fnode.fragstat; + blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS); + blank_fnode.encode(fnode_bl); + + + librados::ObjectWriteOperation op; + + if (read_version) { + ceph_assert(r == -EINVAL); + // Case A: We must assert that the version isn't changed since we saw the object + // was unreadable, to avoid the possibility of two data-scan processes + // both creating the frag. + op.assert_version(read_version); + } else { + ceph_assert(r == -ENOENT); + // Case B: The object didn't exist in read_fnode, so while creating it we must + // use an exclusive create to correctly populate *creating with + // whether we created it ourselves or someone beat us to it. + op.create(true); + } + + object_t frag_oid = InodeStore::get_object_name(ino, fragment, ""); + op.omap_set_header(fnode_bl); + r = metadata_io.operate(frag_oid.name, &op); + if (r == -EOVERFLOW || r == -EEXIST) { + // Someone else wrote it (see case A above) + dout(10) << "Dirfrag creation race: 0x" << std::hex + << ino << " " << fragment << std::dec << dendl; + *created = false; + return 0; + } else if (r < 0) { + // We were unable to create or write it, error out + derr << "Failed to create dirfrag 0x" << std::hex + << ino << std::dec << ": " << cpp_strerror(r) << dendl; + return r; + } else { + // Success: the dirfrag object now exists with a value header + dout(10) << "Created dirfrag: 0x" << std::hex + << ino << std::dec << dendl; + *created = true; + } + } else if (r < 0) { + derr << "Unexpected error reading dirfrag 0x" << std::hex + << ino << std::dec << " : " << cpp_strerror(r) << dendl; + return r; + } else { + dout(20) << "Dirfrag already exists: 0x" << std::hex + << ino << " " << fragment << std::dec << dendl; + } + + return 0; +} + +int MetadataDriver::inject_linkage( + inodeno_t dir_ino, const std::string &dname, + const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst) +{ + object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, ""); + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + + bufferlist dentry_bl; + encode(dnfirst, dentry_bl); + encode('I', dentry_bl); + inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + // Write out + std::map<std::string, bufferlist> vals; + vals[key] = dentry_bl; + int r = metadata_io.omap_set(frag_oid.name, vals); + if (r != 0) { + derr << "Error writing dentry 0x" << std::hex + << dir_ino << std::dec << "/" + << dname << ": " << cpp_strerror(r) << dendl; + return r; + } else { + dout(20) << "Injected dentry 0x" << std::hex + << dir_ino << "/" << dname << " pointing to 0x" + << inode.inode->ino << std::dec << dendl; + return 0; + } +} + + +int MetadataDriver::init( + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) +{ + if (metadata_pool_name.empty()) { + auto fs = fsmap->get_filesystem(fscid); + ceph_assert(fs != nullptr); + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + derr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" << dendl; + return r; + } + dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl; + } else { + dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl; + } + return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); +} + +int LocalFileDriver::init( + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) +{ + return 0; +} + +int LocalFileDriver::inject_data( + const std::string &file_path, + uint64_t size, + uint32_t chunk_size, + inodeno_t ino) +{ + // Scrape the file contents out of the data pool and into the + // local filesystem + std::fstream f; + f.open(file_path.c_str(), std::fstream::out | std::fstream::binary); + + for (uint64_t offset = 0; offset < size; offset += chunk_size) { + bufferlist bl; + + char buf[32]; + snprintf(buf, sizeof(buf), + "%llx.%08llx", + (unsigned long long)ino, + (unsigned long long)(offset / chunk_size)); + std::string oid(buf); + + int r = data_io.read(oid, bl, chunk_size, 0); + + if (r <= 0 && r != -ENOENT) { + derr << "error reading data object '" << oid << "': " + << cpp_strerror(r) << dendl; + f.close(); + return r; + } else if (r >=0) { + + f.seekp(offset); + bl.write_stream(f); + } + } + f.close(); + + return 0; +} + + +int LocalFileDriver::inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) +{ + std::string path_builder = path; + + // Iterate through backtrace creating directory parents + std::vector<inode_backpointer_t>::const_reverse_iterator i; + for (i = bt.ancestors.rbegin(); + i != bt.ancestors.rend(); ++i) { + + const inode_backpointer_t &backptr = *i; + path_builder += "/"; + path_builder += backptr.dname; + + // Last entry is the filename itself + bool is_file = (i + 1 == bt.ancestors.rend()); + if (is_file) { + // FIXME: inject_data won't cope with interesting (i.e. striped) + // layouts (need a librados-compatible Filer to read these) + inject_data(path_builder, dentry.inode->size, + dentry.inode->layout.object_size, bt.ino); + } else { + int r = mkdir(path_builder.c_str(), 0755); + if (r != 0 && r != -EPERM) { + derr << "error creating directory: '" << path_builder << "': " + << cpp_strerror(r) << dendl; + return r; + } + } + } + + return 0; +} + +int LocalFileDriver::inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) +{ + std::string lf_path = path + "/lost+found"; + int r = mkdir(lf_path.c_str(), 0755); + if (r != 0 && r != -EPERM) { + derr << "error creating directory: '" << lf_path << "': " + << cpp_strerror(r) << dendl; + return r; + } + + std::string file_path = lf_path + "/" + lost_found_dname(ino); + return inject_data(file_path, dentry.inode->size, + dentry.inode->layout.object_size, ino); +} + +int LocalFileDriver::init_roots(int64_t data_pool_id) +{ + // Ensure that the path exists and is a directory + bool exists; + int r = check_roots(&exists); + if (r != 0) { + return r; + } + + if (exists) { + return 0; + } else { + return ::mkdir(path.c_str(), 0755); + } +} + +int LocalFileDriver::check_roots(bool *result) +{ + // Check if the path exists and is a directory + DIR *d = ::opendir(path.c_str()); + if (d == NULL) { + *result = false; + } else { + int r = closedir(d); + if (r != 0) { + // Weird, but maybe possible with e.g. stale FD on NFS mount? + *result = false; + } else { + *result = true; + } + } + + return 0; +} + +void MetadataTool::build_file_dentry( + inodeno_t ino, uint64_t file_size, time_t file_mtime, + const file_layout_t &layout, InodeStore *out) +{ + ceph_assert(out != NULL); + + auto inode = out->get_inode(); + inode->mode = 0500 | S_IFREG; + inode->size = file_size; + inode->max_size_ever = file_size; + inode->mtime.tv.tv_sec = file_mtime; + inode->atime.tv.tv_sec = file_mtime; + inode->ctime.tv.tv_sec = file_mtime; + + inode->layout = layout; + + inode->truncate_seq = 1; + inode->truncate_size = -1ull; + + inode->inline_data.version = CEPH_INLINE_NONE; + + inode->nlink = 1; + inode->ino = ino; + inode->version = 1; + inode->backtrace_version = 1; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; +} + +void MetadataTool::build_dir_dentry( + inodeno_t ino, const frag_info_t &fragstat, + const file_layout_t &layout, InodeStore *out) +{ + ceph_assert(out != NULL); + + auto inode = out->get_inode(); + inode->mode = 0755 | S_IFDIR; + inode->dirstat = fragstat; + inode->mtime.tv.tv_sec = fragstat.mtime; + inode->atime.tv.tv_sec = fragstat.mtime; + inode->ctime.tv.tv_sec = fragstat.mtime; + + inode->layout = layout; + inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + inode->truncate_seq = 1; + inode->truncate_size = -1ull; + + inode->inline_data.version = CEPH_INLINE_NONE; + + inode->nlink = 1; + inode->ino = ino; + inode->version = 1; + inode->backtrace_version = 1; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; +} + diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h new file mode 100644 index 000000000..5c87fe2bd --- /dev/null +++ b/src/tools/cephfs/DataScan.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "MDSUtility.h" +#include "include/rados/librados.hpp" + +class InodeStore; +class MDSTable; + +class RecoveryDriver { + protected: + // If true, overwrite structures that generate decoding errors. + bool force_corrupt; + + // If true, overwrite root objects during init_roots even if they + // exist + bool force_init; + + public: + virtual int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) = 0; + + void set_force_corrupt(const bool val) + { + force_corrupt = val; + } + + void set_force_init(const bool val) + { + force_init = val; + } + + + /** + * Inject an inode + dentry parents into the metadata pool, + * based on a backtrace recovered from the data pool + */ + virtual int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) = 0; + + /** + * Inject an inode + dentry into the lost+found directory, + * when all we know about a file is its inode. + */ + virtual int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) = 0; + + /** + * Create any missing roots (i.e. mydir, strays, root inode) + */ + virtual int init_roots( + int64_t data_pool_id) = 0; + + /** + * Pre-injection check that all the roots are present in + * the metadata pool. Used to avoid parallel workers interfering + * with one another, by cueing the user to go run 'init' on a + * single node before running a parallel scan. + * + * @param result: set to true if roots are present, else set to false + * @returns 0 on no unexpected errors, else error code. Missing objects + * are not considered an unexpected error: check *result for + * this case. + */ + virtual int check_roots(bool *result) = 0; + + /** + * Helper to compose dnames for links to lost+found + * inodes. + */ + std::string lost_found_dname(inodeno_t ino) + { + char s[20]; + snprintf(s, sizeof(s), "%llx", (unsigned long long)ino); + return std::string(s); + } + + RecoveryDriver() + : force_corrupt(false), + force_init(false) + {} + + virtual ~RecoveryDriver() {} +}; + +class LocalFileDriver : public RecoveryDriver +{ + protected: + const std::string path; + librados::IoCtx &data_io; + + int inject_data( + const std::string &file_path, + uint64_t size, + uint32_t chunk_size, + inodeno_t ino); + public: + + LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_) + : RecoveryDriver(), path(path_), data_io(data_io_) + {} + + // Implement RecoveryDriver interface + int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) override; + + int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) override; + + int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) override; + + int init_roots(int64_t data_pool_id) override; + + int check_roots(bool *result) override; +}; + +/** + * A class that knows how to work with objects in a CephFS + * metadata pool. + */ +class MetadataTool +{ + protected: + + librados::IoCtx metadata_io; + + /** + * Construct a synthetic InodeStore for a normal file + */ + void build_file_dentry( + inodeno_t ino, uint64_t file_size, time_t file_mtime, + const file_layout_t &layout, + InodeStore *out); + + /** + * Construct a synthetic InodeStore for a directory + */ + void build_dir_dentry( + inodeno_t ino, + const frag_info_t &fragstat, + const file_layout_t &layout, + InodeStore *out); + + /** + * Try and read an fnode from a dirfrag + */ + int read_fnode(inodeno_t ino, frag_t frag, + fnode_t *fnode, uint64_t *read_version); + + /** + * Try and read a dentry from a dirfrag + */ + int read_dentry(inodeno_t parent_ino, frag_t frag, + const std::string &dname, InodeStore *inode, snapid_t *dnfirst=nullptr); +}; + +/** + * A class that knows how to manipulate CephFS metadata pools + */ +class MetadataDriver : public RecoveryDriver, public MetadataTool +{ + protected: + /** + * Create a .inode object, i.e. root or mydir + */ + int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id); + + /** + * Check for existence of .inode objects, before + * trying to go ahead and inject metadata. + */ + int root_exists(inodeno_t ino, bool *result); + int find_or_create_dirfrag( + inodeno_t ino, + frag_t fragment, + bool *created); + + + /** + * Work out which fragment of a directory should contain a named + * dentry, recursing up the trace as necessary to retrieve + * fragtrees. + */ + int get_frag_of( + inodeno_t dirino, + const std::string &dname, + frag_t *result_ft); + + public: + + // Implement RecoveryDriver interface + int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) override; + + int inject_linkage( + inodeno_t dir_ino, const std::string &dname, + const frag_t fragment, const InodeStore &inode, snapid_t dnfirst=CEPH_NOSNAP); + + int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) override; + + int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) override; + + int init_roots(int64_t data_pool_id) override; + + int check_roots(bool *result) override; + + int load_table(MDSTable *table); + int save_table(MDSTable *table); +}; + +class DataScan : public MDSUtility, public MetadataTool +{ + protected: + RecoveryDriver *driver; + fs_cluster_id_t fscid; + + string metadata_pool_name; + std::vector<int64_t> data_pools; + + // IoCtx for data pool (where we scrape file backtraces from) + librados::IoCtx data_io; + // Remember the data pool ID for use in layouts + int64_t data_pool_id; + + uint32_t n; + uint32_t m; + + /** + * Scan data pool for backtraces, and inject inodes to metadata pool + */ + int scan_inodes(); + + /** + * Scan data pool for file sizes and mtimes + */ + int scan_extents(); + + /** + * Scan metadata pool for 0th dirfrags to link orphaned + * directory inodes. + */ + int scan_frags(); + + /** + * Cleanup xattrs from data pool + */ + int cleanup(); + + /** + * Check if an inode number is in the permitted ranges + */ + bool valid_ino(inodeno_t ino) const; + + + int scan_links(); + + // Accept pools which are not in the FSMap + bool force_pool; + // Respond to decode errors by overwriting + bool force_corrupt; + // Overwrite root objects even if they exist + bool force_init; + // Only scan inodes without this scrub tag + string filter_tag; + + /** + * @param r set to error on valid key with invalid value + * @return true if argument consumed, else false + */ + bool parse_kwarg( + const std::vector<const char*> &args, + std::vector<const char *>::const_iterator &i, + int *r); + + /** + * @return true if argument consumed, else false + */ + bool parse_arg( + const std::vector<const char*> &arg, + std::vector<const char *>::const_iterator &i); + + int probe_filter(librados::IoCtx &ioctx); + + /** + * Apply a function to all objects in an ioctx's pool, optionally + * restricted to only those objects with a 00000000 offset and + * no tag matching DataScan::scrub_tag. + */ + int forall_objects( + librados::IoCtx &ioctx, + bool untagged_only, + std::function<int(std::string, uint64_t, uint64_t)> handler); + + public: + static void usage(); + int main(const std::vector<const char *> &args); + + DataScan() + : driver(NULL), fscid(FS_CLUSTER_ID_NONE), + data_pool_id(-1), n(0), m(1), + force_pool(false), force_corrupt(false), + force_init(false) + { + } + + ~DataScan() override + { + delete driver; + } +}; + diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc new file mode 100644 index 000000000..f3b07c551 --- /dev/null +++ b/src/tools/cephfs/Dumper.cc @@ -0,0 +1,431 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef _BACKWARD_BACKWARD_WARNING_H +#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* +#endif + +#include "include/compat.h" +#include "include/fs_types.h" +#include "common/entity_name.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/JournalPointer.h" +#include "osdc/Journaler.h" +#include "mon/MonClient.h" + +#include "Dumper.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +#define HEADER_LEN 4096 + +int Dumper::init(mds_role_t role_, const std::string &type) +{ + role = role_; + + int r = MDSUtility::init(); + if (r < 0) { + return r; + } + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + if (type == "mdlog") { + JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); + int jp_load_result = jp.load(objecter); + if (jp_load_result != 0) { + std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl; + return jp_load_result; + } else { + ino = jp.front; + } + } else if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + role.rank; + } else { + ceph_abort(); // should not get here + } + return 0; +} + + +int Dumper::recover_journal(Journaler *journaler) +{ + C_SaferCond cond; + lock.lock(); + journaler->recover(&cond); + lock.unlock(); + const int r = cond.wait(); + + if (r < 0) { // Error + derr << "error on recovery: " << cpp_strerror(r) << dendl; + return r; + } else { + dout(10) << "completed journal recovery" << dendl; + return 0; + } +} + + +int Dumper::dump(const char *dump_file) +{ + int r = 0; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, + &finisher); + r = recover_journal(&journaler); + if (r) { + return r; + } + uint64_t start = journaler.get_read_pos(); + uint64_t end = journaler.get_write_pos(); + uint64_t len = end-start; + + Filer filer(objecter, &finisher); + + cout << "journal is " << start << "~" << len << std::endl; + + int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644); + if (fd >= 0) { + // include an informative header + uuid_d fsid = monc->get_fsid(); + char fsid_str[40]; + fsid.print(fsid_str); + char buf[HEADER_LEN]; + memset(buf, 0, sizeof(buf)); + snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\ + length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\ + trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\ + object_size %lu (0x%lx)\n fsid %s\n%c", + role.rank, + (unsigned long long)start, (unsigned long long)start, + (unsigned long long)len, (unsigned long long)len, + (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos, + (unsigned long long)journaler.last_committed.stream_format, + (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos, + (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit, + (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count, + (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size, + fsid_str, + 4); + r = safe_write(fd, buf, sizeof(buf)); + if (r) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl; + ::close(fd); + return r; + } + + // write the data + off64_t seeked = ::lseek64(fd, start, SEEK_SET); + if (seeked == (off64_t)-1) { + r = errno; + derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl; + ::close(fd); + return r; + } + + + // Read and write 32MB chunks. Slower than it could be because we're not + // streaming, but that's okay because this is just a debug/disaster tool. + const uint32_t chunk_size = 32 * 1024 * 1024; + + for (uint64_t pos = start; pos < start + len; pos += chunk_size) { + bufferlist bl; + dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl; + + const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos); + + C_SaferCond cond; + lock.lock(); + filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP, + pos, read_size, &bl, 0, &cond); + lock.unlock(); + r = cond.wait(); + if (r < 0) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") reading " + "journal at offset 0x" << std::hex << pos << std::dec << dendl; + ::close(fd); + return r; + } + dout(10) << "Got 0x" << std::hex << bl.length() << std::dec + << " bytes" << dendl; + + r = bl.write_fd(fd); + if (r) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl; + ::close(fd); + return r; + } + } + + r = ::close(fd); + if (r) { + r = errno; + derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl; + return r; + } + + cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n" + << "NOTE: this is a _sparse_ file; you can\n" + << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n" + << " to efficiently compress it while preserving sparseness." << std::endl; + return 0; + } else { + int err = errno; + derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl; + return err; + } +} + +int Dumper::undump(const char *dump_file, bool force) +{ + cout << "undump " << dump_file << std::endl; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + int r = 0; + // try get layout info from cluster + Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, + &finisher); + int recovered = recover_journal(&journaler); + if (recovered != 0) { + derr << "recover_journal failed, try to get header from dump file " << dendl; + } + + int fd = ::open(dump_file, O_RDONLY|O_BINARY); + if (fd < 0) { + r = errno; + derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Ceph mds0 journal dump + // start offset 232401996 (0xdda2c4c) + // length 1097504 (0x10bf20) + + char buf[HEADER_LEN]; + r = safe_read(fd, buf, sizeof(buf)); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + + long long unsigned start, len, write_pos, format, trimmed_pos; + long unsigned stripe_unit, stripe_count, object_size; + sscanf(strstr(buf, "start offset"), "start offset %llu", &start); + sscanf(strstr(buf, "length"), "length %llu", &len); + sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos); + sscanf(strstr(buf, "format"), "format %llu", &format); + + if (!force) { + // need to check if fsid match onlien cluster fsid + if (strstr(buf, "fsid")) { + uuid_d fsid; + char fsid_str[40]; + sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str); + r = fsid.parse(fsid_str); + if (!r) { + derr << "Invalid fsid" << dendl; + ::close(fd); + return -EINVAL; + } + + if (fsid != monc->get_fsid()) { + derr << "Imported journal fsid does not match online cluster fsid" << dendl; + derr << "Use --force to skip fsid check" << dendl; + ::close(fd); + return -EINVAL; + } + } else { + derr << "Invalid header, no fsid embeded" << dendl; + ::close(fd); + return -EINVAL; + } + } + + if (recovered == 0) { + stripe_unit = journaler.last_committed.layout.stripe_unit; + stripe_count = journaler.last_committed.layout.stripe_count; + object_size = journaler.last_committed.layout.object_size; + } else { + // try to get layout from dump file header, if failed set layout to default + if (strstr(buf, "stripe_unit")) { + sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit); + } else { + stripe_unit = file_layout_t::get_default().stripe_unit; + } + if (strstr(buf, "stripe_count")) { + sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count); + } else { + stripe_count = file_layout_t::get_default().stripe_count; + } + if (strstr(buf, "object_size")) { + sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size); + } else { + object_size = file_layout_t::get_default().object_size; + } + } + + if (strstr(buf, "trimmed_pos")) { + sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos); + } else { + // Old format dump, any untrimmed objects before expire_pos will + // be discarded as trash. + trimmed_pos = start - (start % object_size); + } + + if (trimmed_pos > start) { + derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos + << " > expire 0x" << start << std::dec << dendl; + ::close(fd); + return -EINVAL; + } + + if (start > write_pos) { + derr << std::hex << "Invalid header (expire 0x" << start + << " > write 0x" << write_pos << std::dec << dendl; + ::close(fd); + return -EINVAL; + } + + cout << "start " << start << + " len " << len << + " write_pos " << write_pos << + " format " << format << + " trimmed_pos " << trimmed_pos << + " stripe_unit " << stripe_unit << + " stripe_count " << stripe_count << + " object_size " << object_size << std::endl; + + Journaler::Header h; + h.trimmed_pos = trimmed_pos; + h.expire_pos = start; + h.write_pos = write_pos; + h.stream_format = format; + h.magic = CEPH_FS_ONDISK_MAGIC; + + h.layout.stripe_unit = stripe_unit; + h.layout.stripe_count = stripe_count; + h.layout.object_size = object_size; + h.layout.pool_id = fs->mds_map.get_metadata_pool(); + + bufferlist hbl; + encode(h, hbl); + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(fs->mds_map.get_metadata_pool()); + SnapContext snapc; + + cout << "writing header " << oid << std::endl; + C_SaferCond header_cond; + lock.lock(); + objecter->write_full(oid, oloc, snapc, hbl, + ceph::real_clock::now(), 0, + &header_cond); + lock.unlock(); + + r = header_cond.wait(); + if (r != 0) { + derr << "Failed to write header: " << cpp_strerror(r) << dendl; + ::close(fd); + return r; + } + + Filer filer(objecter, &finisher); + + /* Erase any objects at the end of the region to which we shall write + * the new log data. This is to avoid leaving trailing junk after + * the newly written data. Any junk more than one object ahead + * will be taken care of during normal operation by Journaler's + * prezeroing behaviour */ + { + uint32_t const object_size = h.layout.object_size; + ceph_assert(object_size > 0); + uint64_t last_obj = h.write_pos / object_size; + uint64_t purge_count = 2; + /* When the length is zero, the last_obj should be zeroed + * from the offset determined by the new write_pos instead of being purged. + */ + if (!len) { + purge_count = 1; + ++last_obj; + } + C_SaferCond purge_cond; + cout << "Purging " << purge_count << " objects from " << last_obj << std::endl; + lock.lock(); + filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count, + ceph::real_clock::now(), 0, &purge_cond); + lock.unlock(); + purge_cond.wait(); + } + /* When the length is zero, zero the last object + * from the offset determined by the new write_pos. + */ + if (!len) { + uint64_t offset_in_obj = h.write_pos % h.layout.object_size; + uint64_t len = h.layout.object_size - offset_in_obj; + C_SaferCond zero_cond; + cout << "Zeroing " << len << " bytes in the last object." << std::endl; + + lock.lock(); + filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond); + lock.unlock(); + zero_cond.wait(); + } + + // Stream from `fd` to `filer` + uint64_t pos = start; + uint64_t left = len; + while (left > 0) { + // Read + bufferlist j; + lseek64(fd, pos, SEEK_SET); + uint64_t l = std::min<uint64_t>(left, 1024*1024); + j.read_fd(fd, l); + + // Write + cout << " writing " << pos << "~" << l << std::endl; + C_SaferCond write_cond; + lock.lock(); + filer.write(ino, &h.layout, snapc, pos, l, j, + ceph::real_clock::now(), 0, &write_cond); + lock.unlock(); + + r = write_cond.wait(); + if (r != 0) { + derr << "Failed to write header: " << cpp_strerror(r) << dendl; + ::close(fd); + return r; + } + + // Advance + pos += l; + left -= l; + } + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + cout << "done." << std::endl; + return 0; +} + diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h new file mode 100644 index 000000000..758f3cdea --- /dev/null +++ b/src/tools/cephfs/Dumper.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef JOURNAL_DUMPER_H_ +#define JOURNAL_DUMPER_H_ + + +#include "MDSUtility.h" + +class Journaler; + +/** + * This class lets you dump out an mds journal for troubleshooting or whatever. + * + * It was built to work with cmds so some of the design choices are random. + * To use, create a Dumper, call init(), and then call dump() with the name + * of the file to dump to. + */ + +class Dumper : public MDSUtility { +private: + mds_role_t role; + inodeno_t ino; + +public: + Dumper() : ino(-1) + {} + + int init(mds_role_t role_, const std::string &type); + int recover_journal(Journaler *journaler); + int dump(const char *dumpfile); + int undump(const char *dumpfile, bool force); +}; + +#endif /* JOURNAL_DUMPER_H_ */ diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc new file mode 100644 index 000000000..8cb235a82 --- /dev/null +++ b/src/tools/cephfs/EventOutput.cc @@ -0,0 +1,153 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include <iostream> +#include <fstream> + +#include "common/errno.h" +#include "mds/mdstypes.h" +#include "mds/events/EUpdate.h" +#include "mds/LogEvent.h" +#include "JournalScanner.h" + +#include "EventOutput.h" + + +int EventOutput::binary() const +{ + // Binary output, files + int r = ::mkdir(path.c_str(), 0755); + if (r != 0) { + r = -errno; + if (r != -EEXIST) { + std::cerr << "Error creating output directory: " << cpp_strerror(r) << std::endl; + return r; + } + } + + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + bufferlist bin; + std::stringstream filename; + if (auto& le = i->second.log_event; le) { + le->encode(bin, CEPH_FEATURES_SUPPORTED_DEFAULT); + filename << "0x" << std::hex << i->first << std::dec << "_" << le->get_type_str() << ".bin"; + } else if (auto& pi = i->second.pi; pi) { + pi->encode(bin); + filename << "0x" << std::hex << i->first << std::dec << "_" << pi->get_type_str() << ".bin"; + } + + std::string const file_path = path + std::string("/") + filename.str(); + std::ofstream bin_file(file_path.c_str(), std::ofstream::out | std::ofstream::binary); + bin.write_stream(bin_file); + bin_file.close(); + if (bin_file.fail()) { + return -EIO; + } + } + std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl; + + return 0; +} + +int EventOutput::json() const +{ + JSONFormatter jf(true); + std::ofstream out_file(path.c_str(), std::ofstream::out); + jf.open_array_section("journal"); + { + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + if (auto& le = i->second.log_event; le) { + jf.open_object_section("log_event"); + le->dump(&jf); + jf.close_section(); // log_event + } else if (auto& pi = i->second.pi; pi) { + jf.open_object_section("purge_action"); + pi->dump(&jf); + jf.close_section(); + } + } + } + jf.close_section(); // journal + jf.flush(out_file); + out_file.close(); + + if (out_file.fail()) { + return -EIO; + } else { + std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl; + return 0; + } +} + +void EventOutput::list() const +{ + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + if (auto& le = i->second.log_event; le) { + std::vector<std::string> ev_paths; + EMetaBlob const *emb = le->get_metablob(); + if (emb) { + emb->get_paths(ev_paths); + } + + std::string detail; + if (le->get_type() == EVENT_UPDATE) { + auto& eu = reinterpret_cast<EUpdate&>(*le); + detail = eu.type; + } + + std::cout << le->get_stamp() << " 0x" + << std::hex << i->first << std::dec << " " + << le->get_type_str() << ": " + << " (" << detail << ")" << std::endl; + for (std::vector<std::string>::iterator i = ev_paths.begin(); i != ev_paths.end(); ++i) { + std::cout << " " << *i << std::endl; + } + } else if (auto& pi = i->second.pi; pi) { + std::cout << pi->stamp << " 0x" + << std::hex << i->first << std::dec << " " + << pi->get_type_str() << std::endl; + } + } +} + +void EventOutput::summary() const +{ + std::map<std::string, int> type_count; + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + std::string type; + if (auto& le = i->second.log_event; le) + type = le->get_type_str(); + else if (auto& pi = i->second.pi; pi) + type = pi->get_type_str(); + if (type_count.count(type) == 0) { + type_count[type] = 0; + } + type_count[type] += 1; + } + + std::cout << "Events by type:" << std::endl; + for (std::map<std::string, int>::iterator i = type_count.begin(); i != type_count.end(); ++i) { + std::cout << " " << i->first << ": " << i->second << std::endl; + } + + std::cout << "Errors: " << scan.errors.size() << std::endl; + if (!scan.errors.empty()) { + for (JournalScanner::ErrorMap::const_iterator i = scan.errors.begin(); + i != scan.errors.end(); ++i) { + std::cout << " 0x" << std::hex << i->first << std::dec + << ": " << i->second.r << " " + << i->second.description << std::endl; + } + } +} diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h new file mode 100644 index 000000000..65d968409 --- /dev/null +++ b/src/tools/cephfs/EventOutput.h @@ -0,0 +1,42 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#ifndef EVENT_OUTPUT_H +#define EVENT_OUTPUT_H + +#include <string> + +class JournalScanner; + +/** + * Different output formats for the results of a journal scan + */ +class EventOutput +{ + private: + JournalScanner const &scan; + std::string const path; + + public: + EventOutput(JournalScanner const &scan_, std::string const &path_) + : scan(scan_), path(path_) {} + + void summary() const; + void list() const; + int json() const; + int binary() const; +}; + +#endif // EVENT_OUTPUT_H + diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc new file mode 100644 index 000000000..266d7fccb --- /dev/null +++ b/src/tools/cephfs/JournalFilter.cc @@ -0,0 +1,315 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include "JournalFilter.h" + +#include "common/ceph_argparse.h" + +#include "mds/events/ESession.h" +#include "mds/events/EUpdate.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + + +const string JournalFilter::range_separator(".."); + +bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const +{ + /* Filtering by journal offset range */ + if (pos < range_start || pos >= range_end) { + return false; + } + + if (purge_action != PurgeItem::NONE) { + if (pi.action != purge_action) + return false; + } + + if (inode) { + if (inode != pi.ino) + return false; + } + return true; +} + +/* + * Return whether a LogEvent is to be included or excluded. + * + * The filter parameters are applied on an AND basis: if any + * condition is not met, the event is excluded. Try to do + * the fastest checks first. + */ +bool JournalFilter::apply(uint64_t pos, LogEvent &le) const +{ + /* Filtering by journal offset range */ + if (pos < range_start || pos >= range_end) { + return false; + } + + /* Filtering by event type */ + if (event_type != 0) { + if (le.get_type() != event_type) { + return false; + } + } + + /* Filtering by client */ + if (client_name.num()) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + if (metablob->get_client_name() != client_name) { + return false; + } + } else if (le.get_type() == EVENT_SESSION) { + ESession *es = reinterpret_cast<ESession*>(&le); + if (es->get_client_inst().name != client_name) { + return false; + } + } else { + return false; + } + } + + /* Filtering by inode */ + if (inode) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::set<inodeno_t> inodes; + metablob->get_inodes(inodes); + bool match_any = false; + for (std::set<inodeno_t>::iterator i = inodes.begin(); i != inodes.end(); ++i) { + if (*i == inode) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } else { + return false; + } + } + + /* Filtering by frag and dentry */ + if (!frag_dentry.empty() || frag.ino) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::map<dirfrag_t, std::set<std::string> > dentries; + metablob->get_dentries(dentries); + + if (frag.ino) { + bool match_any = false; + for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin(); + i != dentries.end(); ++i) { + if (i->first == frag) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } + + if (!frag_dentry.empty()) { + bool match_any = false; + for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin(); + i != dentries.end() && !match_any; ++i) { + std::set<std::string> const &names = i->second; + for (std::set<std::string>::iterator j = names.begin(); + j != names.end() && !match_any; ++j) { + if (*j == frag_dentry) { + match_any = true; + } + } + } + if (!match_any) { + return false; + } + } + + } else { + return false; + } + } + + /* Filtering by file path */ + if (!path_expr.empty()) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::vector<std::string> paths; + metablob->get_paths(paths); + bool match_any = false; + for (std::vector<std::string>::iterator p = paths.begin(); p != paths.end(); ++p) { + if ((*p).find(path_expr) != std::string::npos) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } else { + return false; + } + } + + return true; +} + + +int JournalFilter::parse_args( + std::vector<const char*> &argv, + std::vector<const char*>::iterator &arg) +{ + while(arg != argv.end()) { + std::string arg_str; + if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) { + size_t sep_loc = arg_str.find(JournalFilter::range_separator); + if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) { + derr << "Invalid range '" << arg_str << "'" << dendl; + return -EINVAL; + } + + // We have a lower bound + if (sep_loc > 0) { + std::string range_start_str = arg_str.substr(0, sep_loc); + std::string parse_err; + range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl; + return -EINVAL; + } + } + + if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) { + std::string range_end_str = arg_str.substr(sep_loc + range_separator.size()); + std::string parse_err; + range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid upper bound '" << range_end_str << "': " << parse_err << dendl; + return -EINVAL; + } + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl; + return -EINVAL; + } + dout(4) << "Filtering by path '" << arg_str << "'" << dendl; + path_expr = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) { + dout(4) << "Filtering by inode '" << arg_str << "'" << dendl; + std::string parse_err; + inode = strict_strtoll(arg_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl; + return -EINVAL; + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) { + try { + if (!type.compare("mdlog")) { + event_type = LogEvent::str_to_type(arg_str); + } else if (!type.compare("purge_queue")) { + purge_action = PurgeItem::str_to_type(arg_str); + } + } catch (const std::out_of_range&) { + derr << "Invalid event type '" << arg_str << "'" << dendl; + return -EINVAL; + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." << dendl; + return -EINVAL; + } + std::string const frag_sep = "."; + size_t sep_loc = arg_str.find(frag_sep); + std::string inode_str; + std::string frag_str; + if (sep_loc != std::string::npos) { + inode_str = arg_str.substr(0, sep_loc); + frag_str = arg_str.substr(sep_loc + 1); + } else { + inode_str = arg_str; + frag_str = "0"; + } + + std::string parse_err; + inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl; + return -EINVAL; + } + + uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl; + return -EINVAL; + } + + frag = dirfrag_t(frag_ino, frag_t(frag_enc)); + dout(4) << "dirfrag filter: '" << frag << "'" << dendl; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl; + return -EINVAL; + } + frag_dentry = arg_str; + dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl; + return -EINVAL; + } + + std::string parse_err; + int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid client number " << arg_str << dendl; + return -EINVAL; + } + client_name = entity_name_t::CLIENT(client_num); + } else { + // We're done with args the filter understands + break; + } + } + + return 0; +} + +/** + * If the filter params are only range, then return + * true and set start & end. Else return false. + * + * Use this to discover if the user has requested a contiguous range + * rather than any per-event filtering. + */ +bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const +{ + if (!path_expr.empty() + || inode != 0 + || event_type != 0 + || frag.ino != 0 + || client_name.num() != 0 + || (range_start == 0 && range_end == (uint64_t)(-1))) { + return false; + } else { + start = range_start; + end = range_end; + return true; + } +} diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h new file mode 100644 index 000000000..f7a2db614 --- /dev/null +++ b/src/tools/cephfs/JournalFilter.h @@ -0,0 +1,73 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#ifndef JOURNAL_FILTER_H +#define JOURNAL_FILTER_H + +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/PurgeQueue.h" + +/** + * A set of conditions for narrowing down a search through the journal + */ +class JournalFilter +{ + private: + + /* Filtering by journal offset range */ + uint64_t range_start; + uint64_t range_end; + static const std::string range_separator; + + /* Filtering by file (sub) path */ + std::string path_expr; + + /* Filtering by inode */ + inodeno_t inode; + + /* Filtering by type */ + LogEvent::EventType event_type; + + std::string type; + + /* Filtering by PurgeItem::Action */ + PurgeItem::Action purge_action; + + /* Filtering by dirfrag */ + dirfrag_t frag; + std::string frag_dentry; //< optional, filter dentry name within fragment + + /* Filtering by metablob client name */ + entity_name_t client_name; + + public: + JournalFilter(std::string t) : + range_start(0), + range_end(-1), + inode(0), + event_type(0), + type(t), + purge_action(PurgeItem::NONE) {} + + bool get_range(uint64_t &start, uint64_t &end) const; + bool apply(uint64_t pos, LogEvent &le) const; + bool apply(uint64_t pos, PurgeItem &pi) const; + int parse_args( + std::vector<const char*> &argv, + std::vector<const char*>::iterator &arg); +}; + +#endif // JOURNAL_FILTER_H + diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc new file mode 100644 index 000000000..e72542fd4 --- /dev/null +++ b/src/tools/cephfs/JournalScanner.cc @@ -0,0 +1,438 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include "include/rados/librados.hpp" +#include "mds/JournalPointer.h" + +#include "mds/events/ESubtreeMap.h" +#include "mds/PurgeQueue.h" + +#include "JournalScanner.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +/** + * Read journal header, followed by sequential scan through journal space. + * + * Return 0 on success, else error code. Note that success has the special meaning + * that we were able to apply our checks, it does *not* mean that the journal is + * healthy. + */ +int JournalScanner::scan(bool const full) +{ + int r = 0; + + r = set_journal_ino(); + if (r < 0) { + return r; + } + + if (!is_mdlog || pointer_present) { + r = scan_header(); + if (r < 0) { + return r; + } + } + + if (full && header_present) { + r = scan_events(); + if (r < 0) { + return r; + } + } + + return 0; +} + + +int JournalScanner::set_journal_ino() +{ + int r = 0; + if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + rank; + } + else if (type == "mdlog"){ + r = scan_pointer(); + is_mdlog = true; + } + else { + ceph_abort(); // should not get here + } + return r; +} + +int JournalScanner::scan_pointer() +{ + // Issue read + std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0); + bufferlist pointer_bl; + int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0); + if (r == -ENOENT) { + // 'Successfully' discovered the pointer is missing. + derr << "Pointer " << pointer_oid << " is absent" << dendl; + return 0; + } else if (r < 0) { + // Error preventing us interrogating pointer + derr << "Pointer " << pointer_oid << " is unreadable" << dendl; + return r; + } else { + dout(4) << "Pointer " << pointer_oid << " is readable" << dendl; + pointer_present = true; + + JournalPointer jp; + try { + auto q = pointer_bl.cbegin(); + jp.decode(q); + } catch(buffer::error &e) { + derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl; + return 0; + } + + pointer_valid = true; + ino = jp.front; + return 0; + } +} + + +int JournalScanner::scan_header() +{ + int r; + + bufferlist header_bl; + std::string header_name = obj_name(0); + dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl; + r = io.read(header_name, header_bl, INT_MAX, 0); + if (r < 0) { + derr << "Header " << header_name << " is unreadable" << dendl; + return 0; // "Successfully" found an error + } else { + header_present = true; + } + + auto header_bl_i = header_bl.cbegin(); + header = new Journaler::Header(); + try + { + header->decode(header_bl_i); + } + catch (buffer::error &e) + { + derr << "Header is corrupt (" << e.what() << ")" << dendl; + delete header; + header = NULL; + return 0; // "Successfully" found an error + } + + if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) { + derr << "Header is corrupt (bad magic)" << dendl; + return 0; // "Successfully" found an error + } + if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) { + derr << "Header is invalid (inconsistent offsets)" << dendl; + return 0; // "Successfully" found an error + } + header_valid = true; + + return 0; +} + + +int JournalScanner::scan_events() +{ + uint64_t object_size = g_conf()->mds_log_segment_size; + if (object_size == 0) { + // Default layout object size + object_size = file_layout_t::get_default().object_size; + } + + uint64_t read_offset = header->expire_pos; + dout(10) << std::hex << "Header 0x" + << header->trimmed_pos << " 0x" + << header->expire_pos << " 0x" + << header->write_pos << std::dec << dendl; + dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl; + + // TODO also check for extraneous objects before the trimmed pos or after the write pos, + // which would indicate a bogus header. + + bufferlist read_buf; + bool gap = false; + uint64_t gap_start = -1; + for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) { + uint64_t offset_in_obj = 0; + if (obj_offset * object_size < header->expire_pos) { + // Skip up to expire_pos from start of the object + // (happens for the first object we read) + offset_in_obj = header->expire_pos - obj_offset * object_size; + } + + // Read this journal segment + bufferlist this_object; + std::string const oid = obj_name(obj_offset); + int r = io.read(oid, this_object, INT_MAX, offset_in_obj); + + // Handle absent journal segments + if (r < 0) { + if (obj_offset > (header->write_pos / object_size)) { + dout(4) << "Reached end of journal objects" << dendl; + break; + } else { + derr << "Missing object " << oid << dendl; + } + + objects_missing.push_back(obj_offset); + if (!gap) { + gap_start = read_offset; + gap = true; + } + if (read_buf.length() > 0) { + read_offset += read_buf.length(); + read_buf.clear(); + } + read_offset += object_size - offset_in_obj; + continue; + } else { + dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec + << " bytes from " << oid << " gap=" << gap << dendl; + objects_valid.push_back(oid); + this_object.begin().copy(this_object.length(), read_buf); + } + + if (gap) { + // No valid data at the current read offset, scan forward until we find something valid looking + // or have to drop out to load another object. + dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset + << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl; + + do { + auto p = read_buf.cbegin(); + uint64_t candidate_sentinel; + decode(candidate_sentinel, p); + + dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl; + + if (candidate_sentinel == JournalStream::sentinel) { + dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl; + ranges_invalid.push_back(Range(gap_start, read_offset)); + gap = false; + break; + } else { + // No sentinel, discard this byte + read_buf.splice(0, 1); + read_offset += 1; + } + } while (read_buf.length() >= sizeof(JournalStream::sentinel)); + dout(4) << "read_buf size is " << read_buf.length() << dendl; + } + { + dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl; + while(true) { + // TODO: detect and handle legacy format journals: can do many things + // on them but on read errors have to give up instead of searching + // for sentinels. + JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT); + bool readable = false; + try { + uint64_t need; + readable = journal_stream.readable(read_buf, &need); + } catch (buffer::error &e) { + readable = false; + dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + read_buf.splice(0, 1); + read_offset += 1; + break; + } + + if (!readable) { + // Out of data, continue to read next object + break; + } + + bufferlist le_bl; //< Serialized LogEvent blob + dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl; + // This cannot fail to decode because we pre-checked that a serialized entry + // blob would be readable. + uint64_t start_ptr = 0; + uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr); + dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl; + if (start_ptr != read_offset) { + derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x" + << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + // FIXME: given that entry was invalid, should we be skipping over it? + // maybe push bytes back onto start of read_buf and just advance one byte + // to start scanning instead. e.g. if a bogus size value is found it can + // cause us to consume and thus skip a bunch of following valid events. + read_offset += consumed; + break; + } + bool valid_entry = true; + if (is_mdlog) { + auto le = LogEvent::decode_event(le_bl.cbegin()); + + if (le) { + dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl; + + if (le->get_type() == EVENT_SUBTREEMAP + || le->get_type() == EVENT_SUBTREEMAP_TEST) { + auto&& sle = dynamic_cast<ESubtreeMap&>(*le); + if (sle.expire_pos > read_offset) { + errors.insert(std::make_pair( + read_offset, EventError( + -ERANGE, + "ESubtreeMap has expire_pos ahead of its own position"))); + } + } + + if (filter.apply(read_offset, *le)) { + events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed)); + } + } else { + valid_entry = false; + } + } else if (type == "purge_queue"){ + auto pi = std::make_unique<PurgeItem>(); + try { + auto q = le_bl.cbegin(); + pi->decode(q); + if (filter.apply(read_offset, *pi)) { + events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed)); + } + } catch (const buffer::error &err) { + valid_entry = false; + } + } else { + ceph_abort(); // should not get here + } + if (!valid_entry) { + dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + read_offset += consumed; + break; + } else { + events_valid.push_back(read_offset); + read_offset += consumed; + } + } + } + } + + if (gap) { + // Ended on a gap, assume it ran to end + ranges_invalid.push_back(Range(gap_start, -1)); + } + + dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl; + dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl; + dout(4) << "Found " << events_valid.size() << " valid events" << dendl; + dout(4) << "Selected " << events.size() << " events events for processing" << dendl; + + return 0; +} + + +JournalScanner::~JournalScanner() +{ + if (header) { + delete header; + header = NULL; + } + dout(4) << events.size() << " events" << dendl; + events.clear(); +} + + +/** + * Whether the journal data looks valid and replayable + */ +bool JournalScanner::is_healthy() const +{ + return ((!is_mdlog || (pointer_present && pointer_valid)) + && header_present && header_valid + && ranges_invalid.empty() + && objects_missing.empty()); +} + + +/** + * Whether the journal data can be read from RADOS + */ +bool JournalScanner::is_readable() const +{ + return (header_present && header_valid && objects_missing.empty()); +} + + +/** + * Calculate the object name for a given offset + */ +std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx", + (unsigned long long)(ino), + (unsigned long long)offset); + return std::string(name); +} + + +std::string JournalScanner::obj_name(uint64_t offset) const +{ + return obj_name(ino, offset); +} + + +/* + * Write a human readable summary of the journal health + */ +void JournalScanner::report(std::ostream &out) const +{ + out << "Overall journal integrity: " << (is_healthy() ? "OK" : "DAMAGED") << std::endl; + + if (is_mdlog) { + if (!pointer_present) { + out << "Pointer not found" << std::endl; + } else if (!pointer_valid) { + out << "Pointer could not be decoded" << std::endl; + } + } + if (!header_present) { + out << "Header not found" << std::endl; + } else if (!header_valid) { + out << "Header could not be decoded" << std::endl; + } + + if (objects_missing.size()) { + out << "Objects missing:" << std::endl; + for (std::vector<uint64_t>::const_iterator om = objects_missing.begin(); + om != objects_missing.end(); ++om) { + out << " 0x" << std::hex << *om << std::dec << std::endl; + } + } + + if (ranges_invalid.size()) { + out << "Corrupt regions:" << std::endl; + for (std::vector<Range>::const_iterator r = ranges_invalid.begin(); + r != ranges_invalid.end(); ++r) { + out << " 0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl; + } + } +} + diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h new file mode 100644 index 000000000..9197b5596 --- /dev/null +++ b/src/tools/cephfs/JournalScanner.h @@ -0,0 +1,133 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + +#ifndef JOURNAL_SCANNER_H +#define JOURNAL_SCANNER_H + +#include "include/rados/librados_fwd.hpp" + +// For Journaler::Header, can't forward-declare nested classes +#include <osdc/Journaler.h> + +#include "JournalFilter.h" + +/** + * A simple sequential reader for metadata journals. Unlike + * the MDS Journaler class, this is written to detect, record, + * and read past corruptions and missing objects. It is also + * less efficient but more plainly written. + */ +class JournalScanner +{ + private: + librados::IoCtx &io; + + // Input constraints + const int rank; + std::string type; + JournalFilter const filter; + + void gap_advance(); + + public: + JournalScanner( + librados::IoCtx &io_, + int rank_, + const std::string &type_, + JournalFilter const &filter_) : + io(io_), + rank(rank_), + type(type_), + filter(filter_), + is_mdlog(false), + pointer_present(false), + pointer_valid(false), + header_present(false), + header_valid(false), + header(NULL) {}; + + JournalScanner( + librados::IoCtx &io_, + int rank_, + const std::string &type_) : + io(io_), + rank(rank_), + type(type_), + filter(type_), + is_mdlog(false), + pointer_present(false), + pointer_valid(false), + header_present(false), + header_valid(false), + header(NULL) {}; + + ~JournalScanner(); + + int set_journal_ino(); + int scan(bool const full=true); + int scan_pointer(); + int scan_header(); + int scan_events(); + void report(std::ostream &out) const; + + std::string obj_name(uint64_t offset) const; + std::string obj_name(inodeno_t ino, uint64_t offset) const; + + // The results of the scan + inodeno_t ino; // Corresponds to journal ino according their type + struct EventRecord { + EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {} + EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {} + std::unique_ptr<LogEvent> log_event; + std::unique_ptr<PurgeItem> pi; + uint32_t raw_size = 0; //< Size from start offset including all encoding overhead + }; + + class EventError { + public: + int r; + std::string description; + EventError(int r_, const std::string &desc_) + : r(r_), description(desc_) {} + }; + + typedef std::map<uint64_t, EventRecord> EventMap; + typedef std::map<uint64_t, EventError> ErrorMap; + typedef std::pair<uint64_t, uint64_t> Range; + bool is_mdlog; + bool pointer_present; //mdlog specific + bool pointer_valid; //mdlog specific + bool header_present; + bool header_valid; + Journaler::Header *header; + + bool is_healthy() const; + bool is_readable() const; + std::vector<std::string> objects_valid; + std::vector<uint64_t> objects_missing; + std::vector<Range> ranges_invalid; + std::vector<uint64_t> events_valid; + EventMap events; + + // For events present in ::events (i.e. scanned successfully), + // any subsequent errors handling them (e.g. replaying) + ErrorMap errors; + + + private: + // Forbid copy construction because I have ptr members + JournalScanner(const JournalScanner &rhs); +}; + +#endif // JOURNAL_SCANNER_H + diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc new file mode 100644 index 000000000..ec9860980 --- /dev/null +++ b/src/tools/cephfs/JournalTool.cc @@ -0,0 +1,1266 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include <sstream> + +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/InoTable.h" + +#include "mds/events/ENoOp.h" +#include "mds/events/EUpdate.h" + +#include "JournalScanner.h" +#include "EventOutput.h" +#include "Dumper.h" +#include "Resetter.h" + +#include "JournalTool.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + + + +void JournalTool::usage() +{ + std::cout << "Usage: \n" + << " cephfs-journal-tool [options] journal <command>\n" + << " <command>:\n" + << " inspect\n" + << " import <path> [--force]\n" + << " export <path>\n" + << " reset [--force]\n" + << " cephfs-journal-tool [options] header <get|set> <field> <value>\n" + << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n" + << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" + << " <selector>:\n" + << " --range=<start>..<end>\n" + << " --path=<substring>\n" + << " --inode=<integer>\n" + << " --type=<UPDATE|OPEN|SESSION...><\n" + << " --frag=<ino>.<frag> [--dname=<dentry string>]\n" + << " --client=<session id integer>\n" + << " <effect>: [get|recover_dentries|splice]\n" + << " <output>: [summary|list|binary|json] [--path <path>]\n" + << "\n" + << "General options:\n" + << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n" + << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n" + << " this journal is used to queue for purge operation,\n" + << " default is mdlog, and only mdlog support event mode)\n" + << "\n" + << "Special options\n" + << " --alternate-pool <name> Alternative metadata pool to target\n" + << " when using recover_dentries.\n"; + + generic_client_usage(); +} + + +/** + * Handle arguments and hand off to journal/header/event mode + */ +int JournalTool::main(std::vector<const char*> &argv) +{ + int r; + + dout(10) << "JournalTool::main " << dendl; + // Common arg parsing + // ================== + if (argv.empty()) { + cerr << "missing positional argument" << std::endl; + return -EINVAL; + } + + std::vector<const char*>::iterator arg = argv.begin(); + + std::string rank_str; + if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) { + derr << "missing mandatory \"--rank\" argument" << dendl; + return -EINVAL; + } + + if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) { + // Default is mdlog + type = "mdlog"; + } + + r = validate_type(type); + if (r != 0) { + derr << "journal type is not correct." << dendl; + return r; + } + + r = role_selector.parse(*fsmap, rank_str, false); + if (r != 0) { + derr << "Couldn't determine MDS rank." << dendl; + return r; + } + + std::string mode; + if (arg == argv.end()) { + derr << "Missing mode [journal|header|event]" << dendl; + return -EINVAL; + } + mode = std::string(*arg); + arg = argv.erase(arg); + + // RADOS init + // ========== + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; + return r; + } + + dout(4) << "JournalTool: connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + ceph_assert(fs != nullptr); + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + dout(4) << "JournalTool: resolving pool " << pool_id << dendl; + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl; + return r; + } + + dout(4) << "JournalTool: creating IoCtx.." << dendl; + r = rados.ioctx_create(pool_name.c_str(), input); + ceph_assert(r == 0); + output.dup(input); + + // Execution + // ========= + // journal and header are general journal mode + // event mode is only specific for mdlog + auto roles = role_selector.get_roles(); + if (roles.size() > 1) { + const std::string &command = argv[0]; + bool allowed = can_execute_for_all_ranks(mode, command); + if (!allowed) { + derr << "operation not allowed for all ranks" << dendl; + return -EINVAL; + } + + all_ranks = true; + } + for (auto role : roles) { + rank = role.rank; + std::vector<const char *> rank_argv(argv); + dout(4) << "Executing for rank " << rank << dendl; + if (mode == std::string("journal")) { + r = main_journal(rank_argv); + } else if (mode == std::string("header")) { + r = main_header(rank_argv); + } else if (mode == std::string("event")) { + r = main_event(rank_argv); + } else { + cerr << "Bad command '" << mode << "'" << std::endl; + return -EINVAL; + } + + if (r != 0) { + return r; + } + } + + return r; +} + +int JournalTool::validate_type(const std::string &type) +{ + if (type == "mdlog" || type == "purge_queue") { + return 0; + } + return -1; +} + +std::string JournalTool::gen_dump_file_path(const std::string &prefix) { + if (!all_ranks) { + return prefix; + } + + return prefix + "." + std::to_string(rank); +} + +bool JournalTool::can_execute_for_all_ranks(const std::string &mode, + const std::string &command) { + if (mode == "journal" && command == "import") { + return false; + } + + return true; +} + +/** + * Handle arguments for 'journal' mode + * + * This is for operations that act on the journal as a whole. + */ +int JournalTool::main_journal(std::vector<const char*> &argv) +{ + if (argv.empty()) { + derr << "Missing journal command, please see help" << dendl; + return -EINVAL; + } + + std::string command = argv[0]; + if (command == "inspect") { + return journal_inspect(); + } else if (command == "export" || command == "import") { + bool force = false; + if (argv.size() >= 2) { + std::string const path = argv[1]; + if (argv.size() == 3) { + if (std::string(argv[2]) == "--force") { + force = true; + } else { + std::cerr << "Unknown argument " << argv[1] << std::endl; + return -EINVAL; + } + } + return journal_export(path, command == "import", force); + } else { + derr << "Missing path" << dendl; + return -EINVAL; + } + } else if (command == "reset") { + bool force = false; + if (argv.size() == 2) { + if (std::string(argv[1]) == "--force") { + force = true; + } else { + std::cerr << "Unknown argument " << argv[1] << std::endl; + return -EINVAL; + } + } else if (argv.size() > 2) { + std::cerr << "Too many arguments!" << std::endl; + return -EINVAL; + } + return journal_reset(force); + } else { + derr << "Bad journal command '" << command << "'" << dendl; + return -EINVAL; + } +} + + +/** + * Parse arguments and execute for 'header' mode + * + * This is for operations that act on the header only. + */ +int JournalTool::main_header(std::vector<const char*> &argv) +{ + JournalFilter filter(type); + JournalScanner js(input, rank, type, filter); + int r = js.scan(false); + if (r < 0) { + std::cerr << "Unable to scan journal" << std::endl; + return r; + } + + if (!js.header_present) { + std::cerr << "Header object not found!" << std::endl; + return -ENOENT; + } else if (!js.header_valid && js.header == NULL) { + // Can't do a read or a single-field write without a copy of the original + derr << "Header could not be read!" << dendl; + return -ENOENT; + } else { + ceph_assert(js.header != NULL); + } + + if (argv.empty()) { + derr << "Missing header command, must be [get|set]" << dendl; + return -EINVAL; + } + std::vector<const char *>::iterator arg = argv.begin(); + std::string const command = *arg; + arg = argv.erase(arg); + + if (command == std::string("get")) { + // Write JSON journal dump to stdout + JSONFormatter jf(true); + js.header->dump(&jf); + jf.flush(std::cout); + std::cout << std::endl; + } else if (command == std::string("set")) { + // Need two more args <key> <val> + if (argv.size() != 2) { + derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl; + return -EINVAL; + } + + std::string const field_name = *arg; + arg = argv.erase(arg); + + std::string const value_str = *arg; + arg = argv.erase(arg); + ceph_assert(argv.empty()); + + std::string parse_err; + uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid value '" << value_str << "': " << parse_err << dendl; + return -EINVAL; + } + + uint64_t *field = NULL; + if (field_name == "trimmed_pos") { + field = &(js.header->trimmed_pos); + } else if (field_name == "expire_pos") { + field = &(js.header->expire_pos); + } else if (field_name == "write_pos") { + field = &(js.header->write_pos); + } else if (field_name == "pool_id") { + field = (uint64_t*)(&(js.header->layout.pool_id)); + } else { + derr << "Invalid field '" << field_name << "'" << dendl; + return -EINVAL; + } + + std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; + *field = new_val; + + dout(4) << "Writing object..." << dendl; + bufferlist header_bl; + encode(*(js.header), header_bl); + output.write_full(js.obj_name(0), header_bl); + dout(4) << "Write complete." << dendl; + std::cout << "Successfully updated header." << std::endl; + } else { + derr << "Bad header command '" << command << "'" << dendl; + return -EINVAL; + } + + return 0; +} + + +/** + * Parse arguments and execute for 'event' mode + * + * This is for operations that act on LogEvents within the log + */ +int JournalTool::main_event(std::vector<const char*> &argv) +{ + int r; + + if (argv.empty()) { + derr << "Missing event command, please see help" << dendl; + return -EINVAL; + } + + std::vector<const char*>::iterator arg = argv.begin(); + bool dry_run = false; + + std::string command = *(arg++); + if (command != "get" && command != "splice" && command != "recover_dentries") { + derr << "Unknown argument '" << command << "'" << dendl; + return -EINVAL; + } + + if (command == "recover_dentries") { + if (type != "mdlog") { + derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl; + return -EINVAL; + } else { + if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { + dry_run = true; + } + } + } + + if (arg == argv.end()) { + derr << "Incomplete command line" << dendl; + return -EINVAL; + } + + // Parse filter options + // ==================== + JournalFilter filter(type); + r = filter.parse_args(argv, arg); + if (r) { + return r; + } + + // Parse output options + // ==================== + if (arg == argv.end()) { + cerr << "Missing output command" << std::endl; + return -EINVAL; + } + std::string output_style = *(arg++); + if (output_style != "binary" && output_style != "json" && + output_style != "summary" && output_style != "list") { + cerr << "Unknown argument: '" << output_style << "'" << std::endl; + return -EINVAL; + } + + std::string output_path = "dump"; + while(arg != argv.end()) { + std::string arg_str; + if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { + output_path = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", + nullptr)) { + dout(1) << "Using alternate pool " << arg_str << dendl; + int r = rados.ioctx_create(arg_str.c_str(), output); + ceph_assert(r == 0); + other_pool = true; + } else { + cerr << "Unknown argument: '" << *arg << "'" << std::endl; + return -EINVAL; + } + } + + const std::string dump_path = gen_dump_file_path(output_path); + + // Execute command + // =============== + JournalScanner js(input, rank, type, filter); + if (command == "get") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + } else if (command == "recover_dentries") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + + /** + * Iterate over log entries, attempting to scavenge from each one + */ + std::set<inodeno_t> consumed_inos; + for (JournalScanner::EventMap::iterator i = js.events.begin(); + i != js.events.end(); ++i) { + auto& le = i->second.log_event; + EMetaBlob const *mb = le->get_metablob(); + if (mb) { + int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); + if (scav_r) { + dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec + << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; + if (r == 0) { + r = scav_r; + } + // Our goal is to read all we can, so don't stop on errors, but + // do record them for possible later output + js.errors.insert(std::make_pair(i->first, + JournalScanner::EventError(scav_r, cpp_strerror(r)))); + } + } + } + + /** + * Update InoTable to reflect any inode numbers consumed during scavenge + */ + dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; + if (consumed_inos.size() && !dry_run) { + int consume_r = consume_inos(consumed_inos); + if (consume_r) { + dout(1) << "Error updating InoTable for " << consumed_inos.size() + << " consume inos: " << cpp_strerror(consume_r) << dendl; + if (r == 0) { + r = consume_r; + } + } + } + + // Remove consumed dentries from lost+found. + if (other_pool && !dry_run) { + std::set<std::string> found; + + for (auto i : consumed_inos) { + char s[20]; + + snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); + dout(20) << "removing " << s << dendl; + found.insert(std::string(s)); + } + + object_t frag_oid; + frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, + frag_t(), ""); + output.omap_rm_keys(frag_oid.name, found); + } + } else if (command == "splice") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + + uint64_t start, end; + if (filter.get_range(start, end)) { + // Special case for range filter: erase a numeric range in the log + uint64_t range = end - start; + int r = erase_region(js, start, range); + if (r) { + derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec + << ": " << cpp_strerror(r) << dendl; + return r; + } + } else { + // General case: erase a collection of individual entries in the log + for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { + dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; + + int r = erase_region(js, i->first, i->second.raw_size); + if (r) { + derr << "Failed to erase event 0x" << std::hex << i->first << std::dec + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + + } else { + cerr << "Unknown argument '" << command << "'" << std::endl; + return -EINVAL; + } + + // Generate output + // =============== + EventOutput output(js, dump_path); + int output_result = 0; + if (output_style == "binary") { + output_result = output.binary(); + } else if (output_style == "json") { + output_result = output.json(); + } else if (output_style == "summary") { + output.summary(); + } else if (output_style == "list") { + output.list(); + } else { + std::cerr << "Bad output command '" << output_style << "'" << std::endl; + return -EINVAL; + } + + if (output_result != 0) { + std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl; + } + + return output_result; +} + +/** + * Provide the user with information about the condition of the journal, + * especially indicating what range of log events is available and where + * any gaps or corruptions in the journal are. + */ +int JournalTool::journal_inspect() +{ + int r; + + JournalFilter filter(type); + JournalScanner js(input, rank, type, filter); + r = js.scan(); + if (r) { + std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; + return r; + } + + js.report(std::cout); + + return 0; +} + + +/** + * Attempt to export a binary dump of the journal. + * + * This is allowed to fail if the header is malformed or there are + * objects inaccessible, in which case the user would have to fall + * back to manually listing RADOS objects and extracting them, which + * they can do with the ``rados`` CLI. + */ +int JournalTool::journal_export(std::string const &path, bool import, bool force) +{ + int r = 0; + JournalScanner js(input, rank, type); + + if (!import) { + /* + * If doing an export, first check that the header is valid and + * no objects are missing before trying to dump + */ + r = js.scan(); + if (r < 0) { + derr << "Unable to scan journal, assuming badly damaged" << dendl; + return r; + } + if (!js.is_readable()) { + derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; + return -EIO; + } + } + + /* + * Assuming we can cleanly read the journal data, dump it out to a file + */ + { + Dumper dumper; + r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type); + if (r < 0) { + derr << "dumper::init failed: " << cpp_strerror(r) << dendl; + return r; + } + if (import) { + r = dumper.undump(path.c_str(), force); + } else { + const std::string ex_path = gen_dump_file_path(path); + r = dumper.dump(ex_path.c_str()); + } + } + + return r; +} + + +/** + * Truncate journal and insert EResetJournal + */ +int JournalTool::journal_reset(bool hard) +{ + int r = 0; + Resetter resetter; + r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard); + if (r < 0) { + derr << "resetter::init failed: " << cpp_strerror(r) << dendl; + return r; + } + + if (hard) { + r = resetter.reset_hard(); + } else { + r = resetter.reset(); + } + + return r; +} + + +/** + * Selective offline replay which only reads out dentries and writes + * them to the backing store iff their version is > what is currently + * in the backing store. + * + * In order to write dentries to the backing store, we may create the + * required enclosing dirfrag objects. + * + * Test this by running scavenge on an unflushed journal, then nuking + * it offline, then starting an MDS and seeing that the dentries are + * visible. + * + * @param metablob an EMetaBlob retrieved from the journal + * @param dry_run if true, do no writes to RADOS + * @param consumed_inos output, populated with any inos inserted + * @returns 0 on success, else negative error code + */ +int JournalTool::recover_dentries( + EMetaBlob const &metablob, + bool const dry_run, + std::set<inodeno_t> *consumed_inos) +{ + ceph_assert(consumed_inos != NULL); + + int r = 0; + + // Replay fullbits (dentry+inode) + for (const auto& frag : metablob.lump_order) { + EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; + lump._decode_bits(); + object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); + + dout(4) << "inspecting lump " << frag_oid.name << dendl; + + + // We will record old fnode version for use in hard link handling + // If we don't read an old fnode, take version as zero and write in + // all hardlinks we find. + version_t old_fnode_version = 0; + + // Update fnode in omap header of dirfrag object + bool write_fnode = false; + bufferlist old_fnode_bl; + r = input.omap_get_header(frag_oid.name, &old_fnode_bl); + if (r == -ENOENT) { + // Creating dirfrag from scratch + dout(4) << "failed to read OMAP header from directory fragment " + << frag_oid.name << " " << cpp_strerror(r) << dendl; + write_fnode = true; + // Note: creating the dirfrag *without* a backtrace, relying on + // MDS to regenerate backtraces on read or in FSCK + } else if (r == 0) { + // Conditionally update existing omap header + fnode_t old_fnode; + auto old_fnode_iter = old_fnode_bl.cbegin(); + try { + old_fnode.decode(old_fnode_iter); + dout(4) << "frag " << frag_oid.name << " fnode old v" << + old_fnode.version << " vs new v" << lump.fnode->version << dendl; + old_fnode_version = old_fnode.version; + write_fnode = old_fnode_version < lump.fnode->version; + } catch (const buffer::error &err) { + dout(1) << "frag " << frag_oid.name + << " is corrupt, overwriting" << dendl; + write_fnode = true; + } + } else { + // Unexpected error + dout(4) << "failed to read OMAP header from directory fragment " + << frag_oid.name << " " << cpp_strerror(r) << dendl; + return r; + } + + if ((other_pool || write_fnode) && !dry_run) { + dout(4) << "writing fnode to omap header" << dendl; + bufferlist fnode_bl; + lump.fnode->encode(fnode_bl); + if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { + r = output.omap_set_header(frag_oid.name, fnode_bl); + } + if (r != 0) { + derr << "Failed to write fnode for frag object " + << frag_oid.name << dendl; + return r; + } + } + + std::set<std::string> read_keys; + + // Compose list of potentially-existing dentries we would like to fetch + for (const auto& fb : lump.get_dfull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + for(const auto& rb : lump.get_dremote()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + for (const auto& nb : lump.get_dnull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + // Perform bulk read of existing dentries + std::map<std::string, bufferlist> read_vals; + r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + if (r == -ENOENT && other_pool) { + r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + } + if (r != 0) { + derr << "unexpected error reading fragment object " + << frag_oid.name << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Compose list of dentries we will write back + std::map<std::string, bufferlist> write_vals; + for (const auto& fb : lump.get_dfull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn + << dendl; + bool write_dentry = false; + if (read_vals.find(key) == read_vals.end()) { + dout(4) << "dentry did not already exist, will create" << dendl; + write_dentry = true; + } else { + dout(4) << "dentry " << key << " existed already" << dendl; + dout(4) << "dentry exists, checking versions..." << dendl; + bufferlist &old_dentry = read_vals[key]; + // Decode dentry+inode + auto q = old_dentry.cbegin(); + + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + if (dentry_type == 'L' || dentry_type == 'l') { + // leave write_dentry false, we have no version to + // compare with in a hardlink, so it's not safe to + // squash over it with what's in this fullbit + dout(10) << "Existing remote inode in slot to be (maybe) written " + << "by a full inode from the journal dn '" << fb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode->version; + } else if (dentry_type == 'I' || dentry_type == 'i') { + // Read out inode version to compare with backing store + InodeStore inode; + if (dentry_type == 'i') { + mempool::mds_co::string alternate_name; + + DECODE_START(2, q); + if (struct_v >= 2) + decode(alternate_name, q); + inode.decode(q); + DECODE_FINISH(q); + } else { + inode.decode_bare(q); + } + dout(4) << "decoded embedded inode version " + << inode.inode->version << " vs fullbit version " + << fb.inode->version << dendl; + if (inode.inode->version < fb.inode->version) { + write_dentry = true; + } + } else { + dout(4) << "corrupt dentry in backing store, overwriting from " + "journal" << dendl; + write_dentry = true; + } + } + + if ((other_pool || write_dentry) && !dry_run) { + dout(4) << "writing I dentry " << key << " into frag " + << frag_oid.name << dendl; + + // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) + bufferlist dentry_bl; + encode(fb.dnfirst, dentry_bl); + encode('I', dentry_bl); + encode_fullbit_as_inode(fb, true, &dentry_bl); + + // Record for writing to RADOS + write_vals[key] = dentry_bl; + consumed_inos->insert(fb.inode->ino); + } + } + + for(const auto& rb : lump.get_dremote()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn + << dendl; + bool write_dentry = false; + if (read_vals.find(key) == read_vals.end()) { + dout(4) << "dentry did not already exist, will create" << dendl; + write_dentry = true; + } else { + dout(4) << "dentry " << key << " existed already" << dendl; + dout(4) << "dentry exists, checking versions..." << dendl; + bufferlist &old_dentry = read_vals[key]; + // Decode dentry+inode + auto q = old_dentry.cbegin(); + + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + if (dentry_type == 'L' || dentry_type == 'l') { + dout(10) << "Existing hardlink inode in slot to be (maybe) written " + << "by a remote inode from the journal dn '" << rb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode->version; + } else if (dentry_type == 'I' || dentry_type == 'i') { + dout(10) << "Existing full inode in slot to be (maybe) written " + << "by a remote inode from the journal dn '" << rb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode->version; + } else { + dout(4) << "corrupt dentry in backing store, overwriting from " + "journal" << dendl; + write_dentry = true; + } + } + + if ((other_pool || write_dentry) && !dry_run) { + dout(4) << "writing L dentry " << key << " into frag " + << frag_oid.name << dendl; + + // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) + bufferlist dentry_bl; + encode(rb.dnfirst, dentry_bl); + encode('L', dentry_bl); + encode(rb.ino, dentry_bl); + encode(rb.d_type, dentry_bl); + + // Record for writing to RADOS + write_vals[key] = dentry_bl; + consumed_inos->insert(rb.ino); + } + } + + std::set<std::string> null_vals; + for (const auto& nb : lump.get_dnull()) { + std::string key; + dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn + << dendl; + + auto it = read_vals.find(key); + if (it != read_vals.end()) { + dout(4) << "dentry exists, will remove" << dendl; + + auto q = it->second.cbegin(); + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + bool remove_dentry = false; + if (dentry_type == 'L' || dentry_type == 'l') { + dout(10) << "Existing hardlink inode in slot to be (maybe) removed " + << "by null journal dn '" << nb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + remove_dentry = old_fnode_version < lump.fnode->version; + } else if (dentry_type == 'I' || dentry_type == 'i') { + dout(10) << "Existing full inode in slot to be (maybe) removed " + << "by null journal dn '" << nb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + remove_dentry = old_fnode_version < lump.fnode->version; + } else { + dout(4) << "corrupt dentry in backing store, will remove" << dendl; + remove_dentry = true; + } + + if (remove_dentry) + null_vals.insert(key); + } + } + + // Write back any new/changed dentries + if (!write_vals.empty()) { + r = output.omap_set(frag_oid.name, write_vals); + if (r != 0) { + derr << "error writing dentries to " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + // remove any null dentries + if (!null_vals.empty()) { + r = output.omap_rm_keys(frag_oid.name, null_vals); + if (r != 0) { + derr << "error removing dentries from " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + /* Now that we've looked at the dirlumps, we finally pay attention to + * the roots (i.e. inodes without ancestry). This is necessary in order + * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally + * important because clients use them to infer completeness + * of directories + */ + for (const auto& fb : metablob.roots) { + inodeno_t ino = fb.inode->ino; + dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; + + object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); + dout(4) << "object id " << root_oid.name << dendl; + + bool write_root_ino = false; + bufferlist old_root_ino_bl; + r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); + if (r == -ENOENT) { + dout(4) << "root does not exist, will create" << dendl; + write_root_ino = true; + } else if (r >= 0) { + r = 0; + InodeStore old_inode; + dout(4) << "root exists, will modify (" << old_root_ino_bl.length() + << ")" << dendl; + auto inode_bl_iter = old_root_ino_bl.cbegin(); + std::string magic; + decode(magic, inode_bl_iter); + if (magic == CEPH_FS_ONDISK_MAGIC) { + dout(4) << "magic ok" << dendl; + old_inode.decode(inode_bl_iter); + + if (old_inode.inode->version < fb.inode->version) { + write_root_ino = true; + } + } else { + dout(4) << "magic bad: '" << magic << "'" << dendl; + write_root_ino = true; + } + } else { + derr << "error reading root inode object " << root_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (write_root_ino && !dry_run) { + dout(4) << "writing root ino " << root_oid.name + << " version " << fb.inode->version << dendl; + + // Compose: root ino format is magic,InodeStore(bare=false) + bufferlist new_root_ino_bl; + encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); + encode_fullbit_as_inode(fb, false, &new_root_ino_bl); + + // Write to RADOS + r = output.write_full(root_oid.name, new_root_ino_bl); + if (r != 0) { + derr << "error writing inode object " << root_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + return r; +} + + +/** + * Erase a region of the log by overwriting it with ENoOp + * + */ +int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length) +{ + // To erase this region, we use our preamble, the encoding overhead + // of an ENoOp, and our trailing start ptr. Calculate how much padding + // is needed inside the ENoOp to make up the difference. + bufferlist tmp; + if (type == "mdlog") { + ENoOp enoop(0); + enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else if (type == "purge_queue") { + PurgeItem pi; + pi.encode(tmp); + } + + dout(4) << "erase_region " << pos << " len=" << length << dendl; + + // FIXME: get the preamble/postamble length via JournalStream + int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); + dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; + + if (padding < 0) { + derr << "Erase region " << length << " too short" << dendl; + return -EINVAL; + } + + bufferlist entry; + if (type == "mdlog") { + // Serialize an ENoOp with the correct amount of padding + ENoOp enoop(padding); + enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else if (type == "purge_queue") { + PurgeItem pi; + pi.pad_size = padding; + pi.encode(entry); + } + JournalStream stream(JOURNAL_FORMAT_RESILIENT); + // Serialize region of log stream + bufferlist log_data; + stream.write(entry, &log_data, pos); + + dout(4) << "erase_region data length " << log_data.length() << dendl; + ceph_assert(log_data.length() == length); + + // Write log stream region to RADOS + // FIXME: get object size somewhere common to scan_events + uint32_t object_size = g_conf()->mds_log_segment_size; + if (object_size == 0) { + // Default layout object size + object_size = file_layout_t::get_default().object_size; + } + + uint64_t write_offset = pos; + uint64_t obj_offset = (pos / object_size); + int r = 0; + while(log_data.length()) { + std::string const oid = js.obj_name(obj_offset); + uint32_t offset_in_obj = write_offset % object_size; + uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); + + r = output.write(oid, log_data, write_len, offset_in_obj); + if (r < 0) { + return r; + } else { + dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; + r = 0; + } + + log_data.splice(0, write_len); + write_offset += write_len; + obj_offset++; + } + + return r; +} + +/** + * Given an EMetaBlob::fullbit containing an inode, write out + * the encoded inode in the format used by InodeStore (i.e. the + * backing store format) + * + * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use + * on an offline InodeStore instance. It's way simpler, because we are just + * uncritically hauling the data between structs. + * + * @param fb a fullbit extracted from a journal entry + * @param bare if true, leave out [EN|DE]CODE_START decoration + * @param out_bl output, write serialized inode to this bufferlist + */ +void JournalTool::encode_fullbit_as_inode( + const EMetaBlob::fullbit &fb, + const bool bare, + bufferlist *out_bl) +{ + ceph_assert(out_bl != NULL); + + // Compose InodeStore + InodeStore new_inode; + new_inode.inode = fb.inode; + new_inode.xattrs = fb.xattrs; + new_inode.dirfragtree = fb.dirfragtree; + new_inode.snap_blob = fb.snapbl; + new_inode.symlink = fb.symlink; + new_inode.old_inodes = fb.old_inodes; + + // Serialize InodeStore + if (bare) { + new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else { + new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } +} + +/** + * Given a list of inode numbers known to be in use by + * inodes in the backing store, ensure that none of these + * numbers are listed as free in the InoTables in the + * backing store. + * + * Used after injecting inodes into the backing store, to + * ensure that the same inode numbers are not subsequently + * used for new files during ordinary operation. + * + * @param inos list of inode numbers to be removed from + * free lists in InoTables + * @returns 0 on success, else negative error code + */ +int JournalTool::consume_inos(const std::set<inodeno_t> &inos) +{ + int r = 0; + + // InoTable is a per-MDS structure, so iterate over assigned ranks + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + std::set<mds_rank_t> in_ranks; + fs->mds_map.get_mds_set(in_ranks); + + for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin(); + rank_i != in_ranks.end(); ++rank_i) + { + // Compose object name + std::ostringstream oss; + oss << "mds" << *rank_i << "_inotable"; + object_t inotable_oid = object_t(oss.str()); + + // Read object + bufferlist inotable_bl; + int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); + if (read_r < 0) { + // Things are really bad if we can't read inotable. Beyond our powers. + derr << "unable to read inotable '" << inotable_oid.name << "': " + << cpp_strerror(read_r) << dendl; + r = r ? r : read_r; + continue; + } + + // Deserialize InoTable + version_t inotable_ver; + auto q = inotable_bl.cbegin(); + decode(inotable_ver, q); + InoTable ino_table(NULL); + ino_table.decode(q); + + // Update InoTable in memory + bool inotable_modified = false; + for (std::set<inodeno_t>::iterator i = inos.begin(); + i != inos.end(); ++i) + { + const inodeno_t ino = *i; + if (ino_table.force_consume(ino)) { + dout(4) << "Used ino 0x" << std::hex << ino << std::dec + << " requires inotable update" << dendl; + inotable_modified = true; + } + } + + // Serialize and write InoTable + if (inotable_modified) { + inotable_ver += 1; + dout(4) << "writing modified inotable version " << inotable_ver << dendl; + bufferlist inotable_new_bl; + encode(inotable_ver, inotable_new_bl); + ino_table.encode_state(inotable_new_bl); + int write_r = output.write_full(inotable_oid.name, inotable_new_bl); + if (write_r != 0) { + derr << "error writing modified inotable " << inotable_oid.name + << ": " << cpp_strerror(write_r) << dendl; + r = r ? r : read_r; + continue; + } + } + } + + return r; +} + diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h new file mode 100644 index 000000000..8d610a866 --- /dev/null +++ b/src/tools/cephfs/JournalTool.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "MDSUtility.h" +#include "RoleSelector.h" +#include <vector> + +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/events/EMetaBlob.h" + +#include "include/rados/librados.hpp" + +#include "JournalFilter.h" + +class JournalScanner; + + +/** + * Command line tool for investigating and repairing filesystems + * with damaged metadata logs + */ +class JournalTool : public MDSUtility +{ + private: + MDSRoleSelector role_selector; + // Bit hacky, use this `rank` member to control behaviour of the + // various main_ functions. + mds_rank_t rank; + // when set, generate per rank dump file path + bool all_ranks = false; + + std::string type; + + // Entry points + int main_journal(std::vector<const char*> &argv); + int main_header(std::vector<const char*> &argv); + int main_event(std::vector<const char*> &argv); + + // Shared functionality + int recover_journal(); + + // Journal operations + int journal_inspect(); + int journal_export(std::string const &path, bool import, bool force); + int journal_reset(bool hard); + + // Header operations + int header_set(); + + // I/O handles + librados::Rados rados; + librados::IoCtx input; + librados::IoCtx output; + + bool other_pool; + + // Metadata backing store manipulation + int read_lost_found(std::set<std::string> &lost); + int recover_dentries( + EMetaBlob const &metablob, + bool const dry_run, + std::set<inodeno_t> *consumed_inos); + + // Splicing + int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length); + + // Backing store helpers + void encode_fullbit_as_inode( + const EMetaBlob::fullbit &fb, + const bool bare, + bufferlist *out_bl); + int consume_inos(const std::set<inodeno_t> &inos); + + //validate type + int validate_type(const std::string &type); + + // generate output file path for dump/export + std::string gen_dump_file_path(const std::string &prefix); + + // check if an operation (mode, command) is safe to be + // executed on all ranks. + bool can_execute_for_all_ranks(const std::string &mode, + const std::string &command); + public: + static void usage(); + JournalTool() : + rank(0), other_pool(false) {} + int main(std::vector<const char*> &argv); +}; + diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc new file mode 100644 index 000000000..54386d219 --- /dev/null +++ b/src/tools/cephfs/MDSUtility.cc @@ -0,0 +1,155 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "MDSUtility.h" +#include "mon/MonClient.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + + +MDSUtility::MDSUtility() : + Dispatcher(g_ceph_context), + objecter(NULL), + finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"), + waiting_for_mds_map(NULL), + inited(false) +{ + monc = new MonClient(g_ceph_context, poolctx); + messenger = Messenger::create_client_messenger(g_ceph_context, "mds"); + fsmap = new FSMap(); + objecter = new Objecter(g_ceph_context, messenger, monc, poolctx); +} + + +MDSUtility::~MDSUtility() +{ + if (inited) { + shutdown(); + } + delete objecter; + delete monc; + delete messenger; + delete fsmap; + ceph_assert(waiting_for_mds_map == NULL); +} + + +int MDSUtility::init() +{ + // Initialize Messenger + poolctx.start(1); + messenger->start(); + + objecter->set_client_incarnation(0); + objecter->init(); + + // Connect dispatchers before starting objecter + messenger->add_dispatcher_tail(objecter); + messenger->add_dispatcher_tail(this); + + // Initialize MonClient + if (monc->build_initial_monmap() < 0) { + objecter->shutdown(); + messenger->shutdown(); + messenger->wait(); + return -1; + } + + monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS); + monc->set_messenger(messenger); + monc->init(); + int r = monc->authenticate(); + if (r < 0) { + derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl; + monc->shutdown(); + objecter->shutdown(); + messenger->shutdown(); + messenger->wait(); + return r; + } + + client_t whoami = monc->get_global_id(); + messenger->set_myname(entity_name_t::CLIENT(whoami.v)); + + // Start Objecter and wait for OSD map + objecter->start(); + objecter->wait_for_osd_map(); + + // Prepare to receive MDS map and request it + ceph::mutex init_lock = ceph::make_mutex("MDSUtility:init"); + ceph::condition_variable cond; + bool done = false; + ceph_assert(!fsmap->get_epoch()); + lock.lock(); + waiting_for_mds_map = new C_SafeCond(init_lock, cond, &done, NULL); + lock.unlock(); + monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME); + monc->renew_subs(); + + // Wait for MDS map + dout(4) << "waiting for MDS map..." << dendl; + { + std::unique_lock locker{init_lock}; + cond.wait(locker, [&done] { return done; }); + } + dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl; + + finisher.start(); + + inited = true; + return 0; +} + + +void MDSUtility::shutdown() +{ + finisher.stop(); + + lock.lock(); + objecter->shutdown(); + lock.unlock(); + monc->shutdown(); + messenger->shutdown(); + messenger->wait(); + poolctx.finish(); +} + + +bool MDSUtility::ms_dispatch(Message *m) +{ + std::lock_guard locker{lock}; + switch (m->get_type()) { + case CEPH_MSG_FS_MAP: + handle_fs_map((MFSMap*)m); + break; + case CEPH_MSG_OSD_MAP: + break; + default: + return false; + } + m->put(); + return true; +} + + +void MDSUtility::handle_fs_map(MFSMap* m) +{ + *fsmap = m->get_fsmap(); + if (waiting_for_mds_map) { + waiting_for_mds_map->complete(0); + waiting_for_mds_map = NULL; + } +} + + diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h new file mode 100644 index 000000000..09f1918ba --- /dev/null +++ b/src/tools/cephfs/MDSUtility.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MDS_UTILITY_H_ +#define MDS_UTILITY_H_ + +#include "osdc/Objecter.h" +#include "mds/FSMap.h" +#include "messages/MFSMap.h" +#include "msg/Dispatcher.h" +#include "msg/Messenger.h" +#include "auth/Auth.h" +#include "common/async/context_pool.h" +#include "common/Finisher.h" +#include "common/Timer.h" + +/// MDS Utility +/** + * This class is the parent for MDS utilities, i.e. classes that + * need access the objects belonging to the MDS without actually + * acting as an MDS daemon themselves. + */ +class MDSUtility : public Dispatcher { +protected: + Objecter *objecter; + FSMap *fsmap; + Messenger *messenger; + MonClient *monc; + + ceph::mutex lock = ceph::make_mutex("MDSUtility::lock"); + Finisher finisher; + ceph::async::io_context_pool poolctx; + + Context *waiting_for_mds_map; + + bool inited; +public: + MDSUtility(); + ~MDSUtility() override; + + void handle_fs_map(MFSMap* m); + bool ms_dispatch(Message *m) override; + bool ms_handle_reset(Connection *con) override { return false; } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { return false; } + int init(); + void shutdown(); +}; + +#endif /* MDS_UTILITY_H_ */ diff --git a/src/tools/cephfs/MetaTool.cc b/src/tools/cephfs/MetaTool.cc new file mode 100644 index 000000000..527ae636f --- /dev/null +++ b/src/tools/cephfs/MetaTool.cc @@ -0,0 +1,999 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <string.h> +#include <map> +#include <sstream> +#include <fstream> + +#include "include/types.h" +#include "common/Formatter.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/InoTable.h" +#include "mds/CDentry.h" + +#include "mds/events/ENoOp.h" +#include "mds/events/EUpdate.h" + +#include "mds/JournalPointer.h" +// #include "JournalScanner.h" +// #include "EventOutput.h" +// #include "Dumper.h" +// #include "Resetter.h" + +// #include "JournalTool.h" +#include "MetaTool.h" +#include "type_helper.hpp" +#include "include/object.h" + +WRITE_RAW_ENCODER(char) +WRITE_RAW_ENCODER(unsigned char) + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + + +void MetaTool::meta_op::release() +{ + for (const auto& i : inodes) { + delete i.second; + } + + while (!sub_ops.empty()) { + delete sub_ops.top(); + sub_ops.pop(); + } +} + +void MetaTool::inode_meta_t::decode_json(JSONObj *obj) +{ + unsigned long long tmp; + JSONDecoder::decode_json("snapid_t", tmp, obj, true); + _f.val = tmp; + JSONDecoder::decode_json("itype", tmp, obj, true); + _t = tmp; + if (NULL == _i) + _i = new InodeStore; + JSONDecoder::decode_json("store", *_i, obj, true); +} + +void MetaTool::usage() +{ + generic_client_usage(); +} + +int MetaTool::main(string& mode, + string& rank_str, + string& minfo, + string&ino, + string& out, + string& in, + bool confirm + ) +{ + int r = 0; + + std::string manual_meta_pool; + std::string manual_data_pool; + std::string manual_rank_num; + bool manual_mode = false; + if (minfo != "") { + vector<string> v; + string_split(minfo, v); + manual_meta_pool = v.size() >= 1 ? v[0] : ""; + manual_data_pool = v.size() >= 2 ? v[1] : ""; + manual_rank_num = v.size() >= 3 ? v[2] : ""; + std::cout << "("<< minfo<< ")=>" + << " mpool: " << manual_meta_pool + << " dpool: " << manual_data_pool + << " rank: " << manual_rank_num + << std::endl; + if (!manual_meta_pool.empty() && !manual_data_pool.empty() && !manual_rank_num.empty()) { + std::cout << "you specify rank: " << manual_rank_num + << " mpool: " << manual_meta_pool + << " dpool: " << manual_data_pool + << "\nstart manual mode!!"<< std::endl; + manual_mode = true; + } + } + + // RADOS init + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + cerr << "RADOS unavailable" << std::endl; + return r; + } + + if (_debug) + cout << "MetaTool: connecting to RADOS..." << std::endl; + r = rados.connect(); + if (r < 0) { + cerr << "couldn't connect to cluster: " << cpp_strerror(r) << std::endl; + return r; + } + + if (!manual_mode) { + r = role_selector.parse(*fsmap, rank_str); + if (r != 0) { + cerr << "Couldn't determine MDS rank." << std::endl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + assert(fs != nullptr); + + // prepare io for meta pool + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + features = fs->mds_map.get_up_features(); + if (features == 0) + features = CEPH_FEATURES_SUPPORTED_DEFAULT; + else if (features != CEPH_FEATURES_SUPPORTED_DEFAULT) { + cout << "I think we need to check the feature! : " << features << std::endl; + return -1; + } + + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl; + return r; + } + + if (_debug) + cout << "MetaTool: creating IoCtx.." << std::endl; + r = rados.ioctx_create(pool_name.c_str(), io_meta); + assert(r == 0); + output.dup(io_meta); + + // prepare io for data pool + for (const auto p : fs->mds_map.get_data_pools()) { + r = rados.pool_reverse_lookup(p, &pool_name); + if (r < 0) { + cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl; + return r; + } + librados::IoCtx* io_data = new librados::IoCtx; + r = rados.ioctx_create(pool_name.c_str(), *io_data); + assert(r == 0); + io_data_v.push_back(io_data); + } + + for (auto role : role_selector.get_roles()) { + rank = role.rank; + + r = process(mode, ino, out, in, confirm); + cout << "executing for rank " << rank << " op[" <<mode<< "] ret : " << r << std::endl; + } + + } else { + features = CEPH_FEATURES_SUPPORTED_DEFAULT; + r = rados.ioctx_create(manual_meta_pool.c_str(), io_meta); + assert(r == 0); + + librados::IoCtx* io_data = new librados::IoCtx; + r = rados.ioctx_create(manual_data_pool.c_str(), *io_data); + assert(r == 0); + io_data_v.push_back(io_data); + + + rank = conv_t<int>(manual_rank_num); + r = process(mode, ino, out, in, confirm); + cout << "op[" << mode << "] ret : " << r << std::endl; + } + return r; +} + +int MetaTool::process(string& mode, string& ino, string out, string in, bool confirm) +{ + if (mode == "showm") { + return show_meta_info(ino, out); + } else if (mode == "showfn") { + return show_fnode(ino, out); + } else if (mode == "listc") { + return list_meta_info(ino, out); + } else if (mode == "amend") { + return amend_meta_info(ino, in, confirm); + } else if (mode == "amendfn") { + return amend_fnode(in, confirm); + } else { + cerr << "bad command '" << mode << "'" << std::endl; + return -EINVAL; + } +} +int MetaTool::show_fnode(string& ino, string& out) +{ + if (ino != "0") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, out); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_SHOW_FN; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} +int MetaTool::amend_fnode(string& in, bool confirm) +{ + meta_op op(_debug, "", in, confirm); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_AMEND_FN; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = 0; + op.push_op(nsop); + return op_process(op); +} +int MetaTool::amend_meta_info(string& ino, string& in, bool confirm) +{ + if (ino != "0" && in != "") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, "", in, confirm); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_AMEND; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} +int MetaTool::list_meta_info(string& ino, string& out) +{ + if (ino != "0") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, out); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} +int MetaTool::show_meta_info(string& ino, string& out) +{ + if (ino != "0") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, out); + + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_SHOW; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} + +int MetaTool::op_process(meta_op& op) +{ + int r = 0; + while (!op.no_sops()) { + if (_debug) + std::cout << "process : " << op.top_op()->detail() << std::endl; + switch(op.top_op()->sub_op_t) { + case meta_op::OP_LIST: + r = list_meta(op); + break; + case meta_op::OP_LTRACE: + r = file_meta(op); + break; + case meta_op::OP_SHOW: + r = show_meta(op); + break; + case meta_op::OP_AMEND: + r = amend_meta(op); + break; + case meta_op::OP_SHOW_FN: + r = show_fn(op); + break; + case meta_op::OP_AMEND_FN: + r = amend_fn(op); + break; + default: + cerr << "unknow op" << std::endl; + } + if (r == 0) + op.pop_op(); + else if (r < 0) + op.clear_sops(); + } + op.release(); + return r; +} + +int MetaTool::amend_meta(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + auto item = op.inodes.find(sop->ino); + auto item_k = op.okeys.find(sop->ino); + if (item != op.inodes.end() && item_k != op.okeys.end()) { + if (_amend_meta(item_k->second, *(item->second), op.infile(), op) < 0) + return -1; + } else { + if (op.inodes.empty()) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = 0; + nsop->ino_c = sop->ino; + op.push_op(nsop); + return 1; + } else { + return -1; + } + } + return 0; +} + +void MetaTool::inode_meta_t::encode(::ceph::bufferlist& bl, uint64_t features) +{ + ::encode(_f, bl); + ::encode(_t, bl); + _i->encode_bare(bl, features); +} +int MetaTool::_amend_meta(string& k, inode_meta_t& inode_meta, const string& fn, meta_op& op) +{ + JSONParser parser; + if (!parser.parse(fn.c_str())) { + cout << "Error parsing create user response" << std::endl; + return -1; + } + + try { + inode_meta.decode_json(&parser); + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -1; + } + + if (!op.confirm_chg() || op.is_debug()) { + cout << "you will amend info of inode ==>: " << std::endl; + _show_meta(inode_meta, ""); + } + + if (!op.confirm_chg()) { + cout << "warning: this operation is irreversibl!!!\n" + << " You must confirm that all logs of mds have been flushed!!!\n" + << " if you want amend it, please add --yes-i-really-really-mean-it!!!" + << std::endl; + return -1; + } + + bufferlist bl; + inode_meta.encode(bl, features); + map<string, bufferlist> to_set; + to_set[k].swap(bl); + inode_backpointer_t bp; + if (!op.top_op()->get_ancestor(bp)) + return -1; + frag_t frag; + auto item = op.inodes.find(bp.dirino); + if (item != op.inodes.end()) { + frag = item->second->get_meta()->pick_dirfrag(bp.dname); + } + string oid = obj_name(bp.dirino, frag); + int ret = io_meta.omap_set(oid, to_set); + to_set.clear(); + return ret; +} +int MetaTool::show_fn(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + auto item = op.inodes.find(sop->ino); + if (item != op.inodes.end()) { + if (_show_fn(*(item->second), op.outfile()) < 0) + return -1; + } else { + if (op.inodes.empty()) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = 0; + nsop->ino_c = sop->ino; + op.push_op(nsop); + return 1; + } else + return -1; + } + return 0; +} +int MetaTool::_show_fn(inode_meta_t& inode_meta, const string& fn) +{ + std::list<frag_t> frags; + inode_meta.get_meta()->dirfragtree.get_leaves(frags); + std::stringstream ds; + std::string format = "json"; + std::string oids; + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->open_object_section("fnodes"); + for (const auto &frag : frags) { + bufferlist hbl; + string oid = obj_name(inode_meta.get_meta()->inode->ino, frag); + int ret = io_meta.omap_get_header(oid, &hbl); + if (ret < 0) { + std::cerr << __func__ << " : can't find oid("<< oid << ")" << std::endl; + return -1; + } + { + fnode_t got_fnode; + try { + auto p = hbl.cbegin(); + ::decode(got_fnode, p); + } catch (const buffer::error &err) { + cerr << "corrupt fnode header in " << oid + << ": " << err.what() << std::endl; + return -1; + } + if (!oids.empty()) + oids += ","; + oids += oid; + f->open_object_section(oid.c_str()); + got_fnode.dump(f); + f->close_section(); + } + } + f->dump_string("oids", oids.c_str()); + f->close_section(); + f->flush(ds); + if (fn != "") { + ofstream o; + o.open(fn); + if (o) { + o << ds.str(); + o.close(); + } else { + cout << "out to file (" << fn << ") failed" << std::endl; + cout << ds.str() << std::endl; + } + } else + std::cout << ds.str() << std::endl; + return 0; +} +int MetaTool::amend_fn(meta_op &op) +{ + if (_amend_fn(op.infile(), op.confirm_chg()) < 0) + return -1; + return 0; +} +int MetaTool::_amend_fn(const string& fn, bool confirm) +{ + JSONParser parser; + if (!parser.parse(fn.c_str())) { + cout << "Error parsing create user response : " << fn << std::endl; + return -1; + } + if (!confirm) { + cout << "warning: this operation is irreversibl!!!\n" + << " You must confirm that all logs of mds have been flushed!!!\n" + << " if you want amend it, please add --yes-i-really-really-mean-it!!!" + << std::endl; + return -1; + } + try { + string tmp; + JSONDecoder::decode_json("oids", tmp, &parser, true); + string::size_type pos1, pos2; + vector<string> v; + string c = ","; + pos2 = tmp.find(c); + pos1 = 0; + while (string::npos != pos2) { + v.push_back(tmp.substr(pos1, pos2-pos1)); + pos1 = pos2 + c.size(); + pos2 = tmp.find(c, pos1); + } + if (pos1 != tmp.length()) + v.push_back(tmp.substr(pos1)); + int ret = 0; + for (auto i : v) { + cout << "amend frag : " << i << "..." << std::endl; + fnode_t fnode; + JSONDecoder::decode_json(i.c_str(), fnode, &parser, true); + bufferlist bl; + fnode.encode(bl); + ret = io_meta.omap_set_header(i, bl); + if (ret < 0) + return ret; + } + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -1; + } + return 0; +} +int MetaTool::show_meta(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + auto item = op.inodes.find(sop->ino); + if (item != op.inodes.end()) { + if (_show_meta(*(item->second), op.outfile()) < 0) + return -1; + } else { + if (op.inodes.empty()) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = 0; + nsop->ino_c = sop->ino; + op.push_op(nsop); + return 1; + } else { + return -1; + } + } + return 0; +} +int MetaTool::_show_meta(inode_meta_t& inode_meta, const string& fn) +{ + std::stringstream ds; + std::string format = "json"; + InodeStore& inode_data = *inode_meta.get_meta(); + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->open_object_section("meta"); + f->dump_unsigned("snapid_t", inode_meta.get_snapid()); + f->dump_unsigned("itype", inode_meta.get_type()); + f->open_object_section("store"); + inode_data.dump(f); + try { + if (inode_data.snap_blob.length()) { + sr_t srnode; + auto p = inode_data.snap_blob.cbegin(); + decode(srnode, p); + f->open_object_section("snap_blob"); + srnode.dump(f); + f->close_section(); + } + } catch (const buffer::error &err) { + cerr << "corrupt decode in snap_blob" + << ": " << err.what() << std::endl; + return -1; + } + + f->close_section(); + f->close_section(); + f->flush(ds); + + if (fn != "") { + ofstream o; + o.open(fn); + if (o) { + o << ds.str(); + o.close(); + } else { + cout << "out to file (" << fn << ") failed" << std::endl; + cout << ds.str() << std::endl; + } + + } else + std::cout << ds.str() << std::endl; + return 0; +} +int MetaTool::list_meta(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + + bool list_all = false; + string oid; + inodeno_t ino = sop->ino_c; + frag_t frag = sop->frag; + + if (sop->ino_c == 0) { + list_all = true; + oid = obj_name(sop->ino, frag); + } else { + if (_debug) + std::cout << __func__ << " : " << sop->trace_level << " " << op.ancestors.size() << std::endl; + inode_backpointer_t bp; + if (sop->get_c_ancestor(bp)) { + auto item = op.inodes.find(bp.dirino); + if (item != op.inodes.end()) { + frag = item->second->get_meta()->pick_dirfrag(bp.dname); + } + oid = obj_name(bp.dirino, frag); + } else { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino = sop->ino_c; + nsop->sub_op_t = meta_op::OP_LTRACE; + nsop->sub_ino_t = meta_op::INO_DIR; + op.push_op(nsop); + return 1; + } + } + if (_debug) + std::cout << __func__ << " : " << string(list_all?"listall ":"info ") << oid << " "<< ino << std::endl; + bufferlist hbl; + int ret = io_meta.omap_get_header(oid, &hbl); + if (ret < 0) { + std::cerr << __func__ << " : can't find it, maybe it (ino:"<< sop->ino<< ")isn't a normal dir!" << std::endl; + return -1; + } + + if (hbl.length() == 0) { // obj has splite + if (list_all) { + if (frag == frag_t()) { + auto item = op.inodes.find(sop->ino); + if (item != op.inodes.end()) { + inodeno_t tmp = sop->ino; + op.pop_op(); + std::list<frag_t> frags; + item->second->get_meta()->dirfragtree.get_leaves(frags); + for (const auto &frag : frags) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino = tmp; + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->frag = frag; + op.push_op(nsop); + } + } else { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino_c = sop->ino; + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + op.push_op(nsop); + } + return 1; + } else { + cerr << __func__ << " missing some data (" << oid << ")???" << std::endl; + return -1; + } + } else { + if (frag == frag_t()) { + inode_backpointer_t bp; + if (sop->get_c_ancestor(bp)) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino_c = bp.dirino; + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = sop->trace_level + 1; + op.push_op(nsop); + return 1; + } else { + cerr << __func__ << "can't find obj(" << oid << ") ,miss ancestors or miss some objs??? " << std::endl; + return -1; + } + } else { + cerr << __func__ << "missing some objs(" << oid << ")??? " << std::endl; + return -1; + } + } + } + + fnode_t got_fnode; + try { + auto p = hbl.cbegin(); + ::decode(got_fnode, p); + } catch (const buffer::error &err) { + cerr << "corrupt fnode header in " << oid + << ": " << err.what() << std::endl; + return -1; + } + + if (_debug) { + std::string format = "json"; + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->dump_string("type", "--fnode--"); + f->open_object_section("fnode"); + got_fnode.dump(f); + f->close_section(); + f->flush(std::cout); + std::cout << std::endl; + } + + // print children + std::map<string, bufferlist> out_vals; + int max_vals = 5; + io_meta.omap_get_vals(oid, "", max_vals, &out_vals); + + bool force_dirty = false; + const set<snapid_t> *snaps = NULL; + unsigned pos = out_vals.size() - 1; + std::string last_dname; + for (map<string, bufferlist>::iterator p = out_vals.begin(); + p != out_vals.end(); + ++p, --pos) { + string dname; + snapid_t last; + dentry_key_t::decode_helper(p->first, dname, last); + if (_debug) + last_dname = dname; + try { + if (!list_all) { + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty, ino, &op) == 1) { + return 0; + } + } else { + cout << "dname : " << dname << " " << last << std::endl; + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty) == 1) + return 0; + } + } catch (const buffer::error &err) { + derr << "Corrupt dentry '" << dname << "' : " + << err.what() << "(" << "" << ")" << dendl; + return -1; + } + } + while (out_vals.size() == (size_t)max_vals) { + out_vals.clear(); + io_meta.omap_get_vals(oid, last_dname, max_vals, &out_vals); + pos = out_vals.size() - 1; + for (map<string, bufferlist>::iterator p = (++out_vals.begin()); + p != out_vals.end(); + ++p, --pos) { + string dname; + snapid_t last; + dentry_key_t::decode_helper(p->first, dname, last); + last_dname = dname; + try { + if (!list_all) { + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty, ino, &op) == 1) { + return 0; + } + } else { + cout << "dname : " << dname << " " << last << std::endl; + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty) == 1) + return 0; + } + } catch (const buffer::error &err) { + derr << "Corrupt dentry '" << dname << "' : " + << err.what() << "(" << "" << ")" << dendl; + return -1; + } + } + } + + if (!list_all) { + cerr << __func__ << "miss obj(ino:" << ino << ")??? " << std::endl; + return -1; + } + return 0; +} + +int MetaTool::file_meta(meta_op &op) +{ + int r = 0; + if (op.top_op()->sub_ino_t == meta_op::INO_DIR) { + r = _file_meta(op, io_meta); + } else if (op.top_op()->sub_ino_t == meta_op::INO_F) { + for (auto i = io_data_v.begin(); i != io_data_v.end(); ++i) + if ((r = _file_meta(op, **i)) == 1) + break; + } + if (r == 1) { + inode_backpointer_t bp; + if (op.top_op()->get_ancestor(bp)) { + return 0; + } else { + std::cerr << "no trace for obj (ino:" << op.top_op()->ino <<")??" << std::endl; + return -1; + } + } else if (op.top_op()->sub_ino_t == meta_op::INO_DIR) { + std::cerr << "\tmaybe it's a file(ino:" << op.top_op()->ino << ")" << std::endl; + op.top_op()->sub_ino_t = meta_op::INO_F; + return 1; + } + + std::cerr << "can't get (ino:" << op.top_op()->ino <<")trace??" << std::endl; + return -1; +} + +int MetaTool::_file_meta(meta_op &op, librados::IoCtx& io) +{ + inodeno_t ino = op.top_op()->ino; + std::string oid = obj_name(ino); + bufferlist pointer_bl; + std::map<std::string, bufferlist> attrset; + int r = 0; + bool have_data = false; + r = io.getxattrs (oid.c_str(), attrset); + if (0 == r) { + std::stringstream ds; + std::string format = "json"; + Formatter* f = Formatter::create(format); + auto item = attrset.find("parent"); + if (item != attrset.end()) { + inode_backtrace_t i_bt; + try { + bufferlist::const_iterator q = item->second.cbegin(); + i_bt.decode(q); + f->open_array_section("info"); + have_data = true; + if (i_bt.ancestors.size() > 0) + op.ancestors[ino] = i_bt.ancestors[0]; + f->dump_string("type", "--i_bt--"); + f->open_object_section("parent"); + i_bt.dump(f); + f->close_section(); + } catch (buffer::error &e) { + cerr << "failed to decode parent of " << oid << std::endl; + return -1; + } + } else { + cerr << oid << " in " << io.get_pool_name() << " , but no parent" << std::endl; + return -1; + } + + item = attrset.find("layout"); + if (item != attrset.end()) { + file_layout_t layout; + try { + auto q = item->second.cbegin(); + layout.decode(q); + f->dump_string("type", "--layout--"); + f->open_object_section("layout"); + layout.dump(f); + f->close_section(); + + } catch (buffer::error &e) { + cerr << "failed to decode layout of " << oid << std::endl; + return -1; + } + } else { + cerr << oid << " in " << io.get_pool_name() << " , but no layout" << std::endl; + } + if (have_data) { + f->close_section(); + f->flush(ds); + if (_debug) + cout << ino << " : "<< ds.str() << std::endl; + return 1; + } + } + return 0; +} +std::string MetaTool::obj_name(inodeno_t ino, uint64_t offset, const char *suffix) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)offset, suffix ? suffix : ""); + return std::string(name); +} +std::string MetaTool::obj_name(inodeno_t ino, frag_t fg, const char *suffix) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)fg, suffix ? suffix : ""); + return std::string(name); +} + +std::string MetaTool::obj_name(const char* ino, uint64_t offset, const char *suffix) const +{ + char name[60]; + snprintf(name, sizeof(name), "%s.%08llx%s", ino, (long long unsigned)offset, suffix ? suffix : ""); + std::string out = name; + transform(out.begin(), out.end(), out.begin(),::tolower); + return out; +} + +int MetaTool::show_child(std::string_view key, + std::string_view dname, + const snapid_t last, + bufferlist &bl, + const int pos, + const std::set<snapid_t> *snaps, + bool *force_dirty, + inodeno_t sp_ino, + meta_op* op) +{ + bufferlist::const_iterator q = bl.cbegin(); + + snapid_t first; + ::decode(first, q); + + // marker + char type; + ::decode(type, q); + + if (_debug) + std::cout << pos << " type '" << type << "' dname '" << dname + << " [" << first << "," << last << "]" + << std::endl; + // bool stale = false; + if (snaps && last != CEPH_NOSNAP) { + derr << "!!!! erro !!!!" << dendl; + return -1; + } + + // CDentry *dn = NULL; + // look for existing dentry for _last_ snap, can't process snap of obj + //if *(stale) + // dn = lookup_exact_snap(dname, last); + //else + // dn = lookup(dname, last); + if (type == 'L' || type == 'l') { + // hard link + inodeno_t ino; + unsigned char d_type; + mempool::mds_co::string alternate_name; + + CDentry::decode_remote(type, ino, d_type, alternate_name, q); + + if (sp_ino > 0) { + if (sp_ino == ino) { + std::cout << "find hard link : " << ino << "," << d_type << std::endl; + return 1; + } + } + + std::cout << "hard link : " << ino << "," << d_type << std::endl; + } else if (type == 'I' || type == 'i') { + // inode + // load inode data before lookuping up or constructing CInode + InodeStore& inode_data = *(new InodeStore); + if (type == 'i') { + mempool::mds_co::string alternate_name; + + DECODE_START(2, q); + if (struct_v >= 2) + decode(alternate_name, q); + inode_data.decode(q); + DECODE_FINISH(q); + } else { + inode_data.decode_bare(q); + } + + std::stringstream ds; + std::string format = "json"; + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->open_object_section("meta"); + f->dump_unsigned("snapid_t", first); + f->dump_unsigned("itype", type); + f->open_object_section("store"); + inode_data.dump(f); + try { + if (inode_data.snap_blob.length()) { + sr_t srnode; + auto p = inode_data.snap_blob.cbegin(); + srnode.decode(p); + f->open_object_section("snap_blob"); + srnode.dump(f); + f->close_section(); + } + } catch (const buffer::error &err) { + cerr << "corrupt decode in snap_blob" + << ": " << err.what() << std::endl; + } + f->close_section(); + f->close_section(); + f->flush(ds); + + if (sp_ino > 0 && op != NULL && sp_ino == inode_data.inode->ino) { + inode_meta_t* tmp = new inode_meta_t(first, type, &inode_data); + op->inodes[inode_data.inode->ino] = tmp; + op->okeys[inode_data.inode->ino] = key.data(); + return 1; + } else { + delete &inode_data; + } + + if (sp_ino == 0) { + cout << ds.str() << std::endl; + } + } else { + std::cerr << __func__ << "unknow type : " << dname << "," << type << std::endl; + } + return 0; +} diff --git a/src/tools/cephfs/MetaTool.h b/src/tools/cephfs/MetaTool.h new file mode 100644 index 000000000..510be6552 --- /dev/null +++ b/src/tools/cephfs/MetaTool.h @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef METATOOL_H__ +#define METATOOL_H__ + +#include "MDSUtility.h" +#include "RoleSelector.h" +#include <vector> +#include <stack> +using std::stack; +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/events/EMetaBlob.h" + +#include "include/rados/librados.hpp" +#include "common/ceph_json.h" + +using ::ceph::bufferlist; +class MetaTool : public MDSUtility +{ +public: + class inode_meta_t { + public: + inode_meta_t(snapid_t f = CEPH_NOSNAP, char t = char(255), InodeStore* i = NULL): + _f(f),_t(t),_i(i) { + }; + snapid_t get_snapid() const { + return _f; + } + InodeStore* get_meta() const { + if (_t == 'I') + return _i; + else + return NULL; + } + int get_type() const { + return _t; + } + void decode_json(JSONObj *obj); + void encode(::ceph::bufferlist& bl, uint64_t features); + private: + snapid_t _f; + char _t; + InodeStore* _i; + }; +private: + class meta_op { + public: + meta_op(bool debug = false, string out = "", string in = "", bool confirm = false): + _debug(debug), + _out(out), + _in(in), + _confirm(confirm) + {} + void release(); + typedef enum { + OP_LIST = 0, + OP_LTRACE, + OP_SHOW, + OP_AMEND, + OP_SHOW_FN, + OP_AMEND_FN, + OP_NO + } op_type; + + typedef enum { + INO_DIR = 0, + INO_F + } ino_type; + + static string op_type_name(op_type& t) { + string name; + switch (t) { + case OP_LIST: + name = "list dir"; + break; + case OP_LTRACE: + name = "load trace"; + break; + case OP_SHOW: + name = "show info"; + break; + case OP_AMEND: + name = "amend info"; + break; + case OP_SHOW_FN: + name = "show fnode"; + break; + case OP_AMEND_FN: + name = "amend fnode"; + break; + case OP_NO: + name = "noop"; + break; + default: + name = "unknow op type"; + } + return name; + } + static string ino_type_name(ino_type& t) { + string name; + switch (t) { + case INO_DIR: + name = "dir"; + break; + case INO_F: + name = "file"; + break; + default: + name = "unknow file type"; + } + return name; + } + class sub_op { + public: + sub_op(meta_op* mop): + trace_level(0), + _proc(false), + _mop(mop) + {} + void print() { + std::cout << detail() << std::endl; + } + string detail() { + std::stringstream ds; + ds << " [sub_op]" << op_type_name(sub_op_t) << "|" + << ino_type_name(sub_ino_t) << "|" + << ino << "|" + << frag << "|" + << ino_c << "|" + << trace_level << "|" + << name; + return ds.str(); + } + bool get_c_ancestor(inode_backpointer_t& bp) { + if (!_mop || !ino_c) + return false; + auto item = _mop->ancestors.find(ino_c); + if (item != _mop->ancestors.end()) { + bp = item->second; + return true; + } else + return false; + } + bool get_ancestor(inode_backpointer_t& bp) { + if (!_mop || !ino) + return false; + auto item = _mop->ancestors.find(ino); + if (item != _mop->ancestors.end()) { + bp = item->second; + return true; + } else + return false; + } + op_type sub_op_t; + ino_type sub_ino_t; + inodeno_t ino; + frag_t frag; + inodeno_t ino_c; + unsigned trace_level; + std::string name; + bool _proc; + meta_op* _mop; + }; + + std::map<inodeno_t, inode_backpointer_t > ancestors; + std::map<inodeno_t, inode_meta_t* > inodes; + std::map<inodeno_t, string > okeys; + + void clear_sops() { + while(!no_sops()) + pop_op(); + } + bool no_sops() { + return sub_ops.empty(); + } + void push_op(sub_op* sop) { + if (_debug) + std::cout << "<<====" << sop->detail() << std::endl; + sub_ops.push(sop); + } + sub_op* top_op() { + return sub_ops.top(); + } + void pop_op() { + sub_op* sop = sub_ops.top(); + if (_debug) + std::cout << "====>>" << sop->detail() << std::endl;; + delete sop; + sub_ops.pop(); + } + string outfile() { + return _out; + } + string infile() { + return _in; + } + bool is_debug() { + return _debug; + } + bool confirm_chg() { + return _confirm; + } + private: + stack<sub_op*> sub_ops; + bool _debug; + string _out; + string _in; + bool _confirm; + }; + MDSRoleSelector role_selector; + mds_rank_t rank; + + // I/O handles + librados::Rados rados; + librados::IoCtx io_meta; + std::vector<librados::IoCtx*> io_data_v; + librados::IoCtx output; + bool _debug; + uint64_t features; + + std::string obj_name(inodeno_t ino, frag_t fg = frag_t(), const char *suffix = NULL) const; + std::string obj_name(inodeno_t ino, uint64_t offset, const char *suffix = NULL) const; + std::string obj_name(const char* ino, uint64_t offset, const char *suffix = NULL) const; + + // 0 : continue to find + // 1 : stop to find it + int show_child(std::string_view key, + std::string_view dname, + const snapid_t last, + bufferlist &bl, + const int pos, + const std::set<snapid_t> *snaps, + bool *force_dirty, + inodeno_t sp_ino = 0, + meta_op* op = NULL + ); + + int process(string& mode, string& ino, string out, string in, bool confirm); + int show_meta_info(string& ino, string& out); + int list_meta_info(string& ino, string& out); + int amend_meta_info(string& ino, string& in, bool confirm); + int show_fnode(string& ino, string& out); + int amend_fnode(string& in, bool confirm); + int op_process(meta_op &op); + int list_meta(meta_op &op); + int file_meta(meta_op &op); + int show_meta(meta_op &op); + int amend_meta(meta_op &op); + int show_fn(meta_op &op); + int amend_fn(meta_op &op); + public: + int _file_meta(meta_op &op, librados::IoCtx& io); + int _show_meta(inode_meta_t& i, const string& fn); + int _amend_meta(string &k, inode_meta_t& i, const string& fn, meta_op& op); + int _show_fn(inode_meta_t& i, const string& fn); + int _amend_fn(const string& fn, bool confirm); + void usage(); + MetaTool(bool debug=false): + _debug(debug) {} + ~MetaTool() {} + + int main(string& mode, + string& rank_str, + string& minfo, + string&ino, + string& out, + string& in, + bool confirm = false + ); +}; +#endif // METATOOL_H__ diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc new file mode 100644 index 000000000..2abca7223 --- /dev/null +++ b/src/tools/cephfs/PgFiles.cc @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "osdc/Striper.h" + +#include "PgFiles.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "pgeffects." << __func__ << ": " + +int PgFiles::init() +{ + int r = ceph_create_with_context(&cmount, g_ceph_context); + if (r != 0) { + return r; + } + + return ceph_init(cmount); +} + +PgFiles::PgFiles(Objecter *o, const std::set<pg_t> &pgs_) + : objecter(o), pgs(pgs_) +{ + for (const auto &i : pgs) { + pools.insert(i.m_pool); + } +} + +PgFiles::~PgFiles() +{ + ceph_release(cmount); +} + +void PgFiles::hit_dir(std::string const &path) +{ + dout(10) << "entering " << path << dendl; + + ceph_dir_result *dr = nullptr; + int r = ceph_opendir(cmount, path.c_str(), &dr); + if (r != 0) { + derr << "Failed to open path: " << cpp_strerror(r) << dendl; + return; + } + + struct dirent de; + while((r = ceph_readdir_r(cmount, dr, &de)) != 0) { + if (r < 0) { + derr << "Error reading path " << path << ": " << cpp_strerror(r) + << dendl; + ceph_closedir(cmount, dr); // best effort, ignore r + return; + } + + if (std::string(de.d_name) == "." || std::string(de.d_name) == "..") { + continue; + } + + struct ceph_statx stx; + std::string de_path = (path + std::string("/") + de.d_name); + r = ceph_statx(cmount, de_path.c_str(), &stx, + CEPH_STATX_INO|CEPH_STATX_SIZE, 0); + if (r != 0) { + derr << "Failed to stat path " << de_path << ": " + << cpp_strerror(r) << dendl; + // Don't hold up the whole process for one bad inode + continue; + } + + if (S_ISREG(stx.stx_mode)) { + hit_file(de_path, stx); + } else if (S_ISDIR(stx.stx_mode)) { + hit_dir(de_path); + } else { + dout(20) << "Skipping non reg/dir file: " << de_path << dendl; + } + } + + r = ceph_closedir(cmount, dr); + if (r != 0) { + derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl; + return; + } +} + +void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx) +{ + ceph_assert(S_ISREG(stx.stx_mode)); + + dout(20) << "Hitting file '" << path << "'" << dendl; + + int l_stripe_unit = 0; + int l_stripe_count = 0; + int l_object_size = 0; + int l_pool_id = 0; + int r = ceph_get_path_layout(cmount, path.c_str(), &l_stripe_unit, + &l_stripe_count, &l_object_size, + &l_pool_id); + if (r != 0) { + derr << "Error reading layout on " << path << ": " << cpp_strerror(r) + << dendl; + return; + } + + struct file_layout_t layout; + layout.stripe_unit = l_stripe_unit; + layout.stripe_count = l_stripe_count; + layout.object_size = l_object_size; + layout.pool_id = l_pool_id; + + // Avoid calculating PG if the layout targeted a completely different pool + if (pools.count(layout.pool_id) == 0) { + dout(20) << "Fast check missed: pool " << layout.pool_id << " not in " + "target set" << dendl; + return; + } + + auto num_objects = Striper::get_num_objects(layout, stx.stx_size); + + for (uint64_t i = 0; i < num_objects; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)stx.stx_ino, + (long long unsigned int)i); + dout(20) << " object " << std::string(buf) << dendl; + + pg_t target; + object_t oid; + object_locator_t loc; + loc.pool = layout.pool_id; + loc.key = std::string(buf); + + unsigned pg_num_mask = 0; + unsigned pg_num = 0; + + int r = 0; + objecter->with_osdmap([&r, oid, loc, &target, &pg_num_mask, &pg_num] + (const OSDMap &osd_map) { + r = osd_map.object_locator_to_pg(oid, loc, target); + if (r == 0) { + auto pool = osd_map.get_pg_pool(loc.pool); + pg_num_mask = pool->get_pg_num_mask(); + pg_num = pool->get_pg_num(); + } + }); + if (r != 0) { + // Can happen if layout pointed to pool not in osdmap, for example + continue; + } + + target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask); + + dout(20) << " target " << target << dendl; + + if (pgs.count(target)) { + std::cout << path << std::endl; + return; + } + } + +} + +int PgFiles::scan_path(std::string const &path) +{ + int r = ceph_mount(cmount, "/"); + if (r != 0) { + derr << "Failed to mount: " << cpp_strerror(r) << dendl; + return r; + } + + hit_dir(path); + + r = ceph_unmount(cmount); + if (r != 0) { + derr << "Failed to unmount: " << cpp_strerror(r) << dendl; + return r; + } + + return r; +} + diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h new file mode 100644 index 000000000..1ba4b3d28 --- /dev/null +++ b/src/tools/cephfs/PgFiles.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PG_EFFECTS_H_ +#define PG_EFFECTS_H_ + +#include "include/cephfs/libcephfs.h" +#include "osd/osd_types.h" +#include <set> +#include "osdc/Objecter.h" + +/** + * This utility scans the files (via an online MDS) and works out + * which ones rely on named PGs. For use when someone has + * some bad/damaged PGs and wants to see which files might be + * affected. + */ +class PgFiles +{ +private: + Objecter *objecter; + struct ceph_mount_info *cmount = nullptr; + + std::set<pg_t> pgs; + std::set<uint64_t> pools; + + void hit_file(std::string const &path, const struct ceph_statx &stx); + void hit_dir(std::string const &path); + + +public: + PgFiles(Objecter *o, const std::set<pg_t> &pgs_); + ~PgFiles(); + + int init(); + int scan_path(std::string const &path); +}; + +#endif + diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc new file mode 100644 index 000000000..278a48767 --- /dev/null +++ b/src/tools/cephfs/Resetter.cc @@ -0,0 +1,220 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include <memory> +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/JournalPointer.h" + +#include "mds/mdstypes.h" +#include "mds/MDCache.h" +#include "mon/MonClient.h" +#include "mds/events/EResetJournal.h" + +#include "Resetter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +int Resetter::init(mds_role_t role_, const std::string &type, bool hard) +{ + role = role_; + int r = MDSUtility::init(); + if (r < 0) { + return r; + } + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(nullptr != fs); + + is_mdlog = false; + if (type == "mdlog") { + JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); + int rt = 0; + if (hard) { + jp.front = role.rank + MDS_INO_LOG_OFFSET; + jp.back = 0; + rt = jp.save(objecter); + if (rt != 0) { + derr << "Error writing journal pointer: " << cpp_strerror(rt) << dendl; + return rt; + } + ino = jp.front; // only need to reset ino for mdlog + } else { + rt = jp.load(objecter); + if (rt != 0) { + std::cerr << "Error loading journal: " << cpp_strerror(rt) << + ", pass --force to forcibly reset this journal" << std::endl; + return rt; + } else { + ino = jp.front; + } + } + is_mdlog = true; + } else if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + role.rank; + } else { + ceph_abort(); // should not get here + } + return 0; +} + +int Resetter::reset() +{ + ceph::mutex mylock = ceph::make_mutex("Resetter::reset::lock"); + ceph::condition_variable cond; + bool done; + int r; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + Journaler journaler("resetter", ino, + fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, + objecter, 0, 0, &finisher); + { + std::lock_guard locker{lock}; + journaler.recover(new C_SafeCond(mylock, cond, &done, &r)); + } + { + std::unique_lock locker{mylock}; + cond.wait(locker, [&done] { return done; }); + } + if (r != 0) { + if (r == -ENOENT) { + cerr << "journal does not exist on-disk. Did you set a bad rank?" + << std::endl; + std::cerr << "Error loading journal: " << cpp_strerror(r) << + ", pass --force to forcibly reset this journal" << std::endl; + return r; + } else { + cerr << "got error " << r << "from Journaler, failing" << std::endl; + return r; + } + } + + lock.lock(); + uint64_t old_start = journaler.get_read_pos(); + uint64_t old_end = journaler.get_write_pos(); + uint64_t old_len = old_end - old_start; + cout << "old journal was " << old_start << "~" << old_len << std::endl; + + uint64_t new_start = round_up_to(old_end+1, journaler.get_layout_period()); + cout << "new journal start will be " << new_start + << " (" << (new_start - old_end) << " bytes past old end)" << std::endl; + + journaler.set_read_pos(new_start); + journaler.set_write_pos(new_start); + journaler.set_expire_pos(new_start); + journaler.set_trimmed_pos(new_start); + journaler.set_writeable(); + + cout << "writing journal head" << std::endl; + journaler.write_head(new C_SafeCond(mylock, cond, &done, &r)); + lock.unlock(); + { + std::unique_lock locker{mylock}; + cond.wait(locker, [&done] { return done; }); + } + std::lock_guard l{lock}; + if (r != 0) { + return r; + } + + if (is_mdlog) { + r = _write_reset_event(&journaler); // reset envent is specific for mdlog journal + if (r != 0) { + return r; + } + } + cout << "done" << std::endl; + + return 0; +} + +int Resetter::reset_hard() +{ + auto fs = fsmap->get_filesystem(role.fscid); + + Journaler journaler("resetter", ino, + fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, + objecter, 0, 0, &finisher); + journaler.set_writeable(); + + file_layout_t default_log_layout = MDCache::gen_default_log_layout( + fsmap->get_filesystem(role.fscid)->mds_map); + journaler.create(&default_log_layout, g_conf()->mds_journal_format); + + C_SaferCond cond; + { + std::lock_guard l{lock}; + journaler.write_head(&cond); + } + + int r = cond.wait(); + if (r != 0) { + derr << "Error writing journal header: " << cpp_strerror(r) << dendl; + return r; + } + + if (is_mdlog) // reset event is specific for mdlog journal + { + std::lock_guard l{lock}; + r = _write_reset_event(&journaler); + if (r != 0) { + derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (is_mdlog) { + dout(4) << "Successfully wrote new journal pointer and header for rank " + << role << dendl; + } else { + dout(4) << "Successfully wrote header for rank " << role << dendl; + } + return 0; +} + +int Resetter::_write_reset_event(Journaler *journaler) +{ + ceph_assert(journaler != NULL); + + auto le = std::make_unique<EResetJournal>(); + + bufferlist bl; + le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + cout << "writing EResetJournal entry" << std::endl; + journaler->append_entry(bl); + + int ret; + { + C_SaferCond cond; + journaler->flush(&cond); + ret = cond.wait(); + if (ret < 0) + return ret; + } + { + // wait until all journal prezero ops are done + C_SaferCond cond; + journaler->wait_for_prezero(&cond); + cond.wait(); + } + + return ret; +} + diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h new file mode 100644 index 000000000..6998e4598 --- /dev/null +++ b/src/tools/cephfs/Resetter.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef JOURNAL_RESETTER_H_ +#define JOURNAL_RESETTER_H_ + + +#include "MDSUtility.h" + +class Journaler; + +/** + * This class lets you reset an mds journal for troubleshooting or whatever. + * + * To use, create a Resetter, call init(), and then call reset() with the name + * of the file to dump to. + */ +class Resetter : public MDSUtility { +private: + mds_role_t role; + inodeno_t ino; + bool is_mdlog; + +protected: + int _write_reset_event(Journaler *journaler); + +public: + Resetter() {} + ~Resetter() {} + + int init(mds_role_t role_, const std::string &type, bool hard); + /** + * For use when no journal header/pointer was present: write one + * out from scratch. + */ + int reset_hard(); + int reset(); +}; + +#endif /* JOURNAL_RESETTER_H_ */ diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc new file mode 100644 index 000000000..e2d53b86e --- /dev/null +++ b/src/tools/cephfs/RoleSelector.cc @@ -0,0 +1,59 @@ + +#include "RoleSelector.h" + +int MDSRoleSelector::parse_rank( + const FSMap &fsmap, + std::string const &str) +{ + if (str == "all" || str == "*") { + std::set<mds_rank_t> in; + const MDSMap &mds_map = fsmap.get_filesystem(fscid)->mds_map; + mds_map.get_mds_set(in); + + for (auto rank : in) { + roles.push_back(mds_role_t(fscid, rank)); + } + + return 0; + } else { + std::string rank_err; + mds_rank_t rank = strict_strtol(str.c_str(), 10, &rank_err); + if (!rank_err.empty()) { + return -EINVAL; + } + if (fsmap.get_filesystem(fscid)->mds_map.is_dne(rank)) { + return -ENOENT; + } + roles.push_back(mds_role_t(fscid, rank)); + return 0; + } +} + +int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str, + bool allow_unqualified_rank) +{ + auto colon_pos = str.find(":"); + if (colon_pos == std::string::npos) { + // An unqualified rank. Only valid if there is only one + // namespace. + if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) { + fscid = fsmap.get_filesystem()->fscid; + return parse_rank(fsmap, str); + } else { + return -EINVAL; + } + } else if (colon_pos == 0 || colon_pos == str.size() - 1) { + return -EINVAL; + } else { + const std::string ns_str = str.substr(0, colon_pos); + const std::string rank_str = str.substr(colon_pos + 1); + std::shared_ptr<const Filesystem> fs_ptr; + int r = fsmap.parse_filesystem(ns_str, &fs_ptr); + if (r != 0) { + return r; + } + fscid = fs_ptr->fscid; + return parse_rank(fsmap, rank_str); + } +} + diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h new file mode 100644 index 000000000..9090b7200 --- /dev/null +++ b/src/tools/cephfs/RoleSelector.h @@ -0,0 +1,36 @@ + +#ifndef ROLE_SELECTOR_H_ +#define ROLE_SELECTOR_H_ + +#include <string> +#include <vector> +#include "mds/mdstypes.h" +#include "mds/FSMap.h" + +/** + * When you want to let the user act on a single rank in a namespace, + * or all of them. + */ +class MDSRoleSelector +{ + public: + const std::vector<mds_role_t> &get_roles() const {return roles;} + int parse(const FSMap &fsmap, std::string const &str, + bool allow_unqualified_rank=true); + MDSRoleSelector() + : fscid(FS_CLUSTER_ID_NONE) + {} + fs_cluster_id_t get_ns() const + { + return fscid; + } + protected: + int parse_rank( + const FSMap &fsmap, + std::string const &str); + std::vector<mds_role_t> roles; + fs_cluster_id_t fscid; +}; + +#endif // ROLE_SELECTOR_H_ + diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc new file mode 100644 index 000000000..e779b4b66 --- /dev/null +++ b/src/tools/cephfs/TableTool.cc @@ -0,0 +1,417 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "common/ceph_argparse.h" +#include "common/errno.h" + +#include "mds/SessionMap.h" +#include "mds/InoTable.h" +#include "mds/SnapServer.h" + +#include "TableTool.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + +void TableTool::usage() +{ + std::cout << "Usage: \n" + << " cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>" + << " cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>" + << std::endl; + + generic_client_usage(); +} + + +/** + * For a function that takes an MDS role as an argument and + * returns an error code, execute it on the roles specified + * by `role_selector`. + */ +int TableTool::apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f) +{ + ceph_assert(f != NULL); + + int r = 0; + + f->open_object_section("ranks"); + + for (auto role : role_selector.get_roles()) { + std::ostringstream rank_str; + rank_str << role.rank; + f->open_object_section(rank_str.str().c_str()); + + f->open_object_section("data"); + int rank_r = fptr(role, f); + f->close_section(); + r = r ? r : rank_r; + + f->dump_int("result", rank_r); + f->close_section(); + + + } + + f->close_section(); + + return r; +} + + +/** + * This class wraps an MDS table class (SessionMap, SnapServer, InoTable) + * with offline load/store code such that we can do offline dumps and resets + * on those tables. + */ +template <typename A> +class TableHandler +{ +protected: + // The RADOS object ID for the table + std::string object_name; + + // The role in question (may be NONE) + mds_role_t role; + + // Whether this is an MDSTable subclass (i.e. has leading version field to decode) + bool mds_table; + +public: + TableHandler(mds_role_t r, std::string const &name, bool mds_table_) + : role(r), mds_table(mds_table_) + { + // Compose object name of the table we will dump + std::ostringstream oss; + oss << "mds"; + if (!role.is_none()) { + oss << role.rank; + } + oss << "_" << name; + object_name = oss.str(); + } + + int load_and_dump(librados::IoCtx *io, Formatter *f) + { + ceph_assert(io != NULL); + ceph_assert(f != NULL); + + // Attempt read + bufferlist table_bl; + int read_r = io->read(object_name, table_bl, 0, 0); + if (read_r >= 0) { + auto q = table_bl.cbegin(); + try { + if (mds_table) { + version_t version; + decode(version, q); + f->dump_int("version", version); + } + A table_inst; + table_inst.set_rank(role.rank); + table_inst.decode(q); + table_inst.dump(f); + + return 0; + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + } else { + derr << "error reading table object " << object_name + << ": " << cpp_strerror(read_r) << dendl; + return read_r; + } + } + + int reset(librados::IoCtx *io) + { + A table_inst; + // Compose new (blank) table + table_inst.set_rank(role.rank); + table_inst.reset_state(); + // Write the table out + return write(table_inst, io); + } + +protected: + + int write(const A &table_inst, librados::IoCtx *io) + { + bufferlist new_bl; + if (mds_table) { + version_t version = 1; + encode(version, new_bl); + } + table_inst.encode_state(new_bl); + + // Write out new table + int r = io->write_full(object_name, new_bl); + if (r != 0) { + derr << "error writing table object " << object_name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + return r; + } +}; + +template <typename A> +class TableHandlerOmap +{ +private: + // The RADOS object ID for the table + std::string object_name; + + // The role (rank may be NONE) + mds_role_t role; + + // Whether this is an MDSTable subclass (i.e. has leading version field to decode) + bool mds_table; + +public: + TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_) + : role(r), mds_table(mds_table_) + { + // Compose object name of the table we will dump + std::ostringstream oss; + oss << "mds"; + if (!role.is_none()) { + oss << role.rank; + } + oss << "_" << name; + object_name = oss.str(); + } + + int load_and_dump(librados::IoCtx *io, Formatter *f) + { + ceph_assert(io != NULL); + ceph_assert(f != NULL); + + // Read in the header + bufferlist header_bl; + int r = io->omap_get_header(object_name, &header_bl); + if (r != 0) { + derr << "error reading header on '" << object_name << "': " + << cpp_strerror(r) << dendl; + return r; + } + + // Decode the header + A table_inst; + table_inst.set_rank(role.rank); + try { + table_inst.decode_header(header_bl); + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + + // Read and decode OMAP values in chunks + std::string last_key = ""; + while(true) { + std::map<std::string, bufferlist> values; + int r = io->omap_get_vals(object_name, last_key, + g_conf()->mds_sessionmap_keys_per_op, &values); + + if (r != 0) { + derr << "error reading values: " << cpp_strerror(r) << dendl; + return r; + } + + if (values.empty()) { + break; + } + + try { + table_inst.decode_values(values); + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + last_key = values.rbegin()->first; + } + + table_inst.dump(f); + + return 0; + } + + int reset(librados::IoCtx *io) + { + A table_inst; + table_inst.set_rank(role.rank); + table_inst.reset_state(); + bufferlist header_bl; + table_inst.encode_header(&header_bl); + + // Compose a transaction to clear and write header + librados::ObjectWriteOperation op; + op.omap_clear(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.omap_set_header(header_bl); + + return io->operate(object_name, &op); + } +}; + +class InoTableHandler : public TableHandler<InoTable> +{ + public: + explicit InoTableHandler(mds_role_t r) + : TableHandler(r, "inotable", true) + {} + + int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f) + { + InoTable inst; + inst.set_rank(role.rank); + inst.reset_state(); + + int r = 0; + if (inst.force_consume_to(max)) { + r = write(inst, io); + } + + f->dump_int("version", inst.get_version()); + inst.dump(f); + + return r; + } +}; + + +int TableTool::main(std::vector<const char*> &argv) +{ + int r; + + dout(10) << __func__ << dendl; + + // RADOS init + // ========== + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; + return r; + } + + dout(4) << "connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; + return r; + } + + // Require at least 3 args <rank> <mode> <arg> [args...] + if (argv.size() < 3) { + cerr << "missing required 3 arguments" << std::endl; + return -EINVAL; + } + + const std::string role_str = std::string(argv[0]); + const std::string mode = std::string(argv[1]); + const std::string table = std::string(argv[2]); + + r = role_selector.parse(*fsmap, role_str); + if (r < 0) { + derr << "Bad rank selection: " << role_str << "'" << dendl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + ceph_assert(fs != nullptr); + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + dout(4) << "resolving pool " << pool_id << dendl; + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!" + << dendl; + return r; + } + + dout(4) << "creating IoCtx.." << dendl; + r = rados.ioctx_create(pool_name.c_str(), io); + if (r != 0) { + return r; + } + + JSONFormatter jf(true); + if (mode == "reset") { + const std::string table = std::string(argv[2]); + if (table == "session") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io); + }, &jf); + } else if (table == "inode") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandler<InoTable>(rank, "inotable", true).reset(&io); + }, &jf); + } else if (table == "snap") { + r = TableHandler<SnapServer>(mds_role_t(), "snaptable", true).reset(&io); + jf.open_object_section("reset_snap_status"); + jf.dump_int("result", r); + jf.close_section(); + } else { + cerr << "Invalid table '" << table << "'" << std::endl; + return -EINVAL; + } + } else if (mode == "show") { + const std::string table = std::string(argv[2]); + if (table == "session") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f); + }, &jf); + } else if (table == "inode") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);; + }, &jf); + } else if (table == "snap") { + jf.open_object_section("show_snap_table"); + { + r = TableHandler<SnapServer>( + mds_role_t(), "snaptable", true).load_and_dump(&io, &jf); + jf.dump_int("result", r); + } + jf.close_section(); + } else { + cerr << "Invalid table '" << table << "'" << std::endl; + return -EINVAL; + } + } else if (mode == "take_inos") { + const std::string ino_str = std::string(argv[2]); + std::string ino_err; + inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err); + if (!ino_err.empty()) { + derr << "Bad ino '" << ino_str << "'" << dendl; + return -EINVAL; + } + r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int { + return InoTableHandler(rank).take_inos(&io, ino, f); + }, &jf); + } else { + cerr << "Invalid mode '" << mode << "'" << std::endl; + return -EINVAL; + } + + // Subcommand should have written to formatter, flush it + jf.flush(std::cout); + std::cout << std::endl; + return r; +} + diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h new file mode 100644 index 000000000..bf9b95c12 --- /dev/null +++ b/src/tools/cephfs/TableTool.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "MDSUtility.h" +#include "RoleSelector.h" + +#include "include/rados/librados.hpp" + +/** + * Command line tool for debugging the backing store of + * MDSTable instances. + */ +class TableTool : public MDSUtility +{ + private: + MDSRoleSelector role_selector; + + // I/O handles + librados::Rados rados; + librados::IoCtx io; + + int apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f); + + public: + static void usage(); + int main(std::vector<const char*> &argv); + +}; + diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc new file mode 100644 index 000000000..e6efff66c --- /dev/null +++ b/src/tools/cephfs/cephfs-data-scan.cc @@ -0,0 +1,47 @@ + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "DataScan.h" + + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + DataScan::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + DataScan data_scan; + + // Connect to mon cluster, download MDS map etc + int rc = data_scan.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = data_scan.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + + return rc; +} + diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc new file mode 100644 index 000000000..290cb305b --- /dev/null +++ b/src/tools/cephfs/cephfs-journal-tool.cc @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "JournalTool.h" + + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + JournalTool::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + JournalTool jt; + + // Connect to mon cluster, download MDS map etc + int rc = jt.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = jt.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + return rc; +} + diff --git a/src/tools/cephfs/cephfs-meta-injection.cc b/src/tools/cephfs/cephfs-meta-injection.cc new file mode 100644 index 000000000..5768d3869 --- /dev/null +++ b/src/tools/cephfs/cephfs-meta-injection.cc @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <include/types.h> +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "MetaTool.h" +#include <iostream> +#include <string> +#include <vector> + +#include <boost/program_options.hpp> +namespace po = boost::program_options; +using std::string; +using namespace std; +static string version = "cephfs-meta-injection v1.1"; + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + env_to_vec(args); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + string rank_str, minfo, ino, out,in; + po::options_description general("general options"); + general.add_options() + ("help,h", "produce help message") + ("debug", "show debug info") + ("rank,r", po::value<string>(&rank_str), "the rank of cephfs, default(0) (e.g. -r cephfs_a:0)") + ("minfo", po::value<string>(&minfo), "specify metapool, datapools and rank (e.g. cephfs_metadata_a:cephfs_data_a:0)") + ("ino,i", po::value<string>(&ino), "specify inode. e.g. 1099511627776 or 0x10000000000, you can find it with cmd, 'ls -i'") + ("out,o", po::value<string>(&out), "output file") + ("in", po::value<string>(&in), "input file") + ("yes-i-really-really-mean-it", "need by amend info") + ; + + string mode; + po::options_description modeoptions("mode options"); + modeoptions.add_options() + ("mode", po::value<string>(&mode), + "\tlistc : list all obj of dir\n" \ + "\tshowm : show the info of ino\n" \ + "\tshowfn : show the fnode of dir\n" \ + "\tamend : amend part of the meta data\n" \ + "\tamendfn : amend fnode from file\n" + ); + + po::positional_options_description p; + p.add("mode", 1); + + po::options_description all("all options"); + all.add(modeoptions).add(general); + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).options(all).positional(p).allow_unregistered().run(), vm); + } catch(exception &e) { + cerr << "error : " << e.what() << std::endl; + return -1; + } catch(...) { + cout << "param error" << std::endl; + return 0; + } + + boost::program_options::notify(vm); + if (vm.count("help")) { + std::cout << version << std::endl; + std::cout << "usage : \n" + << " cephfs-meta-injection <listc|showm|showfn|amend|amendfn> -r <fsname:rank> -i <ino>" + << std::endl; + std::cout << "example : \n" + << " amend info of inode(1099531628828)\n" + << " cephfs-meta-injection showm -r cephfs_a:0 -i 1099531628828 -o out\n" + << " alter file\n" + << " cephfs-meta-injection amend -r cephfs_a:0 -i 1099531628828 --in out --yes-i-really-mean-it" + << std::endl; + std::cout << all << std::endl; + return 0; + } + + MetaTool mt(vm.count("debug")); + int rc = mt.init(); + if (rc != 0) { + std::cerr << "error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + rc = mt.main(mode, rank_str, minfo, ino, out, in, vm.count("yes-i-really-really-mean-it")); + if (rc != 0) { + std::cerr << "error (" << cpp_strerror(rc) << ")" << std::endl; + return -1; + } + return rc; +} diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc new file mode 100644 index 000000000..47b475dd0 --- /dev/null +++ b/src/tools/cephfs/cephfs-table-tool.cc @@ -0,0 +1,47 @@ + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "TableTool.h" + + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + TableTool::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + TableTool tt; + + // Connect to mon cluster, download MDS map etc + int rc = tt.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = tt.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + return rc; +} + + diff --git a/src/tools/cephfs/shell/CMakeLists.txt b/src/tools/cephfs/shell/CMakeLists.txt new file mode 100644 index 000000000..5a1f6ad80 --- /dev/null +++ b/src/tools/cephfs/shell/CMakeLists.txt @@ -0,0 +1,7 @@ +include(Distutils) +distutils_install_module(cephfs-shell) + +if(WITH_TESTS) + include(AddCephTest) + add_tox_test(cephfs-shell) +endif() diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell new file mode 100755 index 000000000..51bd569e0 --- /dev/null +++ b/src/tools/cephfs/shell/cephfs-shell @@ -0,0 +1,1684 @@ +#!/usr/bin/python3 +# coding = utf-8 + +import argparse +import os +import os.path +import sys +import cephfs as libcephfs +import shutil +import traceback +import colorama +import fnmatch +import math +import re +import shlex +import stat +import errno + +from cmd2 import Cmd +from cmd2 import __version__ as cmd2_version +from distutils.version import LooseVersion + +if sys.version_info.major < 3: + raise RuntimeError("cephfs-shell is only compatible with python3") + +try: + from cmd2 import with_argparser +except ImportError: + def with_argparser(argparser): + import functools + + def argparser_decorator(func): + @functools.wraps(func) + def wrapper(thiz, cmdline): + if isinstance(cmdline, list): + arglist = cmdline + else: + # do not split if it's already a list + arglist = shlex.split(cmdline, posix=False) + # in case user quotes the command args + arglist = [arg.strip('\'""') for arg in arglist] + try: + args = argparser.parse_args(arglist) + except SystemExit: + shell.exit_code = 1 + # argparse exits at seeing bad arguments + return + else: + return func(thiz, args) + argparser.prog = func.__name__[3:] + if argparser.description is None and func.__doc__: + argparser.description = func.__doc__ + + return wrapper + + return argparser_decorator + + +cephfs = None # holds CephFS Python bindings +shell = None # holds instance of class CephFSShell +exit_codes = {'Misc': 1, + 'KeyboardInterrupt': 2, + errno.EPERM: 3, + errno.EACCES: 4, + errno.ENOENT: 5, + errno.EIO: 6, + errno.ENOSPC: 7, + errno.EEXIST: 8, + errno.ENODATA: 9, + errno.EINVAL: 10, + errno.EOPNOTSUPP: 11, + errno.ERANGE: 12, + errno.EWOULDBLOCK: 13, + errno.ENOTEMPTY: 14, + errno.ENOTDIR: 15, + errno.EDQUOT: 16, + errno.EPIPE: 17, + errno.ESHUTDOWN: 18, + errno.ECONNABORTED: 19, + errno.ECONNREFUSED: 20, + errno.ECONNRESET: 21, + errno.EINTR: 22} + + +######################################################################### +# +# Following are methods are generically useful through class CephFSShell +# +####################################################################### + + +def poutput(s, end='\n'): + shell.poutput(s, end=end) + + +def perror(msg, **kwargs): + shell.perror(msg, **kwargs) + + +def set_exit_code_msg(errcode='Misc', msg=''): + """ + Set exit code and print error message + """ + if isinstance(msg, libcephfs.Error): + shell.exit_code = exit_codes[msg.get_error_code()] + else: + shell.exit_code = exit_codes[errcode] + if msg: + perror(msg) + + +def mode_notation(mode): + """ + """ + permission_bits = {'0': '---', + '1': '--x', + '2': '-w-', + '3': '-wx', + '4': 'r--', + '5': 'r-x', + '6': 'rw-', + '7': 'rwx'} + mode = str(oct(mode)) + notation = '-' + if mode[2] == '4': + notation = 'd' + elif mode[2:4] == '12': + notation = 'l' + for i in mode[-3:]: + notation += permission_bits[i] + return notation + + +def get_chunks(file_size): + chunk_start = 0 + chunk_size = 0x20000 # 131072 bytes, default max ssl buffer size + while chunk_start + chunk_size < file_size: + yield chunk_start, chunk_size + chunk_start += chunk_size + final_chunk_size = file_size - chunk_start + yield chunk_start, final_chunk_size + + +def to_bytes(param): + # don't convert as follows as it can lead unusable results like coverting + # [1, 2, 3, 4] to '[1, 2, 3, 4]' - + # str(param).encode('utf-8') + if isinstance(param, bytes): + return param + elif isinstance(param, str): + return bytes(param, encoding='utf-8') + elif isinstance(param, list): + return [i.encode('utf-8') if isinstance(i, str) else to_bytes(i) for + i in param] + elif isinstance(param, int) or isinstance(param, float): + return str(param).encode('utf-8') + elif param is None: + return None + + +def ls(path, opts=''): + # opts tries to be like /bin/ls opts + almost_all = 'A' in opts + try: + with cephfs.opendir(path) as d: + while True: + dent = cephfs.readdir(d) + if dent is None: + return + elif almost_all and dent.d_name in (b'.', b'..'): + continue + yield dent + except libcephfs.ObjectNotFound as e: + set_exit_code_msg(msg=e) + + +def glob(path, pattern): + paths = [] + parent_dir = os.path.dirname(path) + if parent_dir == b'': + parent_dir = b'/' + if path == b'/' or is_dir_exists(os.path.basename(path), parent_dir): + for i in ls(path, opts='A'): + if fnmatch.fnmatch(i.d_name, pattern): + paths.append(os.path.join(path, i.d_name)) + return paths + + +def locate_file(name, case_sensitive=True): + dir_list = sorted(set(dirwalk(cephfs.getcwd()))) + if not case_sensitive: + return [dname for dname in dir_list if name.lower() in dname.lower()] + else: + return [dname for dname in dir_list if name in dname] + + +def get_all_possible_paths(pattern): + complete_pattern = pattern[:] + paths = [] + is_rel_path = not os.path.isabs(pattern) + if is_rel_path: + dir_ = cephfs.getcwd() + else: + dir_ = b'/' + pattern = pattern[1:] + patterns = pattern.split(b'/') + paths.extend(glob(dir_, patterns[0])) + patterns.pop(0) + for pattern in patterns: + for path in paths: + paths.extend(glob(path, pattern)) + if is_rel_path: + complete_pattern = os.path.join(cephfs.getcwd(), complete_pattern) + return [path for path in paths if fnmatch.fnmatch(path, complete_pattern)] + + +suffixes = ['B', 'K', 'M', 'G', 'T', 'P'] + + +def humansize(nbytes): + i = 0 + while nbytes >= 1024 and i < len(suffixes) - 1: + nbytes /= 1024. + i += 1 + nbytes = math.ceil(nbytes) + f = ('%d' % nbytes).rstrip('.') + return '%s%s' % (f, suffixes[i]) + + +def style_listing(path, is_dir, is_symlink, ls_long=False): + if not (is_dir or is_symlink): + return path + pretty = colorama.Style.BRIGHT + if is_symlink: + pretty += colorama.Fore.CYAN + path + if ls_long: + # Add target path + pretty += ' -> ' + cephfs.readlink(path, size=255).decode('utf-8') + elif is_dir: + pretty += colorama.Fore.BLUE + path + '/' + pretty += colorama.Style.RESET_ALL + return pretty + + +def print_long(path, is_dir, is_symlink, human_readable): + info = cephfs.stat(path, follow_symlink=(not is_symlink)) + pretty = style_listing(os.path.basename(path.decode('utf-8')), is_dir, is_symlink, True) + if human_readable: + sizefmt = '\t {:10s}'.format(humansize(info.st_size)) + else: + sizefmt = '{:12d}'.format(info.st_size) + poutput(f'{mode_notation(info.st_mode)} {sizefmt} {info.st_uid} {info.st_gid} {info.st_mtime}' + f' {pretty}') + + +def word_len(word): + """ + Returns the word length, minus any color codes. + """ + if word[0] == '\x1b': + return len(word) - 9 + return len(word) + + +def is_dir_exists(path, dir_=b''): + path_to_stat = os.path.join(dir_, path) + try: + return ((cephfs.stat(path_to_stat).st_mode & 0o0040000) != 0) + except libcephfs.Error: + return False + + +def is_file_exists(path, dir_=b''): + try: + # if its not a directory, then its a file + return ((cephfs.stat(os.path.join(dir_, path)).st_mode & 0o0040000) == 0) + except libcephfs.Error: + return False + + +def print_list(words, termwidth=79): + if not words: + return + words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in words] + width = max([word_len(word) for word in words]) + 2 + nwords = len(words) + ncols = max(1, (termwidth + 1) // (width + 1)) + nrows = (nwords + ncols - 1) // ncols + for row in range(nrows): + for i in range(row, nwords, nrows): + word = words[i] + print_width = width + if word[0] == '\x1b': + print_width = print_width + 10 + + poutput('%-*s' % (print_width, words[i]), + end='\n' if i + nrows >= nwords else '') + + +def copy_from_local(local_path, remote_path): + stdin = -1 + file_ = None + fd = None + convert_to_bytes = False + if local_path == b'-': + file_ = sys.stdin + convert_to_bytes = True + else: + try: + file_ = open(local_path, 'rb') + except PermissionError as e: + set_exit_code_msg(e.errno, 'error: no permission to read local file {}'.format( + local_path.decode('utf-8'))) + return + stdin = 1 + try: + fd = cephfs.open(remote_path, 'w', 0o666) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + return + progress = 0 + while True: + data = file_.read(65536) + if not data or len(data) == 0: + break + if convert_to_bytes: + data = to_bytes(data) + wrote = cephfs.write(fd, data, progress) + if wrote < 0: + break + progress += wrote + cephfs.close(fd) + if stdin > 0: + file_.close() + poutput('') + + +def copy_to_local(remote_path, local_path): + fd = None + if local_path != b'-': + local_dir = os.path.dirname(local_path) + dir_list = remote_path.rsplit(b'/', 1) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + if len(dir_list) > 2 and dir_list[1] == b'': + return + fd = open(local_path, 'wb+') + file_ = cephfs.open(remote_path, 'r') + file_size = cephfs.stat(remote_path).st_size + if file_size <= 0: + return + progress = 0 + for chunk_start, chunk_size in get_chunks(file_size): + file_chunk = cephfs.read(file_, chunk_start, chunk_size) + progress += len(file_chunk) + if fd: + fd.write(file_chunk) + else: + poutput(file_chunk.decode('utf-8')) + cephfs.close(file_) + if fd: + fd.close() + + +def dirwalk(path): + """ + walk a directory tree, using a generator + """ + path = os.path.normpath(path) + for item in ls(path, opts='A'): + fullpath = os.path.join(path, item.d_name) + src_path = fullpath.rsplit(b'/', 1)[0] + + yield os.path.normpath(fullpath) + if is_dir_exists(item.d_name, src_path): + for x in dirwalk(fullpath): + yield x + + +################################################################## +# +# Following methods are implementation for CephFS Shell commands +# +################################################################# + +class CephFSShell(Cmd): + + def __init__(self): + super().__init__(use_ipython=False) + self.working_dir = cephfs.getcwd().decode('utf-8') + self.set_prompt() + self.interactive = False + self.umask = '2' + + def default(self, line): + perror('Unrecognized command') + + def set_prompt(self): + self.prompt = ('\033[01;33mCephFS:~' + colorama.Fore.LIGHTCYAN_EX + + self.working_dir + colorama.Style.RESET_ALL + + '\033[01;33m>>>\033[00m ') + + def create_argparser(self, command): + try: + argparse_args = getattr(self, 'argparse_' + command) + except AttributeError: + set_exit_code_msg() + return None + doc_lines = getattr( + self, 'do_' + command).__doc__.expandtabs().splitlines() + if '' in doc_lines: + blank_idx = doc_lines.index('') + usage = doc_lines[:blank_idx] + description = doc_lines[blank_idx + 1:] + else: + usage = doc_lines + description = [] + parser = argparse.ArgumentParser( + prog=command, + usage='\n'.join(usage), + description='\n'.join(description), + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + for args, kwargs in argparse_args: + parser.add_argument(*args, **kwargs) + return parser + + def complete_filenames(self, text, line, begidx, endidx): + if not text: + completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir()) + for x in ls(b".", opts='A')] + else: + if text.count('/') > 0: + completions = [text.rsplit('/', 1)[0] + '/' + + x.d_name.decode('utf-8') + '/' + * int(x.is_dir()) for x in ls('/' + + text.rsplit('/', 1)[0], opts='A') + if x.d_name.decode('utf-8').startswith( + text.rsplit('/', 1)[1])] + else: + completions = [x.d_name.decode('utf-8') + '/' + * int(x.is_dir()) for x in ls(b".", opts='A') + if x.d_name.decode('utf-8').startswith(text)] + if len(completions) == 1 and completions[0][-1] == '/': + dir_, file_ = completions[0].rsplit('/', 1) + completions.extend([dir_ + '/' + x.d_name.decode('utf-8') + + '/' * int(x.is_dir()) for x in + ls('/' + dir_, opts='A') + if x.d_name.decode('utf-8').startswith(file_)]) + return self.delimiter_complete(text, line, begidx, endidx, completions, '/') + return completions + + def onecmd(self, line, **kwargs): + """ + Global error catcher + """ + try: + res = Cmd.onecmd(self, line, **kwargs) + if self.interactive: + self.set_prompt() + return res + except ConnectionError as e: + set_exit_code_msg(e.errno, f'***\n{e}') + except KeyboardInterrupt: + set_exit_code_msg('KeyboardInterrupt', 'Command aborted') + except (libcephfs.Error, Exception) as e: + if shell.debug: + traceback.print_exc(file=sys.stdout) + set_exit_code_msg(msg=e) + + class path_to_bytes(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + values = to_bytes(values) + setattr(namespace, self.dest, values) + + # TODO: move the necessary contents from here to `class path_to_bytes`. + class get_list_of_bytes_path(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + values = to_bytes(values) + + if values == b'.': + values = cephfs.getcwd() + else: + for i in values: + if i == b'.': + values[values.index(i)] = cephfs.getcwd() + + setattr(namespace, self.dest, values) + + def complete_mkdir(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + class ModeAction(argparse.Action): + def __init__(self, option_strings, dest, nargs=None, **kwargs): + if nargs is not None and nargs != '?': + raise ValueError("more than one modes not allowed") + super().__init__(option_strings, dest, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + o_mode = 0 + res = None + try: + o_mode = int(values, base=8) + except ValueError: + res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', values) + if res is None: + parser.error("invalid mode: %s\n" + "mode must be a numeric octal literal\n" + "or ((u?g?o?)|(a?))(=)(r?w?x?)" % + values) + else: + # we are supporting only assignment of mode and not + or - + # as is generally available with the chmod command + # eg. + # >>> res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=') + # >>> res.groups() + # ('go', 'go', None, '=', '') + val = res.groups() + + if val[3] != '=': + parser.error("need assignment operator between user " + "and mode specifiers") + if val[4] == '': + parser.error("invalid mode: %s\n" + "mode must be combination of: r | w | x" % + values) + users = '' + if val[2] is None: + users = val[1] + else: + users = val[2] + + t_mode = 0 + if users == 'a': + users = 'ugo' + + if 'r' in val[4]: + t_mode |= 4 + if 'w' in val[4]: + t_mode |= 2 + if 'x' in val[4]: + t_mode |= 1 + + if 'u' in users: + o_mode |= (t_mode << 6) + if 'g' in users: + o_mode |= (t_mode << 3) + if 'o' in users: + o_mode |= t_mode + + if o_mode < 0: + parser.error("invalid mode: %s\n" + "mode cannot be negative" % values) + if o_mode > 0o777: + parser.error("invalid mode: %s\n" + "mode cannot be greater than octal 0777" % values) + + setattr(namespace, self.dest, str(oct(o_mode))) + + mkdir_parser = argparse.ArgumentParser( + description='Create the directory(ies), if they do not already exist.') + mkdir_parser.add_argument('dirs', type=str, + action=path_to_bytes, + metavar='DIR_NAME', + help='Name of new_directory.', + nargs='+') + mkdir_parser.add_argument('-m', '--mode', type=str, + action=ModeAction, + help='Sets the access mode for the new directory.') + mkdir_parser.add_argument('-p', '--parent', action='store_true', + help='Create parent directories as necessary. ' + 'When this option is specified, no error is' + 'reported if a directory already exists.') + + @with_argparser(mkdir_parser) + def do_mkdir(self, args): + """ + Create directory. + """ + for path in args.dirs: + if args.mode: + permission = int(args.mode, 8) + else: + permission = 0o777 + if args.parent: + cephfs.mkdirs(path, permission) + else: + try: + cephfs.mkdir(path, permission) + except libcephfs.Error as e: + set_exit_code_msg(e) + + def complete_put(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + put_parser = argparse.ArgumentParser( + description='Copy a file/directory to Ceph File System from Local File System.') + put_parser.add_argument('local_path', type=str, action=path_to_bytes, + help='Path of the file in the local system') + put_parser.add_argument('remote_path', type=str, action=path_to_bytes, + help='Path of the file in the remote system') + put_parser.add_argument('-f', '--force', action='store_true', + help='Overwrites the destination if it already exists.') + + @with_argparser(put_parser) + def do_put(self, args): + """ + Copy a local file/directory to CephFS. + """ + if args.local_path != b'-' and not os.path.isfile(args.local_path) \ + and not os.path.isdir(args.local_path): + set_exit_code_msg(errno.ENOENT, + msg=f"error: " + f"{args.local_path.decode('utf-8')}: " + f"No such file or directory") + return + + if (is_file_exists(args.remote_path) or is_dir_exists( + args.remote_path)) and not args.force: + set_exit_code_msg(msg=f"error: file/directory " + f"{args.remote_path.decode('utf-8')} " + f"exists, use --force to overwrite") + return + + root_src_dir = args.local_path + root_dst_dir = args.remote_path + if args.local_path == b'.' or args.local_path == b'./': + root_src_dir = os.getcwdb() + elif len(args.local_path.rsplit(b'/', 1)) < 2: + root_src_dir = os.path.join(os.getcwdb(), args.local_path) + else: + p = args.local_path.split(b'/') + if p[0] == b'.': + root_src_dir = os.getcwdb() + p.pop(0) + while len(p) > 0: + root_src_dir += b'/' + p.pop(0) + + if root_dst_dir == b'.': + if args.local_path != b'-': + root_dst_dir = root_src_dir.rsplit(b'/', 1)[1] + if root_dst_dir == b'': + root_dst_dir = root_src_dir.rsplit(b'/', 1)[0] + a = root_dst_dir.rsplit(b'/', 1) + if len(a) > 1: + root_dst_dir = a[1] + else: + root_dst_dir = a[0] + else: + set_exit_code_msg(errno.EINVAL, 'error: no filename specified ' + 'for destination') + return + + if root_dst_dir[-1] != b'/': + root_dst_dir += b'/' + + if args.local_path == b'-' or os.path.isfile(root_src_dir): + if args.local_path == b'-': + root_src_dir = b'-' + copy_from_local(root_src_dir, root_dst_dir) + else: + for src_dir, dirs, files in os.walk(root_src_dir): + if isinstance(src_dir, str): + src_dir = to_bytes(src_dir) + dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1) + dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd() + + dst_dir) + if args.force and dst_dir != b'/' and not is_dir_exists( + dst_dir[:-1]) and not locate_file(dst_dir): + try: + cephfs.mkdirs(dst_dir, 0o777) + except libcephfs.Error: + pass + if (not args.force) and dst_dir != b'/' and not is_dir_exists( + dst_dir) and not os.path.isfile(root_src_dir): + try: + cephfs.mkdirs(dst_dir, 0o777) + except libcephfs.Error: + # TODO: perhaps, set retval to 1? + pass + + for dir_ in dirs: + dir_name = os.path.join(dst_dir, dir_) + if not is_dir_exists(dir_name): + try: + cephfs.mkdirs(dir_name, 0o777) + except libcephfs.Error: + # TODO: perhaps, set retval to 1? + pass + + for file_ in files: + src_file = os.path.join(src_dir, file_) + dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_) + if (not args.force) and is_file_exists(dst_file): + return + copy_from_local(src_file, os.path.join(cephfs.getcwd(), + dst_file)) + + def complete_get(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + get_parser = argparse.ArgumentParser( + description='Copy a file from Ceph File System to Local Directory.') + get_parser.add_argument('remote_path', type=str, action=path_to_bytes, + help='Path of the file in the remote system') + get_parser.add_argument('local_path', type=str, action=path_to_bytes, + help='Path of the file in the local system') + get_parser.add_argument('-f', '--force', action='store_true', + help='Overwrites the destination if it already exists.') + + @with_argparser(get_parser) + def do_get(self, args): + """ + Copy a file/directory from CephFS to given path. + """ + if not is_file_exists(args.remote_path) and not \ + is_dir_exists(args.remote_path): + set_exit_code_msg(errno.ENOENT, "error: no file/directory" + " found at specified remote " + "path") + return + if (os.path.isfile(args.local_path) or os.path.isdir( + args.local_path)) and not args.force: + set_exit_code_msg(msg=f"error: file/directory " + f"{args.local_path.decode('utf-8')}" + f" already exists, use --force to " + f"overwrite") + return + root_src_dir = args.remote_path + root_dst_dir = args.local_path + fname = root_src_dir.rsplit(b'/', 1) + if args.local_path == b'.': + root_dst_dir = os.getcwdb() + if args.remote_path == b'.': + root_src_dir = cephfs.getcwd() + if args.local_path == b'-': + if args.remote_path == b'.' or args.remote_path == b'./': + set_exit_code_msg(errno.EINVAL, 'error: no remote file name specified') + return + copy_to_local(root_src_dir, b'-') + elif is_file_exists(args.remote_path): + copy_to_local(root_src_dir, root_dst_dir) + elif b'/' in root_src_dir and is_file_exists(fname[1], fname[0]): + copy_to_local(root_src_dir, root_dst_dir) + else: + files = list(reversed(sorted(dirwalk(root_src_dir)))) + for file_ in files: + dst_dirpath, dst_file = file_.rsplit(b'/', 1) + if dst_dirpath in files: + files.remove(dst_dirpath) + dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file) + dst_path = os.path.normpath(dst_path) + if is_dir_exists(file_): + try: + os.makedirs(dst_path) + except OSError: + pass + else: + copy_to_local(file_, dst_path) + + return 0 + + def complete_ls(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + ls_parser = argparse.ArgumentParser( + description='Copy a file from Ceph File System from Local Directory.') + ls_parser.add_argument('-l', '--long', action='store_true', + help='Detailed list of items in the directory.') + ls_parser.add_argument('-r', '--reverse', action='store_true', + help='Reverse order of listing items in the directory.') + ls_parser.add_argument('-H', action='store_true', help='Human Readable') + ls_parser.add_argument('-a', '--all', action='store_true', + help='Do not Ignore entries starting with .') + ls_parser.add_argument('-S', action='store_true', help='Sort by file_size') + ls_parser.add_argument('paths', help='Name of Directories', + action=path_to_bytes, nargs='*', default=['.']) + + @with_argparser(ls_parser) + def do_ls(self, args): + """ + List all the files and directories in the current working directory + """ + paths = args.paths + for path in paths: + values = [] + items = [] + try: + if path.count(b'*') > 0: + all_items = get_all_possible_paths(path) + if len(all_items) == 0: + continue + path = all_items[0].rsplit(b'/', 1)[0] + if path == b'': + path = b'/' + dirs = [] + for i in all_items: + for item in ls(path): + d_name = item.d_name + if os.path.basename(i) == d_name: + if item.is_dir(): + dirs.append(os.path.join(path, d_name)) + else: + items.append(item) + if dirs: + paths.extend(dirs) + else: + poutput(path.decode('utf-8'), end=':\n') + items = sorted(items, key=lambda item: item.d_name) + else: + if path != b'' and path != cephfs.getcwd() and len(paths) > 1: + poutput(path.decode('utf-8'), end=':\n') + items = sorted(ls(path), key=lambda item: item.d_name) + if not args.all: + items = [i for i in items if not i.d_name.startswith(b'.')] + if args.S: + items = sorted(items, key=lambda item: cephfs.stat( + path + b'/' + item.d_name, follow_symlink=( + not item.is_symbol_file())).st_size) + if args.reverse: + items = reversed(items) + for item in items: + filepath = item.d_name + is_dir = item.is_dir() + is_sym_lnk = item.is_symbol_file() + try: + if args.long and args.H: + print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir, + is_sym_lnk, True) + elif args.long: + print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir, + is_sym_lnk, False) + elif is_sym_lnk or is_dir: + values.append(style_listing(filepath.decode('utf-8'), is_dir, + is_sym_lnk)) + else: + values.append(filepath) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + if not args.long: + print_list(values, shutil.get_terminal_size().columns) + if path != paths[-1]: + poutput('') + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + def complete_rmdir(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + rmdir_parser = argparse.ArgumentParser(description='Remove Directory.') + rmdir_parser.add_argument('paths', help='Directory Path.', nargs='+', + action=path_to_bytes) + rmdir_parser.add_argument('-p', '--parent', action='store_true', + help='Remove parent directories as necessary. ' + 'When this option is specified, no error ' + 'is reported if a directory has any ' + 'sub-directories, files') + + @with_argparser(rmdir_parser) + def do_rmdir(self, args): + self.do_rmdir_helper(args) + + def do_rmdir_helper(self, args): + """ + Remove a specific Directory + """ + is_pattern = False + paths = args.paths + for path in paths: + if path.count(b'*') > 0: + is_pattern = True + all_items = get_all_possible_paths(path) + if len(all_items) > 0: + path = all_items[0].rsplit(b'/', 1)[0] + if path == b'': + path = b'/' + dirs = [] + for i in all_items: + for item in ls(path): + d_name = item.d_name + if os.path.basename(i) == d_name: + if item.is_dir(): + dirs.append(os.path.join(path, d_name)) + paths.extend(dirs) + continue + else: + is_pattern = False + + if args.parent: + path = os.path.join(cephfs.getcwd(), path.rsplit(b'/')[0]) + files = list(sorted(set(dirwalk(path)), reverse=True)) + if not files: + path = b'.' + for filepath in files: + try: + cephfs.rmdir(os.path.normpath(filepath)) + except libcephfs.Error as e: + perror(e) + path = b'.' + break + else: + path = os.path.normpath(os.path.join(cephfs.getcwd(), path)) + if not is_pattern and path != os.path.normpath(b''): + try: + cephfs.rmdir(path) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + def complete_rm(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + rm_parser = argparse.ArgumentParser(description='Remove File.') + rm_parser.add_argument('paths', help='File Path.', nargs='+', + action=path_to_bytes) + + @with_argparser(rm_parser) + def do_rm(self, args): + """ + Remove a specific file + """ + file_paths = args.paths + for path in file_paths: + if path.count(b'*') > 0: + file_paths.extend([i for i in get_all_possible_paths( + path) if is_file_exists(i)]) + else: + try: + cephfs.unlink(path) + except libcephfs.Error as e: + # NOTE: perhaps we need a better msg here + set_exit_code_msg(msg=e) + + def complete_mv(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + mv_parser = argparse.ArgumentParser(description='Move File.') + mv_parser.add_argument('src_path', type=str, action=path_to_bytes, + help='Source File Path.') + mv_parser.add_argument('dest_path', type=str, action=path_to_bytes, + help='Destination File Path.') + + @with_argparser(mv_parser) + def do_mv(self, args): + """ + Rename a file or Move a file from source path to the destination + """ + cephfs.rename(args.src_path, args.dest_path) + + def complete_cd(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + cd_parser = argparse.ArgumentParser(description='Change working directory') + cd_parser.add_argument('path', type=str, help='Name of the directory.', + action=path_to_bytes, nargs='?', default='/') + + @with_argparser(cd_parser) + def do_cd(self, args): + """ + Change working directory + """ + cephfs.chdir(args.path) + self.working_dir = cephfs.getcwd().decode('utf-8') + self.set_prompt() + + def do_cwd(self, arglist): + """ + Get current working directory. + """ + poutput(cephfs.getcwd().decode('utf-8')) + + def complete_chmod(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + chmod_parser = argparse.ArgumentParser(description='Create Directory.') + chmod_parser.add_argument('mode', type=str, action=ModeAction, help='Mode') + chmod_parser.add_argument('paths', type=str, action=path_to_bytes, + help='Name of the file', nargs='+') + + @with_argparser(chmod_parser) + def do_chmod(self, args): + """ + Change permission of a file + """ + for path in args.paths: + mode = int(args.mode, base=8) + try: + cephfs.chmod(path, mode) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + def complete_cat(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + cat_parser = argparse.ArgumentParser(description='') + cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes, + nargs='+') + + @with_argparser(cat_parser) + def do_cat(self, args): + """ + Print contents of a file + """ + for path in args.paths: + if is_file_exists(path): + copy_to_local(path, b'-') + else: + set_exit_code_msg(errno.ENOENT, '{}: no such file'.format( + path.decode('utf-8'))) + + umask_parser = argparse.ArgumentParser(description='Set umask value.') + umask_parser.add_argument('mode', help='Mode', type=str, action=ModeAction, + nargs='?', default='') + + @with_argparser(umask_parser) + def do_umask(self, args): + """ + Set Umask value. + """ + if args.mode == '': + poutput(self.umask.zfill(4)) + else: + mode = int(args.mode, 8) + self.umask = str(oct(cephfs.umask(mode))[2:]) + + def complete_write(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + write_parser = argparse.ArgumentParser(description='Writes data into a file') + write_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of File') + + @with_argparser(write_parser) + def do_write(self, args): + """ + Write data into a file. + """ + + copy_from_local(b'-', args.path) + + def complete_lcd(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + lcd_parser = argparse.ArgumentParser(description='') + lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path') + + @with_argparser(lcd_parser) + def do_lcd(self, args): + """ + Moves into the given local directory + """ + try: + os.chdir(os.path.expanduser(args.path)) + except OSError as e: + set_exit_code_msg(e.errno, "Cannot change to " + f"{e.filename.decode('utf-8')}: {e.strerror}") + + def complete_lls(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + lls_parser = argparse.ArgumentParser( + description='List files in local system.') + lls_parser.add_argument('paths', help='Paths', action=path_to_bytes, + nargs='*') + + @with_argparser(lls_parser) + def do_lls(self, args): + """ + Lists all files and folders in the current local directory + """ + if not args.paths: + print_list(os.listdir(os.getcwdb())) + else: + for path in args.paths: + try: + items = os.listdir(path) + poutput("{}:".format(path.decode('utf-8'))) + print_list(items) + except OSError as e: + set_exit_code_msg(e.errno, f"{e.filename.decode('utf-8')}: " + f"{e.strerror}") + # Arguments to the with_argpaser decorator function are sticky. + # The items in args.path do not get overwritten in subsequent calls. + # The arguments remain in args.paths after the function exits and we + # neeed to clean it up to ensure the next call works as expected. + args.paths.clear() + + def do_lpwd(self, arglist): + """ + Prints the absolute path of the current local directory + """ + poutput(os.getcwd()) + + def complete_df(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + df_parser = argparse.ArgumentParser(description='Show information about\ + the amount of available disk space') + df_parser.add_argument('file', help='Name of the file', nargs='*', + default=['.'], action=path_to_bytes) + + @with_argparser(df_parser) + def do_df(self, arglist): + """ + Display the amount of available disk space for file systems + """ + header = True # Set to true for printing header only once + if b'.' == arglist.file[0]: + arglist.file = ls(b'.') + + for file in arglist.file: + if isinstance(file, libcephfs.DirEntry): + file = file.d_name + if file == b'.' or file == b'..': + continue + try: + statfs = cephfs.statfs(file) + stat = cephfs.stat(file) + block_size = (statfs['f_blocks'] * statfs['f_bsize']) // 1024 + available = block_size - stat.st_size + use = 0 + + if block_size > 0: + use = (stat.st_size * 100) // block_size + + if header: + header = False + poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format( + "1K-blocks", "Used", "Available", "Use%", + "Stored on")) + + poutput('{:d}\t{:18d}\t{:8d}\t{:10s} {}'.format(block_size, + stat.st_size, available, str(int(use)) + '%', + file.decode('utf-8'))) + except libcephfs.OSError as e: + set_exit_code_msg(e.get_error_code(), "could not statfs {}: {}".format( + file.decode('utf-8'), e.strerror)) + + locate_parser = argparse.ArgumentParser( + description='Find file within file system') + locate_parser.add_argument('name', help='name', type=str, + action=path_to_bytes) + locate_parser.add_argument('-c', '--count', action='store_true', + help='Count list of items located.') + locate_parser.add_argument( + '-i', '--ignorecase', action='store_true', help='Ignore case') + + @with_argparser(locate_parser) + def do_locate(self, args): + """ + Find a file within the File System + """ + if args.name.count(b'*') == 1: + if args.name[0] == b'*': + args.name += b'/' + elif args.name[-1] == '*': + args.name = b'/' + args.name + args.name = args.name.replace(b'*', b'') + if args.ignorecase: + locations = locate_file(args.name, False) + else: + locations = locate_file(args.name) + if args.count: + poutput(len(locations)) + else: + poutput((b'\n'.join(locations)).decode('utf-8')) + + def complete_du(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + du_parser = argparse.ArgumentParser( + description='Disk Usage of a Directory') + du_parser.add_argument('paths', type=str, action=get_list_of_bytes_path, + help='Name of the directory.', nargs='*', + default=[b'.']) + du_parser.add_argument('-r', action='store_true', + help='Recursive Disk usage of all directories.') + + @with_argparser(du_parser) + def do_du(self, args): + """ + Print disk usage of a given path(s). + """ + def print_disk_usage(files): + if isinstance(files, bytes): + files = (files, ) + + for f in files: + try: + st = cephfs.lstat(f) + + if stat.S_ISDIR(st.st_mode): + dusage = int(cephfs.getxattr(f, + 'ceph.dir.rbytes').decode('utf-8')) + else: + dusage = st.st_size + + # print path in local context + f = os.path.normpath(f) + if f[0] is ord('/'): + f = b'.' + f + poutput('{:10s} {}'.format(humansize(dusage), + f.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + continue + + for path in args.paths: + if args.r: + print_disk_usage(sorted(set(dirwalk(path)).union({path}))) + else: + print_disk_usage(path) + + quota_parser = argparse.ArgumentParser( + description='Quota management for a Directory') + quota_parser.add_argument('op', choices=['get', 'set'], + help='Quota operation type.') + quota_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of the directory.') + quota_parser.add_argument('--max_bytes', type=int, default=-1, nargs='?', + help='Max cumulative size of the data under ' + 'this directory.') + quota_parser.add_argument('--max_files', type=int, default=-1, nargs='?', + help='Total number of files under this ' + 'directory tree.') + + @with_argparser(quota_parser) + def do_quota(self, args): + """ + Quota management. + """ + if not is_dir_exists(args.path): + set_exit_code_msg(errno.ENOENT, 'error: no such directory {}'.format( + args.path.decode('utf-8'))) + return + + if args.op == 'set': + if (args.max_bytes == -1) and (args.max_files == -1): + set_exit_code_msg(errno.EINVAL, 'please specify either ' + '--max_bytes or --max_files or both') + return + + if args.max_bytes >= 0: + max_bytes = to_bytes(str(args.max_bytes)) + try: + cephfs.setxattr(args.path, 'ceph.quota.max_bytes', + max_bytes, os.XATTR_CREATE) + poutput('max_bytes set to %d' % args.max_bytes) + except libcephfs.Error as e: + cephfs.setxattr(args.path, 'ceph.quota.max_bytes', + max_bytes, os.XATTR_REPLACE) + set_exit_code_msg(e.get_error_code(), 'max_bytes reset to ' + f'{args.max_bytes}') + + if args.max_files >= 0: + max_files = to_bytes(str(args.max_files)) + try: + cephfs.setxattr(args.path, 'ceph.quota.max_files', + max_files, os.XATTR_CREATE) + poutput('max_files set to %d' % args.max_files) + except libcephfs.Error as e: + cephfs.setxattr(args.path, 'ceph.quota.max_files', + max_files, os.XATTR_REPLACE) + set_exit_code_msg(e.get_error_code(), 'max_files reset to ' + f'{args.max_files}') + elif args.op == 'get': + max_bytes = '0' + max_files = '0' + try: + max_bytes = cephfs.getxattr(args.path, 'ceph.quota.max_bytes') + poutput('max_bytes: {}'.format(max_bytes.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), 'max_bytes is not set') + + try: + max_files = cephfs.getxattr(args.path, 'ceph.quota.max_files') + poutput('max_files: {}'.format(max_files.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), 'max_files is not set') + + snap_parser = argparse.ArgumentParser(description='Snapshot Management') + snap_parser.add_argument('op', type=str, + help='Snapshot operation: create or delete') + snap_parser.add_argument('name', type=str, action=path_to_bytes, + help='Name of snapshot') + snap_parser.add_argument('dir', type=str, action=path_to_bytes, + help='Directory for which snapshot ' + 'needs to be created or deleted') + + @with_argparser(snap_parser) + def do_snap(self, args): + """ + Snapshot management for the volume + """ + # setting self.colors to None turns off colorizing and + # perror emits plain text + self.colors = None + + snapdir = '.snap' + conf_snapdir = cephfs.conf_get('client_snapdir') + if conf_snapdir is not None: + snapdir = conf_snapdir + snapdir = to_bytes(snapdir) + if args.op == 'create': + try: + if is_dir_exists(args.dir): + cephfs.mkdir(os.path.join(args.dir, snapdir, args.name), 0o755) + else: + set_exit_code_msg(errno.ENOENT, "'{}': no such directory".format( + args.dir.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), + "snapshot '{}' already exists".format( + args.name.decode('utf-8'))) + elif args.op == 'delete': + snap_dir = os.path.join(args.dir, snapdir, args.name) + try: + if is_dir_exists(snap_dir): + newargs = argparse.Namespace(paths=[snap_dir], parent=False) + self.do_rmdir_helper(newargs) + else: + set_exit_code_msg(errno.ENOENT, "'{}': no such snapshot".format( + args.name.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), "error while deleting " + "'{}'".format(snap_dir.decode('utf-8'))) + else: + set_exit_code_msg(errno.EINVAL, "snapshot can only be created or " + "deleted; check - help snap") + + def do_help(self, line): + """ + Get details about a command. + Usage: help <cmd> - for a specific command + help all - for all the commands + """ + if line == 'all': + for k in dir(self): + if k.startswith('do_'): + poutput('-' * 80) + super().do_help(k[3:]) + return + parser = self.create_argparser(line) + if parser: + parser.print_help() + else: + super().do_help(line) + + def complete_stat(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + stat_parser = argparse.ArgumentParser( + description='Display file or file system status') + stat_parser.add_argument('paths', type=str, help='file paths', + action=path_to_bytes, nargs='+') + + @with_argparser(stat_parser) + def do_stat(self, args): + """ + Display file or file system status + """ + for path in args.paths: + try: + stat = cephfs.stat(path) + atime = stat.st_atime.isoformat(' ') + mtime = stat.st_mtime.isoformat(' ') + ctime = stat.st_mtime.isoformat(' ') + + poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n" + "Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: " + "{:o}/{}\tUid: {:d}\tGid: {:d}\nAccess: {}\nModify: " + "{}\nChange: {}".format(path.decode('utf-8'), + stat.st_size, stat.st_blocks, + stat.st_blksize, stat.st_dev, + stat.st_ino, stat.st_nlink, + stat.st_mode, + mode_notation(stat.st_mode), + stat.st_uid, stat.st_gid, atime, + mtime, ctime)) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + setxattr_parser = argparse.ArgumentParser( + description='Set extended attribute for a file') + setxattr_parser.add_argument('path', type=str, action=path_to_bytes, help='Name of the file') + setxattr_parser.add_argument('name', type=str, help='Extended attribute name') + setxattr_parser.add_argument('value', type=str, help='Extended attribute value') + + @with_argparser(setxattr_parser) + def do_setxattr(self, args): + """ + Set extended attribute for a file + """ + val_bytes = to_bytes(args.value) + name_bytes = to_bytes(args.name) + try: + cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_CREATE) + poutput('{} is successfully set to {}'.format(args.name, args.value)) + except libcephfs.ObjectExists: + cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_REPLACE) + poutput('{} is successfully reset to {}'.format(args.name, args.value)) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + getxattr_parser = argparse.ArgumentParser( + description='Get extended attribute set for a file') + getxattr_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of the file') + getxattr_parser.add_argument('name', type=str, help='Extended attribute name') + + @with_argparser(getxattr_parser) + def do_getxattr(self, args): + """ + Get extended attribute for a file + """ + try: + poutput('{}'.format(cephfs.getxattr(args.path, + to_bytes(args.name)).decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + listxattr_parser = argparse.ArgumentParser( + description='List extended attributes set for a file') + listxattr_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of the file') + + @with_argparser(listxattr_parser) + def do_listxattr(self, args): + """ + List extended attributes for a file + """ + try: + size, xattr_list = cephfs.listxattr(args.path) + if size > 0: + poutput('{}'.format(xattr_list.replace(b'\x00', b' ').decode('utf-8'))) + else: + poutput('No extended attribute is set') + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + +####################################################### +# +# Following are methods that get cephfs-shell started. +# +##################################################### + +def setup_cephfs(): + """ + Mounting a cephfs + """ + global cephfs + try: + cephfs = libcephfs.LibCephFS(conffile='') + cephfs.mount() + except libcephfs.ObjectNotFound as e: + print('couldn\'t find ceph configuration not found') + sys.exit(e.get_error_code()) + except libcephfs.Error as e: + print(e) + sys.exit(e.get_error_code()) + + +def str_to_bool(val): + """ + Return corresponding bool values for strings like 'true' or 'false'. + """ + if not isinstance(val, str): + return val + + val = val.replace('\n', '') + if val.lower() in ['true', 'yes']: + return True + elif val.lower() in ['false', 'no']: + return False + else: + return val + + +def read_shell_conf(shell, shell_conf_file): + import configparser + + sec = 'cephfs-shell' + opts = [] + if LooseVersion(cmd2_version) >= LooseVersion("0.10.0"): + for attr in shell.settables.keys(): + opts.append(attr) + else: + if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"): + # hardcoding options for 0.7.9 because - + # 1. we use cmd2 v0.7.9 with teuthology and + # 2. there's no way distinguish between a shell setting and shell + # object attribute until v0.10.0 + opts = ['abbrev', 'autorun_on_edit', 'colors', + 'continuation_prompt', 'debug', 'echo', 'editor', + 'feedback_to_output', 'locals_in_py', 'prompt', 'quiet', + 'timing'] + elif LooseVersion(cmd2_version) >= LooseVersion("0.9.23"): + opts.append('allow_style') + # no equivalent option was defined by cmd2. + else: + pass + + # default and only section in our conf file. + cp = configparser.ConfigParser(default_section=sec, strict=False) + cp.read(shell_conf_file) + for opt in opts: + if cp.has_option(sec, opt): + setattr(shell, opt, str_to_bool(cp.get(sec, opt))) + + +def get_shell_conffile_path(arg_conf=''): + conf_filename = 'cephfs-shell.conf' + env_var = 'CEPHFS_SHELL_CONF' + + arg_conf = '' if not arg_conf else arg_conf + home_dir_conf = os.path.expanduser('~/.' + conf_filename) + env_conf = os.environ[env_var] if env_var in os.environ else '' + + # here's the priority by which conf gets read. + for path in (arg_conf, env_conf, home_dir_conf): + if os.path.isfile(path): + return path + else: + return '' + + +def manage_args(): + main_parser = argparse.ArgumentParser(description='') + main_parser.add_argument('-c', '--config', action='store', + help='Path to Ceph configuration file.', + type=str) + main_parser.add_argument('-b', '--batch', action='store', + help='Path to CephFS shell script/batch file' + 'containing CephFS shell commands', + type=str) + main_parser.add_argument('-t', '--test', action='store', + help='Test against transcript(s) in FILE', + nargs='+') + main_parser.add_argument('commands', nargs='*', help='Comma delimited ' + 'commands. The shell executes the given command ' + 'and quits immediately with the return value of ' + 'command. In case no commands are provided, the ' + 'shell is launched.', default=[]) + + args = main_parser.parse_args() + args.exe_and_quit = False # Execute and quit, don't launch the shell. + + if args.batch: + if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"): + args.commands = ['load ' + args.batch, ',quit'] + else: + args.commands = ['run_script ' + args.batch, ',quit'] + if args.test: + args.commands.extend(['-t,'] + [arg + ',' for arg in args.test]) + if not args.batch and len(args.commands) > 0: + args.exe_and_quit = True + + manage_sys_argv(args) + + return args + + +def manage_sys_argv(args): + exe = sys.argv[0] + sys.argv.clear() + sys.argv.append(exe) + sys.argv.extend([i.strip() for i in ' '.join(args.commands).split(',')]) + + setup_cephfs() + + +def execute_cmd_args(args): + """ + Launch a shell session if no arguments were passed, else just execute + the given argument as a shell command and exit the shell session + immediately at (last) command's termination with the (last) command's + return value. + """ + if not args.exe_and_quit: + return shell.cmdloop() + return execute_cmds_and_quit(args) + + +def execute_cmds_and_quit(args): + """ + Multiple commands might be passed separated by commas, feed onecmd() + one command at a time. + """ + # do_* methods triggered by cephfs-shell commands return None when they + # complete running successfully. Until 0.9.6, shell.onecmd() returned this + # value to indicate whether the execution of the commands should stop, but + # since 0.9.7 it returns the return value of do_* methods only if it's + # not None. When it is None it returns False instead of None. + if LooseVersion(cmd2_version) <= LooseVersion("0.9.6"): + stop_exec_val = None + else: + stop_exec_val = False + + args_to_onecmd = '' + if len(args.commands) <= 1: + args.commands = args.commands[0].split(' ') + for cmdarg in args.commands: + if ',' in cmdarg: + args_to_onecmd += ' ' + cmdarg[0:-1] + onecmd_retval = shell.onecmd(args_to_onecmd) + # if the curent command failed, let's abort the execution of + # series of commands passed. + if onecmd_retval is not stop_exec_val: + return onecmd_retval + if shell.exit_code != 0: + return shell.exit_code + + args_to_onecmd = '' + continue + + args_to_onecmd += ' ' + cmdarg + return shell.onecmd(args_to_onecmd) + + +if __name__ == '__main__': + args = manage_args() + + shell = CephFSShell() + # TODO: perhaps, we should add an option to pass ceph.conf? + read_shell_conf(shell, get_shell_conffile_path(args.config)) + # XXX: setting shell.exit_code to zero so that in case there are no errors + # and exceptions, it is not set by any method or function of cephfs-shell + # and return values from shell.cmdloop() or shell.onecmd() is not an + # integer, we can treat it as the return value of cephfs-shell. + shell.exit_code = 0 + + retval = execute_cmd_args(args) + sys.exit(retval if retval else shell.exit_code) diff --git a/src/tools/cephfs/shell/setup.py b/src/tools/cephfs/shell/setup.py new file mode 100644 index 000000000..8cf7f28f7 --- /dev/null +++ b/src/tools/cephfs/shell/setup.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +from setuptools import setup + +__version__ = '0.0.1' + +setup( + name='cephfs-shell', + version=__version__, + description='Interactive shell for Ceph file system', + keywords='cephfs, shell', + scripts=['cephfs-shell'], + install_requires=[ + 'cephfs', + 'cmd2', + 'colorama', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python :: 3' + ], + license='LGPLv2+', +) diff --git a/src/tools/cephfs/shell/tox.ini b/src/tools/cephfs/shell/tox.ini new file mode 100644 index 000000000..c1cbff051 --- /dev/null +++ b/src/tools/cephfs/shell/tox.ini @@ -0,0 +1,7 @@ +[tox] +envlist = py3 +skipsdist = true + +[testenv:py3] +deps = flake8 +commands = flake8 --ignore=W503 --max-line-length=100 cephfs-shell diff --git a/src/tools/cephfs/top/CMakeLists.txt b/src/tools/cephfs/top/CMakeLists.txt new file mode 100644 index 000000000..49750c850 --- /dev/null +++ b/src/tools/cephfs/top/CMakeLists.txt @@ -0,0 +1,7 @@ +include(Distutils) +distutils_install_module(cephfs-top) + +if(WITH_TESTS) + include(AddCephTest) + add_tox_test(cephfs-top) +endif() diff --git a/src/tools/cephfs/top/cephfs-top b/src/tools/cephfs/top/cephfs-top new file mode 100755 index 000000000..d57c3ab83 --- /dev/null +++ b/src/tools/cephfs/top/cephfs-top @@ -0,0 +1,888 @@ +#!/usr/bin/python3 + +import argparse +import sys +import curses +import errno +import json +import signal +import time +import math +import threading + +from collections import OrderedDict +from datetime import datetime +from enum import Enum, unique + +import rados + + +class FSTopException(Exception): + def __init__(self, msg=''): + self.error_msg = msg + + def get_error_msg(self): + return self.error_msg + + +@unique +class MetricType(Enum): + METRIC_TYPE_NONE = 0 + METRIC_TYPE_PERCENTAGE = 1 + METRIC_TYPE_LATENCY = 2 + METRIC_TYPE_SIZE = 3 + METRIC_TYPE_STDEV = 4 + + +FS_TOP_PROG_STR = 'cephfs-top' +FS_TOP_ALL_FS_APP = 'ALL_FS_APP' +FS_TOP_FS_SELECTED_APP = 'SELECTED_FS_APP' + +# version match b/w fstop and stats emitted by mgr/stats +FS_TOP_SUPPORTED_VER = 2 + +ITEMS_PAD_LEN = 3 +ITEMS_PAD = " " * ITEMS_PAD_LEN +DEFAULT_REFRESH_INTERVAL = 1 +# min refresh interval allowed +MIN_REFRESH_INTERVAL = 0.5 + +# metadata provided by mgr/stats +FS_TOP_MAIN_WINDOW_COL_CLIENT_ID = "client_id" +FS_TOP_MAIN_WINDOW_COL_MNT_ROOT = "mount_root" +FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR = "mount_point@host/addr" + +MAIN_WINDOW_TOP_LINE_ITEMS_START = [ITEMS_PAD, + FS_TOP_MAIN_WINDOW_COL_CLIENT_ID, + FS_TOP_MAIN_WINDOW_COL_MNT_ROOT] +MAIN_WINDOW_TOP_LINE_ITEMS_END = [FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR] + +MAIN_WINDOW_TOP_LINE_METRICS_LEGACY = ["READ_LATENCY", + "WRITE_LATENCY", + "METADATA_LATENCY" + ] + +# adjust this map according to stats version and maintain order +# as emitted by mgr/stast +MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([ + ("CAP_HIT", MetricType.METRIC_TYPE_PERCENTAGE), + ("READ_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("DENTRY_LEASE", MetricType.METRIC_TYPE_PERCENTAGE), + ("OPENED_FILES", MetricType.METRIC_TYPE_NONE), + ("PINNED_ICAPS", MetricType.METRIC_TYPE_NONE), + ("OPENED_INODES", MetricType.METRIC_TYPE_NONE), + ("READ_IO_SIZES", MetricType.METRIC_TYPE_SIZE), + ("WRITE_IO_SIZES", MetricType.METRIC_TYPE_SIZE), + ("AVG_READ_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("STDEV_READ_LATENCY", MetricType.METRIC_TYPE_STDEV), + ("AVG_WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("STDEV_WRITE_LATENCY", MetricType.METRIC_TYPE_STDEV), + ("AVG_METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("STDEV_METADATA_LATENCY", MetricType.METRIC_TYPE_STDEV), +]) +MGR_STATS_COUNTERS = list(MAIN_WINDOW_TOP_LINE_METRICS.keys()) + +FS_TOP_VERSION_HEADER_FMT = '{prog_name} - {now}' +FS_TOP_CLIENT_HEADER_FMT = 'Total Client(s): {num_clients} - '\ + '{num_mounts} FUSE, {num_kclients} kclient, {num_libs} libcephfs' +FS_TOP_NAME_TOPL_FMT = 'Filesystem: {fs_name} - {client_count} client(s)' + +CLIENT_METADATA_KEY = "client_metadata" +CLIENT_METADATA_MOUNT_POINT_KEY = "mount_point" +CLIENT_METADATA_MOUNT_ROOT_KEY = "root" +CLIENT_METADATA_IP_KEY = "IP" +CLIENT_METADATA_HOSTNAME_KEY = "hostname" +CLIENT_METADATA_VALID_METRICS_KEY = "valid_metrics" + +GLOBAL_METRICS_KEY = "global_metrics" +GLOBAL_COUNTERS_KEY = "global_counters" + +last_time = time.time() +last_read_size = {} +last_write_size = {} + +fs_list = [] + + +def calc_perc(c): + if c[0] == 0 and c[1] == 0: + return 0.0 + return round((c[0] / (c[0] + c[1])) * 100, 2) + + +def calc_lat(c): + return round(c[0] * 1000 + c[1] / 1000000, 2) + + +def calc_stdev(c): + stdev = 0.0 + if c[1] > 1: + stdev = math.sqrt(c[0] / (c[1] - 1)) / 1000000 + return round(stdev, 2) + + +# in MB +def calc_size(c): + return round(c[1] / (1024 * 1024), 2) + + +# in MB +def calc_avg_size(c): + if c[0] == 0: + return 0.0 + return round(c[1] / (c[0] * 1024 * 1024), 2) + + +# in MB/s +def calc_speed(size, duration): + if duration == 0: + return 0.0 + return round(size / (duration * 1024 * 1024), 2) + + +def wrap(s, sl): + """return a '+' suffixed wrapped string""" + if len(s) < sl: + return s + return f'{s[0:sl-1]}+' + + +class FSTop(object): + def __init__(self, args): + self.rados = None + self.stdscr = None # curses instance + self.current_screen = "" + self.client_name = args.id + self.cluster_name = args.cluster + self.conffile = args.conffile + self.refresh_interval_secs = args.delay + self.PAD_HEIGHT = 10000 # height of the fstop_pad + self.PAD_WIDTH = 300 # width of the fstop_pad + self.exit_ev = threading.Event() + + def handle_signal(self, signum, _): + self.exit_ev.set() + + def init(self): + try: + if self.conffile: + r_rados = rados.Rados(rados_id=self.client_name, + clustername=self.cluster_name, + conffile=self.conffile) + else: + r_rados = rados.Rados(rados_id=self.client_name, + clustername=self.cluster_name) + r_rados.conf_read_file() + r_rados.connect() + self.rados = r_rados + except rados.Error as e: + if e.errno == errno.ENOENT: + raise FSTopException(f'cluster {self.cluster_name}' + ' does not exist') + else: + raise FSTopException(f'error connecting to cluster: {e}') + self.verify_perf_stats_support() + signal.signal(signal.SIGTERM, self.handle_signal) + signal.signal(signal.SIGINT, self.handle_signal) + + def fini(self): + if self.rados: + self.rados.shutdown() + self.rados = None + + def selftest(self): + stats_json = self.perf_stats_query() + if not stats_json['version'] == FS_TOP_SUPPORTED_VER: + raise FSTopException('perf stats version mismatch!') + missing = [m for m in stats_json["global_counters"] + if m.upper() not in MGR_STATS_COUNTERS] + if missing: + raise FSTopException('Cannot handle unknown metrics from' + f'\'ceph fs perf stats\': {missing}') + + def get_fs_names(self): + mon_cmd = {'prefix': 'fs ls', 'format': 'json'} + try: + ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'') + except Exception as e: + raise FSTopException(f'Error in fs ls: {e}') + fs_map = json.loads(buf.decode('utf-8')) + global fs_list + fs_list.clear() + for filesystem in fs_map: + fs = filesystem['name'] + fs_list.append(fs) + return fs_list + + def setup_curses(self, win): + self.stdscr = win + self.stdscr.keypad(True) + curses.use_default_colors() + curses.start_color() + try: + curses.curs_set(0) + except curses.error: + # If the terminal do not support the visibility + # requested it will raise an exception + pass + self.fstop_pad = curses.newpad(self.PAD_HEIGHT, self.PAD_WIDTH) + self.run_all_display() + + def display_fs_menu(self, stdscr, selected_row_idx): + stdscr.clear() + h, w = stdscr.getmaxyx() + global fs_list + if not fs_list: + title = ['No filesystem available', + 'Press "q" to go back to home (All Filesystem Info) screen'] + pos_x1 = w // 2 - len(title[0]) // 2 + pos_x2 = w // 2 - len(title[1]) // 2 + stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD) + stdscr.addstr(3, pos_x2, title[1]) + else: + title = ['Filesystems', 'Press "q" to go back to home (All Filesystem Info) screen'] + pos_x1 = w // 2 - len(title[0]) // 2 + pos_x2 = w // 2 - len(title[1]) // 2 + stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD) + stdscr.addstr(3, pos_x2, title[1]) + for index, name in enumerate(fs_list): + x = w // 2 - len(name) // 2 + y = h // 2 - len(fs_list) // 2 + index + if index == selected_row_idx: + stdscr.attron(curses.color_pair(1)) + stdscr.addstr(y, x, name) + stdscr.attroff(curses.color_pair(1)) + else: + stdscr.addstr(y, x, name) + stdscr.refresh() + + def set_key(self, stdscr): + curses.curs_set(0) + curses.init_pair(1, curses.COLOR_BLACK, curses.COLOR_WHITE) + curr_row = 0 + key = 0 + endmenu = False + while not endmenu: + global fs_list + fs_list = self.get_fs_names() + + if key == curses.KEY_UP and curr_row > 0: + curr_row -= 1 + elif key == curses.KEY_DOWN and curr_row < len(fs_list) - 1: + curr_row += 1 + elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list: + self.stdscr.erase() + self.run_display(fs_list[curr_row]) + endmenu = True + elif key == ord('q'): + self.stdscr.erase() + self.run_all_display() + endmenu = True + + try: + self.display_fs_menu(stdscr, curr_row) + except curses.error: + pass + curses.halfdelay(self.refresh_interval_secs) + key = stdscr.getch() + + def set_option(self, opt): + if opt == ord('m'): + curses.wrapper(self.set_key) + elif opt == ord('q'): + if self.current_screen == FS_TOP_ALL_FS_APP: + quit() + else: + self.run_all_display() + + def verify_perf_stats_support(self): + mon_cmd = {'prefix': 'mgr module ls', 'format': 'json'} + try: + ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'') + except Exception as e: + raise FSTopException(f'error checking \'stats\' module: {e}') + if ret != 0: + raise FSTopException(f'error checking \'stats\' module: {out}') + if 'stats' not in json.loads(buf.decode('utf-8'))['enabled_modules']: + raise FSTopException('\'stats\' module not enabled. Use' + '\'ceph mgr module enable stats\' to enable') + + def perf_stats_query(self): + mgr_cmd = {'prefix': 'fs perf stats', 'format': 'json'} + try: + ret, buf, out = self.rados.mgr_command(json.dumps(mgr_cmd), b'') + except Exception as e: + raise FSTopException(f'error in \'perf stats\' query: {e}') + if ret != 0: + raise FSTopException(f'error in \'perf stats\' query: {out}') + return json.loads(buf.decode('utf-8')) + + def items(self, item): + if item == "CAP_HIT": + return "chit" + if item == "READ_LATENCY": + return "rlat" + if item == "WRITE_LATENCY": + return "wlat" + if item == "METADATA_LATENCY": + return "mlat" + if item == "DENTRY_LEASE": + return "dlease" + if item == "OPENED_FILES": + return "ofiles" + if item == "PINNED_ICAPS": + return "oicaps" + if item == "OPENED_INODES": + return "oinodes" + if item == "READ_IO_SIZES": + return "rtio" + if item == "WRITE_IO_SIZES": + return "wtio" + if item == 'AVG_READ_LATENCY': + return 'rlatavg' + if item == 'STDEV_READ_LATENCY': + return 'rlatsd' + if item == 'AVG_WRITE_LATENCY': + return 'wlatavg' + if item == 'STDEV_WRITE_LATENCY': + return 'wlatsd' + if item == 'AVG_METADATA_LATENCY': + return 'mlatavg' + if item == 'STDEV_METADATA_LATENCY': + return 'mlatsd' + else: + # return empty string for none type + return '' + + def mtype(self, typ): + if typ == MetricType.METRIC_TYPE_PERCENTAGE: + return "(%)" + elif typ == MetricType.METRIC_TYPE_LATENCY: + return "(ms)" + elif typ == MetricType.METRIC_TYPE_SIZE: + return "(MB)" + elif typ == MetricType.METRIC_TYPE_STDEV: + return "(ms)" + else: + # return empty string for none type + return '' + + def avg_items(self, item): + if item == "READ_IO_SIZES": + return "raio" + if item == "WRITE_IO_SIZES": + return "waio" + else: + # return empty string for none type + return '' + + def speed_items(self, item): + if item == "READ_IO_SIZES": + return "rsp" + if item == "WRITE_IO_SIZES": + return "wsp" + else: + # return empty string for none type + return '' + + def speed_mtype(self, typ): + if typ == MetricType.METRIC_TYPE_SIZE: + return "(MB/s)" + else: + # return empty string for none type + return '' + + @staticmethod + def has_metric(metadata, metrics_key): + return metrics_key in metadata + + @staticmethod + def has_metrics(metadata, metrics_keys): + for key in metrics_keys: + if not FSTop.has_metric(metadata, key): + return False + return True + + def create_top_line_and_build_coord(self): + xp = 0 + x_coord_map = {} + + heading = [] + for item in MAIN_WINDOW_TOP_LINE_ITEMS_START: + heading.append(item) + nlen = len(item) + len(ITEMS_PAD) + x_coord_map[item] = (xp, nlen) + xp += nlen + + for item, typ in MAIN_WINDOW_TOP_LINE_METRICS.items(): + if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY: + continue + it = f'{self.items(item)}{self.mtype(typ)}' + heading.append(it) + nlen = len(it) + len(ITEMS_PAD) + x_coord_map[item] = (xp, nlen) + xp += nlen + + if item == "READ_IO_SIZES" or item == "WRITE_IO_SIZES": + # average io sizes + it = f'{self.avg_items(item)}{self.mtype(typ)}' + heading.append(it) + nlen = len(it) + len(ITEMS_PAD) + if item == "READ_IO_SIZES": + x_coord_map["READ_IO_AVG"] = (xp, nlen) + if item == "WRITE_IO_SIZES": + x_coord_map["WRITE_IO_AVG"] = (xp, nlen) + xp += nlen + + # io speeds + it = f'{self.speed_items(item)}{self.speed_mtype(typ)}' + heading.append(it) + nlen = len(it) + len(ITEMS_PAD) + if item == "READ_IO_SIZES": + x_coord_map["READ_IO_SPEED"] = (xp, nlen) + if item == "WRITE_IO_SIZES": + x_coord_map["WRITE_IO_SPEED"] = (xp, nlen) + xp += nlen + + for item in MAIN_WINDOW_TOP_LINE_ITEMS_END: + heading.append(item) + nlen = len(item) + len(ITEMS_PAD) + x_coord_map[item] = (xp, nlen) + xp += nlen + title = ITEMS_PAD.join(heading) + self.fsstats.addstr(self.tablehead_y, 0, title, curses.A_STANDOUT | curses.A_BOLD) + return x_coord_map + + def create_client(self, client_id, metrics, counters, + client_meta, x_coord_map, y_coord): + global last_time + size = 0 + cur_time = time.time() + duration = cur_time - last_time + last_time = cur_time + for item in MAIN_WINDOW_TOP_LINE_ITEMS_START: + coord = x_coord_map[item] + hlen = coord[1] - 1 + if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID: + self.fsstats.addstr(y_coord, coord[0], + wrap(client_id.split('.')[1], hlen), curses.A_DIM) + elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT: + if FSTop.has_metric(client_meta, + CLIENT_METADATA_MOUNT_ROOT_KEY): + self.fsstats.addstr( + y_coord, coord[0], + wrap(client_meta[CLIENT_METADATA_MOUNT_ROOT_KEY], hlen), curses.A_DIM) + else: + self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM) + + cidx = 0 + for item in counters: + if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY: + cidx += 1 + continue + coord = x_coord_map[item] + m = metrics[cidx] + key = MGR_STATS_COUNTERS[cidx] + typ = MAIN_WINDOW_TOP_LINE_METRICS[key] + if item.lower() in client_meta.get( + CLIENT_METADATA_VALID_METRICS_KEY, []): + if typ == MetricType.METRIC_TYPE_PERCENTAGE: + self.fsstats.addstr(y_coord, coord[0], + f'{calc_perc(m)}', curses.A_DIM) + elif typ == MetricType.METRIC_TYPE_LATENCY: + self.fsstats.addstr(y_coord, coord[0], + f'{calc_lat(m)}', curses.A_DIM) + elif typ == MetricType.METRIC_TYPE_STDEV: + self.fsstats.addstr(y_coord, coord[0], + f'{calc_stdev(m)}', curses.A_DIM) + elif typ == MetricType.METRIC_TYPE_SIZE: + self.fsstats.addstr(y_coord, coord[0], + f'{calc_size(m)}', curses.A_DIM) + + # average io sizes + if key == "READ_IO_SIZES": + coord = x_coord_map["READ_IO_AVG"] + elif key == "WRITE_IO_SIZES": + coord = x_coord_map["WRITE_IO_AVG"] + self.fsstats.addstr(y_coord, coord[0], + f'{calc_avg_size(m)}', curses.A_DIM) + + # io speeds + if key == "READ_IO_SIZES": + coord = x_coord_map["READ_IO_SPEED"] + elif key == "WRITE_IO_SIZES": + coord = x_coord_map["WRITE_IO_SPEED"] + size = 0 + if key == "READ_IO_SIZES": + if m[1] > 0: + global last_read_size + last_size = last_read_size.get(client_id, 0) + size = m[1] - last_size + last_read_size[client_id] = m[1] + if key == "WRITE_IO_SIZES": + if m[1] > 0: + global last_write_size + last_size = last_write_size.get(client_id, 0) + size = m[1] - last_size + last_write_size[client_id] = m[1] + self.fsstats.addstr(y_coord, coord[0], + f'{calc_speed(abs(size), duration)}', curses.A_DIM) + else: + # display 0th element from metric tuple + self.fsstats.addstr(y_coord, coord[0], f'{m[0]}', curses.A_DIM) + else: + self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM) + cidx += 1 + + for item in MAIN_WINDOW_TOP_LINE_ITEMS_END: + coord = x_coord_map[item] + wrapLen = self.PAD_WIDTH - coord[0] + # always place the FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR in the + # last, it will be a very long string to display + if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR: + if FSTop.has_metrics(client_meta, + [CLIENT_METADATA_MOUNT_POINT_KEY, + CLIENT_METADATA_HOSTNAME_KEY, + CLIENT_METADATA_IP_KEY]): + mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}@'\ + f'{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\ + f'{client_meta[CLIENT_METADATA_IP_KEY]}' + self.fsstats.addstr( + y_coord, coord[0], + wrap(mount_point, wrapLen), curses.A_DIM) + else: + self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM) + + def create_clients(self, x_coord_map, stats_json, fs_name): + counters = [m.upper() for m in stats_json[GLOBAL_COUNTERS_KEY]] + self.tablehead_y += 2 + res = stats_json[CLIENT_METADATA_KEY].get(fs_name, {}) + client_cnt = len(res) + self.fsstats.addstr(self.tablehead_y, 0, FS_TOP_NAME_TOPL_FMT.format( + fs_name=fs_name, client_count=client_cnt), curses.A_BOLD | curses.A_ITALIC) + self.tablehead_y += 2 + if client_cnt: + for client_id, metrics in \ + stats_json[GLOBAL_METRICS_KEY][fs_name].items(): + self.create_client( + client_id, metrics, counters, res[client_id], + x_coord_map, self.tablehead_y) + self.tablehead_y += 1 + + def create_header(self, stats_json, help, screen_title="", color_id=0): + num_clients, num_mounts, num_kclients, num_libs = 0, 0, 0, 0 + if not stats_json['version'] == FS_TOP_SUPPORTED_VER: + self.header.addstr(0, 0, 'perf stats version mismatch!', curses.A_BOLD) + return False + global fs_list + for fs_name in fs_list: + client_metadata = stats_json[CLIENT_METADATA_KEY].get(fs_name, {}) + client_cnt = len(client_metadata) + if client_cnt: + num_clients = num_clients + client_cnt + num_mounts = num_mounts + len( + [client for client, metadata in client_metadata.items() if + CLIENT_METADATA_MOUNT_POINT_KEY in metadata + and metadata[CLIENT_METADATA_MOUNT_POINT_KEY] != 'N/A']) + num_kclients = num_kclients + len( + [client for client, metadata in client_metadata.items() if + "kernel_version" in metadata]) + num_libs = num_clients - (num_mounts + num_kclients) + now = datetime.now().ctime() + self.header.addstr(0, 0, FS_TOP_VERSION_HEADER_FMT.format(prog_name=FS_TOP_PROG_STR, + now=now), curses.A_BOLD) + self.header.addstr(2, 0, screen_title, curses.color_pair(color_id) | curses.A_BOLD) + self.header.addstr(3, 0, FS_TOP_CLIENT_HEADER_FMT.format(num_clients=num_clients, + num_mounts=num_mounts, + num_kclients=num_kclients, + num_libs=num_libs), curses.A_DIM) + self.header.addstr(4, 0, help, curses.A_DIM) + return True + + def run_display(self, fs): + # clear the pads to have a smooth refresh + self.header.erase() + self.fsstats.erase() + + self.current_screen = FS_TOP_FS_SELECTED_APP + screen_title = "Selected Filesystem Info" + help_commands = "Press 'q' to go back to home (All Filesystem Info) screen"\ + " | Press 'm' to select another filesystem" + curses.init_pair(3, curses.COLOR_MAGENTA, -1) + + top, left = 0, 0 # where to place pad + vscrollOffset, hscrollOffset = 0, 0 # scroll offsets + + # calculate the initial viewport height and width + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + + # create header subpad + self.header_height = 7 + self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0) + + # create fsstats subpad + fsstats_begin_y = self.header_height + fsstats_height = self.PAD_HEIGHT - self.header_height + self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0) + + curses.halfdelay(1) + cmd = self.stdscr.getch() + while not self.exit_ev.is_set(): + if cmd in [ord('m'), ord('q')]: + self.set_option(cmd) + self.exit_ev.set() + + # header display + global fs_list + fs_list = self.get_fs_names() + stats_json = self.perf_stats_query() + vscrollEnd = 0 + if fs not in fs_list: + help = "Error: The selected filesystem is not available now. " + help_commands + self.header.erase() # erase previous text + self.create_header(stats_json, help, screen_title, 3) + else: + self.tablehead_y = 0 + help = "HELP: " + help_commands + self.fsstats.erase() # erase previous text + + vscrollEnd = len(stats_json[CLIENT_METADATA_KEY].get(fs, {})) + if self.create_header(stats_json, help, screen_title, 3): + x_coord_map = self.create_top_line_and_build_coord() + self.create_clients(x_coord_map, stats_json, fs) + + # scroll and refresh + if cmd == curses.KEY_DOWN: + if (vscrollEnd - vscrollOffset) > 1: + vscrollOffset += 1 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_UP: + if vscrollOffset > 0: + vscrollOffset -= 1 + elif cmd == curses.KEY_NPAGE: + if (vscrollEnd - vscrollOffset) / 20 > 1: + vscrollOffset += 20 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_PPAGE: + if vscrollOffset / 20 >= 1: + vscrollOffset -= 20 + else: + vscrollOffset = 0 + elif cmd == curses.KEY_RIGHT: + if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1: + hscrollOffset += 1 + elif cmd == curses.KEY_LEFT: + if hscrollOffset > 0: + hscrollOffset -= 1 + elif cmd == curses.KEY_HOME: + hscrollOffset = 0 + elif cmd == curses.KEY_END: + hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1 + elif cmd == curses.KEY_RESIZE: + # terminal resize event. Update the viewport dimensions + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + + if cmd: + try: + # refresh the viewport for the header portion + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE, + curses.KEY_RIGHT, + curses.KEY_LEFT]: + self.fstop_pad.refresh(0, 0, + top, left, + top + self.header_height, left + self.viewportWidth) + # refresh the viewport for the current table header portion in the fsstats pad + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE]: + self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset, + top + fsstats_begin_y, left, + 7, left + self.viewportWidth) + # refresh the viewport for the current client records portion in the fsstats pad + self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset, + top + fsstats_begin_y + 2, left, + top + self.viewportHeight, left + self.viewportWidth) + except curses.error: + # This happens when the user switches to a terminal of different zoom size. + # just retry it. + pass + # End scroll and refresh + + curses.halfdelay(self.refresh_interval_secs * 10) + cmd = self.stdscr.getch() + + def run_all_display(self): + # clear text from the previous screen + if self.current_screen == FS_TOP_FS_SELECTED_APP: + self.header.erase() + + self.current_screen = FS_TOP_ALL_FS_APP + screen_title = "All Filesystem Info" + curses.init_pair(2, curses.COLOR_CYAN, -1) + + top, left = 0, 0 # where to place pad + vscrollOffset, hscrollOffset = 0, 0 # scroll offsets + + # calculate the initial viewport height and width + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + + # create header subpad + self.header_height = 7 + self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0) + + # create fsstats subpad + fsstats_begin_y = self.header_height + fsstats_height = self.PAD_HEIGHT - self.header_height + self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0) + + curses.halfdelay(1) + cmd = self.stdscr.getch() + while not self.exit_ev.is_set(): + if cmd in [ord('m'), ord('q')]: + self.set_option(cmd) + self.exit_ev.set() + + # header display + global fs_list + fs_list = self.get_fs_names() + stats_json = self.perf_stats_query() + vscrollEnd = 0 + if not fs_list: + help = "INFO: No filesystem is available [Press 'q' to quit]" + self.header.erase() # erase previous text + self.fsstats.erase() + self.create_header(stats_json, help, screen_title, 2) + else: + self.tablehead_y = 0 + help = "HELP: Press 'm' to select a filesystem | Press 'q' to quit" + self.fsstats.erase() # erase previous text + for index, fs in enumerate(fs_list): + # Get the vscrollEnd in advance + vscrollEnd += len(stats_json[CLIENT_METADATA_KEY].get(fs, {})) + if self.create_header(stats_json, help, screen_title, 2): + if not index: # do it only for the first fs + x_coord_map = self.create_top_line_and_build_coord() + self.create_clients(x_coord_map, stats_json, fs) + + # scroll and refresh + if cmd == curses.KEY_DOWN: + if (vscrollEnd - vscrollOffset) > 1: + vscrollOffset += 1 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_UP: + if vscrollOffset > 0: + vscrollOffset -= 1 + elif cmd == curses.KEY_NPAGE: + if (vscrollEnd - vscrollOffset) / 20 > 1: + vscrollOffset += 20 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_PPAGE: + if vscrollOffset / 20 >= 1: + vscrollOffset -= 20 + else: + vscrollOffset = 0 + elif cmd == curses.KEY_RIGHT: + if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1: + hscrollOffset += 1 + elif cmd == curses.KEY_LEFT: + if hscrollOffset > 0: + hscrollOffset -= 1 + elif cmd == curses.KEY_HOME: + hscrollOffset = 0 + elif cmd == curses.KEY_END: + hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1 + elif cmd == curses.KEY_RESIZE: + # terminal resize event. Update the viewport dimensions + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + if cmd: + try: + # refresh the viewport for the header portion + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE, + curses.KEY_RIGHT, + curses.KEY_LEFT]: + self.fstop_pad.refresh(0, 0, + top, left, + top + self.header_height, left + self.viewportWidth) + # refresh the viewport for the current table header portion in the fsstats pad + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE]: + self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset, + top + fsstats_begin_y, left, + 7, left + self.viewportWidth) + # refresh the viewport for the current client records portion in the fsstats pad + self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset, + top + fsstats_begin_y + 2, left, + top + self.viewportHeight, left + self.viewportWidth) + except curses.error: + # This happens when the user switches to a terminal of different zoom size. + # just retry it. + pass + # End scroll and refresh + + curses.halfdelay(self.refresh_interval_secs * 10) + cmd = self.stdscr.getch() +# End class FSTop + + +if __name__ == '__main__': + def float_greater_than(x): + value = float(x) + if value < MIN_REFRESH_INTERVAL: + raise argparse.ArgumentTypeError( + 'Refresh interval should be greater than or equal to' + f' {MIN_REFRESH_INTERVAL}') + return value + + parser = argparse.ArgumentParser(description='Ceph Filesystem top utility') + parser.add_argument('--cluster', nargs='?', const='ceph', default='ceph', + help='Ceph cluster to connect (defualt: ceph)') + parser.add_argument('--id', nargs='?', const='fstop', default='fstop', + help='Ceph user to use to connection (default: fstop)') + parser.add_argument('--conffile', nargs='?', default=None, + help='Path to cluster configuration file') + parser.add_argument('--selftest', dest='selftest', action='store_true', + help='Run in selftest mode') + parser.add_argument('-d', '--delay', nargs='?', + default=DEFAULT_REFRESH_INTERVAL, + type=float_greater_than, + help='Refresh interval in seconds ' + f'(default: {DEFAULT_REFRESH_INTERVAL})') + + args = parser.parse_args() + err = False + ft = FSTop(args) + try: + ft.init() + if args.selftest: + ft.selftest() + sys.stdout.write("selftest ok\n") + else: + curses.wrapper(ft.setup_curses) + except FSTopException as fst: + err = True + sys.stderr.write(f'{fst.get_error_msg()}\n') + except Exception as e: + err = True + sys.stderr.write(f'exception: {e}\n') + finally: + ft.fini() + sys.exit(0 if not err else -1) diff --git a/src/tools/cephfs/top/setup.py b/src/tools/cephfs/top/setup.py new file mode 100644 index 000000000..92fbd964c --- /dev/null +++ b/src/tools/cephfs/top/setup.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +from setuptools import setup + +__version__ = '0.0.1' + +setup( + name='cephfs-top', + version=__version__, + description='top(1) like utility for Ceph Filesystem', + keywords='cephfs, top', + scripts=['cephfs-top'], + install_requires=[ + 'rados', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python :: 3' + ], + license='LGPLv2+', +) diff --git a/src/tools/cephfs/top/tox.ini b/src/tools/cephfs/top/tox.ini new file mode 100644 index 000000000..b125c0bc8 --- /dev/null +++ b/src/tools/cephfs/top/tox.ini @@ -0,0 +1,7 @@ +[tox] +envlist = py3 +skipsdist = true + +[testenv:py3] +deps = flake8 +commands = flake8 --ignore=W503 --max-line-length=100 cephfs-top diff --git a/src/tools/cephfs/type_helper.hpp b/src/tools/cephfs/type_helper.hpp new file mode 100644 index 000000000..06dfa0afe --- /dev/null +++ b/src/tools/cephfs/type_helper.hpp @@ -0,0 +1,28 @@ +#ifndef TYPE_HELPER_HPP__ +#define TYPE_HELPER_HPP__ + +template<typename T1, typename T2> +T1 conv_t(T2 s){ + T1 target; + std::stringstream conv; + conv << s; + conv >> target; + return target; +} + +void string_split(std::string str, vector<string>& out, string split = ":"){ + std::cout << str << std::endl; + auto pos = str.find(split); + while(pos != std::string::npos){ + std::cout << str.substr(0, pos) << std::endl; + out.push_back(str.substr(0, pos)); + if (str.size() > pos + split.size()){ + str = str.substr(pos + split.size()); + pos = str.find(split); + }else + return; + } + out.push_back(str.substr()); + return; +} +#endif // TYPE_HELPER_HPP__ |