diff options
Diffstat (limited to 'src/tools')
247 files changed, 75469 insertions, 0 deletions
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt new file mode 100644 index 00000000..fc8539ff --- /dev/null +++ b/src/tools/CMakeLists.txt @@ -0,0 +1,129 @@ +set(rados_srcs + rados/rados.cc + RadosDump.cc + rados/RadosImport.cc + rados/PoolDump.cc + ${PROJECT_SOURCE_DIR}/src/common/util.cc + ${PROJECT_SOURCE_DIR}/src/common/obj_bencher.cc + ${PROJECT_SOURCE_DIR}/src/osd/ECUtil.cc) +add_executable(rados ${rados_srcs}) + +target_link_libraries(rados librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) +if(WITH_LIBRADOSSTRIPER) + target_link_libraries(rados radosstriper) +else() + target_link_libraries(rados cls_lock_client) +endif() +install(TARGETS rados DESTINATION bin) + +if(WITH_TESTS) +add_executable(ceph_scratchtool scratchtool.c) +target_link_libraries(ceph_scratchtool librados global) +install(TARGETS ceph_scratchtool DESTINATION bin) + +add_executable(ceph_scratchtoolpp scratchtoolpp.cc) +target_link_libraries(ceph_scratchtoolpp librados global) +install(TARGETS ceph_scratchtoolpp DESTINATION bin) + +add_executable(ceph_radosacl radosacl.cc) +target_link_libraries(ceph_radosacl librados global) +install(TARGETS ceph_radosacl DESTINATION bin) + +install(PROGRAMS + ceph-monstore-update-crush.sh + DESTINATION ${CMAKE_INSTALL_LIBDIR}/ceph) +endif(WITH_TESTS) + +add_executable(ceph-osdomap-tool ceph_osdomap_tool.cc) +target_link_libraries(ceph-osdomap-tool os global Boost::program_options) +install(TARGETS ceph-osdomap-tool DESTINATION bin) + +add_executable(ceph-monstore-tool + ceph_monstore_tool.cc + ../mgr/mgr_commands.cc) +target_link_libraries(ceph-monstore-tool os global Boost::program_options) +install(TARGETS ceph-monstore-tool DESTINATION bin) + +add_executable(ceph-objectstore-tool + ceph_objectstore_tool.cc + rebuild_mondb.cc + RadosDump.cc) +target_link_libraries(ceph-objectstore-tool osd os global Boost::program_options ${CMAKE_DL_LIBS}) +if(WITH_FUSE) + target_link_libraries(ceph-objectstore-tool FUSE::FUSE) +endif(WITH_FUSE) 
+install(TARGETS ceph-objectstore-tool DESTINATION bin) + +if(WITH_LIBCEPHFS) +if(WITH_TESTS) + add_executable(ceph-client-debug ceph-client-debug.cc) + target_link_libraries(ceph-client-debug cephfs global client) + install(TARGETS ceph-client-debug DESTINATION bin) +endif(WITH_TESTS) +endif(WITH_LIBCEPHFS) + +add_executable(ceph-kvstore-tool + kvstore_tool.cc + ceph_kvstore_tool.cc) +target_link_libraries(ceph-kvstore-tool os global) +install(TARGETS ceph-kvstore-tool DESTINATION bin) + +set(ceph_conf_srcs ceph_conf.cc) +add_executable(ceph-conf ${ceph_conf_srcs}) +target_link_libraries(ceph-conf global) +install(TARGETS ceph-conf DESTINATION bin) + +set(crushtool_srcs crushtool.cc) +add_executable(crushtool ${crushtool_srcs}) +target_link_libraries(crushtool global) +install(TARGETS crushtool DESTINATION bin) + +set(monmaptool_srcs monmaptool.cc) +add_executable(monmaptool ${monmaptool_srcs}) +target_link_libraries(monmaptool global) +install(TARGETS monmaptool DESTINATION bin) + +set(osdomaptool_srcs osdmaptool.cc) +add_executable(osdmaptool ${osdomaptool_srcs}) +target_link_libraries(osdmaptool global) +install(TARGETS osdmaptool DESTINATION bin) + +set(ceph-diff-sorted_srcs ceph-diff-sorted.cc) +add_executable(ceph-diff-sorted ${ceph-diff-sorted_srcs}) +install(TARGETS ceph-diff-sorted DESTINATION bin) + +if(WITH_TESTS) +set(ceph_psim_srcs psim.cc) +add_executable(ceph_psim ${ceph_psim_srcs}) +target_link_libraries(ceph_psim global) +install(TARGETS ceph_psim DESTINATION bin) +endif(WITH_TESTS) + +set(ceph_authtool_srcs ceph_authtool.cc) +add_executable(ceph-authtool ${ceph_authtool_srcs}) +target_link_libraries(ceph-authtool global ${EXTRALIBS} ${CRYPTO_LIBS}) +install(TARGETS ceph-authtool DESTINATION bin) + +if(WITH_TESTS) +set(cephdeduptool_srcs ceph_dedup_tool.cc) +add_executable(cephdeduptool ${cephdeduptool_srcs}) +target_link_libraries(cephdeduptool librados global cls_cas_client) +install(TARGETS cephdeduptool DESTINATION bin) +endif(WITH_TESTS) + 
+if(WITH_CEPHFS) + add_subdirectory(cephfs) +endif(WITH_CEPHFS) + +if(WITH_RBD) + add_subdirectory(rbd) + add_subdirectory(rbd_mirror) + if(LINUX) + add_subdirectory(rbd_nbd) + endif() + if(FREEBSD) + add_subdirectory(rbd_ggate) + endif() +endif(WITH_RBD) + +add_subdirectory(ceph-dencoder) diff --git a/src/tools/RadosDump.cc b/src/tools/RadosDump.cc new file mode 100644 index 00000000..420cd9fc --- /dev/null +++ b/src/tools/RadosDump.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "RadosDump.h" + +int RadosDump::read_super() +{ + bufferlist ebl; + auto ebliter = ebl.cbegin(); + ssize_t bytes; + + bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH); + if ((size_t)bytes != super_header::FIXED_LENGTH) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + sh.decode(ebliter); + + return 0; +} + + +int RadosDump::get_header(header *h) +{ + assert (h != NULL); + + bufferlist ebl; + auto ebliter = ebl.cbegin(); + ssize_t bytes; + + bytes = ebl.read_fd(file_fd, sh.header_size); + if ((size_t)bytes != sh.header_size) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + h->decode(ebliter); + + return 0; +} + +int RadosDump::get_footer(footer *f) +{ + ceph_assert(f != NULL); + + bufferlist ebl; + auto ebliter = ebl.cbegin(); + ssize_t bytes; + + bytes = ebl.read_fd(file_fd, sh.footer_size); + if ((size_t)bytes != sh.footer_size) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + f->decode(ebliter); + + if (f->magic != endmagic) { + cerr << "Bad footer magic" << std::endl; + return -EFAULT; + } + + return 0; +} + +int 
RadosDump::read_section(sectiontype_t *type, bufferlist *bl) +{ + header hdr; + ssize_t bytes; + + int ret = get_header(&hdr); + if (ret) + return ret; + + *type = hdr.type; + + bl->clear(); + bytes = bl->read_fd(file_fd, hdr.size); + if (bytes != hdr.size) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + if (hdr.size > 0) { + footer ft; + ret = get_footer(&ft); + if (ret) + return ret; + } + + return 0; +} + + +int RadosDump::skip_object(bufferlist &bl) +{ + bufferlist ebl; + bool done = false; + while(!done) { + sectiontype_t type; + int ret = read_section(&type, &ebl); + if (ret) + return ret; + + if (type >= END_OF_TYPES) { + cout << "Skipping unknown object section type" << std::endl; + continue; + } + switch(type) { + case TYPE_DATA: + case TYPE_ATTRS: + case TYPE_OMAP_HDR: + case TYPE_OMAP: +#ifdef DIAGNOSTIC + cerr << "Skip type " << (int)type << std::endl; +#endif + break; + case TYPE_OBJECT_END: + done = true; + break; + default: + cerr << "Can't skip unknown type: " << type << std::endl; + return -EFAULT; + } + } + return 0; +} + +//Write super_header with its fixed 16 byte length +void RadosDump::write_super() +{ + if (dry_run) { + return; + } + + bufferlist superbl; + super_header sh; + footer ft; + + header hdr(TYPE_NONE, 0); + hdr.encode(superbl); + + sh.magic = super_header::super_magic; + sh.version = super_header::super_ver; + sh.header_size = superbl.length(); + superbl.clear(); + ft.encode(superbl); + sh.footer_size = superbl.length(); + superbl.clear(); + + sh.encode(superbl); + ceph_assert(super_header::FIXED_LENGTH == superbl.length()); + superbl.write_fd(file_fd); +} diff --git a/src/tools/RadosDump.h b/src/tools/RadosDump.h new file mode 100644 index 00000000..83f02e69 --- /dev/null +++ b/src/tools/RadosDump.h @@ -0,0 +1,409 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This 
is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RADOS_DUMP_H_ +#define RADOS_DUMP_H_ + +#include <stdint.h> + +#include "include/buffer.h" +#include "include/encoding.h" + +#include "osd/osd_types.h" +#include "osd/OSDMap.h" + +typedef uint8_t sectiontype_t; +typedef uint32_t mymagic_t; +typedef int64_t mysize_t; + +enum { + TYPE_NONE = 0, + TYPE_PG_BEGIN, + TYPE_PG_END, + TYPE_OBJECT_BEGIN, + TYPE_OBJECT_END, + TYPE_DATA, + TYPE_ATTRS, + TYPE_OMAP_HDR, + TYPE_OMAP, + TYPE_PG_METADATA, + TYPE_POOL_BEGIN, + TYPE_POOL_END, + END_OF_TYPES, //Keep at the end +}; + +const uint16_t shortmagic = 0xffce; //goes into stream as "ceff" +//endmagic goes into stream as "ceff ffec" +const mymagic_t endmagic = (0xecff << 16) | shortmagic; + +//The first FIXED_LENGTH bytes are a fixed +//portion of the export output. This includes the overall +//version number, and size of header and footer. +//THIS STRUCTURE CAN ONLY BE APPENDED TO. If it needs to expand, +//the version can be bumped and then anything +//can be added to the export format. 
+struct super_header { + static const uint32_t super_magic = (shortmagic << 16) | shortmagic; + // ver = 1, Initial version + // ver = 2, Add OSDSuperblock to pg_begin + static const uint32_t super_ver = 2; + static const uint32_t FIXED_LENGTH = 16; + uint32_t magic; + uint32_t version; + uint32_t header_size; + uint32_t footer_size; + + super_header() : magic(0), version(0), header_size(0), footer_size(0) { } + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(magic, bl); + encode(version, bl); + encode(header_size, bl); + encode(footer_size, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(magic, bl); + decode(version, bl); + decode(header_size, bl); + decode(footer_size, bl); + } +}; + +struct header { + sectiontype_t type; + mysize_t size; + header(sectiontype_t type, mysize_t size) : + type(type), size(size) { } + header(): type(0), size(0) { } + + void encode(bufferlist& bl) const { + uint32_t debug_type = (type << 24) | (type << 16) | shortmagic; + ENCODE_START(1, 1, bl); + encode(debug_type, bl); + encode(size, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + uint32_t debug_type; + DECODE_START(1, bl); + decode(debug_type, bl); + type = debug_type >> 24; + decode(size, bl); + DECODE_FINISH(bl); + } +}; + +struct footer { + mymagic_t magic; + footer() : magic(endmagic) { } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(magic, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(magic, bl); + DECODE_FINISH(bl); + } +}; + +struct pg_begin { + spg_t pgid; + OSDSuperblock superblock; + + pg_begin(spg_t pg, const OSDSuperblock& sb): + pgid(pg), superblock(sb) { } + pg_begin() { } + + void encode(bufferlist& bl) const { + // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then + // shard will be NO_SHARD for a replicated pool. 
This means + // that we allow the decode by struct_v 2. + ENCODE_START(3, 2, bl); + encode(pgid.pgid, bl); + encode(superblock, bl); + encode(pgid.shard, bl); + ENCODE_FINISH(bl); + } + // NOTE: New super_ver prevents decode from ver 1 + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(pgid.pgid, bl); + if (struct_v > 1) { + decode(superblock, bl); + } + if (struct_v > 2) { + decode(pgid.shard, bl); + } else { + pgid.shard = shard_id_t::NO_SHARD; + } + DECODE_FINISH(bl); + } +}; + +struct object_begin { + ghobject_t hoid; + + // Duplicate what is in the OI_ATTR so we have it at the start + // of object processing. + object_info_t oi; + + explicit object_begin(const ghobject_t &hoid): hoid(hoid) { } + object_begin() { } + + // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then + // generation will be NO_GEN, shard_id will be NO_SHARD for a replicated + // pool. This means we will allow the decode by struct_v 1. + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(hoid.hobj, bl); + encode(hoid.generation, bl); + encode(hoid.shard_id, bl); + encode(oi, bl, -1); /* FIXME: we always encode with full features */ + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(hoid.hobj, bl); + if (struct_v > 1) { + decode(hoid.generation, bl); + decode(hoid.shard_id, bl); + } else { + hoid.generation = ghobject_t::NO_GEN; + hoid.shard_id = shard_id_t::NO_SHARD; + } + if (struct_v > 2) { + decode(oi, bl); + } + DECODE_FINISH(bl); + } +}; + +struct data_section { + uint64_t offset; + uint64_t len; + bufferlist databl; + data_section(uint64_t offset, uint64_t len, bufferlist bl): + offset(offset), len(len), databl(bl) { } + data_section(): offset(0), len(0) { } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(offset, bl); + encode(len, bl); + encode(databl, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + 
DECODE_START(1, bl); + decode(offset, bl); + decode(len, bl); + decode(databl, bl); + DECODE_FINISH(bl); + } +}; + +struct attr_section { + map<string,bufferlist> data; + explicit attr_section(const map<string,bufferlist> &data) : data(data) { } + explicit attr_section(map<string, bufferptr> &data_) + { + for (std::map<std::string, bufferptr>::iterator i = data_.begin(); + i != data_.end(); ++i) { + bufferlist bl; + bl.push_back(i->second); + data[i->first] = bl; + } + } + + attr_section() { } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(data, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(data, bl); + DECODE_FINISH(bl); + } +}; + +struct omap_hdr_section { + bufferlist hdr; + explicit omap_hdr_section(bufferlist hdr) : hdr(hdr) { } + omap_hdr_section() { } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(hdr, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(hdr, bl); + DECODE_FINISH(bl); + } +}; + +struct omap_section { + map<string, bufferlist> omap; + explicit omap_section(const map<string, bufferlist> &omap) : + omap(omap) { } + omap_section() { } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(omap, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(omap, bl); + DECODE_FINISH(bl); + } +}; + +struct metadata_section { + // struct_ver is the on-disk version of original pg + __u8 struct_ver; // for reference + epoch_t map_epoch; + pg_info_t info; + pg_log_t log; + PastIntervals past_intervals; + OSDMap osdmap; + bufferlist osdmap_bl; // Used in lieu of encoding osdmap due to crc checking + map<eversion_t, hobject_t> divergent_priors; + pg_missing_t missing; + + metadata_section( + __u8 struct_ver, + epoch_t map_epoch, + const pg_info_t &info, + const pg_log_t &log, + const PastIntervals 
&past_intervals, + const pg_missing_t &missing) + : struct_ver(struct_ver), + map_epoch(map_epoch), + info(info), + log(log), + past_intervals(past_intervals), + missing(missing) {} + metadata_section() + : struct_ver(0), + map_epoch(0) { } + + void encode(bufferlist& bl) const { + ENCODE_START(6, 6, bl); + encode(struct_ver, bl); + encode(map_epoch, bl); + encode(info, bl); + encode(log, bl); + encode(past_intervals, bl); + // Equivalent to osdmap.encode(bl, features); but + // preserving exact layout for CRC checking. + bl.append(osdmap_bl); + encode(divergent_priors, bl); + encode(missing, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(6, bl); + decode(struct_ver, bl); + decode(map_epoch, bl); + decode(info, bl); + decode(log, bl); + if (struct_v >= 6) { + decode(past_intervals, bl); + } else if (struct_v > 1) { + cout << "NOTICE: Older export with classic past_intervals" << std::endl; + } else { + cout << "NOTICE: Older export without past_intervals" << std::endl; + } + if (struct_v > 2) { + osdmap.decode(bl); + } else { + cout << "WARNING: Older export without OSDMap information" << std::endl; + } + if (struct_v > 3) { + decode(divergent_priors, bl); + } + if (struct_v > 4) { + decode(missing, bl); + } + DECODE_FINISH(bl); + } +}; + +/** + * Superclass for classes that will need to handle a serialized RADOS + * dump. Requires that the serialized dump be opened with a known FD. 
+ */ +class RadosDump +{ + protected: + int file_fd; + super_header sh; + bool dry_run; + + public: + RadosDump(int file_fd_, bool dry_run_) + : file_fd(file_fd_), dry_run(dry_run_) + {} + + int read_super(); + int get_header(header *h); + int get_footer(footer *f); + int read_section(sectiontype_t *type, bufferlist *bl); + int skip_object(bufferlist &bl); + void write_super(); + + // Define this in .h because it's templated + template <typename T> + int write_section(sectiontype_t type, const T& obj, int fd) { + if (dry_run) + return 0; + bufferlist blhdr, bl, blftr; + obj.encode(bl); + header hdr(type, bl.length()); + hdr.encode(blhdr); + footer ft; + ft.encode(blftr); + + int ret = blhdr.write_fd(fd); + if (ret) return ret; + ret = bl.write_fd(fd); + if (ret) return ret; + ret = blftr.write_fd(fd); + return ret; + } + + int write_simple(sectiontype_t type, int fd) + { + if (dry_run) + return 0; + bufferlist hbl; + + header hdr(type, 0); + hdr.encode(hbl); + return hbl.write_fd(fd); + } +}; + +#endif diff --git a/src/tools/ceph-client-debug.cc b/src/tools/ceph-client-debug.cc new file mode 100644 index 00000000..7a43c9c2 --- /dev/null +++ b/src/tools/ceph-client-debug.cc @@ -0,0 +1,190 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/Formatter.h" +#include "common/debug.h" +#include "common/errno.h" +#include "client/Inode.h" +#include "client/Dentry.h" +#include "client/Dir.h" +#include "include/cephfs/libcephfs.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_client + +void usage() +{ + std::cout << "Usage: ceph-client-debug [options] <inode number>" << std::endl; + generic_client_usage(); +} + + +/** + * Given an inode, look up the path from the Client cache: assumes + * client cache is fully populated. + */ +void traverse_dentries(Inode *ino, std::vector<Dentry*> &parts) +{ + if (ino->dentries.empty()) { + return; + } + + Dentry* dn = *(ino->dentries.begin()); + parts.push_back(dn); + traverse_dentries(dn->dir->parent_inode, parts); +} + + +/** + * Given an inode, send lookup requests to the MDS for + * all its ancestors, such that the full trace will be + * populated in client cache. + */ +int lookup_trace(ceph_mount_info *client, inodeno_t const ino) +{ + Inode *inode; + int r = ceph_ll_lookup_inode(client, ino, &inode); + if (r != 0) { + return r; + } else { + if (!inode->dentries.empty()) { + Dentry *dn = *(inode->dentries.begin()); + ceph_assert(dn->dir); + ceph_assert(dn->dir->parent_inode); + r = lookup_trace(client, dn->dir->parent_inode->ino); + if (r) { + return r; + } + } else { + // We reached the root of the tree + ceph_assert(inode->ino == CEPH_INO_ROOT); + } + } + + return r; +} + + +int main(int argc, const char **argv) +{ + // Argument handling + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS| + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + + 
common_init_finish(g_ceph_context); + + // Expect exactly one positional argument (inode number) + if (args.size() != 1) { + cerr << "missing position argument (inode number)" << std::endl; + exit(1); + } + char const *inode_str = args[0]; + inodeno_t inode = strtoll(inode_str, NULL, 0); + if (inode <= 0) { + derr << "Invalid inode: " << inode_str << dendl; + return -1; + } + + // Initialize filesystem client + struct ceph_mount_info *client; + int r = ceph_create_with_context(&client, g_ceph_context); + if (r) { + derr << "Error initializing libcephfs: " << cpp_strerror(r) << dendl; + return r; + } + + r = ceph_mount(client, "/"); + if (r) { + derr << "Error mounting: " << cpp_strerror(r) << dendl; + ceph_shutdown(client); + return r; + } + + + // Populate client cache with inode of interest & ancestors + r = lookup_trace(client, inode); + if (r) { + derr << "Error looking up inode " << std::hex << inode << std::dec << + ": " << cpp_strerror(r) << dendl; + return -1; + } + + // Retrieve inode of interest + struct vinodeno_t vinode; + vinode.ino = inode; + vinode.snapid = CEPH_NOSNAP; + Inode *ino = ceph_ll_get_inode(client, vinode); + + // Retrieve dentry trace + std::vector<Dentry*> path; + traverse_dentries(ino, path); + + // Print inode and path as a JSON object + JSONFormatter jf(true); + jf.open_object_section("client_debug"); + { + jf.open_object_section("inode"); + { + ino->dump(&jf); + } + jf.close_section(); // inode + jf.open_array_section("path"); + { + for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) { + jf.open_object_section("dentry"); + { + (*p)->dump(&jf); + } + jf.close_section(); // dentry + } + } + jf.close_section(); // path + } + jf.close_section(); // client_debug + jf.flush(std::cout); + std::cout << std::endl; + + // Release Inode references + ceph_ll_forget(client, ino, 1); + for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) { + ceph_ll_forget(client, 
(*p)->inode.get(), 1); + } + ino = NULL; + path.clear(); + + // Shut down + r = ceph_unmount(client); + if (r) { + derr << "Error mounting: " << cpp_strerror(r) << dendl; + } + ceph_shutdown(client); + + return r; +} diff --git a/src/tools/ceph-dencoder/CMakeLists.txt b/src/tools/ceph-dencoder/CMakeLists.txt new file mode 100644 index 00000000..15604d09 --- /dev/null +++ b/src/tools/ceph-dencoder/CMakeLists.txt @@ -0,0 +1,68 @@ +## dencoder +set_source_files_properties( + ceph_dencoder.cc + APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h) + +if(HAS_VTA) + set_source_files_properties(ceph_dencoder.cc + PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) +endif() + +set(dencoder_srcs + ceph_dencoder.cc + $<TARGET_OBJECTS:common_texttable_obj>) +if(WITH_RADOSGW) + list(APPEND dencoder_srcs + ${CMAKE_SOURCE_DIR}/src/rgw/rgw_dencoder.cc) +endif() + +add_executable(ceph-dencoder ${dencoder_srcs}) + +if(WITH_RADOSGW) + list(APPEND DENCODER_EXTRALIBS + rgw_a + cls_rgw_client) + if(WITH_RADOSGW_AMQP_ENDPOINT) + list(APPEND DENCODER_EXTRALIBS + rabbitmq) + endif() + if(WITH_RADOSGW_KAFKA_ENDPOINT) + list(APPEND DENCODER_EXTRALIBS + rdkafka) + endif() +endif() + +if(WITH_RBD) + list(APPEND DENCODER_EXTRALIBS + cls_rbd_client + rbd_mirror_types + rbd_types + rbd_replay_types) + if(WITH_KRBD) + list(APPEND DENCODER_EXTRALIBS + krbd) + endif() +endif() + +if(WITH_CEPHFS) + list(APPEND DENCODER_EXTRALIBS + mds) +endif() + +target_link_libraries(ceph-dencoder + global + os + osd + mon + journal + ${DENCODER_EXTRALIBS} + cls_lock_client + cls_refcount_client + cls_log_client + cls_version_client + cls_user_client + cls_journal_client + cls_timeindex_client + ${EXTRALIBS} + ${CMAKE_DL_LIBS}) +install(TARGETS ceph-dencoder DESTINATION bin) diff --git a/src/tools/ceph-dencoder/ceph_dencoder.cc b/src/tools/ceph-dencoder/ceph_dencoder.cc new file mode 100644 index 00000000..1f201ea7 --- /dev/null +++ b/src/tools/ceph-dencoder/ceph_dencoder.cc @@ -0,0 +1,480 
@@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include <errno.h> +#include "include/types.h" +#include "ceph_ver.h" +#include "include/encoding.h" +#include "include/ceph_features.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" +#include "msg/Message.h" +#include "include/ceph_assert.h" + +#define TYPE(t) +#define TYPE_STRAYDATA(t) +#define TYPE_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL(t) +#define TYPE_FEATUREFUL_STRAYDATA(t) +#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL_NOCOPY(t) +#define TYPE_NOCOPY(t) +#define MESSAGE(t) +#include "tools/ceph-dencoder/types.h" +#undef TYPE +#undef TYPE_STRAYDATA +#undef TYPE_NONDETERMINISTIC +#undef TYPE_NOCOPY +#undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA +#undef TYPE_FEATUREFUL_NONDETERMINISTIC +#undef TYPE_FEATUREFUL_NOCOPY +#undef MESSAGE + +#define MB(m) ((m) * 1024 * 1024) + +void usage(ostream &out) +{ + out << "usage: ceph-dencoder [commands ...]" << std::endl; + out << "\n"; + out << " version print version string (to stdout)\n"; + out << "\n"; + out << " import <encfile> read encoded data from encfile\n"; + out << " export <outfile> write encoded data to outfile\n"; + out << "\n"; + out << " set_features <num> set feature bits used for encoding\n"; + out << " get_features print feature bits (int) to stdout\n"; + out << "\n"; + out << " list_types list supported types\n"; + out << " type <classname> select in-memory type\n"; + out << " skip <num> skip <num> leading bytes before decoding\n"; + out << " decode decode into in-memory object\n"; + out << " encode encode 
in-memory object\n"; + out << " dump_json dump in-memory object as json (to stdout)\n"; + out << " hexdump print encoded data in hex\n"; + out << "\n"; + out << " copy copy object (via operator=)\n"; + out << " copy_ctor copy object (via copy ctor)\n"; + out << "\n"; + out << " count_tests print number of generated test objects (to stdout)\n"; + out << " select_test <n> select generated test object as in-memory object\n"; + out << " is_deterministic exit w/ success if type encodes deterministically\n"; +} +struct Dencoder { + virtual ~Dencoder() {} + virtual string decode(bufferlist bl, uint64_t seek) = 0; + virtual void encode(bufferlist& out, uint64_t features) = 0; + virtual void dump(ceph::Formatter *f) = 0; + virtual void copy() { + cerr << "copy operator= not supported" << std::endl; + } + virtual void copy_ctor() { + cerr << "copy ctor not supported" << std::endl; + } + virtual void generate() = 0; + virtual int num_generated() = 0; + virtual string select_generated(unsigned n) = 0; + virtual bool is_deterministic() = 0; + //virtual void print(ostream& out) = 0; +}; + +template<class T> +class DencoderBase : public Dencoder { +protected: + T* m_object; + list<T*> m_list; + bool stray_okay; + bool nondeterministic; + +public: + DencoderBase(bool stray_okay, bool nondeterministic) + : m_object(new T), + stray_okay(stray_okay), + nondeterministic(nondeterministic) {} + ~DencoderBase() override { + delete m_object; + } + + string decode(bufferlist bl, uint64_t seek) override { + auto p = bl.cbegin(); + p.seek(seek); + try { + using ceph::decode; + decode(*m_object, p); + } + catch (buffer::error& e) { + return e.what(); + } + if (!stray_okay && !p.end()) { + ostringstream ss; + ss << "stray data at end of buffer, offset " << p.get_off(); + return ss.str(); + } + return string(); + } + + void encode(bufferlist& out, uint64_t features) override = 0; + + void dump(ceph::Formatter *f) override { + m_object->dump(f); + } + void generate() override { + 
T::generate_test_instances(m_list); + } + int num_generated() override { + return m_list.size(); + } + string select_generated(unsigned i) override { + // allow 0- or 1-based (by wrapping) + if (i == 0) + i = m_list.size(); + if ((i == 0) || (i > m_list.size())) + return "invalid id for generated object"; + m_object = *(std::next(m_list.begin(), i-1)); + return string(); + } + + bool is_deterministic() override { + return !nondeterministic; + } +}; + +template<class T> +class DencoderImplNoFeatureNoCopy : public DencoderBase<T> { +public: + DencoderImplNoFeatureNoCopy(bool stray_ok, bool nondeterministic) + : DencoderBase<T>(stray_ok, nondeterministic) {} + void encode(bufferlist& out, uint64_t features) override { + out.clear(); + using ceph::encode; + encode(*this->m_object, out); + } +}; + +template<class T> +class DencoderImplNoFeature : public DencoderImplNoFeatureNoCopy<T> { +public: + DencoderImplNoFeature(bool stray_ok, bool nondeterministic) + : DencoderImplNoFeatureNoCopy<T>(stray_ok, nondeterministic) {} + void copy() override { + T *n = new T; + *n = *this->m_object; + delete this->m_object; + this->m_object = n; + } + void copy_ctor() override { + T *n = new T(*this->m_object); + delete this->m_object; + this->m_object = n; + } +}; + +template<class T> +class DencoderImplFeaturefulNoCopy : public DencoderBase<T> { +public: + DencoderImplFeaturefulNoCopy(bool stray_ok, bool nondeterministic) + : DencoderBase<T>(stray_ok, nondeterministic) {} + void encode(bufferlist& out, uint64_t features) override { + out.clear(); + using ceph::encode; + encode(*(this->m_object), out, features); + } +}; + +template<class T> +class DencoderImplFeatureful : public DencoderImplFeaturefulNoCopy<T> { +public: + DencoderImplFeatureful(bool stray_ok, bool nondeterministic) + : DencoderImplFeaturefulNoCopy<T>(stray_ok, nondeterministic) {} + void copy() override { + T *n = new T; + *n = *this->m_object; + delete this->m_object; + this->m_object = n; + } + void copy_ctor() 
override { + T *n = new T(*this->m_object); + delete this->m_object; + this->m_object = n; + } +}; + +template<class T> +class MessageDencoderImpl : public Dencoder { + typename T::ref m_object; + list<typename T::ref> m_list; + +public: + MessageDencoderImpl() : m_object(T::create()) {} + ~MessageDencoderImpl() override {} + + string decode(bufferlist bl, uint64_t seek) override { + auto p = bl.cbegin(); + p.seek(seek); + try { + Message::ref n(decode_message(g_ceph_context, 0, p), false); + if (!n) + throw std::runtime_error("failed to decode"); + if (n->get_type() != m_object->get_type()) { + stringstream ss; + ss << "decoded type " << n->get_type() << " instead of expected " << m_object->get_type(); + throw std::runtime_error(ss.str()); + } + m_object = boost::static_pointer_cast<typename T::ref::element_type, std::remove_reference<decltype(n)>::type::element_type>(n); + } + catch (buffer::error& e) { + return e.what(); + } + if (!p.end()) { + ostringstream ss; + ss << "stray data at end of buffer, offset " << p.get_off(); + return ss.str(); + } + return string(); + } + + void encode(bufferlist& out, uint64_t features) override { + out.clear(); + encode_message(m_object.get(), features, out); + } + + void dump(ceph::Formatter *f) override { + m_object->dump(f); + } + void generate() override { + //T::generate_test_instances(m_list); + } + int num_generated() override { + return m_list.size(); + } + string select_generated(unsigned i) override { + // allow 0- or 1-based (by wrapping) + if (i == 0) + i = m_list.size(); + if ((i == 0) || (i > m_list.size())) + return "invalid id for generated object"; + m_object = *(std::next(m_list.begin(), i-1)); + return string(); + } + bool is_deterministic() override { + return true; + } + + //void print(ostream& out) { + //out << m_object << std::endl; + //} +}; + + + +int main(int argc, const char **argv) +{ + // dencoders + map<string,Dencoder*> dencoders; + +#define T_STR(x) #x +#define T_STRINGIFY(x) T_STR(x) +#define 
TYPE(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false, false); +#define TYPE_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(true, false); +#define TYPE_NONDETERMINISTIC(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false, true); +#define TYPE_FEATUREFUL(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false, false); +#define TYPE_FEATUREFUL_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(true, false); +#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false, true); +#define TYPE_FEATUREFUL_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeaturefulNoCopy<t>(false, false); +#define TYPE_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeatureNoCopy<t>(false, false); +#define MESSAGE(t) dencoders[T_STRINGIFY(t)] = new MessageDencoderImpl<t>; +#include "tools/ceph-dencoder/types.h" +#undef TYPE +#undef TYPE_STRAYDATA +#undef TYPE_NONDETERMINISTIC +#undef TYPE_NOCOPY +#undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA +#undef TYPE_FEATUREFUL_NONDETERMINISTIC +#undef TYPE_FEATUREFUL_NOCOPY +#undef T_STR +#undef T_STRINGIFY + + vector<const char*> args; + argv_to_vec(argc, argv, args); + env_to_vec(args); + + Dencoder *den = NULL; + uint64_t features = CEPH_FEATURES_SUPPORTED_DEFAULT; + bufferlist encbl; + uint64_t skip = 0; + + if (args.empty()) { + cerr << "-h for help" << std::endl; + exit(1); + } + for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ++i) { + string err; + + if (*i == string("help") || *i == string("-h") || *i == string("--help")) { + usage(cout); + exit(0); + } else if (*i == string("version")) { + cout << CEPH_GIT_NICE_VER << std::endl; + } else if (*i == string("list_types")) { + for (map<string,Dencoder*>::iterator p = dencoders.begin(); + p != dencoders.end(); + ++p) + cout << p->first << std::endl; + exit(0); + } else if (*i == string("type")) { + ++i; + if (i == 
args.end()) { + cerr << "expecting type" << std::endl; + exit(1); + } + string cname = *i; + if (!dencoders.count(cname)) { + cerr << "class '" << cname << "' unknown" << std::endl; + exit(1); + } + den = dencoders[cname]; + den->generate(); + } else if (*i == string("skip")) { + ++i; + if (i == args.end()) { + cerr << "expecting byte count" << std::endl; + exit(1); + } + skip = atoi(*i); + } else if (*i == string("get_features")) { + cout << CEPH_FEATURES_SUPPORTED_DEFAULT << std::endl; + exit(0); + } else if (*i == string("set_features")) { + ++i; + if (i == args.end()) { + cerr << "expecting features" << std::endl; + exit(1); + } + features = atoll(*i); + } else if (*i == string("encode")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + den->encode(encbl, features | CEPH_FEATURE_RESERVED); // hack for OSDMap + } else if (*i == string("decode")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + err = den->decode(encbl, skip); + } else if (*i == string("copy_ctor")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + den->copy_ctor(); + } else if (*i == string("copy")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + den->copy(); + } else if (*i == string("dump_json")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + JSONFormatter jf(true); + jf.open_object_section("object"); + den->dump(&jf); + jf.close_section(); + jf.flush(cout); + cout << std::endl; + + } else if (*i == string("hexdump")) { + encbl.hexdump(cout); + } else if (*i == string("import")) { + ++i; + if (i == args.end()) { + cerr << "expecting filename" << std::endl; + exit(1); + } + int r; + if (*i == string("-")) { + *i = "stdin"; + // Read up to 1mb if stdin specified + r = encbl.read_fd(STDIN_FILENO, MB(1)); + } else { + r = encbl.read_file(*i, 
&err); + } + if (r < 0) { + cerr << "error reading " << *i << ": " << err << std::endl; + exit(1); + } + + } else if (*i == string("export")) { + ++i; + if (i == args.end()) { + cerr << "expecting filename" << std::endl; + exit(1); + } + int fd = ::open(*i, O_WRONLY|O_CREAT|O_TRUNC, 0644); + if (fd < 0) { + cerr << "error opening " << *i << " for write: " << cpp_strerror(errno) << std::endl; + exit(1); + } + int r = encbl.write_fd(fd); + if (r < 0) { + cerr << "error writing " << *i << ": " << cpp_strerror(errno) << std::endl; + exit(1); + } + ::close(fd); + + } else if (*i == string("count_tests")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + cout << den->num_generated() << std::endl; + } else if (*i == string("select_test")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + ++i; + if (i == args.end()) { + cerr << "expecting instance number" << std::endl; + exit(1); + } + int n = atoi(*i); + err = den->select_generated(n); + } else if (*i == string("is_deterministic")) { + if (!den) { + cerr << "must first select type with 'type <name>'" << std::endl; + exit(1); + } + if (den->is_deterministic()) + exit(0); + else + exit(1); + } else { + cerr << "unknown option '" << *i << "'" << std::endl; + exit(1); + } + if (err.length()) { + cerr << "error: " << err << std::endl; + exit(1); + } + } + return 0; +} diff --git a/src/tools/ceph-dencoder/ceph_time.h b/src/tools/ceph-dencoder/ceph_time.h new file mode 100644 index 00000000..c27cb574 --- /dev/null +++ b/src/tools/ceph-dencoder/ceph_time.h @@ -0,0 +1,68 @@ +#ifndef TEST_CEPH_TIME_H +#define TEST_CEPH_TIME_H + +#include <list> + +#include "include/encoding.h" +#include "common/ceph_time.h" +#include "common/Formatter.h" + +// wrapper for ceph::real_time that implements the dencoder interface +template <typename Clock> +class time_point_wrapper { + using time_point = typename Clock::time_point; + time_point t; + 
public: + time_point_wrapper() = default; + explicit time_point_wrapper(const time_point& t) : t(t) {} + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(t, bl); + } + void decode(bufferlist::const_iterator &p) { + using ceph::decode; + decode(t, p); + } + void dump(Formatter* f) { + auto epoch_time = Clock::to_time_t(t); + f->dump_string("time", std::ctime(&epoch_time)); + } + static void generate_test_instances(std::list<time_point_wrapper*>& ls) { + constexpr time_t t{455500800}; // Ghostbusters release date + ls.push_back(new time_point_wrapper(Clock::from_time_t(t))); + } +}; + +using real_time_wrapper = time_point_wrapper<ceph::real_clock>; +WRITE_CLASS_ENCODER(real_time_wrapper) + +using coarse_real_time_wrapper = time_point_wrapper<ceph::coarse_real_clock>; +WRITE_CLASS_ENCODER(coarse_real_time_wrapper) + +// wrapper for ceph::timespan that implements the dencoder interface +class timespan_wrapper { + ceph::timespan d; + public: + timespan_wrapper() = default; + explicit timespan_wrapper(const ceph::timespan& d) : d(d) {} + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(d, bl); + } + void decode(bufferlist::const_iterator &p) { + using ceph::decode; + decode(d, p); + } + void dump(Formatter* f) { + f->dump_int("timespan", d.count()); + } + static void generate_test_instances(std::list<timespan_wrapper*>& ls) { + constexpr std::chrono::seconds d{7377}; // marathon world record (2:02:57) + ls.push_back(new timespan_wrapper(d)); + } +}; +WRITE_CLASS_ENCODER(timespan_wrapper) + +#endif diff --git a/src/tools/ceph-dencoder/sstring.h b/src/tools/ceph-dencoder/sstring.h new file mode 100644 index 00000000..c2493c10 --- /dev/null +++ b/src/tools/ceph-dencoder/sstring.h @@ -0,0 +1,40 @@ +#ifndef TEST_SSTRING_H +#define TEST_SSTRING_H + +#include "common/sstring.hh" + +// wrapper for sstring that implements the dencoder interface +class sstring_wrapper { + using sstring16 = basic_sstring<char, uint32_t, 16>; + sstring16 s1; + 
using sstring24 = basic_sstring<unsigned char, uint16_t, 24>; + sstring24 s2; + public: + sstring_wrapper() = default; + sstring_wrapper(sstring16&& s1, sstring24&& s2) + : s1(std::move(s1)), s2(std::move(s2)) + {} + + DENC(sstring_wrapper, w, p) { + DENC_START(1, 1, p); + denc(w.s1, p); + denc(w.s2, p); + DENC_FINISH(p); + } + void dump(Formatter* f) { + f->dump_string("s1", s1.c_str()); + f->dump_string("s2", reinterpret_cast<const char*>(s2.c_str())); + } + static void generate_test_instances(std::list<sstring_wrapper*>& ls) { + ls.push_back(new sstring_wrapper()); + // initialize sstrings that fit in internal storage + constexpr auto cstr6 = "abcdef"; + ls.push_back(new sstring_wrapper(sstring16{cstr6}, sstring24{cstr6})); + // initialize sstrings that overflow into external storage + constexpr auto cstr26 = "abcdefghijklmnopqrstuvwxyz"; + ls.push_back(new sstring_wrapper(sstring16{cstr26}, sstring24{cstr26})); + } +}; +WRITE_CLASS_DENC(sstring_wrapper) + +#endif diff --git a/src/tools/ceph-dencoder/types.h b/src/tools/ceph-dencoder/types.h new file mode 100644 index 00000000..6cfd6f16 --- /dev/null +++ b/src/tools/ceph-dencoder/types.h @@ -0,0 +1,880 @@ +#include "acconfig.h" + +#include "ceph_time.h" +TYPE(real_time_wrapper) +TYPE(coarse_real_time_wrapper) +TYPE(timespan_wrapper) + +#include "sstring.h" +TYPE(sstring_wrapper) + +#include "include/CompatSet.h" +TYPE(CompatSet) + +#include "include/filepath.h" +TYPE(filepath) + +#include "include/fs_types.h" +TYPE_FEATUREFUL(file_layout_t) + +#include "include/util.h" +TYPE(ceph_data_stats) + +#include "common/bit_vector.hpp" +TYPE(BitVector<2>) + +#include "common/bloom_filter.hpp" +TYPE(bloom_filter) +TYPE(compressible_bloom_filter) + +#include "common/DecayCounter.h" +TYPE(DecayCounter) + +#include "common/histogram.h" +TYPE(pow2_hist_t) + +#include "common/hobject.h" +TYPE(hobject_t) +TYPE(ghobject_t) + +#include "common/LogEntry.h" +TYPE_FEATUREFUL(LogEntry) +TYPE_FEATUREFUL(LogSummary) + +#include 
"common/SloppyCRCMap.h" +TYPE(SloppyCRCMap) + +#include "common/snap_types.h" +TYPE(SnapContext) +TYPE(SnapRealmInfo) + +#include "msg/msg_types.h" +TYPE(entity_name_t) +TYPE_FEATUREFUL(entity_addr_t) +TYPE_FEATUREFUL(entity_addrvec_t) +TYPE_FEATUREFUL(entity_inst_t) + +#include "crush/CrushWrapper.h" +TYPE_FEATUREFUL_NOCOPY(CrushWrapper) + +#include "osd/OSDMap.h" +TYPE(osd_info_t) +TYPE(osd_xinfo_t) +TYPE_FEATUREFUL_NOCOPY(OSDMap) +TYPE_FEATUREFUL_STRAYDATA(OSDMap::Incremental) + +#include "osd/osd_types.h" +TYPE(osd_reqid_t) +TYPE(object_locator_t) +TYPE(request_redirect_t) +TYPE(pg_t) +TYPE(coll_t) +TYPE_FEATUREFUL(objectstore_perf_stat_t) +TYPE_FEATUREFUL(osd_stat_t) +TYPE(OSDSuperblock) +TYPE_FEATUREFUL(pool_snap_info_t) +TYPE_FEATUREFUL(pg_pool_t) +TYPE(object_stat_sum_t) +TYPE(object_stat_collection_t) +TYPE(pg_stat_t) +TYPE_FEATUREFUL(pool_stat_t) +TYPE(pg_hit_set_info_t) +TYPE(pg_hit_set_history_t) +TYPE(pg_history_t) +TYPE(pg_info_t) +TYPE(PastIntervals) +TYPE_FEATUREFUL(pg_query_t) +TYPE(ObjectModDesc) +TYPE(pg_log_entry_t) +TYPE(pg_log_dup_t) +TYPE(pg_log_t) +TYPE_FEATUREFUL(pg_missing_item) +TYPE(pg_missing_t) +TYPE(pg_nls_response_t) +TYPE(pg_ls_response_t) +TYPE(object_copy_cursor_t) +TYPE_FEATUREFUL(object_copy_data_t) +TYPE(pg_create_t) +TYPE(OSDSuperblock) +TYPE(SnapSet) +TYPE_FEATUREFUL(watch_info_t) +TYPE(object_manifest_t) +TYPE_FEATUREFUL(object_info_t) +TYPE(SnapSet) +TYPE_FEATUREFUL(ObjectRecoveryInfo) +TYPE(ObjectRecoveryProgress) +TYPE(PushReplyOp) +TYPE_FEATUREFUL(PullOp) +TYPE_FEATUREFUL(PushOp) +TYPE(ScrubMap::object) +TYPE(ScrubMap) +TYPE_FEATUREFUL(obj_list_watch_response_t) +TYPE(clone_info) +TYPE(obj_list_snap_response_t) +TYPE(pool_pg_num_history_t) + +#include "osd/ECUtil.h" +// TYPE(stripe_info_t) non-standard encoding/decoding functions +TYPE(ECUtil::HashInfo) + +#include "osd/ECMsgTypes.h" +TYPE_NOCOPY(ECSubWrite) +TYPE(ECSubWriteReply) +TYPE_FEATUREFUL(ECSubRead) +TYPE(ECSubReadReply) + +#include "osd/HitSet.h" 
+TYPE_NONDETERMINISTIC(ExplicitHashHitSet) +TYPE_NONDETERMINISTIC(ExplicitObjectHitSet) +TYPE(BloomHitSet) +TYPE_NONDETERMINISTIC(HitSet) // because some subclasses are +TYPE(HitSet::Params) + +#include "os/ObjectStore.h" +TYPE(ObjectStore::Transaction) + +#include "os/filestore/SequencerPosition.h" +TYPE(SequencerPosition) + +#ifdef WITH_BLUESTORE +#include "os/bluestore/bluestore_types.h" +TYPE(bluestore_bdev_label_t) +TYPE(bluestore_cnode_t) +TYPE(bluestore_compression_header_t) +TYPE(bluestore_extent_ref_map_t) +TYPE(bluestore_pextent_t) +TYPE(bluestore_blob_use_tracker_t) +// TODO: bluestore_blob_t repurposes the "feature" param of encode() for its +// struct_v. at a higher level, BlueStore::ExtentMap encodes the extents using +// a different interface than the normal ones. see +// BlueStore::ExtentMap::encode_some(). maybe we can test it using another +// approach. +// TYPE_FEATUREFUL(bluestore_blob_t) +// TYPE(bluestore_shared_blob_t) there is no encode here +TYPE(bluestore_onode_t) +TYPE(bluestore_deferred_op_t) +TYPE(bluestore_deferred_transaction_t) +// TYPE(bluestore_compression_header_t) there is no encode here + +#include "os/bluestore/bluefs_types.h" +TYPE(bluefs_extent_t) +TYPE(bluefs_fnode_t) +TYPE(bluefs_super_t) +TYPE(bluefs_transaction_t) +#endif + +#include "mon/AuthMonitor.h" +TYPE_FEATUREFUL(AuthMonitor::Incremental) + +#include "mon/PGMap.h" +TYPE_FEATUREFUL_NONDETERMINISTIC(PGMapDigest) +TYPE_FEATUREFUL_NONDETERMINISTIC(PGMap) + +#include "mon/MonitorDBStore.h" +TYPE(MonitorDBStore::Transaction) +TYPE(MonitorDBStore::Op) + +#include "mon/MonMap.h" +TYPE_FEATUREFUL(MonMap) + +#include "mon/MonCap.h" +TYPE(MonCap) + +#include "mon/MgrMap.h" +TYPE_FEATUREFUL(MgrMap) + +#include "mon/mon_types.h" +TYPE(LevelDBStoreStats) +TYPE(ScrubResult) + +#include "mon/CreatingPGs.h" +TYPE(creating_pgs_t) + +#include "mgr/ServiceMap.h" +TYPE_FEATUREFUL(ServiceMap) +TYPE_FEATUREFUL(ServiceMap::Service) +TYPE_FEATUREFUL(ServiceMap::Daemon) + +#include 
"os/filestore/DBObjectMap.h" +TYPE(DBObjectMap::_Header) +TYPE(DBObjectMap::State) + +#include "os/filestore/FileStore.h" +TYPE(FSSuperblock) + +#include "os/kstore/kstore_types.h" +TYPE(kstore_cnode_t) +TYPE(kstore_onode_t) + +#ifdef WITH_CEPHFS +#include "mds/JournalPointer.h" +TYPE(JournalPointer) + +#include "osdc/Journaler.h" +TYPE(Journaler::Header) + +#include "mds/snap.h" +TYPE(SnapInfo) +TYPE(snaplink_t) +TYPE(sr_t) + +#include "mds/mdstypes.h" +TYPE(frag_info_t) +TYPE(nest_info_t) +TYPE(quota_info_t) +TYPE(client_writeable_range_t) +TYPE_FEATUREFUL(inode_t<std::allocator>) +TYPE_FEATUREFUL(old_inode_t<std::allocator>) +TYPE(fnode_t) +TYPE(old_rstat_t) +TYPE_FEATUREFUL(session_info_t) +TYPE(string_snap_t) +TYPE(MDSCacheObjectInfo) +TYPE(mds_table_pending_t) +TYPE(cap_reconnect_t) +TYPE(inode_load_vec_t) +TYPE(dirfrag_load_vec_t) +TYPE(mds_load_t) +TYPE(MDSCacheObjectInfo) +TYPE(inode_backtrace_t) +TYPE(inode_backpointer_t) + +#include "mds/CInode.h" +TYPE_FEATUREFUL(InodeStore) +TYPE_FEATUREFUL(InodeStoreBare) + +#include "mds/MDSMap.h" +TYPE_FEATUREFUL(MDSMap) +TYPE_FEATUREFUL(MDSMap::mds_info_t) + +#include "mds/FSMap.h" +//TYPE_FEATUREFUL(Filesystem) +TYPE_FEATUREFUL(FSMap) + +#include "mds/Capability.h" +TYPE_NOCOPY(Capability) + +#include "mds/inode_backtrace.h" +TYPE(inode_backpointer_t) +TYPE(inode_backtrace_t) + +#include "mds/InoTable.h" +TYPE(InoTable) + +#include "mds/SnapServer.h" +TYPE_STRAYDATA(SnapServer) + +#include "mds/events/ECommitted.h" +TYPE_FEATUREFUL_NOCOPY(ECommitted) + +#include "mds/events/EExport.h" +TYPE_FEATUREFUL_NOCOPY(EExport) + +#include "mds/events/EFragment.h" +TYPE_FEATUREFUL_NOCOPY(EFragment) + +#include "mds/events/EImportFinish.h" +TYPE_FEATUREFUL_NOCOPY(EImportFinish) + +#include "mds/events/EImportStart.h" +TYPE_FEATUREFUL_NOCOPY(EImportStart) + +#include "mds/events/EMetaBlob.h" +TYPE_FEATUREFUL_NOCOPY(EMetaBlob::fullbit) +TYPE(EMetaBlob::remotebit) +TYPE(EMetaBlob::nullbit) 
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob::dirlump) +TYPE_FEATUREFUL_NOCOPY(EMetaBlob) + +#include "mds/events/EOpen.h" +TYPE_FEATUREFUL_NOCOPY(EOpen) + +#include "mds/events/EResetJournal.h" +TYPE_FEATUREFUL_NOCOPY(EResetJournal) + +#include "mds/events/ESession.h" +TYPE_FEATUREFUL_NOCOPY(ESession) + +#include "mds/events/ESessions.h" +TYPE_FEATUREFUL_NOCOPY(ESessions) + +#include "mds/events/ESlaveUpdate.h" +TYPE(link_rollback) +TYPE(rmdir_rollback) +TYPE(rename_rollback::drec) +TYPE(rename_rollback) +TYPE_FEATUREFUL_NOCOPY(ESlaveUpdate) + +#include "mds/events/ESubtreeMap.h" +TYPE_FEATUREFUL_NOCOPY(ESubtreeMap) + +#include "mds/events/ETableClient.h" +TYPE_FEATUREFUL_NOCOPY(ETableClient) + +#include "mds/events/ETableServer.h" +TYPE_FEATUREFUL_NOCOPY(ETableServer) + +#include "mds/events/EUpdate.h" +TYPE_FEATUREFUL_NOCOPY(EUpdate) +#endif // WITH_CEPHFS + +#ifdef WITH_RBD +#include "librbd/journal/Types.h" +TYPE(librbd::journal::EventEntry) +TYPE(librbd::journal::ClientData) +TYPE(librbd::journal::TagData) +#include "librbd/mirroring_watcher/Types.h" +TYPE(librbd::mirroring_watcher::NotifyMessage) +#include "librbd/trash_watcher/Types.h" +TYPE(librbd::trash_watcher::NotifyMessage) +#include "librbd/WatchNotifyTypes.h" +TYPE(librbd::watch_notify::NotifyMessage) +TYPE(librbd::watch_notify::ResponseMessage) + +#include "rbd_replay/ActionTypes.h" +TYPE(rbd_replay::action::Dependency) +TYPE(rbd_replay::action::ActionEntry) + +#include "tools/rbd_mirror/image_map/Types.h" +TYPE(rbd::mirror::image_map::PolicyData) +#endif + +#ifdef WITH_RADOSGW + +#include "rgw/rgw_rados.h" +TYPE(RGWOLHInfo) +TYPE(RGWObjManifestPart) +TYPE(RGWObjManifest) + +#include "rgw/rgw_zone.h" +TYPE(RGWZoneParams) +TYPE(RGWZone) +TYPE(RGWZoneGroup) +TYPE(RGWRealm) +TYPE(RGWPeriod) + +#include "rgw/rgw_acl.h" +TYPE(ACLPermission) +TYPE(ACLGranteeType) +TYPE(ACLGrant) +TYPE(RGWAccessControlList) +TYPE(ACLOwner) +TYPE(RGWAccessControlPolicy) + +#include "rgw/rgw_cache.h" +TYPE(ObjectMetaInfo) 
+TYPE(ObjectCacheInfo) +TYPE(RGWCacheNotifyInfo) + +#include "rgw/rgw_lc.h" +TYPE(RGWLifecycleConfiguration) + +#include "cls/rgw/cls_rgw_types.h" +TYPE(rgw_bucket_pending_info) +TYPE(rgw_bucket_dir_entry_meta) +TYPE(rgw_bucket_entry_ver) +TYPE(rgw_bucket_dir_entry) +TYPE(rgw_bucket_category_stats) +TYPE(rgw_bucket_dir_header) +TYPE(rgw_bucket_dir) +TYPE(rgw_bucket_entry_ver) +TYPE(cls_rgw_obj_key) +TYPE(rgw_bucket_olh_log_entry) +TYPE(rgw_usage_log_entry) + +#include "cls/rgw/cls_rgw_ops.h" +TYPE(rgw_cls_obj_prepare_op) +TYPE(rgw_cls_obj_complete_op) +TYPE(rgw_cls_list_op) +TYPE(rgw_cls_list_ret) +TYPE(cls_rgw_gc_defer_entry_op) +TYPE(cls_rgw_gc_list_op) +TYPE(cls_rgw_gc_list_ret) +TYPE(cls_rgw_gc_obj_info) +TYPE(cls_rgw_gc_remove_op) +TYPE(cls_rgw_gc_set_entry_op) +TYPE(cls_rgw_obj) +TYPE(cls_rgw_obj_chain) +TYPE(rgw_cls_tag_timeout_op) +TYPE(cls_rgw_bi_log_list_op) +TYPE(cls_rgw_bi_log_trim_op) +TYPE(cls_rgw_bi_log_list_ret) +TYPE(rgw_cls_link_olh_op) +TYPE(rgw_cls_unlink_instance_op) +TYPE(rgw_cls_read_olh_log_op) +TYPE(rgw_cls_read_olh_log_ret) +TYPE(rgw_cls_trim_olh_log_op) +TYPE(rgw_cls_bucket_clear_olh_op) +TYPE(rgw_cls_check_index_ret) +TYPE(cls_rgw_reshard_add_op) +TYPE(cls_rgw_reshard_list_op) +TYPE(cls_rgw_reshard_list_ret) +TYPE(cls_rgw_reshard_get_op) +TYPE(cls_rgw_reshard_get_ret) +TYPE(cls_rgw_reshard_remove_op) +TYPE(cls_rgw_set_bucket_resharding_op) +TYPE(cls_rgw_clear_bucket_resharding_op) +TYPE(cls_rgw_lc_obj_head) + +#include "cls/rgw/cls_rgw_client.h" +TYPE(rgw_bi_log_entry) +TYPE(cls_rgw_reshard_entry) +TYPE(cls_rgw_bucket_instance_entry) + +#include "cls/user/cls_user_types.h" +TYPE(cls_user_bucket) +TYPE(cls_user_bucket_entry) +TYPE(cls_user_stats) +TYPE(cls_user_header) + +#include "cls/user/cls_user_ops.h" +TYPE(cls_user_set_buckets_op) +TYPE(cls_user_remove_bucket_op) +TYPE(cls_user_list_buckets_op) +TYPE(cls_user_list_buckets_ret) +TYPE(cls_user_get_header_op) +TYPE(cls_user_get_header_ret) +TYPE(cls_user_complete_stats_sync_op) + 
+#include "cls/journal/cls_journal_types.h" +TYPE(cls::journal::ObjectPosition) +TYPE(cls::journal::ObjectSetPosition) +TYPE(cls::journal::Client) +TYPE(cls::journal::Tag) + +#include "rgw/rgw_common.h" +TYPE(RGWAccessKey) +TYPE(RGWSubUser) +TYPE(RGWUserInfo) +TYPE(rgw_bucket) +TYPE(RGWBucketInfo) +TYPE(RGWBucketEnt) +TYPE(RGWUploadPartInfo) +TYPE(rgw_obj) + +#include "rgw/rgw_log.h" +TYPE(rgw_log_entry) + +#include "rgw/rgw_meta_sync_status.h" +TYPE(rgw_meta_sync_info) +TYPE(rgw_meta_sync_marker) +TYPE(rgw_meta_sync_status) + +#include "rgw/rgw_data_sync.h" +TYPE(rgw_data_sync_info) +TYPE(rgw_data_sync_marker) +TYPE(rgw_data_sync_status) + +#endif + +#ifdef WITH_RBD +#include "cls/rbd/cls_rbd.h" +TYPE_FEATUREFUL(cls_rbd_parent) +TYPE_FEATUREFUL(cls_rbd_snap) + +#include "cls/rbd/cls_rbd_types.h" +TYPE(cls::rbd::ParentImageSpec) +TYPE(cls::rbd::ChildImageSpec) +TYPE(cls::rbd::MigrationSpec) +TYPE(cls::rbd::MirrorPeer) +TYPE(cls::rbd::MirrorImage) +TYPE(cls::rbd::MirrorImageMap) +TYPE(cls::rbd::MirrorImageStatus) +TYPE(cls::rbd::GroupImageSpec) +TYPE(cls::rbd::GroupImageStatus) +TYPE(cls::rbd::GroupSnapshot) +TYPE(cls::rbd::GroupSpec) +TYPE(cls::rbd::ImageSnapshotSpec) +TYPE(cls::rbd::SnapshotInfo) +TYPE(cls::rbd::SnapshotNamespace) +#endif + +#include "cls/lock/cls_lock_types.h" +TYPE(rados::cls::lock::locker_id_t) +TYPE_FEATUREFUL(rados::cls::lock::locker_info_t) +TYPE_FEATUREFUL(rados::cls::lock::lock_info_t) + +#include "cls/lock/cls_lock_ops.h" +TYPE(cls_lock_lock_op) +TYPE(cls_lock_unlock_op) +TYPE(cls_lock_break_op) +TYPE(cls_lock_get_info_op) +TYPE_FEATUREFUL(cls_lock_get_info_reply) +TYPE(cls_lock_list_locks_reply) +TYPE(cls_lock_assert_op) +TYPE(cls_lock_set_cookie_op) + +#include "cls/refcount/cls_refcount_ops.h" +TYPE(cls_refcount_get_op) +TYPE(cls_refcount_put_op) +TYPE(cls_refcount_set_op) +TYPE(cls_refcount_read_op) +TYPE(cls_refcount_read_ret) +TYPE(obj_refcount) + +#include "journal/Entry.h" +TYPE(journal::Entry) + +// --- messages --- +#include 
"messages/MAuth.h" +MESSAGE(MAuth) + +#include "messages/MAuthReply.h" +MESSAGE(MAuthReply) + +#include "messages/MCacheExpire.h" +MESSAGE(MCacheExpire) + +#include "messages/MClientCapRelease.h" +MESSAGE(MClientCapRelease) + +#include "messages/MClientCaps.h" +MESSAGE(MClientCaps) + +#include "messages/MClientLease.h" +MESSAGE(MClientLease) + +#include "messages/MClientReconnect.h" +MESSAGE(MClientReconnect) + +#include "messages/MClientReply.h" +MESSAGE(MClientReply) + +#include "messages/MClientRequest.h" +MESSAGE(MClientRequest) + +#include "messages/MClientRequestForward.h" +MESSAGE(MClientRequestForward) + +#include "messages/MClientQuota.h" +MESSAGE(MClientQuota) + +#include "messages/MClientSession.h" +MESSAGE(MClientSession) + +#include "messages/MClientSnap.h" +MESSAGE(MClientSnap) + +#include "messages/MCommand.h" +MESSAGE(MCommand) + +#include "messages/MCommandReply.h" +MESSAGE(MCommandReply) + +#include "messages/MConfig.h" +MESSAGE(MConfig) + +#include "messages/MDataPing.h" +MESSAGE(MDataPing) + +#include "messages/MDentryLink.h" +MESSAGE(MDentryLink) + +#include "messages/MDentryUnlink.h" +MESSAGE(MDentryUnlink) + +#include "messages/MDirUpdate.h" +MESSAGE(MDirUpdate) + +#include "messages/MDiscover.h" +MESSAGE(MDiscover) + +#include "messages/MDiscoverReply.h" +MESSAGE(MDiscoverReply) + +#include "messages/MExportCaps.h" +MESSAGE(MExportCaps) + +#include "messages/MExportCapsAck.h" +MESSAGE(MExportCapsAck) + +#include "messages/MExportDir.h" +MESSAGE(MExportDir) + +#include "messages/MExportDirAck.h" +MESSAGE(MExportDirAck) + +#include "messages/MExportDirCancel.h" +MESSAGE(MExportDirCancel) + +#include "messages/MExportDirDiscover.h" +MESSAGE(MExportDirDiscover) + +#include "messages/MExportDirDiscoverAck.h" +MESSAGE(MExportDirDiscoverAck) + +#include "messages/MExportDirFinish.h" +MESSAGE(MExportDirFinish) + +#include "messages/MExportDirNotify.h" +MESSAGE(MExportDirNotify) + +#include "messages/MExportDirNotifyAck.h" 
+MESSAGE(MExportDirNotifyAck) + +#include "messages/MExportDirPrep.h" +MESSAGE(MExportDirPrep) + +#include "messages/MExportDirPrepAck.h" +MESSAGE(MExportDirPrepAck) + +#include "messages/MForward.h" +MESSAGE(MForward) + +#include "messages/MFSMap.h" +MESSAGE(MFSMap) + +#include "messages/MFSMapUser.h" +MESSAGE(MFSMapUser) + +#include "messages/MGatherCaps.h" +MESSAGE(MGatherCaps) + +#include "messages/MGenericMessage.h" +MESSAGE(MGenericMessage) + +#include "messages/MGetConfig.h" +MESSAGE(MGetConfig) + +#include "messages/MGetPoolStats.h" +MESSAGE(MGetPoolStats) + +#include "messages/MGetPoolStatsReply.h" +MESSAGE(MGetPoolStatsReply) + +#include "messages/MHeartbeat.h" +MESSAGE(MHeartbeat) + +#include "messages/MInodeFileCaps.h" +MESSAGE(MInodeFileCaps) + +#include "messages/MLock.h" +MESSAGE(MLock) + +#include "messages/MLog.h" +MESSAGE(MLog) + +#include "messages/MLogAck.h" +MESSAGE(MLogAck) + +#include "messages/MMDSOpenIno.h" +MESSAGE(MMDSOpenIno) + +#include "messages/MMDSOpenInoReply.h" +MESSAGE(MMDSOpenInoReply) + +#include "messages/MMDSBeacon.h" +MESSAGE(MMDSBeacon) + +#include "messages/MMDSCacheRejoin.h" +MESSAGE(MMDSCacheRejoin) + +#include "messages/MMDSFindIno.h" +MESSAGE(MMDSFindIno) + +#include "messages/MMDSFindInoReply.h" +MESSAGE(MMDSFindInoReply) + +#include "messages/MMDSFragmentNotify.h" +MESSAGE(MMDSFragmentNotify) + +#include "messages/MMDSLoadTargets.h" +MESSAGE(MMDSLoadTargets) + +#include "messages/MMDSMap.h" +MESSAGE(MMDSMap) + +#include "messages/MMgrReport.h" +MESSAGE(MMgrReport) + +#include "messages/MMDSResolve.h" +MESSAGE(MMDSResolve) + +#include "messages/MMDSResolveAck.h" +MESSAGE(MMDSResolveAck) + +#include "messages/MMDSSlaveRequest.h" +MESSAGE(MMDSSlaveRequest) + +#include "messages/MMDSSnapUpdate.h" +MESSAGE(MMDSSnapUpdate) + +#include "messages/MMDSTableRequest.h" +MESSAGE(MMDSTableRequest) + +#include "messages/MMgrClose.h" +MESSAGE(MMgrClose) + +#include "messages/MMgrConfigure.h" +MESSAGE(MMgrConfigure) + +#include 
"messages/MMgrDigest.h" +MESSAGE(MMgrDigest) + +#include "messages/MMgrMap.h" +MESSAGE(MMgrMap) + +#include "messages/MMgrOpen.h" +MESSAGE(MMgrOpen) + +#include "messages/MMonCommand.h" +MESSAGE(MMonCommand) + +#include "messages/MMonCommandAck.h" +MESSAGE(MMonCommandAck) + +#include "messages/MMonElection.h" +MESSAGE(MMonElection) + +#include "messages/MMonGetMap.h" +MESSAGE(MMonGetMap) + +#include "messages/MMonGetVersion.h" +MESSAGE(MMonGetVersion) + +#include "messages/MMonGetVersionReply.h" +MESSAGE(MMonGetVersionReply) + +#include "messages/MMonGlobalID.h" +MESSAGE(MMonGlobalID) + +#include "messages/MMonJoin.h" +MESSAGE(MMonJoin) + +#include "messages/MMonMap.h" +MESSAGE(MMonMap) + +#include "messages/MMonMetadata.h" +MESSAGE(MMonMetadata) + +#include "messages/MMonPaxos.h" +MESSAGE(MMonPaxos) + +#include "messages/MMonProbe.h" +MESSAGE(MMonProbe) + +#include "messages/MMonScrub.h" +MESSAGE(MMonScrub) + +#include "messages/MMonSync.h" +MESSAGE(MMonSync) + +#include "messages/MMonSubscribe.h" +MESSAGE(MMonSubscribe) + +#include "messages/MMonSubscribeAck.h" +MESSAGE(MMonSubscribeAck) + +#include "messages/MNop.h" +MESSAGE(MNop) + +#include "messages/MOSDAlive.h" +MESSAGE(MOSDAlive) + +#include "messages/MOSDBoot.h" +MESSAGE(MOSDBoot) + +#include "messages/MOSDFailure.h" +MESSAGE(MOSDFailure) + +#include "messages/MOSDMap.h" +MESSAGE(MOSDMap) + +#include "messages/MOSDOp.h" +MESSAGE(MOSDOp) + +#include "messages/MOSDOpReply.h" +MESSAGE(MOSDOpReply) + +#include "messages/MOSDPGBackfill.h" +MESSAGE(MOSDPGBackfill) + +#include "messages/MOSDPGCreate.h" +MESSAGE(MOSDPGCreate) + +#include "messages/MOSDPGCreate2.h" +MESSAGE(MOSDPGCreate2) + +#include "messages/MOSDPGInfo.h" +MESSAGE(MOSDPGInfo) + +#include "messages/MOSDPGLog.h" +MESSAGE(MOSDPGLog) + +#include "messages/MOSDPGNotify.h" +MESSAGE(MOSDPGNotify) + +#include "messages/MOSDPGQuery.h" +MESSAGE(MOSDPGQuery) + +#include "messages/MOSDPGRemove.h" +MESSAGE(MOSDPGRemove) + +#include 
"messages/MOSDPGRecoveryDelete.h" +MESSAGE(MOSDPGRecoveryDelete) + +#include "messages/MOSDPGRecoveryDeleteReply.h" +MESSAGE(MOSDPGRecoveryDeleteReply) + +#include "messages/MOSDPGScan.h" +MESSAGE(MOSDPGScan) + +#include "messages/MOSDPGTemp.h" +MESSAGE(MOSDPGTemp) + +#include "messages/MOSDPGTrim.h" +MESSAGE(MOSDPGTrim) + +#include "messages/MOSDPing.h" +MESSAGE(MOSDPing) + +#include "messages/MOSDRepScrub.h" +MESSAGE(MOSDRepScrub) + +#include "messages/MOSDScrub.h" +MESSAGE(MOSDScrub) + +#include "messages/MOSDScrub2.h" +MESSAGE(MOSDScrub2) + +#include "messages/MOSDForceRecovery.h" +MESSAGE(MOSDForceRecovery) + +#include "messages/MPGStats.h" +MESSAGE(MPGStats) + +#include "messages/MPGStatsAck.h" +MESSAGE(MPGStatsAck) + +#include "messages/MPing.h" +MESSAGE(MPing) + +#include "messages/MPoolOp.h" +MESSAGE(MPoolOp) + +#include "messages/MPoolOpReply.h" +MESSAGE(MPoolOpReply) + +#include "messages/MRemoveSnaps.h" +MESSAGE(MRemoveSnaps) + +#include "messages/MRoute.h" +MESSAGE(MRoute) + +#include "messages/MServiceMap.h" +MESSAGE(MServiceMap) + +#include "messages/MStatfs.h" +MESSAGE(MStatfs) + +#include "messages/MStatfsReply.h" +MESSAGE(MStatfsReply) + +#include "messages/MTimeCheck.h" +MESSAGE(MTimeCheck) + +#include "messages/MTimeCheck2.h" +MESSAGE(MTimeCheck2) + +#include "messages/MWatchNotify.h" +MESSAGE(MWatchNotify) diff --git a/src/tools/ceph-diff-sorted.cc b/src/tools/ceph-diff-sorted.cc new file mode 100644 index 00000000..f8e4c28e --- /dev/null +++ b/src/tools/ceph-diff-sorted.cc @@ -0,0 +1,173 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * diffsorted -- a utility to compute a line-by-line diff on two + * sorted input files + * + * Copyright © 2019 Red Hat + * + * Author: J. Eric Ivancich + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
+ */ + + +/* + * SUMMARY + * + * The `diffsorted` utility does a line-by-line diff on two sorted text + * files and indicates lines that are in one file but not the other + * using diff-style notation (although line numbers are not indicated). + * + * USAGE + * + * rgw-diff-sorted file1.txt file2.txt + * + * NOTES + * + * Each file should have its lines in sorted order and should have no + * empty lines. + * + * A potential input file can be sorted using the `sort` utility provided + * that LANG=C is set to ensure byte lexical order. For example: + * + * LANG=C sort unsorted.txt >sorted.txt + * + * or: + * + * export LANG=C + * sort unsorted.txt >sorted.txt + * + * EXIT STATUS + * + * 0 : files same + * 1 : files different + * 2 : usage problem (e.g., wrong number of command-line arguments) + * 3 : problem opening input file + * 4 : bad file content (e.g., unsorted order or empty lines) + */ + + +#include <iostream> +#include <fstream> + + +struct FileOfLines { + const char* filename; + std::ifstream input; + std::string this_line, prev_line; + bool next_eof; + bool is_eof; + + FileOfLines(const char* _filename) : + filename(_filename), + input(filename), + next_eof(false), + is_eof(false) + { } + + void dump(const std::string& prefix) { + do { + std::cout << prefix << this_line << std::endl; + advance(); + } while (!eof()); + } + + bool eof() const { + return is_eof; + } + + bool good() const { + return input.good(); + } + + void advance() { + if (next_eof) { + is_eof = true; + return; + } + + prev_line = this_line; + std::getline(input, this_line); + if (this_line.empty()) { + if (!input.eof()) { + std::cerr << "Error: " << filename << " has an empty line." << + std::endl; + exit(4); + } + is_eof = true; + return; + } else if (input.eof()) { + next_eof = true; + } + + if (this_line < prev_line) { + std::cerr << "Error: " << filename << " is not in sorted order; \"" << + this_line << "\" follows \"" << prev_line << "\"." 
<< std::endl; + exit(4); + } + } + + const std::string line() const { + return this_line; + } +}; + +int main(int argc, const char* argv[]) { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " <file1> <file2>" << std::endl; + exit(2); + } + + FileOfLines input1(argv[1]); + if (!input1.good()) { + std::cerr << "Error opening " << argv[1] << + "." << std::endl; + exit(3); + } + + FileOfLines input2(argv[2]); + if (!input2.good()) { + std::cerr << "Error opening " << argv[2] << + "." << std::endl; + exit(3); + } + + bool files_same = true; + + input1.advance(); + input2.advance(); + + while (!input1.eof() && !input2.eof()) { + if (input1.line() == input2.line()) { + input1.advance(); + input2.advance(); + } else if (input1.line() < input2.line()) { + files_same = false; + std::cout << "< " << input1.line() << std::endl; + input1.advance(); + } else { + files_same = false; + std::cout << "> " << input2.line() << std::endl; + input2.advance(); + } + } + + if (!input1.eof()) { + files_same = false; + input1.dump("< "); + } else if (!input2.eof()) { + files_same = false; + input2.dump("> "); + } + + if (files_same) { + exit(0); + } else { + exit(1); + } +} diff --git a/src/tools/ceph-lazy/bash_completion.d/ceph-lazy b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy new file mode 100644 index 00000000..4429def4 --- /dev/null +++ b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy @@ -0,0 +1,27 @@ +_ceph-lazy() +{ + local cur prev all_opts commands + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + + commands="host-get-osd host-get-nodes host-osd-usage host-all-usage pg-get-host pg-most-write pg-less-write pg-most-write-kb pg-less-write-kb pg-most-read pg-less-read pg-most-read-kb pg-less-read-kb pg-empty rbd-prefix rbd-count rbd-host rbd-osd rbd-size rbd-all-size osd-most-used osd-less-used osd-get-ppg osd-get-pg object-get-host" + + all_opts="$commands -d -h" + + + +# If first option is -d keep completing without -d & -h + if [[ 
#
# Write an informational message, prefixed with "INFO: ", to stderr.
# All arguments are joined into a single message.
#
echoinfo() {
  local message="$*"
  printf "INFO: %s\n" "$message" >&2
}
#
# Check dependencies
#
# Verifies that every external command the script relies on can be run and
# that the cluster answers "ceph -s". On success it defines the CEPH and JQ
# globals used by the other functions. Returns 1 on the first failed check.
#
function check_requirements()
{

  # List of command dependencies
  local bin_dep="ceph rados rbd osdmaptool jq"
  # Keep the loop variable out of the script's global namespace
  local cmd

  for cmd in $bin_dep; do
    [ $DEBUG -eq 1 ] && echoinfo "Checking for $cmd..."
    $cmd --version >/dev/null 2>&1 || { echoerr "$cmd cannot be found... Aborting."; return 1; }
  done

  # CEPH_OPT is optional; when unset the plain "ceph" command is used
  CEPH="ceph $CEPH_OPT"

  [ $DEBUG -eq 1 ] && echoinfo "Checking Ceph connectivity & basic permissions..."

  if ! $CEPH -s &> /dev/null; then
    echoerr "Cannot connect to cluster, please check your username & permissions"
    echoerr "Command $CEPH -s failed"
    return 1
  fi

  JQ="jq -M --raw-output"
}
#
# Return RBD prefix from image name
#
# Arguments: pool_name image_name
# Prints the image's block_name_prefix on stdout. Exits with status 1 when
# the argument count is wrong, the image does not exist, or no prefix can be
# read from the image information.
#
function get_rbd_prefix() {

  if [ $# -eq 2 ]; then
    local pool=$1
    local rbd=$2
  else
    echoerr "This command requires two arguments"
    help
    exit 1
  fi

  check_rbd_exists "$pool" "$rbd"

  # "// empty" makes jq print nothing, rather than the literal string "null",
  # when the image information has no block_name_prefix field; otherwise the
  # emptiness check below would let "null" through as a prefix.
  local prefix=$(rbd --image "$rbd" -p "$pool" info --format json 2> /dev/null | jq --raw-output '.block_name_prefix // empty')
  if [ -z "$prefix" ]; then
    echoerr "Unable to find RBD Prefix for image $pool/$rbd"
    exit 1
  else
    echo "$prefix"
  fi

}
#
# Find primary OSDs for a given RBD image
#
# Arguments: pool_name image_name
# Prints one "osd.<id>" line per distinct OSD that comes first in the mapping
# of at least one object of the image. Exits with status 1 on bad arguments,
# unknown image, unknown pool or when the OSD map cannot be fetched.
#
function find_prim_osd_from_rbd() {

  if [ $# -eq 2 ]; then
    local pool=$1
    local rbd=$2
  else
    echoerr "This command requires two arguments"
    help
    exit 1
  fi

  check_rbd_exists $pool $rbd

  local osd="null"
  local obj
  # Get RBD image prefix
  local rbd_prefix=$(get_rbd_prefix $pool $rbd)

  # Exit if we received an empty prefix
  [ -z "$rbd_prefix" ] && exit 1

  # Get pool ID from pool name. osdmaptool needs it: without --pool the object
  # is not mapped against the pool the image actually lives in.
  local pool_id=$($CEPH osd lspools -f json | $JQ --arg pool "$pool" '.[]|select(.poolname==$pool).poolnum')
  if [ -z "$pool_id" ]; then
    echoerr "Unable to find the ID of pool $pool"
    exit 1
  fi

  [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary OSDs..."

  # Created only after the checks above so that an early exit leaves no
  # temporary file behind.
  local osdmap_t=$(mktemp)

  [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t"
  if ! $CEPH osd getmap > $osdmap_t; then
    echoerr "Failed to retrieve OSD map"
    rm -f $osdmap_t
    exit 1
  fi

  # For each object in the RBD image
  for obj in $(rados -p $pool ls | grep "$rbd_prefix");
  do
    # Map object to osd. osdmaptool does not support json output so using dirty sed.
    osd=$(osdmaptool --test-map-object $obj --pool $pool_id $osdmap_t 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool)
    echo "osd.${osd}"
  done | sort -u

  # Cleaning files
  rm -f $osdmap_t
}
total_used_kb=0 + + local total_available_kb=0 + local osd_available_kb=0 + + local total_size_kb=0 + local osd_size_kb=0 + local nb_osd=0 + + [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..." + if ! $CEPH pg dump osds --format json 2>/dev/null > $pgmap_t; then + echoerr "Failed to retrieve PG map" + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Looking for all OSDs on host $host..." + + for osd in $(list_osd_from_host $host); do + + osd_used_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_used' $pgmap_t) + osd_available_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_avail' $pgmap_t) + osd_size_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb' $pgmap_t) + + [ $DEBUG -eq 1 ] && echoinfo "OSD:$osd | Size:$(echo "scale=1;$osd_size_kb/1024/1024" | bc -l)GB | Used:$(echo "scale=1;$osd_used_kb /1024/1024" | bc -l)GB | Available:$(echo "scale=1;$osd_available_kb/1024/1024" | bc -l)GB" + + let "total_used_kb=total_used_kb+osd_used_kb" + let "total_available_kb=total_available_kb+osd_available_kb" + let "total_size_kb=total_size_kb+osd_size_kb" + let "nb_osd++" + + done + + echo "Host:$host | OSDs:$nb_osd | Total_Size:$(echo "scale=1;$total_size_kb/1024/1024" | bc -l)GB | Total_Used:$(echo "scale=1;$total_used_kb /1024/1024" | bc -l)GB | Total_Available:$(echo "scale=1;$total_available_kb/1024/1024" | bc -l)GB" + + rm -f $pgmap_t +} + + +# +# Print Total OSD usage of all nodes +# + +function list_all_nodes_osd_usage() { + + + for host in $(list_all_nodes); do + + [ $DEBUG -eq 1 ] && echoinfo "Looking at node $host..." 
#
# Query PG stats
#
# Argument: a jq filter that is applied to the output of "ceph pg dump pgs"
# and is expected to yield a single PG id (the callers in this script pass
# filters of the form "max_by(...).pgid" / "min_by(...).pgid").
# Prints "PG:<pgid> | OSD:osd.<id> | Host:<host>" for that PG's acting primary.
#
function pg_stat_query() {

  if [ $# -eq 1 ]; then
    local query_type=$1
  else
    echoerr "This command requires one argument"
    help
    exit 1
  fi

  local pgmap_t=$(mktemp)

  [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..."
  if ! $CEPH pg dump pgs --format json 2>/dev/null > "$pgmap_t"; then
    echoerr "Failed to retrieve PG map"
    # Do not leave the temporary file behind on failure
    rm -f "$pgmap_t"
    exit 1
  fi

  # The filter itself is the jq program; no extra jq variable is needed
  local pgid=$($JQ "$query_type" "$pgmap_t")
  [ $DEBUG -eq 1 ] && echoinfo "Found PGID $pgid"

  # Quoted so that an empty pgid is still passed as the value of --arg
  local osd=$($JQ --arg pgid "$pgid" '.[] | select(.pgid == $pgid).acting_primary' "$pgmap_t")
  [ $DEBUG -eq 1 ] && echoinfo "Found OSD $osd"

  local host=$($CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host)
  [ $DEBUG -eq 1 ] && echoinfo "Found host $host"

  echo "PG:$pgid | OSD:osd.$osd | Host:$host"

  rm -f "$pgmap_t"
}
# Dispatch the requested command to its handler.
# Handler arguments are intentionally left unquoted: a missing parameter then
# expands to nothing, which lets each handler's own argument-count check
# report the error.
case "$1" in
  -h)               help; exit 0 ;;

  # Host
  host-get-osd)     list_osd_from_host $2 ;;
  host-get-nodes)   list_all_nodes ;;
  host-osd-usage)   show_host_osd_usage $2 ;;
  host-all-usage)   list_all_nodes_osd_usage ;;

  # Placement groups
  pg-get-host)      find_host_from_pg $2 ;;
  pg-most-write)    pg_stat_query "max_by(.stat_sum.num_write).pgid" ;;
  pg-less-write)    pg_stat_query "min_by(.stat_sum.num_write).pgid" ;;
  pg-most-write-kb) pg_stat_query "max_by(.stat_sum.num_write_kb).pgid" ;;
  pg-less-write-kb) pg_stat_query "min_by(.stat_sum.num_write_kb).pgid" ;;
  pg-most-read)     pg_stat_query "max_by(.stat_sum.num_read).pgid" ;;
  pg-less-read)     pg_stat_query "min_by(.stat_sum.num_read).pgid" ;;
  pg-most-read-kb)  pg_stat_query "max_by(.stat_sum.num_read_kb).pgid" ;;
  pg-less-read-kb)  pg_stat_query "min_by(.stat_sum.num_read_kb).pgid" ;;
  pg-empty)         find_empty_pg ;;

  # RBD
  rbd-prefix)       get_rbd_prefix $2 $3 ;;
  rbd-count)        count_rbd_object $2 $3 ;;
  rbd-host)         find_prim_host_from_rbd $2 $3 ;;
  rbd-osd)          find_prim_osd_from_rbd $2 $3 ;;
  rbd-size)         print_rbd_real_size $2 $3 ;;
  rbd-all-size)     list_all_rbd_real_size $2 ;;

  # OSD
  osd-most-used)    find_most_used_osd ;;
  osd-less-used)    find_less_used_osd ;;
  osd-get-ppg)      find_prim_pg_from_osd $2 ;;
  osd-get-pg)       find_all_pg_from_osd $2 ;;

  # Objects
  object-get-host)  find_host_from_object $2 $3 ;;

  *)
    echoerr "Unknown command : $1"
    help
    exit 1
    ;;
esac
# Print the result of a jq query against an osdmap stored in the mon store.
#
#   $1  path to the monitor store
#   $2  jq filter applied to the JSON dump of the osdmap
#   $3  optional osdmap epoch; when omitted no "-v" is passed to
#       ceph-monstore-tool
#
# Returns the exit status of ceph-monstore-tool when the osdmap cannot be
# extracted.
function osdmap_get() {
    local store_path=$1
    local query=$2
    # Expands to "-v <epoch>" (two words) only when an epoch was given, so it
    # must stay unquoted where it is used.
    local epoch=${3:+-v $3}
    local osdmap=`mktemp`
    local rc=0

    $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \
                                 $epoch -o $osdmap > /dev/null || rc=$?
    if [ $rc -ne 0 ]; then
        # Do not leave the temporary file behind on failure
        rm -f $osdmap
        return $rc
    fi

    echo $($CEPH_BIN/osdmaptool --dump json $osdmap 2> /dev/null | \
           jq "$query")

    rm -f $osdmap
}
# Parse the command line, walk the osdmap epochs backwards from the latest
# one until a crush map passes test_crush, then print, export or rewrite
# according to the options given.
#
# Returns 1 when option parsing fails or no usable crush map is found.
function main() {
    local temp
    temp=$($GETOPT -o h --long verbose,help,mon-store:,out:,rewrite -n $0 -- "$@") || return 1

    eval set -- "$temp"
    local rewrite
    local store_path
    while [ "$1" != "--" ]; do
        case "$1" in
            --verbose)
                verbose=true
                # set -xe
                # PS4='${FUNCNAME[0]}: $LINENO: '
                shift;;
            -h|--help)
                usage
                return 0;;
            --mon-store)
                # Accepted by getopt above, so it has to be handled here too
                store_path=$2
                shift 2;;
            --out)
                output=$2
                shift 2;;
            --rewrite)
                rewrite=true
                shift;;
            *)
                usage "unexpected argument $1"
                shift;;
        esac
    done
    shift

    # A positional <mon-store> takes precedence over --mon-store
    if [ -n "$1" ]; then
        store_path="$1"
    fi
    test -n "$store_path" || usage "I need the path to mon-store."

    # try accessing the store; if it fails, likely means a mon is running
    local last_osdmap_epoch
    local max_osd
    last_osdmap_epoch=$(osdmap_get $store_path ".epoch") || \
        die "error accessing mon store at $store_path"
    # get the max_osd # in last osdmap epoch, crushtool will use it to check
    # the crush maps in previous osdmaps
    max_osd=$(osdmap_get $store_path ".max_osd" $last_osdmap_epoch)

    local good_crush
    local good_epoch
    local epoch
    local crush_path
    test $verbose && echo "the latest osdmap epoch is $last_osdmap_epoch"
    for epoch in `seq $last_osdmap_epoch -1 1`; do
        crush_path=`mktemp`
        test $verbose && echo "checking crush map #$epoch"
        if test_crush $store_path $epoch $max_osd $crush_path; then
            test $verbose && echo "crush map version #$epoch works with osdmap epoch #$last_osdmap_epoch"
            good_epoch=$epoch
            good_crush=$crush_path
            break
        fi
        rm -f $crush_path
    done

    if test -n "$good_epoch"; then
        echo "good crush map found at epoch $good_epoch/$last_osdmap_epoch"
    else
        # Errors go to stderr
        echo "Unable to find a crush map for osdmap version #$last_osdmap_epoch." >&2
        return 1
    fi

    if test $good_epoch -eq $last_osdmap_epoch; then
        echo "and mon store has no faulty crush maps."
    elif test $output; then
        $CEPH_BIN/crushtool --decompile $good_crush --outfn $output
    elif test $rewrite; then
        $CEPH_BIN/ceph-monstore-tool $store_path rewrite-crush --  \
                                     --crush $good_crush      \
                                     --good-epoch $good_epoch
    else
        echo
        $CEPH_BIN/crushtool --decompile $good_crush
    fi
    rm -f $good_crush
}
argument\n" + << " -C, --create-keyring will create a new keyring, overwriting any\n" + << " existing keyringfile\n" + << " -g, --gen-key will generate a new secret key for the\n" + << " specified entityname\n" + << " --gen-print-key will generate a new secret key without set it\n" + << " to the keyringfile, prints the secret to stdout\n" + << " --import-keyring FILE will import the content of a given keyring\n" + << " into the keyringfile\n" + << " -n NAME, --name NAME specify entityname to operate on\n" + << " -a BASE64, --add-key BASE64 will add an encoded key to the keyring\n" + << " --cap SUBSYSTEM CAPABILITY will set the capability for given subsystem\n" + << " --caps CAPSFILE will set all of capabilities associated with a\n" + << " given key, for all subsystems\n" + << " --mode MODE will set the desired file mode to the keyring\n" + << " e.g: '0644', defaults to '0600'" + << std::endl; + exit(1); +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + + std::string add_key; + std::string caps_fn; + std::string import_keyring; + map<string,bufferlist> caps; + std::string fn; + + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + + bool gen_key = false; + bool gen_print_key = false; + bool list = false; + bool print_key = false; + bool create_keyring = false; + int mode = 0600; // keyring file mode + std::vector<const char*>::iterator i; + + /* Handle options unique to ceph-authtool + * -n NAME, --name NAME is handled by global_init + * */ + for (i = args.begin(); i != args.end(); ) { + std::string val; + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_flag(args, i, "-g", "--gen-key", (char*)NULL)) { + gen_key = true; + } else if 
(ceph_argparse_flag(args, i, "--gen-print-key", (char*)NULL)) { + gen_print_key = true; + } else if (ceph_argparse_witharg(args, i, &val, "-a", "--add-key", (char*)NULL)) { + if (val.empty()) { + cerr << "Option --add-key requires an argument" << std::endl; + exit(1); + } + add_key = val; + } else if (ceph_argparse_flag(args, i, "-l", "--list", (char*)NULL)) { + list = true; + } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) { + caps_fn = val; + } else if (ceph_argparse_witharg(args, i, &val, "--cap", (char*)NULL)) { + std::string my_key = val; + if (i == args.end()) { + cerr << "must give two arguments to --cap: key and val." << std::endl; + exit(1); + } + std::string my_val = *i; + ++i; + encode(my_val, caps[my_key]); + } else if (ceph_argparse_flag(args, i, "-p", "--print-key", (char*)NULL)) { + print_key = true; + } else if (ceph_argparse_flag(args, i, "-C", "--create-keyring", (char*)NULL)) { + create_keyring = true; + } else if (ceph_argparse_witharg(args, i, &val, "--import-keyring", (char*)NULL)) { + import_keyring = val; + } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) { + std::string err; + mode = strict_strtoll(val.c_str(), 8, &err); + if (!err.empty()) { + cerr << "Option --mode requires an argument" << std::endl; + exit(1); + } + } else if (fn.empty()) { + fn = *i++; + } else { + cerr << argv[0] << ": unexpected '" << *i << "'" << std::endl; + usage(); + } + } + + if (fn.empty() && !gen_print_key) { + cerr << argv[0] << ": must specify filename" << std::endl; + usage(); + } + if (!(gen_key || + gen_print_key || + !add_key.empty() || + list || + !caps_fn.empty() || + !caps.empty() || + print_key || + create_keyring || + !import_keyring.empty())) { + cerr << "no command specified" << std::endl; + usage(); + } + if (gen_key && (!add_key.empty())) { + cerr << "can't both gen-key and add-key" << std::endl; + usage(); + } + + common_init_finish(g_ceph_context); + EntityName ename(g_conf()->name); + + // Enforce 
the use of gen-key or add-key when creating to avoid ending up + // with an "empty" key (key = AAAAAAAAAAAAAAAA) + if (create_keyring && !gen_key && add_key.empty() && !caps.empty()) { + cerr << "must specify either gen-key or add-key when creating" << std::endl; + usage(); + } + + if (gen_print_key) { + CryptoKey key; + key.create(g_ceph_context, CEPH_CRYPTO_AES); + cout << key << std::endl; + return 0; + } + + // keyring -------- + bool modified = false; + bool added_entity = false; + KeyRing keyring; + + bufferlist bl; + int r = 0; + if (create_keyring) { + cout << "creating " << fn << std::endl; + modified = true; + } else { + std::string err; + r = bl.read_file(fn.c_str(), &err); + if (r >= 0) { + try { + auto iter = bl.cbegin(); + decode(keyring, iter); + } catch (const buffer::error &err) { + cerr << "error reading file " << fn << std::endl; + exit(1); + } + } else { + cerr << "can't open " << fn << ": " << err << std::endl; + exit(1); + } + } + + // Validate that "name" actually has an existing key in this keyring if we + // have not given gen-key or add-key options + if (!gen_key && add_key.empty() && !caps.empty()) { + CryptoKey key; + if (!keyring.get_secret(ename, key)) { + cerr << "can't find existing key for " << ename + << " and neither gen-key nor add-key specified" << std::endl; + exit(1); + } + } + + // write commands + if (!import_keyring.empty()) { + KeyRing other; + bufferlist obl; + std::string err; + int r = obl.read_file(import_keyring.c_str(), &err); + if (r >= 0) { + try { + auto iter = obl.cbegin(); + decode(other, iter); + } catch (const buffer::error &err) { + cerr << "error reading file " << import_keyring << std::endl; + exit(1); + } + + cout << "importing contents of " << import_keyring << " into " << fn << std::endl; + //other.print(cout); + keyring.import(g_ceph_context, other); + modified = true; + } else { + cerr << "can't open " << import_keyring << ": " << err << std::endl; + exit(1); + } + } + if (gen_key) { + EntityAuth 
eauth; + eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES); + keyring.add(ename, eauth); + modified = true; + } + if (!add_key.empty()) { + EntityAuth eauth; + try { + eauth.key.decode_base64(add_key); + } catch (const buffer::error &err) { + cerr << "can't decode key '" << add_key << "'" << std::endl; + exit(1); + } + keyring.add(ename, eauth); + modified = true; + cout << "added entity " << ename << " " << eauth << std::endl; + added_entity = true; + } + if (!caps_fn.empty()) { + ConfFile cf; + std::deque<std::string> parse_errors; + if (cf.parse_file(caps_fn, &parse_errors, &cerr) != 0) { + cerr << "could not parse caps file " << caps_fn << std::endl; + exit(1); + } + complain_about_parse_errors(g_ceph_context, &parse_errors); + map<string, bufferlist> caps; + const char *key_names[] = { "mon", "osd", "mds", "mgr", NULL }; + for (int i=0; key_names[i]; i++) { + std::string val; + if (cf.read("global", key_names[i], val) == 0) { + bufferlist bl; + encode(val, bl); + string s(key_names[i]); + caps[s] = bl; + } + } + keyring.set_caps(ename, caps); + modified = true; + } + if (!caps.empty()) { + keyring.set_caps(ename, caps); + modified = true; + } + if (added_entity && caps.size() > 0) { + cout << "added " << caps.size() << " caps to entity " << ename << std::endl; + } + + // read commands + if (list) { + try { + keyring.print(cout); + } catch (ceph::buffer::end_of_buffer &eob) { + cout << "Exception (end_of_buffer) in print(), exit." << std::endl; + exit(1); + } + } + if (print_key) { + CryptoKey key; + if (keyring.get_secret(ename, key)) { + cout << key << std::endl; + } else { + cerr << "entity " << ename << " not found" << std::endl; + exit(1); + } + } + + // write result? 
+ if (modified) { + bufferlist bl; + keyring.encode_plaintext(bl); + r = bl.write_file(fn.c_str(), mode); + if (r < 0) { + cerr << "could not write " << fn << std::endl; + exit(1); + } + //cout << "wrote " << bl.length() << " bytes to " << fn << std::endl; + } + return 0; +} diff --git a/src/tools/ceph_conf.cc b/src/tools/ceph_conf.cc new file mode 100644 index 00000000..48511e5c --- /dev/null +++ b/src/tools/ceph_conf.cc @@ -0,0 +1,258 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Dreamhost + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <iomanip> +#include <string> + +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "mon/AuthMonitor.h" +#include "common/Formatter.h" + +using std::deque; +using std::string; + +static void usage(std::ostream& out) +{ + // TODO: add generic_usage once cerr/derr issues are resolved + out << R"(Ceph configuration query tool + +USAGE +ceph-conf <flags> <action> + +ACTIONS + -L|--list-all-sections List all sections + -l|--list-sections <prefix> List sections with the given prefix + --filter-key <key> Filter section list to only include sections + with given key defined. + --filter-key-value <key>=<val> Filter section list to only include sections + with given key/value pair. + --lookup <key> Print a configuration setting to stdout. + Returns 0 (success) if the configuration setting is + found; 1 otherwise. + -r|--resolve-search search for the first file that exists and + can be opened in the resulted comma + delimited search list. + -D|--dump-all dump all variables. 
+ +FLAGS + --name name Set type.id + [-s <section>] Add to list of sections to search + [--format plain|json|json-pretty] + dump variables in plain text, json or pretty + json + +If there is no action given, the action will default to --lookup. + +EXAMPLES +$ ceph-conf --name mon.0 -c /etc/ceph/ceph.conf 'mon addr' +Find out what the value of 'mon addr' is for monitor 0. + +$ ceph-conf -l mon +List sections beginning with 'mon'. + +RETURN CODE +Return code will be 0 on success; error code otherwise. +)"; +} + +static int list_sections(const std::string &prefix, + const std::list<string>& filter_key, + const std::map<string,string>& filter_key_value) +{ + std::vector <std::string> sections; + int ret = g_conf().get_all_sections(sections); + if (ret) + return 2; + for (std::vector<std::string>::const_iterator p = sections.begin(); + p != sections.end(); ++p) { + if (strncmp(prefix.c_str(), p->c_str(), prefix.size())) + continue; + + std::vector<std::string> sec; + sec.push_back(*p); + + int r = 0; + for (std::list<string>::const_iterator q = filter_key.begin(); q != filter_key.end(); ++q) { + string v; + r = g_conf().get_val_from_conf_file(sec, q->c_str(), v, false); + if (r < 0) + break; + } + if (r < 0) + continue; + + for (std::map<string,string>::const_iterator q = filter_key_value.begin(); + q != filter_key_value.end(); + ++q) { + string v; + r = g_conf().get_val_from_conf_file(sec, q->first.c_str(), v, false); + if (r < 0 || v != q->second) { + r = -1; + break; + } + } + if (r < 0) + continue; + + cout << *p << std::endl; + } + return 0; +} + +static int lookup(const std::deque<std::string> §ions, + const std::string &key, bool resolve_search) +{ + std::vector <std::string> my_sections; + for (deque<string>::const_iterator s = sections.begin(); s != sections.end(); ++s) { + my_sections.push_back(*s); + } + g_conf().get_my_sections(my_sections); + std::string val; + int ret = g_conf().get_val_from_conf_file(my_sections, key.c_str(), val, true); + if (ret == 
-ENOENT) + return 1; + else if (ret == 0) { + if (resolve_search) { + string result; + ret = ceph_resolve_file_search(val, result); + if (!ret) + puts(result.c_str()); + } + else { + puts(val.c_str()); + } + return 0; + } + else { + cerr << "error looking up '" << key << "': error " << ret << std::endl; + return 2; + } +} + +static int dump_all(const string& format) +{ + if (format == "" || format == "plain") { + g_conf().show_config(std::cout); + return 0; + } else { + unique_ptr<Formatter> f(Formatter::create(format)); + if (f) { + f->open_object_section("ceph-conf"); + g_conf().show_config(f.get()); + f->close_section(); + f->flush(std::cout); + return 0; + } + cerr << "format '" << format << "' not recognized." << std::endl; + usage(cerr); + return 1; + } +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + deque<std::string> sections; + bool resolve_search = false; + std::string action; + std::string lookup_key; + std::string section_list_prefix; + std::list<string> filter_key; + std::map<string,string> filter_key_value; + std::string dump_format; + + argv_to_vec(argc, argv, args); + auto orig_args = args; + auto cct = [&args] { + std::map<std::string,std::string> defaults = {{"log_to_file", "false"}}; + return global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_NO_DAEMON_ACTIONS | + CINIT_FLAG_NO_MON_CONFIG); + }(); + + // do not common_init_finish(); do not start threads; do not do any of thing + // wonky things the daemon whose conf we are examining would do (like initialize + // the admin socket). 
+ //common_init_finish(g_ceph_context); + + std::string val; + for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "-s", "--section", (char*)NULL)) { + sections.push_back(val); + } else if (ceph_argparse_flag(args, i, "-r", "--resolve_search", (char*)NULL)) { + resolve_search = true; + } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + action = "help"; + } else if (ceph_argparse_witharg(args, i, &val, "--lookup", (char*)NULL)) { + action = "lookup"; + lookup_key = val; + } else if (ceph_argparse_flag(args, i, "-L", "--list_all_sections", (char*)NULL)) { + action = "list-sections"; + section_list_prefix = ""; + } else if (ceph_argparse_witharg(args, i, &val, "-l", "--list_sections", (char*)NULL)) { + action = "list-sections"; + section_list_prefix = val; + } else if (ceph_argparse_witharg(args, i, &val, "--filter_key", (char*)NULL)) { + filter_key.push_back(val); + } else if (ceph_argparse_witharg(args, i, &val, "--filter_key_value", (char*)NULL)) { + size_t pos = val.find_first_of('='); + if (pos == string::npos) { + cerr << "expecting argument like 'key=value' for --filter-key-value (not '" << val << "')" << std::endl; + usage(cerr); + return EXIT_FAILURE; + } + string key(val, 0, pos); + string value(val, pos+1); + filter_key_value[key] = value; + } else if (ceph_argparse_flag(args, i, "-D", "--dump_all", (char*)NULL)) { + action = "dumpall"; + } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) { + dump_format = val; + } else { + if (((action == "lookup") || (action == "")) && (lookup_key.empty())) { + action = "lookup"; + lookup_key = *i++; + } else { + cerr << "unable to parse option: '" << *i << "'" << std::endl; + cerr << "args:"; + for (auto arg : orig_args) { + cerr << " " << quoted(arg); + } + cerr << std::endl; + usage(cerr); + return EXIT_FAILURE; + } + } + } + + cct->_log->flush(); 
+ if (action == "help") { + usage(cout); + return EXIT_SUCCESS; + } else if (action == "list-sections") { + return list_sections(section_list_prefix, filter_key, filter_key_value); + } else if (action == "lookup") { + return lookup(sections, lookup_key, resolve_search); + } else if (action == "dumpall") { + return dump_all(dump_format); + } else { + cerr << "You must give an action, such as --lookup or --list-all-sections." << std::endl; + cerr << "Pass --help for more help." << std::endl; + return EXIT_FAILURE; + } +} diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc new file mode 100644 index 00000000..1713cde4 --- /dev/null +++ b/src/tools/ceph_dedup_tool.cc @@ -0,0 +1,834 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Myoungwon Oh <ohmyoungwon@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "include/types.h" + +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rados/rados_types.hpp" + +#include "acconfig.h" + +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/Cond.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/obj_bencher.h" + +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <time.h> +#include <sstream> +#include <errno.h> +#include <dirent.h> +#include <stdexcept> +#include <climits> +#include <locale> +#include <memory> + +#include "tools/RadosDump.h" +#include "cls/cas/cls_cas_client.h" +#include "include/stringify.h" +#include "global/signal_handler.h" + +using namespace librados; +unsigned default_op_size = 1 << 22; +unsigned default_max_thread = 2; +int32_t default_report_period = 2; +map< string, pair <uint64_t, uint64_t> > chunk_statistics; // < key, <count, chunk_size> > +Mutex glock("chunk_statistics::Locker"); + +void usage() +{ + cout << " usage: [--op <estimate|chunk_scrub|add_chunk_ref|get_chunk_ref>] [--pool <pool_name> ] " << std::endl; + cout << " --object <object_name> " << std::endl; + cout << " --chunk-size <size> chunk-size (byte) " << std::endl; + cout << " --chunk-algorithm <fixed> " << std::endl; + cout << " --fingerprint-algorithm <sha1> " << std::endl; + cout << " --chunk-pool <pool name> " << std::endl; + cout << " --max-thread <threads> " << std::endl; + cout << " --report-perioid <seconds> " << std::endl; + exit(1); +} + +[[noreturn]] static void usage_exit() +{ + usage(); + exit(1); +} + +template <typename I, typename T> +static int rados_sistrtoll(I &i, T *val) { + std::string err; + *val = strict_iecstrtoll(i->second.c_str(), &err); + if (err != "") { + cerr << "Invalid value for " << i->first << ": " << err << std::endl; + return -EINVAL; + } else { + return 0; + } +} + +class EstimateDedupRatio; +class ChunkScrub; 
+class EstimateThread : public Thread +{ + IoCtx io_ctx; + int n; + int m; + ObjectCursor begin; + ObjectCursor end; + Mutex m_lock; + Cond m_cond; + int32_t timeout; + bool m_stop = false; + uint64_t total_bytes = 0; + uint64_t examined_objects = 0; + uint64_t total_objects = 0; +#define COND_WAIT_INTERVAL 10 + +public: + EstimateThread(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end, int32_t timeout): + io_ctx(io_ctx), n(n), m(m), begin(begin), end(end), m_lock("EstimateThread::Locker"), timeout(timeout) + {} + void signal(int signum) { + Mutex::Locker l(m_lock); + m_stop = true; + m_cond.Signal(); + } + virtual void print_status(Formatter *f, ostream &out) = 0; + uint64_t count_objects(IoCtx &ioctx, ObjectCursor &begin, ObjectCursor &end); + uint64_t get_examined_objects() { return examined_objects; } + uint64_t get_total_bytes() { return total_bytes; } + uint64_t get_total_objects() { return total_objects; } + friend class EstimateDedupRatio; + friend class ChunkScrub; +}; + +class EstimateDedupRatio : public EstimateThread +{ + string chunk_algo; + string fp_algo; + uint64_t chunk_size; + map< string, pair <uint64_t, uint64_t> > local_chunk_statistics; // < key, <count, chunk_size> > + +public: + EstimateDedupRatio(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end, + string chunk_algo, string fp_algo, uint64_t chunk_size, int32_t timeout): + EstimateThread(io_ctx, n, m, begin, end, timeout), chunk_algo(chunk_algo), fp_algo(fp_algo), + chunk_size(chunk_size) { } + + void* entry() { + count_objects(io_ctx, begin, end); + estimate_dedup_ratio(); + return NULL; + } + void estimate_dedup_ratio(); + void print_status(Formatter *f, ostream &out); + map< string, pair <uint64_t, uint64_t> > &get_chunk_statistics() { return local_chunk_statistics; } + uint64_t fixed_chunk(string oid, uint64_t offset); +}; + +class ChunkScrub: public EstimateThread +{ + IoCtx chunk_io_ctx; + int fixed_objects = 0; + +public: + ChunkScrub(IoCtx& io_ctx, 
int n, int m, ObjectCursor begin, ObjectCursor end, + IoCtx& chunk_io_ctx, int32_t timeout): + EstimateThread(io_ctx, n, m, begin, end, timeout), chunk_io_ctx(chunk_io_ctx) + { } + void* entry() { + count_objects(chunk_io_ctx, begin, end); + chunk_scrub_common(); + return NULL; + } + void chunk_scrub_common(); + int get_fixed_objects() { return fixed_objects; } + void print_status(Formatter *f, ostream &out); +}; + +vector<std::unique_ptr<EstimateThread>> estimate_threads; + +uint64_t EstimateThread::count_objects(IoCtx &ioctx, ObjectCursor &begin, ObjectCursor &end) +{ + ObjectCursor shard_start; + ObjectCursor shard_end; + uint64_t count = 0; + + ioctx.object_list_slice( + begin, + end, + n, + m, + &shard_start, + &shard_end); + + ObjectCursor c(shard_start); + while(c < shard_end) + { + std::vector<ObjectItem> result; + int r = ioctx.object_list(c, shard_end, 12, {}, &result, &c); + if (r < 0 ) { + cerr << "error object_list : " << cpp_strerror(r) << std::endl; + return 0; + } + count += result.size(); + total_objects += result.size(); + } + return count; +} + +static void print_dedup_estimate(bool debug = false) +{ + uint64_t total_size = 0; + uint64_t dedup_size = 0; + uint64_t examined_objects = 0; + uint64_t total_objects = 0; + EstimateDedupRatio *ratio = NULL; + for (auto &et : estimate_threads) { + Mutex::Locker l(glock); + ratio = dynamic_cast<EstimateDedupRatio*>(et.get()); + assert(ratio); + for (auto p : ratio->get_chunk_statistics()) { + auto c = chunk_statistics.find(p.first); + if (c != chunk_statistics.end()) { + c->second.first += p.second.first; + } else { + chunk_statistics.insert(p); + } + } + } + + if (debug) { + for (auto p : chunk_statistics) { + cout << " -- " << std::endl; + cout << " key: " << p.first << std::endl; + cout << " count: " << p.second.first << std::endl; + cout << " chunk_size: " << p.second.second << std::endl; + dedup_size += p.second.second; + cout << " -- " << std::endl; + } + } else { + for (auto p : chunk_statistics) { 
+ dedup_size += p.second.second; + } + + } + + for (auto &et : estimate_threads) { + total_size += et->get_total_bytes(); + examined_objects += et->get_examined_objects(); + total_objects += et->get_total_objects(); + } + + cout << " result: " << total_size << " | " << dedup_size << " (total size | deduped size) " << std::endl; + cout << " Dedup ratio: " << (100 - (double)(dedup_size)/total_size*100) << " % " << std::endl; + cout << " Examined objects: " << examined_objects << std::endl; + cout << " Total objects: " << total_objects << std::endl; +} + +static void handle_signal(int signum) +{ + Mutex::Locker l(glock); + for (auto &p : estimate_threads) { + p->signal(signum); + } +} + +void EstimateDedupRatio::print_status(Formatter *f, ostream &out) +{ + if (f) { + f->open_array_section("estimate_dedup_ratio"); + f->dump_string("PID", stringify(get_pid())); + for (auto p : local_chunk_statistics) { + f->open_object_section("fingerprint object"); + f->dump_string("fingperint", p.first); + f->dump_string("count", stringify(p.second.first)); + f->dump_string("chunk_size", stringify(p.second.second)); + } + f->close_section(); + f->open_object_section("Status"); + f->dump_string("Total bytes", stringify(total_bytes)); + f->dump_string("Examined objectes", stringify(examined_objects)); + f->close_section(); + f->flush(out); + cout << std::endl; + } +} + +void EstimateDedupRatio::estimate_dedup_ratio() +{ + ObjectCursor shard_start; + ObjectCursor shard_end; + utime_t cur_time = ceph_clock_now(); + + io_ctx.object_list_slice( + begin, + end, + n, + m, + &shard_start, + &shard_end); + + ObjectCursor c(shard_start); + while(c < shard_end) + { + std::vector<ObjectItem> result; + int r = io_ctx.object_list(c, shard_end, 12, {}, &result, &c); + if (r < 0 ){ + cerr << "error object_list : " << cpp_strerror(r) << std::endl; + return; + } + + for (const auto & i : result) { + const auto &oid = i.oid; + uint64_t offset = 0; + while (true) { + Mutex::Locker l(m_lock); + if 
(m_stop) { + Formatter *formatter = Formatter::create("json-pretty"); + print_status(formatter, cout); + delete formatter; + return; + } + + uint64_t next_offset; + if (chunk_algo == "fixed") { + next_offset = fixed_chunk(oid, offset); + } else { + // CDC .. + ceph_assert(0 == "no support chunk algorithm"); + } + + if (!next_offset) { + break; + } + offset += next_offset; + m_cond.WaitInterval(m_lock,utime_t(0, COND_WAIT_INTERVAL)); + if (cur_time + utime_t(timeout, 0) < ceph_clock_now()) { + Formatter *formatter = Formatter::create("json-pretty"); + print_status(formatter, cout); + delete formatter; + cur_time = ceph_clock_now(); + } + } + examined_objects++; + } + } +} + +uint64_t EstimateDedupRatio::fixed_chunk(string oid, uint64_t offset) +{ + unsigned op_size = default_op_size; + int ret; + bufferlist outdata; + ret = io_ctx.read(oid, outdata, op_size, offset); + if (ret <= 0) { + return 0; + } + + if (fp_algo == "sha1") { + uint64_t c_offset = 0; + while (c_offset < outdata.length()) { + bufferlist chunk; + if (outdata.length() - c_offset > chunk_size) { + bufferptr bptr(chunk_size); + chunk.push_back(std::move(bptr)); + chunk.copy_in(0, chunk_size, outdata.c_str()); + } else { + bufferptr bptr(outdata.length() - c_offset); + chunk.push_back(std::move(bptr)); + chunk.copy_in(0, outdata.length() - c_offset, outdata.c_str()); + } + sha1_digest_t sha1_val = chunk.sha1(); + string fp = sha1_val.to_str(); + auto p = local_chunk_statistics.find(fp); + if (p != local_chunk_statistics.end()) { + uint64_t count = p->second.first; + count++; + local_chunk_statistics[fp] = make_pair(count, chunk.length()); + } else { + local_chunk_statistics[fp] = make_pair(1, chunk.length()); + } + total_bytes += chunk.length(); + c_offset = c_offset + chunk_size; + } + } else { + ceph_assert(0 == "no support fingerperint algorithm"); + } + + if (outdata.length() < op_size) { + return 0; + } + return outdata.length(); +} + +void ChunkScrub::chunk_scrub_common() +{ + ObjectCursor 
shard_start; + ObjectCursor shard_end; + int ret; + utime_t cur_time = ceph_clock_now(); + + chunk_io_ctx.object_list_slice( + begin, + end, + n, + m, + &shard_start, + &shard_end); + + ObjectCursor c(shard_start); + while(c < shard_end) + { + std::vector<ObjectItem> result; + int r = chunk_io_ctx.object_list(c, shard_end, 12, {}, &result, &c); + if (r < 0 ){ + cerr << "error object_list : " << cpp_strerror(r) << std::endl; + return; + } + + for (const auto & i : result) { + Mutex::Locker l(m_lock); + if (m_stop) { + Formatter *formatter = Formatter::create("json-pretty"); + print_status(formatter, cout); + delete formatter; + return; + } + auto oid = i.oid; + set<hobject_t> refs; + set<hobject_t> real_refs; + ret = cls_chunk_refcount_read(chunk_io_ctx, oid, &refs); + if (ret < 0) { + continue; + } + + for (auto pp : refs) { + ret = cls_chunk_has_chunk(io_ctx, pp.oid.name, oid); + if (ret != -ENOENT) { + real_refs.insert(pp); + } + } + + if (refs.size() != real_refs.size()) { + ObjectWriteOperation op; + cls_chunk_refcount_set(op, real_refs); + ret = chunk_io_ctx.operate(oid, &op); + if (ret < 0) { + continue; + } + fixed_objects++; + } + examined_objects++; + m_cond.WaitInterval(m_lock,utime_t(0, COND_WAIT_INTERVAL)); + if (cur_time + utime_t(timeout, 0) < ceph_clock_now()) { + Formatter *formatter = Formatter::create("json-pretty"); + print_status(formatter, cout); + delete formatter; + cur_time = ceph_clock_now(); + } + } + } +} + +void ChunkScrub::print_status(Formatter *f, ostream &out) +{ + if (f) { + f->open_array_section("chunk_scrub"); + f->dump_string("PID", stringify(get_pid())); + f->open_object_section("Status"); + f->dump_string("Total object", stringify(total_objects)); + f->dump_string("Examined objectes", stringify(examined_objects)); + f->dump_string("Fixed objectes", stringify(fixed_objects)); + f->close_section(); + f->flush(out); + cout << std::endl; + } +} + +int estimate_dedup_ratio(const std::map < std::string, std::string > &opts, + 
std::vector<const char*> &nargs) +{ + Rados rados; + IoCtx io_ctx; + std::string chunk_algo; + string fp_algo; + string pool_name; + uint64_t chunk_size = 0; + unsigned max_thread = default_max_thread; + uint32_t report_period = default_report_period; + int ret; + std::map<std::string, std::string>::const_iterator i; + bool debug = false; + ObjectCursor begin; + ObjectCursor end; + + i = opts.find("pool"); + if (i != opts.end()) { + pool_name = i->second.c_str(); + } + i = opts.find("chunk-algorithm"); + if (i != opts.end()) { + chunk_algo = i->second.c_str(); + if (chunk_algo != "fixed") { + usage_exit(); + } + } else { + usage_exit(); + } + + i = opts.find("fingerprint-algorithm"); + if (i != opts.end()) { + fp_algo = i->second.c_str(); + if (fp_algo != "sha1") { + usage_exit(); + } + } else { + usage_exit(); + } + + i = opts.find("chunk-size"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &chunk_size)) { + return -EINVAL; + } + } else { + usage_exit(); + } + + i = opts.find("max-thread"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_thread)) { + return -EINVAL; + } + } + + i = opts.find("report-period"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &report_period)) { + return -EINVAL; + } + } + i = opts.find("debug"); + if (i != opts.end()) { + debug = true; + } + + i = opts.find("pgid"); + boost::optional<pg_t> pgid(i != opts.end(), pg_t()); + + ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + goto out; + } + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + ret = -1; + goto out; + } + if (pool_name.empty()) { + cerr << "--create-pool requested but pool_name was not specified!" 
<< std::endl; + usage_exit(); + } + ret = rados.ioctx_create(pool_name.c_str(), io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + + glock.Lock(); + begin = io_ctx.object_list_begin(); + end = io_ctx.object_list_end(); + for (unsigned i = 0; i < max_thread; i++) { + std::unique_ptr<EstimateThread> ptr (new EstimateDedupRatio(io_ctx, i, max_thread, begin, end, + chunk_algo, fp_algo, chunk_size, + report_period)); + ptr->create("estimate_thread"); + estimate_threads.push_back(move(ptr)); + } + glock.Unlock(); + + for (auto &p : estimate_threads) { + p->join(); + } + + print_dedup_estimate(debug); + + out: + return (ret < 0) ? 1 : 0; +} + +static void print_chunk_scrub() +{ + uint64_t total_objects = 0; + uint64_t examined_objects = 0; + int fixed_objects = 0; + + for (auto &et : estimate_threads) { + total_objects += et->get_total_objects(); + examined_objects += et->get_examined_objects(); + ChunkScrub *ptr = static_cast<ChunkScrub*>(et.get()); + fixed_objects += ptr->get_fixed_objects(); + } + + cout << " Total object : " << total_objects << std::endl; + cout << " Examined object : " << examined_objects << std::endl; + cout << " Fixed object : " << fixed_objects << std::endl; +} + +int chunk_scrub_common(const std::map < std::string, std::string > &opts, + std::vector<const char*> &nargs) +{ + Rados rados; + IoCtx io_ctx, chunk_io_ctx; + std::string object_name, target_object_name; + string pool_name, chunk_pool_name, op_name; + int ret; + unsigned max_thread = default_max_thread; + std::map<std::string, std::string>::const_iterator i; + uint32_t report_period = default_report_period; + ObjectCursor begin; + ObjectCursor end; + + i = opts.find("pool"); + if (i != opts.end()) { + pool_name = i->second.c_str(); + } else { + usage_exit(); + } + i = opts.find("op_name"); + if (i != opts.end()) { + op_name= i->second.c_str(); + } else { + usage_exit(); + } + + i = 
opts.find("chunk-pool"); + if (i != opts.end()) { + chunk_pool_name = i->second.c_str(); + } else { + usage_exit(); + } + i = opts.find("max-thread"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_thread)) { + return -EINVAL; + } + } + i = opts.find("report-period"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &report_period)) { + return -EINVAL; + } + } + i = opts.find("pgid"); + boost::optional<pg_t> pgid(i != opts.end(), pg_t()); + + ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + goto out; + } + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + ret = -1; + goto out; + } + if (pool_name.empty()) { + cerr << "--create-pool requested but pool_name was not specified!" << std::endl; + usage_exit(); + } + ret = rados.ioctx_create(pool_name.c_str(), io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = rados.ioctx_create(chunk_pool_name.c_str(), chunk_io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << chunk_pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + + if (op_name == "add_chunk_ref") { + string target_object_name; + i = opts.find("object"); + if (i != opts.end()) { + object_name = i->second.c_str(); + } else { + usage_exit(); + } + i = opts.find("target-ref"); + if (i != opts.end()) { + target_object_name = i->second.c_str(); + } else { + usage_exit(); + } + + set<hobject_t> refs; + ret = cls_chunk_refcount_read(chunk_io_ctx, object_name, &refs); + if (ret < 0) { + cerr << " cls_chunk_refcount_read fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + for (auto p : refs) { + cout << " " << p.oid.name << " "; + } + + uint32_t hash; + ret = chunk_io_ctx.get_object_hash_position2(object_name, &hash); + if (ret < 0) { + return ret; + } + hobject_t 
oid(sobject_t(target_object_name, CEPH_NOSNAP), "", hash, -1, ""); + refs.insert(oid); + + ObjectWriteOperation op; + cls_chunk_refcount_set(op, refs); + ret = chunk_io_ctx.operate(object_name, &op); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + } + + return ret; + + } else if (op_name == "get_chunk_ref") { + i = opts.find("object"); + if (i != opts.end()) { + object_name = i->second.c_str(); + } else { + usage_exit(); + } + set<hobject_t> refs; + cout << " refs: " << std::endl; + ret = cls_chunk_refcount_read(chunk_io_ctx, object_name, &refs); + for (auto p : refs) { + cout << " " << p.oid.name << " "; + } + cout << std::endl; + return ret; + } + + glock.Lock(); + begin = io_ctx.object_list_begin(); + end = io_ctx.object_list_end(); + for (unsigned i = 0; i < max_thread; i++) { + std::unique_ptr<EstimateThread> ptr (new ChunkScrub(io_ctx, i, max_thread, begin, end, chunk_io_ctx, + report_period)); + ptr->create("estimate_thread"); + estimate_threads.push_back(move(ptr)); + } + glock.Unlock(); + + for (auto &p : estimate_threads) { + p->join(); + } + + print_chunk_scrub(); + +out: + return (ret < 0) ? 
1 : 0; +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + std::string fn; + string op_name; + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + init_async_signal_handler(); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + std::map < std::string, std::string > opts; + std::string val; + std::vector<const char*>::iterator i; + for (i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "--op", (char*)NULL)) { + opts["op_name"] = val; + op_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--pool", (char*)NULL)) { + opts["pool"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--object", (char*)NULL)) { + opts["object"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--chunk-algorithm", (char*)NULL)) { + opts["chunk-algorithm"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--chunk-size", (char*)NULL)) { + opts["chunk-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--fingerprint-algorithm", (char*)NULL)) { + opts["fingerprint-algorithm"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--chunk-pool", (char*)NULL)) { + opts["chunk-pool"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--target-ref", (char*)NULL)) { + opts["target-ref"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-thread", (char*)NULL)) { + opts["max-thread"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--report-period", (char*)NULL)) { + opts["report-period"] = val; + } else if (ceph_argparse_flag(args, i, "--debug", 
(char*)NULL)) { + opts["debug"] = "true"; + } else { + if (val[0] == '-') + usage_exit(); + ++i; + } + } + + if (op_name == "estimate") { + return estimate_dedup_ratio(opts, args); + } else if (op_name == "chunk_scrub") { + return chunk_scrub_common(opts, args); + } else if (op_name == "add_chunk_ref") { + return chunk_scrub_common(opts, args); + } else if (op_name == "get_chunk_ref") { + return chunk_scrub_common(opts, args); + } else { + usage(); + exit(0); + } + + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); + + return 0; +} diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc new file mode 100644 index 00000000..4a4f5214 --- /dev/null +++ b/src/tools/ceph_kvstore_tool.cc @@ -0,0 +1,356 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. 
+*/ +#include <map> +#include <set> +#include <string> +#include <fstream> + +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/strtol.h" +#include "common/url_escape.h" + +#include "global/global_context.h" +#include "global/global_init.h" + +#include "kvstore_tool.h" + +void usage(const char *pname) +{ + std::cout << "Usage: " << pname << " <leveldb|rocksdb|bluestore-kv> <store path> command [args...]\n" + << "\n" + << "Commands:\n" + << " list [prefix]\n" + << " list-crc [prefix]\n" + << " dump [prefix]\n" + << " exists <prefix> [key]\n" + << " get <prefix> <key> [out <file>]\n" + << " crc <prefix> <key>\n" + << " get-size [<prefix> <key>]\n" + << " set <prefix> <key> [ver <N>|in <file>]\n" + << " rm <prefix> <key>\n" + << " rm-prefix <prefix>\n" + << " store-copy <path> [num-keys-per-tx] [leveldb|rocksdb|...] \n" + << " store-crc <path>\n" + << " compact\n" + << " compact-prefix <prefix>\n" + << " compact-range <prefix> <start> <end>\n" + << " destructive-repair (use only as last resort! 
may corrupt healthy data)\n" + << " stats\n" + << std::endl; +} + +int main(int argc, const char *argv[]) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(argv[0]); + exit(0); + } + + map<string,string> defaults = { + { "debug_rocksdb", "2" } + }; + + auto cct = global_init( + &defaults, args, + CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + ceph_assert((int)args.size() < argc); + for(size_t i=0; i<args.size(); i++) + argv[i+1] = args[i]; + argc = args.size() + 1; + + if (args.size() < 3) { + usage(argv[0]); + return 1; + } + + string type(args[0]); + string path(args[1]); + string cmd(args[2]); + + if (type != "leveldb" && + type != "rocksdb" && + type != "bluestore-kv") { + + std::cerr << "Unrecognized type: " << args[0] << std::endl; + usage(argv[0]); + return 1; + } + + bool need_open_db = (cmd != "destructive-repair"); + bool need_stats = (cmd == "stats"); + StoreTool st(type, path, need_open_db, need_stats); + + if (cmd == "destructive-repair") { + int ret = st.destructive_repair(); + if (!ret) { + std::cout << "destructive-repair completed without reporting an error" + << std::endl; + } else { + std::cout << "destructive-repair failed with " << cpp_strerror(ret) + << std::endl; + } + return ret; + } else if (cmd == "list" || cmd == "list-crc") { + string prefix; + if (argc > 4) + prefix = url_unescape(argv[4]); + + bool do_crc = (cmd == "list-crc"); + st.list(prefix, do_crc, false); + + } else if (cmd == "dump") { + string prefix; + if (argc > 4) + prefix = url_unescape(argv[4]); + st.list(prefix, false, true); + + } else if (cmd == "exists") { + string key; + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + if (argc > 5) + key = url_unescape(argv[5]); + + bool ret = 
st.exists(prefix, key); + std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") " + << (ret ? "exists" : "does not exist") + << std::endl; + return (ret ? 0 : 1); + + } else if (cmd == "get") { + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ")"; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << std::endl; + + if (argc >= 7) { + string subcmd(argv[6]); + if (subcmd != "out") { + std::cerr << "unrecognized subcmd '" << subcmd << "'" + << std::endl; + return 1; + } + if (argc < 8) { + std::cerr << "output path not specified" << std::endl; + return 1; + } + string out(argv[7]); + + if (out.empty()) { + std::cerr << "unspecified out file" << std::endl; + return 1; + } + + int err = bl.write_file(argv[7], 0644); + if (err < 0) { + std::cerr << "error writing value to '" << out << "': " + << cpp_strerror(err) << std::endl; + return 1; + } + } else { + ostringstream os; + bl.hexdump(os); + std::cout << os.str() << std::endl; + } + + } else if (cmd == "crc") { + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") "; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << " crc " << bl.crc32c(0) << std::endl; + + } else if (cmd == "get-size") { + std::cout << "estimated store size: " << st.get_size() << std::endl; + + if (argc < 5) + return 0; + + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool exists = false; + bufferlist bl = st.get(prefix, key, 
exists); + if (!exists) { + std::cerr << "(" << url_escape(prefix) << "," << url_escape(key) + << ") does not exist" << std::endl; + return 1; + } + std::cout << "(" << url_escape(prefix) << "," << url_escape(key) + << ") size " << byte_u_t(bl.length()) << std::endl; + + } else if (cmd == "set") { + if (argc < 8) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + string subcmd(argv[6]); + + bufferlist val; + string errstr; + if (subcmd == "ver") { + version_t v = (version_t) strict_strtoll(argv[7], 10, &errstr); + if (!errstr.empty()) { + std::cerr << "error reading version: " << errstr << std::endl; + return 1; + } + encode(v, val); + } else if (subcmd == "in") { + int ret = val.read_file(argv[7], &errstr); + if (ret < 0 || !errstr.empty()) { + std::cerr << "error reading file: " << errstr << std::endl; + return 1; + } + } else { + std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl; + usage(argv[0]); + return 1; + } + + bool ret = st.set(prefix, key, val); + if (!ret) { + std::cerr << "error setting (" + << url_escape(prefix) << "," << url_escape(key) << ")" << std::endl; + return 1; + } + } else if (cmd == "rm") { + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool ret = st.rm(prefix, key); + if (!ret) { + std::cerr << "error removing (" + << url_escape(prefix) << "," << url_escape(key) << ")" + << std::endl; + return 1; + } + } else if (cmd == "rm-prefix") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + + bool ret = st.rm_prefix(prefix); + if (!ret) { + std::cerr << "error removing prefix (" + << url_escape(prefix) << ")" + << std::endl; + return 1; + } + } else if (cmd == "store-copy") { + int num_keys_per_tx = 128; // magic number that just feels right. 
+ if (argc < 5) { + usage(argv[0]); + return 1; + } else if (argc > 5) { + string err; + num_keys_per_tx = strict_strtol(argv[5], 10, &err); + if (!err.empty()) { + std::cerr << "invalid num_keys_per_tx: " << err << std::endl; + return 1; + } + } + string other_store_type = argv[1]; + if (argc > 6) { + other_store_type = argv[6]; + } + + int ret = st.copy_store_to(argv[1], argv[4], num_keys_per_tx, other_store_type); + if (ret < 0) { + std::cerr << "error copying store to path '" << argv[4] + << "': " << cpp_strerror(ret) << std::endl; + return 1; + } + + } else if (cmd == "store-crc") { + if (argc < 4) { + usage(argv[0]); + return 1; + } + std::ofstream fs(argv[4]); + uint32_t crc = st.traverse(string(), true, false, &fs); + std::cout << "store at '" << argv[4] << "' crc " << crc << std::endl; + + } else if (cmd == "compact") { + st.compact(); + } else if (cmd == "compact-prefix") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + st.compact_prefix(prefix); + } else if (cmd == "compact-range") { + if (argc < 7) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string start(url_unescape(argv[5])); + string end(url_unescape(argv[6])); + st.compact_range(prefix, start, end); + } else if (cmd == "stats") { + st.print_stats(); + } else { + std::cerr << "Unrecognized command: " << cmd << std::endl; + return 1; + } + + return 0; +} diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc new file mode 100644 index 00000000..9ff08f32 --- /dev/null +++ b/src/tools/ceph_monstore_tool.cc @@ -0,0 +1,1297 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. 
See file COPYING. +*/ +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> +#include <boost/scope_exit.hpp> + +#include <stdlib.h> +#include <string> + +#include "common/Formatter.h" +#include "common/errno.h" + +#include "auth/KeyRing.h" +#include "auth/cephx/CephxKeyServer.h" +#include "global/global_init.h" +#include "include/stringify.h" +#include "mgr/mgr_commands.h" +#include "mon/AuthMonitor.h" +#include "mon/MonitorDBStore.h" +#include "mon/Paxos.h" +#include "mon/MonMap.h" +#include "mds/FSMap.h" +#include "mon/MgrMap.h" +#include "osd/OSDMap.h" +#include "crush/CrushCompiler.h" +#include "mon/CreatingPGs.h" + +namespace po = boost::program_options; + +class TraceIter { + int fd; + unsigned idx; + MonitorDBStore::TransactionRef t; +public: + explicit TraceIter(string fname) : fd(-1), idx(-1) { + fd = ::open(fname.c_str(), O_RDONLY); + t.reset(new MonitorDBStore::Transaction); + } + bool valid() { + return fd != -1; + } + MonitorDBStore::TransactionRef cur() { + ceph_assert(valid()); + return t; + } + unsigned num() { return idx; } + void next() { + ++idx; + bufferlist bl; + int r = bl.read_fd(fd, 6); + if (r < 0) { + std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd" + << std::endl; + ::close(fd); + fd = -1; + return; + } else if ((unsigned)r < 6) { + std::cerr << "short read" << std::endl; + ::close(fd); + fd = -1; + return; + } + auto bliter = bl.cbegin(); + uint8_t ver, ver2; + decode(ver, bliter); + decode(ver2, bliter); + uint32_t len; + decode(len, bliter); + r = bl.read_fd(fd, len); + if (r < 0) { + std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd" + << std::endl; + ::close(fd); + fd = -1; + return; + } else if ((unsigned)r < len) { + std::cerr << "short read" << std::endl; + ::close(fd); + fd = -1; + return; + } + bliter = bl.cbegin(); + t.reset(new MonitorDBStore::Transaction); + t->decode(bliter); + } + void init() { + next(); + } + ~TraceIter() { + if (fd != -1) { + 
::close(fd); + fd = -1; + } + } +}; + + +int parse_cmd_args( + po::options_description *desc, /// < visible options description + po::options_description *hidden_desc, /// < hidden options description + po::positional_options_description *positional, /// < positional args + vector<string> &cmd_args, /// < arguments to be parsed + po::variables_map *vm /// > post-parsing variable map + ) +{ + // desc_all will aggregate all visible and hidden options for parsing. + // + // From boost's program_options point of view, there is absolutely no + // distinction between 'desc' and 'hidden_desc'. This is a distinction + // that is only useful to us: 'desc' is whatever we are willing to show + // on 'usage()', whereas 'hidden_desc' refers to parameters we wish to + // take advantage of but do not wish to show on 'usage()'. + // + // For example, consider that program_options matches positional arguments + // (specified via 'positional') against the paramenters defined on a + // given 'po::options_description' class. This is performed below, + // supplying both the description and the positional arguments to the + // parser. However, we do not want the parameters that are mapped to + // positional arguments to be shown on usage, as that makes for ugly and + // confusing usage messages. Therefore we dissociate the options' + // description that is to be used as an aid to the user from those options + // that are nothing but useful for internal purposes (i.e., mapping options + // to positional arguments). We still need to aggregate them before parsing + // and that's what 'desc_all' is all about. + // + + ceph_assert(desc != NULL); + + po::options_description desc_all; + desc_all.add(*desc); + if (hidden_desc != NULL) + desc_all.add(*hidden_desc); + + try { + po::command_line_parser parser = po::command_line_parser(cmd_args). 
+ options(desc_all); + + if (positional) { + parser = parser.positional(*positional); + } + + po::parsed_options parsed = parser.run(); + po::store(parsed, *vm); + po::notify(*vm); + } catch (po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + return -EINVAL; + } + return 0; +} + + +/** + * usage: ceph-monstore-tool <store-path> <command> [options] + * + * commands: + * + * store-copy < --out arg > + * dump-keys + * compact + * getmonmap < --out arg [ --version arg ] > + * getosdmap < --out arg [ --version arg ] > + * dump-paxos <--dump-start VER> <--dump-end VER> + * dump-trace < --trace-file arg > + * replay-trace + * random-gen + * rewrite-crush + * + * wanted syntax: + * + * ceph-monstore-tool PATH CMD [options] + * + * ceph-monstore-tool PATH store-copy <PATH2 | -o PATH2> + * ceph-monstore-tool PATH dump-keys + * ceph-monstore-tool PATH compact + * ceph-monstore-tool PATH get monmap [VER] + * ceph-monstore-tool PATH get osdmap [VER] + * ceph-monstore-tool PATH dump-paxos STARTVER ENDVER + * + * + */ +void usage(const char *n, po::options_description &d) +{ + std::cerr << + "usage: " << n << " <store-path> <cmd> [args|options]\n" + << "\n" + << "Commands:\n" + << " store-copy PATH copies store to PATH\n" + << " compact compacts the store\n" + << " get monmap [-- options] get monmap (version VER if specified)\n" + << " (default: last committed)\n" + << " get osdmap [-- options] get osdmap (version VER if specified)\n" + << " (default: last committed)\n" + << " get mdsmap [-- options] get mdsmap (version VER if specified)\n" + << " (default: last committed)\n" + << " get mgr [-- options] get mgr map (version VER if specified)\n" + << " (default: last committed)\n" + << " get crushmap [-- options] get crushmap (version VER if specified)\n" + << " (default: last committed)\n" + << " show-versions [-- options] show the first&last committed version of map\n" + << " (show-versions -- --help for more info)\n" + << " dump-keys dumps store keys to 
FILE\n" + << " (default: stdout)\n" + << " dump-paxos [-- options] dump paxos transactions\n" + << " (dump-paxos -- --help for more info)\n" + << " dump-trace FILE [-- options] dump contents of trace file FILE\n" + << " (dump-trace -- --help for more info)\n" + << " replay-trace FILE [-- options] replay trace from FILE\n" + << " (replay-trace -- --help for more info)\n" + << " random-gen [-- options] add randomly generated ops to the store\n" + << " (random-gen -- --help for more info)\n" + << " rewrite-crush [-- options] add a rewrite commit to the store\n" + << " (rewrite-crush -- --help for more info)\n" + << " rebuild rebuild store\n" + << " (rebuild -- --help for more info)\n" + << std::endl; + std::cerr << d << std::endl; + std::cerr + << "\nPlease Note:\n" + << "* Ceph-specific options should be in the format --option-name=VAL\n" + << " (specifically, do not forget the '='!!)\n" + << "* Command-specific options need to be passed after a '--'\n" + << " e.g., 'get monmap -- --version 10 --out /tmp/foo'" + << std::endl; +} + +int update_osdmap(MonitorDBStore& store, version_t ver, bool copy, + std::shared_ptr<CrushWrapper> crush, + MonitorDBStore::Transaction* t) { + const string prefix("osdmap"); + + // full + bufferlist bl; + int r = 0; + r = store.get(prefix, store.combine_strings("full", ver), bl); + if (r) { + std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl; + return r; + } + OSDMap osdmap; + osdmap.decode(bl); + osdmap.crush = crush; + if (copy) { + osdmap.inc_epoch(); + } + bl.clear(); + // be consistent with OSDMonitor::update_from_paxos() + osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl); + + // incremental + OSDMap::Incremental inc; + if (copy) { + inc.epoch = osdmap.get_epoch(); + inc.fsid = osdmap.get_fsid(); + } else { + bl.clear(); + r = store.get(prefix, ver, bl); + if (r) { + std::cerr << "Error getting inc map: " << cpp_strerror(r) << 
std::endl; + return r; + } + OSDMap::Incremental inc(bl); + if (inc.crush.length()) { + inc.crush.clear(); + crush->encode(inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + } + if (inc.fullmap.length()) { + OSDMap fullmap; + fullmap.decode(inc.fullmap); + fullmap.crush = crush; + inc.fullmap.clear(); + fullmap.encode(inc.fullmap); + } + } + ceph_assert(osdmap.have_crc()); + inc.full_crc = osdmap.get_crc(); + bl.clear(); + // be consistent with OSDMonitor::update_from_paxos() + inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + t->put(prefix, inc.epoch, bl); + return 0; +} + +int rewrite_transaction(MonitorDBStore& store, int version, + const string& crush_file, + MonitorDBStore::Transaction* t) { + const string prefix("osdmap"); + + // calc the known-good epoch + version_t last_committed = store.get(prefix, "last_committed"); + version_t good_version = 0; + if (version <= 0) { + if (last_committed >= (unsigned)-version) { + good_version = last_committed + version; + } else { + std::cerr << "osdmap-version is less than: -" << last_committed << std::endl; + return EINVAL; + } + } else { + good_version = version; + } + if (good_version >= last_committed) { + std::cout << "good epoch is greater or equal to the last committed one: " + << good_version << " >= " << last_committed << std::endl; + return 0; + } + + // load/extract the crush map + int r = 0; + std::shared_ptr<CrushWrapper> crush(new CrushWrapper); + if (crush_file.empty()) { + bufferlist bl; + r = store.get(prefix, store.combine_strings("full", good_version), bl); + if (r) { + std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; + return r; + } + OSDMap osdmap; + osdmap.decode(bl); + crush = osdmap.crush; + } else { + string err; + bufferlist bl; + r = bl.read_file(crush_file.c_str(), &err); + if (r) { + std::cerr << err << ": " << cpp_strerror(r) << std::endl; + return r; + } + auto p = bl.cbegin(); + crush->decode(p); + } + + // prepare a transaction to rewrite the epochs + // 
(good_version, last_committed] + // with the good crush map. + // XXX: may need to break this into several paxos versions? + ceph_assert(good_version < last_committed); + for (version_t v = good_version + 1; v <= last_committed; v++) { + cout << "rewriting epoch #" << v << "/" << last_committed << std::endl; + r = update_osdmap(store, v, false, crush, t); + if (r) + return r; + } + + // add a new osdmap epoch to store, so monitors will update their current osdmap + // in addition to the ones stored in epochs. + // + // This is needed due to the way the monitor updates from paxos and the + // facilities we are leveraging to push this update to the rest of the + // quorum. + // + // In a nutshell, we are generating a good version of the osdmap, with a + // proper crush, and building a transaction that will replace the bad + // osdmaps with good osdmaps. But this transaction needs to be applied on + // all nodes, so that the monitors will have good osdmaps to share with + // clients. We thus leverage Paxos, specifically the recovery mechanism, by + // creating a pending value that will be committed once the monitors form an + // initial quorum after being brought back to life. + // + // However, the way the monitor works has the paxos services, including the + // OSDMonitor, updating their state from disk *prior* to the recovery phase + // begins (so they have an up to date state in memory). This means the + // OSDMonitor will see the old, broken map, before the new paxos version is + // applied to disk, and the old version is cached. Even though we have the + // good map now, and we share the good map with clients, we will still be + // working on the old broken map. Instead of mucking around the monitor to + // make this work, we instead opt for adding the same osdmap but with a + // newer version, so that the OSDMonitor picks up on it when it updates from + // paxos after the proposal has been committed. 
This is not elegant, but + // avoids further unpleasantness that would arise from kludging around the + // current behavior. Also, has the added benefit of making sure the clients + // get an updated version of the map (because last_committed+1 > + // last_committed) :) + // + cout << "adding a new epoch #" << last_committed+1 << std::endl; + r = update_osdmap(store, last_committed++, true, crush, t); + if (r) + return r; + t->put(prefix, store.combine_strings("full", "latest"), last_committed); + t->put(prefix, "last_committed", last_committed); + return 0; +} + +/** + * create a new paxos version which carries a proposal to rewrite all epochs + * of incremental and full map of "osdmap" after a faulty crush map is injected. + * so the leader will trigger a recovery and propagate this fix to its peons, + * after the proposal is accepted, and the transaction in it is applied. all + * monitors will rewrite the bad crush map with the good one, and have a new + * osdmap epoch with the good crush map in it. + */ +int rewrite_crush(const char* progname, + vector<string>& subcmds, + MonitorDBStore& store) { + po::options_description op_desc("Allowed 'rewrite-crush' options"); + int version = -1; + string crush_file; + op_desc.add_options() + ("help,h", "produce this help message") + ("crush", po::value<string>(&crush_file), + ("path to the crush map file " + "(default: will instead extract it from the known-good osdmap)")) + ("good-epoch", po::value<int>(&version), + "known-good epoch of osdmap, if a negative number '-N' is given, the " + "$last_committed-N is used instead (default: -1). 
" + "Please note, -1 is not necessarily a good epoch, because there are " + "good chance that we have more epochs slipped into the monstore after " + "the one where the crushmap is firstly injected.") + ; + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm); + if (r) { + return -r; + } + if (op_vm.count("help")) { + usage(progname, op_desc); + return 0; + } + + MonitorDBStore::Transaction rewrite_txn; + r = rewrite_transaction(store, version, crush_file, &rewrite_txn); + if (r) { + return r; + } + + // store the transaction into store as a proposal + const string prefix("paxos"); + version_t pending_v = store.get(prefix, "last_committed") + 1; + auto t(std::make_shared<MonitorDBStore::Transaction>()); + bufferlist bl; + rewrite_txn.encode(bl); + cout << "adding pending commit " << pending_v + << " " << bl.length() << " bytes" << std::endl; + t->put(prefix, pending_v, bl); + t->put(prefix, "pending_v", pending_v); + // a large enough yet unique proposal number will probably do the trick + version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1; + t->put(prefix, "pending_pn", pending_pn); + store.apply_transaction(t); + return 0; +} + +static int update_auth(MonitorDBStore& st, const string& keyring_path) +{ + // import all keyrings stored in the keyring file + KeyRing keyring; + int r = keyring.load(g_ceph_context, keyring_path); + if (r < 0) { + cerr << "unable to load admin keyring: " << keyring_path << std::endl; + return r; + } + + bufferlist bl; + __u8 v = 1; + encode(v, bl); + + for (const auto& k : keyring.get_keys()) { + KeyServerData::Incremental auth_inc; + auth_inc.name = k.first; + auth_inc.auth = k.second; + if (auth_inc.auth.caps.empty()) { + cerr << "no caps granted to: " << auth_inc.name << std::endl; + return -EINVAL; + } + auth_inc.op = KeyServerData::AUTH_INC_ADD; + + AuthMonitor::Incremental inc; + inc.inc_type = AuthMonitor::AUTH_DATA; + encode(auth_inc, inc.auth_data); + inc.auth_type 
= CEPH_AUTH_CEPHX; + + inc.encode(bl, CEPH_FEATURES_ALL); + } + + const string prefix("auth"); + auto last_committed = st.get(prefix, "last_committed") + 1; + auto t = make_shared<MonitorDBStore::Transaction>(); + t->put(prefix, last_committed, bl); + t->put(prefix, "last_committed", last_committed); + auto first_committed = st.get(prefix, "first_committed"); + if (!first_committed) { + t->put(prefix, "first_committed", last_committed); + } + st.apply_transaction(t); + return 0; +} + +static int update_mkfs(MonitorDBStore& st, + const string& monmap_path, + const vector<string>& mon_ids) +{ + MonMap monmap; + if (!monmap_path.empty()) { + cout << __func__ << " pulling initial monmap from " << monmap_path << std::endl; + bufferlist bl; + string err; + int r = bl.read_file(monmap_path.c_str(), &err); + if (r < 0) { + cerr << "failed to read monmap from " << monmap_path << ": " + << cpp_strerror(r) << std::endl; + return r; + } + monmap.decode(bl); + } else { + cout << __func__ << " generating seed initial monmap" << std::endl; + int r = monmap.build_initial(g_ceph_context, true, cerr); + if (r) { + cerr << "no initial monitors" << std::endl; + return -EINVAL; + } + vector<string> new_names; + if (!mon_ids.empty()) { + if (mon_ids.size() != monmap.size()) { + cerr << "Please pass the same number of <mon-ids> to name the hosts " + << "listed in 'mon_host'. " + << mon_ids.size() << " mon-id(s) specified, " + << "while you have " << monmap.size() << " mon hosts." 
<< std::endl; + return -EINVAL; + } + new_names = mon_ids; + } else { + for (unsigned rank = 0; rank < monmap.size(); rank++) { + string new_name{"a"}; + new_name[0] += rank; + new_names.push_back(std::move(new_name)); + } + } + for (unsigned rank = 0; rank < monmap.size(); rank++) { + auto name = monmap.get_name(rank); + if (name.compare(0, 7, "noname-") == 0) { + monmap.rename(name, new_names[rank]); + } + } + } + monmap.print(cout); + bufferlist bl; + monmap.encode(bl, CEPH_FEATURES_ALL); + monmap.set_epoch(0); + auto t = make_shared<MonitorDBStore::Transaction>(); + t->put("mkfs", "monmap", bl); + st.apply_transaction(t); + return 0; +} + +static int update_monitor(MonitorDBStore& st) +{ + const string prefix("monitor"); + // a stripped-down Monitor::mkfs() + bufferlist bl; + bl.append(CEPH_MON_ONDISK_MAGIC "\n"); + auto t = make_shared<MonitorDBStore::Transaction>(); + t->put(prefix, "magic", bl); + st.apply_transaction(t); + return 0; +} + +// rebuild +// - creating_pgs +static int update_creating_pgs(MonitorDBStore& st) +{ + bufferlist bl; + auto last_osdmap_epoch = st.get("osdmap", "last_committed"); + int r = st.get("osdmap", st.combine_strings("full", last_osdmap_epoch), bl); + if (r < 0) { + cerr << "unable to losd osdmap e" << last_osdmap_epoch << std::endl; + return r; + } + + OSDMap osdmap; + osdmap.decode(bl); + creating_pgs_t creating; + for (auto& i : osdmap.get_pools()) { + creating.created_pools.insert(i.first); + } + creating.last_scan_epoch = last_osdmap_epoch; + + bufferlist newbl; + ::encode(creating, newbl); + + auto t = make_shared<MonitorDBStore::Transaction>(); + t->put("osd_pg_creating", "creating", newbl); + st.apply_transaction(t); + return 0; +} + +// rebuild +// - mgr +// - mgr_command_desc +static int update_mgrmap(MonitorDBStore& st) +{ + auto t = make_shared<MonitorDBStore::Transaction>(); + + { + MgrMap map; + // mgr expects epoch > 1 + map.epoch++; + auto initial_modules = + 
get_str_vec(g_ceph_context->_conf.get_val<string>("mgr_initial_modules")); + copy(begin(initial_modules), + end(initial_modules), + inserter(map.modules, end(map.modules))); + bufferlist bl; + map.encode(bl, CEPH_FEATURES_ALL); + t->put("mgr", map.epoch, bl); + t->put("mgr", "last_committed", map.epoch); + } + { + auto mgr_command_descs = mgr_commands; + for (auto& c : mgr_command_descs) { + c.set_flag(MonCommand::FLAG_MGR); + } + bufferlist bl; + encode(mgr_command_descs, bl); + t->put("mgr_command_descs", "", bl); + } + return st.apply_transaction(t); +} + +static int update_paxos(MonitorDBStore& st) +{ + // build a pending paxos proposal from all non-permanent k/v pairs. once the + // proposal is committed, it will gets applied. on the sync provider side, it + // will be a no-op, but on its peers, the paxos commit will help to build up + // the necessary epochs. + bufferlist pending_proposal; + { + MonitorDBStore::Transaction t; + vector<string> prefixes = {"auth", "osdmap", + "mgr", "mgr_command_desc"}; + for (const auto& prefix : prefixes) { + for (auto i = st.get_iterator(prefix); i->valid(); i->next()) { + auto key = i->raw_key(); + auto val = i->value(); + t.put(key.first, key.second, val); + } + } + t.encode(pending_proposal); + } + const string prefix("paxos"); + auto t = make_shared<MonitorDBStore::Transaction>(); + t->put(prefix, "first_committed", 0); + t->put(prefix, "last_committed", 0); + auto pending_v = 1; + t->put(prefix, pending_v, pending_proposal); + t->put(prefix, "pending_v", pending_v); + t->put(prefix, "pending_pn", 400); + st.apply_transaction(t); + return 0; +} + +int rebuild_monstore(const char* progname, + vector<string>& subcmds, + MonitorDBStore& st) +{ + po::options_description op_desc("Allowed 'rebuild' options"); + string keyring_path; + string monmap_path; + vector<string> mon_ids; + op_desc.add_options() + ("keyring", po::value<string>(&keyring_path), + "path to the client.admin key") + ("monmap", 
po::value<string>(&monmap_path), + "path to the initial monmap") + ("mon-ids", po::value<vector<string>>(&mon_ids)->multitoken(), + "mon ids, use 'a', 'b', ... if not specified"); + po::positional_options_description pos_desc; + pos_desc.add("mon-ids", -1); + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, nullptr, &pos_desc, subcmds, &op_vm); + if (r) { + return -r; + } + if (op_vm.count("help")) { + usage(progname, op_desc); + return 0; + } + if (!keyring_path.empty()) + update_auth(st, keyring_path); + if ((r = update_creating_pgs(st))) { + return r; + } + if ((r = update_mgrmap(st))) { + return r; + } + if ((r = update_paxos(st))) { + return r; + } + if ((r = update_mkfs(st, monmap_path, mon_ids))) { + return r; + } + if ((r = update_monitor(st))) { + return r; + } + return 0; +} + +int main(int argc, char **argv) { + int err = 0; + po::options_description desc("Allowed options"); + string store_path, cmd; + vector<string> subcmds; + desc.add_options() + ("help,h", "produce help message") + ; + + /* Dear Future Developer: + * + * for further improvement, should you need to pass specific options to + * a command (e.g., get osdmap VER --hex), you can expand the current + * format by creating additional 'po::option_description' and passing + * 'subcmds' to 'po::command_line_parser', much like what is currently + * done by default. However, beware: in order to differentiate a + * command-specific option from the generic/global options, you will need + * to pass '--' in the command line (so that the first parser, the one + * below, assumes it has reached the end of all options); e.g., + * 'get osdmap VER -- --hex'. Not pretty; far from intuitive; it was as + * far as I got with this library. Improvements on this format will be + * left as an excercise for the reader. 
-Joao + */ + po::options_description positional_desc("Positional argument options"); + positional_desc.add_options() + ("store-path", po::value<string>(&store_path), + "path to monitor's store") + ("command", po::value<string>(&cmd), + "Command") + ("subcmd", po::value<vector<string> >(&subcmds), + "Command arguments/Sub-Commands") + ; + po::positional_options_description positional; + positional.add("store-path", 1); + positional.add("command", 1); + positional.add("subcmd", -1); + + po::options_description all_desc("All options"); + all_desc.add(desc).add(positional_desc); + + vector<string> ceph_option_strings; + po::variables_map vm; + try { + po::parsed_options parsed = + po::command_line_parser(argc, argv). + options(all_desc). + positional(positional). + allow_unregistered().run(); + + po::store( + parsed, + vm); + po::notify(vm); + + // Specifying po::include_positional would have our positional arguments + // being collected (thus being part of ceph_option_strings and eventually + // passed on to global_init() below). + // Instead we specify po::exclude_positional, which has the upside of + // completely avoid this, but the downside of having to specify ceph + // options as --VAR=VAL (note the '='); otherwise we will capture the + // positional 'VAL' as belonging to us, never being collected. + ceph_option_strings = po::collect_unrecognized(parsed.options, + po::exclude_positional); + + } catch(po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; + } + + // parse command structure before calling global_init() and friends. 
+ + if (vm.empty() || vm.count("help") || + store_path.empty() || cmd.empty() || + *cmd.begin() == '-') { + usage(argv[0], desc); + return 1; + } + + vector<const char *> ceph_options; + ceph_options.reserve(ceph_option_strings.size()); + for (vector<string>::iterator i = ceph_option_strings.begin(); + i != ceph_option_strings.end(); + ++i) { + ceph_options.push_back(i->c_str()); + } + + auto cct = global_init( + NULL, ceph_options, CEPH_ENTITY_TYPE_MON, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_MON_CONFIG); + common_init_finish(g_ceph_context); + cct->_conf.apply_changes(nullptr); + + // this is where we'll write *whatever*, on a per-command basis. + // not all commands require some place to write their things. + MonitorDBStore st(store_path); + if (store_path.size()) { + stringstream ss; + int r = st.open(ss); + if (r < 0) { + std::cerr << ss.str() << std::endl; + return EINVAL; + } + } + + if (cmd == "dump-keys") { + KeyValueDB::WholeSpaceIterator iter = st.get_iterator(); + while (iter->valid()) { + pair<string,string> key(iter->raw_key()); + cout << key.first << " / " << key.second << std::endl; + iter->next(); + } + } else if (cmd == "compact") { + st.compact(); + } else if (cmd == "get") { + unsigned v = 0; + string outpath; + bool readable = false; + string map_type; + // visible options for this command + po::options_description op_desc("Allowed 'get' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("out,o", po::value<string>(&outpath), + "output file (default: stdout)") + ("version,v", po::value<unsigned>(&v), + "map version to obtain") + ("readable,r", po::value<bool>(&readable)->default_value(false), + "print the map information in human readable format") + ; + // this is going to be a positional argument; we don't want to show + // it as an option during --help, but we do want to have it captured + // when parsing. 
+ po::options_description hidden_op_desc("Hidden 'get' options"); + hidden_op_desc.add_options() + ("map-type", po::value<string>(&map_type), + "map-type") + ; + po::positional_options_description op_positional; + op_positional.add("map-type", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional, + subcmds, &op_vm); + if (r < 0) { + err = -r; + goto done; + } + + if (op_vm.count("help") || map_type.empty()) { + usage(argv[0], op_desc); + err = 0; + goto done; + } + + if (v == 0) { + if (map_type == "crushmap") { + v = st.get("osdmap", "last_committed"); + } else { + v = st.get(map_type, "last_committed"); + } + } + + int fd = STDOUT_FILENO; + if (!outpath.empty()){ + fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666); + if (fd < 0) { + std::cerr << "error opening output file: " + << cpp_strerror(errno) << std::endl; + err = EINVAL; + goto done; + } + } + + BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) { + ::close(fd); + if (r < 0 && fd != STDOUT_FILENO) { + ::remove(outpath.c_str()); + } + } BOOST_SCOPE_EXIT_END + + bufferlist bl; + r = 0; + if (map_type == "osdmap") { + r = st.get(map_type, st.combine_strings("full", v), bl); + } else if (map_type == "crushmap") { + bufferlist tmp; + r = st.get("osdmap", st.combine_strings("full", v), tmp); + if (r >= 0) { + OSDMap osdmap; + osdmap.decode(tmp); + osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } + } else { + r = st.get(map_type, v, bl); + } + if (r < 0) { + std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; + err = EINVAL; + goto done; + } + + if (readable) { + stringstream ss; + bufferlist out; + try { + if (map_type == "monmap") { + MonMap monmap; + monmap.decode(bl); + monmap.print(ss); + } else if (map_type == "osdmap") { + OSDMap osdmap; + osdmap.decode(bl); + osdmap.print(ss); + } else if (map_type == "mdsmap") { + FSMap fs_map; + fs_map.decode(bl); + fs_map.print(ss); + } else if (map_type == "mgr") { + MgrMap mgr_map; + auto p 
= bl.cbegin(); + mgr_map.decode(p); + JSONFormatter f; + f.dump_object("mgrmap", mgr_map); + f.flush(ss); + } else if (map_type == "crushmap") { + CrushWrapper cw; + auto it = bl.cbegin(); + cw.decode(it); + CrushCompiler cc(cw, std::cerr, 0); + cc.decompile(ss); + } else { + std::cerr << "This type of readable map does not exist: " << map_type + << std::endl << "You can only specify[osdmap|monmap|mdsmap" + "|crushmap|mgr]" << std::endl; + } + } catch (const buffer::error &err) { + std::cerr << "Could not decode for human readable output (you may still" + " use non-readable mode). Detail: " << err << std::endl; + } + + out.append(ss); + out.write_fd(fd); + } else { + bl.write_fd(fd); + } + + if (!outpath.empty()) { + std::cout << "wrote " << map_type + << " version " << v << " to " << outpath + << std::endl; + } + } else if (cmd == "show-versions") { + string map_type; //map type:osdmap,monmap... + // visible options for this command + po::options_description op_desc("Allowed 'show-versions' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("map-type", po::value<string>(&map_type), "map_type"); + + po::positional_options_description op_positional; + op_positional.add("map-type", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, NULL, &op_positional, + subcmds, &op_vm); + if (r < 0) { + err = -r; + goto done; + } + + if (op_vm.count("help") || map_type.empty()) { + usage(argv[0], op_desc); + err = 0; + goto done; + } + + unsigned int v_first = 0; + unsigned int v_last = 0; + v_first = st.get(map_type, "first_committed"); + v_last = st.get(map_type, "last_committed"); + + std::cout << "first committed:\t" << v_first << "\n" + << "last committed:\t" << v_last << std::endl; + } else if (cmd == "dump-paxos") { + unsigned dstart = 0; + unsigned dstop = ~0; + po::options_description op_desc("Allowed 'dump-paxos' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("start,s", 
po::value<unsigned>(&dstart), + "starting version (default: 0)") + ("end,e", po::value<unsigned>(&dstop), + "finish version (default: ~0)") + ; + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, NULL, NULL, + subcmds, &op_vm); + if (r < 0) { + err = -r; + goto done; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + err = 0; + goto done; + } + + if (dstart > dstop) { + std::cerr << "error: 'start' version (value: " << dstart << ") " + << " is greater than 'end' version (value: " << dstop << ")" + << std::endl; + err = EINVAL; + goto done; + } + + version_t v = dstart; + for (; v <= dstop; ++v) { + bufferlist bl; + st.get("paxos", v, bl); + if (bl.length() == 0) + break; + cout << "\n--- " << v << " ---" << std::endl; + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + Paxos::decode_append_transaction(tx, bl); + JSONFormatter f(true); + tx->dump(&f); + f.flush(cout); + } + + std::cout << "dumped " << v << " paxos versions" << std::endl; + + } else if (cmd == "dump-trace") { + unsigned dstart = 0; + unsigned dstop = ~0; + string outpath; + + // visible options for this command + po::options_description op_desc("Allowed 'dump-trace' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("start,s", po::value<unsigned>(&dstart), + "starting version (default: 0)") + ("end,e", po::value<unsigned>(&dstop), + "finish version (default: ~0)") + ; + // this is going to be a positional argument; we don't want to show + // it as an option during --help, but we do want to have it captured + // when parsing. 
+ po::options_description hidden_op_desc("Hidden 'dump-trace' options"); + hidden_op_desc.add_options() + ("out,o", po::value<string>(&outpath), + "file to write the dump to") + ; + po::positional_options_description op_positional; + op_positional.add("out", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional, + subcmds, &op_vm); + if (r < 0) { + err = -r; + goto done; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + err = 0; + goto done; + } + + if (outpath.empty()) { + usage(argv[0], op_desc); + err = EINVAL; + goto done; + } + + if (dstart > dstop) { + std::cerr << "error: 'start' version (value: " << dstart << ") " + << " is greater than 'stop' version (value: " << dstop << ")" + << std::endl; + err = EINVAL; + goto done; + } + + TraceIter iter(outpath.c_str()); + iter.init(); + while (true) { + if (!iter.valid()) + break; + if (iter.num() >= dstop) { + break; + } + if (iter.num() >= dstart) { + JSONFormatter f(true); + iter.cur()->dump(&f, false); + f.flush(std::cout); + std::cout << std::endl; + } + iter.next(); + } + std::cerr << "Read up to transaction " << iter.num() << std::endl; + } else if (cmd == "replay-trace") { + string inpath; + unsigned num_replays = 1; + // visible options for this command + po::options_description op_desc("Allowed 'replay-trace' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("num-replays,n", po::value<unsigned>(&num_replays), + "finish version (default: 1)") + ; + // this is going to be a positional argument; we don't want to show + // it as an option during --help, but we do want to have it captured + // when parsing. 
+ po::options_description hidden_op_desc("Hidden 'replay-trace' options"); + hidden_op_desc.add_options() + ("in,i", po::value<string>(&inpath), + "file to write the dump to") + ; + po::positional_options_description op_positional; + op_positional.add("in", 1); + + // op_desc_all will aggregate all visible and hidden options for parsing. + // when we call 'usage()' we just pass 'op_desc', as that's the description + // holding the visible options. + po::options_description op_desc_all; + op_desc_all.add(op_desc).add(hidden_op_desc); + + po::variables_map op_vm; + try { + po::parsed_options op_parsed = po::command_line_parser(subcmds). + options(op_desc_all).positional(op_positional).run(); + po::store(op_parsed, op_vm); + po::notify(op_vm); + } catch (po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + err = EINVAL; + goto done; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + err = 0; + goto done; + } + + if (inpath.empty()) { + usage(argv[0], op_desc); + err = EINVAL; + goto done; + } + + unsigned num = 0; + for (unsigned i = 0; i < num_replays; ++i) { + TraceIter iter(inpath.c_str()); + iter.init(); + while (true) { + if (!iter.valid()) + break; + std::cerr << "Replaying trans num " << num << std::endl; + st.apply_transaction(iter.cur()); + iter.next(); + ++num; + } + std::cerr << "Read up to transaction " << iter.num() << std::endl; + } + } else if (cmd == "random-gen") { + unsigned tsize = 200; + unsigned tvalsize = 1024; + unsigned ntrans = 100; + po::options_description op_desc("Allowed 'random-gen' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("num-keys,k", po::value<unsigned>(&tsize), + "keys to write in each transaction (default: 200)") + ("size,s", po::value<unsigned>(&tvalsize), + "size (in bytes) of the value to write in each key (default: 1024)") + ("ntrans,n", po::value<unsigned>(&ntrans), + "number of transactions to run (default: 100)") + ; + + po::variables_map op_vm; + try { + 
po::parsed_options op_parsed = po::command_line_parser(subcmds). + options(op_desc).run(); + po::store(op_parsed, op_vm); + po::notify(op_vm); + } catch (po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + err = EINVAL; + goto done; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + err = 0; + goto done; + } + + unsigned num = 0; + for (unsigned i = 0; i < ntrans; ++i) { + std::cerr << "Applying trans " << i << std::endl; + auto t(std::make_shared<MonitorDBStore::Transaction>()); + string prefix; + prefix.push_back((i%26)+'a'); + for (unsigned j = 0; j < tsize; ++j) { + stringstream os; + os << num; + bufferlist bl; + for (unsigned k = 0; k < tvalsize; ++k) bl.append(rand()); + t->put(prefix, os.str(), bl); + ++num; + } + t->compact_prefix(prefix); + st.apply_transaction(t); + } + } else if (cmd == "store-copy") { + if (subcmds.size() < 1 || subcmds[0].empty()) { + usage(argv[0], desc); + err = EINVAL; + goto done; + } + + string out_path = subcmds[0]; + + MonitorDBStore out_store(out_path); + { + stringstream ss; + int r = out_store.create_and_open(ss); + if (r < 0) { + std::cerr << ss.str() << std::endl; + goto done; + } + } + + + KeyValueDB::WholeSpaceIterator it = st.get_iterator(); + uint64_t total_keys = 0; + uint64_t total_size = 0; + uint64_t total_tx = 0; + + do { + uint64_t num_keys = 0; + + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + + while (it->valid() && num_keys < 128) { + pair<string,string> k = it->raw_key(); + bufferlist v = it->value(); + tx->put(k.first, k.second, v); + + num_keys ++; + total_tx ++; + total_size += v.length(); + + it->next(); + } + + total_keys += num_keys; + + if (!tx->empty()) + out_store.apply_transaction(tx); + + std::cout << "copied " << total_keys << " keys so far (" + << stringify(byte_u_t(total_size)) << ")" << std::endl; + + } while (it->valid()); + out_store.close(); + std::cout << "summary: copied " << total_keys << " keys, using " + << total_tx << " transactions, 
totalling " + << stringify(byte_u_t(total_size)) << std::endl; + std::cout << "from '" << store_path << "' to '" << out_path << "'" + << std::endl; + } else if (cmd == "rewrite-crush") { + err = rewrite_crush(argv[0], subcmds, st); + } else if (cmd == "rebuild") { + err = rebuild_monstore(argv[0], subcmds, st); + } else { + std::cerr << "Unrecognized command: " << cmd << std::endl; + usage(argv[0], desc); + goto done; + } + + done: + st.close(); + return err; +} diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc new file mode 100644 index 00000000..9ae5750c --- /dev/null +++ b/src/tools/ceph_objectstore_tool.cc @@ -0,0 +1,4249 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> +#include <boost/scoped_ptr.hpp> +#include <boost/optional.hpp> + +#include <stdlib.h> + +#include "common/Formatter.h" +#include "common/errno.h" +#include "common/ceph_argparse.h" + +#include "global/global_init.h" + +#include "os/ObjectStore.h" +#include "os/filestore/FileJournal.h" +#include "os/filestore/FileStore.h" +#ifdef HAVE_LIBFUSE +#include "os/FuseStore.h" +#endif + +#include "osd/PGLog.h" +#include "osd/OSD.h" +#include "osd/PG.h" +#include "osd/ECUtil.h" + +#include "json_spirit/json_spirit_value.h" +#include "json_spirit/json_spirit_reader.h" + +#include "rebuild_mondb.h" +#include "ceph_objectstore_tool.h" +#include "include/compat.h" +#include "include/util.h" + +namespace po = boost::program_options; + +#ifdef INTERNAL_TEST +CompatSet get_test_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); +#ifdef INTERNAL_TEST2 + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); +#endif + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} +#endif + +const ssize_t 
max_read = 1024 * 1024; +const int fd_none = INT_MIN; +bool outistty; +bool dry_run; + +struct action_on_object_t { + virtual ~action_on_object_t() {} + virtual void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) = 0; +}; + +int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug) +{ + auto ch = store->open_collection(coll); + unsigned LIST_AT_A_TIME = 100; + ghobject_t next; + while (!next.is_max()) { + vector<ghobject_t> list; + int r = store->collection_list(ch, + next, + ghobject_t::get_max(), + LIST_AT_A_TIME, + &list, + &next); + if (r < 0) { + cerr << "Error listing collection: " << coll << ", " + << cpp_strerror(r) << std::endl; + return r; + } + for (vector<ghobject_t>::iterator obj = list.begin(); + obj != list.end(); + ++obj) { + if (obj->is_pgmeta()) + continue; + object_info_t oi; + if (coll != coll_t::meta()) { + bufferlist attr; + r = store->getattr(ch, *obj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", " + << cpp_strerror(r) << std::endl; + } else { + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) 
{ + r = -EINVAL; + cerr << "Error decoding attr on : " << make_pair(coll, *obj) << ", " + << cpp_strerror(r) << std::endl; + } + } + } + action.call(store, coll, *obj, oi); + } + } + return 0; +} + +int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug) +{ + spg_t pgid; + // Scan collections in case this is an ec pool but no shard specified + unsigned scanned = 0; + int r = 0; + vector<coll_t> colls_to_check; + vector<coll_t> candidates; + r = store->list_collections(candidates); + if (r < 0) { + cerr << "Error listing collections: " << cpp_strerror(r) << std::endl; + return r; + } + pgid.parse(pgidstr.c_str()); + for (vector<coll_t>::iterator i = candidates.begin(); + i != candidates.end(); + ++i) { + spg_t cand_pgid; + if (!i->is_pg(&cand_pgid)) + continue; + + // If an exact match or treat no shard as any shard + if (cand_pgid == pgid || + (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) { + colls_to_check.push_back(*i); + } + } + + if (debug) + cerr << colls_to_check.size() << " pgs to scan" << std::endl; + for (vector<coll_t>::iterator i = colls_to_check.begin(); + i != colls_to_check.end(); + ++i, ++scanned) { + if (debug) + cerr << "Scanning " << *i << ", " << scanned << "/" + << colls_to_check.size() << " completed" << std::endl; + r = _action_on_all_objects_in_pg(store, *i, action, debug); + if (r < 0) + break; + } + return r; +} + +int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug) +{ + int r = _action_on_all_objects_in_pg(store, coll, action, debug); + return r; +} + +int _action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug) +{ + unsigned scanned = 0; + int r = 0; + vector<coll_t> colls_to_check; + vector<coll_t> candidates; + r = store->list_collections(candidates); + if (r < 0) { + cerr << "Error listing collections: " << cpp_strerror(r) << std::endl; + return r; + } + for (vector<coll_t>::iterator i = 
candidates.begin(); + i != candidates.end(); + ++i) { + if (i->is_pg()) { + colls_to_check.push_back(*i); + } + } + + if (debug) + cerr << colls_to_check.size() << " pgs to scan" << std::endl; + for (vector<coll_t>::iterator i = colls_to_check.begin(); + i != colls_to_check.end(); + ++i, ++scanned) { + if (debug) + cerr << "Scanning " << *i << ", " << scanned << "/" + << colls_to_check.size() << " completed" << std::endl; + r = _action_on_all_objects_in_pg(store, *i, action, debug); + if (r < 0) + return r; + } + return 0; +} + +int action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug) +{ + int r = _action_on_all_objects(store, action, debug); + return r; +} + +struct pgid_object_list { + list<pair<coll_t, ghobject_t> > _objects; + + void insert(coll_t coll, ghobject_t &ghobj) { + _objects.push_back(make_pair(coll, ghobj)); + } + + void dump(Formatter *f, bool human_readable) const { + if (!human_readable) + f->open_array_section("pgid_objects"); + for (list<pair<coll_t, ghobject_t> >::const_iterator i = _objects.begin(); + i != _objects.end(); + ++i) { + f->open_array_section("pgid_object"); + spg_t pgid; + bool is_pg = i->first.is_pg(&pgid); + if (is_pg) + f->dump_string("pgid", stringify(pgid)); + if (!is_pg || !human_readable) + f->dump_string("coll", i->first.to_str()); + f->open_object_section("ghobject"); + i->second.dump(f); + f->close_section(); + f->close_section(); + if (human_readable) { + f->flush(cout); + cout << std::endl; + } + } + if (!human_readable) { + f->close_section(); + f->flush(cout); + cout << std::endl; + } + } +}; + +struct lookup_ghobject : public action_on_object_t { + pgid_object_list _objects; + const string _name; + const boost::optional<std::string> _namespace; + bool _need_snapset; + + lookup_ghobject(const string& name, const boost::optional<std::string>& nspace, bool need_snapset = false) : _name(name), + _namespace(nspace), _need_snapset(need_snapset) { } + + void call(ObjectStore *store, coll_t 
coll, ghobject_t &ghobj, object_info_t &oi) override { + if (_need_snapset && !ghobj.hobj.has_snapset()) + return; + if ((_name.length() == 0 || ghobj.hobj.oid.name == _name) && + (!_namespace || ghobj.hobj.nspace == _namespace)) + _objects.insert(coll, ghobj); + return; + } + + int size() const { + return _objects._objects.size(); + } + + pair<coll_t, ghobject_t> pop() { + pair<coll_t, ghobject_t> front = _objects._objects.front(); + _objects._objects.pop_front(); + return front; + } + + void dump(Formatter *f, bool human_readable) const { + _objects.dump(f, human_readable); + } +}; + +int file_fd = fd_none; +bool debug; +bool force = false; +super_header sh; + +static int get_fd_data(int fd, bufferlist &bl) +{ + uint64_t total = 0; + do { + ssize_t bytes = bl.read_fd(fd, max_read); + if (bytes < 0) { + cerr << "read_fd error " << cpp_strerror(bytes) << std::endl; + return bytes; + } + + if (bytes == 0) + break; + + total += bytes; + } while(true); + + ceph_assert(bl.length() == total); + return 0; +} + +int get_log(ObjectStore *fs, __u8 struct_ver, + spg_t pgid, const pg_info_t &info, + PGLog::IndexedLog &log, pg_missing_t &missing) +{ + try { + auto ch = fs->open_collection(coll_t(pgid)); + if (!ch) { + return -ENOENT; + } + ostringstream oss; + ceph_assert(struct_ver > 0); + PGLog::read_log_and_missing( + fs, ch, + pgid.make_pgmeta_oid(), + info, log, missing, + oss, + g_ceph_context->_conf->osd_ignore_stale_divergent_priors); + if (debug && oss.str().size()) + cerr << oss.str() << std::endl; + } + catch (const buffer::error &e) { + cerr << "read_log_and_missing threw exception error " << e.what() << std::endl; + return -EFAULT; + } + return 0; +} + +void dump_log(Formatter *formatter, ostream &out, pg_log_t &log, + pg_missing_t &missing) +{ + formatter->open_object_section("op_log"); + formatter->open_object_section("pg_log_t"); + log.dump(formatter); + formatter->close_section(); + formatter->flush(out); + formatter->open_object_section("pg_missing_t"); + 
missing.dump(formatter); + formatter->close_section(); + formatter->close_section(); + formatter->flush(out); +} + +//Based on part of OSD::load_pgs() +int finish_remove_pgs(ObjectStore *store) +{ + vector<coll_t> ls; + int r = store->list_collections(ls); + if (r < 0) { + cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r) + << std::endl; + return r; + } + + for (vector<coll_t>::iterator it = ls.begin(); + it != ls.end(); + ++it) { + spg_t pgid; + + if (it->is_temp(&pgid) || + (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) { + cout << "finish_remove_pgs " << *it << " removing " << pgid << std::endl; + OSD::recursive_remove_collection(g_ceph_context, store, pgid, *it); + continue; + } + + //cout << "finish_remove_pgs ignoring unrecognized " << *it << std::endl; + } + return 0; +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t) +{ + pg_info_t info(pgid); + coll_t coll(pgid); + ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid()); + + epoch_t map_epoch = 0; + int r = PG::peek_map_epoch(fs, pgid, &map_epoch); + if (r < 0) + cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl; + PastIntervals past_intervals; + __u8 struct_v; + r = PG::read_info(fs, pgid, coll, info, past_intervals, struct_v); + if (r < 0) { + cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl; + return r; + } + ceph_assert(struct_v >= 8); + // new omap key + cout << "setting '_remove' omap key" << std::endl; + map<string,bufferlist> values; + encode((char)1, values["_remove"]); + t->omap_setkeys(coll, pgmeta_oid, values); + return 0; +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +template<typename Func> +void wait_until_done(ObjectStore::Transaction* txn, Func&& func) +{ + bool finished = false; + std::condition_variable cond; + 
std::mutex m; + txn->register_on_complete(make_lambda_context([&]() { + std::unique_lock lock{m}; + finished = true; + cond.notify_one(); + })); + std::move(func)(); + std::unique_lock lock{m}; + cond.wait(lock, [&] {return finished;}); +} + +int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) +{ + if (!dry_run) + finish_remove_pgs(store); + if (!store->collection_exists(coll_t(r_pgid))) + return -ENOENT; + + cout << " marking collection for removal" << std::endl; + if (dry_run) + return 0; + ObjectStore::Transaction rmt; + int r = mark_pg_for_removal(store, r_pgid, &rmt); + if (r < 0) { + return r; + } + ObjectStore::CollectionHandle ch = store->open_collection(coll_t(r_pgid)); + store->queue_transaction(ch, std::move(rmt)); + finish_remove_pgs(store); + return r; +} + +int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, + PastIntervals &past_intervals) +{ + //Empty for this + coll_t coll(info.pgid); + ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid()); + map<string,bufferlist> km; + pg_info_t last_written_info; + int ret = PG::_prepare_write_info( + g_ceph_context, + &km, epoch, + info, + last_written_info, + past_intervals, + true, true, false); + if (ret) cerr << "Failed to write info" << std::endl; + t.omap_setkeys(coll, pgmeta_oid, km); + return ret; +} + +typedef map<eversion_t, hobject_t> divergent_priors_t; + +int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, + pg_log_t &log, PastIntervals &past_intervals, + divergent_priors_t &divergent, + pg_missing_t &missing) +{ + cout << __func__ << " epoch " << epoch << " info " << info << std::endl; + int ret = write_info(t, epoch, info, past_intervals); + if (ret) + return ret; + coll_t coll(info.pgid); + map<string,bufferlist> km; + + if (!divergent.empty()) { + ceph_assert(missing.get_items().empty()); + PGLog::write_log_and_missing_wo_missing( + t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true); + } else { + pg_missing_tracker_t 
tmissing(missing); + bool rebuilt_missing_set_with_deletes = missing.may_include_deletes; + PGLog::write_log_and_missing( + t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true, + &rebuilt_missing_set_with_deletes); + } + t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km); + return 0; +} + +int do_trim_pg_log(ObjectStore *store, const coll_t &coll, + pg_info_t &info, const spg_t &pgid, + epoch_t map_epoch, + PastIntervals &past_intervals) +{ + ghobject_t oid = pgid.make_pgmeta_oid(); + struct stat st; + auto ch = store->open_collection(coll); + int r = store->stat(ch, oid, &st); + ceph_assert(r == 0); + ceph_assert(st.st_size == 0); + + cerr << "Log bounds are: " << "(" << info.log_tail << "," + << info.last_update << "]" << std::endl; + + uint64_t max_entries = g_ceph_context->_conf->osd_max_pg_log_entries; + if (info.last_update.version - info.log_tail.version <= max_entries) { + cerr << "Log not larger than osd_max_pg_log_entries " << max_entries << std::endl; + return 0; + } + + ceph_assert(info.last_update.version > max_entries); + version_t trim_to = info.last_update.version - max_entries; + size_t trim_at_once = g_ceph_context->_conf->osd_pg_log_trim_max; + eversion_t new_tail; + bool done = false; + + while (!done) { + // gather keys so we can delete them in a batch without + // affecting the iterator + set<string> keys_to_trim; + { + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid); + if (!p) + break; + for (p->seek_to_first(); p->valid(); p->next()) { + if (p->key()[0] == '_') + continue; + if (p->key() == "can_rollback_to") + continue; + if (p->key() == "divergent_priors") + continue; + if (p->key() == "rollback_info_trimmed_to") + continue; + if (p->key() == "may_include_deletes_in_missing") + continue; + if (p->key().substr(0, 7) == string("missing")) + continue; + if (p->key().substr(0, 4) == string("dup_")) + continue; + + bufferlist bl = p->value(); + auto bp = bl.cbegin(); + pg_log_entry_t e; + try { + 
e.decode_with_checksum(bp); + } catch (const buffer::error &e) { + cerr << "Error reading pg log entry: " << e << std::endl; + } + if (debug) { + cerr << "read entry " << e << std::endl; + } + if (e.version.version > trim_to) { + done = true; + break; + } + keys_to_trim.insert(p->key()); + new_tail = e.version; + if (keys_to_trim.size() >= trim_at_once) + break; + } + + if (!p->valid()) + done = true; + } // deconstruct ObjectMapIterator + + // delete the keys + if (!dry_run && !keys_to_trim.empty()) { + cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl; + ObjectStore::Transaction t; + t.omap_rmkeys(coll, oid, keys_to_trim); + store->queue_transaction(ch, std::move(t)); + ch->flush(); + } + } + + // update pg info with new tail + if (!dry_run && new_tail != eversion_t()) { + info.log_tail = new_tail; + ObjectStore::Transaction t; + int ret = write_info(t, map_epoch, info, past_intervals); + if (ret) + return ret; + store->queue_transaction(ch, std::move(t)); + ch->flush(); + } + + // compact the db since we just removed a bunch of data + cerr << "Finished trimming, now compacting..." 
<< std::endl; + if (!dry_run) + store->compact(); + return 0; +} + +const int OMAP_BATCH_SIZE = 25; +void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist> &oset) +{ + oset.clear(); + for (int count = OMAP_BATCH_SIZE; count && iter->valid(); --count, iter->next()) { + oset.insert(pair<string, bufferlist>(iter->key(), iter->value())); + } +} + +int ObjectStoreTool::export_file(ObjectStore *store, coll_t cid, ghobject_t &obj) +{ + struct stat st; + mysize_t total; + footer ft; + + auto ch = store->open_collection(cid); + int ret = store->stat(ch, obj, &st); + if (ret < 0) + return ret; + + cerr << "Read " << obj << std::endl; + + total = st.st_size; + if (debug) + cerr << "size=" << total << std::endl; + + object_begin objb(obj); + + { + bufferptr bp; + bufferlist bl; + ret = store->getattr(ch, obj, OI_ATTR, bp); + if (ret < 0) { + cerr << "getattr failure object_info " << ret << std::endl; + return ret; + } + bl.push_back(bp); + decode(objb.oi, bl); + if (debug) + cerr << "object_info: " << objb.oi << std::endl; + } + + // NOTE: we include whiteouts, lost, etc. 
+ + ret = write_section(TYPE_OBJECT_BEGIN, objb, file_fd); + if (ret < 0) + return ret; + + uint64_t offset = 0; + bufferlist rawdatabl; + while(total > 0) { + rawdatabl.clear(); + mysize_t len = max_read; + if (len > total) + len = total; + + ret = store->read(ch, obj, offset, len, rawdatabl); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + + data_section dblock(offset, len, rawdatabl); + if (debug) + cerr << "data section offset=" << offset << " len=" << len << std::endl; + + total -= ret; + offset += ret; + + ret = write_section(TYPE_DATA, dblock, file_fd); + if (ret) return ret; + } + + //Handle attrs for this object + map<string,bufferptr> aset; + ret = store->getattrs(ch, obj, aset); + if (ret) return ret; + attr_section as(aset); + ret = write_section(TYPE_ATTRS, as, file_fd); + if (ret) + return ret; + + if (debug) { + cerr << "attrs size " << aset.size() << std::endl; + } + + //Handle omap information + bufferlist hdrbuf; + ret = store->omap_get_header(ch, obj, &hdrbuf, true); + if (ret < 0) { + cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl; + return ret; + } + + omap_hdr_section ohs(hdrbuf); + ret = write_section(TYPE_OMAP_HDR, ohs, file_fd); + if (ret) + return ret; + + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, obj); + if (!iter) { + ret = -ENOENT; + cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl; + return ret; + } + iter->seek_to_first(); + int mapcount = 0; + map<string, bufferlist> out; + while(iter->valid()) { + get_omap_batch(iter, out); + + if (out.empty()) break; + + mapcount += out.size(); + omap_section oms(out); + ret = write_section(TYPE_OMAP, oms, file_fd); + if (ret) + return ret; + } + if (debug) + cerr << "omap map size " << mapcount << std::endl; + + ret = write_simple(TYPE_OBJECT_END, file_fd); + if (ret) + return ret; + + return 0; +} + +int ObjectStoreTool::export_files(ObjectStore *store, coll_t coll) +{ + ghobject_t next; + auto ch = 
store->open_collection(coll); + while (!next.is_max()) { + vector<ghobject_t> objects; + int r = store->collection_list(ch, next, ghobject_t::get_max(), 300, + &objects, &next); + if (r < 0) + return r; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + ceph_assert(!i->hobj.is_meta()); + if (i->is_pgmeta() || i->hobj.is_temp() || !i->is_no_gen()) { + continue; + } + r = export_file(store, coll, *i); + if (r < 0) + return r; + } + } + return 0; +} + +int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { + OSDMap::Incremental inc; + auto it = bl.cbegin(); + inc.decode(it); + if (e == 0) { + e = inc.epoch; + } else if (e != inc.epoch) { + cerr << "incremental.epoch mismatch: " + << inc.epoch << " != " << e << std::endl; + if (force) { + cerr << "But will continue anyway." << std::endl; + } else { + return -EINVAL; + } + } + auto ch = store->open_collection(coll_t::meta()); + const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e); + if (!store->exists(ch, inc_oid)) { + cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl; + if (!force) { + return -ENOENT; + } + cout << "Creating a new epoch." 
<< std::endl; + } + if (dry_run) + return 0; + ObjectStore::Transaction t; + t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl); + t.truncate(coll_t::meta(), inc_oid, bl.length()); + store->queue_transaction(ch, std::move(t)); + return 0; +} + +int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl) +{ + auto ch = store->open_collection(coll_t::meta()); + if (store->read(ch, + OSD::get_inc_osdmap_pobject_name(e), + 0, 0, bl) < 0) { + return -ENOENT; + } + return 0; +} + +int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { + OSDMap osdmap; + osdmap.decode(bl); + if (e == 0) { + e = osdmap.get_epoch(); + } else if (e != osdmap.get_epoch()) { + cerr << "osdmap.epoch mismatch: " + << e << " != " << osdmap.get_epoch() << std::endl; + if (force) { + cerr << "But will continue anyway." << std::endl; + } else { + return -EINVAL; + } + } + auto ch = store->open_collection(coll_t::meta()); + const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e); + if (!store->exists(ch, full_oid)) { + cerr << "osdmap (" << full_oid << ") does not exist." << std::endl; + if (!force) { + return -ENOENT; + } + cout << "Creating a new epoch." 
<< std::endl; + } + if (dry_run) + return 0; + ObjectStore::Transaction t; + t.write(coll_t::meta(), full_oid, 0, bl.length(), bl); + t.truncate(coll_t::meta(), full_oid, bl.length()); + store->queue_transaction(ch, std::move(t)); + return 0; +} + +int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl) +{ + ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta()); + bool found = store->read( + ch, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0; + if (!found) { + cerr << "Can't find OSDMap for pg epoch " << e << std::endl; + return -ENOENT; + } + osdmap.decode(bl); + if (debug) + cerr << osdmap << std::endl; + return 0; +} + +int get_pg_num_history(ObjectStore *store, pool_pg_num_history_t *h) +{ + ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta()); + bufferlist bl; + auto pghist = OSD::make_pg_num_history_oid(); + int r = store->read(ch, pghist, 0, 0, bl, 0); + if (r >= 0 && bl.length() > 0) { + auto p = bl.cbegin(); + decode(*h, p); + } + cout << __func__ << " pg_num_history " << *h << std::endl; + return 0; +} + +int add_osdmap(ObjectStore *store, metadata_section &ms) +{ + return get_osdmap(store, ms.map_epoch, ms.osdmap, ms.osdmap_bl); +} + +int ObjectStoreTool::do_export(ObjectStore *fs, coll_t coll, spg_t pgid, + pg_info_t &info, epoch_t map_epoch, __u8 struct_ver, + const OSDSuperblock& superblock, + PastIntervals &past_intervals) +{ + PGLog::IndexedLog log; + pg_missing_t missing; + + cerr << "Exporting " << pgid << " info " << info << std::endl; + + int ret = get_log(fs, struct_ver, pgid, info, log, missing); + if (ret > 0) + return ret; + + if (debug) { + Formatter *formatter = Formatter::create("json-pretty"); + ceph_assert(formatter); + dump_log(formatter, cerr, log, missing); + delete formatter; + } + write_super(); + + pg_begin pgb(pgid, superblock); + // Special case: If replicated pg don't require the importing OSD to have shard feature + if (pgid.is_no_shard()) { + 
pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + } + ret = write_section(TYPE_PG_BEGIN, pgb, file_fd); + if (ret) + return ret; + + // The metadata_section is now before files, so import can detect + // errors and abort without wasting time. + metadata_section ms( + struct_ver, + map_epoch, + info, + log, + past_intervals, + missing); + ret = add_osdmap(fs, ms); + if (ret) + return ret; + ret = write_section(TYPE_PG_METADATA, ms, file_fd); + if (ret) + return ret; + + ret = export_files(fs, coll); + if (ret) { + cerr << "export_files error " << ret << std::endl; + return ret; + } + + ret = write_simple(TYPE_PG_END, file_fd); + if (ret) + return ret; + + return 0; +} + +int dump_data(Formatter *formatter, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + data_section ds; + ds.decode(ebliter); + + formatter->open_object_section("data_block"); + formatter->dump_unsigned("offset", ds.offset); + formatter->dump_unsigned("len", ds.len); + // XXX: Add option to dump data like od -cx ? + formatter->close_section(); + formatter->flush(cout); + return 0; +} + +int get_data(ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + data_section ds; + ds.decode(ebliter); + + if (debug) + cerr << "\tdata: offset " << ds.offset << " len " << ds.len << std::endl; + t->write(coll, hoid, ds.offset, ds.len, ds.databl); + return 0; +} + +int dump_attrs( + Formatter *formatter, ghobject_t hoid, + bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + attr_section as; + as.decode(ebliter); + + // This could have been handled in the caller if we didn't need to + // support exports that didn't include object_info_t in object_begin. 
+ if (hoid.generation == ghobject_t::NO_GEN && + hoid.hobj.is_head()) { + map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR); + if (mi != as.data.end()) { + SnapSet snapset; + auto p = mi->second.cbegin(); + snapset.decode(p); + formatter->open_object_section("snapset"); + snapset.dump(formatter); + formatter->close_section(); + } else { + formatter->open_object_section("snapset"); + formatter->dump_string("error", "missing SS_ATTR"); + formatter->close_section(); + } + } + + formatter->open_object_section("attrs"); + formatter->open_array_section("user"); + for (auto kv : as.data) { + // Skip system attributes + if (('_' != kv.first.at(0)) || kv.first.size() == 1) + continue; + formatter->open_object_section("user_attr"); + formatter->dump_string("name", kv.first.substr(1)); + bool b64; + formatter->dump_string("value", cleanbin(kv.second, b64)); + formatter->dump_bool("Base64", b64); + formatter->close_section(); + } + formatter->close_section(); + formatter->open_array_section("system"); + for (auto kv : as.data) { + // Skip user attributes + if (('_' == kv.first.at(0)) && kv.first.size() != 1) + continue; + formatter->open_object_section("sys_attr"); + formatter->dump_string("name", kv.first); + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + + return 0; +} + +int get_attrs( + ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl, + OSDriver &driver, SnapMapper &snap_mapper) +{ + auto ebliter = bl.cbegin(); + attr_section as; + as.decode(ebliter); + + auto ch = store->open_collection(coll); + if (debug) + cerr << "\tattrs: len " << as.data.size() << std::endl; + t->setattrs(coll, hoid, as.data); + + // This could have been handled in the caller if we didn't need to + // support exports that didn't include object_info_t in object_begin. 
+ if (hoid.generation == ghobject_t::NO_GEN && + hoid.hobj.is_head()) { + map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR); + if (mi != as.data.end()) { + SnapSet snapset; + auto p = mi->second.cbegin(); + snapset.decode(p); + cout << "snapset " << snapset << std::endl; + for (auto& p : snapset.clone_snaps) { + ghobject_t clone = hoid; + clone.hobj.snap = p.first; + set<snapid_t> snaps(p.second.begin(), p.second.end()); + if (!store->exists(ch, clone)) { + // no clone, skip. this is probably a cache pool. this works + // because we use a separate transaction per object and clones + // come before head in the archive. + if (debug) + cerr << "\tskipping missing " << clone << " (snaps " + << snaps << ")" << std::endl; + continue; + } + if (debug) + cerr << "\tsetting " << clone.hobj << " snaps " << snaps + << std::endl; + OSDriver::OSTransaction _t(driver.get_transaction(t)); + ceph_assert(!snaps.empty()); + snap_mapper.add_oid(clone.hobj, snaps, &_t); + } + } else { + cerr << "missing SS_ATTR on " << hoid << std::endl; + } + } + return 0; +} + +int dump_omap_hdr(Formatter *formatter, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_hdr_section oh; + oh.decode(ebliter); + + formatter->open_object_section("omap_header"); + formatter->dump_string("value", string(oh.hdr.c_str(), oh.hdr.length())); + formatter->close_section(); + formatter->flush(cout); + return 0; +} + +int get_omap_hdr(ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_hdr_section oh; + oh.decode(ebliter); + + if (debug) + cerr << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length()) + << std::endl; + t->omap_setheader(coll, hoid, oh.hdr); + return 0; +} + +int dump_omap(Formatter *formatter, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_section os; + os.decode(ebliter); + + formatter->open_object_section("omaps"); + formatter->dump_unsigned("count", os.omap.size()); + 
formatter->open_array_section("data"); + for (auto o : os.omap) { + formatter->open_object_section("omap"); + formatter->dump_string("name", o.first); + bool b64; + formatter->dump_string("value", cleanbin(o.second, b64)); + formatter->dump_bool("Base64", b64); + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + return 0; +} + +int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_section os; + os.decode(ebliter); + + if (debug) + cerr << "\tomap: size " << os.omap.size() << std::endl; + t->omap_setkeys(coll, hoid, os.omap); + return 0; +} + +int ObjectStoreTool::dump_object(Formatter *formatter, + bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + object_begin ob; + ob.decode(ebliter); + + if (ob.hoid.hobj.is_temp()) { + cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl; + return -EFAULT; + } + + formatter->open_object_section("object"); + formatter->open_object_section("oid"); + ob.hoid.dump(formatter); + formatter->close_section(); + formatter->open_object_section("object_info"); + ob.oi.dump(formatter); + formatter->close_section(); + + bufferlist ebl; + bool done = false; + while(!done) { + sectiontype_t type; + int ret = read_section(&type, &ebl); + if (ret) + return ret; + + //cout << "\tdo_object: Section type " << hex << type << dec << std::endl; + //cout << "\t\tsection size " << ebl.length() << std::endl; + if (type >= END_OF_TYPES) { + cout << "Skipping unknown object section type" << std::endl; + continue; + } + switch(type) { + case TYPE_DATA: + if (dry_run) break; + ret = dump_data(formatter, ebl); + if (ret) return ret; + break; + case TYPE_ATTRS: + if (dry_run) break; + ret = dump_attrs(formatter, ob.hoid, ebl); + if (ret) return ret; + break; + case TYPE_OMAP_HDR: + if (dry_run) break; + ret = dump_omap_hdr(formatter, ebl); + if (ret) return ret; + 
break;
    case TYPE_OMAP:
      if (dry_run) break;
      ret = dump_omap(formatter, ebl);
      if (ret) return ret;
      break;
    case TYPE_OBJECT_END:
      done = true;
      break;
    default:
      cerr << "Unknown section type " << type << std::endl;
      return -EFAULT;
    }
  }
  formatter->close_section();
  return 0;
}

// Import one object from the export stream into collection `coll`.
//
// `bl` holds the already-read object_begin section.  The object's remaining
// sections (data, attrs, omap header, omap) are then read one by one with
// read_section() and accumulated into a single transaction, which is queued
// once TYPE_OBJECT_END is seen.
//
// Objects outside the hit-set namespace are re-mapped with `origmap`; when the
// resulting pg differs from the collection's pg the object is skipped via
// skip_object() and *skipped_objects is set to true.
//
// With dry_run set the sections are still consumed from the stream but
// nothing is added to or queued on the store.
//
// Returns 0 on success or skip, -EFAULT for temporary objects, a bad
// collection, a shard mismatch or an unknown section type, and otherwise the
// non-zero result of read_section() or of a section handler.
int ObjectStoreTool::get_object(ObjectStore *store,
                                OSDriver& driver,
                                SnapMapper& mapper,
                                coll_t coll,
                                bufferlist &bl, OSDMap &origmap,
                                bool *skipped_objects)
{
  ObjectStore::Transaction tran;
  ObjectStore::Transaction *t = &tran;
  auto ebliter = bl.cbegin();
  object_begin ob;
  ob.decode(ebliter);

  if (ob.hoid.hobj.is_temp()) {
    cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
    return -EFAULT;
  }
  ceph_assert(g_ceph_context);

  auto ch = store->open_collection(coll);
  // Objects in the hit-set namespace bypass the placement check below.
  if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) {
    object_t oid = ob.hoid.hobj.oid;
    object_locator_t loc(ob.hoid.hobj);
    pg_t raw_pgid = origmap.object_locator_to_pg(oid, loc);
    pg_t pgid = origmap.raw_pg_to_pg(raw_pgid);

    spg_t coll_pgid;
    if (coll.is_pg(&coll_pgid) == false) {
      cerr << "INTERNAL ERROR: Bad collection during import" << std::endl;
      return -EFAULT;
    }
    if (coll_pgid.shard != ob.hoid.shard_id) {
      cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard
        << " but object shard is " << ob.hoid.shard_id << std::endl;
      return -EFAULT;
    }

    if (coll_pgid.pgid != pgid) {
      cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl;
      *skipped_objects = true;
      // NOTE(review): skip_object()'s result is not checked here -- confirm
      // it cannot fail in a way the caller needs to see.
      skip_object(bl);
      return 0;
    }
  }

  if (!dry_run)
    t->touch(coll, ob.hoid);

  cout << "Write " << ob.hoid << std::endl;

  bufferlist ebl;
  bool done = false;
  while(!done) {
    sectiontype_t type;
    int ret = read_section(&type, &ebl);
    if (ret)
      return ret;

    //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
    //cout << "\t\tsection size " << ebl.length() << std::endl;
    // Types at or beyond END_OF_TYPES are ignored, not treated as errors.
    if (type >= END_OF_TYPES) {
      cout << "Skipping unknown object section type" << std::endl;
      continue;
    }
    switch(type) {
    case TYPE_DATA:
      if (dry_run) break;
      ret = get_data(store, coll, ob.hoid, t, ebl);
      if (ret) return ret;
      break;
    case TYPE_ATTRS:
      if (dry_run) break;
      ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper);
      if (ret) return ret;
      break;
    case TYPE_OMAP_HDR:
      if (dry_run) break;
      ret = get_omap_hdr(store, coll, ob.hoid, t, ebl);
      if (ret) return ret;
      break;
    case TYPE_OMAP:
      if (dry_run) break;
      ret = get_omap(store, coll, ob.hoid, t, ebl);
      if (ret) return ret;
      break;
    case TYPE_OBJECT_END:
      done = true;
      break;
    default:
      cerr << "Unknown section type " << type << std::endl;
      return -EFAULT;
    }
  }
  if (!dry_run) {
    // Queue the per-object transaction and flush the collection handle inside
    // wait_until_done().
    wait_until_done(t, [&] {
      store->queue_transaction(ch, std::move(*t));
      ch->flush();
    });
  }
  return 0;
}

// Decode a metadata_section from `bl` into `ms` and dump its contents (disk
// version, map epoch, OSDMap, info, log, missing set) through `formatter`.
// (The definition continues past the end of this source line.)
int dump_pg_metadata(Formatter *formatter, bufferlist &bl, metadata_section &ms)
{
  auto ebliter = bl.cbegin();
  ms.decode(ebliter);

  formatter->open_object_section("metadata_section");

  formatter->dump_unsigned("pg_disk_version", (int)ms.struct_ver);
  formatter->dump_unsigned("map_epoch", ms.map_epoch);

  formatter->open_object_section("OSDMap");
  ms.osdmap.dump(formatter);
  formatter->close_section();
  formatter->flush(cout);
  cout << std::endl;

  formatter->open_object_section("info");
  ms.info.dump(formatter);
  formatter->close_section();
  formatter->flush(cout);

  formatter->open_object_section("log");
  ms.log.dump(formatter);
  formatter->close_section();
  formatter->flush(cout);

  formatter->open_object_section("pg_missing_t");
  ms.missing.dump(formatter);
  formatter->close_section();

  // XXX: ms.past_intervals?
+ + formatter->close_section(); + formatter->flush(cout); + + if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) { + cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl; + return -EFAULT; + } + + return 0; +} + +int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms, + const OSDSuperblock& sb, spg_t pgid) +{ + auto ebliter = bl.cbegin(); + ms.decode(ebliter); + spg_t old_pgid = ms.info.pgid; + ms.info.pgid = pgid; + + if (debug) { + cout << "export pgid " << old_pgid << std::endl; + cout << "struct_v " << (int)ms.struct_ver << std::endl; + cout << "map epoch " << ms.map_epoch << std::endl; + +#ifdef DIAGNOSTIC + Formatter *formatter = new JSONFormatter(true); + formatter->open_object_section("stuff"); + + formatter->open_object_section("importing OSDMap"); + ms.osdmap.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + cout << "osd current epoch " << sb.current_epoch << std::endl; + + formatter->open_object_section("info"); + ms.info.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + formatter->open_object_section("log"); + ms.log.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; +#endif + } + + if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) { + cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl; + return -EFAULT; + } + + if (ms.map_epoch > sb.current_epoch) { + cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl; + cerr << "The OSD you are using is older than the exported PG" << std::endl; + cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl; + return -EINVAL; + } + + // Old exports didn't include OSDMap + if (ms.osdmap.get_epoch() == 0) { + cerr << 
"WARNING: No OSDMap in old export, this is an ancient export." + " Not supported." << std::endl; + return -EINVAL; + } + + if (ms.osdmap.get_epoch() < sb.oldest_map) { + cerr << "PG export's map " << ms.osdmap.get_epoch() + << " is older than OSD's oldest_map " << sb.oldest_map << std::endl; + if (!force) { + cerr << " pass --force to proceed anyway (with incomplete PastIntervals)" + << std::endl; + return -EINVAL; + } + } + if (debug) { + cerr << "Import pgid " << ms.info.pgid << std::endl; + cerr << "Previous past_intervals " << ms.past_intervals << std::endl; + cerr << "history.same_interval_since " + << ms.info.history.same_interval_since << std::endl; + } + + return 0; +} + +// out: pg_log_t that only has entries that apply to import_pgid using curmap +// reject: Entries rejected from "in" are in the reject.log. Other fields not set. +void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const divergent_priors_t &in, + divergent_priors_t &out, divergent_priors_t &reject) +{ + out.clear(); + reject.clear(); + + for (divergent_priors_t::const_iterator i = in.begin(); + i != in.end(); ++i) { + + // Reject divergent priors for temporary objects + if (i->second.is_temp()) { + reject.insert(*i); + continue; + } + + if (i->second.nspace != hit_set_namespace) { + object_t oid = i->second.oid; + object_locator_t loc(i->second); + pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); + pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); + + if (import_pgid.pgid == pgid) { + out.insert(*i); + } else { + reject.insert(*i); + } + } else { + out.insert(*i); + } + } +} + +int ObjectStoreTool::dump_export(Formatter *formatter) +{ + bufferlist ebl; + pg_info_t info; + PGLog::IndexedLog log; + //bool skipped_objects = false; + + int ret = read_super(); + if (ret) + return ret; + + if (sh.magic != super_header::super_magic) { + cerr << "Invalid magic number" << std::endl; + return -EFAULT; + } + + if (sh.version > 
super_header::super_ver) {
    cerr << "Can't handle export format version=" << sh.version << std::endl;
    return -EINVAL;
  }

  formatter->open_object_section("Export");

  //First section must be TYPE_PG_BEGIN
  sectiontype_t type;
  ret = read_section(&type, &ebl);
  if (ret)
    return ret;
  if (type == TYPE_POOL_BEGIN) {
    cerr << "Dump of pool exports not supported" << std::endl;
    return -EINVAL;
  } else if (type != TYPE_PG_BEGIN) {
    cerr << "Invalid first section type " << std::to_string(type) << std::endl;
    return -EFAULT;
  }

  // Decode the PG_BEGIN payload and print the identifying fields.
  auto ebliter = ebl.cbegin();
  pg_begin pgb;
  pgb.decode(ebliter);
  spg_t pgid = pgb.pgid;

  formatter->dump_string("pgid", stringify(pgid));
  formatter->dump_string("cluster_fsid", stringify(pgb.superblock.cluster_fsid));
  formatter->dump_string("features", stringify(pgb.superblock.compat_features));

  // Walk the remaining sections until TYPE_PG_END.  The "objects" array is
  // opened lazily on the first object and closed when PG_END is reached.
  bool done = false;
  bool found_metadata = false;
  metadata_section ms;
  bool objects_started = false;
  while(!done) {
    ret = read_section(&type, &ebl);
    if (ret)
      return ret;

    if (debug) {
      cerr << "dump_export: Section type " << std::to_string(type) << std::endl;
    }
    // Types at or beyond END_OF_TYPES are ignored, not treated as errors.
    if (type >= END_OF_TYPES) {
      cerr << "Skipping unknown section type" << std::endl;
      continue;
    }
    switch(type) {
    case TYPE_OBJECT_BEGIN:
      if (!objects_started) {
        formatter->open_array_section("objects");
        objects_started = true;
      }
      ret = dump_object(formatter, ebl);
      if (ret) return ret;
      break;
    case TYPE_PG_METADATA:
      // Metadata after the first object is only warned about, not rejected.
      if (objects_started)
        cerr << "WARNING: metadata_section out of order" << std::endl;
      ret = dump_pg_metadata(formatter, ebl, ms);
      if (ret) return ret;
      found_metadata = true;
      break;
    case TYPE_PG_END:
      if (objects_started) {
        formatter->close_section();
      }
      done = true;
      break;
    default:
      cerr << "Unknown section type " << std::to_string(type) << std::endl;
      return -EFAULT;
    }
  }

  if (!found_metadata) {
    cerr << "Missing metadata section" << std::endl;
    return -EFAULT;
  }

formatter->close_section(); + formatter->flush(cout); + + return 0; +} + +int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb, + bool force, std::string pgidstr) +{ + bufferlist ebl; + pg_info_t info; + PGLog::IndexedLog log; + bool skipped_objects = false; + + if (!dry_run) + finish_remove_pgs(store); + + int ret = read_super(); + if (ret) + return ret; + + if (sh.magic != super_header::super_magic) { + cerr << "Invalid magic number" << std::endl; + return -EFAULT; + } + + if (sh.version > super_header::super_ver) { + cerr << "Can't handle export format version=" << sh.version << std::endl; + return -EINVAL; + } + + //First section must be TYPE_PG_BEGIN + sectiontype_t type; + ret = read_section(&type, &ebl); + if (ret) + return ret; + if (type == TYPE_POOL_BEGIN) { + cerr << "Pool exports cannot be imported into a PG" << std::endl; + return -EINVAL; + } else if (type != TYPE_PG_BEGIN) { + cerr << "Invalid first section type " << std::to_string(type) << std::endl; + return -EFAULT; + } + + auto ebliter = ebl.cbegin(); + pg_begin pgb; + pgb.decode(ebliter); + spg_t pgid = pgb.pgid; + + if (pgidstr.length()) { + spg_t user_pgid; + + bool ok = user_pgid.parse(pgidstr.c_str()); + // This succeeded in main() already + ceph_assert(ok); + if (pgid != user_pgid) { + cerr << "specified pgid " << user_pgid + << " does not match actual pgid " << pgid << std::endl; + return -EINVAL; + } + } + + if (!pgb.superblock.cluster_fsid.is_zero() + && pgb.superblock.cluster_fsid != sb.cluster_fsid) { + cerr << "Export came from different cluster with fsid " + << pgb.superblock.cluster_fsid << std::endl; + return -EINVAL; + } + + if (debug) { + cerr << "Exported features: " << pgb.superblock.compat_features << std::endl; + } + + // Special case: Old export has SHARDS incompat feature on replicated pg, removqqe it + if (pgid.is_no_shard()) + pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + + if 
(sb.compat_features.compare(pgb.superblock.compat_features) == -1) { + CompatSet unsupported = sb.compat_features.unsupported(pgb.superblock.compat_features); + + cerr << "Export has incompatible features set " << unsupported << std::endl; + + // Let them import if they specify the --force option + if (!force) + return 11; // Positive return means exit status + } + + // we need the latest OSDMap to check for collisions + OSDMap curmap; + bufferlist bl; + ret = get_osdmap(store, sb.current_epoch, curmap, bl); + if (ret) { + cerr << "Can't find latest local OSDMap " << sb.current_epoch << std::endl; + return ret; + } + if (!curmap.have_pg_pool(pgid.pgid.m_pool)) { + cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl; + // Special exit code for this error, used by test code + return 10; // Positive return means exit status + } + + pool_pg_num_history_t pg_num_history; + get_pg_num_history(store, &pg_num_history); + + ghobject_t pgmeta_oid = pgid.make_pgmeta_oid(); + + // Check for PG already present. 
+ coll_t coll(pgid); + if (store->collection_exists(coll)) { + cerr << "pgid " << pgid << " already exists" << std::endl; + return -EEXIST; + } + + ObjectStore::CollectionHandle ch; + + OSDriver driver( + store, + coll_t(), + OSD::make_snapmapper_oid()); + SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pgid.shard); + + cout << "Importing pgid " << pgid; + cout << std::endl; + + bool done = false; + bool found_metadata = false; + metadata_section ms; + while(!done) { + ret = read_section(&type, &ebl); + if (ret) + return ret; + + if (debug) { + cout << __func__ << ": Section type " << std::to_string(type) << std::endl; + } + if (type >= END_OF_TYPES) { + cout << "Skipping unknown section type" << std::endl; + continue; + } + switch(type) { + case TYPE_OBJECT_BEGIN: + ceph_assert(found_metadata); + ret = get_object(store, driver, mapper, coll, ebl, ms.osdmap, + &skipped_objects); + if (ret) return ret; + break; + case TYPE_PG_METADATA: + ret = get_pg_metadata(store, ebl, ms, sb, pgid); + if (ret) return ret; + found_metadata = true; + + if (pgid != ms.info.pgid) { + cerr << "specified pgid " << pgid << " does not match import file pgid " + << ms.info.pgid << std::endl; + return -EINVAL; + } + + // make sure there are no conflicting splits or merges + if (ms.osdmap.have_pg_pool(pgid.pgid.pool())) { + auto p = pg_num_history.pg_nums.find(pgid.pgid.m_pool); + if (p != pg_num_history.pg_nums.end() && + !p->second.empty()) { + unsigned start_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool()); + unsigned pg_num = start_pg_num; + for (auto q = p->second.lower_bound(ms.map_epoch); + q != p->second.end(); + ++q) { + unsigned new_pg_num = q->second; + cout << "pool " << pgid.pgid.pool() << " pg_num " << pg_num + << " -> " << new_pg_num << std::endl; + + // check for merge target + spg_t target; + if (pgid.is_merge_source(pg_num, new_pg_num, &target)) { + // FIXME: this checks assumes the OSD's PG is at the OSD's + // map epoch; it could be, say, at *our* epoch, pre-merge. 
+ coll_t coll(target); + if (store->collection_exists(coll)) { + cerr << "pgid " << pgid << " merges to target " << target + << " which already exists" << std::endl; + return 12; + } + } + + // check for split children + set<spg_t> children; + if (pgid.is_split(start_pg_num, new_pg_num, &children)) { + cerr << " children are " << children << std::endl; + for (auto child : children) { + coll_t coll(child); + if (store->collection_exists(coll)) { + cerr << "pgid " << pgid << " splits to " << children + << " and " << child << " exists" << std::endl; + return 12; + } + } + } + pg_num = new_pg_num; + } + } + } else { + cout << "pool " << pgid.pgid.pool() << " doesn't existing, not checking" + << " for splits or mergers" << std::endl; + } + + if (!dry_run) { + ObjectStore::Transaction t; + ch = store->create_new_collection(coll); + PG::_create( + t, pgid, + pgid.get_split_bits(ms.osdmap.get_pg_pool(pgid.pool())->get_pg_num())); + PG::_init(t, pgid, NULL); + + // mark this coll for removal until we're done + map<string,bufferlist> values; + encode((char)1, values["_remove"]); + t.omap_setkeys(coll, pgid.make_pgmeta_oid(), values); + + store->queue_transaction(ch, std::move(t)); + } + + break; + case TYPE_PG_END: + ceph_assert(found_metadata); + done = true; + break; + default: + cerr << "Unknown section type " << std::to_string(type) << std::endl; + return -EFAULT; + } + } + + if (!found_metadata) { + cerr << "Missing metadata section" << std::endl; + return -EFAULT; + } + + ObjectStore::Transaction t; + if (!dry_run) { + pg_log_t newlog, reject; + pg_log_t::filter_log(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace, + ms.log, newlog, reject); + if (debug) { + for (list<pg_log_entry_t>::iterator i = newlog.log.begin(); + i != newlog.log.end(); ++i) + cerr << "Keeping log entry " << *i << std::endl; + for (list<pg_log_entry_t>::iterator i = reject.log.begin(); + i != reject.log.end(); ++i) + cerr << "Skipping log entry " << *i << std::endl; + } + + 
divergent_priors_t newdp, rejectdp; + filter_divergent_priors(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace, + ms.divergent_priors, newdp, rejectdp); + ms.divergent_priors = newdp; + if (debug) { + for (divergent_priors_t::iterator i = newdp.begin(); + i != newdp.end(); ++i) + cerr << "Keeping divergent_prior " << *i << std::endl; + for (divergent_priors_t::iterator i = rejectdp.begin(); + i != rejectdp.end(); ++i) + cerr << "Skipping divergent_prior " << *i << std::endl; + } + + ms.missing.filter_objects([&](const hobject_t &obj) { + if (obj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) + return false; + ceph_assert(!obj.is_temp()); + object_t oid = obj.oid; + object_locator_t loc(obj); + pg_t raw_pgid = ms.osdmap.object_locator_to_pg(oid, loc); + pg_t _pgid = ms.osdmap.raw_pg_to_pg(raw_pgid); + + return pgid.pgid != _pgid; + }); + + + if (debug) { + pg_missing_t missing; + Formatter *formatter = Formatter::create("json-pretty"); + dump_log(formatter, cerr, newlog, ms.missing); + delete formatter; + } + + // Just like a split invalidate stats since the object count is changed + if (skipped_objects) + ms.info.stats.stats_invalid = true; + + ret = write_pg( + t, + ms.map_epoch, + ms.info, + newlog, + ms.past_intervals, + ms.divergent_priors, + ms.missing); + if (ret) return ret; + } + + // done, clear removal flag + if (debug) + cerr << "done, clearing removal flag" << std::endl; + + if (!dry_run) { + set<string> remove; + remove.insert("_remove"); + t.omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove); + wait_until_done(&t, [&] { + store->queue_transaction(ch, std::move(t)); + // make sure we flush onreadable items before mapper/driver are destroyed. 
+ ch->flush(); + }); + } + return 0; +} + +int do_list(ObjectStore *store, string pgidstr, string object, boost::optional<std::string> nspace, + Formatter *formatter, bool debug, bool human_readable, bool head) +{ + int r; + lookup_ghobject lookup(object, nspace, head); + if (pgidstr.length() > 0) { + r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug); + } else { + r = action_on_all_objects(store, lookup, debug); + } + if (r) + return r; + lookup.dump(formatter, human_readable); + formatter->flush(cout); + return 0; +} + +int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable) +{ + int r; + boost::optional<std::string> nspace; // Not specified + lookup_ghobject lookup(object, nspace); + r = action_on_all_objects_in_exact_pg(store, coll_t::meta(), lookup, debug); + if (r) + return r; + lookup.dump(formatter, human_readable); + formatter->flush(cout); + return 0; +} + +enum rmtype { + BOTH, + SNAPMAP, + NOSNAPMAP +}; + +int remove_object(coll_t coll, ghobject_t &ghobj, + SnapMapper &mapper, + MapCacher::Transaction<std::string, bufferlist> *_t, + ObjectStore::Transaction *t, + enum rmtype type) +{ + if (type == BOTH || type == SNAPMAP) { + int r = mapper.remove_oid(ghobj.hobj, _t); + if (r < 0 && r != -ENOENT) { + cerr << "remove_oid returned " << cpp_strerror(r) << std::endl; + return r; + } + } + + if (type == BOTH || type == NOSNAPMAP) { + t->remove(coll, ghobj); + } + return 0; +} + +int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent); + +int do_remove_object(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, bool all, bool force, enum rmtype type) +{ + auto ch = store->open_collection(coll); + spg_t pg; + coll.is_pg_prefix(&pg); + OSDriver driver( + store, + coll_t(), + OSD::make_snapmapper_oid()); + SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pg.shard); + struct stat st; + + int r = store->stat(ch, ghobj, &st); + if (r < 0) { + cerr << "remove: " << 
cpp_strerror(r) << std::endl; + return r; + } + + SnapSet ss; + if (ghobj.hobj.has_snapset()) { + r = get_snapset(store, coll, ghobj, ss, false); + if (r < 0) { + cerr << "Can't get snapset error " << cpp_strerror(r) << std::endl; + // If --force and bad snapset let them remove the head + if (!(force && !all)) + return r; + } + if (!ss.snaps.empty() && !all) { + if (force) { + cout << "WARNING: only removing " + << (ghobj.hobj.is_head() ? "head" : "snapdir") + << " with snapshots present" << std::endl; + ss.snaps.clear(); + } else { + cerr << "Snapshots are present, use removeall to delete everything" << std::endl; + return -EINVAL; + } + } + } + + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(driver.get_transaction(&t)); + + ghobject_t snapobj = ghobj; + for (vector<snapid_t>::iterator i = ss.snaps.begin() ; + i != ss.snaps.end() ; ++i) { + snapobj.hobj.snap = *i; + cout << "remove " << snapobj << std::endl; + if (!dry_run) { + r = remove_object(coll, snapobj, mapper, &_t, &t, type); + if (r < 0) + return r; + } + } + + cout << "remove " << ghobj << std::endl; + + if (!dry_run) { + r = remove_object(coll, ghobj, mapper, &_t, &t, type); + if (r < 0) + return r; + } + + if (!dry_run) { + wait_until_done(&t, [&] { + store->queue_transaction(ch, std::move(t)); + ch->flush(); + }); + } + return 0; +} + +int do_list_attrs(ObjectStore *store, coll_t coll, ghobject_t &ghobj) +{ + auto ch = store->open_collection(coll); + map<string,bufferptr> aset; + int r = store->getattrs(ch, ghobj, aset); + if (r < 0) { + cerr << "getattrs: " << cpp_strerror(r) << std::endl; + return r; + } + + for (map<string,bufferptr>::iterator i = aset.begin();i != aset.end(); ++i) { + string key(i->first); + if (outistty) + key = cleanbin(key); + cout << key << std::endl; + } + return 0; +} + +int do_list_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj) +{ + auto ch = store->open_collection(coll); + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, ghobj); + if 
(!iter) { + cerr << "omap_get_iterator: " << cpp_strerror(ENOENT) << std::endl; + return -ENOENT; + } + iter->seek_to_first(); + map<string, bufferlist> oset; + while(iter->valid()) { + get_omap_batch(iter, oset); + + for (map<string,bufferlist>::iterator i = oset.begin();i != oset.end(); ++i) { + string key(i->first); + if (outistty) + key = cleanbin(key); + cout << key << std::endl; + } + } + return 0; +} + +int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) +{ + auto ch = store->open_collection(coll); + struct stat st; + mysize_t total; + + int ret = store->stat(ch, ghobj, &st); + if (ret < 0) { + cerr << "get-bytes: " << cpp_strerror(ret) << std::endl; + return ret; + } + + total = st.st_size; + if (debug) + cerr << "size=" << total << std::endl; + + uint64_t offset = 0; + bufferlist rawdatabl; + while(total > 0) { + rawdatabl.clear(); + mysize_t len = max_read; + if (len > total) + len = total; + + ret = store->read(ch, ghobj, offset, len, rawdatabl); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + + if (debug) + cerr << "data section offset=" << offset << " len=" << len << std::endl; + + total -= ret; + offset += ret; + + ret = write(fd, rawdatabl.c_str(), ret); + if (ret == -1) { + perror("write"); + return -errno; + } + } + + return 0; +} + +int do_set_bytes(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (debug) + cerr << "Write " << ghobj << std::endl; + + if (!dry_run) { + t->touch(coll, ghobj); + t->truncate(coll, ghobj, 0); + } + + uint64_t offset = 0; + bufferlist rawdatabl; + do { + rawdatabl.clear(); + ssize_t bytes = rawdatabl.read_fd(fd, max_read); + if (bytes < 0) { + cerr << "read_fd error " << cpp_strerror(bytes) << std::endl; + return bytes; + } + + if (bytes == 0) + break; + + if (debug) + cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl; + if (!dry_run) + t->write(coll, ghobj, 
offset, bytes, rawdatabl); + + offset += bytes; + // XXX: Should we queue_transaction() every once in a while for very large files + } while(true); + + auto ch = store->open_collection(coll); + if (!dry_run) + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) +{ + auto ch = store->open_collection(coll); + bufferptr bp; + + int r = store->getattr(ch, ghobj, key.c_str(), bp); + if (r < 0) { + cerr << "getattr: " << cpp_strerror(r) << std::endl; + return r; + } + + string value(bp.c_str(), bp.length()); + if (outistty) { + value = cleanbin(value); + value.push_back('\n'); + } + cout << value; + + return 0; +} + +int do_set_attr(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + bufferlist bl; + + if (debug) + cerr << "Setattr " << ghobj << std::endl; + + int ret = get_fd_data(fd, bl); + if (ret < 0) + return ret; + + if (dry_run) + return 0; + + t->touch(coll, ghobj); + + t->setattr(coll, ghobj, key, bl); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_rm_attr(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (debug) + cerr << "Rmattr " << ghobj << std::endl; + + if (dry_run) + return 0; + + t->rmattr(coll, ghobj, key); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) +{ + auto ch = store->open_collection(coll); + set<string> keys; + map<string, bufferlist> out; + + keys.insert(key); + + int r = store->omap_get_values(ch, ghobj, keys, &out); + if (r < 0) { + cerr << "omap_get_values: " << cpp_strerror(r) << std::endl; + return r; + } + + if (out.empty()) { + cerr << "Key not 
found" << std::endl; + return -ENOENT; + } + + ceph_assert(out.size() == 1); + + bufferlist bl = out.begin()->second; + string value(bl.c_str(), bl.length()); + if (outistty) { + value = cleanbin(value); + value.push_back('\n'); + } + cout << value; + + return 0; +} + +int do_set_omap(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + map<string, bufferlist> attrset; + bufferlist valbl; + + if (debug) + cerr << "Set_omap " << ghobj << std::endl; + + int ret = get_fd_data(fd, valbl); + if (ret < 0) + return ret; + + attrset.insert(pair<string, bufferlist>(key, valbl)); + + if (dry_run) + return 0; + + t->touch(coll, ghobj); + + t->omap_setkeys(coll, ghobj, attrset); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_rm_omap(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + set<string> keys; + + keys.insert(key); + + if (debug) + cerr << "Rm_omap " << ghobj << std::endl; + + if (dry_run) + return 0; + + t->omap_rmkeys(coll, ghobj, keys); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj) +{ + auto ch = store->open_collection(coll); + bufferlist hdrbl; + + int r = store->omap_get_header(ch, ghobj, &hdrbl, true); + if (r < 0) { + cerr << "omap_get_header: " << cpp_strerror(r) << std::endl; + return r; + } + + string header(hdrbl.c_str(), hdrbl.length()); + if (outistty) { + header = cleanbin(header); + header.push_back('\n'); + } + cout << header; + + return 0; +} + +int do_set_omaphdr(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + bufferlist hdrbl; + + if (debug) + cerr << "Omap_setheader " << 
ghobj << std::endl; + + int ret = get_fd_data(fd, hdrbl); + if (ret) + return ret; + + if (dry_run) + return 0; + + t->touch(coll, ghobj); + + t->omap_setheader(coll, ghobj, hdrbl); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +struct do_fix_lost : public action_on_object_t { + void call(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, object_info_t &oi) override { + if (oi.is_lost()) { + cout << coll << "/" << ghobj << " is lost"; + if (!dry_run) + cout << ", fixing"; + cout << std::endl; + if (dry_run) + return; + oi.clear_flag(object_info_t::FLAG_LOST); + bufferlist bl; + encode(oi, bl, -1); /* fixme: using full features */ + ObjectStore::Transaction t; + t.setattr(coll, ghobj, OI_ATTR, bl); + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(t)); + } + return; + } +}; + +int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false) +{ + auto ch = store->open_collection(coll); + bufferlist attr; + int r = store->getattr(ch, ghobj, SS_ATTR, attr); + if (r < 0) { + if (!silent) + cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + auto bp = attr.cbegin(); + try { + decode(ss, bp); + } catch (...) 
{ + r = -EINVAL; + cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter) +{ + auto ch = store->open_collection(coll); + int r = 0; + formatter->open_object_section("obj"); + formatter->open_object_section("id"); + ghobj.dump(formatter); + formatter->close_section(); + + bufferlist attr; + int gr = store->getattr(ch, ghobj, OI_ATTR, attr); + if (gr < 0) { + r = gr; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } else { + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + formatter->open_object_section("info"); + oi.dump(formatter); + formatter->close_section(); + } catch (...) { + r = -EINVAL; + cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + } + struct stat st; + int sr = store->stat(ch, ghobj, &st, true); + if (sr < 0) { + r = sr; + cerr << "Error stat on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } else { + formatter->open_object_section("stat"); + formatter->dump_int("size", st.st_size); + formatter->dump_int("blksize", st.st_blksize); + formatter->dump_int("blocks", st.st_blocks); + formatter->dump_int("nlink", st.st_nlink); + formatter->close_section(); + } + + if (ghobj.hobj.has_snapset()) { + SnapSet ss; + int snr = get_snapset(store, coll, ghobj, ss); + if (snr < 0) { + r = snr; + } else { + formatter->open_object_section("SnapSet"); + ss.dump(formatter); + formatter->close_section(); + } + } + bufferlist hattr; + gr = store->getattr(ch, ghobj, ECUtil::get_hinfo_key(), hattr); + if (gr == 0) { + ECUtil::HashInfo hinfo; + auto hp = hattr.cbegin(); + try { + decode(hinfo, hp); + formatter->open_object_section("hinfo"); + hinfo.dump(formatter); + formatter->close_section(); + } catch (...) 
{ + r = -EINVAL; + cerr << "Error decoding hinfo on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + return r; +} + +int corrupt_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter) +{ + auto ch = store->open_collection(coll); + bufferlist attr; + int r = store->getattr(ch, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (!dry_run) { + attr.clear(); + oi.alloc_hint_flags += 0xff; + ObjectStore::Transaction t; + encode(oi, attr, -1); /* fixme: using full features */ + t.setattr(coll, ghobj, OI_ATTR, attr); + auto ch = store->open_collection(coll); + r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int set_size( + ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter, + bool corrupt) +{ + auto ch = store->open_collection(coll); + if (ghobj.hobj.is_snapdir()) { + cerr << "Can't set the size of a snapdir" << std::endl; + return -EINVAL; + } + bufferlist attr; + int r = store->getattr(ch, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) 
{ + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + struct stat st; + r = store->stat(ch, ghobj, &st, true); + if (r < 0) { + cerr << "Error stat on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + ghobject_t head(ghobj); + SnapSet ss; + bool found_head = true; + map<snapid_t, uint64_t>::iterator csi; + bool is_snap = ghobj.hobj.is_snap(); + if (is_snap) { + head.hobj = head.hobj.get_head(); + r = get_snapset(store, coll, head, ss, true); + if (r < 0 && r != -ENOENT) { + // Requested get_snapset() silent, so if not -ENOENT show error + cerr << "Error getting snapset on : " << make_pair(coll, head) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (r == -ENOENT) { + head.hobj = head.hobj.get_snapdir(); + r = get_snapset(store, coll, head, ss); + if (r < 0) + return r; + found_head = false; + } else { + found_head = true; + } + csi = ss.clone_size.find(ghobj.hobj.snap); + if (csi == ss.clone_size.end()) { + cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl; + return -EINVAL; + } + } + if ((uint64_t)st.st_size == setsize && oi.size == setsize + && (!is_snap || csi->second == setsize)) { + cout << "Size of object is already " << setsize << std::endl; + return 0; + } + cout << "Setting size to " << setsize << ", stat size " << st.st_size + << ", obj info size " << oi.size; + if (is_snap) { + cout << ", " << (found_head ? "head" : "snapdir") + << " clone_size " << csi->second; + csi->second = setsize; + } + cout << std::endl; + if (!dry_run) { + attr.clear(); + oi.size = setsize; + ObjectStore::Transaction t; + // Only modify object info if we want to corrupt it + if (!corrupt && (uint64_t)st.st_size != setsize) { + t.truncate(coll, ghobj, setsize); + // Changing objectstore size will invalidate data_digest, so clear it. 
+ oi.clear_data_digest(); + } + encode(oi, attr, -1); /* fixme: using full features */ + t.setattr(coll, ghobj, OI_ATTR, attr); + if (is_snap) { + bufferlist snapattr; + snapattr.clear(); + encode(ss, snapattr); + t.setattr(coll, head, SS_ATTR, snapattr); + } + auto ch = store->open_collection(coll); + r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int clear_data_digest(ObjectStore *store, coll_t coll, ghobject_t &ghobj) { + auto ch = store->open_collection(coll); + bufferlist attr; + int r = store->getattr(ch, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (!dry_run) { + attr.clear(); + oi.clear_data_digest(); + encode(oi, attr, -1); /* fixme: using full features */ + ObjectStore::Transaction t; + t.setattr(coll, ghobj, OI_ATTR, attr); + auto ch = store->open_collection(coll); + r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, + string arg) +{ + SnapSet ss; + int ret = get_snapset(store, coll, ghobj, ss); + if (ret < 0) + return ret; + + // Use "corrupt" to clear entire SnapSet + // Use "seq" to just corrupt SnapSet.seq + if (arg == "corrupt" || arg == "seq") + ss.seq = 0; + // Use "snaps" to just clear SnapSet.snaps + if (arg == "corrupt" || arg == "snaps") + ss.snaps.clear(); + // By default just clear clone, clone_overlap and 
clone_size + if (arg == "corrupt") + arg = ""; + if (arg == "" || arg == "clones") + ss.clones.clear(); + if (arg == "" || arg == "clone_overlap") + ss.clone_overlap.clear(); + if (arg == "" || arg == "clone_size") + ss.clone_size.clear(); + // Break all clone sizes by adding 1 + if (arg == "size") { + for (map<snapid_t, uint64_t>::iterator i = ss.clone_size.begin(); + i != ss.clone_size.end(); ++i) + ++(i->second); + } + + if (!dry_run) { + bufferlist bl; + encode(ss, bl); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, SS_ATTR, bl); + auto ch = store->open_collection(coll); + int r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +vector<snapid_t>::iterator find(vector<snapid_t> &v, snapid_t clid) +{ + return std::find(v.begin(), v.end(), clid); +} + +map<snapid_t, interval_set<uint64_t> >::iterator +find(map<snapid_t, interval_set<uint64_t> > &m, snapid_t clid) +{ + return m.find(clid); +} + +map<snapid_t, uint64_t>::iterator find(map<snapid_t, uint64_t> &m, + snapid_t clid) +{ + return m.find(clid); +} + +template<class T> +int remove_from(T &mv, string name, snapid_t cloneid, bool force) +{ + typename T::iterator i = find(mv, cloneid); + if (i != mv.end()) { + mv.erase(i); + } else { + cerr << "Clone " << cloneid << " doesn't exist in " << name; + if (force) { + cerr << " (ignored)" << std::endl; + return 0; + } + cerr << std::endl; + return -EINVAL; + } + return 0; +} + +int remove_clone( + ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force) +{ + // XXX: Don't allow this if in a cache tier or former cache tier + // bool allow_incomplete_clones() const { + // return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES); + + SnapSet snapset; + int ret = get_snapset(store, coll, ghobj, snapset); + if (ret < 0) + return ret; + + // Derived from trim_object() + 
// ...from snapset + vector<snapid_t>::iterator p; + for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p) + if (*p == cloneid) + break; + if (p == snapset.clones.end()) { + cerr << "Clone " << cloneid << " not present"; + return -ENOENT; + } + if (p != snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + vector<snapid_t>::iterator n = p - 1; + hobject_t prev_coid = ghobj.hobj; + prev_coid.snap = *n; + //bool adjust_prev_bytes = is_present_clone(prev_coid); + + //if (adjust_prev_bytes) + // ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + + //if (adjust_prev_bytes) + // ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + + ret = remove_from(snapset.clones, "clones", cloneid, force); + if (ret) return ret; + ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force); + if (ret) return ret; + ret = remove_from(snapset.clone_size, "clone_size", cloneid, force); + if (ret) return ret; + + if (dry_run) + return 0; + + bufferlist bl; + encode(snapset, bl); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, SS_ATTR, bl); + auto ch = store->open_collection(coll); + int r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + cout << "Removal of clone " << cloneid << " complete" << std::endl; + cout << "Use pg repair after OSD restarted to correct stat information" << std::endl; + return 0; +} + +int dup(string srcpath, ObjectStore *src, string dstpath, ObjectStore *dst) +{ + cout << "dup from " << src->get_type() << ": " << srcpath << "\n" + << " to " << dst->get_type() << ": " << dstpath + << std::endl; + int num, i; + vector<coll_t> collections; + int r; + + r = src->mount(); + if (r < 0) { + cerr << "failed to mount src: " << cpp_strerror(r) << std::endl; + return r; + } + r = 
dst->mount(); + if (r < 0) { + cerr << "failed to mount dst: " << cpp_strerror(r) << std::endl; + goto out_src; + } + + if (src->get_fsid() != dst->get_fsid()) { + cerr << "src fsid " << src->get_fsid() << " != dest " << dst->get_fsid() + << std::endl; + goto out; + } + cout << "fsid " << src->get_fsid() << std::endl; + + // make sure dst is empty + r = dst->list_collections(collections); + if (r < 0) { + cerr << "error listing collections on dst: " << cpp_strerror(r) << std::endl; + goto out; + } + if (!collections.empty()) { + cerr << "destination store is not empty" << std::endl; + goto out; + } + + r = src->list_collections(collections); + if (r < 0) { + cerr << "error listing collections on src: " << cpp_strerror(r) << std::endl; + goto out; + } + + num = collections.size(); + cout << num << " collections" << std::endl; + i = 1; + for (auto cid : collections) { + cout << i++ << "/" << num << " " << cid << std::endl; + auto ch = src->open_collection(cid); + auto dch = dst->create_new_collection(cid); + { + ObjectStore::Transaction t; + int bits = src->collection_bits(ch); + if (bits < 0) { + if (src->get_type() == "filestore" && cid.is_meta()) { + bits = 0; + } else { + cerr << "cannot get bit count for collection " << cid << ": " + << cpp_strerror(bits) << std::endl; + goto out; + } + } + t.create_collection(cid, bits); + dst->queue_transaction(dch, std::move(t)); + } + + ghobject_t pos; + uint64_t n = 0; + uint64_t bytes = 0, keys = 0; + while (true) { + vector<ghobject_t> ls; + r = src->collection_list(ch, pos, ghobject_t::get_max(), 1000, &ls, &pos); + if (r < 0) { + cerr << "collection_list on " << cid << " from " << pos << " got: " + << cpp_strerror(r) << std::endl; + goto out; + } + if (ls.empty()) { + break; + } + + for (auto& oid : ls) { + //cout << " " << cid << " " << oid << std::endl; + if (n % 100 == 0) { + cout << " " << std::setw(16) << n << " objects, " + << std::setw(16) << bytes << " bytes, " + << std::setw(16) << keys << " keys" + << 
std::setw(1) << "\r" << std::flush; + } + n++; + + ObjectStore::Transaction t; + t.touch(cid, oid); + + map<string,bufferptr> attrs; + src->getattrs(ch, oid, attrs); + if (!attrs.empty()) { + t.setattrs(cid, oid, attrs); + } + + bufferlist bl; + src->read(ch, oid, 0, 0, bl); + if (bl.length()) { + t.write(cid, oid, 0, bl.length(), bl); + bytes += bl.length(); + } + + bufferlist header; + map<string,bufferlist> omap; + src->omap_get(ch, oid, &header, &omap); + if (header.length()) { + t.omap_setheader(cid, oid, header); + ++keys; + } + if (!omap.empty()) { + keys += omap.size(); + t.omap_setkeys(cid, oid, omap); + } + + dst->queue_transaction(dch, std::move(t)); + } + } + cout << " " << std::setw(16) << n << " objects, " + << std::setw(16) << bytes << " bytes, " + << std::setw(16) << keys << " keys" + << std::setw(1) << std::endl; + } + + // keyring + cout << "keyring" << std::endl; + { + bufferlist bl; + string s = srcpath + "/keyring"; + string err; + r = bl.read_file(s.c_str(), &err); + if (r < 0) { + cerr << "failed to copy " << s << ": " << err << std::endl; + } else { + string d = dstpath + "/keyring"; + bl.write_file(d.c_str(), 0600); + } + } + + // osd metadata + cout << "duping osd metadata" << std::endl; + { + for (auto k : {"magic", "whoami", "ceph_fsid", "fsid"}) { + string val; + src->read_meta(k, &val); + dst->write_meta(k, val); + } + } + + dst->write_meta("ready", "ready"); + + cout << "done." << std::endl; + r = 0; + out: + dst->umount(); + out_src: + src->umount(); + return r; +} + +void usage(po::options_description &desc) +{ + cerr << std::endl; + cerr << desc << std::endl; + cerr << std::endl; + cerr << "Positional syntax:" << std::endl; + cerr << std::endl; + cerr << "ceph-objectstore-tool ... <object> (get|set)-bytes [file]" << std::endl; + cerr << "ceph-objectstore-tool ... <object> set-(attr|omap) <key> [file]" << std::endl; + cerr << "ceph-objectstore-tool ... 
<object> (get|rm)-(attr|omap) <key>" << std::endl; + cerr << "ceph-objectstore-tool ... <object> get-omaphdr" << std::endl; + cerr << "ceph-objectstore-tool ... <object> set-omaphdr [file]" << std::endl; + cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl; + cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl; + cerr << "ceph-objectstore-tool ... <object> remove|removeall" << std::endl; + cerr << "ceph-objectstore-tool ... <object> dump" << std::endl; + cerr << "ceph-objectstore-tool ... <object> set-size" << std::endl; + cerr << "ceph-objectstore-tool ... <object> clear-data-digest" << std::endl; + cerr << "ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>" << std::endl; + cerr << std::endl; + cerr << "<object> can be a JSON object description as displayed" << std::endl; + cerr << "by --op list." << std::endl; + cerr << "<object> can be an object name which will be looked up in all" << std::endl; + cerr << "the OSD's PGs." << std::endl; + cerr << "<object> can be the empty string ('') which with a provided pgid " << std::endl; + cerr << "specifies the pgmeta object" << std::endl; + cerr << std::endl; + cerr << "The optional [file] argument will read stdin or write stdout" << std::endl; + cerr << "if not specified or if '-' specified." 
<< std::endl; +} + +bool ends_with(const string& check, const string& ending) +{ + return check.size() >= ending.size() && check.rfind(ending) == (check.size() - ending.size()); +} + +// Based on FileStore::dump_journal(), set-up enough to only dump +int mydump_journal(Formatter *f, string journalpath, bool m_journal_dio) +{ + int r; + + if (!journalpath.length()) + return -EINVAL; + + FileJournal *journal = new FileJournal(g_ceph_context, uuid_d(), NULL, NULL, + journalpath.c_str(), m_journal_dio); + r = journal->_fdump(*f, false); + delete journal; + return r; +} + +int apply_layout_settings(ObjectStore *os, const OSDSuperblock &superblock, + const string &pool_name, const spg_t &pgid, bool dry_run, + int target_level) +{ + int r = 0; + + FileStore *fs = dynamic_cast<FileStore*>(os); + if (!fs) { + cerr << "Nothing to do for non-filestore backend" << std::endl; + return 0; // making this return success makes testing easier + } + + OSDMap curmap; + bufferlist bl; + r = get_osdmap(os, superblock.current_epoch, curmap, bl); + if (r) { + cerr << "Can't find local OSDMap: " << cpp_strerror(r) << std::endl; + return r; + } + + int64_t poolid = -1; + if (pool_name.length()) { + poolid = curmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + cerr << "Couldn't find pool " << pool_name << ": " << cpp_strerror(poolid) + << std::endl; + return poolid; + } + } + + vector<coll_t> collections, filtered_colls; + r = os->list_collections(collections); + if (r < 0) { + cerr << "Error listing collections: " << cpp_strerror(r) << std::endl; + return r; + } + + for (auto const &coll : collections) { + spg_t coll_pgid; + if (coll.is_pg(&coll_pgid) && + ((poolid >= 0 && coll_pgid.pool() == (uint64_t)poolid) || + coll_pgid == pgid)) { + filtered_colls.push_back(coll); + } + } + + size_t done = 0, total = filtered_colls.size(); + for (auto const &coll : filtered_colls) { + if (dry_run) { + cerr << "Would apply layout settings to " << coll << std::endl; + } else { + cerr << 
"Finished " << done << "/" << total << " collections" << "\r"; + r = fs->apply_layout_settings(coll, target_level); + if (r < 0) { + cerr << "Error applying layout settings to " << coll << std::endl; + return r; + } + } + ++done; + } + + cerr << "Finished " << total << "/" << total << " collections" << "\r" << std::endl; + return r; +} + +int main(int argc, char **argv) +{ + string dpath, jpath, pgidstr, op, file, mountpoint, mon_store_path, object; + string target_data_path, fsid; + string objcmd, arg1, arg2, type, format, argnspace, pool, rmtypestr; + boost::optional<std::string> nspace; + spg_t pgid; + unsigned epoch = 0; + ghobject_t ghobj; + bool human_readable; + Formatter *formatter; + bool head; + + po::options_description desc("Allowed options"); + desc.add_options() + ("help", "produce help message") + ("type", po::value<string>(&type), + "Arg is one of [bluestore (default), filestore, memstore]") + ("data-path", po::value<string>(&dpath), + "path to object store, mandatory") + ("journal-path", po::value<string>(&jpath), + "path to journal, use if tool can't find it") + ("pgid", po::value<string>(&pgidstr), + "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, and mandatory for apply-layout-settings if --pool is not specified") + ("pool", po::value<string>(&pool), + "Pool name, mandatory for apply-layout-settings if --pgid is not specified") + ("op", po::value<string>(&op), + "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, " + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, statfs]") + ("epoch", po::value<unsigned>(&epoch), + "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified") + ("file", po::value<string>(&file), + "path of file to export, export-remove, import, 
get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap") + ("mon-store-path", po::value<string>(&mon_store_path), + "path of monstore to update-mon-db") + ("fsid", po::value<string>(&fsid), + "fsid for new store created by mkfs") + ("target-data-path", po::value<string>(&target_data_path), + "path of target object store (for --op dup)") + ("mountpoint", po::value<string>(&mountpoint), + "fuse mountpoint") + ("format", po::value<string>(&format)->default_value("json-pretty"), + "Output format which may be json, json-pretty, xml, xml-pretty") + ("debug", "Enable diagnostic output to stderr") + ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE") + ("skip-journal-replay", "Disable journal replay") + ("skip-mount-omap", "Disable mounting of omap") + ("head", "Find head/snapdir when searching for objects by name") + ("dry-run", "Don't modify the objectstore") + ("namespace", po::value<string>(&argnspace), "Specify namespace when searching for objects") + ("rmtype", po::value<string>(&rmtypestr), "Specify corrupting object removal 'snapmap' or 'nosnapmap' - TESTING USE ONLY") + ; + + po::options_description positional("Positional options"); + positional.add_options() + ("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json") + ("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]") + ("arg1", po::value<string>(&arg1), "arg1 based on cmd") + ("arg2", po::value<string>(&arg2), "arg2 based on cmd") + ; + + po::options_description all; + all.add(desc).add(positional); + + po::positional_options_description pd; + pd.add("object", 1).add("objcmd", 1).add("arg1", 1).add("arg2", 1); + + vector<string> ceph_option_strings; + + po::variables_map vm; + try { + po::parsed_options parsed = + po::command_line_parser(argc, argv).options(all).allow_unregistered().positional(pd).run(); + 
po::store( parsed, vm); + po::notify(vm); + ceph_option_strings = po::collect_unrecognized(parsed.options, + po::include_positional); + } catch(po::error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + + if (vm.count("help")) { + usage(desc); + return 1; + } + + // Compatibility with previous option name + if (op == "dump-import") + op = "dump-export"; + + debug = (vm.count("debug") > 0); + + force = (vm.count("force") > 0); + + if (vm.count("namespace")) + nspace = argnspace; + + dry_run = (vm.count("dry-run") > 0); + + osflagbits_t flags = 0; + if (dry_run || vm.count("skip-journal-replay")) + flags |= SKIP_JOURNAL_REPLAY; + if (vm.count("skip-mount-omap")) + flags |= SKIP_MOUNT_OMAP; + if (op == "update-mon-db") + flags |= SKIP_JOURNAL_REPLAY; + + head = (vm.count("head") > 0); + + // infer osd id so we can authenticate + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/whoami", dpath.c_str()); + int fd = ::open(fn, O_RDONLY); + if (fd >= 0) { + bufferlist bl; + bl.read_fd(fd, 64); + string s(bl.c_str(), bl.length()); + int whoami = atoi(s.c_str()); + vector<string> tmp; + // identify ourselves as this osd so we can auth and fetch our configs + tmp.push_back("-n"); + tmp.push_back(string("osd.") + stringify(whoami)); + // populate osd_data so that the default keyring location works + tmp.push_back("--osd-data"); + tmp.push_back(dpath); + tmp.insert(tmp.end(), ceph_option_strings.begin(), + ceph_option_strings.end()); + tmp.swap(ceph_option_strings); + } + + vector<const char *> ceph_options; + ceph_options.reserve(ceph_options.size() + ceph_option_strings.size()); + for (vector<string>::iterator i = ceph_option_strings.begin(); + i != ceph_option_strings.end(); + ++i) { + ceph_options.push_back(i->c_str()); + } + + snprintf(fn, sizeof(fn), "%s/type", dpath.c_str()); + fd = ::open(fn, O_RDONLY); + if (fd >= 0) { + bufferlist bl; + bl.read_fd(fd, 64); + if (bl.length()) { + string dp_type = string(bl.c_str(), bl.length() - 1); // drop \n + if 
(vm.count("type") && dp_type != "" && type != dp_type) + cerr << "WARNING: Ignoring type \"" << type << "\" - found data-path type \"" + << dp_type << "\"" << std::endl; + type = dp_type; + //cout << "object store type is " << type << std::endl; + } + ::close(fd); + } + + if (!vm.count("type") && type == "") { + type = "bluestore"; + } + if (!vm.count("data-path") && + op != "dump-export" && + !(op == "dump-journal" && type == "filestore")) { + cerr << "Must provide --data-path" << std::endl; + usage(desc); + return 1; + } + if (type == "filestore" && !vm.count("journal-path")) { + jpath = dpath + "/journal"; + } + if (!vm.count("op") && !vm.count("object")) { + cerr << "Must provide --op or object command..." << std::endl; + usage(desc); + return 1; + } + if (op != "list" && op != "apply-layout-settings" && + vm.count("op") && vm.count("object")) { + cerr << "Can't specify both --op and object command syntax" << std::endl; + usage(desc); + return 1; + } + if (op == "apply-layout-settings" && !(vm.count("pool") ^ vm.count("pgid"))) { + cerr << "apply-layout-settings requires either --pool or --pgid" + << std::endl; + usage(desc); + return 1; + } + if (op != "list" && op != "apply-layout-settings" && vm.count("object") && !vm.count("objcmd")) { + cerr << "Invalid syntax, missing command" << std::endl; + usage(desc); + return 1; + } + if (op == "fuse" && mountpoint.length() == 0) { + cerr << "Missing fuse mountpoint" << std::endl; + usage(desc); + return 1; + } + outistty = isatty(STDOUT_FILENO); + + file_fd = fd_none; + if ((op == "export" || op == "export-remove" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) { + if (!vm.count("file") || file == "-") { + if (outistty) { + cerr << "stdout is a tty and no --file filename specified" << std::endl; + return 1; + } + file_fd = STDOUT_FILENO; + } else { + file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666); + } + } else if (op == "import" || op == "dump-export" || op == "set-osdmap" || op == 
"set-inc-osdmap") { + if (!vm.count("file") || file == "-") { + if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no --file filename specified" << std::endl; + return 1; + } + file_fd = STDIN_FILENO; + } else { + file_fd = open(file.c_str(), O_RDONLY); + } + } + + ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run); + + if (vm.count("file") && file_fd == fd_none && !dry_run) { + cerr << "--file option only applies to import, dump-export, export, export-remove, " + << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl; + return 1; + } + + if (file_fd != fd_none && file_fd < 0) { + string err = string("file: ") + file; + perror(err.c_str()); + return 1; + } + + auto cct = global_init( + NULL, ceph_options, + CEPH_ENTITY_TYPE_OSD, + CODE_ENVIRONMENT_UTILITY_NODOUT, + 0); + common_init_finish(g_ceph_context); + if (debug) { + g_conf().set_val_or_die("log_to_stderr", "true"); + g_conf().set_val_or_die("err_to_stderr", "true"); + } + g_conf().apply_changes(nullptr); + + // Special list handling. Treating pretty_format as human readable, + // with one object per line and not an enclosing array. 
+ human_readable = ends_with(format, "-pretty"); + if ((op == "list" || op == "meta-list") && human_readable) { + // Remove -pretty from end of format which we know is there + format = format.substr(0, format.size() - strlen("-pretty")); + } + + formatter = Formatter::create(format); + if (formatter == NULL) { + cerr << "unrecognized format: " << format << std::endl; + return 1; + } + + // Special handling for filestore journal, so we can dump it without mounting + if (op == "dump-journal" && type == "filestore") { + int ret = mydump_journal(formatter, jpath, g_conf()->journal_dio); + if (ret < 0) { + cerr << "journal-path: " << jpath << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + formatter->flush(cout); + return 0; + } + + if (op == "dump-export") { + int ret = tool.dump_export(formatter); + if (ret < 0) { + cerr << "dump-export: " + << cpp_strerror(ret) << std::endl; + return 1; + } + return 0; + } + + //Verify that data-path really exists + struct stat st; + if (::stat(dpath.c_str(), &st) == -1) { + string err = string("data-path: ") + dpath; + perror(err.c_str()); + return 1; + } + + if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) { + cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl; + return 1; + } + + //Verify that the journal-path really exists + if (type == "filestore") { + if (::stat(jpath.c_str(), &st) == -1) { + string err = string("journal-path: ") + jpath; + perror(err.c_str()); + return 1; + } + if (S_ISDIR(st.st_mode)) { + cerr << "journal-path: " << jpath << ": " + << cpp_strerror(EISDIR) << std::endl; + return 1; + } + } + + ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags); + if (fs == NULL) { + cerr << "Unable to create store of type " << type << std::endl; + return 1; + } + + if (op == "fsck" || op == "fsck-deep") { + int r = fs->fsck(op == "fsck-deep"); + if (r < 0) { + cerr << "fsck failed: " << cpp_strerror(r) << std::endl; + return 1; + } + if (r > 0) { + cerr << "fsck 
found " << r << " errors" << std::endl; + return 1; + } + cout << "fsck found no errors" << std::endl; + return 0; + } + if (op == "repair" || op == "repair-deep") { + int r = fs->repair(op == "repair-deep"); + if (r < 0) { + cerr << "repair failed: " << cpp_strerror(r) << std::endl; + return 1; + } + if (r > 0) { + cerr << "repair found " << r << " errors" << std::endl; + return 1; + } + cout << "repair found no errors" << std::endl; + return 0; + } + if (op == "mkfs") { + if (fsid.length()) { + uuid_d f; + bool r = f.parse(fsid.c_str()); + if (!r) { + cerr << "failed to parse uuid '" << fsid << "'" << std::endl; + return 1; + } + fs->set_fsid(f); + } + int r = fs->mkfs(); + if (r < 0) { + cerr << "mkfs failed: " << cpp_strerror(r) << std::endl; + return 1; + } + return 0; + } + if (op == "dup") { + string target_type; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/type", target_data_path.c_str()); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + cerr << "Unable to open " << target_data_path << "/type" << std::endl; + exit(1); + } + bufferlist bl; + bl.read_fd(fd, 64); + if (bl.length()) { + target_type = string(bl.c_str(), bl.length() - 1); // drop \n + } + ::close(fd); + ObjectStore *targetfs = ObjectStore::create( + g_ceph_context, target_type, + target_data_path, "", 0); + if (targetfs == NULL) { + cerr << "Unable to open store of type " << target_type << std::endl; + return 1; + } + int r = dup(dpath, fs, target_data_path, targetfs); + if (r < 0) { + cerr << "dup failed: " << cpp_strerror(r) << std::endl; + return 1; + } + return 0; + } + + int ret = fs->mount(); + if (ret < 0) { + if (ret == -EBUSY) { + cerr << "OSD has the store locked" << std::endl; + } else { + cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl; + } + return 1; + } + + if (op == "fuse") { +#ifdef HAVE_LIBFUSE + FuseStore fuse(fs, mountpoint); + cout << "mounting fuse at " << mountpoint << " ..." 
<< std::endl; + int r = fuse.main(); + if (r < 0) { + cerr << "failed to mount fuse: " << cpp_strerror(r) << std::endl; + return 1; + } +#else + cerr << "fuse support not enabled" << std::endl; +#endif + return 0; + } + + vector<coll_t> ls; + vector<coll_t>::iterator it; + CompatSet supported; + +#ifdef INTERNAL_TEST + supported = get_test_compat_set(); +#else + supported = OSD::get_osd_compat_set(); +#endif + + bufferlist bl; + OSDSuperblock superblock; + auto ch = fs->open_collection(coll_t::meta()); + bufferlist::const_iterator p; + ret = fs->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); + if (ret < 0) { + cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl; + goto out; + } + + p = bl.cbegin(); + decode(superblock, p); + + if (debug) { + cerr << "Cluster fsid=" << superblock.cluster_fsid << std::endl; + } + + if (debug) { + cerr << "Supported features: " << supported << std::endl; + cerr << "On-disk features: " << superblock.compat_features << std::endl; + } + if (supported.compare(superblock.compat_features) == -1) { + CompatSet unsupported = supported.unsupported(superblock.compat_features); + cerr << "On-disk OSD incompatible features set " + << unsupported << std::endl; + ret = -EINVAL; + goto out; + } + + if (op == "apply-layout-settings") { + int target_level = 0; + // Single positional argument with apply-layout-settings + // for target_level. + if (vm.count("object") && isdigit(object[0])) { + target_level = atoi(object.c_str()); + // This requires --arg1 to be specified since + // this is the third positional argument and normally + // used with object operations. + } else if (vm.count("arg1") && isdigit(arg1[0])) { + target_level = atoi(arg1.c_str()); + } + ret = apply_layout_settings(fs, superblock, pool, pgid, dry_run, target_level); + goto out; + } + + if (op != "list" && vm.count("object")) { + // Special case: Create pgmeta_oid if empty string specified + // This can't conflict with any actual object names. 
+ if (object == "") { + ghobj = pgid.make_pgmeta_oid(); + } else { + json_spirit::Value v; + try { + if (!json_spirit::read(object, v) || + (v.type() != json_spirit::array_type && v.type() != json_spirit::obj_type)) { + // Special: Need head/snapdir so set even if user didn't specify + if (vm.count("objcmd") && (objcmd == "remove-clone-metadata")) + head = true; + lookup_ghobject lookup(object, nspace, head); + if (pgidstr.length()) + ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), lookup, debug); + else + ret = action_on_all_objects(fs, lookup, debug); + if (ret) { + throw std::runtime_error("Internal error"); + } else { + if (lookup.size() != 1) { + stringstream ss; + if (lookup.size() == 0) + ss << "No object id '" << object << "' found or invalid JSON specified"; + else + ss << "Found " << lookup.size() << " objects with id '" << object + << "', please use a JSON spec from --op list instead"; + throw std::runtime_error(ss.str()); + } + pair<coll_t, ghobject_t> found = lookup.pop(); + pgidstr = found.first.to_str(); + pgid.parse(pgidstr.c_str()); + ghobj = found.second; + } + } else { + stringstream ss; + if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) { + ss << "Without --pgid the object '" << object + << "' must be a JSON array"; + throw std::runtime_error(ss.str()); + } + if (v.type() == json_spirit::array_type) { + json_spirit::Array array = v.get_array(); + if (array.size() != 2) { + ss << "Object '" << object + << "' must be a JSON array with 2 elements"; + throw std::runtime_error(ss.str()); + } + vector<json_spirit::Value>::iterator i = array.begin(); + ceph_assert(i != array.end()); + if (i->type() != json_spirit::str_type) { + ss << "Object '" << object + << "' must be a JSON array with the first element a string"; + throw std::runtime_error(ss.str()); + } + string object_pgidstr = i->get_str(); + if (object_pgidstr != "meta") { + spg_t object_pgid; + object_pgid.parse(object_pgidstr.c_str()); + if (pgidstr.length() > 0) { + 
if (object_pgid != pgid) { + ss << "object '" << object + << "' has a pgid different from the --pgid=" + << pgidstr << " option"; + throw std::runtime_error(ss.str()); + } + } else { + pgidstr = object_pgidstr; + pgid = object_pgid; + } + } else { + pgidstr = object_pgidstr; + } + ++i; + v = *i; + } + try { + ghobj.decode(v); + } catch (std::runtime_error& e) { + ss << "Decode object JSON error: " << e.what(); + throw std::runtime_error(ss.str()); + } + if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) { + cerr << "Object pool and pgid pool don't match" << std::endl; + ret = 1; + goto out; + } + } + } catch (std::runtime_error& e) { + cerr << e.what() << std::endl; + ret = 1; + goto out; + } + } + } + + // The ops which require --pgid option are checked here and + // mentioned in the usage for --pgid. + if ((op == "info" || op == "log" || op == "remove" || op == "export" + || op == "export-remove" || op == "mark-complete" + || op == "reset-last-complete" + || op == "trim-pg-log") && + pgidstr.length() == 0) { + cerr << "Must provide pgid" << std::endl; + usage(desc); + ret = 1; + goto out; + } + + if (op == "import") { + + try { + ret = tool.do_import(fs, superblock, force, pgidstr); + } + catch (const buffer::error &e) { + cerr << "do_import threw exception error " << e.what() << std::endl; + ret = -EFAULT; + } + if (ret == -EFAULT) { + cerr << "Corrupt input for import" << std::endl; + } + if (ret == 0) + cout << "Import successful" << std::endl; + goto out; + } else if (op == "dump-journal-mount") { + // Undocumented feature to dump journal with mounted fs + // This doesn't support the format option, but it uses the + // ObjectStore::dump_journal() and mounts to get replay to run. 
+ ret = fs->dump_journal(cout); + if (ret) { + if (ret == -EOPNOTSUPP) { + cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl; + } else { + cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl; + } + } + goto out; + } else if (op == "get-osdmap") { + bufferlist bl; + OSDMap osdmap; + if (epoch == 0) { + epoch = superblock.current_epoch; + } + ret = get_osdmap(fs, epoch, osdmap, bl); + if (ret) { + cerr << "Failed to get osdmap#" << epoch << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = bl.write_fd(file_fd); + if (ret) { + cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl; + } else { + cout << "osdmap#" << epoch << " exported." << std::endl; + } + goto out; + } else if (op == "set-osdmap") { + bufferlist bl; + ret = get_fd_data(file_fd, bl); + if (ret < 0) { + cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl; + } else { + ret = set_osdmap(fs, epoch, bl, force); + } + goto out; + } else if (op == "get-inc-osdmap") { + bufferlist bl; + if (epoch == 0) { + epoch = superblock.current_epoch; + } + ret = get_inc_osdmap(fs, epoch, bl); + if (ret < 0) { + cerr << "Failed to get incremental osdmap# " << epoch << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = bl.write_fd(file_fd); + if (ret) { + cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl; + } else { + cout << "inc-osdmap#" << epoch << " exported." 
<< std::endl; + } + goto out; + } else if (op == "set-inc-osdmap") { + bufferlist bl; + ret = get_fd_data(file_fd, bl); + if (ret < 0) { + cerr << "Failed to read incremental osdmap " << cpp_strerror(ret) << std::endl; + goto out; + } else { + ret = set_inc_osdmap(fs, epoch, bl, force); + } + goto out; + } else if (op == "update-mon-db") { + if (!vm.count("mon-store-path")) { + cerr << "Please specify the path to monitor db to update" << std::endl; + ret = -EINVAL; + } else { + ret = update_mon_db(*fs, superblock, dpath + "/keyring", mon_store_path); + } + goto out; + } + + if (op == "remove") { + if (!force && !dry_run) { + cerr << "Please use export-remove or you must use --force option" << std::endl; + ret = -EINVAL; + goto out; + } + ret = initiate_new_remove_pg(fs, pgid); + if (ret < 0) { + cerr << "PG '" << pgid << "' not found" << std::endl; + goto out; + } + cout << "Remove successful" << std::endl; + goto out; + } + + if (op == "fix-lost") { + boost::scoped_ptr<action_on_object_t> action; + action.reset(new do_fix_lost()); + if (pgidstr.length()) + ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), *action, debug); + else + ret = action_on_all_objects(fs, *action, debug); + goto out; + } + + if (op == "list") { + ret = do_list(fs, pgidstr, object, nspace, formatter, debug, + human_readable, head); + if (ret < 0) { + cerr << "do_list failed: " << cpp_strerror(ret) << std::endl; + } + goto out; + } + + if (op == "dump-super") { + formatter->open_object_section("superblock"); + superblock.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + goto out; + } + + if (op == "statfs") { + store_statfs_t statsbuf; + ret = fs->statfs(&statsbuf); + if (ret < 0) { + cerr << "error from statfs: " << cpp_strerror(ret) << std::endl; + goto out; + } + formatter->open_object_section("statfs"); + statsbuf.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + goto out; + } + + if (op == 
"meta-list") { + ret = do_meta(fs, object, formatter, debug, human_readable); + if (ret < 0) { + cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl; + } + goto out; + } + + ret = fs->list_collections(ls); + if (ret < 0) { + cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl; + goto out; + } + + if (debug && op == "list-pgs") + cout << "Performing list-pgs operation" << std::endl; + + // Find pg + for (it = ls.begin(); it != ls.end(); ++it) { + spg_t tmppgid; + + if (pgidstr == "meta") { + if (it->to_str() == "meta") + break; + else + continue; + } + + if (!it->is_pg(&tmppgid)) { + continue; + } + + if (it->is_temp(&tmppgid)) { + continue; + } + + if (op != "list-pgs" && tmppgid != pgid) { + continue; + } + + if (op != "list-pgs") { + //Found! + break; + } + + cout << tmppgid << std::endl; + } + + if (op == "list-pgs") { + ret = 0; + goto out; + } + + // If not an object command nor any of the ops handled below, then output this usage + // before complaining about a bad pgid + if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log") { + cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, " + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, statfs)" + << std::endl; + usage(desc); + ret = 1; + goto out; + } + epoch_t map_epoch; +// The following code for export, info, log require omap or !skip-mount-omap + if (it != ls.end()) { + + coll_t coll = *it; + + if (vm.count("objcmd")) { + ret = 0; + if (objcmd == "remove" || objcmd == "removeall") { + bool all = (objcmd == "removeall"); + enum rmtype type = BOTH; + if (rmtypestr == "nosnapmap") + type = NOSNAPMAP; + else if (rmtypestr == "snapmap") + type = SNAPMAP; + ret = do_remove_object(fs, coll, ghobj, all, force, type); + goto out; 
+ } else if (objcmd == "list-attrs") { + ret = do_list_attrs(fs, coll, ghobj); + goto out; + } else if (objcmd == "list-omap") { + ret = do_list_omap(fs, coll, ghobj); + goto out; + } else if (objcmd == "get-bytes" || objcmd == "set-bytes") { + if (objcmd == "get-bytes") { + int fd; + if (vm.count("arg1") == 0 || arg1 == "-") { + fd = STDOUT_FILENO; + } else { + fd = open(arg1.c_str(), O_WRONLY|O_TRUNC|O_CREAT|O_EXCL|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_get_bytes(fs, coll, ghobj, fd); + if (fd != STDOUT_FILENO) + close(fd); + } else { + int fd; + if (vm.count("arg1") == 0 || arg1 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. + if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_bytes(fs, coll, ghobj, fd); + if (fd != STDIN_FILENO) + close(fd); + } + goto out; + } else if (objcmd == "get-attr") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_get_attr(fs, coll, ghobj, arg1); + goto out; + } else if (objcmd == "set-attr") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + } + + int fd; + if (vm.count("arg2") == 0 || arg2 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. 
+ if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_attr(fs, coll, ghobj, arg1, fd); + if (fd != STDIN_FILENO) + close(fd); + goto out; + } else if (objcmd == "rm-attr") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_rm_attr(fs, coll, ghobj, arg1); + goto out; + } else if (objcmd == "get-omap") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_get_omap(fs, coll, ghobj, arg1); + goto out; + } else if (objcmd == "set-omap") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + int fd; + if (vm.count("arg2") == 0 || arg2 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. + if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_omap(fs, coll, ghobj, arg1, fd); + if (fd != STDIN_FILENO) + close(fd); + goto out; + } else if (objcmd == "rm-omap") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_rm_omap(fs, coll, ghobj, arg1); + goto out; + } else if (objcmd == "get-omaphdr") { + if (vm.count("arg1")) { + usage(desc); + ret = 1; + goto out; + } + ret = do_get_omaphdr(fs, coll, ghobj); + goto out; + } else if (objcmd == "set-omaphdr") { + // Extra arg + if (vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + int fd; + if (vm.count("arg1") == 0 || arg1 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. 
+ if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_omaphdr(fs, coll, ghobj, fd); + if (fd != STDIN_FILENO) + close(fd); + goto out; + } else if (objcmd == "dump") { + // There should not be any other arguments + if (vm.count("arg1") || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + ret = print_obj_info(fs, coll, ghobj, formatter); + goto out; + } else if (objcmd == "corrupt-info") { // Undocumented testing feature + // There should not be any other arguments + if (vm.count("arg1") || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + ret = corrupt_info(fs, coll, ghobj, formatter); + goto out; + } else if (objcmd == "set-size" || objcmd == "corrupt-size") { + // Undocumented testing feature + bool corrupt = (objcmd == "corrupt-size"); + // Extra arg + if (vm.count("arg1") == 0 || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) { + cerr << "Invalid size '" << arg1 << "' specified" << std::endl; + ret = 1; + goto out; + } + uint64_t size = atoll(arg1.c_str()); + ret = set_size(fs, coll, ghobj, size, formatter, corrupt); + goto out; + } else if (objcmd == "clear-data-digest") { + ret = clear_data_digest(fs, coll, ghobj); + goto out; + } else if (objcmd == "clear-snapset") { + // UNDOCUMENTED: For testing zap SnapSet + // IGNORE extra args since not in usage anyway + if (!ghobj.hobj.has_snapset()) { + cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl; + ret = 1; + goto out; + } + ret = clear_snapset(fs, coll, ghobj, arg1); + goto out; + } else if (objcmd == "remove-clone-metadata") { + // Extra arg + if (vm.count("arg1") == 0 || vm.count("arg2")) { + usage(desc); 
+ ret = 1; + goto out; + } + if (!ghobj.hobj.has_snapset()) { + cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl; + ret = 1; + goto out; + } + if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) { + cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl; + ret = 1; + goto out; + } + snapid_t cloneid = atoi(arg1.c_str()); + ret = remove_clone(fs, coll, ghobj, cloneid, force); + goto out; + } + cerr << "Unknown object command '" << objcmd << "'" << std::endl; + usage(desc); + ret = 1; + goto out; + } + + map_epoch = 0; + ret = PG::peek_map_epoch(fs, pgid, &map_epoch); + if (ret < 0) + cerr << "peek_map_epoch reports error" << std::endl; + if (debug) + cerr << "map_epoch " << map_epoch << std::endl; + + pg_info_t info(pgid); + PastIntervals past_intervals; + __u8 struct_ver; + ret = PG::read_info(fs, pgid, coll, info, past_intervals, struct_ver); + if (ret < 0) { + cerr << "read_info error " << cpp_strerror(ret) << std::endl; + goto out; + } + if (struct_ver < PG::get_compat_struct_v()) { + cerr << "PG is too old to upgrade, use older Ceph version" << std::endl; + ret = -EFAULT; + goto out; + } + if (debug) + cerr << "struct_v " << (int)struct_ver << std::endl; + + if (op == "export" || op == "export-remove") { + ret = tool.do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock, past_intervals); + if (ret == 0) { + cerr << "Export successful" << std::endl; + if (op == "export-remove") { + ret = initiate_new_remove_pg(fs, pgid); + // Export succeeded, so pgid is there + ceph_assert(ret == 0); + cerr << "Remove successful" << std::endl; + } + } + } else if (op == "info") { + formatter->open_object_section("info"); + info.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } else if (op == "log") { + PGLog::IndexedLog log; + pg_missing_t missing; + ret = get_log(fs, struct_ver, pgid, info, log, missing); + if (ret < 0) + goto out; + + dump_log(formatter, cout, log, missing); + } 
else if (op == "mark-complete") { + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (struct_ver < PG::get_compat_struct_v()) { + cerr << "Can't mark-complete, version mismatch " << (int)struct_ver + << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)" + << std::endl; + ret = 1; + goto out; + } + + cout << "Marking complete " << std::endl; + + info.last_update = eversion_t(superblock.current_epoch, info.last_update.version + 1); + info.last_backfill = hobject_t::get_max(); + info.last_epoch_started = superblock.current_epoch; + info.history.last_epoch_started = superblock.current_epoch; + info.history.last_epoch_clean = superblock.current_epoch; + past_intervals.clear(); + + if (!dry_run) { + ret = write_info(*t, map_epoch, info, past_intervals); + if (ret != 0) + goto out; + auto ch = fs->open_collection(coll_t(pgid)); + fs->queue_transaction(ch, std::move(*t)); + } + cout << "Marking complete succeeded" << std::endl; + } else if (op == "trim-pg-log") { + ret = do_trim_pg_log(fs, coll, info, pgid, + map_epoch, past_intervals); + if (ret < 0) { + cerr << "Error trimming pg log: " << cpp_strerror(ret) << std::endl; + goto out; + } + cout << "Finished trimming pg log" << std::endl; + goto out; + } else if (op == "reset-last-complete") { + if (!force) { + std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost " + << "certain to lead to permanent data loss unless you know exactly " + << "what you are doing. Pass --force to proceed anyway." 
+ << std::endl; + ret = -EINVAL; + goto out; + } + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (struct_ver < PG::get_compat_struct_v()) { + cerr << "Can't reset-last-complete, version mismatch " << (int)struct_ver + << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)" + << std::endl; + ret = 1; + goto out; + } + + cout << "Reseting last_complete " << std::endl; + + info.last_complete = info.last_update; + + if (!dry_run) { + ret = write_info(*t, map_epoch, info, past_intervals); + if (ret != 0) + goto out; + fs->queue_transaction(ch, std::move(*t)); + } + cout << "Reseting last_complete succeeded" << std::endl; + + } else { + ceph_assert(!"Should have already checked for valid --op"); + } + } else { + cerr << "PG '" << pgid << "' not found" << std::endl; + ret = -ENOENT; + } + +out: + int r = fs->umount(); + if (r < 0) { + cerr << "umount failed: " << cpp_strerror(r) << std::endl; + // If no previous error, then use umount() error + if (ret == 0) + ret = r; + } + + if (dry_run) { + // Export output can go to stdout, so put this message on stderr + if (op == "export") + cerr << "dry-run: Nothing changed" << std::endl; + else + cout << "dry-run: Nothing changed" << std::endl; + } + + if (ret < 0) + ret = 1; + return ret; +} diff --git a/src/tools/ceph_objectstore_tool.h b/src/tools/ceph_objectstore_tool.h new file mode 100644 index 00000000..aafe886b --- /dev/null +++ b/src/tools/ceph_objectstore_tool.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OBJECTSTORE_TOOL_H_ +#define CEPH_OBJECTSTORE_TOOL_H_ + +#include "RadosDump.h" + +/* ObjectStoreTool: the helper object that the ceph-objectstore-tool main() constructs as ObjectStoreTool(file_fd, dry_run) and uses for its import, export and dump-export operations. It derives from RadosDump; only declarations live in this header. */ +class ObjectStoreTool : public RadosDump +{ + public: + /* Passes the data file descriptor and the dry-run flag straight through to the RadosDump base. */ + ObjectStoreTool(int file_fd, bool dry_run) + : RadosDump(file_fd, dry_run) + {} + + /* Invoked for --op dump-export; the caller treats a negative return as an errno-style failure. */ + int dump_export(Formatter *formatter); + /* Invoked for --op import; the caller reports -EFAULT as corrupt input and 0 as a successful import. */ + int do_import(ObjectStore *store, OSDSuperblock& sb, bool force, + std::string pgidstr); + /* Invoked for --op export and --op export-remove; the caller treats 0 as a successful export. */ + int do_export(ObjectStore *fs, coll_t coll, spg_t pgid, + pg_info_t &info, epoch_t map_epoch, __u8 struct_ver, + const OSDSuperblock& superblock, + PastIntervals &past_intervals); + /* NOTE(review): the four members below are not called from the code visible alongside this header; presumably they are the per-object helpers behind the operations above - confirm against the implementation file. */ + int dump_object(Formatter *formatter, + bufferlist &bl); + int get_object( + ObjectStore *store, OSDriver& driver, SnapMapper& mapper, coll_t coll, + bufferlist &bl, OSDMap &curmap, bool *skipped_objects); + int export_file( + ObjectStore *store, coll_t cid, ghobject_t &obj); + int export_files(ObjectStore *store, coll_t coll); +}; + +#endif // CEPH_OBJECTSTORE_TOOL_H_ diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc new file mode 100644 index 00000000..8e15851d --- /dev/null +++ b/src/tools/ceph_osdomap_tool.cc @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. 
+*/ +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> + +#include <stdlib.h> +#include <string> + +#include "common/errno.h" +#include "global/global_init.h" + +#include "os/filestore/DBObjectMap.h" +#include "kv/KeyValueDB.h" + +namespace po = boost::program_options; + +int main(int argc, char **argv) { + po::options_description desc("Allowed options"); + string store_path, cmd, oid, backend; + bool debug = false; + desc.add_options() + ("help", "produce help message") + ("omap-path", po::value<string>(&store_path), + "path to omap directory, mandatory (current/omap usually)") + ("paranoid", "use paranoid checking") + ("debug", "Additional debug output from DBObjectMap") + ("oid", po::value<string>(&oid), "Restrict to this object id when dumping objects") + ("command", po::value<string>(&cmd), + "command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check, dump-headers, repair, compact], mandatory") + ("backend", po::value<string>(&backend), + "DB backend (default rocksdb)") + ; + po::positional_options_description p; + p.add("command", 1); + + vector<string> ceph_option_strings; + po::variables_map vm; + try { + po::parsed_options parsed = + po::command_line_parser(argc, argv).options(desc).positional(p).allow_unregistered().run(); + po::store( + parsed, + vm); + po::notify(vm); + + ceph_option_strings = po::collect_unrecognized(parsed.options, + po::include_positional); + } catch(po::error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + + vector<const char *> ceph_options; + ceph_options.reserve(ceph_option_strings.size()); + for (vector<string>::iterator i = ceph_option_strings.begin(); + i != ceph_option_strings.end(); + ++i) { + ceph_options.push_back(i->c_str()); + } + + if (vm.count("debug")) debug = true; + + if (vm.count("help")) { + std::cerr << desc << std::endl; + return 1; + } + + auto cct = global_init( + NULL, ceph_options, CEPH_ENTITY_TYPE_OSD, + 
CODE_ENVIRONMENT_UTILITY_NODOUT, 0); + common_init_finish(g_ceph_context); + cct->_conf.apply_changes(nullptr); + if (debug) { + g_conf().set_val_or_die("log_to_stderr", "true"); + g_conf().set_val_or_die("err_to_stderr", "true"); + } + g_conf().apply_changes(nullptr); + + if (vm.count("omap-path") == 0) { + std::cerr << "Required argument --omap-path" << std::endl; + return 1; + } + + if (vm.count("command") == 0) { + std::cerr << "Required argument --command" << std::endl; + return 1; + } + + if (vm.count("backend") == 0) { + backend = "rocksdb"; + } + + KeyValueDB* store(KeyValueDB::create(g_ceph_context, backend, store_path)); + if (store == NULL) { + std::cerr << "Invalid backend '" << backend << "' specified" << std::endl; + return 1; + } + /*if (vm.count("paranoid")) { + std::cerr << "Enabling paranoid checks" << std::endl; + store->options.paranoid_checks = true; + }*/ + DBObjectMap omap(cct.get(), store); + stringstream out; + int r = store->open(out); + if (r < 0) { + std::cerr << "Store open got: " << cpp_strerror(r) << std::endl; + std::cerr << "Output: " << out.str() << std::endl; + return r; + } + // We don't call omap.init() here because it will repair + // the DBObjectMap which we might want to examine for diagnostic + // reasons. Instead use --command repair. + + omap.get_state(); + std::cout << "Version: " << (int)omap.state.v << std::endl; + std::cout << "Seq: " << omap.state.seq << std::endl; + std::cout << "legacy: " << (omap.state.legacy ? 
"true" : "false") << std::endl; + + if (cmd == "dump-raw-keys") { + KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator(); + for (i->seek_to_first(); i->valid(); i->next()) { + std::cout << i->raw_key() << std::endl; + } + return 0; + } else if (cmd == "dump-raw-key-vals") { + KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator(); + for (i->seek_to_first(); i->valid(); i->next()) { + std::cout << i->raw_key() << std::endl; + i->value().hexdump(std::cout); + } + return 0; + } else if (cmd == "dump-objects") { + vector<ghobject_t> objects; + r = omap.list_objects(&objects); + if (r < 0) { + std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl; + return r; + } + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (vm.count("oid") != 0 && i->hobj.oid.name != oid) + continue; + std::cout << *i << std::endl; + } + return 0; + } else if (cmd == "dump-objects-with-keys") { + vector<ghobject_t> objects; + r = omap.list_objects(&objects); + if (r < 0) { + std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl; + return r; + } + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (vm.count("oid") != 0 && i->hobj.oid.name != oid) + continue; + std::cout << "Object: " << *i << std::endl; + ObjectMap::ObjectMapIterator j = omap.get_iterator(ghobject_t(i->hobj)); + for (j->seek_to_first(); j->valid(); j->next()) { + std::cout << j->key() << std::endl; + j->value().hexdump(std::cout); + } + } + return 0; + } else if (cmd == "check" || cmd == "repair") { + ostringstream ss; + bool repair = (cmd == "repair"); + r = omap.check(ss, repair, true); + if (r) { + std::cerr << ss.str() << std::endl; + if (r > 0) { + std::cerr << "check got " << r << " error(s)" << std::endl; + return 1; + } + } + std::cout << (repair ? 
"repair" : "check") << " succeeded" << std::endl; + return 0; + } else if (cmd == "dump-headers") { + vector<DBObjectMap::_Header> headers; + r = omap.list_object_headers(&headers); + if (r < 0) { + std::cerr << "list_object_headers got: " << cpp_strerror(r) << std::endl; + return 1; + } + for (auto i : headers) + std::cout << i << std::endl; + return 0; + } else if (cmd == "resetv2") { + omap.state.v = 2; + omap.state.legacy = false; + omap.set_state(); + } else if (cmd == "compact") { + omap.compact(); + return 0; + } else { + std::cerr << "Did not recognize command " << cmd << std::endl; + return 1; + } +} diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt new file mode 100644 index 00000000..2cca8dc0 --- /dev/null +++ b/src/tools/cephfs/CMakeLists.txt @@ -0,0 +1,49 @@ +set(cephfs_journal_tool_srcs + cephfs-journal-tool.cc + JournalTool.cc + JournalFilter.cc + JournalScanner.cc + EventOutput.cc + Dumper.cc + Resetter.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs}) +target_link_libraries(cephfs-journal-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs_table_tool_srcs + cephfs-table-tool.cc + TableTool.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-table-tool ${cephfs_table_tool_srcs}) +target_link_libraries(cephfs-table-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs_data_scan_srcs + cephfs-data-scan.cc + DataScan.cc + RoleSelector.cc + PgFiles.cc + MDSUtility.cc) +add_executable(cephfs-data-scan ${cephfs_data_scan_srcs}) +target_link_libraries(cephfs-data-scan librados cephfs mds osdc global + cls_cephfs_client + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +install(TARGETS + cephfs-journal-tool + cephfs-table-tool + cephfs-data-scan + DESTINATION bin) + +option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF) +if(WITH_CEPHFS_SHELL) + if(NOT WITH_PYTHON3) + message(SEND_ERROR "Please enable WITH_PYTHON3 for 
cephfs-shell") + endif() + set(PYTHON_VERSION 3) + include(Distutils) + distutils_install_module(cephfs-shell) +endif() diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc new file mode 100644 index 00000000..8fb670ad --- /dev/null +++ b/src/tools/cephfs/DataScan.cc @@ -0,0 +1,2188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include <fstream> +#include "include/util.h" + +#include "mds/CInode.h" +#include "mds/InoTable.h" +#include "mds/SnapServer.h" +#include "cls/cephfs/cls_cephfs_client.h" + +#include "PgFiles.h" +#include "DataScan.h" +#include "include/compat.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "datascan." 
<< __func__ << ": " + +void DataScan::usage() +{ + std::cout << "Usage: \n" + << " cephfs-data-scan init [--force-init]\n" + << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n" + << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n" + << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n" + << " cephfs-data-scan scan_links\n" + << "\n" + << " --force-corrupt: overrite apparently corrupt structures\n" + << " --force-init: write root inodes even if they exist\n" + << " --force-pool: use data pool even if it is not in FSMap\n" + << " --worker_m: Maximum number of workers\n" + << " --worker_n: Worker number, range 0-(worker_m-1)\n" + << "\n" + << " cephfs-data-scan scan_frags [--force-corrupt]\n" + << " cephfs-data-scan cleanup <data pool name>\n" + << std::endl; + + generic_client_usage(); +} + +bool DataScan::parse_kwarg( + const std::vector<const char*> &args, + std::vector<const char *>::const_iterator &i, + int *r) +{ + if (i + 1 == args.end()) { + return false; + } + + const std::string arg(*i); + const std::string val(*(i + 1)); + + if (arg == std::string("--output-dir")) { + if (driver != NULL) { + derr << "Unexpected --output-dir: output already selected!" 
<< dendl; + *r = -EINVAL; + return false; + } + dout(4) << "Using local file output to '" << val << "'" << dendl; + driver = new LocalFileDriver(val, data_io); + return true; + } else if (arg == std::string("--worker_n")) { + std::string err; + n = strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + std::cerr << "Invalid worker number '" << val << "'" << std::endl; + *r = -EINVAL; + return false; + } + return true; + } else if (arg == std::string("--worker_m")) { + std::string err; + m = strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + std::cerr << "Invalid worker count '" << val << "'" << std::endl; + *r = -EINVAL; + return false; + } + return true; + } else if (arg == std::string("--filter-tag")) { + filter_tag = val; + dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl; + return true; + } else if (arg == std::string("--filesystem")) { + std::shared_ptr<const Filesystem> fs; + *r = fsmap->parse_filesystem(val, &fs); + if (*r != 0) { + std::cerr << "Invalid filesystem '" << val << "'" << std::endl; + return false; + } + fscid = fs->fscid; + return true; + } else if (arg == std::string("--alternate-pool")) { + metadata_pool_name = val; + return true; + } else { + return false; + } +} + +bool DataScan::parse_arg( + const std::vector<const char*> &args, + std::vector<const char *>::const_iterator &i) +{ + const std::string arg(*i); + if (arg == "--force-pool") { + force_pool = true; + return true; + } else if (arg == "--force-corrupt") { + force_corrupt = true; + return true; + } else if (arg == "--force-init") { + force_init = true; + return true; + } else { + return false; + } +} + +int DataScan::main(const std::vector<const char*> &args) +{ + // Parse args + // ========== + if (args.size() < 1) { + cerr << "missing position argument" << std::endl; + return -EINVAL; + } + + // Common RADOS init: open metadata pool + // ===================================== + librados::Rados rados; + int r = 
rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable" << dendl; + return r; + } + + std::string const &command = args[0]; + std::string data_pool_name; + + std::string pg_files_path; + std::set<pg_t> pg_files_pgs; + + // Consume any known --key val or --flag arguments + for (std::vector<const char *>::const_iterator i = args.begin() + 1; + i != args.end(); ++i) { + if (parse_kwarg(args, i, &r)) { + // Skip the kwarg value field + ++i; + continue; + } else if (r) { + return r; + } + + if (parse_arg(args, i)) { + continue; + } + + // Trailing positional argument + if (i + 1 == args.end() && + (command == "scan_inodes" + || command == "scan_extents" + || command == "cleanup")) { + data_pool_name = *i; + continue; + } + + if (command == "pg_files") { + if (i == args.begin() + 1) { + pg_files_path = *i; + continue; + } else { + pg_t pg; + bool parsed = pg.parse(*i); + if (!parsed) { + std::cerr << "Invalid PG '" << *i << "'" << std::endl; + return -EINVAL; + } else { + pg_files_pgs.insert(pg); + continue; + } + } + + } + + // Fall through: unhandled + std::cerr << "Unknown argument '" << *i << "'" << std::endl; + return -EINVAL; + } + + // If caller didn't specify a namespace, try to pick + // one if only one exists + if (fscid == FS_CLUSTER_ID_NONE) { + if (fsmap->filesystem_count() == 1) { + fscid = fsmap->get_filesystem()->fscid; + } else { + std::cerr << "Specify a filesystem with --filesystem" << std::endl; + return -EINVAL; + } + } + auto fs = fsmap->get_filesystem(fscid); + ceph_assert(fs != nullptr); + + // Default to output to metadata pool + if (driver == NULL) { + driver = new MetadataDriver(); + driver->set_force_corrupt(force_corrupt); + driver->set_force_init(force_init); + dout(4) << "Using metadata pool output" << dendl; + } + + dout(4) << "connecting to RADOS..." 
<< dendl; + r = rados.connect(); + if (r < 0) { + std::cerr << "couldn't connect to cluster: " << cpp_strerror(r) + << std::endl; + return r; + } + + r = driver->init(rados, metadata_pool_name, fsmap, fscid); + if (r < 0) { + return r; + } + + if (command == "pg_files") { + auto pge = PgFiles(objecter, pg_files_pgs); + pge.init(); + return pge.scan_path(pg_files_path); + } + + // Initialize data_io for those commands that need it + if (command == "scan_inodes" || + command == "scan_extents" || + command == "cleanup") { + if (data_pool_name.empty()) { + std::cerr << "Data pool not specified" << std::endl; + return -EINVAL; + } + + data_pool_id = rados.pool_lookup(data_pool_name.c_str()); + if (data_pool_id < 0) { + std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl; + return -ENOENT; + } else { + dout(4) << "data pool '" << data_pool_name + << "' has ID " << data_pool_id << dendl; + } + + if (!fs->mds_map.is_data_pool(data_pool_id)) { + std::cerr << "Warning: pool '" << data_pool_name << "' is not a " + "CephFS data pool!" << std::endl; + if (!force_pool) { + std::cerr << "Use --force-pool to continue" << std::endl; + return -EINVAL; + } + } + + dout(4) << "opening data pool '" << data_pool_name << "'" << dendl; + r = rados.ioctx_create(data_pool_name.c_str(), data_io); + if (r != 0) { + return r; + } + } + + // Initialize metadata_io from MDSMap for scan_frags + if (command == "scan_frags" || command == "scan_links") { + const auto fs = fsmap->get_filesystem(fscid); + if (fs == nullptr) { + std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl; + return -ENOENT; + } + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + std::cerr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" 
<< std::endl; + return r; + } + + r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); + if (r != 0) { + return r; + } + + data_pools = fs->mds_map.get_data_pools(); + } + + // Finally, dispatch command + if (command == "scan_inodes") { + return scan_inodes(); + } else if (command == "scan_extents") { + return scan_extents(); + } else if (command == "scan_frags") { + return scan_frags(); + } else if (command == "scan_links") { + return scan_links(); + } else if (command == "cleanup") { + return cleanup(); + } else if (command == "init") { + return driver->init_roots(fs->mds_map.get_first_data_pool()); + } else { + std::cerr << "Unknown command '" << command << "'" << std::endl; + return -EINVAL; + } +} + +int MetadataDriver::inject_unlinked_inode( + inodeno_t inono, int mode, int64_t data_pool_id) +{ + const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode"); + + // Skip if exists + bool already_exists = false; + int r = root_exists(inono, &already_exists); + if (r) { + return r; + } + if (already_exists && !force_init) { + std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already" + " exists, skipping create. Use --force-init to overwrite" + " the existing object." << std::endl; + return 0; + } + + // Compose + InodeStore inode; + inode.inode.ino = inono; + inode.inode.version = 1; + inode.inode.xattr_version = 1; + inode.inode.mode = 0500 | mode; + // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty + // (we won't actually give the *correct* dirstat here though) + inode.inode.dirstat.nfiles = 1; + + inode.inode.ctime = + inode.inode.mtime = ceph_clock_now(); + inode.inode.nlink = 1; + inode.inode.truncate_size = -1ull; + inode.inode.truncate_seq = 1; + inode.inode.uid = g_conf()->mds_root_ino_uid; + inode.inode.gid = g_conf()->mds_root_ino_gid; + + // Force layout to default: should we let users override this so that + // they don't have to mount the filesystem to correct it? 
+ inode.inode.layout = file_layout_t::get_default(); + inode.inode.layout.pool_id = data_pool_id; + inode.inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + // Assume that we will get our stats wrong, and that we may + // be ignoring dirfrags that exist + inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE); + + if (inono == MDS_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) { + sr_t srnode; + srnode.seq = 1; + encode(srnode, inode.snap_blob); + } + + // Serialize + bufferlist inode_bl; + encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl); + inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + // Write + r = metadata_io.write_full(oid.name, inode_bl); + if (r != 0) { + derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl; + return r; + } + + return r; +} + +int MetadataDriver::root_exists(inodeno_t ino, bool *result) +{ + object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); + uint64_t size; + time_t mtime; + int r = metadata_io.stat(oid.name, &size, &mtime); + if (r == -ENOENT) { + *result = false; + return 0; + } else if (r < 0) { + return r; + } + + *result = true; + return 0; +} + +int MetadataDriver::init_roots(int64_t data_pool_id) +{ + int r = 0; + r = inject_unlinked_inode(MDS_INO_ROOT, S_IFDIR|0755, data_pool_id); + if (r != 0) { + return r; + } + r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id); + if (r != 0) { + return r; + } + bool created = false; + r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created); + if (r != 0) { + return r; + } + + return 0; +} + +int MetadataDriver::check_roots(bool *result) +{ + int r; + r = root_exists(MDS_INO_ROOT, result); + if (r != 0) { + return r; + } + if (!*result) { + return 0; + } + + r = root_exists(MDS_INO_MDSDIR(0), result); + if (r != 0) { + return r; + } + if (!*result) { + return 0; + } + + return 0; +} + +/** + * Stages: + * + * SERIAL init + * 0. 
Create root inodes if don't exist + * PARALLEL scan_extents + * 1. Size and mtime recovery: scan ALL objects, and update 0th + * objects with max size and max mtime seen. + * PARALLEL scan_inodes + * 2. Inode recovery: scan ONLY 0th objects, and inject metadata + * into dirfrag OMAPs, creating blank dirfrags as needed. No stats + * or rstats at this stage. Inodes without backtraces go into + * lost+found + * TODO: SERIAL "recover stats" + * 3. Dirfrag statistics: depth first traverse into metadata tree, + * rebuilding dir sizes. + * TODO PARALLEL "clean up" + * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged + * anything onto them) and remove any of the xattrs that we + * used for accumulating. + */ + + +int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id) +{ + if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) { + return -EINVAL; + } + + std::string err; + std::string inode_str = oid.substr(0, oid.find(".")); + *inode_no = strict_strtoll(inode_str.c_str(), 16, &err); + if (!err.empty()) { + return -EINVAL; + } + + std::string pos_string = oid.substr(oid.find(".") + 1); + *obj_id = strict_strtoll(pos_string.c_str(), 16, &err); + if (!err.empty()) { + return -EINVAL; + } + + return 0; +} + + +int DataScan::scan_extents() +{ + return forall_objects(data_io, false, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + // Read size + uint64_t size; + time_t mtime; + int r = data_io.stat(oid, &size, &mtime); + dout(10) << "handling object " << obj_name_ino + << "." 
<< obj_name_offset << dendl; + if (r != 0) { + dout(4) << "Cannot stat '" << oid << "': skipping" << dendl; + return r; + } + + // I need to keep track of + // * The highest object ID seen + // * The size of the highest object ID seen + // * The largest object seen + // + // Given those things, I can later infer the object chunking + // size, the offset of the last object (chunk size * highest ID seen) + // and the actual size (offset of last object + size of highest ID seen) + // + // This logic doesn't take account of striping. + r = ClsCephFSClient::accumulate_inode_metadata( + data_io, + obj_name_ino, + obj_name_offset, + size, + mtime); + if (r < 0) { + derr << "Failed to accumulate metadata data from '" + << oid << "': " << cpp_strerror(r) << dendl; + return r; + } + + return r; + }); +} + +int DataScan::probe_filter(librados::IoCtx &ioctx) +{ + bufferlist filter_bl; + ClsCephFSClient::build_tag_filter("test", &filter_bl); + librados::ObjectCursor range_i; + librados::ObjectCursor range_end; + + std::vector<librados::ObjectItem> tmp_result; + librados::ObjectCursor tmp_next; + int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(), + 1, filter_bl, &tmp_result, &tmp_next); + + return r >= 0; +} + +int DataScan::forall_objects( + librados::IoCtx &ioctx, + bool untagged_only, + std::function<int(std::string, uint64_t, uint64_t)> handler + ) +{ + librados::ObjectCursor range_i; + librados::ObjectCursor range_end; + ioctx.object_list_slice( + ioctx.object_list_begin(), + ioctx.object_list_end(), + n, + m, + &range_i, + &range_end); + + + bufferlist filter_bl; + + bool legacy_filtering = false; + if (untagged_only) { + // probe to deal with older OSDs that don't support + // the cephfs pgls filtering mode + legacy_filtering = !probe_filter(ioctx); + if (!legacy_filtering) { + ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl); + } + } + + int r = 0; + while(range_i < range_end) { + std::vector<librados::ObjectItem> result; + int r = 
ioctx.object_list(range_i, range_end, 1, + filter_bl, &result, &range_i); + if (r < 0) { + derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto &i : result) { + const std::string &oid = i.oid; + uint64_t obj_name_ino = 0; + uint64_t obj_name_offset = 0; + r = parse_oid(oid, &obj_name_ino, &obj_name_offset); + if (r != 0) { + dout(4) << "Bad object name '" << oid << "', skipping" << dendl; + continue; + } + + if (untagged_only && legacy_filtering) { + dout(20) << "Applying filter to " << oid << dendl; + + // We are only interested in 0th objects during this phase: we touched + // the other objects during scan_extents + if (obj_name_offset != 0) { + dout(20) << "Non-zeroth object" << dendl; + continue; + } + + bufferlist scrub_tag_bl; + int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl); + if (r >= 0) { + std::string read_tag; + auto q = scrub_tag_bl.cbegin(); + try { + decode(read_tag, q); + if (read_tag == filter_tag) { + dout(20) << "skipping " << oid << " because it has the filter_tag" + << dendl; + continue; + } + } catch (const buffer::error &err) { + } + dout(20) << "read non-matching tag '" << read_tag << "'" << dendl; + } else { + dout(20) << "no tag read (" << r << ")" << dendl; + } + + } else if (untagged_only) { + ceph_assert(obj_name_offset == 0); + dout(20) << "OSD matched oid " << oid << dendl; + } + + int this_oid_r = handler(oid, obj_name_ino, obj_name_offset); + if (r == 0 && this_oid_r < 0) { + r = this_oid_r; + } + } + } + + return r; +} + +int DataScan::scan_inodes() +{ + bool roots_present; + int r = driver->check_roots(&roots_present); + if (r != 0) { + derr << "Unexpected error checking roots: '" + << cpp_strerror(r) << "'" << dendl; + return r; + } + + if (!roots_present) { + std::cerr << "Some or all system inodes are absent. 
Run 'init' from " + "one node before running 'scan_inodes'" << std::endl; + return -EIO; + } + + return forall_objects(data_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + + dout(10) << "handling object " + << std::hex << obj_name_ino << "." << obj_name_offset << std::dec + << dendl; + + AccumulateResult accum_res; + inode_backtrace_t backtrace; + file_layout_t loaded_layout = file_layout_t::get_default(); + r = ClsCephFSClient::fetch_inode_accumulate_result( + data_io, oid, &backtrace, &loaded_layout, &accum_res); + + if (r == -EINVAL) { + dout(4) << "Accumulated metadata missing from '" + << oid << ", did you run scan_extents?" << dendl; + return r; + } else if (r < 0) { + dout(4) << "Unexpected error loading accumulated metadata from '" + << oid << "': " << cpp_strerror(r) << dendl; + // FIXME: this creates situation where if a client has a corrupt + // backtrace/layout, we will fail to inject it. We should (optionally) + // proceed if the backtrace/layout is corrupt but we have valid + // accumulated metadata. + return r; + } + + const time_t file_mtime = accum_res.max_mtime; + uint64_t file_size = 0; + bool have_backtrace = !(backtrace.ancestors.empty()); + + // This is the layout we will use for injection, populated either + // from loaded_layout or from best guesses + file_layout_t guessed_layout; + guessed_layout.pool_id = data_pool_id; + + // Calculate file_size, guess the layout + if (accum_res.ceiling_obj_index > 0) { + uint32_t chunk_size = file_layout_t::get_default().object_size; + // When there are multiple objects, the largest object probably + // indicates the chunk size. But not necessarily, because files + // can be sparse. Only make this assumption if size seen + // is a power of two, as chunk sizes typically are. 
+ if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) { + chunk_size = accum_res.max_obj_size; + } + + if (loaded_layout.pool_id == -1) { + // If no stashed layout was found, guess it + guessed_layout.object_size = chunk_size; + guessed_layout.stripe_unit = chunk_size; + guessed_layout.stripe_count = 1; + } else if (!loaded_layout.is_valid() || + loaded_layout.object_size < accum_res.max_obj_size) { + // If the max size seen exceeds what the stashed layout claims, then + // disbelieve it. Guess instead. Same for invalid layouts on disk. + dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino + << std::dec << ", ignoring in favour of best guess" << dendl; + guessed_layout.object_size = chunk_size; + guessed_layout.stripe_unit = chunk_size; + guessed_layout.stripe_count = 1; + } else { + // We have a stashed layout that we can't disprove, so apply it + guessed_layout = loaded_layout; + dout(20) << "loaded layout from xattr:" + << " os: " << guessed_layout.object_size + << " sc: " << guessed_layout.stripe_count + << " su: " << guessed_layout.stripe_unit + << dendl; + // User might have transplanted files from a pool with a different + // ID, so whatever the loaded_layout says, we'll force the injected + // layout to point to the pool we really read from + guessed_layout.pool_id = data_pool_id; + } + + if (guessed_layout.stripe_count == 1) { + // Unstriped file: simple chunking + file_size = guessed_layout.object_size * accum_res.ceiling_obj_index + + accum_res.ceiling_obj_size; + } else { + // Striped file: need to examine the last stripe_count objects + // in the file to determine the size. + + // How many complete (i.e. not last stripe) objects? + uint64_t complete_objs = 0; + if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) { + complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count; + } else { + complete_objs = 0; + } + + // How many potentially-short objects (i.e. 
last stripe set) objects? + uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs; + + dout(10) << "calculating striped size from complete objs: " + << complete_objs << ", partial objs: " << partial_objs + << dendl; + + // Maximum amount of data that may be in the incomplete objects + uint64_t incomplete_size = 0; + + // For each short object, calculate the max file size within it + // and accumulate the maximum + for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) { + char buf[60]; + snprintf(buf, sizeof(buf), "%llx.%08llx", + (long long unsigned)obj_name_ino, (long long unsigned)i); + + uint64_t osize(0); + time_t omtime(0); + r = data_io.stat(std::string(buf), &osize, &omtime); + if (r == 0) { + if (osize > 0) { + // Upper bound within this object + uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit + * (guessed_layout.stripe_unit * guessed_layout.stripe_count) + + (i % guessed_layout.stripe_count) + * guessed_layout.stripe_unit + (osize - 1) + % guessed_layout.stripe_unit + 1; + incomplete_size = std::max(incomplete_size, upper_size); + } + } else if (r == -ENOENT) { + // Absent object, treat as size 0 and ignore. + } else { + // Unexpected error, carry r to outer scope for handling. 
+ break; + } + } + if (r != 0 && r != -ENOENT) { + derr << "Unexpected error checking size of ino 0x" << std::hex + << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl; + return r; + } + file_size = complete_objs * guessed_layout.object_size + + incomplete_size; + } + } else { + file_size = accum_res.ceiling_obj_size; + if (loaded_layout.pool_id < 0 + || loaded_layout.object_size < accum_res.max_obj_size) { + // No layout loaded, or inconsistent layout, use default + guessed_layout = file_layout_t::get_default(); + guessed_layout.pool_id = data_pool_id; + } else { + guessed_layout = loaded_layout; + } + } + + // Santity checking backtrace ino against object name + if (have_backtrace && backtrace.ino != obj_name_ino) { + dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino + << " doesn't match object name ino 0x" << obj_name_ino + << std::dec << dendl; + have_backtrace = false; + } + + InodeStore dentry; + build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry); + + // Inject inode to the metadata pool + if (have_backtrace) { + inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin()); + if (MDS_INO_IS_MDSDIR(root_bp.dirino)) { + /* Special case for strays: even if we have a good backtrace, + * don't put it in the stray dir, because while that would technically + * give it linkage it would still be invisible to the user */ + r = driver->inject_lost_and_found(obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } else { + /* Happy case: we will inject a named dentry for this inode */ + r = driver->inject_with_backtrace(backtrace, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " with backtrace: " << cpp_strerror(r) << dendl; 
+ if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + } else { + /* Backtrace-less case: we will inject a lost+found dentry */ + r = driver->inject_lost_and_found( + obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << obj_name_ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + + return r; + }); +} + +int DataScan::cleanup() +{ + // We are looking for only zeroth object + // + return forall_objects(data_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid); + if (r < 0) { + dout(4) << "Error deleting accumulated metadata from '" + << oid << "': " << cpp_strerror(r) << dendl; + } + return r; + }); +} + +bool DataScan::valid_ino(inodeno_t ino) const +{ + return (ino >= inodeno_t((1ull << 40))) + || (MDS_INO_IS_STRAY(ino)) + || (MDS_INO_IS_MDSDIR(ino)) + || ino == MDS_INO_ROOT + || ino == MDS_INO_CEPH; +} + +int DataScan::scan_links() +{ + MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver); + if (!metadata_driver) { + derr << "Unexpected --output-dir option for scan_links" << dendl; + return -EINVAL; + } + + interval_set<uint64_t> used_inos; + map<inodeno_t, int> remote_links; + map<snapid_t, SnapInfo> snaps; + snapid_t last_snap = 1; + snapid_t snaprealm_v2_since = 2; + + struct link_info_t { + inodeno_t dirino; + frag_t frag; + string name; + version_t version; + int nlink; + bool is_dir; + map<snapid_t, SnapInfo> snaps; + link_info_t() : version(0), nlink(0), is_dir(false) {} + link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::mempool_inode& i) : + dirino(di), frag(df), name(n), + version(i.version), nlink(i.nlink), 
is_dir(S_IFDIR & i.mode) {} + dirfrag_t dirfrag() const { + return dirfrag_t(dirino, frag); + } + }; + map<inodeno_t, list<link_info_t> > dup_primaries; + map<inodeno_t, link_info_t> bad_nlink_inos; + map<inodeno_t, link_info_t> injected_inos; + + map<dirfrag_t, set<string> > to_remove; + + enum { + SCAN_INOS = 1, + CHECK_LINK, + }; + + for (int step = SCAN_INOS; step <= CHECK_LINK; step++) { + const librados::NObjectIterator it_end = metadata_io.nobjects_end(); + for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) { + const std::string oid = it->get_oid(); + + uint64_t dir_ino = 0; + uint64_t frag_id = 0; + int r = parse_oid(oid, &dir_ino, &frag_id); + if (r == -EINVAL) { + dout(10) << "Not a dirfrag: '" << oid << "'" << dendl; + continue; + } else { + // parse_oid can only do 0 or -EINVAL + ceph_assert(r == 0); + } + + if (!valid_ino(dir_ino)) { + dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl; + continue; + } + + std::map<std::string, bufferlist> items; + r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items); + if (r < 0) { + derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& p : items) { + auto q = p.second.cbegin(); + string dname; + snapid_t last; + dentry_key_t::decode_helper(p.first, dname, last); + + if (last != CEPH_NOSNAP) { + if (last > last_snap) + last_snap = last; + continue; + } + + try { + snapid_t dnfirst; + decode(dnfirst, q); + if (dnfirst <= CEPH_MAXSNAP) { + if (dnfirst - 1 > last_snap) + last_snap = dnfirst - 1; + } + char dentry_type; + decode(dentry_type, q); + if (dentry_type == 'I') { + InodeStore inode; + inode.decode_bare(q); + inodeno_t ino = inode.inode.ino; + + if (step == SCAN_INOS) { + if (used_inos.contains(ino, 1)) { + dup_primaries[ino].size(); + } else { + used_inos.insert(ino); + } + } else if (step == CHECK_LINK) { + sr_t srnode; + if (inode.snap_blob.length()) { + auto p = inode.snap_blob.cbegin(); + decode(srnode, p); + for 
(auto it = srnode.snaps.begin(); + it != srnode.snaps.end(); ) { + if (it->second.ino != ino || + it->second.snapid != it->first) { + srnode.snaps.erase(it++); + } else { + ++it; + } + } + if (!srnode.past_parents.empty()) { + snapid_t last = srnode.past_parents.rbegin()->first; + if (last + 1 > snaprealm_v2_since) + snaprealm_v2_since = last + 1; + } + } + if (!inode.old_inodes.empty()) { + if (inode.old_inodes.rbegin()->first > last_snap) + last_snap = inode.old_inodes.rbegin()->first; + } + auto q = dup_primaries.find(ino); + if (q != dup_primaries.end()) { + q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode)); + q->second.back().snaps.swap(srnode.snaps); + } else { + int nlink = 0; + auto r = remote_links.find(ino); + if (r != remote_links.end()) + nlink = r->second; + if (!MDS_INO_IS_STRAY(dir_ino)) + nlink++; + if (inode.inode.nlink != nlink) { + derr << "Bad nlink on " << ino << " expected " << nlink + << " has " << inode.inode.nlink << dendl; + bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode); + bad_nlink_inos[ino].nlink = nlink; + } + snaps.insert(make_move_iterator(begin(srnode.snaps)), + make_move_iterator(end(srnode.snaps))); + } + if (dnfirst == CEPH_NOSNAP) + injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode); + } + } else if (dentry_type == 'L') { + inodeno_t ino; + unsigned char d_type; + decode(ino, q); + decode(d_type, q); + + if (step == SCAN_INOS) { + remote_links[ino]++; + } else if (step == CHECK_LINK) { + if (!used_inos.contains(ino, 1)) { + derr << "Bad remote link dentry 0x" << std::hex << dir_ino + << std::dec << "/" << dname + << ", ino " << ino << " not found" << dendl; + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + to_remove[dirfrag_t(dir_ino, frag_id)].insert(key); + } + } + } else { + derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + } catch 
(const buffer::error &err) { + derr << "Error decoding dentry 0x" << std::hex << dir_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + } + } + } + + map<unsigned, uint64_t> max_ino_map; + { + auto prev_max_ino = (uint64_t)1 << 40; + for (auto p = used_inos.begin(); p != used_inos.end(); ++p) { + auto cur_max = p.get_start() + p.get_len() - 1; + if (cur_max < prev_max_ino) + continue; // system inodes + + if ((prev_max_ino >> 40) != (cur_max >> 40)) { + unsigned rank = (prev_max_ino >> 40) - 1; + max_ino_map[rank] = prev_max_ino; + } else if ((p.get_start() >> 40) != (cur_max >> 40)) { + unsigned rank = (p.get_start() >> 40) - 1; + max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1; + } + prev_max_ino = cur_max; + } + unsigned rank = (prev_max_ino >> 40) - 1; + max_ino_map[rank] = prev_max_ino; + } + + used_inos.clear(); + + for (auto& p : dup_primaries) { + link_info_t newest; + for (auto& q : p.second) { + if (q.version > newest.version) { + newest = q; + } else if (q.version == newest.version && + !MDS_INO_IS_STRAY(q.dirino) && + MDS_INO_IS_STRAY(newest.dirino)) { + newest = q; + } + } + + for (auto& q : p.second) { + // in the middle of dir fragmentation? 
+ if (newest.dirino == q.dirino && newest.name == q.name) { + snaps.insert(make_move_iterator(begin(q.snaps)), + make_move_iterator(end(q.snaps))); + continue; + } + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str()); + dn_key.encode(key); + to_remove[q.dirfrag()].insert(key); + derr << "Remove duplicated ino 0x" << p.first << " from " + << q.dirfrag() << "/" << q.name << dendl; + } + + int nlink = 0; + auto q = remote_links.find(p.first); + if (q != remote_links.end()) + nlink = q->second; + if (!MDS_INO_IS_STRAY(newest.dirino)) + nlink++; + + if (nlink != newest.nlink) { + derr << "Bad nlink on " << p.first << " expected " << nlink + << " has " << newest.nlink << dendl; + bad_nlink_inos[p.first] = newest; + bad_nlink_inos[p.first].nlink = nlink; + } + } + dup_primaries.clear(); + remote_links.clear(); + + { + objecter->with_osdmap([&](const OSDMap& o) { + for (auto p : data_pools) { + const pg_pool_t *pi = o.get_pg_pool(p); + if (!pi) + continue; + if (pi->snap_seq > last_snap) + last_snap = pi->snap_seq; + } + }); + + if (!snaps.empty()) { + if (snaps.rbegin()->first > last_snap) + last_snap = snaps.rbegin()->first; + } + } + + for (auto& p : to_remove) { + object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, ""); + + int r = metadata_io.omap_rm_keys(frag_oid.name, p.second); + if (r != 0) { + derr << "Error removing duplicated dentries from " << p.first << dendl; + return r; + } + } + to_remove.clear(); + + for (auto &p : bad_nlink_inos) { + InodeStore inode; + snapid_t first; + int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first); + if (r < 0) { + derr << "Unexpected error reading dentry " + << p.second.dirfrag() << "/" << p.second.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (inode.inode.ino != p.first || inode.inode.version != p.second.version) + continue; + + inode.inode.nlink = p.second.nlink; + r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, 
p.second.frag, inode, first); + if (r < 0) + return r; + } + + for (auto &p : injected_inos) { + InodeStore inode; + snapid_t first; + int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first); + if (r < 0) { + derr << "Unexpected error reading dentry " + << p.second.dirfrag() << "/" << p.second.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (first != CEPH_NOSNAP) + continue; + + first = last_snap + 1; + r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first); + if (r < 0) + return r; + } + + for (auto& p : max_ino_map) { + InoTable inotable(nullptr); + inotable.set_rank(p.first); + bool dirty = false; + int r = metadata_driver->load_table(&inotable); + if (r < 0) { + inotable.reset_state(); + dirty = true; + } + if (inotable.force_consume_to(p.second)) + dirty = true; + if (dirty) { + r = metadata_driver->save_table(&inotable); + if (r < 0) + return r; + } + } + + { + SnapServer snaptable; + snaptable.set_rank(0); + bool dirty = false; + int r = metadata_driver->load_table(&snaptable); + if (r < 0) { + snaptable.reset_state(); + dirty = true; + } + if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps)) + dirty = true; + if (dirty) { + r = metadata_driver->save_table(&snaptable); + if (r < 0) + return r; + } + } + return 0; +} + +int DataScan::scan_frags() +{ + bool roots_present; + int r = driver->check_roots(&roots_present); + if (r != 0) { + derr << "Unexpected error checking roots: '" + << cpp_strerror(r) << "'" << dendl; + return r; + } + + if (!roots_present) { + std::cerr << "Some or all system inodes are absent. 
Run 'init' from " + "one node before running 'scan_inodes'" << std::endl; + return -EIO; + } + + return forall_objects(metadata_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + r = parse_oid(oid, &obj_name_ino, &obj_name_offset); + if (r != 0) { + dout(4) << "Bad object name '" << oid << "', skipping" << dendl; + return r; + } + + if (obj_name_ino < (1ULL << 40)) { + // FIXME: we're skipping stray dirs here: if they're + // orphaned then we should be resetting them some other + // way + dout(10) << "Skipping system ino " << obj_name_ino << dendl; + return 0; + } + + AccumulateResult accum_res; + inode_backtrace_t backtrace; + + // Default to inherit layout (i.e. no explicit layout on dir) which is + // expressed as a zeroed layout struct (see inode_t::has_layout) + file_layout_t loaded_layout; + + int parent_r = 0; + bufferlist parent_bl; + int layout_r = 0; + bufferlist layout_bl; + bufferlist op_bl; + + librados::ObjectReadOperation op; + op.getxattr("parent", &parent_bl, &parent_r); + op.getxattr("layout", &layout_bl, &layout_r); + r = metadata_io.operate(oid, &op, &op_bl); + if (r != 0 && r != -ENODATA) { + derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl; + return r; + } + + if (parent_r != -ENODATA) { + try { + auto q = parent_bl.cbegin(); + backtrace.decode(q); + } catch (buffer::error &e) { + dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl; + if (!force_corrupt) { + return -EINVAL; + } else { + // Treat backtrace as absent: we'll inject into lost+found + backtrace = inode_backtrace_t(); + } + } + } + + if (layout_r != -ENODATA) { + try { + auto q = layout_bl.cbegin(); + decode(loaded_layout, q); + } catch (buffer::error &e) { + dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl; + if (!force_corrupt) { + return -EINVAL; + } + } + } + + bool have_backtrace = !(backtrace.ancestors.empty()); + + // Santity checking backtrace 
ino against object name + if (have_backtrace && backtrace.ino != obj_name_ino) { + dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino + << " doesn't match object name ino 0x" << obj_name_ino + << std::dec << dendl; + have_backtrace = false; + } + + uint64_t fnode_version = 0; + fnode_t fnode; + r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version); + if (r == -EINVAL) { + derr << "Corrupt fnode on " << oid << dendl; + if (force_corrupt) { + fnode.fragstat.mtime = 0; + fnode.fragstat.nfiles = 1; + fnode.fragstat.nsubdirs = 0; + fnode.accounted_fragstat = fnode.fragstat; + } else { + return r; + } + } + + InodeStore dentry; + build_dir_dentry(obj_name_ino, fnode.accounted_fragstat, + loaded_layout, &dentry); + + // Inject inode to the metadata pool + if (have_backtrace) { + inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin()); + if (MDS_INO_IS_MDSDIR(root_bp.dirino)) { + /* Special case for strays: even if we have a good backtrace, + * don't put it in the stray dir, because while that would technically + * give it linkage it would still be invisible to the user */ + r = driver->inject_lost_and_found(obj_name_ino, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } else { + /* Happy case: we will inject a named dentry for this inode */ + r = driver->inject_with_backtrace(backtrace, dentry); + if (r < 0) { + dout(4) << "Error injecting 0x" << std::hex << backtrace.ino + << std::dec << " with backtrace: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + } else { + /* Backtrace-less case: we will inject a lost+found dentry */ + r = driver->inject_lost_and_found( + obj_name_ino, dentry); + if (r < 0) 
{ + dout(4) << "Error injecting 0x" << std::hex << obj_name_ino + << std::dec << " into lost+found: " << cpp_strerror(r) << dendl; + if (r == -EINVAL) { + dout(4) << "Use --force-corrupt to overwrite structures that " + "appear to be corrupt" << dendl; + } + } + } + + return r; + }); +} + +int MetadataTool::read_fnode( + inodeno_t ino, frag_t frag, fnode_t *fnode, + uint64_t *last_version) +{ + ceph_assert(fnode != NULL); + + object_t frag_oid = InodeStore::get_object_name(ino, frag, ""); + bufferlist fnode_bl; + int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl); + *last_version = metadata_io.get_last_version(); + if (r < 0) { + return r; + } + + auto old_fnode_iter = fnode_bl.cbegin(); + try { + (*fnode).decode(old_fnode_iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + return 0; +} + +int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag, + const std::string &dname, InodeStore *inode, snapid_t *dnfirst) +{ + ceph_assert(inode != NULL); + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + + std::set<std::string> keys; + keys.insert(key); + std::map<std::string, bufferlist> vals; + object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, ""); + int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals); + dout(20) << "oid=" << frag_oid.name + << " dname=" << dname + << " frag=" << frag + << ", r=" << r << dendl; + if (r < 0) { + return r; + } + + if (vals.find(key) == vals.end()) { + dout(20) << key << " not found in result" << dendl; + return -ENOENT; + } + + try { + auto q = vals[key].cbegin(); + snapid_t first; + decode(first, q); + char dentry_type; + decode(dentry_type, q); + if (dentry_type == 'I') { + inode->decode_bare(q); + } else { + dout(20) << "dentry type '" << dentry_type << "': cannot" + "read an inode out of that" << dendl; + return -EINVAL; + } + if (dnfirst) + *dnfirst = first; + } catch (const buffer::error &err) { + dout(20) << "encoding 
error in dentry 0x" << std::hex << parent_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + + return 0; +} + +int MetadataDriver::load_table(MDSTable *table) +{ + object_t table_oid = table->get_object_name(); + + bufferlist table_bl; + int r = metadata_io.read(table_oid.name, table_bl, 0, 0); + if (r < 0) { + derr << "unable to read mds table '" << table_oid.name << "': " + << cpp_strerror(r) << dendl; + return r; + } + + try { + version_t table_ver; + auto p = table_bl.cbegin(); + decode(table_ver, p); + table->decode_state(p); + table->force_replay_version(table_ver); + } catch (const buffer::error &err) { + derr << "unable to decode mds table '" << table_oid.name << "': " + << err.what() << dendl; + return -EIO; + } + return 0; +} + +int MetadataDriver::save_table(MDSTable *table) +{ + object_t table_oid = table->get_object_name(); + + bufferlist table_bl; + encode(table->get_version(), table_bl); + table->encode_state(table_bl); + int r = metadata_io.write_full(table_oid.name, table_bl); + if (r != 0) { + derr << "error updating mds table " << table_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +int MetadataDriver::inject_lost_and_found( + inodeno_t ino, const InodeStore &dentry) +{ + // Create lost+found if doesn't exist + bool created = false; + int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created); + if (r < 0) { + return r; + } + InodeStore lf_ino; + r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino); + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + + // To have a directory not specify a layout, give it zeros (see + // inode_t::has_layout) + file_layout_t inherit_layout; + + // Construct LF inode + frag_info_t fragstat; + fragstat.nfiles = 1, + build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino); + + // Inject link to LF inode in the root dir + r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), 
lf_ino); + if (r < 0) { + return r; + } + } else { + if (!(lf_ino.inode.mode & S_IFDIR)) { + derr << "lost+found exists but is not a directory!" << dendl; + // In this case we error out, and the user should do something about + // this problem. + return -EINVAL; + } + } + + r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created); + if (r < 0) { + return r; + } + + InodeStore recovered_ino; + + + const std::string dname = lost_found_dname(ino); + + // Write dentry into lost+found dirfrag + return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry); +} + + +int MetadataDriver::get_frag_of( + inodeno_t dirino, + const std::string &target_dname, + frag_t *result_ft) +{ + object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), ""); + + dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl; + + // Find and load fragtree if existing dirfrag + // ========================================== + bool have_backtrace = false; + bufferlist parent_bl; + int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl); + if (r == -ENODATA) { + dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl; + } else if (r < 0) { + dout(4) << "Unexpected error on '" << root_frag_oid << "': " + << cpp_strerror(r) << dendl; + return r; + } + + // Deserialize backtrace + inode_backtrace_t backtrace; + if (parent_bl.length()) { + try { + auto q = parent_bl.cbegin(); + backtrace.decode(q); + have_backtrace = true; + } catch (buffer::error &e) { + dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl; + } + } + + if (!(have_backtrace && backtrace.ancestors.size())) { + // Can't work out fragtree without a backtrace + dout(4) << "No backtrace on '" << root_frag_oid + << "': cannot determine fragtree" << dendl; + return -ENOENT; + } + + // The parentage of dirino + const inode_backpointer_t &bp = *(backtrace.ancestors.begin()); + + // The inode of dirino's parent + const inodeno_t parent_ino = bp.dirino; 
+ + // The dname of dirino in its parent. + const std::string &parent_dname = bp.dname; + + dout(20) << "got backtrace parent " << parent_ino << "/" + << parent_dname << dendl; + + // The primary dentry for dirino + InodeStore existing_dentry; + + // See if we can find ourselves in dirfrag zero of the parent: this + // is a fast path that avoids needing to go further up the tree + // if the parent isn't fragmented (worst case we would have to + // go all the way to the root) + r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry); + if (r >= 0) { + // Great, fast path: return the fragtree from here + if (existing_dentry.inode.ino != dirino) { + dout(4) << "Unexpected inode in dentry! 0x" << std::hex + << existing_dentry.inode.ino + << " vs expected 0x" << dirino << std::dec << dendl; + return -ENOENT; + } + dout(20) << "fast path, fragtree is " + << existing_dentry.dirfragtree << dendl; + *result_ft = existing_dentry.pick_dirfrag(target_dname); + dout(20) << "frag is " << *result_ft << dendl; + return 0; + } else if (r != -ENOENT) { + // Dentry not present in 0th frag, must read parent's fragtree + frag_t parent_frag; + r = get_frag_of(parent_ino, parent_dname, &parent_frag); + if (r == 0) { + // We have the parent fragtree, so try again to load our dentry + r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry); + if (r >= 0) { + // Got it! + *result_ft = existing_dentry.pick_dirfrag(target_dname); + dout(20) << "resolved via parent, frag is " << *result_ft << dendl; + return 0; + } else { + if (r == -EINVAL || r == -ENOENT) { + return -ENOENT; // dentry missing or corrupt, so frag is missing + } else { + return r; + } + } + } else { + // Couldn't resolve parent fragtree, so can't find ours. + return r; + } + } else if (r == -EINVAL) { + // Unreadable dentry, can't know the fragtree. 
+ return -ENOENT; + } else { + // Unexpected error, raise it + return r; + } +} + + +int MetadataDriver::inject_with_backtrace( + const inode_backtrace_t &backtrace, const InodeStore &dentry) + +{ + + // On dirfrags + // =========== + // In order to insert something into a directory, we first (ideally) + // need to know the fragtree for the directory. Sometimes we can't + // get that, in which case we just go ahead and insert it into + // fragment zero for a good chance of that being the right thing + // anyway (most moderate-sized dirs aren't fragmented!) + + // On ancestry + // =========== + // My immediate ancestry should be correct, so if we can find that + // directory's dirfrag then go inject it there. This works well + // in the case that this inode's dentry was somehow lost and we + // are recreating it, because the rest of the hierarchy + // will probably still exist. + // + // It's more of a "better than nothing" approach when rebuilding + // a whole tree, as backtraces will in general not be up to date + // beyond the first parent, if anything in the trace was ever + // moved after the file was created. + + // On inode numbers + // ================ + // The backtrace tells us inodes for each of the parents. If we are + // creating those parent dirfrags, then there is a risk that somehow + // the inode indicated here was also used for data (not a dirfrag) at + // some stage. That would be a zany situation, and we don't check + // for it here, because to do so would require extra IOs for everything + // we inject, and anyway wouldn't guarantee that the inode number + // wasn't in use in some dentry elsewhere in the metadata tree that + // just happened not to have any data objects. + + // On multiple workers touching the same traces + // ============================================ + // When creating linkage for a directory, *only* create it if we are + // also creating the object. 
That way, we might not manage to get the + // *right* linkage for a directory, but at least we won't multiply link + // it. We assume that if a root dirfrag exists for a directory, then + // it is linked somewhere (i.e. that the metadata pool is not already + // inconsistent). + // + // Making sure *that* is true is someone else's job! Probably someone + // who is not going to run in parallel, so that they can self-consistently + // look at versions and move things around as they go. + // Note this isn't 100% safe: if we die immediately after creating dirfrag + // object, next run will fail to create linkage for the dirfrag object + // and leave it orphaned. + + inodeno_t ino = backtrace.ino; + dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl; + for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin(); + i != backtrace.ancestors.end(); ++i) { + const inode_backpointer_t &backptr = *i; + dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec + << "/" << backptr.dname << dendl; + + // Examine root dirfrag for parent + const inodeno_t parent_ino = backptr.dirino; + const std::string dname = backptr.dname; + + frag_t fragment; + int r = get_frag_of(parent_ino, dname, &fragment); + if (r == -ENOENT) { + // Don't know fragment, fall back to assuming root + dout(20) << "don't know fragment for 0x" << std::hex << + parent_ino << std::dec << "/" << dname << ", will insert to root" + << dendl; + } + + // Find or create dirfrag + // ====================== + bool created_dirfrag; + r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag); + if (r < 0) { + return r; + } + + // Check if dentry already exists + // ============================== + InodeStore existing_dentry; + r = read_dentry(parent_ino, fragment, dname, &existing_dentry); + bool write_dentry = false; + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + // Missing or corrupt dentry + write_dentry = true; 
+ } else if (r < 0) { + derr << "Unexpected error reading dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << ": " << cpp_strerror(r) << dendl; + break; + } else { + // Dentry already present, does it link to me? + if (existing_dentry.inode.ino == ino) { + dout(20) << "Dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << " already exists and points to me" << dendl; + } else { + derr << "Dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << " already exists but points to 0x" + << std::hex << existing_dentry.inode.ino << std::dec << dendl; + // Fall back to lost+found! + return inject_lost_and_found(backtrace.ino, dentry); + } + } + + // Inject linkage + // ============== + + if (write_dentry) { + if (i == backtrace.ancestors.begin()) { + // This is the linkage for the file of interest + dout(10) << "Linking inode 0x" << std::hex << ino + << " at 0x" << parent_ino << "/" << dname << std::dec + << " with size=" << dentry.inode.size << " bytes" << dendl; + + r = inject_linkage(parent_ino, dname, fragment, dentry); + } else { + // This is the linkage for an ancestor directory + InodeStore ancestor_dentry; + ancestor_dentry.inode.mode = 0755 | S_IFDIR; + + // Set nfiles to something non-zero, to fool any other code + // that tries to ignore 'empty' directories. This won't be + // accurate, but it should avoid functional issues. 
+ + ancestor_dentry.inode.dirstat.nfiles = 1; + ancestor_dentry.inode.dir_layout.dl_dir_hash = + g_conf()->mds_default_dir_hash; + + ancestor_dentry.inode.nlink = 1; + ancestor_dentry.inode.ino = ino; + ancestor_dentry.inode.uid = g_conf()->mds_root_ino_uid; + ancestor_dentry.inode.gid = g_conf()->mds_root_ino_gid; + ancestor_dentry.inode.version = 1; + ancestor_dentry.inode.backtrace_version = 1; + r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry); + } + + if (r < 0) { + return r; + } + } + + if (!created_dirfrag) { + // If the parent dirfrag already existed, then stop traversing the + // backtrace: assume that the other ancestors already exist too. This + // is an assumption rather than a truth, but it's a convenient way + // to avoid the risk of creating multiply-linked directories while + // injecting data. If there are in fact missing ancestors, this + // should be fixed up using a separate tool scanning the metadata + // pool. + break; + } else { + // Proceed up the backtrace, creating parents + ino = parent_ino; + } + } + + return 0; +} + +int MetadataDriver::find_or_create_dirfrag( + inodeno_t ino, + frag_t fragment, + bool *created) +{ + ceph_assert(created != NULL); + + fnode_t existing_fnode; + *created = false; + + uint64_t read_version = 0; + int r = read_fnode(ino, fragment, &existing_fnode, &read_version); + dout(10) << "read_version = " << read_version << dendl; + + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + + // Missing or corrupt fnode, create afresh + bufferlist fnode_bl; + fnode_t blank_fnode; + blank_fnode.version = 1; + // mark it as non-empty + blank_fnode.fragstat.nfiles = 1; + blank_fnode.accounted_fragstat = blank_fnode.fragstat; + blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS); + blank_fnode.encode(fnode_bl); + + + librados::ObjectWriteOperation op; + + if (read_version) { + ceph_assert(r == -EINVAL); + // Case A: We must assert that the version isn't changed 
since we saw the object + // was unreadable, to avoid the possibility of two data-scan processes + // both creating the frag. + op.assert_version(read_version); + } else { + ceph_assert(r == -ENOENT); + // Case B: The object didn't exist in read_fnode, so while creating it we must + // use an exclusive create to correctly populate *creating with + // whether we created it ourselves or someone beat us to it. + op.create(true); + } + + object_t frag_oid = InodeStore::get_object_name(ino, fragment, ""); + op.omap_set_header(fnode_bl); + r = metadata_io.operate(frag_oid.name, &op); + if (r == -EOVERFLOW || r == -EEXIST) { + // Someone else wrote it (see case A above) + dout(10) << "Dirfrag creation race: 0x" << std::hex + << ino << " " << fragment << std::dec << dendl; + *created = false; + return 0; + } else if (r < 0) { + // We were unable to create or write it, error out + derr << "Failed to create dirfrag 0x" << std::hex + << ino << std::dec << ": " << cpp_strerror(r) << dendl; + return r; + } else { + // Success: the dirfrag object now exists with a value header + dout(10) << "Created dirfrag: 0x" << std::hex + << ino << std::dec << dendl; + *created = true; + } + } else if (r < 0) { + derr << "Unexpected error reading dirfrag 0x" << std::hex + << ino << std::dec << " : " << cpp_strerror(r) << dendl; + return r; + } else { + dout(20) << "Dirfrag already exists: 0x" << std::hex + << ino << " " << fragment << std::dec << dendl; + } + + return 0; +} + +int MetadataDriver::inject_linkage( + inodeno_t dir_ino, const std::string &dname, + const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst) +{ + object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, ""); + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + + bufferlist dentry_bl; + encode(dnfirst, dentry_bl); + encode('I', dentry_bl); + inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + // Write out + std::map<std::string, 
bufferlist> vals; + vals[key] = dentry_bl; + int r = metadata_io.omap_set(frag_oid.name, vals); + if (r != 0) { + derr << "Error writing dentry 0x" << std::hex + << dir_ino << std::dec << "/" + << dname << ": " << cpp_strerror(r) << dendl; + return r; + } else { + dout(20) << "Injected dentry 0x" << std::hex + << dir_ino << "/" << dname << " pointing to 0x" + << inode.inode.ino << std::dec << dendl; + return 0; + } +} + + +int MetadataDriver::init( + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) +{ + if (metadata_pool_name.empty()) { + auto fs = fsmap->get_filesystem(fscid); + ceph_assert(fs != nullptr); + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + derr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" << dendl; + return r; + } + dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl; + } else { + dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl; + } + return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); +} + +int LocalFileDriver::init( + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) +{ + return 0; +} + +int LocalFileDriver::inject_data( + const std::string &file_path, + uint64_t size, + uint32_t chunk_size, + inodeno_t ino) +{ + // Scrape the file contents out of the data pool and into the + // local filesystem + std::fstream f; + f.open(file_path.c_str(), std::fstream::out | std::fstream::binary); + + for (uint64_t offset = 0; offset < size; offset += chunk_size) { + bufferlist bl; + + char buf[32]; + snprintf(buf, sizeof(buf), + "%llx.%08llx", + (unsigned long long)ino, + (unsigned long long)(offset / chunk_size)); + std::string oid(buf); + + int r = data_io.read(oid, bl, 
chunk_size, 0); + + if (r <= 0 && r != -ENOENT) { + derr << "error reading data object '" << oid << "': " + << cpp_strerror(r) << dendl; + f.close(); + return r; + } else if (r >=0) { + + f.seekp(offset); + bl.write_stream(f); + } + } + f.close(); + + return 0; +} + + +int LocalFileDriver::inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) +{ + std::string path_builder = path; + + // Iterate through backtrace creating directory parents + std::vector<inode_backpointer_t>::const_reverse_iterator i; + for (i = bt.ancestors.rbegin(); + i != bt.ancestors.rend(); ++i) { + + const inode_backpointer_t &backptr = *i; + path_builder += "/"; + path_builder += backptr.dname; + + // Last entry is the filename itself + bool is_file = (i + 1 == bt.ancestors.rend()); + if (is_file) { + // FIXME: inject_data won't cope with interesting (i.e. striped) + // layouts (need a librados-compatible Filer to read these) + inject_data(path_builder, dentry.inode.size, + dentry.inode.layout.object_size, bt.ino); + } else { + int r = mkdir(path_builder.c_str(), 0755); + if (r != 0 && r != -EPERM) { + derr << "error creating directory: '" << path_builder << "': " + << cpp_strerror(r) << dendl; + return r; + } + } + } + + return 0; +} + +int LocalFileDriver::inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) +{ + std::string lf_path = path + "/lost+found"; + int r = mkdir(lf_path.c_str(), 0755); + if (r != 0 && r != -EPERM) { + derr << "error creating directory: '" << lf_path << "': " + << cpp_strerror(r) << dendl; + return r; + } + + std::string file_path = lf_path + "/" + lost_found_dname(ino); + return inject_data(file_path, dentry.inode.size, + dentry.inode.layout.object_size, ino); +} + +int LocalFileDriver::init_roots(int64_t data_pool_id) +{ + // Ensure that the path exists and is a directory + bool exists; + int r = check_roots(&exists); + if (r != 0) { + return r; + } + + if (exists) { + return 0; + } else { + return ::mkdir(path.c_str(), 
0755); + } +} + +int LocalFileDriver::check_roots(bool *result) +{ + // Check if the path exists and is a directory + DIR *d = ::opendir(path.c_str()); + if (d == NULL) { + *result = false; + } else { + int r = closedir(d); + if (r != 0) { + // Weird, but maybe possible with e.g. stale FD on NFS mount? + *result = false; + } else { + *result = true; + } + } + + return 0; +} + +void MetadataTool::build_file_dentry( + inodeno_t ino, uint64_t file_size, time_t file_mtime, + const file_layout_t &layout, InodeStore *out) +{ + ceph_assert(out != NULL); + + out->inode.mode = 0500 | S_IFREG; + out->inode.size = file_size; + out->inode.max_size_ever = file_size; + out->inode.mtime.tv.tv_sec = file_mtime; + out->inode.atime.tv.tv_sec = file_mtime; + out->inode.ctime.tv.tv_sec = file_mtime; + + out->inode.layout = layout; + + out->inode.truncate_seq = 1; + out->inode.truncate_size = -1ull; + + out->inode.inline_data.version = CEPH_INLINE_NONE; + + out->inode.nlink = 1; + out->inode.ino = ino; + out->inode.version = 1; + out->inode.backtrace_version = 1; + out->inode.uid = g_conf()->mds_root_ino_uid; + out->inode.gid = g_conf()->mds_root_ino_gid; +} + +void MetadataTool::build_dir_dentry( + inodeno_t ino, const frag_info_t &fragstat, + const file_layout_t &layout, InodeStore *out) +{ + ceph_assert(out != NULL); + + out->inode.mode = 0755 | S_IFDIR; + out->inode.dirstat = fragstat; + out->inode.mtime.tv.tv_sec = fragstat.mtime; + out->inode.atime.tv.tv_sec = fragstat.mtime; + out->inode.ctime.tv.tv_sec = fragstat.mtime; + + out->inode.layout = layout; + out->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + out->inode.truncate_seq = 1; + out->inode.truncate_size = -1ull; + + out->inode.inline_data.version = CEPH_INLINE_NONE; + + out->inode.nlink = 1; + out->inode.ino = ino; + out->inode.version = 1; + out->inode.backtrace_version = 1; + out->inode.uid = g_conf()->mds_root_ino_uid; + out->inode.gid = g_conf()->mds_root_ino_gid; +} + diff --git 
a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h new file mode 100644 index 00000000..5c87fe2b --- /dev/null +++ b/src/tools/cephfs/DataScan.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "MDSUtility.h" +#include "include/rados/librados.hpp" + +class InodeStore; +class MDSTable; + +class RecoveryDriver { + protected: + // If true, overwrite structures that generate decoding errors. + bool force_corrupt; + + // If true, overwrite root objects during init_roots even if they + // exist + bool force_init; + + public: + virtual int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) = 0; + + void set_force_corrupt(const bool val) + { + force_corrupt = val; + } + + void set_force_init(const bool val) + { + force_init = val; + } + + + /** + * Inject an inode + dentry parents into the metadata pool, + * based on a backtrace recovered from the data pool + */ + virtual int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) = 0; + + /** + * Inject an inode + dentry into the lost+found directory, + * when all we know about a file is its inode. + */ + virtual int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) = 0; + + /** + * Create any missing roots (i.e. mydir, strays, root inode) + */ + virtual int init_roots( + int64_t data_pool_id) = 0; + + /** + * Pre-injection check that all the roots are present in + * the metadata pool. 
Used to avoid parallel workers interfering + * with one another, by cueing the user to go run 'init' on a + * single node before running a parallel scan. + * + * @param result: set to true if roots are present, else set to false + * @returns 0 on no unexpected errors, else error code. Missing objects + * are not considered an unexpected error: check *result for + * this case. + */ + virtual int check_roots(bool *result) = 0; + + /** + * Helper to compose dnames for links to lost+found + * inodes. + */ + std::string lost_found_dname(inodeno_t ino) + { + char s[20]; + snprintf(s, sizeof(s), "%llx", (unsigned long long)ino); + return std::string(s); + } + + RecoveryDriver() + : force_corrupt(false), + force_init(false) + {} + + virtual ~RecoveryDriver() {} +}; + +class LocalFileDriver : public RecoveryDriver +{ + protected: + const std::string path; + librados::IoCtx &data_io; + + int inject_data( + const std::string &file_path, + uint64_t size, + uint32_t chunk_size, + inodeno_t ino); + public: + + LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_) + : RecoveryDriver(), path(path_), data_io(data_io_) + {} + + // Implement RecoveryDriver interface + int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) override; + + int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) override; + + int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) override; + + int init_roots(int64_t data_pool_id) override; + + int check_roots(bool *result) override; +}; + +/** + * A class that knows how to work with objects in a CephFS + * metadata pool. 
+ */ +class MetadataTool +{ + protected: + + librados::IoCtx metadata_io; + + /** + * Construct a synthetic InodeStore for a normal file + */ + void build_file_dentry( + inodeno_t ino, uint64_t file_size, time_t file_mtime, + const file_layout_t &layout, + InodeStore *out); + + /** + * Construct a synthetic InodeStore for a directory + */ + void build_dir_dentry( + inodeno_t ino, + const frag_info_t &fragstat, + const file_layout_t &layout, + InodeStore *out); + + /** + * Try and read an fnode from a dirfrag + */ + int read_fnode(inodeno_t ino, frag_t frag, + fnode_t *fnode, uint64_t *read_version); + + /** + * Try and read a dentry from a dirfrag + */ + int read_dentry(inodeno_t parent_ino, frag_t frag, + const std::string &dname, InodeStore *inode, snapid_t *dnfirst=nullptr); +}; + +/** + * A class that knows how to manipulate CephFS metadata pools + */ +class MetadataDriver : public RecoveryDriver, public MetadataTool +{ + protected: + /** + * Create a .inode object, i.e. root or mydir + */ + int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id); + + /** + * Check for existence of .inode objects, before + * trying to go ahead and inject metadata. + */ + int root_exists(inodeno_t ino, bool *result); + int find_or_create_dirfrag( + inodeno_t ino, + frag_t fragment, + bool *created); + + + /** + * Work out which fragment of a directory should contain a named + * dentry, recursing up the trace as necessary to retrieve + * fragtrees. 
+ */ + int get_frag_of( + inodeno_t dirino, + const std::string &dname, + frag_t *result_ft); + + public: + + // Implement RecoveryDriver interface + int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) override; + + int inject_linkage( + inodeno_t dir_ino, const std::string &dname, + const frag_t fragment, const InodeStore &inode, snapid_t dnfirst=CEPH_NOSNAP); + + int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) override; + + int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) override; + + int init_roots(int64_t data_pool_id) override; + + int check_roots(bool *result) override; + + int load_table(MDSTable *table); + int save_table(MDSTable *table); +}; + +class DataScan : public MDSUtility, public MetadataTool +{ + protected: + RecoveryDriver *driver; + fs_cluster_id_t fscid; + + string metadata_pool_name; + std::vector<int64_t> data_pools; + + // IoCtx for data pool (where we scrape file backtraces from) + librados::IoCtx data_io; + // Remember the data pool ID for use in layouts + int64_t data_pool_id; + + uint32_t n; + uint32_t m; + + /** + * Scan data pool for backtraces, and inject inodes to metadata pool + */ + int scan_inodes(); + + /** + * Scan data pool for file sizes and mtimes + */ + int scan_extents(); + + /** + * Scan metadata pool for 0th dirfrags to link orphaned + * directory inodes. 
+ */ + int scan_frags(); + + /** + * Cleanup xattrs from data pool + */ + int cleanup(); + + /** + * Check if an inode number is in the permitted ranges + */ + bool valid_ino(inodeno_t ino) const; + + + int scan_links(); + + // Accept pools which are not in the FSMap + bool force_pool; + // Respond to decode errors by overwriting + bool force_corrupt; + // Overwrite root objects even if they exist + bool force_init; + // Only scan inodes without this scrub tag + string filter_tag; + + /** + * @param r set to error on valid key with invalid value + * @return true if argument consumed, else false + */ + bool parse_kwarg( + const std::vector<const char*> &args, + std::vector<const char *>::const_iterator &i, + int *r); + + /** + * @return true if argument consumed, else false + */ + bool parse_arg( + const std::vector<const char*> &arg, + std::vector<const char *>::const_iterator &i); + + int probe_filter(librados::IoCtx &ioctx); + + /** + * Apply a function to all objects in an ioctx's pool, optionally + * restricted to only those objects with a 00000000 offset and + * no tag matching DataScan::scrub_tag. 
+ */ + int forall_objects( + librados::IoCtx &ioctx, + bool untagged_only, + std::function<int(std::string, uint64_t, uint64_t)> handler); + + public: + static void usage(); + int main(const std::vector<const char *> &args); + + DataScan() + : driver(NULL), fscid(FS_CLUSTER_ID_NONE), + data_pool_id(-1), n(0), m(1), + force_pool(false), force_corrupt(false), + force_init(false) + { + } + + ~DataScan() override + { + delete driver; + } +}; + diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc new file mode 100644 index 00000000..6b758497 --- /dev/null +++ b/src/tools/cephfs/Dumper.cc @@ -0,0 +1,431 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef _BACKWARD_BACKWARD_WARNING_H +#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* +#endif + +#include "include/compat.h" +#include "include/fs_types.h" +#include "common/entity_name.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/JournalPointer.h" +#include "osdc/Journaler.h" +#include "mon/MonClient.h" + +#include "Dumper.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +#define HEADER_LEN 4096 + +int Dumper::init(mds_role_t role_, const std::string &type) +{ + role = role_; + + int r = MDSUtility::init(); + if (r < 0) { + return r; + } + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + if (type == "mdlog") { + JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); + int jp_load_result = jp.load(objecter); + if (jp_load_result != 0) { + std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl; + return jp_load_result; + } else { + ino = jp.front; + } + } else if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + role.rank; + } else { + ceph_abort(); // should not get here + } + return 0; +} + + +int Dumper::recover_journal(Journaler *journaler) +{ + C_SaferCond cond; + lock.Lock(); + journaler->recover(&cond); + lock.Unlock(); + const int r = cond.wait(); + + if (r < 0) { // Error + derr << "error on recovery: " << cpp_strerror(r) << dendl; + return r; + } else { + dout(10) << "completed journal recovery" << dendl; + return 0; + } +} + + +int Dumper::dump(const char *dump_file) +{ + int r = 0; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, + &finisher); + r = recover_journal(&journaler); + if (r) { + return r; + } + uint64_t start = journaler.get_read_pos(); + uint64_t end = journaler.get_write_pos(); 
+ uint64_t len = end-start; + + Filer filer(objecter, &finisher); + + cout << "journal is " << start << "~" << len << std::endl; + + int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC, 0644); + if (fd >= 0) { + // include an informative header + uuid_d fsid = monc->get_fsid(); + char fsid_str[40]; + fsid.print(fsid_str); + char buf[HEADER_LEN]; + memset(buf, 0, sizeof(buf)); + snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\ + length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\ + trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\ + object_size %lu (0x%lx)\n fsid %s\n%c", + role.rank, + (unsigned long long)start, (unsigned long long)start, + (unsigned long long)len, (unsigned long long)len, + (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos, + (unsigned long long)journaler.last_committed.stream_format, + (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos, + (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit, + (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count, + (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size, + fsid_str, + 4); + r = safe_write(fd, buf, sizeof(buf)); + if (r) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl; + ::close(fd); + return r; + } + + // write the data + off64_t seeked = ::lseek64(fd, start, SEEK_SET); + if (seeked == (off64_t)-1) { + r = errno; + derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl; + ::close(fd); + return r; + } + + + // Read and write 32MB chunks. 
Slower than it could be because we're not + // streaming, but that's okay because this is just a debug/disaster tool. + const uint32_t chunk_size = 32 * 1024 * 1024; + + for (uint64_t pos = start; pos < start + len; pos += chunk_size) { + bufferlist bl; + dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl; + + const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos); + + C_SaferCond cond; + lock.Lock(); + filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP, + pos, read_size, &bl, 0, &cond); + lock.Unlock(); + r = cond.wait(); + if (r < 0) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") reading " + "journal at offset 0x" << std::hex << pos << std::dec << dendl; + ::close(fd); + return r; + } + dout(10) << "Got 0x" << std::hex << bl.length() << std::dec + << " bytes" << dendl; + + r = bl.write_fd(fd); + if (r) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl; + ::close(fd); + return r; + } + } + + r = ::close(fd); + if (r) { + r = errno; + derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl; + return r; + } + + cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n" + << "NOTE: this is a _sparse_ file; you can\n" + << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n" + << " to efficiently compress it while preserving sparseness." 
<< std::endl; + return 0; + } else { + int err = errno; + derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl; + return err; + } +} + +int Dumper::undump(const char *dump_file, bool force) +{ + cout << "undump " << dump_file << std::endl; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + int r = 0; + // try get layout info from cluster + Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, + &finisher); + int recovered = recover_journal(&journaler); + if (recovered != 0) { + derr << "recover_journal failed, try to get header from dump file " << dendl; + } + + int fd = ::open(dump_file, O_RDONLY); + if (fd < 0) { + r = errno; + derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Ceph mds0 journal dump + // start offset 232401996 (0xdda2c4c) + // length 1097504 (0x10bf20) + + char buf[HEADER_LEN]; + r = safe_read(fd, buf, sizeof(buf)); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + + long long unsigned start, len, write_pos, format, trimmed_pos; + long unsigned stripe_unit, stripe_count, object_size; + sscanf(strstr(buf, "start offset"), "start offset %llu", &start); + sscanf(strstr(buf, "length"), "length %llu", &len); + sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos); + sscanf(strstr(buf, "format"), "format %llu", &format); + + if (!force) { + // need to check if fsid match onlien cluster fsid + if (strstr(buf, "fsid")) { + uuid_d fsid; + char fsid_str[40]; + sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str); + r = fsid.parse(fsid_str); + if (!r) { + derr << "Invalid fsid" << dendl; + ::close(fd); + return -EINVAL; + } + + if (fsid != monc->get_fsid()) { + derr << "Imported journal fsid does not match online cluster fsid" << dendl; + derr << "Use --force to skip fsid check" << dendl; + ::close(fd); + return -EINVAL; + } + } else { + derr << "Invalid header, no fsid 
embeded" << dendl; + ::close(fd); + return -EINVAL; + } + } + + if (recovered == 0) { + stripe_unit = journaler.last_committed.layout.stripe_unit; + stripe_count = journaler.last_committed.layout.stripe_count; + object_size = journaler.last_committed.layout.object_size; + } else { + // try to get layout from dump file header, if failed set layout to default + if (strstr(buf, "stripe_unit")) { + sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit); + } else { + stripe_unit = file_layout_t::get_default().stripe_unit; + } + if (strstr(buf, "stripe_count")) { + sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count); + } else { + stripe_count = file_layout_t::get_default().stripe_count; + } + if (strstr(buf, "object_size")) { + sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size); + } else { + object_size = file_layout_t::get_default().object_size; + } + } + + if (strstr(buf, "trimmed_pos")) { + sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos); + } else { + // Old format dump, any untrimmed objects before expire_pos will + // be discarded as trash. 
+ trimmed_pos = start - (start % object_size); + } + + if (trimmed_pos > start) { + derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos + << " > expire 0x" << start << std::dec << dendl; + ::close(fd); + return -EINVAL; + } + + if (start > write_pos) { + derr << std::hex << "Invalid header (expire 0x" << start + << " > write 0x" << write_pos << std::dec << dendl; + ::close(fd); + return -EINVAL; + } + + cout << "start " << start << + " len " << len << + " write_pos " << write_pos << + " format " << format << + " trimmed_pos " << trimmed_pos << + " stripe_unit " << stripe_unit << + " stripe_count " << stripe_count << + " object_size " << object_size << std::endl; + + Journaler::Header h; + h.trimmed_pos = trimmed_pos; + h.expire_pos = start; + h.write_pos = write_pos; + h.stream_format = format; + h.magic = CEPH_FS_ONDISK_MAGIC; + + h.layout.stripe_unit = stripe_unit; + h.layout.stripe_count = stripe_count; + h.layout.object_size = object_size; + h.layout.pool_id = fs->mds_map.get_metadata_pool(); + + bufferlist hbl; + encode(h, hbl); + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(fs->mds_map.get_metadata_pool()); + SnapContext snapc; + + cout << "writing header " << oid << std::endl; + C_SaferCond header_cond; + lock.Lock(); + objecter->write_full(oid, oloc, snapc, hbl, + ceph::real_clock::now(), 0, + &header_cond); + lock.Unlock(); + + r = header_cond.wait(); + if (r != 0) { + derr << "Failed to write header: " << cpp_strerror(r) << dendl; + ::close(fd); + return r; + } + + Filer filer(objecter, &finisher); + + /* Erase any objects at the end of the region to which we shall write + * the new log data. This is to avoid leaving trailing junk after + * the newly written data. 
Any junk more than one object ahead + * will be taken care of during normal operation by Journaler's + * prezeroing behaviour */ + { + uint32_t const object_size = h.layout.object_size; + ceph_assert(object_size > 0); + uint64_t last_obj = h.write_pos / object_size; + uint64_t purge_count = 2; + /* When the length is zero, the last_obj should be zeroed + * from the offset determined by the new write_pos instead of being purged. + */ + if (!len) { + purge_count = 1; + ++last_obj; + } + C_SaferCond purge_cond; + cout << "Purging " << purge_count << " objects from " << last_obj << std::endl; + lock.Lock(); + filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count, + ceph::real_clock::now(), 0, &purge_cond); + lock.Unlock(); + purge_cond.wait(); + } + /* When the length is zero, zero the last object + * from the offset determined by the new write_pos. + */ + if (!len) { + uint64_t offset_in_obj = h.write_pos % h.layout.object_size; + uint64_t len = h.layout.object_size - offset_in_obj; + C_SaferCond zero_cond; + cout << "Zeroing " << len << " bytes in the last object." << std::endl; + + lock.Lock(); + filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond); + lock.Unlock(); + zero_cond.wait(); + } + + // Stream from `fd` to `filer` + uint64_t pos = start; + uint64_t left = len; + while (left > 0) { + // Read + bufferlist j; + lseek64(fd, pos, SEEK_SET); + uint64_t l = std::min<uint64_t>(left, 1024*1024); + j.read_fd(fd, l); + + // Write + cout << " writing " << pos << "~" << l << std::endl; + C_SaferCond write_cond; + lock.Lock(); + filer.write(ino, &h.layout, snapc, pos, l, j, + ceph::real_clock::now(), 0, &write_cond); + lock.Unlock(); + + r = write_cond.wait(); + if (r != 0) { + derr << "Failed to write header: " << cpp_strerror(r) << dendl; + ::close(fd); + return r; + } + + // Advance + pos += l; + left -= l; + } + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + cout << "done." 
<< std::endl; + return 0; +} + diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h new file mode 100644 index 00000000..758f3cde --- /dev/null +++ b/src/tools/cephfs/Dumper.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef JOURNAL_DUMPER_H_ +#define JOURNAL_DUMPER_H_ + + +#include "MDSUtility.h" + +class Journaler; + +/** + * This class lets you dump out an mds journal for troubleshooting or whatever. + * + * It was built to work with cmds so some of the design choices are random. + * To use, create a Dumper, call init(), and then call dump() with the name + * of the file to dump to. + */ + +class Dumper : public MDSUtility { +private: + mds_role_t role; + inodeno_t ino; + +public: + Dumper() : ino(-1) + {} + + int init(mds_role_t role_, const std::string &type); + int recover_journal(Journaler *journaler); + int dump(const char *dumpfile); + int undump(const char *dumpfile, bool force); +}; + +#endif /* JOURNAL_DUMPER_H_ */ diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc new file mode 100644 index 00000000..8cb235a8 --- /dev/null +++ b/src/tools/cephfs/EventOutput.cc @@ -0,0 +1,153 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. 
+ */ + + +#include <iostream> +#include <fstream> + +#include "common/errno.h" +#include "mds/mdstypes.h" +#include "mds/events/EUpdate.h" +#include "mds/LogEvent.h" +#include "JournalScanner.h" + +#include "EventOutput.h" + + +int EventOutput::binary() const +{ + // Binary output, files + int r = ::mkdir(path.c_str(), 0755); + if (r != 0) { + r = -errno; + if (r != -EEXIST) { + std::cerr << "Error creating output directory: " << cpp_strerror(r) << std::endl; + return r; + } + } + + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + bufferlist bin; + std::stringstream filename; + if (auto& le = i->second.log_event; le) { + le->encode(bin, CEPH_FEATURES_SUPPORTED_DEFAULT); + filename << "0x" << std::hex << i->first << std::dec << "_" << le->get_type_str() << ".bin"; + } else if (auto& pi = i->second.pi; pi) { + pi->encode(bin); + filename << "0x" << std::hex << i->first << std::dec << "_" << pi->get_type_str() << ".bin"; + } + + std::string const file_path = path + std::string("/") + filename.str(); + std::ofstream bin_file(file_path.c_str(), std::ofstream::out | std::ofstream::binary); + bin.write_stream(bin_file); + bin_file.close(); + if (bin_file.fail()) { + return -EIO; + } + } + std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl; + + return 0; +} + +int EventOutput::json() const +{ + JSONFormatter jf(true); + std::ofstream out_file(path.c_str(), std::ofstream::out); + jf.open_array_section("journal"); + { + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + if (auto& le = i->second.log_event; le) { + jf.open_object_section("log_event"); + le->dump(&jf); + jf.close_section(); // log_event + } else if (auto& pi = i->second.pi; pi) { + jf.open_object_section("purge_action"); + pi->dump(&jf); + jf.close_section(); + } + } + } + jf.close_section(); // journal + jf.flush(out_file); + out_file.close(); + + if (out_file.fail()) 
{ + return -EIO; + } else { + std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl; + return 0; + } +} + +void EventOutput::list() const +{ + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + if (auto& le = i->second.log_event; le) { + std::vector<std::string> ev_paths; + EMetaBlob const *emb = le->get_metablob(); + if (emb) { + emb->get_paths(ev_paths); + } + + std::string detail; + if (le->get_type() == EVENT_UPDATE) { + auto& eu = reinterpret_cast<EUpdate&>(*le); + detail = eu.type; + } + + std::cout << le->get_stamp() << " 0x" + << std::hex << i->first << std::dec << " " + << le->get_type_str() << ": " + << " (" << detail << ")" << std::endl; + for (std::vector<std::string>::iterator i = ev_paths.begin(); i != ev_paths.end(); ++i) { + std::cout << " " << *i << std::endl; + } + } else if (auto& pi = i->second.pi; pi) { + std::cout << pi->stamp << " 0x" + << std::hex << i->first << std::dec << " " + << pi->get_type_str() << std::endl; + } + } +} + +void EventOutput::summary() const +{ + std::map<std::string, int> type_count; + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + std::string type; + if (auto& le = i->second.log_event; le) + type = le->get_type_str(); + else if (auto& pi = i->second.pi; pi) + type = pi->get_type_str(); + if (type_count.count(type) == 0) { + type_count[type] = 0; + } + type_count[type] += 1; + } + + std::cout << "Events by type:" << std::endl; + for (std::map<std::string, int>::iterator i = type_count.begin(); i != type_count.end(); ++i) { + std::cout << " " << i->first << ": " << i->second << std::endl; + } + + std::cout << "Errors: " << scan.errors.size() << std::endl; + if (!scan.errors.empty()) { + for (JournalScanner::ErrorMap::const_iterator i = scan.errors.begin(); + i != scan.errors.end(); ++i) { + std::cout << " 0x" << std::hex << i->first << std::dec + << ": " << i->second.r << " " + << 
i->second.description << std::endl; + } + } +} diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h new file mode 100644 index 00000000..65d96840 --- /dev/null +++ b/src/tools/cephfs/EventOutput.h @@ -0,0 +1,42 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#ifndef EVENT_OUTPUT_H +#define EVENT_OUTPUT_H + +#include <string> + +class JournalScanner; + +/** + * Different output formats for the results of a journal scan + */ +class EventOutput +{ + private: + JournalScanner const &scan; + std::string const path; + + public: + EventOutput(JournalScanner const &scan_, std::string const &path_) + : scan(scan_), path(path_) {} + + void summary() const; + void list() const; + int json() const; + int binary() const; +}; + +#endif // EVENT_OUTPUT_H + diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc new file mode 100644 index 00000000..266d7fcc --- /dev/null +++ b/src/tools/cephfs/JournalFilter.cc @@ -0,0 +1,315 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. 
+ */ + + +#include "JournalFilter.h" + +#include "common/ceph_argparse.h" + +#include "mds/events/ESession.h" +#include "mds/events/EUpdate.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + + +const string JournalFilter::range_separator(".."); + +bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const +{ + /* Filtering by journal offset range */ + if (pos < range_start || pos >= range_end) { + return false; + } + + if (purge_action != PurgeItem::NONE) { + if (pi.action != purge_action) + return false; + } + + if (inode) { + if (inode != pi.ino) + return false; + } + return true; +} + +/* + * Return whether a LogEvent is to be included or excluded. + * + * The filter parameters are applied on an AND basis: if any + * condition is not met, the event is excluded. Try to do + * the fastest checks first. + */ +bool JournalFilter::apply(uint64_t pos, LogEvent &le) const +{ + /* Filtering by journal offset range */ + if (pos < range_start || pos >= range_end) { + return false; + } + + /* Filtering by event type */ + if (event_type != 0) { + if (le.get_type() != event_type) { + return false; + } + } + + /* Filtering by client */ + if (client_name.num()) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + if (metablob->get_client_name() != client_name) { + return false; + } + } else if (le.get_type() == EVENT_SESSION) { + ESession *es = reinterpret_cast<ESession*>(&le); + if (es->get_client_inst().name != client_name) { + return false; + } + } else { + return false; + } + } + + /* Filtering by inode */ + if (inode) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::set<inodeno_t> inodes; + metablob->get_inodes(inodes); + bool match_any = false; + for (std::set<inodeno_t>::iterator i = inodes.begin(); i != inodes.end(); ++i) { + if (*i == inode) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } else { + return false; + } + } + + /* Filtering by frag and dentry */ 
+ if (!frag_dentry.empty() || frag.ino) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::map<dirfrag_t, std::set<std::string> > dentries; + metablob->get_dentries(dentries); + + if (frag.ino) { + bool match_any = false; + for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin(); + i != dentries.end(); ++i) { + if (i->first == frag) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } + + if (!frag_dentry.empty()) { + bool match_any = false; + for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin(); + i != dentries.end() && !match_any; ++i) { + std::set<std::string> const &names = i->second; + for (std::set<std::string>::iterator j = names.begin(); + j != names.end() && !match_any; ++j) { + if (*j == frag_dentry) { + match_any = true; + } + } + } + if (!match_any) { + return false; + } + } + + } else { + return false; + } + } + + /* Filtering by file path */ + if (!path_expr.empty()) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::vector<std::string> paths; + metablob->get_paths(paths); + bool match_any = false; + for (std::vector<std::string>::iterator p = paths.begin(); p != paths.end(); ++p) { + if ((*p).find(path_expr) != std::string::npos) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } else { + return false; + } + } + + return true; +} + + +int JournalFilter::parse_args( + std::vector<const char*> &argv, + std::vector<const char*>::iterator &arg) +{ + while(arg != argv.end()) { + std::string arg_str; + if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) { + size_t sep_loc = arg_str.find(JournalFilter::range_separator); + if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) { + derr << "Invalid range '" << arg_str << "'" << dendl; + return -EINVAL; + } + + // We have a lower bound + if (sep_loc > 0) { + std::string range_start_str = 
arg_str.substr(0, sep_loc); + std::string parse_err; + range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl; + return -EINVAL; + } + } + + if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) { + std::string range_end_str = arg_str.substr(sep_loc + range_separator.size()); + std::string parse_err; + range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid upper bound '" << range_end_str << "': " << parse_err << dendl; + return -EINVAL; + } + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl; + return -EINVAL; + } + dout(4) << "Filtering by path '" << arg_str << "'" << dendl; + path_expr = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) { + dout(4) << "Filtering by inode '" << arg_str << "'" << dendl; + std::string parse_err; + inode = strict_strtoll(arg_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl; + return -EINVAL; + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) { + try { + if (!type.compare("mdlog")) { + event_type = LogEvent::str_to_type(arg_str); + } else if (!type.compare("purge_queue")) { + purge_action = PurgeItem::str_to_type(arg_str); + } + } catch (const std::out_of_range&) { + derr << "Invalid event type '" << arg_str << "'" << dendl; + return -EINVAL; + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." 
<< dendl; + return -EINVAL; + } + std::string const frag_sep = "."; + size_t sep_loc = arg_str.find(frag_sep); + std::string inode_str; + std::string frag_str; + if (sep_loc != std::string::npos) { + inode_str = arg_str.substr(0, sep_loc); + frag_str = arg_str.substr(sep_loc + 1); + } else { + inode_str = arg_str; + frag_str = "0"; + } + + std::string parse_err; + inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl; + return -EINVAL; + } + + uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl; + return -EINVAL; + } + + frag = dirfrag_t(frag_ino, frag_t(frag_enc)); + dout(4) << "dirfrag filter: '" << frag << "'" << dendl; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl; + return -EINVAL; + } + frag_dentry = arg_str; + dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl; + return -EINVAL; + } + + std::string parse_err; + int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid client number " << arg_str << dendl; + return -EINVAL; + } + client_name = entity_name_t::CLIENT(client_num); + } else { + // We're done with args the filter understands + break; + } + } + + return 0; +} + +/** + * If the filter params are only range, then return + * true and set start & end. Else return false. + * + * Use this to discover if the user has requested a contiguous range + * rather than any per-event filtering. 
+ */ +bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const +{ + if (!path_expr.empty() + || inode != 0 + || event_type != 0 + || frag.ino != 0 + || client_name.num() != 0 + || (range_start == 0 && range_end == (uint64_t)(-1))) { + return false; + } else { + start = range_start; + end = range_end; + return true; + } +} diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h new file mode 100644 index 00000000..f7a2db61 --- /dev/null +++ b/src/tools/cephfs/JournalFilter.h @@ -0,0 +1,73 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#ifndef JOURNAL_FILTER_H +#define JOURNAL_FILTER_H + +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/PurgeQueue.h" + +/** + * A set of conditions for narrowing down a search through the journal + */ +class JournalFilter +{ + private: + + /* Filtering by journal offset range */ + uint64_t range_start; + uint64_t range_end; + static const std::string range_separator; + + /* Filtering by file (sub) path */ + std::string path_expr; + + /* Filtering by inode */ + inodeno_t inode; + + /* Filtering by type */ + LogEvent::EventType event_type; + + std::string type; + + /* Filtering by PurgeItem::Action */ + PurgeItem::Action purge_action; + + /* Filtering by dirfrag */ + dirfrag_t frag; + std::string frag_dentry; //< optional, filter dentry name within fragment + + /* Filtering by metablob client name */ + entity_name_t client_name; + + public: + JournalFilter(std::string t) : + range_start(0), + range_end(-1), + inode(0), + event_type(0), + type(t), + purge_action(PurgeItem::NONE) {} + + bool 
get_range(uint64_t &start, uint64_t &end) const; + bool apply(uint64_t pos, LogEvent &le) const; + bool apply(uint64_t pos, PurgeItem &pi) const; + int parse_args( + std::vector<const char*> &argv, + std::vector<const char*>::iterator &arg); +}; + +#endif // JOURNAL_FILTER_H + diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc new file mode 100644 index 00000000..ea9d6ddf --- /dev/null +++ b/src/tools/cephfs/JournalScanner.cc @@ -0,0 +1,438 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include "include/rados/librados.hpp" +#include "mds/JournalPointer.h" + +#include "mds/events/ESubtreeMap.h" +#include "mds/PurgeQueue.h" + +#include "JournalScanner.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +/** + * Read journal header, followed by sequential scan through journal space. + * + * Return 0 on success, else error code. Note that success has the special meaning + * that we were able to apply our checks, it does *not* mean that the journal is + * healthy. 
+ */ +int JournalScanner::scan(bool const full) +{ + int r = 0; + + r = set_journal_ino(); + if (r < 0) { + return r; + } + + if (!is_mdlog || pointer_present) { + r = scan_header(); + if (r < 0) { + return r; + } + } + + if (full && header_present) { + r = scan_events(); + if (r < 0) { + return r; + } + } + + return 0; +} + + +int JournalScanner::set_journal_ino() +{ + int r = 0; + if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + rank; + } + else if (type == "mdlog"){ + r = scan_pointer(); + is_mdlog = true; + } + else { + ceph_abort(); // should not get here + } + return r; +} + +int JournalScanner::scan_pointer() +{ + // Issue read + std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0); + bufferlist pointer_bl; + int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0); + if (r == -ENOENT) { + // 'Successfully' discovered the pointer is missing. + derr << "Pointer " << pointer_oid << " is absent" << dendl; + return 0; + } else if (r < 0) { + // Error preventing us interrogating pointer + derr << "Pointer " << pointer_oid << " is unreadable" << dendl; + return r; + } else { + dout(4) << "Pointer " << pointer_oid << " is readable" << dendl; + pointer_present = true; + + JournalPointer jp; + try { + auto q = pointer_bl.cbegin(); + jp.decode(q); + } catch(buffer::error &e) { + derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl; + return 0; + } + + pointer_valid = true; + ino = jp.front; + return 0; + } +} + + +int JournalScanner::scan_header() +{ + int r; + + bufferlist header_bl; + std::string header_name = obj_name(0); + dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl; + r = io.read(header_name, header_bl, INT_MAX, 0); + if (r < 0) { + derr << "Header " << header_name << " is unreadable" << dendl; + return 0; // "Successfully" found an error + } else { + header_present = true; + } + + auto header_bl_i = header_bl.cbegin(); + header = new Journaler::Header(); + try + { 
+ header->decode(header_bl_i); + } + catch (buffer::error &e) + { + derr << "Header is corrupt (" << e.what() << ")" << dendl; + delete header; + header = NULL; + return 0; // "Successfully" found an error + } + + if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) { + derr << "Header is corrupt (bad magic)" << dendl; + return 0; // "Successfully" found an error + } + if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) { + derr << "Header is invalid (inconsistent offsets)" << dendl; + return 0; // "Successfully" found an error + } + header_valid = true; + + return 0; +} + + +int JournalScanner::scan_events() +{ + uint64_t object_size = g_conf()->mds_log_segment_size; + if (object_size == 0) { + // Default layout object size + object_size = file_layout_t::get_default().object_size; + } + + uint64_t read_offset = header->expire_pos; + dout(10) << std::hex << "Header 0x" + << header->trimmed_pos << " 0x" + << header->expire_pos << " 0x" + << header->write_pos << std::dec << dendl; + dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl; + + // TODO also check for extraneous objects before the trimmed pos or after the write pos, + // which would indicate a bogus header. 
+ + bufferlist read_buf; + bool gap = false; + uint64_t gap_start = -1; + for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) { + uint64_t offset_in_obj = 0; + if (obj_offset * object_size < header->expire_pos) { + // Skip up to expire_pos from start of the object + // (happens for the first object we read) + offset_in_obj = header->expire_pos - obj_offset * object_size; + } + + // Read this journal segment + bufferlist this_object; + std::string const oid = obj_name(obj_offset); + int r = io.read(oid, this_object, INT_MAX, offset_in_obj); + + // Handle absent journal segments + if (r < 0) { + if (obj_offset > (header->write_pos / object_size)) { + dout(4) << "Reached end of journal objects" << dendl; + break; + } else { + derr << "Missing object " << oid << dendl; + } + + objects_missing.push_back(obj_offset); + if (!gap) { + gap_start = read_offset; + gap = true; + } + if (read_buf.length() > 0) { + read_offset += read_buf.length(); + read_buf.clear(); + } + read_offset += object_size - offset_in_obj; + continue; + } else { + dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec + << " bytes from " << oid << " gap=" << gap << dendl; + objects_valid.push_back(oid); + this_object.copy(0, this_object.length(), read_buf); + } + + if (gap) { + // No valid data at the current read offset, scan forward until we find something valid looking + // or have to drop out to load another object. 
+ dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset + << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl; + + do { + auto p = read_buf.cbegin(); + uint64_t candidate_sentinel; + decode(candidate_sentinel, p); + + dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl; + + if (candidate_sentinel == JournalStream::sentinel) { + dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl; + ranges_invalid.push_back(Range(gap_start, read_offset)); + gap = false; + break; + } else { + // No sentinel, discard this byte + read_buf.splice(0, 1); + read_offset += 1; + } + } while (read_buf.length() >= sizeof(JournalStream::sentinel)); + dout(4) << "read_buf size is " << read_buf.length() << dendl; + } + { + dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl; + while(true) { + // TODO: detect and handle legacy format journals: can do many things + // on them but on read errors have to give up instead of searching + // for sentinels. + JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT); + bool readable = false; + try { + uint64_t need; + readable = journal_stream.readable(read_buf, &need); + } catch (buffer::error &e) { + readable = false; + dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + read_buf.splice(0, 1); + read_offset += 1; + break; + } + + if (!readable) { + // Out of data, continue to read next object + break; + } + + bufferlist le_bl; //< Serialized LogEvent blob + dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl; + // This cannot fail to decode because we pre-checked that a serialized entry + // blob would be readable. 
+ uint64_t start_ptr = 0; + uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr); + dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl; + if (start_ptr != read_offset) { + derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x" + << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + // FIXME: given that entry was invalid, should we be skipping over it? + // maybe push bytes back onto start of read_buf and just advance one byte + // to start scanning instead. e.g. if a bogus size value is found it can + // cause us to consume and thus skip a bunch of following valid events. + read_offset += consumed; + break; + } + bool valid_entry = true; + if (is_mdlog) { + auto le = LogEvent::decode_event(le_bl.cbegin()); + + if (le) { + dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl; + + if (le->get_type() == EVENT_SUBTREEMAP + || le->get_type() == EVENT_SUBTREEMAP_TEST) { + auto&& sle = dynamic_cast<ESubtreeMap&>(*le); + if (sle.expire_pos > read_offset) { + errors.insert(std::make_pair( + read_offset, EventError( + -ERANGE, + "ESubtreeMap has expire_pos ahead of its own position"))); + } + } + + if (filter.apply(read_offset, *le)) { + events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed)); + } + } else { + valid_entry = false; + } + } else if (type == "purge_queue"){ + auto pi = std::make_unique<PurgeItem>(); + try { + auto q = le_bl.cbegin(); + pi->decode(q); + if (filter.apply(read_offset, *pi)) { + events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed)); + } + } catch (const buffer::error &err) { + valid_entry = false; + } + } else { + ceph_abort(); // should not get here + } + if (!valid_entry) { + dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + read_offset += consumed; + break; + } else { + events_valid.push_back(read_offset); + 
read_offset += consumed; + } + } + } + } + + if (gap) { + // Ended on a gap, assume it ran to end + ranges_invalid.push_back(Range(gap_start, -1)); + } + + dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl; + dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl; + dout(4) << "Found " << events_valid.size() << " valid events" << dendl; + dout(4) << "Selected " << events.size() << " events events for processing" << dendl; + + return 0; +} + + +JournalScanner::~JournalScanner() +{ + if (header) { + delete header; + header = NULL; + } + dout(4) << events.size() << " events" << dendl; + events.clear(); +} + + +/** + * Whether the journal data looks valid and replayable + */ +bool JournalScanner::is_healthy() const +{ + return ((!is_mdlog || (pointer_present && pointer_valid)) + && header_present && header_valid + && ranges_invalid.empty() + && objects_missing.empty()); +} + + +/** + * Whether the journal data can be read from RADOS + */ +bool JournalScanner::is_readable() const +{ + return (header_present && header_valid && objects_missing.empty()); +} + + +/** + * Calculate the object name for a given offset + */ +std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx", + (unsigned long long)(ino), + (unsigned long long)offset); + return std::string(name); +} + + +std::string JournalScanner::obj_name(uint64_t offset) const +{ + return obj_name(ino, offset); +} + + +/* + * Write a human readable summary of the journal health + */ +void JournalScanner::report(std::ostream &out) const +{ + out << "Overall journal integrity: " << (is_healthy() ? 
"OK" : "DAMAGED") << std::endl; + + if (is_mdlog) { + if (!pointer_present) { + out << "Pointer not found" << std::endl; + } else if (!pointer_valid) { + out << "Pointer could not be decoded" << std::endl; + } + } + if (!header_present) { + out << "Header not found" << std::endl; + } else if (!header_valid) { + out << "Header could not be decoded" << std::endl; + } + + if (objects_missing.size()) { + out << "Objects missing:" << std::endl; + for (std::vector<uint64_t>::const_iterator om = objects_missing.begin(); + om != objects_missing.end(); ++om) { + out << " 0x" << std::hex << *om << std::dec << std::endl; + } + } + + if (ranges_invalid.size()) { + out << "Corrupt regions:" << std::endl; + for (std::vector<Range>::const_iterator r = ranges_invalid.begin(); + r != ranges_invalid.end(); ++r) { + out << " 0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl; + } + } +} + diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h new file mode 100644 index 00000000..9197b559 --- /dev/null +++ b/src/tools/cephfs/JournalScanner.h @@ -0,0 +1,133 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + +#ifndef JOURNAL_SCANNER_H +#define JOURNAL_SCANNER_H + +#include "include/rados/librados_fwd.hpp" + +// For Journaler::Header, can't forward-declare nested classes +#include <osdc/Journaler.h> + +#include "JournalFilter.h" + +/** + * A simple sequential reader for metadata journals. Unlike + * the MDS Journaler class, this is written to detect, record, + * and read past corruptions and missing objects. 
It is also + * less efficient but more plainly written. + */ +class JournalScanner +{ + private: + librados::IoCtx &io; + + // Input constraints + const int rank; + std::string type; + JournalFilter const filter; + + void gap_advance(); + + public: + JournalScanner( + librados::IoCtx &io_, + int rank_, + const std::string &type_, + JournalFilter const &filter_) : + io(io_), + rank(rank_), + type(type_), + filter(filter_), + is_mdlog(false), + pointer_present(false), + pointer_valid(false), + header_present(false), + header_valid(false), + header(NULL) {}; + + JournalScanner( + librados::IoCtx &io_, + int rank_, + const std::string &type_) : + io(io_), + rank(rank_), + type(type_), + filter(type_), + is_mdlog(false), + pointer_present(false), + pointer_valid(false), + header_present(false), + header_valid(false), + header(NULL) {}; + + ~JournalScanner(); + + int set_journal_ino(); + int scan(bool const full=true); + int scan_pointer(); + int scan_header(); + int scan_events(); + void report(std::ostream &out) const; + + std::string obj_name(uint64_t offset) const; + std::string obj_name(inodeno_t ino, uint64_t offset) const; + + // The results of the scan + inodeno_t ino; // Corresponds to journal ino according their type + struct EventRecord { + EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {} + EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {} + std::unique_ptr<LogEvent> log_event; + std::unique_ptr<PurgeItem> pi; + uint32_t raw_size = 0; //< Size from start offset including all encoding overhead + }; + + class EventError { + public: + int r; + std::string description; + EventError(int r_, const std::string &desc_) + : r(r_), description(desc_) {} + }; + + typedef std::map<uint64_t, EventRecord> EventMap; + typedef std::map<uint64_t, EventError> ErrorMap; + typedef std::pair<uint64_t, uint64_t> Range; + bool is_mdlog; + bool pointer_present; //mdlog specific + bool 
pointer_valid; //mdlog specific + bool header_present; + bool header_valid; + Journaler::Header *header; + + bool is_healthy() const; + bool is_readable() const; + std::vector<std::string> objects_valid; + std::vector<uint64_t> objects_missing; + std::vector<Range> ranges_invalid; + std::vector<uint64_t> events_valid; + EventMap events; + + // For events present in ::events (i.e. scanned successfully), + // any subsequent errors handling them (e.g. replaying) + ErrorMap errors; + + + private: + // Forbid copy construction because I have ptr members + JournalScanner(const JournalScanner &rhs); +}; + +#endif // JOURNAL_SCANNER_H + diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc new file mode 100644 index 00000000..f6d7c411 --- /dev/null +++ b/src/tools/cephfs/JournalTool.cc @@ -0,0 +1,1256 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray <john.spray@inktank.com> + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. 
+ */ + + +#include <sstream> + +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/InoTable.h" + +#include "mds/events/ENoOp.h" +#include "mds/events/EUpdate.h" + +#include "JournalScanner.h" +#include "EventOutput.h" +#include "Dumper.h" +#include "Resetter.h" + +#include "JournalTool.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + + + +void JournalTool::usage() +{ + std::cout << "Usage: \n" + << " cephfs-journal-tool [options] journal <command>\n" + << " <command>:\n" + << " inspect\n" + << " import <path> [--force]\n" + << " export <path>\n" + << " reset [--force]\n" + << " cephfs-journal-tool [options] header <get|set> <field> <value>\n" + << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n" + << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" + << " <selector>:\n" + << " --range=<start>..<end>\n" + << " --path=<substring>\n" + << " --inode=<integer>\n" + << " --type=<UPDATE|OPEN|SESSION...><\n" + << " --frag=<ino>.<frag> [--dname=<dentry string>]\n" + << " --client=<session id integer>\n" + << " <effect>: [get|recover_dentries|splice]\n" + << " <output>: [summary|list|binary|json] [--path <path>]\n" + << "\n" + << "General options:\n" + << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n" + << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n" + << " this journal is used to queue for purge operation,\n" + << " default is mdlog, and only mdlog support event mode)\n" + << "\n" + << "Special options\n" + << " --alternate-pool <name> Alternative metadata pool to target\n" + << " when using recover_dentries.\n"; + + generic_client_usage(); +} + + +/** + * Handle arguments and hand off to journal/header/event mode + */ +int JournalTool::main(std::vector<const char*> &argv) +{ + int r; + 
+ dout(10) << "JournalTool::main " << dendl; + // Common arg parsing + // ================== + if (argv.empty()) { + cerr << "missing positional argument" << std::endl; + return -EINVAL; + } + + std::vector<const char*>::iterator arg = argv.begin(); + + std::string rank_str; + if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) { + derr << "missing mandatory \"--rank\" argument" << dendl; + return -EINVAL; + } + + if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) { + // Default is mdlog + type = "mdlog"; + } + + r = validate_type(type); + if (r != 0) { + derr << "journal type is not correct." << dendl; + return r; + } + + r = role_selector.parse(*fsmap, rank_str, false); + if (r != 0) { + derr << "Couldn't determine MDS rank." << dendl; + return r; + } + + std::string mode; + if (arg == argv.end()) { + derr << "Missing mode [journal|header|event]" << dendl; + return -EINVAL; + } + mode = std::string(*arg); + arg = argv.erase(arg); + + // RADOS init + // ========== + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; + return r; + } + + dout(4) << "JournalTool: connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + ceph_assert(fs != nullptr); + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + dout(4) << "JournalTool: resolving pool " << pool_id << dendl; + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl; + return r; + } + + dout(4) << "JournalTool: creating IoCtx.." 
<< dendl; + r = rados.ioctx_create(pool_name.c_str(), input); + ceph_assert(r == 0); + output.dup(input); + + // Execution + // ========= + // journal and header are general journal mode + // event mode is only specific for mdlog + auto roles = role_selector.get_roles(); + if (roles.size() > 1) { + const std::string &command = argv[0]; + bool allowed = can_execute_for_all_ranks(mode, command); + if (!allowed) { + derr << "operation not allowed for all ranks" << dendl; + return -EINVAL; + } + + all_ranks = true; + } + for (auto role : roles) { + rank = role.rank; + std::vector<const char *> rank_argv(argv); + dout(4) << "Executing for rank " << rank << dendl; + if (mode == std::string("journal")) { + r = main_journal(rank_argv); + } else if (mode == std::string("header")) { + r = main_header(rank_argv); + } else if (mode == std::string("event")) { + r = main_event(rank_argv); + } else { + cerr << "Bad command '" << mode << "'" << std::endl; + return -EINVAL; + } + + if (r != 0) { + return r; + } + } + + return r; +} + +int JournalTool::validate_type(const std::string &type) +{ + if (type == "mdlog" || type == "purge_queue") { + return 0; + } + return -1; +} + +std::string JournalTool::gen_dump_file_path(const std::string &prefix) { + if (!all_ranks) { + return prefix; + } + + return prefix + "." + std::to_string(rank); +} + +bool JournalTool::can_execute_for_all_ranks(const std::string &mode, + const std::string &command) { + if (mode == "journal" && command == "import") { + return false; + } + + return true; +} + +/** + * Handle arguments for 'journal' mode + * + * This is for operations that act on the journal as a whole. 
+ */ +int JournalTool::main_journal(std::vector<const char*> &argv) +{ + if (argv.empty()) { + derr << "Missing journal command, please see help" << dendl; + return -EINVAL; + } + + std::string command = argv[0]; + if (command == "inspect") { + return journal_inspect(); + } else if (command == "export" || command == "import") { + bool force = false; + if (argv.size() >= 2) { + std::string const path = argv[1]; + if (argv.size() == 3) { + if (std::string(argv[2]) == "--force") { + force = true; + } else { + std::cerr << "Unknown argument " << argv[1] << std::endl; + return -EINVAL; + } + } + return journal_export(path, command == "import", force); + } else { + derr << "Missing path" << dendl; + return -EINVAL; + } + } else if (command == "reset") { + bool force = false; + if (argv.size() == 2) { + if (std::string(argv[1]) == "--force") { + force = true; + } else { + std::cerr << "Unknown argument " << argv[1] << std::endl; + return -EINVAL; + } + } else if (argv.size() > 2) { + std::cerr << "Too many arguments!" << std::endl; + return -EINVAL; + } + return journal_reset(force); + } else { + derr << "Bad journal command '" << command << "'" << dendl; + return -EINVAL; + } +} + + +/** + * Parse arguments and execute for 'header' mode + * + * This is for operations that act on the header only. + */ +int JournalTool::main_header(std::vector<const char*> &argv) +{ + JournalFilter filter(type); + JournalScanner js(input, rank, type, filter); + int r = js.scan(false); + if (r < 0) { + std::cerr << "Unable to scan journal" << std::endl; + return r; + } + + if (!js.header_present) { + std::cerr << "Header object not found!" << std::endl; + return -ENOENT; + } else if (!js.header_valid && js.header == NULL) { + // Can't do a read or a single-field write without a copy of the original + derr << "Header could not be read!" 
<< dendl; + return -ENOENT; + } else { + ceph_assert(js.header != NULL); + } + + if (argv.empty()) { + derr << "Missing header command, must be [get|set]" << dendl; + return -EINVAL; + } + std::vector<const char *>::iterator arg = argv.begin(); + std::string const command = *arg; + arg = argv.erase(arg); + + if (command == std::string("get")) { + // Write JSON journal dump to stdout + JSONFormatter jf(true); + js.header->dump(&jf); + jf.flush(std::cout); + std::cout << std::endl; + } else if (command == std::string("set")) { + // Need two more args <key> <val> + if (argv.size() != 2) { + derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl; + return -EINVAL; + } + + std::string const field_name = *arg; + arg = argv.erase(arg); + + std::string const value_str = *arg; + arg = argv.erase(arg); + ceph_assert(argv.empty()); + + std::string parse_err; + uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid value '" << value_str << "': " << parse_err << dendl; + return -EINVAL; + } + + uint64_t *field = NULL; + if (field_name == "trimmed_pos") { + field = &(js.header->trimmed_pos); + } else if (field_name == "expire_pos") { + field = &(js.header->expire_pos); + } else if (field_name == "write_pos") { + field = &(js.header->write_pos); + } else if (field_name == "pool_id") { + field = (uint64_t*)(&(js.header->layout.pool_id)); + } else { + derr << "Invalid field '" << field_name << "'" << dendl; + return -EINVAL; + } + + std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; + *field = new_val; + + dout(4) << "Writing object..." << dendl; + bufferlist header_bl; + encode(*(js.header), header_bl); + output.write_full(js.obj_name(0), header_bl); + dout(4) << "Write complete." << dendl; + std::cout << "Successfully updated header." 
<< std::endl; + } else { + derr << "Bad header command '" << command << "'" << dendl; + return -EINVAL; + } + + return 0; +} + + +/** + * Parse arguments and execute for 'event' mode + * + * This is for operations that act on LogEvents within the log + */ +int JournalTool::main_event(std::vector<const char*> &argv) +{ + int r; + + if (argv.empty()) { + derr << "Missing event command, please see help" << dendl; + return -EINVAL; + } + + std::vector<const char*>::iterator arg = argv.begin(); + bool dry_run = false; + + std::string command = *(arg++); + if (command != "get" && command != "splice" && command != "recover_dentries") { + derr << "Unknown argument '" << command << "'" << dendl; + return -EINVAL; + } + + if (command == "recover_dentries") { + if (type != "mdlog") { + derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl; + return -EINVAL; + } else { + if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { + dry_run = true; + } + } + } + + if (arg == argv.end()) { + derr << "Incomplete command line" << dendl; + return -EINVAL; + } + + // Parse filter options + // ==================== + JournalFilter filter(type); + r = filter.parse_args(argv, arg); + if (r) { + return r; + } + + // Parse output options + // ==================== + if (arg == argv.end()) { + cerr << "Missing output command" << std::endl; + return -EINVAL; + } + std::string output_style = *(arg++); + if (output_style != "binary" && output_style != "json" && + output_style != "summary" && output_style != "list") { + cerr << "Unknown argument: '" << output_style << "'" << std::endl; + return -EINVAL; + } + + std::string output_path = "dump"; + while(arg != argv.end()) { + std::string arg_str; + if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { + output_path = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", + nullptr)) { + dout(1) << "Using alternate pool " << arg_str << dendl; + int r 
= rados.ioctx_create(arg_str.c_str(), output); + ceph_assert(r == 0); + other_pool = true; + } else { + cerr << "Unknown argument: '" << *arg << "'" << std::endl; + return -EINVAL; + } + } + + const std::string dump_path = gen_dump_file_path(output_path); + + // Execute command + // =============== + JournalScanner js(input, rank, type, filter); + if (command == "get") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + } else if (command == "recover_dentries") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + + /** + * Iterate over log entries, attempting to scavenge from each one + */ + std::set<inodeno_t> consumed_inos; + for (JournalScanner::EventMap::iterator i = js.events.begin(); + i != js.events.end(); ++i) { + auto& le = i->second.log_event; + EMetaBlob const *mb = le->get_metablob(); + if (mb) { + int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); + if (scav_r) { + dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec + << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; + if (r == 0) { + r = scav_r; + } + // Our goal is to read all we can, so don't stop on errors, but + // do record them for possible later output + js.errors.insert(std::make_pair(i->first, + JournalScanner::EventError(scav_r, cpp_strerror(r)))); + } + } + } + + /** + * Update InoTable to reflect any inode numbers consumed during scavenge + */ + dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; + if (consumed_inos.size() && !dry_run) { + int consume_r = consume_inos(consumed_inos); + if (consume_r) { + dout(1) << "Error updating InoTable for " << consumed_inos.size() + << " consume inos: " << cpp_strerror(consume_r) << dendl; + if (r == 0) { + r = consume_r; + } + } + } + + // Remove consumed dentries from lost+found. 
+ if (other_pool && !dry_run) { + std::set<std::string> found; + + for (auto i : consumed_inos) { + char s[20]; + + snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); + dout(20) << "removing " << s << dendl; + found.insert(std::string(s)); + } + + object_t frag_oid; + frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, + frag_t(), ""); + output.omap_rm_keys(frag_oid.name, found); + } + } else if (command == "splice") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + + uint64_t start, end; + if (filter.get_range(start, end)) { + // Special case for range filter: erase a numeric range in the log + uint64_t range = end - start; + int r = erase_region(js, start, range); + if (r) { + derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec + << ": " << cpp_strerror(r) << dendl; + return r; + } + } else { + // General case: erase a collection of individual entries in the log + for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { + dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; + + int r = erase_region(js, i->first, i->second.raw_size); + if (r) { + derr << "Failed to erase event 0x" << std::hex << i->first << std::dec + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + + } else { + cerr << "Unknown argument '" << command << "'" << std::endl; + return -EINVAL; + } + + // Generate output + // =============== + EventOutput output(js, dump_path); + int output_result = 0; + if (output_style == "binary") { + output_result = output.binary(); + } else if (output_style == "json") { + output_result = output.json(); + } else if (output_style == "summary") { + output.summary(); + } else if (output_style == "list") { + output.list(); + } else { + std::cerr << "Bad output command '" << output_style << "'" << std::endl; + return -EINVAL; + } + + if (output_result != 0) { + std::cerr 
<< "Error writing output: " << cpp_strerror(output_result) << std::endl; + } + + return output_result; +} + +/** + * Provide the user with information about the condition of the journal, + * especially indicating what range of log events is available and where + * any gaps or corruptions in the journal are. + */ +int JournalTool::journal_inspect() +{ + int r; + + JournalFilter filter(type); + JournalScanner js(input, rank, type, filter); + r = js.scan(); + if (r) { + std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; + return r; + } + + js.report(std::cout); + + return 0; +} + + +/** + * Attempt to export a binary dump of the journal. + * + * This is allowed to fail if the header is malformed or there are + * objects inaccessible, in which case the user would have to fall + * back to manually listing RADOS objects and extracting them, which + * they can do with the ``rados`` CLI. + */ +int JournalTool::journal_export(std::string const &path, bool import, bool force) +{ + int r = 0; + JournalScanner js(input, rank, type); + + if (!import) { + /* + * If doing an export, first check that the header is valid and + * no objects are missing before trying to dump + */ + r = js.scan(); + if (r < 0) { + derr << "Unable to scan journal, assuming badly damaged" << dendl; + return r; + } + if (!js.is_readable()) { + derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; + return -EIO; + } + } + + /* + * Assuming we can cleanly read the journal data, dump it out to a file + */ + { + Dumper dumper; + r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type); + if (r < 0) { + derr << "dumper::init failed: " << cpp_strerror(r) << dendl; + return r; + } + if (import) { + r = dumper.undump(path.c_str(), force); + } else { + const std::string ex_path = gen_dump_file_path(path); + r = dumper.dump(ex_path.c_str()); + } + } + + return r; +} + + +/** + * Truncate journal and insert EResetJournal + */ +int 
JournalTool::journal_reset(bool hard) +{ + int r = 0; + Resetter resetter; + r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard); + if (r < 0) { + derr << "resetter::init failed: " << cpp_strerror(r) << dendl; + return r; + } + + if (hard) { + r = resetter.reset_hard(); + } else { + r = resetter.reset(); + } + + return r; +} + + +/** + * Selective offline replay which only reads out dentries and writes + * them to the backing store iff their version is > what is currently + * in the backing store. + * + * In order to write dentries to the backing store, we may create the + * required enclosing dirfrag objects. + * + * Test this by running scavenge on an unflushed journal, then nuking + * it offline, then starting an MDS and seeing that the dentries are + * visible. + * + * @param metablob an EMetaBlob retrieved from the journal + * @param dry_run if true, do no writes to RADOS + * @param consumed_inos output, populated with any inos inserted + * @returns 0 on success, else negative error code + */ +int JournalTool::recover_dentries( + EMetaBlob const &metablob, + bool const dry_run, + std::set<inodeno_t> *consumed_inos) +{ + ceph_assert(consumed_inos != NULL); + + int r = 0; + + // Replay fullbits (dentry+inode) + for (const auto& frag : metablob.lump_order) { + EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; + lump._decode_bits(); + object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); + + dout(4) << "inspecting lump " << frag_oid.name << dendl; + + + // We will record old fnode version for use in hard link handling + // If we don't read an old fnode, take version as zero and write in + // all hardlinks we find. 
+ version_t old_fnode_version = 0; + + // Update fnode in omap header of dirfrag object + bool write_fnode = false; + bufferlist old_fnode_bl; + r = input.omap_get_header(frag_oid.name, &old_fnode_bl); + if (r == -ENOENT) { + // Creating dirfrag from scratch + dout(4) << "failed to read OMAP header from directory fragment " + << frag_oid.name << " " << cpp_strerror(r) << dendl; + write_fnode = true; + // Note: creating the dirfrag *without* a backtrace, relying on + // MDS to regenerate backtraces on read or in FSCK + } else if (r == 0) { + // Conditionally update existing omap header + fnode_t old_fnode; + auto old_fnode_iter = old_fnode_bl.cbegin(); + try { + old_fnode.decode(old_fnode_iter); + dout(4) << "frag " << frag_oid.name << " fnode old v" << + old_fnode.version << " vs new v" << lump.fnode.version << dendl; + old_fnode_version = old_fnode.version; + write_fnode = old_fnode_version < lump.fnode.version; + } catch (const buffer::error &err) { + dout(1) << "frag " << frag_oid.name + << " is corrupt, overwriting" << dendl; + write_fnode = true; + } + } else { + // Unexpected error + dout(4) << "failed to read OMAP header from directory fragment " + << frag_oid.name << " " << cpp_strerror(r) << dendl; + return r; + } + + if ((other_pool || write_fnode) && !dry_run) { + dout(4) << "writing fnode to omap header" << dendl; + bufferlist fnode_bl; + lump.fnode.encode(fnode_bl); + if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { + r = output.omap_set_header(frag_oid.name, fnode_bl); + } + if (r != 0) { + derr << "Failed to write fnode for frag object " + << frag_oid.name << dendl; + return r; + } + } + + std::set<std::string> read_keys; + + // Compose list of potentially-existing dentries we would like to fetch + for (const auto& fb : lump.get_dfull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + for(const auto& rb : lump.get_dremote()) { + // Get 
a key like "foobar_head" + std::string key; + dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + for (const auto& nb : lump.get_dnull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + // Perform bulk read of existing dentries + std::map<std::string, bufferlist> read_vals; + r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + if (r == -ENOENT && other_pool) { + r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + } + if (r != 0) { + derr << "unexpected error reading fragment object " + << frag_oid.name << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Compose list of dentries we will write back + std::map<std::string, bufferlist> write_vals; + for (const auto& fb : lump.get_dfull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn + << dendl; + bool write_dentry = false; + if (read_vals.find(key) == read_vals.end()) { + dout(4) << "dentry did not already exist, will create" << dendl; + write_dentry = true; + } else { + dout(4) << "dentry " << key << " existed already" << dendl; + dout(4) << "dentry exists, checking versions..." 
<< dendl; + bufferlist &old_dentry = read_vals[key]; + // Decode dentry+inode + auto q = old_dentry.cbegin(); + + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + if (dentry_type == 'L') { + // leave write_dentry false, we have no version to + // compare with in a hardlink, so it's not safe to + // squash over it with what's in this fullbit + dout(10) << "Existing remote inode in slot to be (maybe) written " + << "by a full inode from the journal dn '" << fb.dn.c_str() + << "' with lump fnode version " << lump.fnode.version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode.version; + } else if (dentry_type == 'I') { + // Read out inode version to compare with backing store + InodeStore inode; + inode.decode_bare(q); + dout(4) << "decoded embedded inode version " + << inode.inode.version << " vs fullbit version " + << fb.inode.version << dendl; + if (inode.inode.version < fb.inode.version) { + write_dentry = true; + } + } else { + dout(4) << "corrupt dentry in backing store, overwriting from " + "journal" << dendl; + write_dentry = true; + } + } + + if ((other_pool || write_dentry) && !dry_run) { + dout(4) << "writing I dentry " << key << " into frag " + << frag_oid.name << dendl; + + // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) + bufferlist dentry_bl; + encode(fb.dnfirst, dentry_bl); + encode('I', dentry_bl); + encode_fullbit_as_inode(fb, true, &dentry_bl); + + // Record for writing to RADOS + write_vals[key] = dentry_bl; + consumed_inos->insert(fb.inode.ino); + } + } + + for(const auto& rb : lump.get_dremote()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn + << dendl; + bool write_dentry = false; + if (read_vals.find(key) == read_vals.end()) { + dout(4) << "dentry did not already exist, will 
create" << dendl; + write_dentry = true; + } else { + dout(4) << "dentry " << key << " existed already" << dendl; + dout(4) << "dentry exists, checking versions..." << dendl; + bufferlist &old_dentry = read_vals[key]; + // Decode dentry+inode + auto q = old_dentry.cbegin(); + + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + if (dentry_type == 'L') { + dout(10) << "Existing hardlink inode in slot to be (maybe) written " + << "by a remote inode from the journal dn '" << rb.dn.c_str() + << "' with lump fnode version " << lump.fnode.version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode.version; + } else if (dentry_type == 'I') { + dout(10) << "Existing full inode in slot to be (maybe) written " + << "by a remote inode from the journal dn '" << rb.dn.c_str() + << "' with lump fnode version " << lump.fnode.version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode.version; + } else { + dout(4) << "corrupt dentry in backing store, overwriting from " + "journal" << dendl; + write_dentry = true; + } + } + + if ((other_pool || write_dentry) && !dry_run) { + dout(4) << "writing L dentry " << key << " into frag " + << frag_oid.name << dendl; + + // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) + bufferlist dentry_bl; + encode(rb.dnfirst, dentry_bl); + encode('L', dentry_bl); + encode(rb.ino, dentry_bl); + encode(rb.d_type, dentry_bl); + + // Record for writing to RADOS + write_vals[key] = dentry_bl; + consumed_inos->insert(rb.ino); + } + } + + std::set<std::string> null_vals; + for (const auto& nb : lump.get_dnull()) { + std::string key; + dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn + << dendl; + + auto it = read_vals.find(key); + if (it != read_vals.end()) { + dout(4) << "dentry exists, will 
remove" << dendl; + + auto q = it->second.cbegin(); + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + bool remove_dentry = false; + if (dentry_type == 'L') { + dout(10) << "Existing hardlink inode in slot to be (maybe) removed " + << "by null journal dn '" << nb.dn.c_str() + << "' with lump fnode version " << lump.fnode.version + << "vs existing fnode version " << old_fnode_version << dendl; + remove_dentry = old_fnode_version < lump.fnode.version; + } else if (dentry_type == 'I') { + dout(10) << "Existing full inode in slot to be (maybe) removed " + << "by null journal dn '" << nb.dn.c_str() + << "' with lump fnode version " << lump.fnode.version + << "vs existing fnode version " << old_fnode_version << dendl; + remove_dentry = old_fnode_version < lump.fnode.version; + } else { + dout(4) << "corrupt dentry in backing store, will remove" << dendl; + remove_dentry = true; + } + + if (remove_dentry) + null_vals.insert(key); + } + } + + // Write back any new/changed dentries + if (!write_vals.empty()) { + r = output.omap_set(frag_oid.name, write_vals); + if (r != 0) { + derr << "error writing dentries to " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + // remove any null dentries + if (!null_vals.empty()) { + r = output.omap_rm_keys(frag_oid.name, null_vals); + if (r != 0) { + derr << "error removing dentries from " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + /* Now that we've looked at the dirlumps, we finally pay attention to + * the roots (i.e. inodes without ancestry). This is necessary in order + * to pick up dirstat updates on ROOT_INO. 
dirstat updates are functionally + * important because clients use them to infer completeness + * of directories + */ + for (const auto& fb : metablob.roots) { + inodeno_t ino = fb.inode.ino; + dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; + + object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); + dout(4) << "object id " << root_oid.name << dendl; + + bool write_root_ino = false; + bufferlist old_root_ino_bl; + r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); + if (r == -ENOENT) { + dout(4) << "root does not exist, will create" << dendl; + write_root_ino = true; + } else if (r >= 0) { + r = 0; + InodeStore old_inode; + dout(4) << "root exists, will modify (" << old_root_ino_bl.length() + << ")" << dendl; + auto inode_bl_iter = old_root_ino_bl.cbegin(); + std::string magic; + decode(magic, inode_bl_iter); + if (magic == CEPH_FS_ONDISK_MAGIC) { + dout(4) << "magic ok" << dendl; + old_inode.decode(inode_bl_iter); + + if (old_inode.inode.version < fb.inode.version) { + write_root_ino = true; + } + } else { + dout(4) << "magic bad: '" << magic << "'" << dendl; + write_root_ino = true; + } + } else { + derr << "error reading root inode object " << root_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (write_root_ino && !dry_run) { + dout(4) << "writing root ino " << root_oid.name + << " version " << fb.inode.version << dendl; + + // Compose: root ino format is magic,InodeStore(bare=false) + bufferlist new_root_ino_bl; + encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); + encode_fullbit_as_inode(fb, false, &new_root_ino_bl); + + // Write to RADOS + r = output.write_full(root_oid.name, new_root_ino_bl); + if (r != 0) { + derr << "error writing inode object " << root_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + return r; +} + + +/** + * Erase a region of the log by overwriting it with ENoOp + * + */ +int JournalTool::erase_region(JournalScanner const &js, 
uint64_t const pos, uint64_t const length) +{ + // To erase this region, we use our preamble, the encoding overhead + // of an ENoOp, and our trailing start ptr. Calculate how much padding + // is needed inside the ENoOp to make up the difference. + bufferlist tmp; + if (type == "mdlog") { + ENoOp enoop(0); + enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else if (type == "purge_queue") { + PurgeItem pi; + pi.encode(tmp); + } + + dout(4) << "erase_region " << pos << " len=" << length << dendl; + + // FIXME: get the preamble/postamble length via JournalStream + int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); + dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; + + if (padding < 0) { + derr << "Erase region " << length << " too short" << dendl; + return -EINVAL; + } + + bufferlist entry; + if (type == "mdlog") { + // Serialize an ENoOp with the correct amount of padding + ENoOp enoop(padding); + enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else if (type == "purge_queue") { + PurgeItem pi; + pi.pad_size = padding; + pi.encode(entry); + } + JournalStream stream(JOURNAL_FORMAT_RESILIENT); + // Serialize region of log stream + bufferlist log_data; + stream.write(entry, &log_data, pos); + + dout(4) << "erase_region data length " << log_data.length() << dendl; + ceph_assert(log_data.length() == length); + + // Write log stream region to RADOS + // FIXME: get object size somewhere common to scan_events + uint32_t object_size = g_conf()->mds_log_segment_size; + if (object_size == 0) { + // Default layout object size + object_size = file_layout_t::get_default().object_size; + } + + uint64_t write_offset = pos; + uint64_t obj_offset = (pos / object_size); + int r = 0; + while(log_data.length()) { + std::string const oid = js.obj_name(obj_offset); + uint32_t offset_in_obj = write_offset % object_size; + uint32_t write_len = min(log_data.length(), 
object_size - offset_in_obj); + + r = output.write(oid, log_data, write_len, offset_in_obj); + if (r < 0) { + return r; + } else { + dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; + r = 0; + } + + log_data.splice(0, write_len); + write_offset += write_len; + obj_offset++; + } + + return r; +} + +/** + * Given an EMetaBlob::fullbit containing an inode, write out + * the encoded inode in the format used by InodeStore (i.e. the + * backing store format) + * + * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use + * on an offline InodeStore instance. It's way simpler, because we are just + * uncritically hauling the data between structs. + * + * @param fb a fullbit extracted from a journal entry + * @param bare if true, leave out [EN|DE]CODE_START decoration + * @param out_bl output, write serialized inode to this bufferlist + */ +void JournalTool::encode_fullbit_as_inode( + const EMetaBlob::fullbit &fb, + const bool bare, + bufferlist *out_bl) +{ + ceph_assert(out_bl != NULL); + + // Compose InodeStore + InodeStore new_inode; + new_inode.inode = fb.inode; + new_inode.xattrs = fb.xattrs; + new_inode.dirfragtree = fb.dirfragtree; + new_inode.snap_blob = fb.snapbl; + new_inode.symlink = fb.symlink; + new_inode.old_inodes = fb.old_inodes; + + // Serialize InodeStore + if (bare) { + new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else { + new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } +} + +/** + * Given a list of inode numbers known to be in use by + * inodes in the backing store, ensure that none of these + * numbers are listed as free in the InoTables in the + * backing store. + * + * Used after injecting inodes into the backing store, to + * ensure that the same inode numbers are not subsequently + * used for new files during ordinary operation. 
+ * + * @param inos list of inode numbers to be removed from + * free lists in InoTables + * @returns 0 on success, else negative error code + */ +int JournalTool::consume_inos(const std::set<inodeno_t> &inos) +{ + int r = 0; + + // InoTable is a per-MDS structure, so iterate over assigned ranks + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + std::set<mds_rank_t> in_ranks; + fs->mds_map.get_mds_set(in_ranks); + + for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin(); + rank_i != in_ranks.end(); ++rank_i) + { + // Compose object name + std::ostringstream oss; + oss << "mds" << *rank_i << "_inotable"; + object_t inotable_oid = object_t(oss.str()); + + // Read object + bufferlist inotable_bl; + int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); + if (read_r < 0) { + // Things are really bad if we can't read inotable. Beyond our powers. + derr << "unable to read inotable '" << inotable_oid.name << "': " + << cpp_strerror(read_r) << dendl; + r = r ? r : read_r; + continue; + } + + // Deserialize InoTable + version_t inotable_ver; + auto q = inotable_bl.cbegin(); + decode(inotable_ver, q); + InoTable ino_table(NULL); + ino_table.decode(q); + + // Update InoTable in memory + bool inotable_modified = false; + for (std::set<inodeno_t>::iterator i = inos.begin(); + i != inos.end(); ++i) + { + const inodeno_t ino = *i; + if (ino_table.force_consume(ino)) { + dout(4) << "Used ino 0x" << std::hex << ino << std::dec + << " requires inotable update" << dendl; + inotable_modified = true; + } + } + + // Serialize and write InoTable + if (inotable_modified) { + inotable_ver += 1; + dout(4) << "writing modified inotable version " << inotable_ver << dendl; + bufferlist inotable_new_bl; + encode(inotable_ver, inotable_new_bl); + ino_table.encode_state(inotable_new_bl); + int write_r = output.write_full(inotable_oid.name, inotable_new_bl); + if (write_r != 0) { + derr << "error writing modified inotable " << inotable_oid.name + << ": " << 
cpp_strerror(write_r) << dendl; + r = r ? r : read_r; + continue; + } + } + } + + return r; +} + diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h new file mode 100644 index 00000000..8d610a86 --- /dev/null +++ b/src/tools/cephfs/JournalTool.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "MDSUtility.h" +#include "RoleSelector.h" +#include <vector> + +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/events/EMetaBlob.h" + +#include "include/rados/librados.hpp" + +#include "JournalFilter.h" + +class JournalScanner; + + +/** + * Command line tool for investigating and repairing filesystems + * with damaged metadata logs + */ +class JournalTool : public MDSUtility +{ + private: + MDSRoleSelector role_selector; + // Bit hacky, use this `rank` member to control behaviour of the + // various main_ functions. 
+ mds_rank_t rank; + // when set, generate per rank dump file path + bool all_ranks = false; + + std::string type; + + // Entry points + int main_journal(std::vector<const char*> &argv); + int main_header(std::vector<const char*> &argv); + int main_event(std::vector<const char*> &argv); + + // Shared functionality + int recover_journal(); + + // Journal operations + int journal_inspect(); + int journal_export(std::string const &path, bool import, bool force); + int journal_reset(bool hard); + + // Header operations + int header_set(); + + // I/O handles + librados::Rados rados; + librados::IoCtx input; + librados::IoCtx output; + + bool other_pool; + + // Metadata backing store manipulation + int read_lost_found(std::set<std::string> &lost); + int recover_dentries( + EMetaBlob const &metablob, + bool const dry_run, + std::set<inodeno_t> *consumed_inos); + + // Splicing + int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length); + + // Backing store helpers + void encode_fullbit_as_inode( + const EMetaBlob::fullbit &fb, + const bool bare, + bufferlist *out_bl); + int consume_inos(const std::set<inodeno_t> &inos); + + //validate type + int validate_type(const std::string &type); + + // generate output file path for dump/export + std::string gen_dump_file_path(const std::string &prefix); + + // check if an operation (mode, command) is safe to be + // executed on all ranks. 
+ bool can_execute_for_all_ranks(const std::string &mode, + const std::string &command); + public: + static void usage(); + JournalTool() : + rank(0), other_pool(false) {} + int main(std::vector<const char*> &argv); +}; + diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc new file mode 100644 index 00000000..b5a3219c --- /dev/null +++ b/src/tools/cephfs/MDSUtility.cc @@ -0,0 +1,162 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "MDSUtility.h" +#include "mon/MonClient.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + + +MDSUtility::MDSUtility() : + Dispatcher(g_ceph_context), + objecter(NULL), + lock("MDSUtility::lock"), + finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"), + waiting_for_mds_map(NULL), + inited(false) +{ + monc = new MonClient(g_ceph_context); + messenger = Messenger::create_client_messenger(g_ceph_context, "mds"); + fsmap = new FSMap(); + objecter = new Objecter(g_ceph_context, messenger, monc, NULL, 0, 0); +} + + +MDSUtility::~MDSUtility() +{ + if (inited) { + shutdown(); + } + delete objecter; + delete monc; + delete messenger; + delete fsmap; + ceph_assert(waiting_for_mds_map == NULL); +} + + +int MDSUtility::init() +{ + // Initialize Messenger + messenger->start(); + + objecter->set_client_incarnation(0); + objecter->init(); + + // Connect dispatchers before starting objecter + messenger->add_dispatcher_tail(objecter); + messenger->add_dispatcher_tail(this); + + // Initialize MonClient + if (monc->build_initial_monmap() < 0) { + objecter->shutdown(); + messenger->shutdown(); + 
messenger->wait(); + return -1; + } + + monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS); + monc->set_messenger(messenger); + monc->init(); + int r = monc->authenticate(); + if (r < 0) { + derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl; + monc->shutdown(); + objecter->shutdown(); + messenger->shutdown(); + messenger->wait(); + return r; + } + + client_t whoami = monc->get_global_id(); + messenger->set_myname(entity_name_t::CLIENT(whoami.v)); + + // Start Objecter and wait for OSD map + objecter->start(); + objecter->wait_for_osd_map(); + + // Prepare to receive MDS map and request it + Mutex init_lock("MDSUtility:init"); + Cond cond; + bool done = false; + ceph_assert(!fsmap->get_epoch()); + lock.Lock(); + waiting_for_mds_map = new C_SafeCond(&init_lock, &cond, &done, NULL); + lock.Unlock(); + monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME); + monc->renew_subs(); + + // Wait for MDS map + dout(4) << "waiting for MDS map..." 
<< dendl; + init_lock.Lock(); + while (!done) + cond.Wait(init_lock); + init_lock.Unlock(); + dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl; + + finisher.start(); + + inited = true; + return 0; +} + + +void MDSUtility::shutdown() +{ + finisher.stop(); + + lock.Lock(); + objecter->shutdown(); + lock.Unlock(); + monc->shutdown(); + messenger->shutdown(); + messenger->wait(); +} + + +bool MDSUtility::ms_dispatch(Message *m) +{ + Mutex::Locker locker(lock); + switch (m->get_type()) { + case CEPH_MSG_FS_MAP: + handle_fs_map((MFSMap*)m); + break; + case CEPH_MSG_OSD_MAP: + break; + default: + return false; + } + m->put(); + return true; +} + + +void MDSUtility::handle_fs_map(MFSMap* m) +{ + *fsmap = m->get_fsmap(); + if (waiting_for_mds_map) { + waiting_for_mds_map->complete(0); + waiting_for_mds_map = NULL; + } +} + + +bool MDSUtility::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) +{ + if (dest_type == CEPH_ENTITY_TYPE_MON) + return true; + + *authorizer = monc->build_authorizer(dest_type); + return *authorizer != NULL; +} diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h new file mode 100644 index 00000000..e75a7192 --- /dev/null +++ b/src/tools/cephfs/MDSUtility.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef MDS_UTILITY_H_ +#define MDS_UTILITY_H_ + +#include "osdc/Objecter.h" +#include "mds/FSMap.h" +#include "messages/MFSMap.h" +#include "msg/Dispatcher.h" +#include "msg/Messenger.h" +#include "auth/Auth.h" +#include "common/Finisher.h" +#include "common/Timer.h" + +/// MDS Utility +/** + * This class is the parent for MDS utilities, i.e. classes that + * need access the objects belonging to the MDS without actually + * acting as an MDS daemon themselves. + */ +class MDSUtility : public Dispatcher { +protected: + Objecter *objecter; + FSMap *fsmap; + Messenger *messenger; + MonClient *monc; + + Mutex lock; + Finisher finisher; + + Context *waiting_for_mds_map; + + bool inited; +public: + MDSUtility(); + ~MDSUtility() override; + + void handle_fs_map(MFSMap* m); + bool ms_dispatch(Message *m) override; + bool ms_handle_reset(Connection *con) override { return false; } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { return false; } + bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override; + int init(); + void shutdown(); +}; + +#endif /* MDS_UTILITY_H_ */ diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc new file mode 100644 index 00000000..2abca722 --- /dev/null +++ b/src/tools/cephfs/PgFiles.cc @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "osdc/Striper.h" + +#include "PgFiles.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "pgeffects." 
<< __func__ << ": " + +int PgFiles::init() +{ + int r = ceph_create_with_context(&cmount, g_ceph_context); + if (r != 0) { + return r; + } + + return ceph_init(cmount); +} + +PgFiles::PgFiles(Objecter *o, const std::set<pg_t> &pgs_) + : objecter(o), pgs(pgs_) +{ + for (const auto &i : pgs) { + pools.insert(i.m_pool); + } +} + +PgFiles::~PgFiles() +{ + ceph_release(cmount); +} + +void PgFiles::hit_dir(std::string const &path) +{ + dout(10) << "entering " << path << dendl; + + ceph_dir_result *dr = nullptr; + int r = ceph_opendir(cmount, path.c_str(), &dr); + if (r != 0) { + derr << "Failed to open path: " << cpp_strerror(r) << dendl; + return; + } + + struct dirent de; + while((r = ceph_readdir_r(cmount, dr, &de)) != 0) { + if (r < 0) { + derr << "Error reading path " << path << ": " << cpp_strerror(r) + << dendl; + ceph_closedir(cmount, dr); // best effort, ignore r + return; + } + + if (std::string(de.d_name) == "." || std::string(de.d_name) == "..") { + continue; + } + + struct ceph_statx stx; + std::string de_path = (path + std::string("/") + de.d_name); + r = ceph_statx(cmount, de_path.c_str(), &stx, + CEPH_STATX_INO|CEPH_STATX_SIZE, 0); + if (r != 0) { + derr << "Failed to stat path " << de_path << ": " + << cpp_strerror(r) << dendl; + // Don't hold up the whole process for one bad inode + continue; + } + + if (S_ISREG(stx.stx_mode)) { + hit_file(de_path, stx); + } else if (S_ISDIR(stx.stx_mode)) { + hit_dir(de_path); + } else { + dout(20) << "Skipping non reg/dir file: " << de_path << dendl; + } + } + + r = ceph_closedir(cmount, dr); + if (r != 0) { + derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl; + return; + } +} + +void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx) +{ + ceph_assert(S_ISREG(stx.stx_mode)); + + dout(20) << "Hitting file '" << path << "'" << dendl; + + int l_stripe_unit = 0; + int l_stripe_count = 0; + int l_object_size = 0; + int l_pool_id = 0; + int r = ceph_get_path_layout(cmount, 
path.c_str(), &l_stripe_unit, + &l_stripe_count, &l_object_size, + &l_pool_id); + if (r != 0) { + derr << "Error reading layout on " << path << ": " << cpp_strerror(r) + << dendl; + return; + } + + struct file_layout_t layout; + layout.stripe_unit = l_stripe_unit; + layout.stripe_count = l_stripe_count; + layout.object_size = l_object_size; + layout.pool_id = l_pool_id; + + // Avoid calculating PG if the layout targeted a completely different pool + if (pools.count(layout.pool_id) == 0) { + dout(20) << "Fast check missed: pool " << layout.pool_id << " not in " + "target set" << dendl; + return; + } + + auto num_objects = Striper::get_num_objects(layout, stx.stx_size); + + for (uint64_t i = 0; i < num_objects; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)stx.stx_ino, + (long long unsigned int)i); + dout(20) << " object " << std::string(buf) << dendl; + + pg_t target; + object_t oid; + object_locator_t loc; + loc.pool = layout.pool_id; + loc.key = std::string(buf); + + unsigned pg_num_mask = 0; + unsigned pg_num = 0; + + int r = 0; + objecter->with_osdmap([&r, oid, loc, &target, &pg_num_mask, &pg_num] + (const OSDMap &osd_map) { + r = osd_map.object_locator_to_pg(oid, loc, target); + if (r == 0) { + auto pool = osd_map.get_pg_pool(loc.pool); + pg_num_mask = pool->get_pg_num_mask(); + pg_num = pool->get_pg_num(); + } + }); + if (r != 0) { + // Can happen if layout pointed to pool not in osdmap, for example + continue; + } + + target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask); + + dout(20) << " target " << target << dendl; + + if (pgs.count(target)) { + std::cout << path << std::endl; + return; + } + } + +} + +int PgFiles::scan_path(std::string const &path) +{ + int r = ceph_mount(cmount, "/"); + if (r != 0) { + derr << "Failed to mount: " << cpp_strerror(r) << dendl; + return r; + } + + hit_dir(path); + + r = ceph_unmount(cmount); + if (r != 0) { + derr << "Failed to unmount: " << cpp_strerror(r) << dendl; + 
return r; + } + + return r; +} + diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h new file mode 100644 index 00000000..1ba4b3d2 --- /dev/null +++ b/src/tools/cephfs/PgFiles.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PG_EFFECTS_H_ +#define PG_EFFECTS_H_ + +#include "include/cephfs/libcephfs.h" +#include "osd/osd_types.h" +#include <set> +#include "osdc/Objecter.h" + +/** + * This utility scans the files (via an online MDS) and works out + * which ones rely on named PGs. For use when someone has + * some bad/damaged PGs and wants to see which files might be + * affected. + */ +class PgFiles +{ +private: + Objecter *objecter; + struct ceph_mount_info *cmount = nullptr; + + std::set<pg_t> pgs; + std::set<uint64_t> pools; + + void hit_file(std::string const &path, const struct ceph_statx &stx); + void hit_dir(std::string const &path); + + +public: + PgFiles(Objecter *o, const std::set<pg_t> &pgs_); + ~PgFiles(); + + int init(); + int scan_path(std::string const &path); +}; + +#endif + diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc new file mode 100644 index 00000000..8ab134f8 --- /dev/null +++ b/src/tools/cephfs/Resetter.cc @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * 
Foundation. See file COPYING. + * + */ +#include <memory> +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/JournalPointer.h" + +#include "mds/mdstypes.h" +#include "mds/MDCache.h" +#include "mon/MonClient.h" +#include "mds/events/EResetJournal.h" + +#include "Resetter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +int Resetter::init(mds_role_t role_, const std::string &type, bool hard) +{ + role = role_; + int r = MDSUtility::init(); + if (r < 0) { + return r; + } + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(nullptr != fs); + + is_mdlog = false; + if (type == "mdlog") { + JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); + int rt = 0; + if (hard) { + jp.front = role.rank + MDS_INO_LOG_OFFSET; + jp.back = 0; + rt = jp.save(objecter); + if (rt != 0) { + derr << "Error writing journal pointer: " << cpp_strerror(rt) << dendl; + return rt; + } + ino = jp.front; // only need to reset ino for mdlog + } else { + rt = jp.load(objecter); + if (rt != 0) { + std::cerr << "Error loading journal: " << cpp_strerror(rt) << + ", pass --force to forcibly reset this journal" << std::endl; + return rt; + } else { + ino = jp.front; + } + } + is_mdlog = true; + } else if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + role.rank; + } else { + ceph_abort(); // should not get here + } + return 0; +} + +int Resetter::reset() +{ + Mutex mylock("Resetter::reset::lock"); + Cond cond; + bool done; + int r; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + Journaler journaler("resetter", ino, + fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, + objecter, 0, 0, &finisher); + + lock.Lock(); + journaler.recover(new C_SafeCond(&mylock, &cond, &done, &r)); + lock.Unlock(); + + mylock.Lock(); + while (!done) + cond.Wait(mylock); + mylock.Unlock(); + + if (r != 0) { + if (r == -ENOENT) { + cerr << "journal does not exist on-disk. Did you set a bad rank?" 
+ << std::endl; + std::cerr << "Error loading journal: " << cpp_strerror(r) << + ", pass --force to forcibly reset this journal" << std::endl; + return r; + } else { + cerr << "got error " << r << "from Journaler, failing" << std::endl; + return r; + } + } + + lock.Lock(); + uint64_t old_start = journaler.get_read_pos(); + uint64_t old_end = journaler.get_write_pos(); + uint64_t old_len = old_end - old_start; + cout << "old journal was " << old_start << "~" << old_len << std::endl; + + uint64_t new_start = round_up_to(old_end+1, journaler.get_layout_period()); + cout << "new journal start will be " << new_start + << " (" << (new_start - old_end) << " bytes past old end)" << std::endl; + + journaler.set_read_pos(new_start); + journaler.set_write_pos(new_start); + journaler.set_expire_pos(new_start); + journaler.set_trimmed_pos(new_start); + journaler.set_writeable(); + + cout << "writing journal head" << std::endl; + journaler.write_head(new C_SafeCond(&mylock, &cond, &done, &r)); + lock.Unlock(); + + mylock.Lock(); + while (!done) + cond.Wait(mylock); + mylock.Unlock(); + + Mutex::Locker l(lock); + if (r != 0) { + return r; + } + + if (is_mdlog) { + r = _write_reset_event(&journaler); // reset envent is specific for mdlog journal + if (r != 0) { + return r; + } + } + cout << "done" << std::endl; + + return 0; +} + +int Resetter::reset_hard() +{ + auto fs = fsmap->get_filesystem(role.fscid); + + Journaler journaler("resetter", ino, + fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, + objecter, 0, 0, &finisher); + journaler.set_writeable(); + + file_layout_t default_log_layout = MDCache::gen_default_log_layout( + fsmap->get_filesystem(role.fscid)->mds_map); + journaler.create(&default_log_layout, g_conf()->mds_journal_format); + + C_SaferCond cond; + { + Mutex::Locker l(lock); + journaler.write_head(&cond); + } + + int r = cond.wait(); + if (r != 0) { + derr << "Error writing journal header: " << cpp_strerror(r) << dendl; + return r; + } + + if (is_mdlog) // 
reset event is specific for mdlog journal + { + Mutex::Locker l(lock); + r = _write_reset_event(&journaler); + if (r != 0) { + derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (is_mdlog) { + dout(4) << "Successfully wrote new journal pointer and header for rank " + << role << dendl; + } else { + dout(4) << "Successfully wrote header for rank " << role << dendl; + } + return 0; +} + +int Resetter::_write_reset_event(Journaler *journaler) +{ + ceph_assert(journaler != NULL); + + auto le = std::make_unique<EResetJournal>(); + + bufferlist bl; + le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + cout << "writing EResetJournal entry" << std::endl; + journaler->append_entry(bl); + + int ret; + { + C_SaferCond cond; + journaler->flush(&cond); + ret = cond.wait(); + if (ret < 0) + return ret; + } + { + // wait until all journal prezero ops are done + C_SaferCond cond; + journaler->wait_for_prezero(&cond); + cond.wait(); + } + + return ret; +} + diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h new file mode 100644 index 00000000..6998e459 --- /dev/null +++ b/src/tools/cephfs/Resetter.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef JOURNAL_RESETTER_H_ +#define JOURNAL_RESETTER_H_ + + +#include "MDSUtility.h" + +class Journaler; + +/** + * This class lets you reset an mds journal for troubleshooting or whatever. + * + * To use, create a Resetter, call init(), and then call reset() with the name + * of the file to dump to. 
+ */ +class Resetter : public MDSUtility { +private: + mds_role_t role; + inodeno_t ino; + bool is_mdlog; + +protected: + int _write_reset_event(Journaler *journaler); + +public: + Resetter() {} + ~Resetter() {} + + int init(mds_role_t role_, const std::string &type, bool hard); + /** + * For use when no journal header/pointer was present: write one + * out from scratch. + */ + int reset_hard(); + int reset(); +}; + +#endif /* JOURNAL_RESETTER_H_ */ diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc new file mode 100644 index 00000000..e2d53b86 --- /dev/null +++ b/src/tools/cephfs/RoleSelector.cc @@ -0,0 +1,59 @@ + +#include "RoleSelector.h" + +int MDSRoleSelector::parse_rank( + const FSMap &fsmap, + std::string const &str) +{ + if (str == "all" || str == "*") { + std::set<mds_rank_t> in; + const MDSMap &mds_map = fsmap.get_filesystem(fscid)->mds_map; + mds_map.get_mds_set(in); + + for (auto rank : in) { + roles.push_back(mds_role_t(fscid, rank)); + } + + return 0; + } else { + std::string rank_err; + mds_rank_t rank = strict_strtol(str.c_str(), 10, &rank_err); + if (!rank_err.empty()) { + return -EINVAL; + } + if (fsmap.get_filesystem(fscid)->mds_map.is_dne(rank)) { + return -ENOENT; + } + roles.push_back(mds_role_t(fscid, rank)); + return 0; + } +} + +int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str, + bool allow_unqualified_rank) +{ + auto colon_pos = str.find(":"); + if (colon_pos == std::string::npos) { + // An unqualified rank. Only valid if there is only one + // namespace. 
+ if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) { + fscid = fsmap.get_filesystem()->fscid; + return parse_rank(fsmap, str); + } else { + return -EINVAL; + } + } else if (colon_pos == 0 || colon_pos == str.size() - 1) { + return -EINVAL; + } else { + const std::string ns_str = str.substr(0, colon_pos); + const std::string rank_str = str.substr(colon_pos + 1); + std::shared_ptr<const Filesystem> fs_ptr; + int r = fsmap.parse_filesystem(ns_str, &fs_ptr); + if (r != 0) { + return r; + } + fscid = fs_ptr->fscid; + return parse_rank(fsmap, rank_str); + } +} + diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h new file mode 100644 index 00000000..9090b720 --- /dev/null +++ b/src/tools/cephfs/RoleSelector.h @@ -0,0 +1,36 @@ + +#ifndef ROLE_SELECTOR_H_ +#define ROLE_SELECTOR_H_ + +#include <string> +#include <vector> +#include "mds/mdstypes.h" +#include "mds/FSMap.h" + +/** + * When you want to let the user act on a single rank in a namespace, + * or all of them. 
+ */ +class MDSRoleSelector +{ + public: + const std::vector<mds_role_t> &get_roles() const {return roles;} + int parse(const FSMap &fsmap, std::string const &str, + bool allow_unqualified_rank=true); + MDSRoleSelector() + : fscid(FS_CLUSTER_ID_NONE) + {} + fs_cluster_id_t get_ns() const + { + return fscid; + } + protected: + int parse_rank( + const FSMap &fsmap, + std::string const &str); + std::vector<mds_role_t> roles; + fs_cluster_id_t fscid; +}; + +#endif // ROLE_SELECTOR_H_ + diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc new file mode 100644 index 00000000..e779b4b6 --- /dev/null +++ b/src/tools/cephfs/TableTool.cc @@ -0,0 +1,417 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "common/ceph_argparse.h" +#include "common/errno.h" + +#include "mds/SessionMap.h" +#include "mds/InoTable.h" +#include "mds/SnapServer.h" + +#include "TableTool.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + +void TableTool::usage() +{ + std::cout << "Usage: \n" + << " cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>" + << " cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>" + << std::endl; + + generic_client_usage(); +} + + +/** + * For a function that takes an MDS role as an argument and + * returns an error code, execute it on the roles specified + * by `role_selector`. 
+ */ +int TableTool::apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f) +{ + ceph_assert(f != NULL); + + int r = 0; + + f->open_object_section("ranks"); + + for (auto role : role_selector.get_roles()) { + std::ostringstream rank_str; + rank_str << role.rank; + f->open_object_section(rank_str.str().c_str()); + + f->open_object_section("data"); + int rank_r = fptr(role, f); + f->close_section(); + r = r ? r : rank_r; + + f->dump_int("result", rank_r); + f->close_section(); + + + } + + f->close_section(); + + return r; +} + + +/** + * This class wraps an MDS table class (SessionMap, SnapServer, InoTable) + * with offline load/store code such that we can do offline dumps and resets + * on those tables. + */ +template <typename A> +class TableHandler +{ +protected: + // The RADOS object ID for the table + std::string object_name; + + // The role in question (may be NONE) + mds_role_t role; + + // Whether this is an MDSTable subclass (i.e. has leading version field to decode) + bool mds_table; + +public: + TableHandler(mds_role_t r, std::string const &name, bool mds_table_) + : role(r), mds_table(mds_table_) + { + // Compose object name of the table we will dump + std::ostringstream oss; + oss << "mds"; + if (!role.is_none()) { + oss << role.rank; + } + oss << "_" << name; + object_name = oss.str(); + } + + int load_and_dump(librados::IoCtx *io, Formatter *f) + { + ceph_assert(io != NULL); + ceph_assert(f != NULL); + + // Attempt read + bufferlist table_bl; + int read_r = io->read(object_name, table_bl, 0, 0); + if (read_r >= 0) { + auto q = table_bl.cbegin(); + try { + if (mds_table) { + version_t version; + decode(version, q); + f->dump_int("version", version); + } + A table_inst; + table_inst.set_rank(role.rank); + table_inst.decode(q); + table_inst.dump(f); + + return 0; + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + } else { + derr << "error reading table object " << 
object_name + << ": " << cpp_strerror(read_r) << dendl; + return read_r; + } + } + + int reset(librados::IoCtx *io) + { + A table_inst; + // Compose new (blank) table + table_inst.set_rank(role.rank); + table_inst.reset_state(); + // Write the table out + return write(table_inst, io); + } + +protected: + + int write(const A &table_inst, librados::IoCtx *io) + { + bufferlist new_bl; + if (mds_table) { + version_t version = 1; + encode(version, new_bl); + } + table_inst.encode_state(new_bl); + + // Write out new table + int r = io->write_full(object_name, new_bl); + if (r != 0) { + derr << "error writing table object " << object_name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + return r; + } +}; + +template <typename A> +class TableHandlerOmap +{ +private: + // The RADOS object ID for the table + std::string object_name; + + // The role (rank may be NONE) + mds_role_t role; + + // Whether this is an MDSTable subclass (i.e. has leading version field to decode) + bool mds_table; + +public: + TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_) + : role(r), mds_table(mds_table_) + { + // Compose object name of the table we will dump + std::ostringstream oss; + oss << "mds"; + if (!role.is_none()) { + oss << role.rank; + } + oss << "_" << name; + object_name = oss.str(); + } + + int load_and_dump(librados::IoCtx *io, Formatter *f) + { + ceph_assert(io != NULL); + ceph_assert(f != NULL); + + // Read in the header + bufferlist header_bl; + int r = io->omap_get_header(object_name, &header_bl); + if (r != 0) { + derr << "error reading header on '" << object_name << "': " + << cpp_strerror(r) << dendl; + return r; + } + + // Decode the header + A table_inst; + table_inst.set_rank(role.rank); + try { + table_inst.decode_header(header_bl); + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + + // Read and decode OMAP values in chunks + std::string last_key = ""; + while(true) { + 
std::map<std::string, bufferlist> values; + int r = io->omap_get_vals(object_name, last_key, + g_conf()->mds_sessionmap_keys_per_op, &values); + + if (r != 0) { + derr << "error reading values: " << cpp_strerror(r) << dendl; + return r; + } + + if (values.empty()) { + break; + } + + try { + table_inst.decode_values(values); + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + last_key = values.rbegin()->first; + } + + table_inst.dump(f); + + return 0; + } + + int reset(librados::IoCtx *io) + { + A table_inst; + table_inst.set_rank(role.rank); + table_inst.reset_state(); + bufferlist header_bl; + table_inst.encode_header(&header_bl); + + // Compose a transaction to clear and write header + librados::ObjectWriteOperation op; + op.omap_clear(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.omap_set_header(header_bl); + + return io->operate(object_name, &op); + } +}; + +class InoTableHandler : public TableHandler<InoTable> +{ + public: + explicit InoTableHandler(mds_role_t r) + : TableHandler(r, "inotable", true) + {} + + int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f) + { + InoTable inst; + inst.set_rank(role.rank); + inst.reset_state(); + + int r = 0; + if (inst.force_consume_to(max)) { + r = write(inst, io); + } + + f->dump_int("version", inst.get_version()); + inst.dump(f); + + return r; + } +}; + + +int TableTool::main(std::vector<const char*> &argv) +{ + int r; + + dout(10) << __func__ << dendl; + + // RADOS init + // ========== + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; + return r; + } + + dout(4) << "connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; + return r; + } + + // Require at least 3 args <rank> <mode> <arg> [args...] 
+ if (argv.size() < 3) { + cerr << "missing required 3 arguments" << std::endl; + return -EINVAL; + } + + const std::string role_str = std::string(argv[0]); + const std::string mode = std::string(argv[1]); + const std::string table = std::string(argv[2]); + + r = role_selector.parse(*fsmap, role_str); + if (r < 0) { + derr << "Bad rank selection: " << role_str << "'" << dendl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + ceph_assert(fs != nullptr); + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + dout(4) << "resolving pool " << pool_id << dendl; + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!" + << dendl; + return r; + } + + dout(4) << "creating IoCtx.." << dendl; + r = rados.ioctx_create(pool_name.c_str(), io); + if (r != 0) { + return r; + } + + JSONFormatter jf(true); + if (mode == "reset") { + const std::string table = std::string(argv[2]); + if (table == "session") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io); + }, &jf); + } else if (table == "inode") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandler<InoTable>(rank, "inotable", true).reset(&io); + }, &jf); + } else if (table == "snap") { + r = TableHandler<SnapServer>(mds_role_t(), "snaptable", true).reset(&io); + jf.open_object_section("reset_snap_status"); + jf.dump_int("result", r); + jf.close_section(); + } else { + cerr << "Invalid table '" << table << "'" << std::endl; + return -EINVAL; + } + } else if (mode == "show") { + const std::string table = std::string(argv[2]); + if (table == "session") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f); + }, &jf); + } else if (table == "inode") 
{ + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);; + }, &jf); + } else if (table == "snap") { + jf.open_object_section("show_snap_table"); + { + r = TableHandler<SnapServer>( + mds_role_t(), "snaptable", true).load_and_dump(&io, &jf); + jf.dump_int("result", r); + } + jf.close_section(); + } else { + cerr << "Invalid table '" << table << "'" << std::endl; + return -EINVAL; + } + } else if (mode == "take_inos") { + const std::string ino_str = std::string(argv[2]); + std::string ino_err; + inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err); + if (!ino_err.empty()) { + derr << "Bad ino '" << ino_str << "'" << dendl; + return -EINVAL; + } + r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int { + return InoTableHandler(rank).take_inos(&io, ino, f); + }, &jf); + } else { + cerr << "Invalid mode '" << mode << "'" << std::endl; + return -EINVAL; + } + + // Subcommand should have written to formatter, flush it + jf.flush(std::cout); + std::cout << std::endl; + return r; +} + diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h new file mode 100644 index 00000000..bf9b95c1 --- /dev/null +++ b/src/tools/cephfs/TableTool.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "MDSUtility.h" +#include "RoleSelector.h" + +#include "include/rados/librados.hpp" + +/** + * Command line tool for debugging the backing store of + * MDSTable instances. 
+ */ +class TableTool : public MDSUtility +{ + private: + MDSRoleSelector role_selector; + + // I/O handles + librados::Rados rados; + librados::IoCtx io; + + int apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f); + + public: + static void usage(); + int main(std::vector<const char*> &argv); + +}; + diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc new file mode 100644 index 00000000..e6efff66 --- /dev/null +++ b/src/tools/cephfs/cephfs-data-scan.cc @@ -0,0 +1,47 @@ + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "DataScan.h" + + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + DataScan::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + DataScan data_scan; + + // Connect to mon cluster, download MDS map etc + int rc = data_scan.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = data_scan.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + + return rc; +} + diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc new file mode 100644 index 00000000..290cb305 --- /dev/null +++ b/src/tools/cephfs/cephfs-journal-tool.cc @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "JournalTool.h" + + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + JournalTool::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + JournalTool jt; + + // Connect to mon cluster, download MDS map etc + int rc = jt.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = jt.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + return rc; +} + diff --git a/src/tools/cephfs/cephfs-shell b/src/tools/cephfs/cephfs-shell new file mode 100644 index 00000000..5db84b56 --- /dev/null +++ b/src/tools/cephfs/cephfs-shell @@ -0,0 +1,1295 @@ +#!/usr/bin/python3 +# coding = utf-8 + +import argparse +import os +import os.path +import sys +from cmd2 import Cmd +import cephfs as libcephfs +import shutil +import traceback +import colorama +import fnmatch +import math +import re +import shlex + +if sys.version_info.major < 3: + raise RuntimeError("cephfs-shell is only compatible with python3") + +try: + from cmd2 import with_argparser +except ImportError: + def with_argparser(argparser): + import functools + + def argparser_decorator(func): + @functools.wraps(func) + def wrapper(thiz, cmdline): + if isinstance(cmdline, list): + arglist = cmdline + else: + # do not split if it's 
def mode_notation(mode):
    """
    Render a numeric ``st_mode`` as an ``ls -l`` style string.

    :param mode: integer mode (file-type bits plus permission bits).
    :return: a 10-character string: ``'d'`` for a directory (``'-'`` for
             anything else) followed by the ``rwx`` triplets for user,
             group and other.

    The file type and permissions are taken from the bits of ``mode``
    rather than from characters of ``oct(mode)``; the textual approach
    mistook set-uid files (e.g. 0o4755) for directories and raised
    ``KeyError`` for modes with fewer than three octal digits.
    """
    permission_bits = ('---', '--x', '-w-', '-wx',
                       'r--', 'r-x', 'rw-', 'rwx')
    # 0o170000 masks the file-type field; 0o040000 is the directory type.
    is_dir = (mode & 0o170000) == 0o040000
    notation = 'd' if is_dir else '-'
    # user, group, other: three bits each, most significant triplet first.
    for shift in (6, 3, 0):
        notation += permission_bits[(mode >> shift) & 0o7]
    return notation
suffixes = ['B', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count with a binary-unit suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    suffix in ``suffixes`` is reached, then rounded *up* to a whole number.

    :param nbytes: size in bytes.
    :return: string such as ``'0B'``, ``'2K'`` or ``'1M'``.
    """
    last_index = len(suffixes) - 1
    index = 0
    value = nbytes
    while index < last_index and value >= 1024:
        value = value / 1024.
        index += 1
    rounded = math.ceil(value)
    return '%d%s' % (rounded, suffixes[index])
def copy_from_local(local_path, remote_path):
    """
    Copy a local file (or standard input) into a CephFS file.

    :param local_path: bytes path of the local source file, or ``b'-'`` to
                       read from ``sys.stdin``.
    :param remote_path: bytes path of the CephFS file to write.

    Errors opening either side are reported through the shell's ``perror``
    and the function returns without copying. The module has no free
    ``perror`` function, so the error paths go through ``shell.perror``
    (the same object the module-level ``poutput`` helper delegates to).
    Both handles are released on every exit path.
    """
    reading_stdin = local_path == b'-'
    if reading_stdin:
        src = sys.stdin
    else:
        try:
            src = open(local_path, 'rb')
        except PermissionError:
            shell.perror('error: no permission to read local file {}'.format(
                local_path.decode('utf-8')), end='\n', apply_style=True)
            return

    try:
        try:
            fd = cephfs.open(remote_path, 'w', 0o666)
        except libcephfs.Error:
            shell.perror('error: no permission to write remote file {}'.format(
                remote_path.decode('utf-8')), end='\n', apply_style=True)
            return

        try:
            offset = 0
            while True:
                data = src.read(65536)
                if not data:
                    break
                if reading_stdin:
                    # sys.stdin yields str; the remote write is fed bytes
                    data = to_bytes(data)
                wrote = cephfs.write(fd, data, offset)
                if wrote < 0:
                    break
                offset += wrote
        finally:
            cephfs.close(fd)
    finally:
        # never close sys.stdin, only files opened here
        if not reading_stdin:
            src.close()
    poutput('')
def dirwalk(path):
    """
    Generator that walks a CephFS directory tree.

    Every entry returned by ``ls(..., opts='A')`` below ``path`` is yielded
    as a normalised path; an entry that ``is_dir_exists`` reports as a
    directory is descended into right after it has been yielded.
    """
    root = os.path.normpath(path)
    for entry in ls(root, opts='A'):
        child = os.path.join(root, entry.d_name)
        parent = child.rsplit(b'/', 1)[0]

        yield os.path.normpath(child)
        if not is_dir_exists(entry.d_name, parent):
            continue
        yield from dirwalk(child)
formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + for args, kwargs in argparse_args: + parser.add_argument(*args, **kwargs) + return parser + + def complete_filenames(self, text, line, begidx, endidx): + if not text: + completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir()) + for x in ls(b".", opts='A')] + else: + if text.count('/') > 0: + completions = [text.rsplit('/', 1)[0] + '/' + + x.d_name.decode('utf-8') + '/' + * int(x.is_dir()) for x in ls('/' + + text.rsplit('/', 1)[0], opts='A') + if x.d_name.decode('utf-8').startswith( + text.rsplit('/', 1)[1])] + else: + completions = [x.d_name.decode('utf-8') + '/' + * int(x.is_dir()) for x in ls(b".", opts='A') + if x.d_name.decode('utf-8').startswith(text)] + if len(completions) == 1 and completions[0][-1] == '/': + dir_, file_ = completions[0].rsplit('/', 1) + completions.extend([dir_ + '/' + x.d_name.decode('utf-8') + + '/' * int(x.is_dir()) for x in + ls('/' + dir_, opts='A') + if x.d_name.decode('utf-8').startswith(file_)]) + return self.delimiter_complete(text, line, begidx, endidx, completions, '/') + return completions + + def onecmd(self, line): + """ + Global error catcher + """ + try: + res = Cmd.onecmd(self, line) + if self.interactive: + self.set_prompt() + return res + except ConnectionError as e: + self.poutput('***', e) + except KeyboardInterrupt: + self.poutput('Command aborted') + except Exception as e: + self.poutput(e) + traceback.print_exc(file=sys.stdout) + + class path_to_bytes(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + if isinstance(values, str): + values = to_bytes(values) + if isinstance(values, list): + values = list(map(to_bytes, values)) + setattr(namespace, self.dest, values) + + def complete_mkdir(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + class ModeAction(argparse.Action): + def __init__(self, option_strings, dest, nargs=None, **kwargs): + if nargs is not None and nargs != '?': + raise ValueError("more than one modes not allowed") + super().__init__(option_strings, dest, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + o_mode = 0 + res = None + try: + o_mode = int(values, base=8) + except ValueError: + res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', values) + if res is None: + parser.error("invalid mode: %s\n" + "mode must be a numeric octal literal\n" + "or ((u?g?o?)|(a?))(=)(r?w?x?)" % + values) + else: + # we are supporting only assignment of mode and not + or - + # as is generally available with the chmod command + # eg. + # >>> res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=') + # >>> res.groups() + # ('go', 'go', None, '=', '') + val = res.groups() + + if val[3] != '=': + parser.error("need assignment operator between user " + "and mode specifiers") + if val[4] == '': + parser.error("invalid mode: %s\n" + "mode must be combination of: r | w | x" % + values) + users = '' + if val[2] is None: + users = val[1] + else: + users = val[2] + + t_mode = 0 + if users == 'a': + users = 'ugo' + + if 'r' in val[4]: + t_mode |= 4 + if 'w' in val[4]: + t_mode |= 2 + if 'x' in val[4]: + t_mode |= 1 + + if 'u' in users: + o_mode |= (t_mode << 6) + if 'g' in users: + o_mode |= (t_mode << 3) + if 'o' in users: + o_mode |= t_mode + + if o_mode < 0: + parser.error("invalid mode: %s\n" + "mode cannot be negative" % values) + if o_mode > 0o777: + parser.error("invalid mode: %s\n" + "mode cannot be greater than octal 0777" % values) + + setattr(namespace, self.dest, str(oct(o_mode))) + + mkdir_parser = argparse.ArgumentParser( + description='Create the directory(ies), if they do not already exist.') + mkdir_parser.add_argument('dirs', type=str, + action=path_to_bytes, + metavar='DIR_NAME', + help='Name of 
new_directory.', + nargs='+') + mkdir_parser.add_argument('-m', '--mode', type=str, + action=ModeAction, + help='Sets the access mode for the new directory.') + mkdir_parser.add_argument('-p', '--parent', action='store_true', + help='Create parent directories as necessary. \ +When this option is specified, no error is reported if a directory already \ +exists.') + + @with_argparser(mkdir_parser) + def do_mkdir(self, args): + """ + Create directory. + """ + for path in args.dirs: + if args.mode: + permission = int(args.mode, 8) + else: + permission = 0o777 + if args.parent: + cephfs.mkdirs(path, permission) + else: + try: + cephfs.mkdir(path, permission) + except libcephfs.Error: + self.poutput("directory missing in the path; " + "you may want to pass the -p argument") + return + + def complete_put(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + put_parser = argparse.ArgumentParser( + description='Copy a file/directory to Ceph File System from Local File System.') + put_parser.add_argument('local_path', type=str, action=path_to_bytes, + help='Path of the file in the local system') + put_parser.add_argument('remote_path', type=str, action=path_to_bytes, + help='Path of the file in the remote system.', + nargs='?', default='.') + put_parser.add_argument('-f', '--force', action='store_true', + help='Overwrites the destination if it already exists.') + + @with_argparser(put_parser) + def do_put(self, args): + """ + Copy a file to Ceph File System from Local Directory. + """ + root_src_dir = args.local_path + root_dst_dir = args.remote_path + if args.local_path == b'.' 
or args.local_path == b'./': + root_src_dir = os.getcwdb() + elif len(args.local_path.rsplit(b'/', 1)) < 2: + root_src_dir = os.path.join(os.getcwdb(), args.local_path) + else: + p = args.local_path.split(b'/') + if p[0] == b'.': + root_src_dir = os.getcwdb() + p.pop(0) + while len(p) > 0: + root_src_dir += b'/' + p.pop(0) + + if root_dst_dir == b'.': + if args.local_path != b'-': + root_dst_dir = root_src_dir.rsplit(b'/', 1)[1] + if root_dst_dir == b'': + root_dst_dir = root_src_dir.rsplit(b'/', 1)[0] + a = root_dst_dir.rsplit(b'/', 1) + if len(a) > 1: + root_dst_dir = a[1] + else: + root_dst_dir = a[0] + else: + self.poutput("error: no filename specified for destination") + return + + if root_dst_dir[-1] != b'/': + root_dst_dir += b'/' + + if args.local_path == b'-' or os.path.isfile(root_src_dir): + if not args.force: + if os.path.isfile(root_src_dir): + dst_file = root_dst_dir + if is_file_exists(dst_file): + self.perror('{}: file exists! use --force to overwrite'.format( + dst_file.decode('utf-8')), end='\n', + apply_style=True) + return + if args.local_path == b'-': + root_src_dir = b'-' + copy_from_local(root_src_dir, root_dst_dir) + else: + for src_dir, dirs, files in os.walk(root_src_dir): + if isinstance(src_dir, str): + src_dir = to_bytes(src_dir) + dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1) + dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd() + + dst_dir) + if args.force and dst_dir != b'/' and not is_dir_exists( + dst_dir[:-1]) and not locate_file(dst_dir): + try: + cephfs.mkdirs(dst_dir, 0o777) + except libcephfs.Error: + pass + if (not args.force) and dst_dir != b'/' and not is_dir_exists( + dst_dir) and not os.path.isfile(root_src_dir): + try: + cephfs.mkdirs(dst_dir, 0o777) + except libcephfs.Error: + pass + + for dir_ in dirs: + dir_name = os.path.join(dst_dir, dir_) + if not is_dir_exists(dir_name): + try: + cephfs.mkdirs(dir_name, 0o777) + except libcephfs.Error: + pass + + for file_ in files: + src_file = os.path.join(src_dir, file_) 
+ dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_) + if (not args.force) and is_file_exists(dst_file): + return + copy_from_local(src_file, os.path.join(cephfs.getcwd(), + dst_file)) + + def complete_get(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + get_parser = argparse.ArgumentParser( + description='Copy a file from Ceph File System from Local Directory.') + get_parser.add_argument('remote_path', type=str, action=path_to_bytes, + help='Path of the file in the remote system') + get_parser.add_argument('local_path', type=str, action=path_to_bytes, + help='Path of the file in the local system', + nargs='?', default='.') + get_parser.add_argument('-f', '--force', action='store_true', + help='Overwrites the destination if it already exists.') + + @with_argparser(get_parser) + def do_get(self, args): + """ + Copy a file/directory from Ceph File System to Local Directory. + """ + root_src_dir = args.remote_path + root_dst_dir = args.local_path + fname = root_src_dir.rsplit(b'/', 1) + if args.local_path == b'.': + root_dst_dir = os.getcwdb() + if args.remote_path == b'.': + root_src_dir = cephfs.getcwd() + if args.local_path == b'-': + if args.remote_path == b'.' or args.remote_path == b'./': + self.perror('error: no remote file name specified', end='\n', + apply_style=True) + return + copy_to_local(root_src_dir, b'-') + elif is_file_exists(args.remote_path): + copy_to_local(root_src_dir, + root_dst_dir + b'/' + root_src_dir) + elif b'/'in root_src_dir and is_file_exists(fname[1], fname[0]): + copy_to_local(root_src_dir, root_dst_dir) + else: + files = list(reversed(sorted(dirwalk(root_src_dir)))) + if len(files) == 0: + try: + os.makedirs(root_dst_dir + b'/' + root_src_dir) + except OSError: + if args.force: + pass + else: + self.perror('{}: already exists! 
use --force to overwrite'.format( + root_src_dir.decode('utf-8')), end='\n', + apply_style=True) + return + + for file_ in files: + dst_dirpath, dst_file = file_.rsplit(b'/', 1) + if dst_dirpath in files: + files.remove(dst_dirpath) + dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file) + dst_path = os.path.normpath(dst_path) + if is_dir_exists(file_): + try: + os.makedirs(dst_path) + except OSError: + pass + else: + if not args.force: + try: + os.stat(dst_path) + self.perror('{}: file already exists! use --force to override'.format( + file_.decode('utf-8')), end='\n', + apply_style=True) + return + except OSError: + copy_to_local(file_, dst_path) + else: + copy_to_local(file_, dst_path) + + return 0 + + def complete_ls(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + ls_parser = argparse.ArgumentParser( + description='Copy a file from Ceph File System from Local Directory.') + ls_parser.add_argument('-l', '--long', action='store_true', + help='Detailed list of items in the directory.') + ls_parser.add_argument('-r', '--reverse', action='store_true', + help='Reverse order of listing items in the directory.') + ls_parser.add_argument('-H', action='store_true', help='Human Readable') + ls_parser.add_argument('-a', '--all', action='store_true', + help='Do not Ignore entries starting with .') + ls_parser.add_argument('-S', action='store_true', help='Sort by file_size') + ls_parser.add_argument('paths', help='Name of Directories', + action=path_to_bytes, nargs='*', default=['.']) + + @with_argparser(ls_parser) + def do_ls(self, args): + """ + List all the files and directories in the current working directory + """ + paths = args.paths + for path in paths: + values = [] + items = [] + if path.count(b'*') > 0: + all_items = get_all_possible_paths(path) + if len(all_items) == 0: + continue + path = all_items[0].rsplit(b'/', 1)[0] + if path == b'': + path = b'/' + dirs = [] 
+ for i in all_items: + for item in ls(path): + d_name = item.d_name + if os.path.basename(i) == d_name: + if item.is_dir(): + dirs.append(os.path.join(path, d_name)) + else: + items.append(item) + if dirs: + paths.extend(dirs) + else: + self.poutput(path.decode('utf-8'), end=':\n') + items = sorted(items, key=lambda item: item.d_name) + else: + if path != b'' and path != cephfs.getcwd() and len(paths) > 1: + self.poutput(path.decode('utf-8'), end=':\n') + items = sorted(ls(path), + key=lambda item: item.d_name) + if not args.all: + items = [i for i in items if not i.d_name.startswith(b'.')] + + if args.S: + items = sorted(items, key=lambda item: cephfs.stat( + path + b'/' + item.d_name).st_size) + + if args.reverse: + items = reversed(items) + for item in items: + filepath = item.d_name + is_dir = item.is_dir() + + if args.long and args.H: + print_long(cephfs.getcwd() + + path + + b'/' + + filepath, + is_dir, True) + elif args.long: + print_long(cephfs.getcwd() + + path + + b'/' + + filepath, + is_dir, False) + elif is_dir: + values.append(colorama.Style.BRIGHT + + colorama.Fore.CYAN + + filepath.decode('utf-8') + + '/' + + colorama.Style.RESET_ALL) + else: + values.append(filepath) + if not args.long: + print_list(values, shutil.get_terminal_size().columns) + if path != paths[-1]: + self.poutput('') + + def complete_rmdir(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + rmdir_parser = argparse.ArgumentParser(description='Remove Directory.') + rmdir_parser.add_argument('paths', help='Directory Path.', nargs='+', + action=path_to_bytes) + rmdir_parser.add_argument('-p', '--parent', action='store_true', + help='Remove parent directories as necessary. 
\ +When this option is specified, no error is reported if a directory has any \ +sub-directories, files') + + @with_argparser(rmdir_parser) + def do_rmdir(self, args): + """ + Remove a specific Directory + """ + is_pattern = False + paths = args.paths + for path in paths: + if path.count(b'*') > 0: + is_pattern = True + all_items = get_all_possible_paths(path) + if len(all_items) > 0: + path = all_items[0].rsplit(b'/', 1)[0] + if path == b'': + path = b'/' + dirs = [] + for i in all_items: + for item in ls(path): + d_name = item.d_name + if os.path.basename(i) == d_name: + if item.is_dir(): + dirs.append(os.path.join(path, d_name)) + paths.extend(dirs) + continue + else: + is_pattern = False + path = os.path.normpath(os.path.join(cephfs.getcwd(), path)) + if args.parent: + files = reversed(sorted(set(dirwalk(path)))) + for filepath in files: + filepath = os.path.normpath(filepath) + if filepath[1:] != path: + try: + cephfs.rmdir(filepath) + except libcephfs.Error: + cephfs.unlink(filepath) + if not is_pattern and path != os.path.normpath(b''): + try: + cephfs.rmdir(path) + except libcephfs.Error: + self.perror('error: no such directory {} exists'.format( + path.decode('utf-8')), end='\n', + apply_style=True) + + def complete_rm(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + rm_parser = argparse.ArgumentParser(description='Remove File.') + rm_parser.add_argument('paths', help='File Path.', nargs='+', + action=path_to_bytes) + + @with_argparser(rm_parser) + def do_rm(self, args): + """ + Remove a specific file + """ + file_paths = args.paths + for path in file_paths: + if path.count(b'*') > 0: + file_paths.extend([i for i in get_all_possible_paths( + path) if is_file_exists(i)]) + else: + try: + cephfs.unlink(path) + except libcephfs.Error: + self.perror('{}: no such file'.format(path.decode('utf-8')), + end='\n', apply_style=True) + + def complete_mv(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + mv_parser = argparse.ArgumentParser(description='Move File.') + mv_parser.add_argument('src_path', type=str, action=path_to_bytes, + help='Source File Path.') + mv_parser.add_argument('dest_path', type=str, action=path_to_bytes, + help='Destination File Path.') + + @with_argparser(mv_parser) + def do_mv(self, args): + """ + Rename a file or Move a file from source path to the destination + """ + try: + cephfs.rename(args.src_path, args.dest_path) + except libcephfs.Error: + self.poutput("error: need a file name to move to") + + def complete_cd(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + cd_parser = argparse.ArgumentParser(description='Change working directory') + cd_parser.add_argument('path', type=str, help='Name of the directory.', + action=path_to_bytes, nargs='?', default='/') + + @with_argparser(cd_parser) + def do_cd(self, args): + """ + Change working directory + """ + try: + cephfs.chdir(args.path) + self.working_dir = cephfs.getcwd().decode('utf-8') + self.set_prompt() + except libcephfs.Error: + self.perror('{}: no such directory'.format(args.path.decode('utf-8')), + end='\n', apply_style=True) + + def do_cwd(self, arglist): + """ + Get current working directory. + """ + self.poutput(cephfs.getcwd().decode('utf-8')) + + def complete_chmod(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + chmod_parser = argparse.ArgumentParser(description='Create Directory.') + chmod_parser.add_argument('mode', type=str, action=ModeAction, help='Mode') + chmod_parser.add_argument('paths', type=str, action=path_to_bytes, + help='Name of the file', nargs='+') + + @with_argparser(chmod_parser) + def do_chmod(self, args): + """ + Change permission of a file + """ + for path in args.paths: + mode = int(args.mode, base=8) + try: + cephfs.chmod(path, mode) + except libcephfs.Error: + self.perror('{}: no such file or directory'.format( + path.decode('utf-8')), end='\n', apply_style=True) + + def complete_cat(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + cat_parser = argparse.ArgumentParser(description='') + cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes, + nargs='+') + + @with_argparser(cat_parser) + def do_cat(self, args): + """ + Print contents of a file + """ + for path in args.paths: + if is_file_exists(path): + copy_to_local(path, b'-') + else: + self.perror('{}: no such file'.format(path.decode('utf-8')), + end='\n', apply_style=True) + + umask_parser = argparse.ArgumentParser(description='Set umask value.') + umask_parser.add_argument('mode', help='Mode', type=str, action=ModeAction, + nargs='?', default='') + + @with_argparser(umask_parser) + def do_umask(self, args): + """ + Set Umask value. + """ + if args.mode == '': + self.poutput(self.umask.zfill(4)) + else: + mode = int(args.mode, 8) + self.umask = str(oct(cephfs.umask(mode))[2:]) + + def complete_write(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + write_parser = argparse.ArgumentParser(description='Writes data into a file') + write_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of File') + + @with_argparser(write_parser) + def do_write(self, args): + """ + Write data into a file. + """ + + copy_from_local(b'-', args.path) + + def complete_lcd(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + lcd_parser = argparse.ArgumentParser(description='') + lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path') + + @with_argparser(lcd_parser) + def do_lcd(self, args): + """ + Moves into the given local directory + """ + try: + os.chdir(os.path.expanduser(args.path)) + except OSError as e: + self.perror("Cannot change to {}: {}".format(e.filename, + e.strerror), False) + + def complete_lls(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + lls_parser = argparse.ArgumentParser( + description='List files in local system.') + lls_parser.add_argument('paths', help='Paths', action=path_to_bytes, + nargs='*') + + @with_argparser(lls_parser) + def do_lls(self, args): + """ + Lists all files and folders in the current local directory + """ + if not args.paths: + print_list(os.listdir(os.getcwdb())) + else: + for path in args.paths: + try: + items = os.listdir(path) + self.poutput("{}:".format(path.decode('utf-8'))) + print_list(items) + except OSError as e: + self.perror("'{}': {}".format(e.filename, e.strerror), False) + # Arguments to the with_argpaser decorator function are sticky. + # The items in args.path do not get overwritten in subsequent calls. + # The arguments remain in args.paths after the function exits and we + # neeed to clean it up to ensure the next call works as expected. 
+ args.paths.clear() + + def do_lpwd(self, arglist): + """ + Prints the absolute path of the current local directory + """ + self.poutput(os.getcwd()) + + def do_df(self, arglist): + """ + Display the amount of available disk space for file systems + """ + for index, i in enumerate(ls(b".", opts='A')): + if index == 0: + self.poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format( + "1K-blocks", "Used", "Available", "Use%", "Stored on")) + if not is_dir_exists(i.d_name): + statfs = cephfs.statfs(i.d_name) + stat = cephfs.stat(i.d_name) + block_size = statfs['f_blocks']*statfs['f_bsize'] // 1024 + available = block_size - stat.st_size + use = 0 + if block_size > 0: + use = (stat.st_size*100 // block_size) + self.poutput('{:25d}\t{:5d}\t{:10d}\t{:5s} {}'.format( + statfs['f_fsid'], stat.st_size, available, + str(int(use)) + '%', i.d_name.decode('utf-8'))) + + locate_parser = argparse.ArgumentParser( + description='Find file within file system') + locate_parser.add_argument('name', help='name', type=str, + action=path_to_bytes) + locate_parser.add_argument('-c', '--count', action='store_true', + help='Count list of items located.') + locate_parser.add_argument( + '-i', '--ignorecase', action='store_true', help='Ignore case') + + @with_argparser(locate_parser) + def do_locate(self, args): + """ + Find a file within the File System + """ + if args.name.count(b'*') == 1: + if args.name[0] == b'*': + args.name += b'/' + elif args.name[-1] == '*': + args.name = b'/' + args.name + args.name = args.name.replace(b'*', b'') + if args.ignorecase: + locations = locate_file(args.name, False) + else: + locations = locate_file(args.name) + if args.count: + self.poutput(len(locations)) + else: + self.poutput((b'\n'.join(locations)).decode('utf-8')) + + def complete_du(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + du_parser = argparse.ArgumentParser( + description='Disk Usage of a Directory') + du_parser.add_argument('dirs', type=str, action=path_to_bytes, + help='Name of the directory.', nargs='?', + default='.') + du_parser.add_argument('-r', action='store_true', + help='Recursive Disk usage of all directories.') + + @with_argparser(du_parser) + def do_du(self, args): + """ + Disk Usage of a Directory + """ + if args.dirs == b'': + args.dirs = cephfs.getcwd() + for dir_ in args.dirs: + if args.r: + for i in reversed(sorted(set(dirwalk(dir_)))): + i = os.path.normpath(i) + try: + xattr = cephfs.getxattr(i, 'ceph.dir.rbytes') + self.poutput('{:10s} {}'.format( + humansize(int(xattr.decode('utf-8'))), '.' + + i.decode('utf-8'))) + except libcephfs.Error: + continue + else: + dir_ = os.path.normpath(dir_) + self.poutput('{:10s} {}'.format(humansize(int(cephfs.getxattr( + dir_, 'ceph.dir.rbytes').decode('utf-8'))), '.' + + dir_.decode('utf-8'))) + + quota_parser = argparse.ArgumentParser( + description='Quota management for a Directory') + quota_parser.add_argument('op', choices=['get', 'set'], + help='Quota operation type.') + quota_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of the directory.') + quota_parser.add_argument('--max_bytes', type=int, default=-1, nargs='?', + help='Max cumulative size of the data under ' + 'this directory.') + quota_parser.add_argument('--max_files', type=int, default=-1, nargs='?', + help='Total number of files under this ' + 'directory tree.') + + @with_argparser(quota_parser) + def do_quota(self, args): + """ + Quota management. 
+ """ + if not is_dir_exists(args.path): + self.perror('error: no such directory {}'.format(args.path.decode('utf-8')), + end='\n', apply_style=True) + return + + if args.op == 'set': + if (args.max_bytes == -1) and (args.max_files == -1): + self.poutput('please specify either --max_bytes or ' + '--max_files or both') + return + + if args.max_bytes >= 0: + max_bytes = to_bytes(str(args.max_bytes)) + try: + cephfs.setxattr(args.path, 'ceph.quota.max_bytes', + max_bytes, len(max_bytes), + os.XATTR_CREATE) + self.poutput('max_bytes set to %d' % args.max_bytes) + except libcephfs.Error: + cephfs.setxattr(args.path, 'ceph.quota.max_bytes', + max_bytes, len(max_bytes), + os.XATTR_REPLACE) + self.poutput('max_bytes reset to %d' % args.max_bytes) + + if args.max_files >= 0: + max_files = to_bytes(str(args.max_files)) + try: + cephfs.setxattr(args.path, 'ceph.quota.max_files', + max_files, len(max_files), + os.XATTR_CREATE) + self.poutput('max_files set to %d' % args.max_files) + except libcephfs.Error: + cephfs.setxattr(args.path, 'ceph.quota.max_files', + max_files, len(max_files), + os.XATTR_REPLACE) + self.poutput('max_files reset to %d' % args.max_files) + elif args.op == 'get': + max_bytes = '0' + max_files = '0' + try: + max_bytes = cephfs.getxattr(args.path, + 'ceph.quota.max_bytes') + self.poutput('max_bytes: %s' % max_bytes) + except libcephfs.Error: + self.poutput('max_bytes is not set') + pass + + try: + max_files = cephfs.getxattr(args.path, + 'ceph.quota.max_files') + self.poutput('max_files: %s' % max_files) + except libcephfs.Error: + self.poutput('max_files is not set') + pass + + def do_help(self, line): + """ + Get details about a command. 
+ Usage: help <cmd> - for a specific command + help all - for all the commands + """ + if line == 'all': + for k in dir(self): + if k.startswith('do_'): + self.poutput('-'*80) + super().do_help(k[3:]) + return + parser = self.create_argparser(line) + if parser: + parser.print_help() + else: + super().do_help(line) + + def complete_stat(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + stat_parser = argparse.ArgumentParser( + description='Display file or file system status') + stat_parser.add_argument('paths', type=str, help='file paths', + action=path_to_bytes, nargs='+') + + @with_argparser(stat_parser) + def do_stat(self, args): + """ + Display file or file system status + """ + for path in args.paths: + try: + stat = cephfs.stat(path) + atime = stat.st_atime.isoformat(' ') + mtime = stat.st_mtime.isoformat(' ') + ctime = stat.st_mtime.isoformat(' ') + + self.poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n\ +Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: {:o}/{}\tUid: {:d}\tGid: {:d}\n\ +Access: {}\nModify: {}\nChange: {}".format(path.decode('utf-8'), stat.st_size, + stat.st_blocks, stat.st_blksize, stat.st_dev, + stat.st_ino, stat.st_nlink, stat.st_mode, + mode_notation(stat.st_mode), stat.st_uid, + stat.st_gid, atime, mtime, ctime)) + except libcephfs.Error: + self.perror('{}: no such file or directory'.format(path.decode('utf-8')), + end='\n', apply_style=True) + + +if __name__ == '__main__': + config_file = '' + exe = sys.argv[0] + main_parser = argparse.ArgumentParser(description='') + main_parser.add_argument('-c', '--config', action='store', + help='Configuration file_path', type=str) + main_parser.add_argument( + '-b', '--batch', action='store', help='Batch File path.', type=str) + main_parser.add_argument('-t', '--test', action='store', + help='Test against transcript(s) in FILE', + nargs='+') + main_parser.add_argument('commands', nargs='*', + 
help='comma delimited commands', default=[]) + args = main_parser.parse_args() + if args.config: + config_file = args.config + if args.batch: + args.commands = ['load ' + args.batch, ',quit'] + if args.test: + args.commands.extend(['-t,'] + [arg+',' for arg in args.test]) + sys.argv.clear() + sys.argv.append(exe) + sys.argv.extend([i.strip() for i in ' '.join(args.commands).split(',')]) + setup_cephfs(config_file) + shell = CephFSShell() + shell.cmdloop() diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc new file mode 100644 index 00000000..47b475dd --- /dev/null +++ b/src/tools/cephfs/cephfs-table-tool.cc @@ -0,0 +1,47 @@ + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "TableTool.h" + + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + TableTool::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + TableTool tt; + + // Connect to mon cluster, download MDS map etc + int rc = tt.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = tt.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + return rc; +} + + diff --git a/src/tools/cephfs/setup.py b/src/tools/cephfs/setup.py new file mode 100644 index 00000000..8cf7f28f --- /dev/null +++ b/src/tools/cephfs/setup.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +from setuptools import setup + +__version__ = '0.0.1' + +setup( + name='cephfs-shell', + version=__version__, + description='Interactive shell for 
Ceph file system', + keywords='cephfs, shell', + scripts=['cephfs-shell'], + install_requires=[ + 'cephfs', + 'cmd2', + 'colorama', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python :: 3' + ], + license='LGPLv2+', +) diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc new file mode 100644 index 00000000..07b8b79a --- /dev/null +++ b/src/tools/crushtool.cc @@ -0,0 +1,1304 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> + +#include <fstream> +#include <type_traits> + +#include "common/debug.h" +#include "common/errno.h" +#include "common/config.h" +#include "common/Formatter.h" + +#include "common/ceph_argparse.h" +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "osd/OSDMap.h" +#include "crush/CrushWrapper.h" +#include "crush/CrushCompiler.h" +#include "crush/CrushTester.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_crush + + +const char *infn = "stdin"; + +static int get_fd_data(int fd, bufferlist &bl) +{ + + uint64_t total = 0; + do { + ssize_t bytes = bl.read_fd(fd, 1024*1024); + if (bytes < 0) { + cerr << "read_fd error " << cpp_strerror(-bytes) << "\n"; + return -1; + } + + if (bytes == 0) + break; + + total += bytes; + } while(true); + + ceph_assert(bl.length() == total); + return 0; +} + +//////////////////////////////////////////////////////////////////////////// + +void data_analysis_usage() +{ +cout << "data output from testing routine ...\n"; +cout << " absolute_weights\n"; +cout << " the decimal weight of each OSD\n"; +cout << " data layout: ROW MAJOR\n"; +cout << " OSD id (int), weight (int)\n"; +cout << " batch_device_expected_utilization_all\n"; +cout << " the expected number of objects each OSD should receive per placement batch\n"; +cout << " which may be a decimal value\n"; +cout << " data layout: COLUMN MAJOR\n"; +cout << " round (int), objects expected on OSD 0...OSD n (float)\n"; +cout << " batch_device_utilization_all\n"; +cout << " the number of objects stored on each OSD during each placement round\n"; +cout << " data layout: COLUMN MAJOR\n"; +cout << " round (int), objects stored on OSD 0...OSD n (int)\n"; +cout << " device_utilization_all\n"; +cout << " the number of objects stored on each OSD at the end of placements\n"; +cout << " 
data_layout: ROW MAJOR\n"; +cout << " OSD id (int), objects stored (int), objects expected (float)\n"; +cout << " device_utilization\n"; +cout << " the number of objects stored on each OSD marked 'up' at the end of placements\n"; +cout << " data_layout: ROW MAJOR\n"; +cout << " OSD id (int), objects stored (int), objects expected (float)\n"; +cout << " placement_information\n"; +cout << " the map of input -> OSD\n"; +cout << " data_layout: ROW MAJOR\n"; +cout << " input (int), OSD's mapped (int)\n"; +cout << " proportional_weights_all\n"; +cout << " the proportional weight of each OSD specified in the CRUSH map\n"; +cout << " data_layout: ROW MAJOR\n"; +cout << " OSD id (int), proportional weight (float)\n"; +cout << " proportional_weights\n"; +cout << " the proportional weight of each 'up' OSD specified in the CRUSH map\n"; +cout << " data_layout: ROW MAJOR\n"; +cout << " OSD id (int), proportional weight (float)\n"; +} + +void usage() +{ + cout << "usage: crushtool ...\n"; + cout << "\n"; + cout << "Display, modify and test a crush map\n"; + cout << "\n"; + cout << "There are five stages, running one after the other:\n"; + cout << "\n"; + cout << " - input/build\n"; + cout << " - tunables adjustments\n"; + cout << " - modifications\n"; + cout << " - display/test\n"; + cout << " - output\n"; + cout << "\n"; + cout << "Options that are not specific to a stage.\n"; + cout << "\n"; + cout << " [--infn|-i infile]\n"; + cout << " read the crush map from infile\n"; + cout << "\n"; + cout << "Options for the input/build stage\n"; + cout << "\n"; + cout << " --decompile|-d map decompile a crush map to source\n"; + cout << " [--outfn|-o outfile]\n"; + cout << " specify output for (de)compilation\n"; + cout << " --compile|-c map.txt compile a map from source\n"; + cout << " --enable-unsafe-tunables\n"; + cout << " compile with unsafe tunables\n"; + cout << " --build --num_osds N layer1 ...\n"; + cout << " build a new map, where each 'layer' is\n"; + cout << " 'name 
(uniform|straw2|straw|list|tree) size'\n"; + cout << "\n"; + cout << "Options for the tunables adjustments stage\n"; + cout << "\n"; + cout << " --set-choose-local-tries N\n"; + cout << " set choose local retries before re-descent\n"; + cout << " --set-choose-local-fallback-tries N\n"; + cout << " set choose local retries using fallback\n"; + cout << " permutation before re-descent\n"; + cout << " --set-choose-total-tries N\n"; + cout << " set choose total descent attempts\n"; + cout << " --set-chooseleaf-descend-once <0|1>\n"; + cout << " set chooseleaf to (not) retry the recursive descent\n"; + cout << " --set-chooseleaf-vary-r <0|1>\n"; + cout << " set chooseleaf to (not) vary r based on parent\n"; + cout << " --set-chooseleaf-stable <0|1>\n"; + cout << " set chooseleaf firstn to (not) return stable results\n"; + cout << "\n"; + cout << "Options for the modifications stage\n"; + cout << "\n"; + cout << " -i mapfn --add-item id weight name [--loc type name ...]\n"; + cout << " insert an item into the hierarchy at the\n"; + cout << " given location\n"; + cout << " -i mapfn --update-item id weight name [--loc type name ...]\n"; + cout << " insert or move an item into the hierarchy at the\n"; + cout << " given location\n"; + cout << " -i mapfn --remove-item name\n" + << " remove the given item\n"; + cout << " -i mapfn --reweight-item name weight\n"; + cout << " reweight a given item (and adjust ancestor\n" + << " weights as needed)\n"; + cout << " -i mapfn --add-bucket name type [--loc type name ...]\n" + << " insert a bucket into the hierarchy at the given\n" + << " location\n"; + cout << " -i mapfn --move name --loc type name ...\n" + << " move the given item to specified location\n"; + cout << " -i mapfn --reweight recalculate all bucket weights\n"; + cout << " -i mapfn --rebuild-class-roots\n"; + cout << " rebuild the per-class shadow trees (normally a no-op)\n"; + cout << " -i mapfn --create-simple-rule name root type mode\n" + << " create crush rule <name> to 
start from <root>,\n" + << " replicate across buckets of type <type>, using\n" + << " a choose mode of <firstn|indep>\n"; + cout << " -i mapfn --create-replicated-rule name root type\n" + << " create crush rule <name> to start from <root>,\n" + << " replicate across buckets of type <type>\n"; + cout << " --device-class <class>\n"; + cout << " use device class <class> for new rule\n"; + cout << " -i mapfn --remove-rule name\n" + << " remove the specified crush rule\n"; + cout << "\n"; + cout << "Options for the display/test stage\n"; + cout << "\n"; + cout << " -f --format the format of --dump, defaults to json-pretty\n"; + cout << " can be one of json, json-pretty, xml, xml-pretty,\n"; + cout << " table, table-kv, html, html-pretty\n"; + cout << " --dump dump the crush map\n"; + cout << " --tree print map summary as a tree\n"; + cout << " --check [max_id] check if any item is referencing an unknown name/type\n"; + cout << " -i mapfn --show-location id\n"; + cout << " show location for given device id\n"; + cout << " -i mapfn --test test a range of inputs on the map\n"; + cout << " [--min-x x] [--max-x x] [--x x]\n"; + cout << " [--min-rule r] [--max-rule r] [--rule r] [--ruleset rs]\n"; + cout << " [--num-rep n]\n"; + cout << " [--pool-id n] specifies pool id\n"; + cout << " [--batches b] split the CRUSH mapping into b > 1 rounds\n"; + cout << " [--weight|-w devno weight]\n"; + cout << " where weight is 0 to 1.0\n"; + cout << " [--simulate] simulate placements using a random\n"; + cout << " number generator in place of the CRUSH\n"; + cout << " algorithm\n"; + cout << " --show-utilization show OSD usage\n"; + cout << " --show-utilization-all\n"; + cout << " include zero weight items\n"; + cout << " --show-statistics show chi squared statistics\n"; + cout << " --show-mappings show mappings\n"; + cout << " --show-bad-mappings show bad mappings\n"; + cout << " --show-choose-tries show choose tries histogram\n"; + cout << " --output-name name\n"; + cout << " prepend 
the data file(s) generated during the\n"; + cout << " testing routine with name\n"; + cout << " --output-csv\n"; + cout << " export select data generated during testing routine\n"; + cout << " to CSV files for off-line post-processing\n"; + cout << " use --help-output for more information\n"; + cout << " --reclassify transform legacy CRUSH map buckets and rules\n"; + cout << " by adding classes\n"; + cout << " --reclassify-bucket <bucket-match> <class> <default-parent>\n"; + cout << " --reclassify-root <bucket-name> <class>\n"; + cout << " --set-subtree-class <bucket-name> <class>\n"; + cout << " set class for all items beneath bucket-name\n"; + cout << " --compare <otherfile> compare two maps using --test parameters\n"; + cout << "\n"; + cout << "Options for the output stage\n"; + cout << "\n"; + cout << " [--outfn|-o outfile]\n"; + cout << " specify output for modified crush map\n"; + cout << "\n"; +} + +struct bucket_types_t { + const char *name; + int type; +} bucket_types[] = { + { "uniform", CRUSH_BUCKET_UNIFORM }, + { "list", CRUSH_BUCKET_LIST }, + { "straw", CRUSH_BUCKET_STRAW }, + { "straw2", CRUSH_BUCKET_STRAW2 }, + { "tree", CRUSH_BUCKET_TREE }, + { 0, 0 }, +}; + +struct layer_t { + const char *name; + const char *buckettype; + int size; +}; + +template<typename... Args> +bool argparse_withargs(std::vector<const char*> &args, + std::vector<const char*>::iterator& i, + std::ostream& oss, + const char* opt, + Args*... 
opts) +{ + if (!ceph_argparse_flag(args, i, opt, nullptr)) { + return false; + } + auto parse = [&](auto& opt) { + if (i == args.end()) { + oss << "expecting additional argument to " << opt; + return false; + } + using opt_t = std::remove_pointer_t<decay_t<decltype(opt)>>; + string err; + if constexpr (std::is_same_v<opt_t, string>) { + opt->assign(*i); + } else if constexpr (is_same_v<opt_t, int>) { + *opt = strict_strtol(*i, 10, &err); + } else if constexpr (is_same_v<opt_t, float>) { + *opt = strict_strtof(*i, &err); + } + i = args.erase(i); + if (err.empty()) + return true; + else { + oss << err; + return false; + } + }; + (... && parse(opts)); + return true; +} + +int do_add_bucket(CephContext* cct, + const char* me, + CrushWrapper& crush, + const string& add_name, + const string& add_type, + const map<string,string>& add_loc) { + int bucketno; + if (crush.name_exists(add_name)) { + cerr << me << " bucket '" << add_name << "' already exists" << std::endl; + return -EEXIST; + } + int type = crush.get_type_id(add_type); + if (type <= 0) { + cerr << me << " bad bucket type: " << add_type << std::endl; + return -EINVAL; + } + if (int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, type, 0, nullptr, nullptr, &bucketno); + r < 0) { + cerr << me << " unable to add bucket: " << cpp_strerror(r) << std::endl; + return r; + } + if (int r = crush.set_item_name(bucketno, add_name); r < 0) { + cerr << me << " bad bucket name: " << add_name << std::endl; + return r; + } + if (!add_loc.empty()) { + if (!crush.check_item_loc(cct, bucketno, add_loc, (int*)nullptr)) { + if (int r = crush.move_bucket(cct, bucketno, add_loc); r < 0) { + cerr << me << " error moving bucket '" << add_name << "' to " << add_loc << std::endl; + return r; + } + } + } + return 0; +} + +// return 1 for no change, 0 for successful change, negative on error +int do_move_item(CephContext* cct, + const char *me, + CrushWrapper& crush, + const string& name, + const map<string,string>& loc) +{ + if 
(!crush.name_exists(name)) { + cerr << me << " item '" << name << "' does not exist" << std::endl; + return -ENOENT; + } + int id = crush.get_item_id(name); + if (loc.empty()) { + cerr << me << " expecting additional --loc argument to --move" << std::endl; + return -EINVAL; + } + if (crush.check_item_loc(cct, id, loc, (int*)nullptr)) { + // it's already there + cerr << me << " item '" << name << "' already at " << loc << std::endl; + return 1; + } + if (id >= 0) { + switch (int r = crush.create_or_move_item(cct, id, 0, name, loc)) { + case 0: + return 1; + case 1: + return 0; + default: + return r; + } + } else { + return crush.move_bucket(cct, id, loc); + } +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + const char *me = argv[0]; + std::string infn, srcfn, outfn, add_name, add_type, remove_name, reweight_name; + std::string move_name; + bool compile = false; + bool decompile = false; + bool check = false; + int max_id = -1; + bool test = false; + bool display = false; + bool tree = false; + string dump_format = "json-pretty"; + bool dump = false; + int full_location = -1; + bool write_to_file = false; + int verbose = 0; + bool unsafe_tunables = false; + + bool rebuild_class_roots = false; + + bool reweight = false; + int add_item = -1; + bool add_bucket = false; + bool update_item = false; + bool move_item = false; + bool add_rule = false; + std::string rule_name, rule_root, rule_type, rule_mode, rule_device_class; + bool del_rule = false; + float add_weight = 0; + map<string,string> add_loc; + float reweight_weight = 0; + + bool adjust = false; + + int build = 0; + int num_osds =0; + vector<layer_t> layers; + + int choose_local_tries = -1; + int choose_local_fallback_tries = -1; + int choose_total_tries = -1; + int chooseleaf_descend_once 
= -1; + int chooseleaf_vary_r = -1; + int chooseleaf_stable = -1; + int straw_calc_version = -1; + int allowed_bucket_algs = -1; + + bool reclassify = false; + map<string,pair<string,string>> reclassify_bucket; // %suffix or prefix% -> class, default_root + map<string,string> reclassify_root; // bucket -> class + map<string,string> set_subtree_class; // bucket -> class + + string compare; + + CrushWrapper crush; + + CrushTester tester(crush, cout); + + // we use -c, don't confuse the generic arg parsing + // only parse arguments from CEPH_ARGS, if in the environment + vector<const char *> empty_args; + auto cct = global_init(NULL, empty_args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + // crushtool times out occasionally when quits. so do not + // release the g_ceph_context. + cct->get(); + common_init_finish(g_ceph_context); + + int x; + float y; + long long z; + + std::string val; + std::ostringstream err; + int tmp; + for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "-d", "--decompile", (char*)NULL)) { + infn = val; + decompile = true; + } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infn", (char*)NULL)) { + infn = val; + } else if (ceph_argparse_witharg(args, i, &val, "-o", "--outfn", (char*)NULL)) { + outfn = val; + } else if (ceph_argparse_flag(args, i, "-v", "--verbose", (char*)NULL)) { + verbose += 1; + } else if (ceph_argparse_witharg(args, i, &val, "--compare", (char*)NULL)) { + compare = val; + } else if (ceph_argparse_flag(args, i, "--reclassify", (char*)NULL)) { + reclassify = true; + } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-bucket", + (char*)NULL)) { + if (i == args.end()) { + cerr << "expecting additional argument" << std::endl; + return EXIT_FAILURE; + } + string c = *i; + i = args.erase(i); + if (i == args.end()) { + cerr << "expecting 
additional argument" << std::endl; + return EXIT_FAILURE; + } + reclassify_bucket[val] = make_pair(c, *i); + i = args.erase(i); + } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-root", + (char*)NULL)) { + if (i == args.end()) { + cerr << "expecting additional argument" << std::endl; + return EXIT_FAILURE; + } + reclassify_root[val] = *i; + i = args.erase(i); + } else if (ceph_argparse_witharg(args, i, &val, "--set-subtree-class", + (char*)NULL)) { + if (i == args.end()) { + cerr << "expecting additional argument" << std::endl; + return EXIT_FAILURE; + } + set_subtree_class[val] = *i; + i = args.erase(i); + } else if (ceph_argparse_flag(args, i, "--tree", (char*)NULL)) { + tree = true; + } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) { + dump_format = val; + } else if (ceph_argparse_flag(args, i, "--dump", (char*)NULL)) { + dump = true; + } else if (ceph_argparse_flag(args, i, "--show_utilization", (char*)NULL)) { + display = true; + tester.set_output_utilization(true); + } else if (ceph_argparse_flag(args, i, "--show_utilization_all", (char*)NULL)) { + display = true; + tester.set_output_utilization_all(true); + } else if (ceph_argparse_flag(args, i, "--show_statistics", (char*)NULL)) { + display = true; + tester.set_output_statistics(true); + } else if (ceph_argparse_flag(args, i, "--show_mappings", (char*)NULL)) { + display = true; + tester.set_output_mappings(true); + } else if (ceph_argparse_flag(args, i, "--show_bad_mappings", (char*)NULL)) { + display = true; + tester.set_output_bad_mappings(true); + } else if (ceph_argparse_flag(args, i, "--show_choose_tries", (char*)NULL)) { + display = true; + tester.set_output_choose_tries(true); + } else if (ceph_argparse_witharg(args, i, &val, "-c", "--compile", (char*)NULL)) { + srcfn = val; + compile = true; + } else if (ceph_argparse_witharg(args, i, &max_id, err, "--check", (char*)NULL)) { + check = true; + } else if (ceph_argparse_flag(args, i, "-t", "--test", 
(char*)NULL)) { + test = true; + } else if (ceph_argparse_witharg(args, i, &full_location, err, "--show-location", (char*)NULL)) { + } else if (ceph_argparse_flag(args, i, "-s", "--simulate", (char*)NULL)) { + tester.set_random_placement(); + } else if (ceph_argparse_flag(args, i, "--enable-unsafe-tunables", (char*)NULL)) { + unsafe_tunables = true; + } else if (ceph_argparse_witharg(args, i, &choose_local_tries, err, + "--set_choose_local_tries", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &choose_local_fallback_tries, err, + "--set_choose_local_fallback_tries", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &choose_total_tries, err, + "--set_choose_total_tries", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &chooseleaf_descend_once, err, + "--set_chooseleaf_descend_once", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &chooseleaf_vary_r, err, + "--set_chooseleaf_vary_r", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &chooseleaf_stable, err, + "--set_chooseleaf_stable", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &straw_calc_version, err, + "--set_straw_calc_version", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_witharg(args, i, &allowed_bucket_algs, err, + "--set_allowed_bucket_algs", (char*)NULL)) { + adjust = true; + } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) { + reweight = true; + } else if (ceph_argparse_flag(args, i, "--rebuild-class-roots", (char*)NULL)) { + rebuild_class_roots = true; + } else if (ceph_argparse_witharg(args, i, &add_item, err, "--add_item", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + if (i == args.end()) { + cerr << "expecting additional argument to --add-item" << std::endl; + return EXIT_FAILURE; + } + add_weight = atof(*i); + i = args.erase(i); + if (i 
== args.end()) { + cerr << "expecting additional argument to --add-item" << std::endl; + return EXIT_FAILURE; + } + add_name.assign(*i); + i = args.erase(i); + } else if (ceph_argparse_witharg(args, i, &add_item, err, "--update_item", (char*)NULL)) { + update_item = true; + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + if (i == args.end()) { + cerr << "expecting additional argument to --update-item" << std::endl; + return EXIT_FAILURE; + } + add_weight = atof(*i); + i = args.erase(i); + if (i == args.end()) { + cerr << "expecting additional argument to --update-item" << std::endl; + return EXIT_FAILURE; + } + add_name.assign(*i); + i = args.erase(i); + } else if (argparse_withargs(args, i, err, "--add-bucket", + &add_name, &add_type)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + add_bucket = true; + } else if (argparse_withargs(args, i, err, "--move", + &move_name)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + move_item = true; + } else if (ceph_argparse_witharg(args, i, &val, err, "--create-simple-rule", (char*)NULL)) { + rule_name.assign(val); + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + if (i == args.end()) { + cerr << "expecting additional argument to --create-simple-rule" << std::endl; + return EXIT_FAILURE; + } + + rule_root.assign(*i); + i = args.erase(i); + if (i == args.end()) { + cerr << "expecting additional argument to --create-simple-rule" << std::endl; + return EXIT_FAILURE; + } + + rule_type.assign(*i); + i = args.erase(i); + if (i == args.end()) { + cerr << "expecting additional argument to --create-simple-rule" << std::endl; + return EXIT_FAILURE; + } + + rule_mode.assign(*i); + i = args.erase(i); + + cout << "--create-simple-rule:" + << " name=" << rule_name + << " root=" << rule_root + << " type=" << rule_type + << " mode=" << rule_mode + << std::endl; + add_rule = 
true; + } else if (ceph_argparse_witharg(args, i, &val, err, "--create-replicated-rule", (char*)NULL)) { + rule_name.assign(val); + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + if (i == args.end()) { + cerr << "expecting additional argument to --create-replicated-rule" << std::endl; + return EXIT_FAILURE; + } + + rule_root.assign(*i); + i = args.erase(i); + if (i == args.end()) { + cerr << "expecting additional argument to --create-replicated-rule" << std::endl; + return EXIT_FAILURE; + } + + rule_type.assign(*i); + i = args.erase(i); + rule_mode = "firstn"; + + cout << "--create-replicated-rule:" + << " name=" << rule_name + << " root=" << rule_root + << " type=" << rule_type + << std::endl; + add_rule = true; + + } else if (ceph_argparse_witharg(args, i, &val, "--device-class", (char*)NULL)) { + rule_device_class.assign(val); + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + } else if (ceph_argparse_witharg(args, i, &val, "--remove-rule", (char*)NULL)) { + rule_name.assign(val); + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + del_rule = true; + } else if (ceph_argparse_witharg(args, i, &val, "--loc", (char*)NULL)) { + std::string type(val); + if (i == args.end()) { + cerr << "expecting additional argument to --loc" << std::endl; + return EXIT_FAILURE; + } + std::string name(*i); + i = args.erase(i); + add_loc[type] = name; + } else if (ceph_argparse_flag(args, i, "--output-csv", (char*)NULL)) { + write_to_file = true; + tester.set_output_data_file(true); + tester.set_output_csv(true); + } else if (ceph_argparse_flag(args, i, "--help-output", (char*)NULL)) { + data_analysis_usage(); + return EXIT_SUCCESS; + } else if (ceph_argparse_witharg(args, i, &val, "--output-name", (char*)NULL)) { + std::string name(val); + if (i == args.end()) { + cerr << "expecting additional argument to --output-name" << std::endl; + return EXIT_FAILURE; + } + 
else { + tester.set_output_data_file_name(name + "-"); + } + } else if (ceph_argparse_witharg(args, i, &val, "--remove_item", (char*)NULL)) { + remove_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--reweight_item", (char*)NULL)) { + reweight_name = val; + if (i == args.end()) { + cerr << "expecting additional argument to --reweight-item" << std::endl; + return EXIT_FAILURE; + } + reweight_weight = atof(*i); + i = args.erase(i); + } else if (ceph_argparse_flag(args, i, "--build", (char*)NULL)) { + build = true; + } else if (ceph_argparse_witharg(args, i, &num_osds, err, "--num_osds", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + } else if (ceph_argparse_witharg(args, i, &x, err, "--num_rep", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_num_rep(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--max_x", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_max_x(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--min_x", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_min_x(x); + } else if (ceph_argparse_witharg(args, i, &z, err, "--pool_id", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_pool_id(z); + } else if (ceph_argparse_witharg(args, i, &x, err, "--x", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_x(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--max_rule", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_max_rule(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--min_rule", (char*)NULL)) { + if (!err.str().empty()) { + cerr << 
err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_min_rule(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--rule", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_rule(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--ruleset", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_ruleset(x); + } else if (ceph_argparse_witharg(args, i, &x, err, "--batches", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_batches(x); + } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-ratio", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_device_down_ratio(y); + } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-bucket-ratio", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + tester.set_bucket_down_ratio(y); + } else if (ceph_argparse_witharg(args, i, &tmp, err, "--weight", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + int dev = tmp; + if (i == args.end()) { + cerr << "expecting additional argument to --weight" << std::endl; + return EXIT_FAILURE; + } + float f = atof(*i); + i = args.erase(i); + tester.set_device_weight(dev, f); + } + else { + ++i; + } + } + + if (test && !check && !display && !write_to_file && compare.empty()) { + cerr << "WARNING: no output selected; use --output-csv or --show-X" << std::endl; + } + + if (decompile + compile + build > 1) { + cerr << "cannot specify more than one of compile, decompile, and build" << std::endl; + return EXIT_FAILURE; + } + if (!check && !compile && !decompile && !build && !test && !reweight && !adjust && !tree && !dump && + add_item < 0 && !add_bucket && !move_item && !add_rule && 
!del_rule && full_location < 0 && + !reclassify && !rebuild_class_roots && + compare.empty() && + remove_name.empty() && reweight_name.empty()) { + cerr << "no action specified; -h for help" << std::endl; + return EXIT_FAILURE; + } + if ((!build) && (!args.empty())) { + cerr << "unrecognized arguments: " << args << std::endl; + return EXIT_FAILURE; + } + else { + if ((args.size() % 3) != 0U) { + cerr << "remaining args: " << args << std::endl; + cerr << "layers must be specified with 3-tuples of (name, buckettype, size)" + << std::endl; + return EXIT_FAILURE; + } + for (size_t j = 0; j < args.size(); j += 3) { + layer_t l; + l.name = args[j]; + l.buckettype = args[j+1]; + l.size = atoi(args[j+2]); + layers.push_back(l); + } + } + + /* + if (outfn) cout << "outfn " << outfn << std::endl; + if (cinfn) cout << "cinfn " << cinfn << std::endl; + if (dinfn) cout << "dinfn " << dinfn << std::endl; + */ + + bool modified = false; + + // input ---- + + if (!infn.empty()) { + bufferlist bl; + std::string error; + + int r = 0; + if (infn == "-") { + if (isatty(STDIN_FILENO)) { + cerr << "stdin must not be from a tty" << std::endl; + return EXIT_FAILURE; + } + r = get_fd_data(STDIN_FILENO, bl); + if (r < 0) { + cerr << "error reading data from STDIN" << std::endl; + return EXIT_FAILURE; + } + } else { + r = bl.read_file(infn.c_str(), &error); + if (r < 0) { + cerr << me << ": error reading '" << infn << "': " + << error << std::endl; + return EXIT_FAILURE; + } + } + auto p = bl.cbegin(); + try { + crush.decode(p); + } catch(...) 
{ + cerr << me << ": unable to decode " << infn << std::endl; + return EXIT_FAILURE; + } + } + + if (compile) { + crush.create(); + + // read the file + ifstream in(srcfn.c_str()); + if (!in.is_open()) { + cerr << "input file " << srcfn << " not found" << std::endl; + return -ENOENT; + } + + CrushCompiler cc(crush, cerr, verbose); + if (unsafe_tunables) + cc.enable_unsafe_tunables(); + int r = cc.compile(in, srcfn.c_str()); + if (r < 0) + return EXIT_FAILURE; + + modified = true; + } + + if (build) { + if (layers.empty()) { + cerr << me << ": must specify at least one layer" << std::endl; + return EXIT_FAILURE; + } + + crush.create(); + + vector<int> lower_items; + vector<int> lower_weights; + + crush.set_max_devices(num_osds); + for (int i=0; i<num_osds; i++) { + lower_items.push_back(i); + lower_weights.push_back(0x10000); + crush.set_item_name(i, "osd." + stringify(i)); + } + + crush.set_type_name(0, "osd"); + int type = 1; + for (vector<layer_t>::iterator p = layers.begin(); p != layers.end(); ++p, type++) { + layer_t &l = *p; + + dout(2) << "layer " << type + << " " << l.name + << " bucket type " << l.buckettype + << " " << l.size + << dendl; + + crush.set_type_name(type, l.name); + + int buckettype = -1; + for (int i = 0; bucket_types[i].name; i++) + if (l.buckettype && strcmp(l.buckettype, bucket_types[i].name) == 0) { + buckettype = bucket_types[i].type; + break; + } + if (buckettype < 0) { + cerr << "unknown bucket type '" << l.buckettype << "'" << std::endl; + return EXIT_FAILURE; + } + + // build items + vector<int> cur_items; + vector<int> cur_weights; + unsigned lower_pos = 0; // lower pos + + dout(2) << "lower_items " << lower_items << dendl; + dout(2) << "lower_weights " << lower_weights << dendl; + + int i = 0; + while (1) { + if (lower_pos == lower_items.size()) + break; + + int items[num_osds]; + int weights[num_osds]; + + int weight = 0; + int j; + for (j=0; j<l.size || l.size==0; j++) { + if (lower_pos == lower_items.size()) + break; + items[j] 
= lower_items[lower_pos]; + weights[j] = lower_weights[lower_pos]; + weight += weights[j]; + lower_pos++; + dout(2) << " item " << items[j] << " weight " << weights[j] << dendl; + } + + int id; + int r = crush.add_bucket(0, buckettype, CRUSH_HASH_DEFAULT, type, j, items, weights, &id); + if (r < 0) { + cerr << " Couldn't add bucket: " << cpp_strerror(r) << std::endl; + return r; + } + + char format[20]; + format[sizeof(format)-1] = '\0'; + if (l.size) + snprintf(format, sizeof(format)-1, "%s%%d", l.name); + else + strncpy(format, l.name, sizeof(format)-1); + char name[20]; + snprintf(name, sizeof(name), format, i); + crush.set_item_name(id, name); + + dout(2) << " in bucket " << id << " '" << name << "' size " << j << " weight " << weight << dendl; + + cur_items.push_back(id); + cur_weights.push_back(weight); + i++; + } + + lower_items.swap(cur_items); + lower_weights.swap(cur_weights); + } + + string root = layers.back().size == 0 ? layers.back().name : + string(layers.back().name) + "0"; + + { + set<int> roots; + crush.find_roots(&roots); + if (roots.size() > 1) { + cerr << "The crush rulesets will use the root " << root << "\n" + << "and ignore the others.\n" + << "There are " << roots.size() << " roots, they can be\n" + << "grouped into a single root by appending something like:\n" + << " root straw 0\n" + << std::endl; + } + } + + if (OSDMap::build_simple_crush_rules(g_ceph_context, crush, root, &cerr)) + return EXIT_FAILURE; + + modified = true; + } + + // mutate ---- + + if (choose_local_tries >= 0) { + crush.set_choose_local_tries(choose_local_tries); + modified = true; + } + if (choose_local_fallback_tries >= 0) { + crush.set_choose_local_fallback_tries(choose_local_fallback_tries); + modified = true; + } + if (choose_total_tries >= 0) { + crush.set_choose_total_tries(choose_total_tries); + modified = true; + } + if (chooseleaf_descend_once >= 0) { + crush.set_chooseleaf_descend_once(chooseleaf_descend_once); + modified = true; + } + if (chooseleaf_vary_r 
>= 0) { + crush.set_chooseleaf_vary_r(chooseleaf_vary_r); + modified = true; + } + if (chooseleaf_stable >= 0) { + crush.set_chooseleaf_stable(chooseleaf_stable); + modified = true; + } + if (straw_calc_version >= 0) { + crush.set_straw_calc_version(straw_calc_version); + modified = true; + } + if (allowed_bucket_algs >= 0) { + crush.set_allowed_bucket_algs(allowed_bucket_algs); + modified = true; + } + + if (!reweight_name.empty()) { + cout << me << " reweighting item " << reweight_name << " to " << reweight_weight << std::endl; + int r; + if (!crush.name_exists(reweight_name)) { + cerr << " name " << reweight_name << " dne" << std::endl; + r = -ENOENT; + } else { + int item = crush.get_item_id(reweight_name); + r = crush.adjust_item_weightf(g_ceph_context, item, reweight_weight); + } + if (r >= 0) + modified = true; + else { + cerr << me << " " << cpp_strerror(r) << std::endl; + return r; + } + } + + if (!remove_name.empty()) { + cout << me << " removing item " << remove_name << std::endl; + int r; + if (!crush.name_exists(remove_name)) { + cerr << " name " << remove_name << " dne" << std::endl; + r = -ENOENT; + } else { + int remove_item = crush.get_item_id(remove_name); + r = crush.remove_item(g_ceph_context, remove_item, false); + } + if (r == 0) + modified = true; + else { + cerr << me << " " << cpp_strerror(r) << std::endl; + return r; + } + } + + if (add_item >= 0) { + int r; + if (update_item) { + r = crush.update_item(g_ceph_context, add_item, add_weight, add_name.c_str(), add_loc); + } else { + r = crush.insert_item(g_ceph_context, add_item, add_weight, add_name.c_str(), add_loc); + } + if (r >= 0) { + modified = true; + } else { + cerr << me << " " << cpp_strerror(r) << std::endl; + return r; + } + } + + if (add_bucket) { + if (int r = do_add_bucket(cct.get(), me, crush, add_name, add_type, add_loc); !r) { + modified = true; + } else { + return r; + } + } + + if (move_item) { + if (int r = do_move_item(cct.get(), me, crush, move_name, add_loc); !r) { + 
modified = true; + } else { + return r; + } + } + if (add_rule) { + if (crush.rule_exists(rule_name)) { + cerr << "rule " << rule_name << " already exists" << std::endl; + return EXIT_FAILURE; + } + int r = crush.add_simple_rule(rule_name, rule_root, rule_type, + rule_device_class, + rule_mode, pg_pool_t::TYPE_REPLICATED, &err); + if (r < 0) { + cerr << err.str() << std::endl; + return EXIT_FAILURE; + } + modified = true; + } + + if (del_rule) { + if (!crush.rule_exists(rule_name)) { + cerr << "rule " << rule_name << " does not exist" << std::endl; + return 0; + } + int ruleno = crush.get_rule_id(rule_name); + ceph_assert(ruleno >= 0); + int r = crush.remove_rule(ruleno); + if (r < 0) { + cerr << "fail to remove rule " << rule_name << std::endl; + return EXIT_FAILURE; + } + modified = true; + } + + if (reweight) { + crush.reweight(g_ceph_context); + modified = true; + } + if (rebuild_class_roots) { + int r = crush.rebuild_roots_with_classes(g_ceph_context); + if (r < 0) { + cerr << "failed to rebuidl roots with classes" << std::endl; + return EXIT_FAILURE; + } + modified = true; + } + + for (auto& i : set_subtree_class) { + crush.set_subtree_class(i.first, i.second); + modified = true; + } + if (reclassify) { + int r = crush.reclassify( + g_ceph_context, + cout, + reclassify_root, + reclassify_bucket); + if (r < 0) { + cerr << "failed to reclassify map" << std::endl; + return EXIT_FAILURE; + } + modified = true; + } + + // display --- + if (full_location >= 0) { + map<string, string> loc = crush.get_full_location(full_location); + for (map<string,string>::iterator p = loc.begin(); + p != loc.end(); + ++p) { + cout << p->first << "\t" << p->second << std::endl; + } + } + + if (tree) { + crush.dump_tree(&cout, NULL, {}, true); + } + + if (dump) { + boost::scoped_ptr<Formatter> f(Formatter::create(dump_format, "json-pretty", "json-pretty")); + f->open_object_section("crush_map"); + crush.dump(f.get()); + f->close_section(); + f->flush(cout); + cout << "\n"; + } + + if 
(decompile) { + CrushCompiler cc(crush, cerr, verbose); + if (!outfn.empty()) { + ofstream o; + o.open(outfn.c_str(), ios::out | ios::binary | ios::trunc); + if (!o.is_open()) { + cerr << me << ": error writing '" << outfn << "'" << std::endl; + return EXIT_FAILURE; + } + cc.decompile(o); + o.close(); + } else { + cc.decompile(cout); + } + } + + if (check) { + tester.check_overlapped_rules(); + if (max_id >= 0) { + if (!tester.check_name_maps(max_id)) { + return EXIT_FAILURE; + } + } + } + + if (test) { + if (tester.get_output_utilization_all() || + tester.get_output_utilization()) + tester.set_output_statistics(true); + + int r = tester.test(); + if (r < 0) + return EXIT_FAILURE; + } + + if (compare.size()) { + CrushWrapper crush2; + bufferlist in; + string error; + int r = in.read_file(compare.c_str(), &error); + if (r < 0) { + cerr << me << ": error reading '" << compare << "': " + << error << std::endl; + return EXIT_FAILURE; + } + auto p = in.cbegin(); + try { + crush2.decode(p); + } catch(...) { + cerr << me << ": unable to decode " << compare << std::endl; + return EXIT_FAILURE; + } + r = tester.compare(crush2); + if (r < 0) + return EXIT_FAILURE; + } + + // output --- + if (modified) { + crush.finalize(); + + if (outfn.empty()) { + cout << me << " successfully built or modified map. Use '-o <file>' to write it out." << std::endl; + } else { + bufferlist bl; + crush.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + int r = bl.write_file(outfn.c_str()); + if (r < 0) { + cerr << me << ": error writing '" << outfn << "': " << cpp_strerror(r) << std::endl; + return EXIT_FAILURE; + } + if (verbose) + cout << "wrote crush map to " << outfn << std::endl; + } + } + + return 0; +} +/* + * Local Variables: + * compile-command: "cd .. 
#!/usr/bin/env python
# coding: utf-8
#
# Ceph - scalable distributed file system
#
# Copyright (C) 2017 OVH
#
# This is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License version 2, as published by the Free Software
# Foundation. See file COPYING.
#

import json
import subprocess
import time
import os
import argparse


def shorten(val):
    """Render a number compactly for a 4-column histogram cell.

    Strings are returned unchanged (the caller uses '-' as a placeholder).
    Numbers below 10**15 are scaled down to at most three digits and
    suffixed with '', 'k', 'M', 'G' or 'T'.  Anything larger is returned
    as-is.
    """
    if isinstance(val, str):
        return val
    for u in ((3, ''), (6, 'k'), (9, 'M'), (12, 'G'), (15, 'T')):
        if val < 10**u[0]:
            return "{}{}".format(int(val / (10 ** (u[0]-3))), u[1])
    return val


def print_histogram(asok, logger, counter, last):
    """Fetch one 2D perf histogram and format its delta against `last`.

    :param asok: admin socket path handed to `ceph --admin-daemon`
                 (may contain shell wildcards, hence shell=True below)
    :param logger: top-level section of the perf dump (e.g. 'osd')
    :param counter: histogram name inside that section
    :param last: the `values` matrix returned by the previous call
                 ([] on the first call)
    :returns: tuple (values, text); on failure `last` is returned unchanged
              together with an error message
    """
    try:
        out = subprocess.check_output(
            "ceph --admin-daemon {} perf histogram dump".format(asok),
            shell=True)
        dump = json.loads(out.decode('utf-8'))
    except Exception as e:
        return (last,
                "Couldn't connect to admin socket, result: \n{}".format(e))

    # Honour the requested logger instead of a hard-coded 'osd', and report
    # a missing logger/counter instead of crashing the display loop.
    try:
        histogram = dump[logger][counter]
        current = histogram['values']
        axes = histogram['axes']
    except KeyError as e:
        return (last,
                "No histogram '{}' in logger '{}' (missing key {})".format(
                    counter, logger, e))

    content = ""

    # Header: lower and upper bound of every column (second axis).
    content += "{}:\n".format(axes[1]['name'])
    for r in axes[1]['ranges']:
        content += "{0: >4} ".format(
            shorten(r['min']) if 'min' in r else '')
    content += "\n"
    for r in axes[1]['ranges']:
        content += "{0: >4} ".format(
            shorten(r['max']) if 'max' in r else '')
    content += "\n"

    content += ("{0: >"+str(len(axes[1]['ranges'])*5+14)+"}:\n").format(
        axes[0]['name'])

    # Body: one row per range of the first axis, showing the change since
    # the previous sample ('-' when there is no previous value).
    for row in range(len(current)):
        for col in range(len(current[row])):
            try:
                diff = current[row][col] - last[row][col]
            except IndexError:
                diff = '-'
            content += "{0: >4} ".format(shorten(diff))

        r = axes[0]['ranges'][row]
        content += "{0: >6} : {1}\n".format(
            shorten(r['min']) if 'min' in r else '',
            shorten(r['max']) if 'max' in r else '')
    return (current, content)


def loop_print(asok, logger, counter):
    """Redraw the histogram once per second, forever."""
    last = []
    while True:

        last, content = print_histogram(asok, logger, counter, last)
        # 100 newlines push the previous frame off the screen.
        print("{}{}".format("\n"*100, content))
        time.sleep(1)


def main():
    """Parse the command line and start the display loop."""
    parser = argparse.ArgumentParser(
        description='Continuously display ceph performance histogram')
    parser.add_argument(
        '--asok',
        type=str,
        default='/var/run/ceph/*.asok',
        help='Path to asok file, can use wildcards')
    parser.add_argument(
        '--logger',
        type=str,
        default='osd')
    parser.add_argument(
        '--counter',
        type=str,
        default='op_w_latency_in_bytes_histogram')
    args = parser.parse_args()

    loop_print(args.asok, args.logger, args.counter)


if __name__ == '__main__':
    main()
(current, content) + + +def loop_print(asok, logger, counter): + last = [] + while True: + + last, content = print_histogram(asok, logger, counter, last) + print("{}{}".format("\n"*100, content)) + time.sleep(1) + + +def main(): + parser = argparse.ArgumentParser( + description='Continuously display ceph performance histogram') + parser.add_argument( + '--asok', + type=str, + default='/var/run/ceph/*.asok', + help='Path to asok file, can use wildcards') + parser.add_argument( + '--logger', + type=str, + default='osd') + parser.add_argument( + '--counter', + type=str, + default='op_w_latency_in_bytes_histogram') + args = parser.parse_args() + + loop_print(args.asok, args.logger, args.counter) + + +if __name__ == '__main__': + main() diff --git a/src/tools/kvstore_tool.cc b/src/tools/kvstore_tool.cc new file mode 100644 index 00000000..ed33b29c --- /dev/null +++ b/src/tools/kvstore_tool.cc @@ -0,0 +1,316 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "kvstore_tool.h" + +#include <iostream> + +#include "common/errno.h" +#include "common/url_escape.h" +#include "include/buffer.h" +#include "kv/KeyValueDB.h" + +StoreTool::StoreTool(const string& type, + const string& path, + bool need_open_db, + bool need_stats) + : store_path(path) +{ + + if (need_stats) { + g_conf()->rocksdb_perf = true; + g_conf()->rocksdb_collect_compaction_stats = true; + } + + if (type == "bluestore-kv") { +#ifdef WITH_BLUESTORE + if (load_bluestore(path, need_open_db) != 0) + exit(1); +#else + cerr << "bluestore not compiled in" << std::endl; + exit(1); +#endif + } else { + auto db_ptr = KeyValueDB::create(g_ceph_context, type, path); + if (need_open_db) { + if (int r = db_ptr->open(std::cerr); r < 0) { + cerr << "failed to open type " << type << " path " << path << ": " + << cpp_strerror(r) << std::endl; + exit(1); + } + db.reset(db_ptr); + } + } +} + +int StoreTool::load_bluestore(const string& path, bool need_open_db) +{ + 
auto bluestore = new BlueStore(g_ceph_context, path); + KeyValueDB *db_ptr; + int r = bluestore->start_kv_only(&db_ptr, need_open_db); + if (r < 0) { + return -EINVAL; + } + db = decltype(db){db_ptr, Deleter(bluestore)}; + return 0; +} + +uint32_t StoreTool::traverse(const string& prefix, + const bool do_crc, + const bool do_value_dump, + ostream *out) +{ + KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator(); + + if (prefix.empty()) + iter->seek_to_first(); + else + iter->seek_to_first(prefix); + + uint32_t crc = -1; + + while (iter->valid()) { + pair<string,string> rk = iter->raw_key(); + if (!prefix.empty() && (rk.first != prefix)) + break; + + if (out) + *out << url_escape(rk.first) << "\t" << url_escape(rk.second); + if (do_crc) { + bufferlist bl; + bl.append(rk.first); + bl.append(rk.second); + bl.append(iter->value()); + + crc = bl.crc32c(crc); + if (out) { + *out << "\t" << bl.crc32c(0); + } + } + if (out) + *out << std::endl; + if (out && do_value_dump) { + bufferptr bp = iter->value_as_ptr(); + bufferlist value; + value.append(bp); + ostringstream os; + value.hexdump(os); + std::cout << os.str() << std::endl; + } + iter->next(); + } + + return crc; +} + +void StoreTool::list(const string& prefix, const bool do_crc, + const bool do_value_dump) +{ + traverse(prefix, do_crc, do_value_dump,& std::cout); +} + +bool StoreTool::exists(const string& prefix) +{ + ceph_assert(!prefix.empty()); + KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator(); + iter->seek_to_first(prefix); + return (iter->valid() && (iter->raw_key().first == prefix)); +} + +bool StoreTool::exists(const string& prefix, const string& key) +{ + ceph_assert(!prefix.empty()); + + if (key.empty()) { + return exists(prefix); + } + bool exists = false; + get(prefix, key, exists); + return exists; +} + +bufferlist StoreTool::get(const string& prefix, + const string& key, + bool& exists) +{ + ceph_assert(!prefix.empty() && !key.empty()); + + map<string,bufferlist> result; 
+ std::set<std::string> keys; + keys.insert(key); + db->get(prefix, keys, &result); + + if (result.count(key) > 0) { + exists = true; + return result[key]; + } else { + exists = false; + return bufferlist(); + } +} + +uint64_t StoreTool::get_size() +{ + map<string,uint64_t> extras; + uint64_t s = db->get_estimated_size(extras); + for (auto& [name, size] : extras) { + std::cout << name << " - " << size << std::endl; + } + std::cout << "total: " << s << std::endl; + return s; +} + +bool StoreTool::set(const string &prefix, const string &key, bufferlist &val) +{ + ceph_assert(!prefix.empty()); + ceph_assert(!key.empty()); + ceph_assert(val.length() > 0); + + KeyValueDB::Transaction tx = db->get_transaction(); + tx->set(prefix, key, val); + int ret = db->submit_transaction_sync(tx); + + return (ret == 0); +} + +bool StoreTool::rm(const string& prefix, const string& key) +{ + ceph_assert(!prefix.empty()); + ceph_assert(!key.empty()); + + KeyValueDB::Transaction tx = db->get_transaction(); + tx->rmkey(prefix, key); + int ret = db->submit_transaction_sync(tx); + + return (ret == 0); +} + +bool StoreTool::rm_prefix(const string& prefix) +{ + ceph_assert(!prefix.empty()); + + KeyValueDB::Transaction tx = db->get_transaction(); + tx->rmkeys_by_prefix(prefix); + int ret = db->submit_transaction_sync(tx); + + return (ret == 0); +} + +void StoreTool::print_summary(const uint64_t total_keys, const uint64_t total_size, + const uint64_t total_txs, const string& store_path, + const string& other_path, const int duration) const +{ + std::cout << "summary:" << std::endl; + std::cout << " copied " << total_keys << " keys" << std::endl; + std::cout << " used " << total_txs << " transactions" << std::endl; + std::cout << " total size " << byte_u_t(total_size) << std::endl; + std::cout << " from '" << store_path << "' to '" << other_path << "'" + << std::endl; + std::cout << " duration " << duration << " seconds" << std::endl; +} + +int StoreTool::print_stats() const +{ + ostringstream 
ostr; + Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty"); + int ret = -1; + if (g_conf()->rocksdb_perf) { + db->get_statistics(f); + ostr << "db_statistics "; + f->flush(ostr); + ret = 0; + } else { + ostr << "db_statistics not enabled"; + f->flush(ostr); + } + std::cout << ostr.str() << std::endl; + delete f; + return ret; +} + +int StoreTool::copy_store_to(const string& type, const string& other_path, + const int num_keys_per_tx, + const string& other_type) +{ + if (num_keys_per_tx <= 0) { + std::cerr << "must specify a number of keys/tx > 0" << std::endl; + return -EINVAL; + } + + // open or create a leveldb store at @p other_path + boost::scoped_ptr<KeyValueDB> other; + KeyValueDB *other_ptr = KeyValueDB::create(g_ceph_context, + other_type, + other_path); + if (int err = other_ptr->create_and_open(std::cerr); err < 0) { + return err; + } + other.reset(other_ptr); + + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + it->seek_to_first(); + uint64_t total_keys = 0; + uint64_t total_size = 0; + uint64_t total_txs = 0; + + auto duration = [start=coarse_mono_clock::now()] { + const auto now = coarse_mono_clock::now(); + auto seconds = std::chrono::duration<double>(now - start); + return seconds.count(); + }; + + do { + int num_keys = 0; + + KeyValueDB::Transaction tx = other->get_transaction(); + + while (it->valid() && num_keys < num_keys_per_tx) { + auto [prefix, key] = it->raw_key(); + bufferlist v = it->value(); + tx->set(prefix, key, v); + + num_keys++; + total_size += v.length(); + + it->next(); + } + + total_txs++; + total_keys += num_keys; + + if (num_keys > 0) + other->submit_transaction_sync(tx); + + std::cout << "ts = " << duration() << "s, copied " << total_keys + << " keys so far (" << byte_u_t(total_size) << ")" + << std::endl; + + } while (it->valid()); + + print_summary(total_keys, total_size, total_txs, store_path, other_path, + duration()); + + return 0; +} + +void StoreTool::compact() +{ + 
db->compact(); +} + +void StoreTool::compact_prefix(const string& prefix) +{ + db->compact_prefix(prefix); +} + +void StoreTool::compact_range(const string& prefix, + const string& start, + const string& end) +{ + db->compact_range(prefix, start, end); +} + +int StoreTool::destructive_repair() +{ + return db->repair(std::cout); +} diff --git a/src/tools/kvstore_tool.h b/src/tools/kvstore_tool.h new file mode 100644 index 00000000..d8c89661 --- /dev/null +++ b/src/tools/kvstore_tool.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <ostream> +#include <string> + +#include "acconfig.h" +#include "include/buffer_fwd.h" +#ifdef WITH_BLUESTORE +#include "os/bluestore/BlueStore.h" +#endif + +class KeyValueDB; + +class StoreTool +{ +#ifdef WITH_BLUESTORE + struct Deleter { + BlueStore *bluestore; + Deleter() + : bluestore(nullptr) {} + Deleter(BlueStore *store) + : bluestore(store) {} + void operator()(KeyValueDB *db) { + if (bluestore) { + bluestore->umount(); + delete bluestore; + } else { + delete db; + } + } + }; + std::unique_ptr<KeyValueDB, Deleter> db; +#else + std::unique_ptr<KeyValueDB> db; +#endif + + const std::string store_path; + +public: + StoreTool(const std::string& type, + const std::string& path, + bool need_open_db = true, + bool need_stats = false); + int load_bluestore(const std::string& path, bool need_open_db); + uint32_t traverse(const std::string& prefix, + const bool do_crc, + const bool do_value_dump, + ostream *out); + void list(const std::string& prefix, + const bool do_crc, + const bool do_value_dump); + bool exists(const std::string& prefix); + bool exists(const std::string& prefix, const std::string& key); + ceph::bufferlist get(const std::string& prefix, + const std::string& key, + bool& exists); + uint64_t get_size(); + bool set(const std::string& prefix, + const std::string& key, + ceph::bufferlist& val); + bool rm(const 
std::string& prefix, const std::string& key); + bool rm_prefix(const std::string& prefix); + void print_summary(const uint64_t total_keys, const uint64_t total_size, + const uint64_t total_txs, const std::string& store_path, + const std::string& other_path, const int duration) const; + int copy_store_to(const std::string& type, const std::string& other_path, + const int num_keys_per_tx, const std::string& other_type); + void compact(); + void compact_prefix(const std::string& prefix); + void compact_range(const std::string& prefix, + const std::string& start, + const std::string& end); + int destructive_repair(); + + int print_stats() const; +}; diff --git a/src/tools/monmaptool.cc b/src/tools/monmaptool.cc new file mode 100644 index 00000000..ef819a3a --- /dev/null +++ b/src/tools/monmaptool.cc @@ -0,0 +1,473 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include <string> + +#include "common/ceph_argparse.h" +#include "common/errno.h" + +#include "global/global_init.h" +#include "include/str_list.h" +#include "mon/MonMap.h" + + +void usage() +{ + cout << "usage: monmaptool [--print] [--create [--clobber] [--fsid uuid]]\n" + << " [--enable-all-features]\n" + << " [--generate] [--set-initial-members]\n" + << " [--add name 1.2.3.4:567] [--rm name]\n" + << " [--feature-list [plain|parseable]]\n" + << " [--feature-set <value> [--optional|--persistent]]\n" + << " [--feature-unset <value> [--optional|--persistent]]\n" + << " [--set-min-mon-release <release-major-number>]\n" + << " <mapfilename>" + << std::endl; +} + +void helpful_exit() +{ + cerr << "monmaptool -h for usage" << std::endl; + exit(1); +} + +struct feature_op_t { + enum type_t { + PERSISTENT, + OPTIONAL, + PLAIN, + PARSEABLE, + NONE + }; + + enum op_t { + OP_SET, + OP_UNSET, + OP_LIST + }; + + op_t op; + type_t type; + mon_feature_t feature; + + feature_op_t() : op(OP_LIST), type(NONE) { } + // default to 'persistent' feature if not specified + feature_op_t(op_t o) : op(o), type(PERSISTENT) { } + feature_op_t(op_t o, type_t t) : op(o), type(t) { } + feature_op_t(op_t o, type_t t, mon_feature_t &f) : + op(o), type(t), feature(t) { } + + void set_optional() { + type = OPTIONAL; + } + void set_persistent() { + type = PERSISTENT; + } + bool parse_value(string &s, ostream *errout = NULL) { + + feature = ceph::features::mon::get_feature_by_name(s); + if (feature != ceph::features::mon::FEATURE_NONE) { + return true; + } + + // try parsing as numerical value + uint64_t feature_val; + string interr; + feature_val = strict_strtoll(s.c_str(), 10, &interr); + if (!interr.empty()) { + if (errout) { + *errout << "unknown features name '" << s + << "' or unable to parse value: " << interr << std::endl; + } + return false; + } + feature = mon_feature_t(feature_val); + return true; + } +}; + +void features_list(feature_op_t &f, MonMap &m) +{ + if (f.type == 
feature_op_t::type_t::PLAIN) { + + cout << "MONMAP FEATURES:" << std::endl; + cout << " persistent: "; + m.persistent_features.print_with_value(cout); + cout << std::endl; + cout << " optional: "; + m.optional_features.print_with_value(cout); + cout << std::endl; + cout << " required: "; + m.get_required_features().print_with_value(cout); + cout << std::endl; + + cout << std::endl; + cout << "AVAILABLE FEATURES:" << std::endl; + cout << " supported: "; + ceph::features::mon::get_supported().print_with_value(cout); + cout << std::endl; + cout << " persistent: "; + ceph::features::mon::get_persistent().print_with_value(cout); + cout << std::endl; + } else if (f.type == feature_op_t::type_t::PARSEABLE) { + + cout << "monmap:persistent:"; + m.persistent_features.print_with_value(cout); + cout << std::endl; + cout << "monmap:optional:"; + m.optional_features.print_with_value(cout); + cout << std::endl; + cout << "monmap:required:"; + m.get_required_features().print_with_value(cout); + cout << std::endl; + cout << "available:supported:"; + ceph::features::mon::get_supported().print_with_value(cout); + cout << std::endl; + cout << "available:persistent:"; + ceph::features::mon::get_persistent().print_with_value(cout); + cout << std::endl; + } +} + +bool handle_features(list<feature_op_t>& lst, MonMap &m) +{ + if (lst.empty()) + return false; + + bool modified = false; + + for (auto &f : lst) { + if (f.op == feature_op_t::op_t::OP_LIST) { + features_list(f, m); + } else if (f.op == feature_op_t::op_t::OP_SET || + f.op == feature_op_t::op_t::OP_UNSET) { + + modified = true; + + mon_feature_t &target = + ( f.type == feature_op_t::type_t::OPTIONAL ? 
+ m.optional_features : m.persistent_features ); + + if (f.op == feature_op_t::op_t::OP_SET) { + target.set_feature(f.feature); + } else { + target.unset_feature(f.feature); + } + } else { + cerr << "unknown feature operation type '" << f.op << "'" << std::endl; + } + } + return modified; +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + const char *me = argv[0]; + + std::string fn; + bool print = false; + bool create = false; + bool enable_all_features = false; + bool clobber = false; + bool modified = false; + bool show_features = false; + bool generate = false; + bool filter = false; + int min_mon_release = -1; + map<string,entity_addr_t> add; + map<string,entity_addrvec_t> addv; + list<string> rm; + list<feature_op_t> features; + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + std::string val; + for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) { + print = true; + } else if (ceph_argparse_flag(args, i, "--create", (char*)NULL)) { + create = true; + } else if (ceph_argparse_flag(args, i, "--enable-all-features", (char*)NULL)) { + enable_all_features = true; + } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) { + clobber = true; + } else if (ceph_argparse_flag(args, i, "--generate", (char*)NULL)) { + generate = true; + } else if (ceph_argparse_flag(args, i, "--set-initial-members", (char*)NULL)) { + filter = true; + } else if (ceph_argparse_witharg(args, i, &val, "--set-min-mon-release", + (char*)NULL)) { + min_mon_release = atoi(val.c_str()); + } else if 
(ceph_argparse_flag(args, i, "--add", (char*)NULL)) { + string name = *i; + i = args.erase(i); + if (i == args.end()) + helpful_exit(); + entity_addr_t addr; + if (!addr.parse(*i)) { + cerr << me << ": invalid ip:port '" << *i << "'" << std::endl; + return -1; + } + add[name] = addr; + modified = true; + i = args.erase(i); + } else if (ceph_argparse_flag(args, i, "--addv", (char*)NULL)) { + string name = *i; + i = args.erase(i); + if (i == args.end()) + helpful_exit(); + entity_addrvec_t addrs; + if (!addrs.parse(*i)) { + cerr << me << ": invalid ip:port '" << *i << "'" << std::endl; + return -1; + } + addv[name] = addrs; + modified = true; + i = args.erase(i); + } else if (ceph_argparse_witharg(args, i, &val, "--rm", (char*)NULL)) { + rm.push_back(val); + modified = true; + } else if (ceph_argparse_flag(args, i, "--feature-list", (char*)NULL)) { + string format = *i; + if (format == "plain" || format == "parseable") { + i = args.erase(i); + } else { + format = "plain"; + } + + feature_op_t f(feature_op_t::op_t::OP_LIST, + feature_op_t::type_t::PLAIN); + + if (format == "parseable") { + f.type = feature_op_t::type_t::PARSEABLE; + } else if (format != "plain") { + cerr << "invalid format type for list: '" << val << "'" << std::endl; + helpful_exit(); + } + + features.push_back(f); + show_features = true; + } else if (ceph_argparse_witharg(args, i, &val, + "--feature-set", (char*)NULL)) { + // parse value + feature_op_t f(feature_op_t::op_t::OP_SET); + if (!f.parse_value(val, &cerr)) { + helpful_exit(); + } + features.push_back(f); + + } else if (ceph_argparse_witharg(args, i, &val, + "--feature-unset", (char*)NULL)) { + // parse value + feature_op_t f(feature_op_t::op_t::OP_UNSET); + if (!f.parse_value(val, &cerr)) { + helpful_exit(); + } + features.push_back(f); + } else if (ceph_argparse_flag(args, i, "--optional", (char*)NULL)) { + if (features.empty()) { + helpful_exit(); + } + features.back().set_optional(); + } else if (ceph_argparse_flag(args, i, 
"--persistent", (char*)NULL)) { + if (features.empty()) { + helpful_exit(); + } + features.back().set_persistent(); + } else { + ++i; + } + } + if (args.empty()) { + cerr << me << ": must specify monmap filename" << std::endl; + helpful_exit(); + } + else if (args.size() > 1) { + cerr << me << ": too many arguments" << std::endl; + helpful_exit(); + } + fn = args[0]; + + MonMap monmap; + + cout << me << ": monmap file " << fn << std::endl; + + int r = 0; + if (!(create && clobber)) { + try { + r = monmap.read(fn.c_str()); + } catch (...) { + cerr << me << ": unable to read monmap file" << std::endl; + return -1; + } + } + + if (!create && r < 0) { + cerr << me << ": couldn't open " << fn << ": " << cpp_strerror(r) << std::endl; + return -1; + } + else if (create && !clobber && r == 0) { + cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl; + return -1; + } + + if (create) { + monmap.epoch = 0; + monmap.created = ceph_clock_now(); + monmap.last_changed = monmap.created; + srand(getpid() + time(0)); + if (g_conf().get_val<uuid_d>("fsid").is_zero()) { + monmap.generate_fsid(); + cout << me << ": generated fsid " << monmap.fsid << std::endl; + } + modified = true; + } + if (enable_all_features) { + // populate persistent features, too + monmap.persistent_features = ceph::features::mon::get_persistent(); + modified = true; + } + + if (generate) { + int r = monmap.build_initial(g_ceph_context, true, cerr); + if (r < 0) + return r; + } + + if (min_mon_release >= 0) { + monmap.min_mon_release = min_mon_release; + cout << "setting min_mon_release = " << min_mon_release << std::endl; + modified = true; + } + + if (filter) { + // apply initial members + list<string> initial_members; + get_str_list(g_conf()->mon_initial_members, initial_members); + if (!initial_members.empty()) { + cout << "initial_members " << initial_members << ", filtering seed monmap" << std::endl; + set<entity_addrvec_t> removed; + monmap.set_initial_members(g_ceph_context, 
initial_members, + string(), entity_addrvec_t(), + &removed); + cout << "removed " << removed << std::endl; + } + modified = true; + } + + if (!g_conf().get_val<uuid_d>("fsid").is_zero()) { + monmap.fsid = g_conf().get_val<uuid_d>("fsid"); + cout << me << ": set fsid to " << monmap.fsid << std::endl; + modified = true; + } + + for (auto& p : add) { + entity_addr_t addr = p.second; + entity_addrvec_t addrs; + if (monmap.contains(p.first)) { + cerr << me << ": map already contains mon." << p.first << std::endl; + helpful_exit(); + } + if (addr.get_port() == 0) { + if (monmap.persistent_features.contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + addr.set_type(entity_addr_t::TYPE_MSGR2); + addr.set_port(CEPH_MON_PORT_IANA); + addrs.v.push_back(addr); + addr.set_type(entity_addr_t::TYPE_LEGACY); + addr.set_port(CEPH_MON_PORT_LEGACY); + addrs.v.push_back(addr); + } else { + addr.set_type(entity_addr_t::TYPE_LEGACY); + addr.set_port(CEPH_MON_PORT_LEGACY); + addrs.v.push_back(addr); + } + } else if (addr.get_port() == CEPH_MON_PORT_LEGACY) { + addr.set_type(entity_addr_t::TYPE_LEGACY); + addrs.v.push_back(addr); + } else { + if (monmap.persistent_features.contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + addr.set_type(entity_addr_t::TYPE_MSGR2); + } + addrs.v.push_back(addr); + } + if (monmap.contains(addrs)) { + cerr << me << ": map already contains " << addrs << std::endl; + helpful_exit(); + } + monmap.add(p.first, addrs); + } + for (auto& p : addv) { + if (monmap.contains(p.first)) { + cerr << me << ": map already contains mon." 
<< p.first << std::endl; + helpful_exit(); + } + if (monmap.contains(p.second)) { + cerr << me << ": map already contains " << p.second << std::endl; + helpful_exit(); + } + monmap.add(p.first, p.second); + } + for (auto& p : rm) { + cout << me << ": removing " << p << std::endl; + if (!monmap.contains(p)) { + cerr << me << ": map does not contain " << p << std::endl; + helpful_exit(); + } + monmap.remove(p); + } + + if (handle_features(features, monmap)) { + modified = true; + } + + if (!print && !modified && !show_features) { + cerr << "no action specified" << std::endl; + helpful_exit(); + } + + if (print) + monmap.print(cout); + + if (modified) { + // write it out + cout << me << ": writing epoch " << monmap.epoch + << " to " << fn + << " (" << monmap.size() << " monitors)" + << std::endl; + int r = monmap.write(fn.c_str()); + if (r < 0) { + cerr << "monmaptool: error writing to '" << fn << "': " << cpp_strerror(r) << std::endl; + return 1; + } + } + + + return 0; +} diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc new file mode 100644 index 00000000..887086e5 --- /dev/null +++ b/src/tools/osdmaptool.cc @@ -0,0 +1,799 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <string> +#include <sys/stat.h> + +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "mon/health_check.h" +#include <time.h> +#include <algorithm> + +#include "global/global_init.h" +#include "osd/OSDMap.h" + + +void usage() +{ + cout << " usage: [--print] <mapfilename>" << std::endl; + cout << " --create-from-conf creates an osd map with default configurations" << std::endl; + cout << " --createsimple <numosd> [--clobber] [--pg-bits <bitsperosd>] [--pgp-bits <bits>] creates a relatively generic OSD map with <numosd> devices" << std::endl; + cout << " --pgp-bits <bits> pgp_num map attribute will be shifted by <bits>" << std::endl; + cout << " --pg-bits <bits> pg_num map attribute will be shifted by <bits>" << std::endl; + cout << " --clobber allows osdmaptool to overwrite <mapfilename> if it already exists" << std::endl; + cout << " --export-crush <file> write osdmap's crush map to <file>" << std::endl; + cout << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl; + cout << " --health dump health checks" << std::endl; + cout << " --test-map-pgs [--pool <poolid>] [--pg_num <pg_num>] [--range-first <first> --range-last <last>] map all pgs" << std::endl; + cout << " --test-map-pgs-dump [--pool <poolid>] [--range-first <first> --range-last <last>] map all pgs" << std::endl; + cout << " --test-map-pgs-dump-all [--pool <poolid>] [--range-first <first> --range-last <last>] map all pgs to osds" << std::endl; + cout << " --mark-up-in mark osds up and in (but do not persist)" << std::endl; + cout << " --mark-out <osdid> mark an osd as out (but do not persist)" << std::endl; + cout << " --with-default-pool include default pool when creating map" << std::endl; + cout << " --clear-temp clear pg_temp and primary_temp" << std::endl; + cout << " --clean-temps clean pg_temps" << std::endl; + cout << " --test-random do random placements" << std::endl; + cout << " --test-map-pg <pgid> map 
a pgid to osds" << std::endl; + cout << " --test-map-object <objectname> [--pool <poolid>] map an object to osds" + << std::endl; + cout << " --upmap-cleanup <file> clean up pg_upmap[_items] entries, writing" << std::endl; + cout << " commands to <file> [default: - for stdout]" << std::endl; + cout << " --upmap <file> calculate pg upmap entries to balance pg layout" << std::endl; + cout << " writing commands to <file> [default: - for stdout]" << std::endl; + cout << " --upmap-max <max-count> set max upmap entries to calculate [default: 10]" << std::endl; + cout << " --upmap-deviation <max-deviation>" << std::endl; + cout << " max deviation from target [default: 5]" << std::endl; + cout << " --upmap-pool <poolname> restrict upmap balancing to 1 or more pools" << std::endl; + cout << " --upmap-save write modified OSDMap with upmap changes" << std::endl; + cout << " --upmap-active Act like an active balancer, keep applying changes until balanced" << std::endl; + cout << " --dump <format> displays the map in plain text when <format> is 'plain', 'json' if specified format is not supported" << std::endl; + cout << " --tree displays a tree of the map" << std::endl; + cout << " --test-crush [--range-first <first> --range-last <last>] map pgs to acting osds" << std::endl; + exit(1); +} + +void print_inc_upmaps(const OSDMap::Incremental& pending_inc, int fd) +{ + ostringstream ss; + for (auto& i : pending_inc.old_pg_upmap) { + ss << "ceph osd rm-pg-upmap " << i << std::endl; + } + for (auto& i : pending_inc.new_pg_upmap) { + ss << "ceph osd pg-upmap " << i.first; + for (auto osd : i.second) { + ss << " " << osd; + } + ss << std::endl; + } + for (auto& i : pending_inc.old_pg_upmap_items) { + ss << "ceph osd rm-pg-upmap-items " << i << std::endl; + } + for (auto& i : pending_inc.new_pg_upmap_items) { + ss << "ceph osd pg-upmap-items " << i.first; + for (auto p : i.second) { + ss << " " << p.first << " " << p.second; + } + ss << std::endl; + } + string s = ss.str(); + int r = 
safe_write(fd, s.c_str(), s.size()); + if (r < 0) { + cerr << "error writing output: " << cpp_strerror(r) << std::endl; + exit(1); + } +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + const char *me = argv[0]; + + std::string fn; + bool print = false; + boost::scoped_ptr<Formatter> print_formatter; + bool tree = false; + boost::scoped_ptr<Formatter> tree_formatter; + bool createsimple = false; + bool createpool = false; + bool create_from_conf = false; + int num_osd = 0; + int pg_bits = 6; + int pgp_bits = 6; + bool clobber = false; + bool modified = false; + std::string export_crush, import_crush, test_map_pg, test_map_object; + bool test_crush = false; + int range_first = -1; + int range_last = -1; + int pool = -1; + bool mark_up_in = false; + int marked_out = -1; + bool clear_temp = false; + bool clean_temps = false; + bool test_map_pgs = false; + bool test_map_pgs_dump = false; + bool test_random = false; + bool upmap_cleanup = false; + bool upmap = false; + bool upmap_save = false; + bool health = false; + std::string upmap_file = "-"; + int upmap_max = 10; + int upmap_deviation = 5; + bool upmap_active = false; + std::set<std::string> upmap_pools; + int64_t pg_num = -1; + bool test_map_pgs_dump_all = false; + + std::string val; + std::ostringstream err; + for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) { + print = true; + } else if (ceph_argparse_witharg(args, i, &val, err, "--dump", (char*)NULL)) { + print = true; + 
if (!val.empty() && val != "plain") { + print_formatter.reset(Formatter::create(val, "", "json")); + } + } else if (ceph_argparse_witharg(args, i, &val, err, "--tree", (char*)NULL)) { + tree = true; + if (!val.empty() && val != "plain") { + tree_formatter.reset(Formatter::create(val, "", "json")); + } + } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--osd-pg-bits", (char*)NULL)) { + } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--osd-pgp-bits", (char*)NULL)) { + } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap-cleanup", (char*)NULL)) { + upmap_cleanup = true; + } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap-save", (char*)NULL)) { + upmap_save = true; + } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap", (char*)NULL)) { + upmap_cleanup = true; + upmap = true; + } else if (ceph_argparse_witharg(args, i, &upmap_max, err, "--upmap-max", (char*)NULL)) { + } else if (ceph_argparse_witharg(args, i, &upmap_deviation, err, "--upmap-deviation", (char*)NULL)) { + } else if (ceph_argparse_witharg(args, i, &val, "--upmap-pool", (char*)NULL)) { + upmap_pools.insert(val); + } else if (ceph_argparse_witharg(args, i, &num_osd, err, "--createsimple", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + exit(EXIT_FAILURE); + } + createsimple = true; + } else if (ceph_argparse_flag(args, i, "--upmap-active", (char*)NULL)) { + upmap_active = true; + } else if (ceph_argparse_flag(args, i, "--health", (char*)NULL)) { + health = true; + } else if (ceph_argparse_flag(args, i, "--with-default-pool", (char*)NULL)) { + createpool = true; + } else if (ceph_argparse_flag(args, i, "--create-from-conf", (char*)NULL)) { + create_from_conf = true; + } else if (ceph_argparse_flag(args, i, "--mark-up-in", (char*)NULL)) { + mark_up_in = true; + } else if (ceph_argparse_witharg(args, i, &val, "--mark-out", (char*)NULL)) { + marked_out = std::stoi(val); + } else if (ceph_argparse_flag(args, i, 
"--clear-temp", (char*)NULL)) { + clear_temp = true; + } else if (ceph_argparse_flag(args, i, "--clean-temps", (char*)NULL)) { + clean_temps = true; + } else if (ceph_argparse_flag(args, i, "--test-map-pgs", (char*)NULL)) { + test_map_pgs = true; + } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump", (char*)NULL)) { + test_map_pgs_dump = true; + } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump-all", (char*)NULL)) { + test_map_pgs_dump_all = true; + } else if (ceph_argparse_flag(args, i, "--test-random", (char*)NULL)) { + test_random = true; + } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) { + clobber = true; + } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--pg_bits", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + exit(EXIT_FAILURE); + } + } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--pgp_bits", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + exit(EXIT_FAILURE); + } + } else if (ceph_argparse_witharg(args, i, &val, "--export_crush", (char*)NULL)) { + export_crush = val; + } else if (ceph_argparse_witharg(args, i, &val, "--import_crush", (char*)NULL)) { + import_crush = val; + } else if (ceph_argparse_witharg(args, i, &val, "--test_map_pg", (char*)NULL)) { + test_map_pg = val; + } else if (ceph_argparse_witharg(args, i, &val, "--test_map_object", (char*)NULL)) { + test_map_object = val; + } else if (ceph_argparse_flag(args, i, "--test_crush", (char*)NULL)) { + test_crush = true; + } else if (ceph_argparse_witharg(args, i, &val, err, "--pg_num", (char*)NULL)) { + string interr; + pg_num = strict_strtoll(val.c_str(), 10, &interr); + if (interr.length() > 0) { + cerr << "error parsing integer value " << interr << std::endl; + exit(EXIT_FAILURE); + } + } else if (ceph_argparse_witharg(args, i, &range_first, err, "--range_first", (char*)NULL)) { + } else if (ceph_argparse_witharg(args, i, &range_last, err, "--range_last", (char*)NULL)) { + } 
else if (ceph_argparse_witharg(args, i, &pool, err, "--pool", (char*)NULL)) { + if (!err.str().empty()) { + cerr << err.str() << std::endl; + exit(EXIT_FAILURE); + } + } else { + ++i; + } + } + if (args.empty()) { + cerr << me << ": must specify osdmap filename" << std::endl; + usage(); + } + else if (args.size() > 1) { + cerr << me << ": too many arguments" << std::endl; + usage(); + } + if (upmap_deviation < 1) { + cerr << me << ": upmap-deviation must be >= 1" << std::endl; + usage(); + } + fn = args[0]; + + if (range_first >= 0 && range_last >= 0) { + set<OSDMap*> maps; + OSDMap *prev = NULL; + for (int i=range_first; i <= range_last; i++) { + ostringstream f; + f << fn << "/" << i; + bufferlist bl; + string error, s = f.str(); + int r = bl.read_file(s.c_str(), &error); + if (r < 0) { + cerr << "unable to read " << s << ": " << cpp_strerror(r) << std::endl; + exit(1); + } + cout << s << " got " << bl.length() << " bytes" << std::endl; + OSDMap *o = new OSDMap; + o->decode(bl); + maps.insert(o); + if (prev) + OSDMap::dedup(prev, o); + prev = o; + } + exit(0); + } + + OSDMap osdmap; + bufferlist bl; + + cerr << me << ": osdmap file '" << fn << "'" << std::endl; + + int r = 0; + struct stat st; + if (!createsimple && !create_from_conf && !clobber) { + std::string error; + r = bl.read_file(fn.c_str(), &error); + if (r == 0) { + try { + osdmap.decode(bl); + } + catch (const buffer::error &e) { + cerr << me << ": error decoding osdmap '" << fn << "'" << std::endl; + return -1; + } + } + else { + cerr << me << ": couldn't open " << fn << ": " << error << std::endl; + return -1; + } + } + else if ((createsimple || create_from_conf) && !clobber && ::stat(fn.c_str(), &st) == 0) { + cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl; + return -1; + } + + if (createsimple || create_from_conf) { + if (createsimple) { + if (num_osd < 1) { + cerr << me << ": osd count must be > 0" << std::endl; + exit(1); + } + } else { + num_osd = -1; + } + uuid_d 
fsid; + if (createpool) { + osdmap.build_simple_with_pool( + g_ceph_context, 0, fsid, num_osd, pg_bits, pgp_bits); + } else { + osdmap.build_simple(g_ceph_context, 0, fsid, num_osd); + } + modified = true; + } + + if (mark_up_in) { + cout << "marking all OSDs up and in" << std::endl; + int n = osdmap.get_max_osd(); + for (int i=0; i<n; i++) { + osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP); + osdmap.set_weight(i, CEPH_OSD_IN); + osdmap.crush->adjust_item_weightf(g_ceph_context, i, 1.0); + } + } + + if (marked_out >=0 && marked_out < osdmap.get_max_osd()) { + cout << "marking OSD@" << marked_out << " as out" << std::endl; + int id = marked_out; + osdmap.set_state(id, osdmap.get_state(id) | CEPH_OSD_UP); + osdmap.set_weight(id, CEPH_OSD_OUT); + osdmap.crush->adjust_item_weightf(g_ceph_context, id, 1.0); + } + + if (clear_temp) { + cout << "clearing pg/primary temp" << std::endl; + osdmap.clear_temp(); + } + if (clean_temps) { + cout << "cleaning pg temps" << std::endl; + OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); + OSDMap tmpmap; + tmpmap.deepish_copy_from(osdmap); + tmpmap.apply_incremental(pending_inc); + OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc); + } + int upmap_fd = STDOUT_FILENO; + if (upmap || upmap_cleanup) { + if (upmap_file != "-") { + upmap_fd = ::open(upmap_file.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (upmap_fd < 0) { + cerr << "error opening " << upmap_file << ": " << cpp_strerror(errno) + << std::endl; + exit(1); + } + cout << "writing upmap command output to: " << upmap_file << std::endl; + } + } + if (upmap_cleanup) { + cout << "checking for upmap cleanups" << std::endl; + OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); + pending_inc.fsid = osdmap.get_fsid(); + int r = osdmap.clean_pg_upmaps(g_ceph_context, &pending_inc); + if (r > 0) { + print_inc_upmaps(pending_inc, upmap_fd); + r = osdmap.apply_incremental(pending_inc); + ceph_assert(r == 0); + } + } + if (upmap) { + cout << "upmap, 
max-count " << upmap_max + << ", max deviation " << upmap_deviation + << std::endl; + vector<int64_t> pools; + set<int64_t> upmap_pool_nums; + for (auto& s : upmap_pools) { + int64_t p = osdmap.lookup_pg_pool_name(s); + if (p < 0) { + cerr << " pool " << s << " does not exist" << std::endl; + exit(1); + } + pools.push_back(p); + upmap_pool_nums.insert(p); + } + if (!pools.empty()) { + cout << " limiting to pools " << upmap_pools << " (" << pools << ")" + << std::endl; + } else { + mempool::osdmap::map<int64_t,pg_pool_t> opools = osdmap.get_pools(); + for (auto& i : opools) { + pools.push_back(i.first); + } + } + if (pools.empty()) { + cout << "No pools available" << std::endl; + goto skip_upmap; + } + int rounds = 0; + struct timespec round_start; + int r = clock_gettime(CLOCK_MONOTONIC, &round_start); + assert(r == 0); + do { + std::random_device rd; + std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()}); + cout << "pools "; + for (auto& i: pools) + cout << osdmap.get_pool_name(i) << " "; + cout << std::endl; + OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); + pending_inc.fsid = osdmap.get_fsid(); + int total_did = 0; + int left = upmap_max; + struct timespec begin, end; + r = clock_gettime(CLOCK_MONOTONIC, &begin); + assert(r == 0); + for (auto& i: pools) { + set<int64_t> one_pool; + one_pool.insert(i); + int did = osdmap.calc_pg_upmaps( + g_ceph_context, upmap_deviation, + left, one_pool, + &pending_inc); + total_did += did; + left -= did; + if (left <= 0) + break; + } + r = clock_gettime(CLOCK_MONOTONIC, &end); + assert(r == 0); + cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl; + float elapsed_time = (end.tv_sec - begin.tv_sec) + 1.0e-9*(end.tv_nsec - begin.tv_nsec); + if (upmap_active) + cout << "Time elapsed " << elapsed_time << " secs" << std::endl; + if (total_did > 0) { + print_inc_upmaps(pending_inc, upmap_fd); + if (upmap_save || upmap_active) { + int r = osdmap.apply_incremental(pending_inc); + 
ceph_assert(r == 0); + if (upmap_save) + modified = true; + } + } else { + cout << "Unable to find further optimization, " + << "or distribution is already perfect" + << std::endl; + if (upmap_active) { + map<int,set<pg_t>> pgs_by_osd; + for (auto& i : osdmap.get_pools()) { + if (!upmap_pool_nums.empty() && !upmap_pool_nums.count(i.first)) + continue; + for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { + pg_t pg(ps, i.first); + vector<int> up; + osdmap.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + //ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + } + for (auto& i : pgs_by_osd) + cout << "osd." << i.first << " pgs " << i.second.size() << std::endl; + float elapsed_time = (end.tv_sec - round_start.tv_sec) + 1.0e-9*(end.tv_nsec - round_start.tv_nsec); + cout << "Total time elapsed " << elapsed_time << " secs, " << rounds << " rounds" << std::endl; + } + break; + } + ++rounds; + } while(upmap_active); + } +skip_upmap: + if (upmap_file != "-") { + ::close(upmap_fd); + } + + if (!import_crush.empty()) { + bufferlist cbl; + std::string error; + r = cbl.read_file(import_crush.c_str(), &error); + if (r) { + cerr << me << ": error reading crush map from " << import_crush + << ": " << error << std::endl; + exit(1); + } + + // validate + CrushWrapper cw; + auto p = cbl.cbegin(); + cw.decode(p); + + if (cw.get_max_devices() > osdmap.get_max_osd()) { + cerr << me << ": crushmap max_devices " << cw.get_max_devices() + << " > osdmap max_osd " << osdmap.get_max_osd() << std::endl; + exit(1); + } + + // apply + OSDMap::Incremental inc; + inc.fsid = osdmap.get_fsid(); + inc.epoch = osdmap.get_epoch()+1; + inc.crush = cbl; + osdmap.apply_incremental(inc); + cout << me << ": imported " << cbl.length() << " byte crush map from " << import_crush << std::endl; + modified = true; + } + + if (!export_crush.empty()) { + bufferlist cbl; + 
osdmap.crush->encode(cbl, CEPH_FEATURES_SUPPORTED_DEFAULT); + r = cbl.write_file(export_crush.c_str()); + if (r < 0) { + cerr << me << ": error writing crush map to " << import_crush << std::endl; + exit(1); + } + cout << me << ": exported crush map to " << export_crush << std::endl; + } + + if (!test_map_object.empty()) { + object_t oid(test_map_object); + if (pool == -1) { + cout << me << ": assuming pool 1 (use --pool to override)" << std::endl; + pool = 1; + } + if (!osdmap.have_pg_pool(pool)) { + cerr << "There is no pool " << pool << std::endl; + exit(1); + } + object_locator_t loc(pool); + pg_t raw_pgid = osdmap.object_locator_to_pg(oid, loc); + pg_t pgid = osdmap.raw_pg_to_pg(raw_pgid); + + vector<int> acting; + osdmap.pg_to_acting_osds(pgid, acting); + cout << " object '" << oid + << "' -> " << pgid + << " -> " << acting + << std::endl; + } + if (!test_map_pg.empty()) { + pg_t pgid; + if (!pgid.parse(test_map_pg.c_str())) { + cerr << me << ": failed to parse pg '" << test_map_pg << std::endl; + usage(); + } + cout << " parsed '" << test_map_pg << "' -> " << pgid << std::endl; + + vector<int> raw, up, acting; + int raw_primary, up_primary, acting_primary; + osdmap.pg_to_raw_osds(pgid, &raw, &raw_primary); + osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary, + &acting, &acting_primary); + cout << pgid << " raw (" << raw << ", p" << raw_primary + << ") up (" << up << ", p" << up_primary + << ") acting (" << acting << ", p" << acting_primary << ")" + << std::endl; + } + if (test_map_pgs || test_map_pgs_dump || test_map_pgs_dump_all) { + if (pool != -1 && !osdmap.have_pg_pool(pool)) { + cerr << "There is no pool " << pool << std::endl; + exit(1); + } + int n = osdmap.get_max_osd(); + vector<int> count(n, 0); + vector<int> first_count(n, 0); + vector<int> primary_count(n, 0); + vector<int> size(30, 0); + int max_size = 0; + if (test_random) + srand(getpid()); + auto& pools = osdmap.get_pools(); + for (auto p = pools.begin(); p != pools.end(); ++p) { + if (pool 
!= -1 && p->first != pool) + continue; + if (pg_num > 0) + p->second.set_pg_num(pg_num); + + cout << "pool " << p->first + << " pg_num " << p->second.get_pg_num() << std::endl; + for (unsigned i = 0; i < p->second.get_pg_num(); ++i) { + pg_t pgid = pg_t(i, p->first); + + vector<int> osds, raw, up, acting; + int primary, calced_primary, up_primary, acting_primary; + if (test_random) { + osds.resize(p->second.size); + for (unsigned i=0; i<osds.size(); ++i) { + osds[i] = rand() % osdmap.get_max_osd(); + } + primary = osds[0]; + } else if (test_map_pgs_dump_all) { + osdmap.pg_to_raw_osds(pgid, &raw, &calced_primary); + osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary, + &acting, &acting_primary); + osds = acting; + primary = acting_primary; + } else { + osdmap.pg_to_acting_osds(pgid, &osds, &primary); + } + size[osds.size()]++; + if ((unsigned)max_size < osds.size()) + max_size = osds.size(); + + if (test_map_pgs_dump) { + cout << pgid << "\t" << osds << "\t" << primary << std::endl; + } else if (test_map_pgs_dump_all) { + cout << pgid << " raw (" << raw << ", p" << calced_primary + << ") up (" << up << ", p" << up_primary + << ") acting (" << acting << ", p" << acting_primary << ")" + << std::endl; + } + + for (unsigned i=0; i<osds.size(); i++) { + //cout << " rep " << i << " on " << osds[i] << std::endl; + count[osds[i]]++; + } + if (osds.size()) + first_count[osds[0]]++; + if (primary >= 0) + primary_count[primary]++; + } + } + + uint64_t total = 0; + int in = 0; + int min_osd = -1; + int max_osd = -1; + cout << "#osd\tcount\tfirst\tprimary\tc wt\twt\n"; + for (int i=0; i<n; i++) { + if (!osdmap.is_in(i)) + continue; + if (osdmap.crush->get_item_weight(i) <= 0) + continue; + in++; + cout << "osd." 
<< i + << "\t" << count[i] + << "\t" << first_count[i] + << "\t" << primary_count[i] + << "\t" << osdmap.crush->get_item_weightf(i) + << "\t" << osdmap.get_weightf(i) + << std::endl; + total += count[i]; + if (count[i] && + (min_osd < 0 || + count[i] < count[min_osd])) + min_osd = i; + if (count[i] && + (max_osd < 0 || + count[i] > count[max_osd])) + max_osd = i; + + } + uint64_t avg = in ? (total / in) : 0; + double dev = 0; + for (int i=0; i<n; i++) { + if (!osdmap.is_in(i)) + continue; + if (osdmap.crush->get_item_weight(i) <= 0) + continue; + dev += (avg - count[i]) * (avg - count[i]); + } + dev /= in; + dev = sqrt(dev); + + //double edev = sqrt(pgavg) * (double)avg / pgavg; + double edev = sqrt((double)total / (double)in * (1.0 - (1.0 / (double)in))); + cout << " in " << in << std::endl; + cout << " avg " << avg + << " stddev " << dev + << " (" << (dev/avg) << "x)" + << " (expected " << edev << " " << (edev/avg) << "x))" + << std::endl; + + if (min_osd >= 0) + cout << " min osd." << min_osd << " " << count[min_osd] << std::endl; + if (max_osd >= 0) + cout << " max osd." 
<< max_osd << " " << count[max_osd] << std::endl; + + for (int i=0; i<=max_size; i++) { + if (size[i]) + cout << "size " << i << "\t" << size[i] << std::endl; + } + } + if (test_crush) { + int pass = 0; + while (1) { + cout << "pass " << ++pass << std::endl; + + ceph::unordered_map<pg_t,vector<int> > m; + for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin(); + p != osdmap.get_pools().end(); + ++p) { + const pg_pool_t *pool = osdmap.get_pg_pool(p->first); + for (ps_t ps = 0; ps < pool->get_pg_num(); ps++) { + pg_t pgid(ps, p->first); + for (int i=0; i<100; i++) { + cout << pgid << " attempt " << i << std::endl; + + vector<int> r; + osdmap.pg_to_acting_osds(pgid, r); + //cout << pgid << " " << r << std::endl; + if (m.count(pgid)) { + if (m[pgid] != r) { + cout << pgid << " had " << m[pgid] << " now " << r << std::endl; + ceph_abort(); + } + } else + m[pgid] = r; + } + } + } + } + } + + if (!print && !health && !tree && !modified && + export_crush.empty() && import_crush.empty() && + test_map_pg.empty() && test_map_object.empty() && + !test_map_pgs && !test_map_pgs_dump && !test_map_pgs_dump_all && + !upmap && !upmap_cleanup) { + cerr << me << ": no action specified?" 
<< std::endl; + usage(); + } + + if (modified) + osdmap.inc_epoch(); + + if (health) { + health_check_map_t checks; + osdmap.check_health(cct.get(), &checks); + JSONFormatter jf(true); + jf.dump_object("checks", checks); + jf.flush(cout); + } + if (print) { + if (print_formatter) { + print_formatter->open_object_section("osdmap"); + osdmap.dump(print_formatter.get()); + print_formatter->close_section(); + print_formatter->flush(cout); + } else { + osdmap.print(cout); + } + } + + if (tree) { + if (tree_formatter) { + tree_formatter->open_object_section("tree"); + osdmap.print_tree(tree_formatter.get(), NULL); + tree_formatter->close_section(); + tree_formatter->flush(cout); + cout << std::endl; + } else { + osdmap.print_tree(NULL, &cout); + } + } + if (modified) { + bl.clear(); + osdmap.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT | CEPH_FEATURE_RESERVED); + + // write it out + cout << me << ": writing epoch " << osdmap.get_epoch() + << " to " << fn + << std::endl; + int r = bl.write_file(fn.c_str()); + if (r) { + cerr << "osdmaptool: error writing to '" << fn << "': " + << cpp_strerror(r) << std::endl; + return 1; + } + } + + + return 0; +} diff --git a/src/tools/psim.cc b/src/tools/psim.cc new file mode 100644 index 00000000..90e6fb95 --- /dev/null +++ b/src/tools/psim.cc @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd/OSDMap.h" +#include "include/buffer.h" + +int main(int argc, char **argv) +{ + /* + * you need to create a suitable osdmap first. e.g., for 40 osds, + * $ ./osdmaptool --createsimple 40 --clobber .ceph_osdmap + */ + bufferlist bl; + std::string error; + if (bl.read_file(".ceph_osdmap", &error)) { + cout << argv[0] << ": error reading .ceph_osdmap: " << error << std::endl; + return 1; + } + OSDMap osdmap; + + try { + osdmap.decode(bl); + } catch (ceph::buffer::end_of_buffer &eob) { + cout << "Exception (end_of_buffer) in decode(), exit." 
<< std::endl; + exit(1); + } + + //osdmap.set_primary_affinity(0, 0x8000); + //osdmap.set_primary_affinity(3, 0); + + int n = osdmap.get_max_osd(); + int count[n]; + int first_count[n]; + int primary_count[n]; + int size[4]; + + memset(count, 0, sizeof(count)); + memset(first_count, 0, sizeof(first_count)); + memset(primary_count, 0, sizeof(primary_count)); + memset(size, 0, sizeof(size)); + + for (int i=0; i<n; i++) { + osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP); + //if (i<12) + osdmap.set_weight(i, CEPH_OSD_IN); + } + + //pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(0); + //p->type = pg_pool_t::TYPE_ERASURE; + + for (int n = 0; n < 10; n++) { // namespaces + char nspace[20]; + snprintf(nspace, sizeof(nspace), "n%d", n); + for (int f = 0; f < 5000; f++) { // files + for (int b = 0; b < 4; b++) { // blocks + char foo[20]; + snprintf(foo, sizeof(foo), "%d.%d", f, b); + object_t oid(foo); + ceph_object_layout l = osdmap.make_object_layout(oid, 0, nspace); + vector<int> osds; + pg_t pgid = pg_t(l.ol_pgid); + //pgid.u.ps = f * 4 + b; + int primary; + osdmap.pg_to_acting_osds(pgid, &osds, &primary); + size[osds.size()]++; +#if 0 + if (0) { + hash<object_t> H; + int x = H(oid); + x = ceph_stable_mod(x, 1023, 1023); + int s = crush_hash32(x) % 15; + //cout << "ceph_psim: x = " << x << " s = " << s << std::endl; + //osds[0] = s; + } +#endif + //osds[0] = crush_hash32(f) % n; + //cout << "oid " << oid << " pgid " << pgid << " on " << osds << std::endl; + for (unsigned i=0; i<osds.size(); i++) { + //cout << " rep " << i << " on " << osds[i] << std::endl; + count[osds[i]]++; + } + if (osds.size()) + first_count[osds[0]]++; + if (primary >= 0) + primary_count[primary]++; + } + } + } + + uint64_t avg = 0; + for (int i=0; i<n; i++) { + cout << "osd." 
<< i << "\t" << count[i] + << "\t" << first_count[i] + << "\t" << primary_count[i] + << std::endl; + avg += count[i]; + } + avg /= n; + double dev = 0; + for (int i=0; i<n; i++) + dev += (avg - count[i]) * (avg - count[i]); + dev /= n; + dev = sqrt(dev); + + double pgavg = (double)osdmap.get_pg_pool(0)->get_pg_num() / (double)n; + double edev = sqrt(pgavg) * (double)avg / pgavg; + cout << " avg " << avg + << " stddev " << dev + << " (expected " << edev << ")" + << " (indep object placement would be " << sqrt(avg) << ")" << std::endl; + + for (int i=0; i<4; i++) { + cout << "size" << i << "\t" << size[i] << std::endl; + } + + return 0; +} diff --git a/src/tools/rados/PoolDump.cc b/src/tools/rados/PoolDump.cc new file mode 100644 index 00000000..9bfafa10 --- /dev/null +++ b/src/tools/rados/PoolDump.cc @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/rados/librados.hpp" +#include "common/errno.h" + +#include "PoolDump.h" + +using namespace librados; + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rados + +/** + * Export RADOS objects from a live cluster + * to a serialized format via a file descriptor. 
+ * + * @returns 0 on success, else error code + */ +int PoolDump::dump(IoCtx *io_ctx) +{ + ceph_assert(io_ctx != NULL); + + int r = 0; + write_super(); + + r = write_simple(TYPE_POOL_BEGIN, file_fd); + if (r != 0) { + return r; + } + + io_ctx->set_namespace(all_nspaces); + librados::NObjectIterator i = io_ctx->nobjects_begin(); + + librados::NObjectIterator i_end = io_ctx->nobjects_end(); + for (; i != i_end; ++i) { + const std::string oid = i->get_oid(); + dout(10) << "OID '" << oid << "'" << dendl; + + // Compose OBJECT_BEGIN + // ==================== + object_begin obj_begin; + obj_begin.hoid.hobj.oid = i->get_oid(); + obj_begin.hoid.hobj.nspace = i->get_nspace(); + obj_begin.hoid.hobj.set_key(i->get_locator()); + + // Only output head, RadosImport only wants that + obj_begin.hoid.hobj.snap = CEPH_NOSNAP; + + // Skip setting object_begin.oi, RadosImport doesn't care + + r = write_section(TYPE_OBJECT_BEGIN, obj_begin, file_fd); + if (r != 0) { + return r; + } + + // Compose TYPE_DATA chunks + // ======================== + const uint32_t op_size = 4096 * 1024; + uint64_t offset = 0; + io_ctx->set_namespace(i->get_nspace()); + io_ctx->locator_set_key(i->get_locator()); + while (true) { + bufferlist outdata; + r = io_ctx->read(oid, outdata, op_size, offset); + if (r <= 0) { + // Error or no data + break; + } + + r = write_section(TYPE_DATA, + data_section(offset, outdata.length(), outdata), file_fd); + if (r != 0) { + // Output stream error + return r; + } + + if (outdata.length() < op_size) { + // No more data + break; + } + offset += outdata.length(); + } + + // Compose TYPE_ATTRS chunk + // ======================== + std::map<std::string, bufferlist> raw_xattrs; + std::map<std::string, bufferlist> xattrs; + r = io_ctx->getxattrs(oid, raw_xattrs); + if (r < 0) { + cerr << "error getting xattr set " << oid << ": " << cpp_strerror(r) + << std::endl; + return r; + } + // Prepend "_" to mimic how user keys are represented in a pg export + for (std::map<std::string, 
bufferlist>::iterator i = raw_xattrs.begin(); + i != raw_xattrs.end(); ++i) { + std::pair< std::string, bufferlist> item(std::string("_") + std::string(i->first.c_str()), i->second); + xattrs.insert(item); + } + r = write_section(TYPE_ATTRS, attr_section(xattrs), file_fd); + if (r != 0) { + return r; + } + + // Compose TYPE_OMAP_HDR section + // ============================= + bufferlist omap_header; + r = io_ctx->omap_get_header(oid, &omap_header); + if (r < 0) { + cerr << "error getting omap header " << oid + << ": " << cpp_strerror(r) << std::endl; + return r; + } + r = write_section(TYPE_OMAP_HDR, omap_hdr_section(omap_header), file_fd); + if (r != 0) { + return r; + } + + // Compose TYPE_OMAP + int MAX_READ = 512; + string last_read = ""; + do { + map<string, bufferlist> values; + r = io_ctx->omap_get_vals(oid, last_read, MAX_READ, &values); + if (r < 0) { + cerr << "error getting omap keys " << oid << ": " + << cpp_strerror(r) << std::endl; + return r; + } + if (values.size()) { + last_read = values.rbegin()->first; + } else { + break; + } + + r = write_section(TYPE_OMAP, omap_section(values), file_fd); + if (r != 0) { + return r; + } + r = values.size(); + } while (r == MAX_READ); + + // Close object + // ============= + r = write_simple(TYPE_OBJECT_END, file_fd); + if (r != 0) { + return r; + } + } + + r = write_simple(TYPE_POOL_END, file_fd); +#if defined(__linux__) + if (file_fd != STDOUT_FILENO) + posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + return r; +} diff --git a/src/tools/rados/PoolDump.h b/src/tools/rados/PoolDump.h new file mode 100644 index 00000000..33abd886 --- /dev/null +++ b/src/tools/rados/PoolDump.h @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * 
License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef POOL_DUMP_H_ +#define POOL_DUMP_H_ + +#include "include/rados/librados_fwd.hpp" +#include "tools/RadosDump.h" + +class PoolDump : public RadosDump +{ + public: + explicit PoolDump(int file_fd_) : RadosDump(file_fd_, false) {} + int dump(librados::IoCtx *io_ctx); +}; + +#endif // POOL_DUMP_H_ diff --git a/src/tools/rados/RadosImport.cc b/src/tools/rados/RadosImport.cc new file mode 100644 index 00000000..0a901b70 --- /dev/null +++ b/src/tools/rados/RadosImport.cc @@ -0,0 +1,399 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "common/errno.h" + +#include "osd/PGLog.h" +#include "RadosImport.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rados + +int RadosImport::import(std::string pool, bool no_overwrite) +{ + librados::IoCtx ioctx; + librados::Rados cluster; + + char *id = getenv("CEPH_CLIENT_ID"); + if (id) cerr << "Client id is: " << id << std::endl; + int ret = cluster.init(id); + if (ret) { + cerr << "Error " << ret << " in cluster.init" << std::endl; + return ret; + } + ret = cluster.conf_read_file(NULL); + if (ret) { + cerr << "Error " << ret << " in cluster.conf_read_file" << std::endl; + return ret; + } + ret = cluster.conf_parse_env(NULL); + if (ret) { + cerr << "Error " << ret << " in cluster.conf_read_env" << std::endl; + return ret; + } + ret = cluster.connect(); + if (ret) { + cerr << "Error " << ret << " in cluster.connect" << std::endl; + return ret; + } + + ret = cluster.ioctx_create(pool.c_str(), ioctx); + if (ret < 0) { + cerr << 
"ioctx_create " << pool << " failed with " << ret << std::endl; + return ret; + } + + return import(ioctx, no_overwrite); +} + +int RadosImport::import(librados::IoCtx &io_ctx, bool no_overwrite) +{ + bufferlist ebl; + pg_info_t info; + PGLog::IndexedLog log; + + int ret = read_super(); + if (ret) + return ret; + + if (sh.magic != super_header::super_magic) { + cerr << "Invalid magic number: 0x" + << std::hex << sh.magic << " vs. 0x" << super_header::super_magic + << std::dec << std::endl; + return -EFAULT; + } + + if (sh.version > super_header::super_ver) { + cerr << "Can't handle export format version=" << sh.version << std::endl; + return -EINVAL; + } + + //First section must be TYPE_PG_BEGIN + sectiontype_t type; + ret = read_section(&type, &ebl); + if (ret) + return ret; + + bool pool_mode = false; + if (type == TYPE_POOL_BEGIN) { + pool_mode = true; + cout << "Importing pool" << std::endl; + } else if (type == TYPE_PG_BEGIN) { + auto ebliter = ebl.cbegin(); + pg_begin pgb; + pgb.decode(ebliter); + spg_t pgid = pgb.pgid;; + if (!pgid.is_no_shard()) { + cerr << "Importing Erasure Coded shard is not supported" << std::endl; + return -EOPNOTSUPP; + } + dout(10) << "Exported features: " << pgb.superblock.compat_features << dendl; + cout << "Importing from pgid " << pgid << std::endl; + } else { + cerr << "Invalid initial section code " << type << std::endl; + return -EFAULT; + } + + // XXX: How to check export features? 
+#if 0 + if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) { + cerr << "Export has incompatible features set " + << pgb.superblock.compat_features << std::endl; + return -EINVAL; + } +#endif + +#if defined(__linux__) + if (file_fd != STDIN_FILENO) + posix_fadvise(file_fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + + bool done = false; + bool found_metadata = false; + while(!done) { + ret = read_section(&type, &ebl); + if (ret) + return ret; + + //cout << "do_import: Section type " << hex << type << dec << std::endl; + if (type >= END_OF_TYPES) { + cout << "Skipping unknown section type" << std::endl; + continue; + } + switch(type) { + case TYPE_OBJECT_BEGIN: + ret = get_object_rados(io_ctx, ebl, no_overwrite); + if (ret) { + cerr << "Error inserting object: " << ret << std::endl; + return ret; + } + break; + case TYPE_PG_METADATA: + dout(10) << "Don't care about the old metadata" << dendl; + found_metadata = true; + break; + case TYPE_PG_END: + done = true; + break; + case TYPE_POOL_END: + done = true; + break; + default: + return -EFAULT; + } + } + + if (!(pool_mode || found_metadata)) { + cerr << "Missing metadata section!" 
<< std::endl; + } + +#if defined(__linux__) + if (file_fd != STDIN_FILENO) + posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + return 0; +} + +int RadosImport::get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite) +{ + auto ebliter = bl.cbegin(); + object_begin ob; + ob.decode(ebliter); + map<string,bufferlist>::iterator i; + bufferlist abl; + bool skipping; + + data_section ds; + attr_section as; + omap_hdr_section oh; + omap_section os; + + ceph_assert(g_ceph_context); + if (ob.hoid.hobj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) { + cout << "Skipping internal object " << ob.hoid << std::endl; + skip_object(bl); + return 0; + } + + if (!ob.hoid.hobj.is_head()) { + cout << "Skipping non-head for " << ob.hoid << std::endl; + skip_object(bl); + return 0; + } + + ioctx.set_namespace(ob.hoid.hobj.get_namespace()); + ioctx.locator_set_key(ob.hoid.hobj.get_key()); + + string msg("Write"); + skipping = false; + if (dry_run) { + uint64_t psize; + time_t pmtime; + int ret = ioctx.stat(ob.hoid.hobj.oid.name, &psize, &pmtime); + if (ret == 0) { + if (no_overwrite) + // Could set skipping, but dry-run doesn't change anything either + msg = "Skipping existing"; + else + msg = "***Overwrite***"; + } + } else { + int ret = ioctx.create(ob.hoid.hobj.oid.name, true); + if (ret && ret != -EEXIST) { + cerr << "create failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + if (ret == -EEXIST) { + if (no_overwrite) { + msg = "Skipping existing"; + skipping = true; + } else { + msg = "***Overwrite***"; + ret = ioctx.remove(ob.hoid.hobj.oid.name); + if (ret < 0) { + cerr << "remove failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = ioctx.create(ob.hoid.hobj.oid.name, true); + // If object re-appeared after removal, let's just skip it + if (ret == -EEXIST) { + skipping = true; + msg = "Skipping in-use object"; + ret = 0; + } + if (ret < 0) { + cerr << "create failed: " << cpp_strerror(ret) << std::endl; + return 
ret; + } + } + } + } + + cout << msg << " " << ob.hoid << std::endl; + + bool need_align = false; + uint64_t alignment = 0; + if (align) { + need_align = true; + alignment = align; + } else { + int ret = ioctx.pool_requires_alignment2(&need_align); + if (ret < 0) { + cerr << "pool_requires_alignment2 failed: " << cpp_strerror(ret) + << std::endl; + return ret; + } + + if (need_align) { + ret = ioctx.pool_required_alignment2(&alignment); + if (ret < 0) { + cerr << "pool_required_alignment2 failed: " << cpp_strerror(ret) + << std::endl; + return ret; + } + ceph_assert(alignment != 0); + } + } + + if (need_align) { + dout(10) << "alignment = " << alignment << dendl; + } + + bufferlist ebl, databl; + uint64_t in_offset = 0, out_offset = 0; + bool done = false; + while(!done) { + sectiontype_t type; + int ret = read_section(&type, &ebl); + if (ret) { + cerr << "Error reading section: " << ret << std::endl; + return ret; + } + + ebliter = ebl.cbegin(); + //cout << "\tdo_object: Section type " << hex << type << dec << std::endl; + //cout << "\t\tsection size " << ebl.length() << std::endl; + if (type >= END_OF_TYPES) { + cout << "Skipping unknown object section type" << std::endl; + continue; + } + switch(type) { + case TYPE_DATA: + ds.decode(ebliter); + dout(10) << "\tdata: offset " << ds.offset << " len " << ds.len << dendl; + if (need_align) { + if (ds.offset != in_offset) { + cerr << "Discontiguous object data in export" << std::endl; + return -EFAULT; + } + ceph_assert(ds.databl.length() == ds.len); + databl.claim_append(ds.databl); + in_offset += ds.len; + if (databl.length() >= alignment) { + uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment; + dout(10) << "write offset=" << out_offset << " len=" << rndlen << dendl; + if (!dry_run && !skipping) { + ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset); + if (ret) { + cerr << "write failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + } + out_offset += rndlen; + 
bufferlist n; + if (databl.length() > rndlen) { + ceph_assert(databl.length() - rndlen < alignment); + n.substr_of(databl, rndlen, databl.length() - rndlen); + } + databl = n; + } + break; + } + if (!dry_run && !skipping) { + ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset); + if (ret) { + cerr << "write failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + } + break; + case TYPE_ATTRS: + as.decode(ebliter); + + dout(10) << "\tattrs: len " << as.data.size() << dendl; + if (dry_run || skipping) + break; + for (std::map<string,bufferlist>::iterator i = as.data.begin(); + i != as.data.end(); ++i) { + // The user xattrs that we want all begin with "_" with length > 1. + // Drop key "_" and all attributes that do not start with '_' + if (i->first == "_" || i->first[0] != '_') + continue; + ret = ioctx.setxattr(ob.hoid.hobj.oid.name, i->first.substr(1).c_str(), i->second); + if (ret) { + cerr << "setxattr failed: " << cpp_strerror(ret) << std::endl; + if (ret != -EOPNOTSUPP) + return ret; + } + } + break; + case TYPE_OMAP_HDR: + oh.decode(ebliter); + + dout(10) << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length()) + << dendl; + if (dry_run || skipping) + break; + ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr); + if (ret) { + cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl; + if (ret != -EOPNOTSUPP) + return ret; + } + break; + case TYPE_OMAP: + os.decode(ebliter); + + dout(10) << "\tomap: size " << os.omap.size() << dendl; + if (dry_run || skipping) + break; + ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap); + if (ret) { + cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl; + if (ret != -EOPNOTSUPP) + return ret; + } + break; + case TYPE_OBJECT_END: + done = true; + if (need_align && databl.length() > 0) { + ceph_assert(databl.length() < alignment); + dout(10) << "END write offset=" << out_offset << " len=" << databl.length() << dendl; + if (dry_run || skipping) + break; + ret = 
ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset); + if (ret) { + cerr << "write failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + } + break; + default: + cerr << "Unexpected section type " << type << std::endl; + return -EFAULT; + } + } + return 0; +} diff --git a/src/tools/rados/RadosImport.h b/src/tools/rados/RadosImport.h new file mode 100644 index 00000000..3a516630 --- /dev/null +++ b/src/tools/rados/RadosImport.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RADOS_IMPORT_H_ +#define RADOS_IMPORT_H_ + +#include <string> + +#include "include/rados/librados.hpp" +#include "include/buffer_fwd.h" + +#include "tools/RadosDump.h" + +/** + * Specialization of RadosDump that adds + * methods for importing objects from a stream + * to a live cluster. 
+ */ +class RadosImport : public RadosDump +{ + protected: + uint64_t align; + int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite); + + public: + RadosImport(int file_fd_, uint64_t align_, bool dry_run_) + : RadosDump(file_fd_, dry_run_), align(align_) + {} + + int import(std::string pool, bool no_overwrite); + int import(librados::IoCtx &io_ctx, bool no_overwrite); +}; + +#endif // RADOS_IMPORT_H_ diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc new file mode 100644 index 00000000..280a51dd --- /dev/null +++ b/src/tools/rados/rados.cc @@ -0,0 +1,4135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/types.h" + +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rados/rados_types.hpp" + +#include "acconfig.h" +#ifdef WITH_LIBRADOSSTRIPER + #include "include/radosstriper/libradosstriper.hpp" + using namespace libradosstriper; +#endif + +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/Cond.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/obj_bencher.h" +#include "common/TextTable.h" +#include "include/stringify.h" +#include "mds/inode_backtrace.h" +#include "include/random.h" +#include <iostream> +#include <fstream> + +#include <stdlib.h> +#include <time.h> +#include <sstream> +#include <errno.h> +#include <dirent.h> +#include <stdexcept> +#include <climits> +#include <locale> +#include <memory> +#include <optional> + +#include "cls/lock/cls_lock_client.h" +#include "include/compat.h" +#include "include/util.h" +#include "common/hobject.h" + +#include "PoolDump.h" +#include "RadosImport.h" + +#include "osd/ECUtil.h" + +using namespace librados; +using ceph::util::generate_random_number; + +// two steps seem to be necessary to do this right +#define STR(x) _STR(x) +#define _STR(x) #x + +void usage(ostream& out) +{ + out << \ +"usage: rados [options] [commands]\n" +"POOL COMMANDS\n" +" lspools list pools\n" +" cppool <pool-name> <dest-pool> copy content of a pool\n" +" purge <pool-name> --yes-i-really-really-mean-it\n" +" remove all objects from pool <pool-name> without removing it\n" +" df show per-pool and total usage\n" +" ls list objects in pool\n\n" +"\n" +"POOL SNAP COMMANDS\n" +" lssnap list snaps\n" +" mksnap <snap-name> create snap <snap-name>\n" +" rmsnap <snap-name> remove snap <snap-name>\n" +"\n" +"OBJECT COMMANDS\n" +" get <obj-name> <outfile> fetch object\n" +" put <obj-name> <infile> [--offset offset]\n" +" write object with start offset (default:0)\n" +" 
append <obj-name> <infile> append object\n" +" truncate <obj-name> length truncate object\n" +" create <obj-name> create object\n" +" rm <obj-name> ...[--force-full] [force no matter full or not]remove object(s)\n" +" cp <obj-name> [target-obj] copy object\n" +" listxattr <obj-name>\n" +" getxattr <obj-name> attr\n" +" setxattr <obj-name> attr val\n" +" rmxattr <obj-name> attr\n" +" stat <obj-name> stat the named object\n" +" stat2 <obj-name> stat2 the named object (with high precision time)\n" +" touch <obj-name> [timestamp] change the named object modification time\n" +" mapext <obj-name>\n" +" rollback <obj-name> <snap-name> roll back object to snap <snap-name>\n" +"\n" +" listsnaps <obj-name> list the snapshots of this object\n" +" bench <seconds> write|seq|rand [-t concurrent_operations] [--no-cleanup] [--run-name run_name] [--no-hints] [--reuse-bench]\n" +" default is 16 concurrent IOs and 4 MB ops\n" +" default is to clean up after write benchmark\n" +" default run-name is 'benchmark_last_metadata'\n" +" cleanup [--run-name run_name] [--prefix prefix]\n" +" clean up a previous benchmark operation\n" +" default run-name is 'benchmark_last_metadata'\n" +" load-gen [options] generate load on the cluster\n" +" listomapkeys <obj-name> list the keys in the object map\n" +" listomapvals <obj-name> list the keys and vals in the object map \n" +" getomapval <obj-name> <key> [file] show the value for the specified key\n" +" in the object's object map\n" +" setomapval <obj-name> <key> <val>\n" +" rmomapkey <obj-name> <key>\n" +" clearomap <obj-name> [obj-name2 obj-name3...] 
clear all the omap keys for the specified objects\n" +" getomapheader <obj-name> [file]\n" +" setomapheader <obj-name> <val>\n" +" watch <obj-name> add watcher on this object\n" +" notify <obj-name> <message> notify watcher of this object with message\n" +" listwatchers <obj-name> list the watchers of this object\n" +" set-alloc-hint <obj-name> <expected-object-size> <expected-write-size>\n" +" set allocation hint for an object\n" +" set-redirect <object A> --target-pool <caspool> <target object A> [--with-reference]\n" +" set redirect target\n" +" set-chunk <object A> <offset> <length> --target-pool <caspool> <target object A> <taget-offset> [--with-reference]\n" +" convert an object to chunked object\n" +" tier-promote <obj-name> promote the object to the base tier\n" +" unset-manifest <obj-name> unset redirect or chunked object\n" +"\n" +"IMPORT AND EXPORT\n" +" export [filename]\n" +" Serialize pool contents to a file or standard out.\n" +" import [--dry-run] [--no-overwrite] < filename | - >\n" +" Load pool contents from a file or standard in\n" +"\n" +"ADVISORY LOCKS\n" +" lock list <obj-name>\n" +" List all advisory locks on an object\n" +" lock get <obj-name> <lock-name>\n" +" Try to acquire a lock\n" +" lock break <obj-name> <lock-name> <locker-name>\n" +" Try to break a lock acquired by another client\n" +" lock info <obj-name> <lock-name>\n" +" Show lock information\n" +" options:\n" +" --lock-tag Lock tag, all locks operation should use\n" +" the same tag\n" +" --lock-cookie Locker cookie\n" +" --lock-description Description of lock\n" +" --lock-duration Lock duration (in seconds)\n" +" --lock-type Lock type (shared, exclusive)\n" +"\n" +"SCRUB AND REPAIR:\n" +" list-inconsistent-pg <pool> list inconsistent PGs in given pool\n" +" list-inconsistent-obj <pgid> list inconsistent objects in given PG\n" +" list-inconsistent-snapset <pgid> list inconsistent snapsets in the given PG\n" +"\n" +"CACHE POOLS: (for testing/development only)\n" +" cache-flush 
<obj-name> flush cache pool object (blocking)\n" +" cache-try-flush <obj-name> flush cache pool object (non-blocking)\n" +" cache-evict <obj-name> evict cache pool object\n" +" cache-flush-evict-all flush+evict all objects\n" +" cache-try-flush-evict-all try-flush+evict all objects\n" +"\n" +"GLOBAL OPTIONS:\n" +" --object_locator object_locator\n" +" set object_locator for operation\n" +" -p pool\n" +" --pool=pool\n" +" select given pool by name\n" +" --target-pool=pool\n" +" select target pool by name\n" +" --pgid PG id\n" +" select given PG id\n" +" -f [--format plain|json|json-pretty]\n" +" --format=[--format plain|json|json-pretty]\n" +" -b op_size\n" +" set the block size for put/get ops and for write benchmarking\n" +" -O object_size\n" +" set the object size for put/get ops and for write benchmarking\n" +" --max-objects\n" +" set the max number of objects for write benchmarking\n" +" --obj-name-file file\n" +" use the content of the specified file in place of <obj-name>\n" +" -s name\n" +" --snap name\n" +" select given snap name for (read) IO\n" +" --create\n" +" create the pool or directory that was specified\n" +" -N namespace\n" +" --namespace=namespace\n" +" specify the namespace to use for the object\n" +" --all\n" +" Use with ls to list objects in all namespaces\n" +" Put in CEPH_ARGS environment variable to make this the default\n" +" --default\n" +" Use with ls to list objects in default namespace\n" +" Takes precedence over --all in case --all is in environment\n" +" --target-locator\n" +" Use with cp to specify the locator of the new object\n" +" --target-nspace\n" +" Use with cp to specify the namespace of the new object\n" +#ifdef WITH_LIBRADOSSTRIPER +" --striper\n" +" Use radostriper interface rather than pure rados\n" +" Available for stat, get, put, truncate, rm, ls and \n" +" all xattr related operations\n" +#endif +"\n" +"BENCH OPTIONS:\n" +" -t N\n" +" --concurrent-ios=N\n" +" Set number of concurrent I/O operations\n" +" --show-time\n" 
+" prefix output with date/time\n" +" --no-verify\n" +" do not verify contents of read objects\n" +" --write-object\n" +" write contents to the objects\n" +" --write-omap\n" +" write contents to the omap\n" +" --write-xattr\n" +" write contents to the extended attributes\n" +"\n" +"LOAD GEN OPTIONS:\n" +" --num-objects total number of objects\n" +" --min-object-size min object size\n" +" --max-object-size max object size\n" +" --min-op-len min io size of operations\n" +" --max-op-len max io size of operations\n" +" --max-ops max number of operations\n" +" --max-backlog max backlog size\n" +" --read-percent percent of operations that are read\n" +" --target-throughput target throughput (in bytes)\n" +" --run-length total time (in seconds)\n" +" --offset-align at what boundary to align random op offsets" +"CACHE POOLS OPTIONS:\n" +" --with-clones include clones when doing flush or evict\n" +"OMAP OPTIONS:\n" +" --omap-key-file file read the omap key from a file\n"; +} + +namespace detail { + +#ifdef WITH_LIBRADOSSTRIPER +RadosStriper& striper() +{ + static RadosStriper s; + return s; +} +#endif + +int read([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& out_data, const unsigned op_size, const uint64_t offset, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().read(oid, &out_data, op_size, offset); +#endif + + return io_ctx.read(oid, out_data, op_size, offset); +} + +int write([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& indata, const uint64_t count, const uint64_t offset, [[maybe_unused]] const bool use_striper) +{ + #ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().write(oid, indata, count, offset); +#endif + + return io_ctx.write(oid, indata, count, offset); +} + +int write_full([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, bufferlist& indata, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) 
+ return striper().write_full(oid, indata); +#endif + + return io_ctx.write_full(oid, indata); +} + +int trunc([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const uint64_t offset, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().trunc(oid, offset); +#endif + + return io_ctx.trunc(oid, offset); +} + +int append([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& indata, const uint64_t count, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().append(oid, indata, count); +#endif + + return io_ctx.append(oid, indata, count); +} + +int setxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, buffer::list& bl, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().setxattr(oid, attr_name.c_str(), bl); +#endif + + return io_ctx.setxattr(oid, attr_name.c_str(), bl); +} + +int getxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, buffer::list& bl, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().getxattr(oid, attr_name.c_str(), bl); +#endif + + return io_ctx.getxattr(oid, attr_name.c_str(), bl); +} + +int rmxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().rmxattr(oid, attr_name.c_str()); +#endif + + return io_ctx.rmxattr(oid, attr_name.c_str()); +} + +int getxattrs([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, std::map<std::string, buffer::list>& attrset, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().getxattrs(oid, attrset); +#endif + + return io_ctx.getxattrs(oid, attrset); +} + +int remove([[maybe_unused]] 
IoCtx& io_ctx, const std::string& oid, const int flags, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().remove(oid, flags); +#endif + + return io_ctx.remove(oid, flags); +} + +int remove([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().remove(oid); +#endif + + return io_ctx.remove(oid); +} + +std::string get_oid(librados::NObjectIterator& i, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return i->get_oid().substr(0, i->get_oid().length()-17); +#endif + + return i->get_oid(); +} + +int stat([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, uint64_t& size, time_t& mtime, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().stat(oid, &size, &mtime); +#endif + + return io_ctx.stat(oid, &size, &mtime); +} + +int stat2([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, uint64_t& size, timespec& mtime, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) + return striper().stat2(oid, &size, &mtime); +#endif + + return io_ctx.stat2(oid, &size, &mtime); +} + +void dump_name(Formatter *formatter, const librados::NObjectIterator& i, [[maybe_unused]] const bool use_striper) +{ +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) { + formatter->dump_string("name", i->get_oid().substr(0, i->get_oid().length()-17)); + return; + } +#endif + + formatter->dump_string("name", i->get_oid()); +} + +} // namespace detail + +unsigned default_op_size = 1 << 22; + +[[noreturn]] static void usage_exit() +{ + usage(cerr); + exit(1); +} + + +template <typename I, typename T> +static int rados_sistrtoll(I &i, T *val) { + std::string err; + *val = strict_iecstrtoll(i->second.c_str(), &err); + if (err != "") { + cerr << "Invalid value for " << i->first << ": " << err << 
std::endl; + return -EINVAL; + } else { + return 0; + } +} + + +static int dump_data(std::string const &filename, bufferlist const &data) +{ + int fd; + if (filename == "-") { + fd = STDOUT_FILENO; + } else { + fd = TEMP_FAILURE_RETRY(::open(filename.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0644)); + if (fd < 0) { + int err = errno; + cerr << "failed to open file: " << cpp_strerror(err) << std::endl; + return -err; + } + } + + int r = data.write_fd(fd); + + if (fd != 1) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + + return r; +} + + +static int do_get(IoCtx& io_ctx, const std::string& oid, const char *outfile, unsigned op_size, [[maybe_unused]] const bool use_striper) +{ + int fd; + if (strcmp(outfile, "-") == 0) { + fd = STDOUT_FILENO; + } else { + fd = TEMP_FAILURE_RETRY(::open(outfile, O_WRONLY|O_CREAT|O_TRUNC, 0644)); + if (fd < 0) { + int err = errno; + cerr << "failed to open file: " << cpp_strerror(err) << std::endl; + return -err; + } + } + + uint64_t offset = 0; + int ret; + while (true) { + bufferlist outdata; + + ret = detail::read(io_ctx, oid, outdata, op_size, offset, use_striper); + if (ret <= 0) { + goto out; + } + ret = outdata.write_fd(fd); + if (ret < 0) { + cerr << "error writing to file: " << cpp_strerror(ret) << std::endl; + goto out; + } + if (outdata.length() < op_size) + break; + offset += outdata.length(); + } + ret = 0; + + out: + if (fd != 1) + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; +} + +static int do_copy(IoCtx& io_ctx, const char *objname, + IoCtx& target_ctx, const char *target_obj) +{ + __le32 src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + __le32 dest_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_DONTNEED; + ObjectWriteOperation op; + op.copy_from(objname, io_ctx, 0, src_fadvise_flags); + op.set_op_flags2(dest_fadvise_flags); + + return target_ctx.operate(target_obj, &op); +} + +static int do_copy_pool(Rados& rados, const char *src_pool, const 
char *target_pool) +{ + IoCtx src_ctx, target_ctx; + int ret = rados.ioctx_create(src_pool, src_ctx); + if (ret < 0) { + cerr << "cannot open source pool: " << src_pool << std::endl; + return ret; + } + ret = rados.ioctx_create(target_pool, target_ctx); + if (ret < 0) { + cerr << "cannot open target pool: " << target_pool << std::endl; + return ret; + } + src_ctx.set_namespace(all_nspaces); + librados::NObjectIterator i = src_ctx.nobjects_begin(); + librados::NObjectIterator i_end = src_ctx.nobjects_end(); + for (; i != i_end; ++i) { + string nspace = i->get_nspace(); + string oid = i->get_oid(); + string locator = i->get_locator(); + + string target_name = (nspace.size() ? nspace + "/" : "") + oid; + string src_name = target_name; + if (locator.size()) + src_name += "(@" + locator + ")"; + cout << src_pool << ":" << src_name << " => " + << target_pool << ":" << target_name << std::endl; + + src_ctx.locator_set_key(locator); + src_ctx.set_namespace(nspace); + target_ctx.set_namespace(nspace); + ret = do_copy(src_ctx, oid.c_str(), target_ctx, oid.c_str()); + if (ret < 0) { + cerr << "error copying object: " << cpp_strerror(errno) << std::endl; + return ret; + } + } + + return 0; +} + +static int do_put(IoCtx& io_ctx, + const std::string& oid, const char *infile, int op_size, + uint64_t obj_offset, + const bool use_striper) +{ + bool stdio = (strcmp(infile, "-") == 0); + int ret = 0; + int fd = STDIN_FILENO; + if (!stdio) + fd = open(infile, O_RDONLY); + if (fd < 0) { + cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl; + return 1; + } + int count = op_size; + uint64_t offset = obj_offset; + while (count != 0) { + bufferlist indata; + count = indata.read_fd(fd, op_size); + if (count < 0) { + ret = -errno; + cerr << "error reading input file " << infile << ": " << cpp_strerror(ret) << std::endl; + goto out; + } + + if (count == 0) { + if (offset == obj_offset) { // in case we have to create an empty object & if obj_offset > 0 do 
a hole + ret = detail::write_full(io_ctx, oid, indata, use_striper); // indata is empty + + if (ret < 0) { + goto out; + } + + if (offset) { + ret = detail::trunc(io_ctx, oid, offset, use_striper); // before truncate, object must be existed. + + if (ret < 0) { + goto out; + } + } + } + continue; + } + + if (0 == offset) + ret = detail::write_full(io_ctx, oid, indata, use_striper); + else + ret = detail::write(io_ctx, oid, indata, count, offset, use_striper); + + if (ret < 0) { + goto out; + } + offset += count; + } + ret = 0; + out: + if (fd != STDOUT_FILENO) + VOID_TEMP_FAILURE_RETRY(close(fd)); + return ret; +} + +static int do_append(IoCtx& io_ctx, + const std::string& oid, const char *infile, int op_size, + const bool use_striper) +{ + bool stdio = (strcmp(infile, "-") == 0); + int ret = 0; + int fd = STDIN_FILENO; + if (!stdio) + fd = open(infile, O_RDONLY); + if (fd < 0) { + cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl; + return 1; + } + int count = op_size; + while (count != 0) { + bufferlist indata; + count = indata.read_fd(fd, op_size); + if (count < 0) { + ret = -errno; + cerr << "error reading input file " << infile << ": " << cpp_strerror(ret) << std::endl; + goto out; + } + ret = detail::append(io_ctx, oid, indata, count, use_striper); + + if (ret < 0) { + goto out; + } + } + ret = 0; +out: + if (fd != STDOUT_FILENO) + VOID_TEMP_FAILURE_RETRY(close(fd)); + return ret; +} + +class RadosWatchCtx : public librados::WatchCtx2 { + IoCtx& ioctx; + string name; +public: + RadosWatchCtx(IoCtx& io, const char *imgname) : ioctx(io), name(imgname) {} + ~RadosWatchCtx() override {} + void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) override { + cout << "NOTIFY" + << " cookie " << cookie + << " notify_id " << notify_id + << " from " << notifier_id + << std::endl; + bl.hexdump(cout); + ioctx.notify_ack(name, notify_id, cookie, bl); + } + void handle_error(uint64_t cookie, 
int err) override { + cout << "ERROR" + << " cookie " << cookie + << " err " << cpp_strerror(err) + << std::endl; + } +}; + +static const char alphanum_table[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + +void gen_rand_alphanumeric(char *dest, int size) /* size should be the required string size + 1 */ +{ + const int max = sizeof(alphanum_table) - 2; + + int i; + for (i=0; i<size - 1; i++) { + int pos = generate_random_number(0, max); + dest[i] = alphanum_table[pos]; + } + dest[i] = '\0'; +} + +struct obj_info { + string name; + size_t len; +}; + +class LoadGen { + size_t total_sent; + size_t total_completed; + + IoCtx io_ctx; + Rados *rados; + + map<int, obj_info> objs; + + utime_t start_time; + + bool going_down; + +public: + int read_percent; + int num_objs; + size_t min_obj_len; + size_t max_obj_len; + size_t min_op_len; + size_t max_op_len; + size_t max_ops; + size_t max_backlog; + size_t target_throughput; + size_t offset_align = 0; + int run_length; + + enum { + OP_READ, + OP_WRITE, + }; + + struct LoadGenOp { + int id; + int type; + string oid; + size_t off; + size_t len; + bufferlist bl; + LoadGen *lg; + librados::AioCompletion *completion; + + LoadGenOp() : id(0), type(0), off(0), len(0), lg(NULL), completion(NULL) {} + explicit LoadGenOp(LoadGen *_lg) : id(0), type(0), off(0), len(0), lg(_lg), completion(NULL) {} + }; + + int max_op; + + map<int, LoadGenOp *> pending_ops; + + void gen_op(LoadGenOp *op); + uint64_t gen_next_op(); + void run_op(LoadGenOp *op); + + uint64_t cur_sent_rate() { + return total_sent / time_passed(); + } + + uint64_t cur_completed_rate() { + return total_completed / time_passed(); + } + + uint64_t total_expected() { + return target_throughput * time_passed(); + } + + float time_passed() { + utime_t now = ceph_clock_now(); + now -= start_time; + uint64_t ns = now.nsec(); + float total = (float) ns / 1000000000.0; + total += now.sec(); + return total; + } + + Mutex lock; + Cond cond; + + explicit 
LoadGen(Rados *_rados) : rados(_rados), going_down(false), lock("LoadGen") { + read_percent = 80; + min_obj_len = 1024; + max_obj_len = 5ull * 1024ull * 1024ull * 1024ull; + min_op_len = 1024; + target_throughput = 5 * 1024 * 1024; // B/sec + max_op_len = 2 * 1024 * 1024; + max_ops = 16; + max_backlog = target_throughput * 2; + run_length = 60; + + total_sent = 0; + total_completed = 0; + num_objs = 200; + max_op = 0; + } + int bootstrap(const char *pool); + int run(); + void cleanup(); + + void io_cb(completion_t c, LoadGenOp *op) { + Mutex::Locker l(lock); + + total_completed += op->len; + + double rate = (double)cur_completed_rate() / (1024 * 1024); + std::streamsize original_precision = cout.precision(); + cout.precision(3); + cout << "op " << op->id << " completed, throughput=" << rate << "MB/sec" << std::endl; + cout.precision(original_precision); + + map<int, LoadGenOp *>::iterator iter = pending_ops.find(op->id); + if (iter != pending_ops.end()) + pending_ops.erase(iter); + + if (!going_down) + op->completion->release(); + + delete op; + + cond.Signal(); + } +}; + +static void _load_gen_cb(completion_t c, void *param) +{ + LoadGen::LoadGenOp *op = (LoadGen::LoadGenOp *)param; + op->lg->io_cb(c, op); +} + +int LoadGen::bootstrap(const char *pool) +{ + char buf[128]; + int i; + + if (!pool) { + cerr << "ERROR: pool name was not specified" << std::endl; + return -EINVAL; + } + + int ret = rados->ioctx_create(pool, io_ctx); + if (ret < 0) { + cerr << "error opening pool " << pool << ": " << cpp_strerror(ret) << std::endl; + return ret; + } + + int buf_len = 1; + bufferptr p = buffer::create(buf_len); + bufferlist bl; + memset(p.c_str(), 0, buf_len); + bl.push_back(p); + + list<librados::AioCompletion *> completions; + for (i = 0; i < num_objs; i++) { + obj_info info; + gen_rand_alphanumeric(buf, 16); + info.name = "obj-"; + info.name.append(buf); + info.len = generate_random_number(min_obj_len, max_obj_len); + + // throttle... 
+ while (completions.size() > max_ops) { + AioCompletion *c = completions.front(); + c->wait_for_complete(); + ret = c->get_return_value(); + c->release(); + completions.pop_front(); + if (ret < 0) { + cerr << "aio_write failed" << std::endl; + return ret; + } + } + + librados::AioCompletion *c = rados->aio_create_completion(NULL, NULL, NULL); + completions.push_back(c); + // generate object + ret = io_ctx.aio_write(info.name, c, bl, buf_len, info.len - buf_len); + if (ret < 0) { + cerr << "couldn't write obj: " << info.name << " ret=" << ret << std::endl; + return ret; + } + objs[i] = info; + } + + list<librados::AioCompletion *>::iterator iter; + for (iter = completions.begin(); iter != completions.end(); ++iter) { + AioCompletion *c = *iter; + c->wait_for_complete(); + ret = c->get_return_value(); + c->release(); + if (ret < 0) { // yes, we leak. + cerr << "aio_write failed" << std::endl; + return ret; + } + } + return 0; +} + +void LoadGen::run_op(LoadGenOp *op) +{ + op->completion = rados->aio_create_completion(op, _load_gen_cb, NULL); + + switch (op->type) { + case OP_READ: + io_ctx.aio_read(op->oid, op->completion, &op->bl, op->len, op->off); + break; + case OP_WRITE: + bufferptr p = buffer::create(op->len); + memset(p.c_str(), 0, op->len); + op->bl.push_back(p); + + io_ctx.aio_write(op->oid, op->completion, op->bl, op->len, op->off); + break; + } + + total_sent += op->len; +} + +void LoadGen::gen_op(LoadGenOp *op) +{ + int i = generate_random_number<int>(0, objs.size() - 1); + obj_info& info = objs[i]; + op->oid = info.name; + + size_t len = generate_random_number(min_op_len, max_op_len); + if (len > info.len) + len = info.len; + size_t off = generate_random_number<size_t>(0, info.len); + + if (off + len > info.len) + off = info.len - len; + + if (offset_align) + off = p2align(off, offset_align); + + op->off = off; + op->len = len; + + i = generate_random_number(1, 100); + if (i > read_percent) + op->type = OP_WRITE; + else + op->type = OP_READ; + + cout << 
(op->type == OP_READ ? "READ" : "WRITE") << " : oid=" << op->oid << " off=" << op->off << " len=" << op->len << std::endl; +} + +uint64_t LoadGen::gen_next_op() +{ + lock.Lock(); + + LoadGenOp *op = new LoadGenOp(this); + gen_op(op); + op->id = max_op++; + pending_ops[op->id] = op; + + lock.Unlock(); + + run_op(op); + + return op->len; +} + +int LoadGen::run() +{ + start_time = ceph_clock_now(); + utime_t end_time = start_time; + end_time += run_length; + utime_t stamp_time = start_time; + uint32_t total_sec = 0; + + while (1) { + lock.Lock(); + utime_t one_second(1, 0); + cond.WaitInterval(lock, one_second); + lock.Unlock(); + utime_t now = ceph_clock_now(); + + if (now > end_time) + break; + + uint64_t expected = total_expected(); + lock.Lock(); + uint64_t sent = total_sent; + uint64_t completed = total_completed; + lock.Unlock(); + + if (now - stamp_time >= utime_t(1, 0)) { + double rate = (double)cur_completed_rate() / (1024 * 1024); + ++total_sec; + std::streamsize original_precision = cout.precision(); + cout.precision(3); + cout << setw(5) << total_sec << ": throughput=" << rate << "MB/sec" << " pending data=" << sent - completed << std::endl; + cout.precision(original_precision); + stamp_time = now; + } + + while (sent < expected && + sent - completed < max_backlog && + pending_ops.size() < max_ops) { + sent += gen_next_op(); + } + } + + // get a reference to all pending requests + vector<librados::AioCompletion *> completions; + lock.Lock(); + going_down = true; + map<int, LoadGenOp *>::iterator iter; + for (iter = pending_ops.begin(); iter != pending_ops.end(); ++iter) { + LoadGenOp *op = iter->second; + completions.push_back(op->completion); + } + lock.Unlock(); + + cout << "waiting for all operations to complete" << std::endl; + + // now wait on all the pending requests + for (vector<librados::AioCompletion *>::iterator citer = completions.begin(); citer != completions.end(); ++citer) { + librados::AioCompletion *c = *citer; + c->wait_for_complete(); + 
c->release(); + } + + return 0; +} + +void LoadGen::cleanup() +{ + cout << "cleaning up objects" << std::endl; + map<int, obj_info>::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + obj_info& info = iter->second; + int ret = io_ctx.remove(info.name); + if (ret < 0) + cerr << "couldn't remove obj: " << info.name << " ret=" << ret << std::endl; + } +} + +enum OpWriteDest { + OP_WRITE_DEST_OBJ = 2 << 0, + OP_WRITE_DEST_OMAP = 2 << 1, + OP_WRITE_DEST_XATTR = 2 << 2, +}; + +class RadosBencher : public ObjBencher { + librados::AioCompletion **completions; + librados::Rados& rados; + librados::IoCtx& io_ctx; + librados::NObjectIterator oi; + bool iterator_valid; + OpWriteDest write_destination; + +protected: + int completions_init(int concurrentios) override { + completions = new librados::AioCompletion *[concurrentios]; + return 0; + } + void completions_done() override { + delete[] completions; + completions = NULL; + } + int create_completion(int slot, void (*cb)(void *, void*), void *arg) override { + completions[slot] = rados.aio_create_completion((void *) arg, 0, cb); + + if (!completions[slot]) + return -EINVAL; + + return 0; + } + void release_completion(int slot) override { + completions[slot]->release(); + completions[slot] = 0; + } + + int aio_read(const std::string& oid, int slot, bufferlist *pbl, size_t len, + size_t offset) override { + return io_ctx.aio_read(oid, completions[slot], pbl, len, offset); + } + + int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len, + size_t offset) override { + librados::ObjectWriteOperation op; + + if (write_destination & OP_WRITE_DEST_OBJ) { + if (data.hints) + op.set_alloc_hint2(data.object_size, data.op_size, + ALLOC_HINT_FLAG_SEQUENTIAL_WRITE | + ALLOC_HINT_FLAG_SEQUENTIAL_READ | + ALLOC_HINT_FLAG_APPEND_ONLY | + ALLOC_HINT_FLAG_IMMUTABLE); + op.write(offset, bl); + } + + if (write_destination & OP_WRITE_DEST_OMAP) { + std::map<std::string, librados::bufferlist> omap; + 
omap[string("bench-omap-key-") + stringify(offset)] = bl; + op.omap_set(omap); + } + + if (write_destination & OP_WRITE_DEST_XATTR) { + char key[80]; + snprintf(key, sizeof(key), "bench-xattr-key-%d", (int)offset); + op.setxattr(key, bl); + } + + return io_ctx.aio_operate(oid, completions[slot], &op); + } + + int aio_remove(const std::string& oid, int slot) override { + return io_ctx.aio_remove(oid, completions[slot]); + } + + int sync_read(const std::string& oid, bufferlist& bl, size_t len) override { + return io_ctx.read(oid, bl, len, 0); + } + int sync_write(const std::string& oid, bufferlist& bl, size_t len) override { + return io_ctx.write_full(oid, bl); + } + + int sync_remove(const std::string& oid) override { + return io_ctx.remove(oid); + } + + bool completion_is_done(int slot) override { + return completions[slot]->is_safe(); + } + + int completion_wait(int slot) override { + return completions[slot]->wait_for_safe_and_cb(); + } + int completion_ret(int slot) override { + return completions[slot]->get_return_value(); + } + + bool get_objects(std::list<Object>* objects, int num) override { + int count = 0; + + if (!iterator_valid) { + oi = io_ctx.nobjects_begin(); + iterator_valid = true; + } + + librados::NObjectIterator ei = io_ctx.nobjects_end(); + + if (oi == ei) { + iterator_valid = false; + return false; + } + + objects->clear(); + for ( ; oi != ei && count < num; ++oi) { + Object obj(oi->get_oid(), oi->get_nspace()); + objects->push_back(obj); + ++count; + } + + return true; + } + + void set_namespace( const std::string& ns) override { + io_ctx.set_namespace(ns); + } + +public: + RadosBencher(CephContext *cct_, librados::Rados& _r, librados::IoCtx& _i) + : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false), write_destination(OP_WRITE_DEST_OBJ) {} + ~RadosBencher() override { } + + void set_write_destination(OpWriteDest dest) { + write_destination = dest; + } +}; + +static int do_lock_cmd(std::vector<const char*> 
&nargs, + const std::map < std::string, std::string > &opts, + IoCtx *ioctx, + Formatter *formatter) +{ + if (nargs.size() < 3) + usage_exit(); + + string cmd(nargs[1]); + string oid(nargs[2]); + + string lock_tag; + string lock_cookie; + string lock_description; + int lock_duration = 0; + ClsLockType lock_type = LOCK_EXCLUSIVE; + + map<string, string>::const_iterator i; + i = opts.find("lock-tag"); + if (i != opts.end()) { + lock_tag = i->second; + } + i = opts.find("lock-cookie"); + if (i != opts.end()) { + lock_cookie = i->second; + } + i = opts.find("lock-description"); + if (i != opts.end()) { + lock_description = i->second; + } + i = opts.find("lock-duration"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &lock_duration)) { + return -EINVAL; + } + } + i = opts.find("lock-type"); + if (i != opts.end()) { + const string& type_str = i->second; + if (type_str.compare("exclusive") == 0) { + lock_type = LOCK_EXCLUSIVE; + } else if (type_str.compare("shared") == 0) { + lock_type = LOCK_SHARED; + } else { + cerr << "unknown lock type was specified, aborting" << std::endl; + return -EINVAL; + } + } + + if (cmd.compare("list") == 0) { + list<string> locks; + int ret = rados::cls::lock::list_locks(ioctx, oid, &locks); + if (ret < 0) { + cerr << "ERROR: rados_list_locks(): " << cpp_strerror(ret) << std::endl; + return ret; + } + + formatter->open_object_section("object"); + formatter->dump_string("objname", oid); + formatter->open_array_section("locks"); + list<string>::iterator iter; + for (iter = locks.begin(); iter != locks.end(); ++iter) { + formatter->open_object_section("lock"); + formatter->dump_string("name", *iter); + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + return 0; + } + + if (nargs.size() < 4) + usage_exit(); + + string lock_name(nargs[3]); + + if (cmd.compare("info") == 0) { + map<rados::cls::lock::locker_id_t, rados::cls::lock::locker_info_t> lockers; + ClsLockType type = 
LOCK_NONE; + string tag; + int ret = rados::cls::lock::get_lock_info(ioctx, oid, lock_name, &lockers, &type, &tag); + if (ret < 0) { + cerr << "ERROR: rados_lock_get_lock_info(): " << cpp_strerror(ret) << std::endl; + return ret; + } + + formatter->open_object_section("lock"); + formatter->dump_string("name", lock_name); + formatter->dump_string("type", cls_lock_type_str(type)); + formatter->dump_string("tag", tag); + formatter->open_array_section("lockers"); + map<rados::cls::lock::locker_id_t, rados::cls::lock::locker_info_t>::iterator iter; + for (iter = lockers.begin(); iter != lockers.end(); ++iter) { + const rados::cls::lock::locker_id_t& id = iter->first; + const rados::cls::lock::locker_info_t& info = iter->second; + formatter->open_object_section("locker"); + formatter->dump_stream("name") << id.locker; + formatter->dump_string("cookie", id.cookie); + formatter->dump_string("description", info.description); + formatter->dump_stream("expiration") << info.expiration; + formatter->dump_stream("addr") << info.addr.get_legacy_str(); + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + + return ret; + } else if (cmd.compare("get") == 0) { + rados::cls::lock::Lock l(lock_name); + l.set_cookie(lock_cookie); + l.set_tag(lock_tag); + l.set_duration(utime_t(lock_duration, 0)); + l.set_description(lock_description); + int ret; + switch (lock_type) { + case LOCK_SHARED: + ret = l.lock_shared(ioctx, oid); + break; + default: + ret = l.lock_exclusive(ioctx, oid); + } + if (ret < 0) { + cerr << "ERROR: failed locking: " << cpp_strerror(ret) << std::endl; + return ret; + } + + return ret; + } + + if (nargs.size() < 5) + usage_exit(); + + if (cmd.compare("break") == 0) { + string locker(nargs[4]); + rados::cls::lock::Lock l(lock_name); + l.set_cookie(lock_cookie); + l.set_tag(lock_tag); + entity_name_t name; + if (!name.parse(locker)) { + cerr << "ERROR: failed to parse locker name (" << locker << ")" << 
std::endl; + return -EINVAL; + } + int ret = l.break_lock(ioctx, oid, name); + if (ret < 0) { + cerr << "ERROR: failed breaking lock: " << cpp_strerror(ret) << std::endl; + return ret; + } + } else { + usage_exit(); + } + + return 0; +} + +static int do_cache_flush(IoCtx& io_ctx, string oid) +{ + ObjectReadOperation op; + op.cache_flush(); + librados::AioCompletion *completion = + librados::Rados::aio_create_completion(); + io_ctx.aio_operate(oid.c_str(), completion, &op, + librados::OPERATION_IGNORE_CACHE | + librados::OPERATION_IGNORE_OVERLAY, + NULL); + completion->wait_for_safe(); + int r = completion->get_return_value(); + completion->release(); + return r; +} + +static int do_cache_try_flush(IoCtx& io_ctx, string oid) +{ + ObjectReadOperation op; + op.cache_try_flush(); + librados::AioCompletion *completion = + librados::Rados::aio_create_completion(); + io_ctx.aio_operate(oid.c_str(), completion, &op, + librados::OPERATION_IGNORE_CACHE | + librados::OPERATION_IGNORE_OVERLAY | + librados::OPERATION_SKIPRWLOCKS, + NULL); + completion->wait_for_safe(); + int r = completion->get_return_value(); + completion->release(); + return r; +} + +static int do_cache_evict(IoCtx& io_ctx, string oid) +{ + ObjectReadOperation op; + op.cache_evict(); + librados::AioCompletion *completion = + librados::Rados::aio_create_completion(); + io_ctx.aio_operate(oid.c_str(), completion, &op, + librados::OPERATION_IGNORE_CACHE | + librados::OPERATION_IGNORE_OVERLAY | + librados::OPERATION_SKIPRWLOCKS, + NULL); + completion->wait_for_safe(); + int r = completion->get_return_value(); + completion->release(); + return r; +} + +static int do_cache_flush_evict_all(IoCtx& io_ctx, bool blocking) +{ + int errors = 0; + io_ctx.set_namespace(all_nspaces); + try { + librados::NObjectIterator i = io_ctx.nobjects_begin(); + librados::NObjectIterator i_end = io_ctx.nobjects_end(); + for (; i != i_end; ++i) { + int r; + cout << i->get_nspace() << "\t" << i->get_oid() << "\t" << i->get_locator() << 
std::endl; + if (i->get_locator().size()) { + io_ctx.locator_set_key(i->get_locator()); + } else { + io_ctx.locator_set_key(string()); + } + io_ctx.set_namespace(i->get_nspace()); + snap_set_t ls; + io_ctx.snap_set_read(LIBRADOS_SNAP_DIR); + r = io_ctx.list_snaps(i->get_oid(), &ls); + if (r < 0) { + cerr << "error listing snap shots " << i->get_nspace() << "/" << i->get_oid() << ": " + << cpp_strerror(r) << std::endl; + ++errors; + continue; + } + std::vector<clone_info_t>::iterator ci = ls.clones.begin(); + // no snapshots + if (ci == ls.clones.end()) { + io_ctx.snap_set_read(CEPH_NOSNAP); + if (blocking) + r = do_cache_flush(io_ctx, i->get_oid()); + else + r = do_cache_try_flush(io_ctx, i->get_oid()); + if (r < 0) { + cerr << "failed to flush " << i->get_nspace() << "/" << i->get_oid() << ": " + << cpp_strerror(r) << std::endl; + ++errors; + continue; + } + r = do_cache_evict(io_ctx, i->get_oid()); + if (r < 0) { + cerr << "failed to evict " << i->get_nspace() << "/" << i->get_oid() << ": " + << cpp_strerror(r) << std::endl; + ++errors; + continue; + } + } else { + // has snapshots + for (std::vector<clone_info_t>::iterator ci = ls.clones.begin(); + ci != ls.clones.end(); ++ci) { + io_ctx.snap_set_read(ci->cloneid); + if (blocking) + r = do_cache_flush(io_ctx, i->get_oid()); + else + r = do_cache_try_flush(io_ctx, i->get_oid()); + if (r < 0) { + cerr << "failed to flush " << i->get_nspace() << "/" << i->get_oid() << ": " + << cpp_strerror(r) << std::endl; + ++errors; + break; + } + r = do_cache_evict(io_ctx, i->get_oid()); + if (r < 0) { + cerr << "failed to evict " << i->get_nspace() << "/" << i->get_oid() << ": " + << cpp_strerror(r) << std::endl; + ++errors; + break; + } + } + } + } + } + catch (const std::exception& e) { + cerr << e.what() << std::endl; + return -1; + } + return errors ? 
-1 : 0; +} + +static int do_get_inconsistent_pg_cmd(const std::vector<const char*> &nargs, + Rados& rados, + Formatter& formatter) +{ + if (nargs.size() < 2) { + usage_exit(); + } + int64_t pool_id = rados.pool_lookup(nargs[1]); + if (pool_id < 0) { + cerr << "pool \"" << nargs[1] << "\" not found" << std::endl; + return (int)pool_id; + } + std::vector<PlacementGroup> pgs; + int ret = rados.get_inconsistent_pgs(pool_id, &pgs); + if (ret) { + return ret; + } + formatter.open_array_section("pgs"); + for (auto& pg : pgs) { + formatter.dump_stream("pg") << pg; + } + formatter.close_section(); + formatter.flush(cout); + cout << std::endl; + return 0; +} + +static void dump_errors(const err_t &err, Formatter &f, const char *name) +{ + f.open_array_section(name); + if (err.has_shard_missing()) + f.dump_string("error", "missing"); + if (err.has_stat_error()) + f.dump_string("error", "stat_error"); + if (err.has_read_error()) + f.dump_string("error", "read_error"); + if (err.has_data_digest_mismatch_info()) + f.dump_string("error", "data_digest_mismatch_info"); + if (err.has_omap_digest_mismatch_info()) + f.dump_string("error", "omap_digest_mismatch_info"); + if (err.has_size_mismatch_info()) + f.dump_string("error", "size_mismatch_info"); + if (err.has_ec_hash_error()) + f.dump_string("error", "ec_hash_error"); + if (err.has_ec_size_error()) + f.dump_string("error", "ec_size_error"); + if (err.has_info_missing()) + f.dump_string("error", "info_missing"); + if (err.has_info_corrupted()) + f.dump_string("error", "info_corrupted"); + if (err.has_obj_size_info_mismatch()) + f.dump_string("error", "obj_size_info_mismatch"); + if (err.has_snapset_missing()) + f.dump_string("error", "snapset_missing"); + if (err.has_snapset_corrupted()) + f.dump_string("error", "snapset_corrupted"); + if (err.has_hinfo_missing()) + f.dump_string("error", "hinfo_missing"); + if (err.has_hinfo_corrupted()) + f.dump_string("error", "hinfo_corrupted"); + f.close_section(); +} + +static void 
dump_shard(const shard_info_t& shard, + const inconsistent_obj_t& inc, + Formatter &f) +{ + dump_errors(shard, f, "errors"); + + if (shard.has_shard_missing()) + return; + + if (!shard.has_stat_error()) + f.dump_unsigned("size", shard.size); + if (shard.omap_digest_present) { + f.dump_format("omap_digest", "0x%08x", shard.omap_digest); + } + if (shard.data_digest_present) { + f.dump_format("data_digest", "0x%08x", shard.data_digest); + } + + if ((inc.union_shards.has_info_missing() + || inc.union_shards.has_info_corrupted() + || inc.has_object_info_inconsistency() + || shard.has_obj_size_info_mismatch()) && + !shard.has_info_missing()) { + map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(OI_ATTR); + ceph_assert(k != shard.attrs.end()); // Can't be missing + if (!shard.has_info_corrupted()) { + object_info_t oi; + bufferlist bl; + auto bliter = k->second.cbegin(); + decode(oi, bliter); // Can't be corrupted + f.open_object_section("object_info"); + oi.dump(&f); + f.close_section(); + } else { + bool b64; + f.dump_string("object_info", cleanbin(k->second, b64)); + } + } + if ((inc.union_shards.has_snapset_missing() + || inc.union_shards.has_snapset_corrupted() + || inc.has_snapset_inconsistency()) && + !shard.has_snapset_missing()) { + map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(SS_ATTR); + ceph_assert(k != shard.attrs.end()); // Can't be missing + if (!shard.has_snapset_corrupted()) { + SnapSet ss; + bufferlist bl; + auto bliter = k->second.cbegin(); + decode(ss, bliter); // Can't be corrupted + f.open_object_section("snapset"); + ss.dump(&f); + f.close_section(); + } else { + bool b64; + f.dump_string("snapset", cleanbin(k->second, b64)); + } + } + if ((inc.union_shards.has_hinfo_missing() + || inc.union_shards.has_hinfo_corrupted() + || inc.has_hinfo_inconsistency()) && + !shard.has_hinfo_missing()) { + map<std::string, ceph::bufferlist>::iterator k = 
(const_cast<shard_info_t&>(shard)).attrs.find(ECUtil::get_hinfo_key()); + ceph_assert(k != shard.attrs.end()); // Can't be missing + if (!shard.has_hinfo_corrupted()) { + ECUtil::HashInfo hi; + bufferlist bl; + auto bliter = k->second.cbegin(); + decode(hi, bliter); // Can't be corrupted + f.open_object_section("hashinfo"); + hi.dump(&f); + f.close_section(); + } else { + bool b64; + f.dump_string("hashinfo", cleanbin(k->second, b64)); + } + } + if (inc.has_attr_name_mismatch() || inc.has_attr_value_mismatch()) { + f.open_array_section("attrs"); + for (auto kv : shard.attrs) { + // System attribute handled above + if (kv.first == OI_ATTR || kv.first[0] != '_') + continue; + f.open_object_section("attr"); + // Skip leading underscore since only giving user attrs + f.dump_string("name", kv.first.substr(1)); + bool b64; + f.dump_string("value", cleanbin(kv.second, b64)); + f.dump_bool("Base64", b64); + f.close_section(); + } + f.close_section(); + } +} + +static void dump_obj_errors(const obj_err_t &err, Formatter &f) +{ + f.open_array_section("errors"); + if (err.has_object_info_inconsistency()) + f.dump_string("error", "object_info_inconsistency"); + if (err.has_data_digest_mismatch()) + f.dump_string("error", "data_digest_mismatch"); + if (err.has_omap_digest_mismatch()) + f.dump_string("error", "omap_digest_mismatch"); + if (err.has_size_mismatch()) + f.dump_string("error", "size_mismatch"); + if (err.has_attr_value_mismatch()) + f.dump_string("error", "attr_value_mismatch"); + if (err.has_attr_name_mismatch()) + f.dump_string("error", "attr_name_mismatch"); + if (err.has_snapset_inconsistency()) + f.dump_string("error", "snapset_inconsistency"); + if (err.has_hinfo_inconsistency()) + f.dump_string("error", "hinfo_inconsistency"); + if (err.has_size_too_large()) + f.dump_string("error", "size_too_large"); + f.close_section(); +} + +static void dump_object_id(const object_id_t& object, + Formatter &f) +{ + f.dump_string("name", object.name); + 
f.dump_string("nspace", object.nspace); + f.dump_string("locator", object.locator); + switch (object.snap) { + case CEPH_NOSNAP: + f.dump_string("snap", "head"); + break; + case CEPH_SNAPDIR: + f.dump_string("snap", "snapdir"); + break; + default: + f.dump_unsigned("snap", object.snap); + break; + } +} + +static void dump_inconsistent(const inconsistent_obj_t& inc, + Formatter &f) +{ + f.open_object_section("object"); + dump_object_id(inc.object, f); + f.dump_unsigned("version", inc.version); + f.close_section(); + + dump_obj_errors(inc, f); + dump_errors(inc.union_shards, f, "union_shard_errors"); + for (const auto& shard_info : inc.shards) { + shard_info_t shard = const_cast<shard_info_t&>(shard_info.second); + if (shard.selected_oi) { + object_info_t oi; + bufferlist bl; + auto k = shard.attrs.find(OI_ATTR); + ceph_assert(k != shard.attrs.end()); // Can't be missing + auto bliter = k->second.cbegin(); + decode(oi, bliter); // Can't be corrupted + f.open_object_section("selected_object_info"); + oi.dump(&f); + f.close_section(); + break; + } + } + f.open_array_section("shards"); + for (const auto& shard_info : inc.shards) { + f.open_object_section("shard"); + auto& osd_shard = shard_info.first; + f.dump_int("osd", osd_shard.osd); + f.dump_bool("primary", shard_info.second.primary); + auto shard = osd_shard.shard; + if (shard != shard_id_t::NO_SHARD) + f.dump_unsigned("shard", shard); + dump_shard(shard_info.second, inc, f); + f.close_section(); + } + f.close_section(); +} + +static void dump_inconsistent(const inconsistent_snapset_t& inc, + Formatter &f) +{ + dump_object_id(inc.object, f); + + if (inc.ss_bl.length()) { + SnapSet ss; + bufferlist bl = inc.ss_bl; + auto bliter = bl.cbegin(); + decode(ss, bliter); // Can't be corrupted + f.open_object_section("snapset"); + ss.dump(&f); + f.close_section(); + } + f.open_array_section("errors"); + if (inc.snapset_missing()) + f.dump_string("error", "snapset_missing"); + if (inc.snapset_corrupted()) + 
f.dump_string("error", "snapset_corrupted"); + if (inc.info_missing()) + f.dump_string("error", "info_missing"); + if (inc.info_corrupted()) + f.dump_string("error", "info_corrupted"); + if (inc.snapset_error()) + f.dump_string("error", "snapset_error"); + if (inc.headless()) + f.dump_string("error", "headless"); + if (inc.size_mismatch()) + f.dump_string("error", "size_mismatch"); + if (inc.extra_clones()) + f.dump_string("error", "extra_clones"); + if (inc.clone_missing()) + f.dump_string("error", "clone_missing"); + f.close_section(); + + if (inc.extra_clones()) { + f.open_array_section("extra clones"); + for (auto snap : inc.clones) { + f.dump_unsigned("snap", snap); + } + f.close_section(); + } + + if (inc.clone_missing()) { + f.open_array_section("missing"); + for (auto snap : inc.missing) { + f.dump_unsigned("snap", snap); + } + f.close_section(); + } +} + +// dispatch the call by type +static int do_get_inconsistent(Rados& rados, + const PlacementGroup& pg, + const librados::object_id_t &start, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_obj_t>* objs, + uint32_t* interval) +{ + return rados.get_inconsistent_objects(pg, start, max_return, c, + objs, interval); +} + +static int do_get_inconsistent(Rados& rados, + const PlacementGroup& pg, + const librados::object_id_t &start, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_snapset_t>* snapsets, + uint32_t* interval) +{ + return rados.get_inconsistent_snapsets(pg, start, max_return, c, + snapsets, interval); +} + +template <typename T> +static int do_get_inconsistent_cmd(const std::vector<const char*> &nargs, + Rados& rados, + Formatter& formatter) +{ + if (nargs.size() < 2) { + usage_exit(); + } + PlacementGroup pg; + int ret = 0; + ret = pg.parse(nargs[1]); + if (!ret) { + cerr << "bad pg: " << nargs[1] << std::endl; + return ret; + } + uint32_t interval = 0, first_interval = 0; + const unsigned max_item_num = 32; + bool opened = false; + for 
(librados::object_id_t start;;) { + std::vector<T> items; + auto completion = librados::Rados::aio_create_completion(); + ret = do_get_inconsistent(rados, pg, start, max_item_num, completion, + &items, &interval); + completion->wait_for_safe(); + ret = completion->get_return_value(); + completion->release(); + if (ret < 0) { + if (ret == -EAGAIN) + cerr << "interval#" << interval << " expired." << std::endl; + else if (ret == -ENOENT) + cerr << "No scrub information available for pg " << pg << std::endl; + break; + } + // It must be the same interval every time. EAGAIN would + // occur if interval changes. + ceph_assert(start.name.empty() || first_interval == interval); + if (start.name.empty()) { + first_interval = interval; + formatter.open_object_section("info"); + formatter.dump_int("epoch", interval); + formatter.open_array_section("inconsistents"); + opened = true; + } + for (auto& inc : items) { + formatter.open_object_section("inconsistent"); + dump_inconsistent(inc, formatter); + formatter.close_section(); + } + if (items.size() < max_item_num) { + formatter.close_section(); + break; + } + if (!items.empty()) { + start = items.back().object; + } + items.clear(); + } + if (opened) { + formatter.close_section(); + formatter.flush(cout); + } + return ret; +} + +static std::string prettify(const std::string& s) +{ + if (std::find_if_not(s.begin(), s.end(), + (int (*)(int))isprint) != s.end()) { + return "(binary key)"; + } else { + return s; + } +} + +/********************************************** + +**********************************************/ +static int rados_tool_common(const std::map < std::string, std::string > &opts, + std::vector<const char*> &nargs) +{ + int ret; + bool create_pool = false; + const char *pool_name = NULL; + const char *target_pool_name = NULL; + string oloc, target_oloc, nspace, target_nspace; + int concurrent_ios = 16; + unsigned op_size = default_op_size; + unsigned object_size = 0; + unsigned max_objects = 0; + uint64_t 
obj_offset = 0; + bool block_size_specified = false; + int bench_write_dest = 0; + bool cleanup = true; + bool hints = true; // for rados bench + bool reuse_bench = false; + bool no_verify = false; + bool use_striper = false; + bool with_clones = false; + const char *snapname = NULL; + snap_t snapid = CEPH_NOSNAP; + std::map<std::string, std::string>::const_iterator i; + + uint64_t offset_align = 0; + uint64_t min_obj_len = 0; + uint64_t max_obj_len = 0; + uint64_t min_op_len = 0; + uint64_t max_op_len = 0; + uint64_t max_ops = 0; + uint64_t max_backlog = 0; + uint64_t target_throughput = 0; + int64_t read_percent = -1; + uint64_t num_objs = 0; + int run_length = 0; + + bool show_time = false; + bool wildcard = false; + + std::string run_name; + std::string prefix; + bool forcefull = false; + unique_ptr<Formatter> formatter = nullptr; + bool pretty_format = false; + const char *output = NULL; + std::optional<std::string> omap_key; + std::optional<std::string> obj_name; + bool with_reference = false; + + Rados rados; + IoCtx io_ctx; + + i = opts.find("create"); + if (i != opts.end()) { + create_pool = true; + } + i = opts.find("pool"); + if (i != opts.end()) { + pool_name = i->second.c_str(); + } + i = opts.find("target_pool"); + if (i != opts.end()) { + target_pool_name = i->second.c_str(); + } + i = opts.find("object_locator"); + if (i != opts.end()) { + oloc = i->second; + } + i = opts.find("target_locator"); + if (i != opts.end()) { + target_oloc = i->second; + } + i = opts.find("target_nspace"); + if (i != opts.end()) { + target_nspace = i->second; + } + i = opts.find("concurrent-ios"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &concurrent_ios)) { + return -EINVAL; + } + } + i = opts.find("run-name"); + if (i != opts.end()) { + run_name = i->second; + } + + i = opts.find("force-full"); + if (i != opts.end()) { + forcefull = true; + } + i = opts.find("prefix"); + if (i != opts.end()) { + prefix = i->second; + } + i = opts.find("block-size"); + if (i != 
opts.end()) { + if (rados_sistrtoll(i, &op_size)) { + return -EINVAL; + } + block_size_specified = true; + } + i = opts.find("object-size"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &object_size)) { + return -EINVAL; + } + block_size_specified = true; + } + i = opts.find("max-objects"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_objects)) { + return -EINVAL; + } + } + i = opts.find("offset"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &obj_offset)) { + return -EINVAL; + } + } + i = opts.find("snap"); + if (i != opts.end()) { + snapname = i->second.c_str(); + } + i = opts.find("snapid"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &snapid)) { + return -EINVAL; + } + } + i = opts.find("min-object-size"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &min_obj_len)) { + return -EINVAL; + } + } + i = opts.find("max-object-size"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_obj_len)) { + return -EINVAL; + } + } + i = opts.find("min-op-len"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &min_op_len)) { + return -EINVAL; + } + } + i = opts.find("max-op-len"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_op_len)) { + return -EINVAL; + } + } + i = opts.find("max-ops"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_ops)) { + return -EINVAL; + } + } + i = opts.find("max-backlog"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &max_backlog)) { + return -EINVAL; + } + } + i = opts.find("target-throughput"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &target_throughput)) { + return -EINVAL; + } + } + i = opts.find("read-percent"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &read_percent)) { + return -EINVAL; + } + } + i = opts.find("num-objects"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &num_objs)) { + return -EINVAL; + } + } + i = opts.find("run-length"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &run_length)) { + return -EINVAL; + } + } + i = opts.find("show-time"); + if 
(i != opts.end()) { + show_time = true; + } + i = opts.find("no-cleanup"); + if (i != opts.end()) { + cleanup = false; + } + i = opts.find("no-hints"); + if (i != opts.end()) { + hints = false; + } + i = opts.find("reuse-bench"); + if (i != opts.end()) { + reuse_bench = true; + } + i = opts.find("pretty-format"); + if (i != opts.end()) { + pretty_format = true; + } + i = opts.find("format"); + if (i != opts.end()) { + const char *format = i->second.c_str(); + formatter.reset(Formatter::create(format)); + if (!formatter) { + cerr << "unrecognized format: " << format << std::endl; + return -EINVAL; + } + } + i = opts.find("namespace"); + if (i != opts.end()) { + nspace = i->second; + } + i = opts.find("no-verify"); + if (i != opts.end()) { + no_verify = true; + } + i = opts.find("output"); + if (i != opts.end()) { + output = i->second.c_str(); + } + i = opts.find("write-dest-obj"); + if (i != opts.end()) { + bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OBJ); + } + i = opts.find("write-dest-omap"); + if (i != opts.end()) { + bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OMAP); + } + i = opts.find("write-dest-xattr"); + if (i != opts.end()) { + bench_write_dest |= static_cast<int>(OP_WRITE_DEST_XATTR); + } + i = opts.find("with-clones"); + if (i != opts.end()) { + with_clones = true; + } + i = opts.find("omap-key-file"); + if (i != opts.end()) { + string err; + bufferlist indata; + ret = indata.read_file(i->second.c_str(), &err); + if (ret < 0) { + cerr << err << std::endl; + return 1; + } + omap_key = std::string(indata.c_str(), indata.length()); + } + i = opts.find("obj-name-file"); + if (i != opts.end()) { + string err; + bufferlist indata; + ret = indata.read_file(i->second.c_str(), &err); + if (ret < 0) { + cerr << err << std::endl; + return 1; + } + obj_name = std::string(indata.c_str(), indata.length()); + } + i = opts.find("offset_align"); + if (i != opts.end()) { + if (rados_sistrtoll(i, &offset_align)) { + return -EINVAL; + } + } + i = 
opts.find("with-reference"); + if (i != opts.end()) { + with_reference = true; + } + + // open rados + ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + return 1; + } + + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + return 1; + } + + if (create_pool && !pool_name) { + cerr << "--create-pool requested but pool_name was not specified!" << std::endl; + usage(cerr); + return 1; + } + + if (create_pool) { + ret = rados.pool_create(pool_name); + if (ret < 0) { + cerr << "error creating pool " << pool_name << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + + i = opts.find("pgid"); + boost::optional<pg_t> pgid(i != opts.end(), pg_t()); + if (pgid && (!pgid->parse(i->second.c_str()) || (pool_name && rados.pool_lookup(pool_name) != pgid->pool()))) { + cerr << "invalid pgid" << std::endl; + return 1; + } + + // open io context. + if (pool_name || pgid) { + ret = pool_name ? rados.ioctx_create(pool_name, io_ctx) : rados.ioctx_create2(pgid->pool(), io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << (pool_name ? 
pool_name : std::string("with id ") + std::to_string(pgid->pool())) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + + // align op_size + { + bool requires; + ret = io_ctx.pool_requires_alignment2(&requires); + if (ret < 0) { + cerr << "error checking pool alignment requirement" + << cpp_strerror(ret) << std::endl; + return 1; + } + + if (requires) { + uint64_t align = 0; + ret = io_ctx.pool_required_alignment2(&align); + if (ret < 0) { + cerr << "error getting pool alignment" + << cpp_strerror(ret) << std::endl; + return 1; + } + + const uint64_t prev_op_size = op_size; + op_size = uint64_t((op_size + align - 1) / align) * align; + // Warn: if user specified and it was rounded + if (prev_op_size != default_op_size && prev_op_size != op_size) + cerr << "INFO: op_size has been rounded to " << op_size << std::endl; + } + } + +#ifdef WITH_LIBRADOSSTRIPER + // create striper interface + if (opts.find("striper") != opts.end()) { + // Note that this call does a tricky thing by reaching into a "singleton". We count + // on this happening only once: + ret = RadosStriper::striper_create(io_ctx, &detail::striper()); + if (0 != ret) { + cerr << "error opening pool " << pool_name << " with striper interface: " + << cpp_strerror(ret) << std::endl; + return 1; + } + use_striper = true; + } +#endif // USE_LIBRADOSSTRIPER + } + + // snapname? 
+ if (snapname) { + if (!pool_name) { + cerr << "pool name must be specified with --snap" << std::endl; + return 1; + } + ret = io_ctx.snap_lookup(snapname, &snapid); + if (ret < 0) { + cerr << "error looking up snap '" << snapname << "': " << cpp_strerror(ret) << std::endl; + return 1; + } + } + if (oloc.size()) { + if (!pool_name) { + cerr << "pool name must be specified with --object_locator" << std::endl; + return 1; + } + io_ctx.locator_set_key(oloc); + } + // Use namespace from command line if specified + if (opts.find("namespace") != opts.end()) { + if (!pool_name) { + cerr << "pool name must be specified with --namespace" << std::endl; + return 1; + } + io_ctx.set_namespace(nspace); + // Use wildcard if --all specified and --default NOT specified + } else if (opts.find("all") != opts.end() && opts.find("default") == opts.end()) { + // Only the ls should ever set namespace to special value + wildcard = true; + } + if (snapid != CEPH_NOSNAP) { + if (!pool_name) { + cerr << "pool name must be specified with --snapid" << std::endl; + return 1; + } + string name; + ret = io_ctx.snap_get_name(snapid, &name); + if (ret < 0) { + cerr << "snapid " << snapid << " doesn't exist in pool " + << io_ctx.get_pool_name() << std::endl; + return 1; + } + io_ctx.snap_set_read(snapid); + cout << "selected snap " << snapid << " '" << name << "'" << std::endl; + } + + ceph_assert(!nargs.empty()); + + // list pools? 
+ if (strcmp(nargs[0], "lspools") == 0) { + list<string> vec; + ret = rados.pool_list(vec); + if (ret < 0) { + cerr << "error listing pools: " << cpp_strerror(ret) << std::endl; + return 1; + } + for (list<string>::iterator i = vec.begin(); i != vec.end(); ++i) + cout << *i << std::endl; + } + else if (strcmp(nargs[0], "df") == 0) { + // pools + list<string> vec; + + if (!pool_name) { + ret = rados.pool_list(vec); + if (ret < 0) { + cerr << "error listing pools: " << cpp_strerror(ret) << std::endl; + return 1; + } + } else { + vec.push_back(pool_name); + } + + map<string,librados::pool_stat_t> stats; + ret = rados.get_pool_stats(vec, stats); + if (ret < 0) { + cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl; + return 1; + } + + TextTable tab; + + if (!formatter) { + tab.define_column("POOL_NAME", TextTable::LEFT, TextTable::LEFT); + tab.define_column("USED", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("OBJECTS", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("CLONES", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("COPIES", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("RD_OPS", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("RD", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("WR_OPS", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("WR", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("USED COMPR", TextTable::RIGHT, TextTable::RIGHT); + tab.define_column("UNDER COMPR", TextTable::RIGHT, TextTable::RIGHT); + } else { + formatter->open_object_section("stats"); + formatter->open_array_section("pools"); + } + for (map<string,librados::pool_stat_t>::iterator i = stats.begin(); + i != stats.end(); + ++i) { + const char *pool_name = i->first.c_str(); + 
librados::pool_stat_t& s = i->second; + if (!formatter) { + tab << pool_name + << byte_u_t(s.num_bytes) + << s.num_objects + << s.num_object_clones + << s.num_object_copies + << s.num_objects_missing_on_primary + << s.num_objects_unfound + << s.num_objects_degraded + << s.num_rd + << byte_u_t(s.num_rd_kb << 10) + << s.num_wr + << byte_u_t(s.num_wr_kb << 10) + << byte_u_t(s.compressed_bytes_alloc) + << byte_u_t(s.compressed_bytes_orig) + << TextTable::endrow; + } else { + formatter->open_object_section("pool"); + int64_t pool_id = rados.pool_lookup(pool_name); + formatter->dump_string("name", pool_name); + if (pool_id >= 0) + formatter->dump_int("id", pool_id); + else + cerr << "ERROR: lookup_pg_pool_name for name=" << pool_name + << " returned " << pool_id << std::endl; + formatter->dump_int("size_bytes",s.num_bytes); + formatter->dump_int("size_kb", s.num_kb); + formatter->dump_int("num_objects", s.num_objects); + formatter->dump_int("num_object_clones", s.num_object_clones); + formatter->dump_int("num_object_copies", s.num_object_copies); + formatter->dump_int("num_objects_missing_on_primary", s.num_objects_missing_on_primary); + formatter->dump_int("num_objects_unfound", s.num_objects_unfound); + formatter->dump_int("num_objects_degraded", s.num_objects_degraded); + formatter->dump_int("read_ops", s.num_rd); + formatter->dump_int("read_bytes", s.num_rd_kb * 1024ull); + formatter->dump_int("write_ops", s.num_wr); + formatter->dump_int("write_bytes", s.num_wr_kb * 1024ull); + formatter->dump_int("compress_bytes_used", s.compressed_bytes_alloc); + formatter->dump_int("compress_under_bytes", s.compressed_bytes_orig); + formatter->close_section(); + } + } + + if (!formatter) { + cout << tab; + } + + // total + cluster_stat_t tstats; + ret = rados.cluster_stat(tstats); + if (ret < 0) { + cerr << "error getting total cluster usage: " << cpp_strerror(ret) << std::endl; + return 1; + } + if (!formatter) { + cout << std::endl; + cout << "total_objects " << 
tstats.num_objects + << std::endl; + cout << "total_used " << byte_u_t(tstats.kb_used << 10) + << std::endl; + cout << "total_avail " << byte_u_t(tstats.kb_avail << 10) + << std::endl; + cout << "total_space " << byte_u_t(tstats.kb << 10) + << std::endl; + } else { + formatter->close_section(); + formatter->dump_int("total_objects", tstats.num_objects); + formatter->dump_int("total_used", tstats.kb_used); + formatter->dump_int("total_avail", tstats.kb_avail); + formatter->dump_int("total_space", tstats.kb); + formatter->close_section(); + formatter->flush(cout); + } + } + + else if (strcmp(nargs[0], "ls") == 0) { + if (!pool_name && !pgid) { + cerr << "either pool name or pg id needs to be specified" << std::endl; + return 1; + } + + if (wildcard) { + io_ctx.set_namespace(all_nspaces); + } + bool use_stdout = (!output && (nargs.size() < 2 || (strcmp(nargs[1], "-") == 0))); + if (!use_stdout && !output) { + cerr << "Please use --output to specify the output file name" << std::endl; + return 1; + } + + ostream *outstream; + if (use_stdout) { + outstream = &cout; + } else { + outstream = new ofstream(output); + } + + { + if (formatter) { + formatter->open_array_section("objects"); + } + try { + librados::NObjectIterator i = pgid ? 
io_ctx.nobjects_begin(pgid->ps()) : io_ctx.nobjects_begin(); + const librados::NObjectIterator i_end = io_ctx.nobjects_end(); + for (; i != i_end; ++i) { +#ifdef WITH_LIBRADOSSTRIPER + if (use_striper) { + // in case of --striper option, we only list striped + // objects, so we only display the first object of + // each, without its suffix '.000...000' + size_t l = i->get_oid().length(); + if (l <= 17 || + (0 != i->get_oid().compare(l-17, 17,".0000000000000000"))) { + continue; + } + } +#endif // WITH_LIBRADOSSTRIPER + if (pgid) { + uint32_t ps; + if (io_ctx.get_object_pg_hash_position2(i->get_oid(), &ps) || pgid->ps() != ps) { + break; + } + } + if (!formatter) { + // Only include namespace in output when wildcard specified + if (wildcard) { + *outstream << i->get_nspace() << "\t"; + } + *outstream << detail::get_oid(i, use_striper); + if (i->get_locator().size()) { + *outstream << "\t" << i->get_locator(); + } + *outstream << std::endl; + } else { + formatter->open_object_section("object"); + formatter->dump_string("namespace", i->get_nspace()); + + detail::dump_name(formatter.get(), i, use_striper); + + if (i->get_locator().size()) { + formatter->dump_string("locator", i->get_locator()); + } + formatter->close_section(); //object + + constexpr int TARGET_BYTES_PER_FLUSH = 4096; + if (formatter->get_len() >= TARGET_BYTES_PER_FLUSH) { + formatter->flush(*outstream); + } + } + } + } + catch (const std::exception& e) { + cerr << e.what() << std::endl; + return 1; + } + } + if (formatter) { + formatter->close_section(); //objects + formatter->flush(*outstream); + if (pretty_format) { + *outstream << std::endl; + } + formatter->flush(*outstream); + } + if (!stdout) { + delete outstream; + } + } + else if (strcmp(nargs[0], "mapext") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + std::map<uint64_t,uint64_t> m; + ret = io_ctx.mapext(*obj_name, 0, -1, m); + if (ret < 0) { + cerr 
<< "mapext error on " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + std::map<uint64_t,uint64_t>::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + cout << hex << iter->first << "\t" << iter->second << dec << std::endl; + } + } + else if (strcmp(nargs[0], "stat") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + uint64_t size; + time_t mtime; + + ret = detail::stat(io_ctx, *obj_name, size, mtime, use_striper); + + if (ret < 0) { + cerr << " error stat-ing " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } else { + utime_t t(mtime, 0); + cout << pool_name << "/" << prettify(*obj_name) + << " mtime " << t << ", size " << size << std::endl; + } + } + else if (strcmp(nargs[0], "stat2") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + uint64_t size; + struct timespec mtime; + + ret = detail::stat2(io_ctx, *obj_name, size, mtime, use_striper); + + if (ret < 0) { + cerr << " error stat-ing " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } else { + utime_t t(mtime); + cout << pool_name << "/" << prettify(*obj_name) + << " mtime " << t << ", size " << size << std::endl; + } + } + else if (strcmp(nargs[0], "touch") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + time_t timestamp = time(NULL); + if (nargs.size() > (obj_name ? 1 : 2)) { + char* endptr = NULL; + timestamp = static_cast<time_t>(strtoll(nargs[obj_name ? 1 : 2], &endptr, 10)); + if (*endptr) { + cerr << "Invalid value for timestamp: '" << nargs[obj_name ? 
1 : 2] << "'" << std::endl; + ret = -EINVAL; + return 1; + } + } + if (!obj_name) { + obj_name = nargs[1]; + } + ObjectWriteOperation op; + op.create(false); + op.mtime(×tamp); + ret = io_ctx.operate(*obj_name, &op); + if (ret < 0) { + cerr << " error touch-ing " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + else if (strcmp(nargs[0], "get") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) { + usage(cerr); + return 1; + } + const char* out_filename; + if (obj_name) { + out_filename = nargs[1]; + } else { + obj_name = nargs[1]; + out_filename = nargs[2]; + } + ret = do_get(io_ctx, *obj_name, out_filename, op_size, use_striper); + if (ret < 0) { + cerr << "error getting " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } + else if (strcmp(nargs[0], "put") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) { + usage(cerr); + return 1; + } + const char* in_filename; + if (obj_name) { + in_filename = nargs[1]; + } else { + obj_name = nargs[1]; + in_filename = nargs[2]; + } + ret = do_put(io_ctx, *obj_name, in_filename, op_size, obj_offset, use_striper); + if (ret < 0) { + cerr << "error putting " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } + else if (strcmp(nargs[0], "append") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) { + usage(cerr); + return 1; + } + const char* in_filename; + if (obj_name) { + in_filename = nargs[1]; + } else { + obj_name = nargs[1]; + in_filename = nargs[2]; + } + ret = do_append(io_ctx, *obj_name, in_filename, op_size, use_striper); + if (ret < 0) { + cerr << "error appending " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } + else if (strcmp(nargs[0], "truncate") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 
2 : 3)) { + usage(cerr); + return 1; + } + + char* endptr = NULL; + long size; + if (!obj_name) { + obj_name = nargs[1]; + size = strtoll(nargs[2], &endptr, 10); + } else { + size = strtoll(nargs[1], &endptr, 10); + } + if (*endptr) { + cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl; + ret = -EINVAL; + return 1; + } + if (size < 0) { + cerr << "error, cannot truncate to negative value" << std::endl; + usage(cerr); + return 1; + } + + ret = detail::trunc(io_ctx, *obj_name, size, use_striper); + + if (ret < 0) { + cerr << "error truncating oid " + << prettify(*obj_name) << " to " << size << ": " + << cpp_strerror(ret) << std::endl; + } else { + ret = 0; + } + } + else if (strcmp(nargs[0], "setxattr") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3) || + nargs.size() > (obj_name ? 3 : 4)) { + usage(cerr); + return 1; + } + string attr_name(nargs[obj_name ? 1 : 2]); + bufferlist bl; + if (nargs.size() == (obj_name ? 3 : 4)) { + string attr_val(nargs[obj_name ? 2 : 3]); + bl.append(attr_val.c_str(), attr_val.length()); + } else { + do { + ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin + if (ret < 0) + return 1; + } while (ret > 0); + } + if (!obj_name) { + obj_name = nargs[1]; + } + + ret = detail::setxattr(io_ctx, *obj_name, attr_name, bl, use_striper); + + if (ret < 0) { + cerr << "error setting xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + else + ret = 0; + } + else if (strcmp(nargs[0], "getxattr") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) { + usage(cerr); + return 1; + } + string attr_name(nargs[obj_name ? 
1 : 2]); + if (!obj_name) { + obj_name = nargs[1]; + } + bufferlist bl; + ret = detail::getxattr(io_ctx, *obj_name, attr_name, bl, use_striper); + + if (ret < 0) { + cerr << "error getting xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + else + ret = 0; + string s(bl.c_str(), bl.length()); + cout << s; + } else if (strcmp(nargs[0], "rmxattr") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) { + usage(cerr); + return 1; + } + + string attr_name(nargs[obj_name ? 1 : 2]); + if (!obj_name) { + obj_name = nargs[1]; + } + ret = detail::rmxattr(io_ctx, *obj_name, attr_name, use_striper); + + if (ret < 0) { + cerr << "error removing xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "listxattr") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + bufferlist bl; + map<std::string, bufferlist> attrset; + + ret = detail::getxattrs(io_ctx, *obj_name, attrset, use_striper); + + if (ret < 0) { + cerr << "error getting xattr set " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + + for (map<std::string, bufferlist>::iterator iter = attrset.begin(); + iter != attrset.end(); ++iter) { + cout << iter->first << std::endl; + } + } else if (strcmp(nargs[0], "getomapheader") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + string outfile; + if (nargs.size() >= (obj_name ? 2 : 3)) { + outfile = nargs[obj_name ? 
1 : 2]; + } + if (!obj_name) { + obj_name = nargs[1]; + } + bufferlist header; + ret = io_ctx.omap_get_header(*obj_name, &header); + if (ret < 0) { + cerr << "error getting omap header " << pool_name << "/" << prettify(*obj_name) + << ": " << cpp_strerror(ret) << std::endl; + return 1; + } else { + if (!outfile.empty()) { + cerr << "Writing to " << outfile << std::endl; + dump_data(outfile, header); + } else { + cout << "header (" << header.length() << " bytes) :\n"; + header.hexdump(cout); + cout << std::endl; + } + ret = 0; + } + } else if (strcmp(nargs[0], "setomapheader") == 0) { + if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) { + usage(cerr); + return 1; + } + + bufferlist bl; + if (!obj_name) { + obj_name = nargs[1]; + bl.append(nargs[2]); // val + } else { + bl.append(nargs[1]); // val + } + ret = io_ctx.omap_set_header(*obj_name, bl); + if (ret < 0) { + cerr << "error setting omap value " << pool_name << "/" << prettify(*obj_name) + << ": " << cpp_strerror(ret) << std::endl; + return 1; + } else { + ret = 0; + } + } else if (strcmp(nargs[0], "setomapval") == 0) { + uint32_t min_args = (omap_key ? 2 : 3); + if (!pool_name || nargs.size() < min_args || nargs.size() > min_args + 1) { + usage(cerr); + return 1; + } + + string oid(nargs[1]); + if (!omap_key) { + omap_key = nargs[2]; + } + + bufferlist bl; + if (nargs.size() > min_args) { + string val(nargs[min_args]); + bl.append(val); + } else { + do { + ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin + if (ret < 0) { + return 1; + } + } while (ret > 0); + } + + map<string, bufferlist> values; + values[*omap_key] = bl; + + ret = io_ctx.omap_set(oid, values); + if (ret < 0) { + cerr << "error setting omap value " << pool_name << "/" << oid << "/" + << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } else { + ret = 0; + } + } else if (strcmp(nargs[0], "getomapval") == 0) { + uint32_t min_args = (omap_key ? (obj_name ? 1 : 2) + : (obj_name ? 
2 : 3)); + if (!pool_name || nargs.size() < min_args || nargs.size() > min_args + 1) { + usage(cerr); + return 1; + } + + if (!omap_key) { + omap_key = nargs[obj_name ? 1 : 2]; + } + + set<string> keys; + keys.insert(*omap_key); + + std::string outfile; + if (nargs.size() > min_args) { + outfile = nargs[min_args]; + } + if (!obj_name) { + obj_name = nargs[1]; + } + + map<string, bufferlist> values; + ret = io_ctx.omap_get_vals_by_keys(*obj_name, keys, &values); + if (ret < 0) { + cerr << "error getting omap value " << pool_name << "/" << prettify(*obj_name) << "/" + << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } else { + ret = 0; + } + + if (values.size() && values.begin()->first == *omap_key) { + if (!outfile.empty()) { + cerr << "Writing to " << outfile << std::endl; + dump_data(outfile, values.begin()->second); + } else { + cout << "value (" << values.begin()->second.length() << " bytes) :\n"; + values.begin()->second.hexdump(cout); + cout << std::endl; + } + ret = 0; + } else { + cout << "No such key: " << pool_name << "/" << prettify(*obj_name) << "/" + << prettify(*omap_key) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "rmomapkey") == 0) { + uint32_t num_args = (omap_key ? (obj_name ? 1 : 2) + : (obj_name ? 2 : 3)); + if (!pool_name || nargs.size() != num_args) { + usage(cerr); + return 1; + } + + if (!omap_key) { + omap_key = nargs[obj_name ? 
1 : 2]; + } + if (!obj_name) { + obj_name = nargs[1]; + } + set<string> keys; + keys.insert(*omap_key); + + ret = io_ctx.omap_rm_keys(*obj_name, keys); + if (ret < 0) { + cerr << "error removing omap key " << pool_name << "/" << prettify(*obj_name) << "/" + << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } else { + ret = 0; + } + } else if (strcmp(nargs[0], "clearomap") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + // strip nargs[0] which is "clearomap" + std::vector<std::string> oids(std::next(std::begin(nargs)), + std::end(nargs)); + if (obj_name) { + oids.push_back(*obj_name); + } + + for (const auto& oid : oids) { + ret = io_ctx.omap_clear(oid); + if (ret < 0) { + cerr << "error clearing omap keys " << pool_name << "/" << prettify(*obj_name) << "/" + << cpp_strerror(ret) << std::endl; + return 1; + } + } + ret = 0; + } else if (strcmp(nargs[0], "listomapvals") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + string last_read = ""; + int MAX_READ = 512; + do { + map<string, bufferlist> values; + ret = io_ctx.omap_get_vals(*obj_name, last_read, MAX_READ, &values); + if (ret < 0) { + cerr << "error getting omap keys " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + ret = values.size(); + for (map<string, bufferlist>::const_iterator it = values.begin(); + it != values.end(); ++it) { + last_read = it->first; + // dump key in hex if it contains nonprintable characters + if (std::count_if(it->first.begin(), it->first.end(), + (int (*)(int))isprint) < (int)it->first.length()) { + cout << "key (" << it->first.length() << " bytes):\n"; + bufferlist keybl; + keybl.append(it->first); + keybl.hexdump(cout); + } else { + cout << it->first; + } + cout << std::endl; + cout << "value (" << it->second.length() << " bytes) :\n"; + 
it->second.hexdump(cout); + cout << std::endl; + } + } while (ret == MAX_READ); + ret = 0; + } + else if (strcmp(nargs[0], "cp") == 0) { + // XXX: binary names aren't supported for this operation + if (!pool_name) { + usage(cerr); + return 1; + } + + if (nargs.size() < 2 || nargs.size() > 3) { + usage(cerr); + return 1; + } + + const char *target = target_pool_name; + if (!target) + target = pool_name; + + const char *target_obj; + if (nargs.size() < 3) { + if (strcmp(target, pool_name) == 0) { + cerr << "cannot copy object into itself" << std::endl; + return 1; + } + target_obj = nargs[1]; + } else { + target_obj = nargs[2]; + } + + // open io context. + IoCtx target_ctx; + ret = rados.ioctx_create(target, target_ctx); + if (ret < 0) { + cerr << "error opening target pool " << target << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + if (target_oloc.size()) { + target_ctx.locator_set_key(target_oloc); + } + if (target_nspace.size()) { + target_ctx.set_namespace(target_nspace); + } + + ret = do_copy(io_ctx, nargs[1], target_ctx, target_obj); + if (ret < 0) { + cerr << "error copying " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "rm") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + // strip nargs[0] which is "rm" + std::vector<std::string> oids(std::next(std::begin(nargs)), + std::end(nargs)); + if (obj_name) { + oids.push_back(*obj_name); + } + for (const auto& oid : oids) { + if (forcefull) { + ret = detail::remove(io_ctx, oid, (CEPH_OSD_FLAG_FULL_FORCE | + CEPH_OSD_FLAG_FULL_TRY), use_striper); + } else { + ret = detail::remove(io_ctx, oid, use_striper); + } + + if (ret < 0) { + string name = (nspace.size() ? 
nspace + "/" : "" ) + prettify(oid); + cerr << "error removing " << pool_name << ">" << name << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } + } + else if (strcmp(nargs[0], "create") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + ret = io_ctx.create(*obj_name, true); + if (ret < 0) { + cerr << "error creating " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } + else if (strcmp(nargs[0], "cppool") == 0) { + bool force = nargs.size() == 4 && !strcmp(nargs[3], "--yes-i-really-mean-it"); + if (nargs.size() != 3 && !(nargs.size() == 4 && force)) { + usage(cerr); + return 1; + } + const char *src_pool = nargs[1]; + const char *target_pool = nargs[2]; + + if (strcmp(src_pool, target_pool) == 0) { + cerr << "cannot copy pool into itself" << std::endl; + return 1; + } + + cerr << "WARNING: pool copy does not preserve user_version, which some " + << " apps may rely on." << std::endl; + + if (rados.get_pool_is_selfmanaged_snaps_mode(src_pool)) { + cerr << "WARNING: pool " << src_pool << " has selfmanaged snaps, which are not preserved\n" + << " by the cppool operation. This will break any snapshot user." + << std::endl; + if (!force) { + cerr << " If you insist on making a broken copy, you can pass\n" + << " --yes-i-really-mean-it to proceed anyway." 
+ << std::endl; + exit(1); + } + } + + ret = do_copy_pool(rados, src_pool, target_pool); + if (ret < 0) { + cerr << "error copying pool " << src_pool << " => " << target_pool << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + cout << "successfully copied pool " << nargs[1] << std::endl; + } + else if (strcmp(nargs[0], "purge") == 0) { + if (nargs.size() < 2) { + usage(cerr); + return 1; + } + if (nargs.size() < 3 || + strcmp(nargs[2], "--yes-i-really-really-mean-it") != 0) { + cerr << "WARNING:\n" + << " This will PERMANENTLY DESTROY all objects from a pool with no way back.\n" + << " To confirm, follow pool with --yes-i-really-really-mean-it" << std::endl; + return 1; + } + ret = rados.ioctx_create(nargs[1], io_ctx); + if (ret < 0) { + cerr << "error pool " << nargs[1] << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + io_ctx.set_namespace(all_nspaces); + io_ctx.set_osdmap_full_try(); + RadosBencher bencher(g_ceph_context, rados, io_ctx); + ret = bencher.clean_up_slow("", concurrent_ios); + if (ret >= 0) { + cout << "successfully purged pool " << nargs[1] << std::endl; + } else { //error + cerr << "pool " << nargs[1] << " could not be purged" << std::endl; + cerr << "Check your monitor configuration - `mon allow pool delete` is set to false by default," + << " change it to true to allow deletion of pools" << std::endl; + } + } + else if (strcmp(nargs[0], "lssnap") == 0) { + if (!pool_name || nargs.size() != 1) { + usage(cerr); + return 1; + } + + vector<snap_t> snaps; + io_ctx.snap_list(&snaps); + for (vector<snap_t>::iterator i = snaps.begin(); + i != snaps.end(); + ++i) { + string s; + time_t t; + if (io_ctx.snap_get_name(*i, &s) < 0) + continue; + if (io_ctx.snap_get_stamp(*i, &t) < 0) + continue; + struct tm bdt; + localtime_r(&t, &bdt); + cout << *i << "\t" << s << "\t"; + + std::ios_base::fmtflags original_flags = cout.flags(); + cout.setf(std::ios::right); + cout.fill('0'); + cout << std::setw(4) << (bdt.tm_year+1900) + << '.' 
<< std::setw(2) << (bdt.tm_mon+1) + << '.' << std::setw(2) << bdt.tm_mday + << ' ' + << std::setw(2) << bdt.tm_hour + << ':' << std::setw(2) << bdt.tm_min + << ':' << std::setw(2) << bdt.tm_sec + << std::endl; + cout.flags(original_flags); + } + cout << snaps.size() << " snaps" << std::endl; + } + + else if (strcmp(nargs[0], "mksnap") == 0) { + if (!pool_name || nargs.size() < 2) { + usage(cerr); + return 1; + } + + if (rados.get_pool_is_selfmanaged_snaps_mode(pool_name)) { + cerr << "can't create snapshot: pool " << pool_name + << " is in selfmanaged snaps mode" << std::endl; + return 1; + } + + ret = io_ctx.snap_create(nargs[1]); + if (ret < 0) { + cerr << "error creating pool " << pool_name << " snapshot " << nargs[1] + << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + cout << "created pool " << pool_name << " snap " << nargs[1] << std::endl; + } + + else if (strcmp(nargs[0], "rmsnap") == 0) { + if (!pool_name || nargs.size() < 2) { + usage(cerr); + return 1; + } + + ret = io_ctx.snap_remove(nargs[1]); + if (ret < 0) { + cerr << "error removing pool " << pool_name << " snapshot " << nargs[1] + << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + cout << "removed pool " << pool_name << " snap " << nargs[1] << std::endl; + } + + else if (strcmp(nargs[0], "rollback") == 0) { + if (!pool_name || nargs.size() < 3) { + usage(cerr); + return 1; + } + + ret = io_ctx.snap_rollback(nargs[1], nargs[2]); + if (ret < 0) { + cerr << "error rolling back pool " << pool_name << " to snapshot " << nargs[1] + << cpp_strerror(ret) << std::endl; + return 1; + } + cout << "rolled back pool " << pool_name + << " to snapshot " << nargs[2] << std::endl; + } + else if (strcmp(nargs[0], "bench") == 0) { + if (!pool_name || nargs.size() < 3) { + usage(cerr); + return 1; + } + char* endptr = NULL; + int seconds = strtol(nargs[1], &endptr, 10); + if (*endptr) { + cerr << "Invalid value for seconds: '" << nargs[1] << "'" << std::endl; + return 1; + } + int operation = 0; + 
if (strcmp(nargs[2], "write") == 0) + operation = OP_WRITE; + else if (strcmp(nargs[2], "seq") == 0) + operation = OP_SEQ_READ; + else if (strcmp(nargs[2], "rand") == 0) + operation = OP_RAND_READ; + else { + usage(cerr); + return 1; + } + if (operation != OP_WRITE) { + if (block_size_specified) { + cerr << "-b|--block_size option can be used only with 'write' bench test" + << std::endl; + return 1; + } + if (bench_write_dest != 0) { + cerr << "--write-object, --write-omap and --write-xattr options can " + "only be used with the 'write' bench test" + << std::endl; + return 1; + } + } + else if (bench_write_dest == 0) { + bench_write_dest = OP_WRITE_DEST_OBJ; + } + + if (!formatter && output) { + cerr << "-o|--output option can only be used with '--format' option" + << std::endl; + return 1; + } + RadosBencher bencher(g_ceph_context, rados, io_ctx); + bencher.set_show_time(show_time); + bencher.set_write_destination(static_cast<OpWriteDest>(bench_write_dest)); + + ostream *outstream = NULL; + if (formatter) { + bencher.set_formatter(formatter.get()); + if (output) + outstream = new ofstream(output); + else + outstream = &cout; + bencher.set_outstream(*outstream); + } + if (!object_size) + object_size = op_size; + else if (object_size < op_size) + op_size = object_size; + cout << "hints = " << (int)hints << std::endl; + ret = bencher.aio_bench(operation, seconds, + concurrent_ios, op_size, object_size, + max_objects, cleanup, hints, run_name, reuse_bench, no_verify); + if (ret != 0) + cerr << "error during benchmark: " << cpp_strerror(ret) << std::endl; + if (formatter && output) + delete outstream; + } + else if (strcmp(nargs[0], "cleanup") == 0) { + if (!pool_name) { + usage(cerr); + return 1; + } + if (wildcard) + io_ctx.set_namespace(all_nspaces); + RadosBencher bencher(g_ceph_context, rados, io_ctx); + ret = bencher.clean_up(prefix, concurrent_ios, run_name); + if (ret != 0) + cerr << "error during cleanup: " << cpp_strerror(ret) << std::endl; + } + else if 
(strcmp(nargs[0], "watch") == 0) { + if (!pool_name || nargs.size() < 2) { + usage(cerr); + return 1; + } + string oid(nargs[1]); + RadosWatchCtx ctx(io_ctx, oid.c_str()); + uint64_t cookie; + ret = io_ctx.watch2(oid, &cookie, &ctx); + if (ret != 0) + cerr << "error calling watch: " << cpp_strerror(ret) << std::endl; + else { + cout << "press enter to exit..." << std::endl; + getchar(); + io_ctx.unwatch2(cookie); + rados.watch_flush(); + } + } + else if (strcmp(nargs[0], "notify") == 0) { + if (!pool_name || nargs.size() < 3) { + usage(cerr); + return 1; + } + string oid(nargs[1]); + string msg(nargs[2]); + bufferlist bl, replybl; + encode(msg, bl); + ret = io_ctx.notify2(oid, bl, 10000, &replybl); + if (ret != 0) + cerr << "error calling notify: " << cpp_strerror(ret) << std::endl; + if (replybl.length()) { + map<pair<uint64_t,uint64_t>,bufferlist> rm; + set<pair<uint64_t,uint64_t> > missed; + auto p = replybl.cbegin(); + decode(rm, p); + decode(missed, p); + for (map<pair<uint64_t,uint64_t>,bufferlist>::iterator p = rm.begin(); + p != rm.end(); + ++p) { + cout << "reply client." << p->first.first + << " cookie " << p->first.second + << " : " << p->second.length() << " bytes" << std::endl; + if (p->second.length()) + p->second.hexdump(cout); + } + for (multiset<pair<uint64_t,uint64_t> >::iterator p = missed.begin(); + p != missed.end(); ++p) { + cout << "timeout client." << p->first + << " cookie " << p->second << std::endl; + } + } + } else if (strcmp(nargs[0], "set-alloc-hint") == 0) { + // cmd, [oid, ] obj_size, write_size + if (!pool_name || nargs.size() < (obj_name ? 3 : 4)) { + usage(cerr); + return 1; + } + string err; + uint64_t expected_object_size = strict_strtoll(nargs[obj_name ? 1 : 2], 10, &err); + if (!err.empty()) { + cerr << "couldn't parse expected_object_size: " << err << std::endl; + usage(cerr); + return 1; + } + uint64_t expected_write_size = strict_strtoll(nargs[obj_name ? 
2 : 3], 10, &err); + if (!err.empty()) { + cerr << "couldn't parse expected_write_size: " << err << std::endl; + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + ret = io_ctx.set_alloc_hint(*obj_name, expected_object_size, expected_write_size); + if (ret < 0) { + cerr << "error setting alloc-hint " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "load-gen") == 0) { + if (!pool_name) { + cerr << "error: must specify pool" << std::endl; + usage(cerr); + return 1; + } + LoadGen lg(&rados); + if (min_obj_len) + lg.min_obj_len = min_obj_len; + if (max_obj_len) + lg.max_obj_len = max_obj_len; + if (min_op_len) + lg.min_op_len = min_op_len; + if (max_op_len) + lg.max_op_len = max_op_len; + if (max_ops) + lg.max_ops = max_ops; + if (max_backlog) + lg.max_backlog = max_backlog; + if (target_throughput) + lg.target_throughput = target_throughput; + if (read_percent >= 0) + lg.read_percent = read_percent; + if (num_objs) + lg.num_objs = num_objs; + if (run_length) + lg.run_length = run_length; + if (offset_align) + lg.offset_align = offset_align; + + cout << "run length " << run_length << " seconds" << std::endl; + cout << "preparing " << lg.num_objs << " objects" << std::endl; + ret = lg.bootstrap(pool_name); + if (ret < 0) { + cerr << "load-gen bootstrap failed" << std::endl; + return 1; + } + cout << "load-gen will run " << lg.run_length << " seconds" << std::endl; + lg.run(); + lg.cleanup(); + } else if (strcmp(nargs[0], "listomapkeys") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + set<string> out_keys; + ret = io_ctx.omap_get_keys(*obj_name, "", LONG_MAX, &out_keys); + if (ret < 0) { + cerr << "error getting omap key set " << pool_name << "/" + << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + + for (set<string>::iterator iter = 
out_keys.begin(); + iter != out_keys.end(); ++iter) { + cout << *iter << std::endl; + } + } else if (strcmp(nargs[0], "lock") == 0) { + if (!pool_name) { + usage(cerr); + return 1; + } + + if (!formatter) { + formatter = std::make_unique<JSONFormatter>(pretty_format); + } + ret = do_lock_cmd(nargs, opts, &io_ctx, formatter.get()); + } else if (strcmp(nargs[0], "listwatchers") == 0) { + if (!pool_name || nargs.size() < 2) { + usage(cerr); + return 1; + } + + string oid(nargs[1]); + std::list<obj_watch_t> lw; + + ret = io_ctx.list_watchers(oid, &lw); + if (ret < 0) { + cerr << "error listing watchers " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + else + ret = 0; + + for (std::list<obj_watch_t>::iterator i = lw.begin(); i != lw.end(); ++i) { + cout << "watcher=" << i->addr << " client." << i->watcher_id << " cookie=" << i->cookie << std::endl; + } + } else if (strcmp(nargs[0], "listsnaps") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + + snap_set_t ls; + io_ctx.snap_set_read(LIBRADOS_SNAP_DIR); + ret = io_ctx.list_snaps(*obj_name, &ls); + if (ret < 0) { + cerr << "error listing snap shots " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + else + ret = 0; + + map<snap_t,string> snamemap; + if (formatter || pretty_format) { + vector<snap_t> snaps; + io_ctx.snap_list(&snaps); + for (vector<snap_t>::iterator i = snaps.begin(); + i != snaps.end(); ++i) { + string s; + if (io_ctx.snap_get_name(*i, &s) < 0) + continue; + snamemap.insert(pair<snap_t,string>(*i, s)); + } + } + + if (formatter) { + formatter->open_object_section("object"); + formatter->dump_string("name", *obj_name); + formatter->open_array_section("clones"); + } else { + cout << prettify(*obj_name) << ":" << std::endl; + cout << "cloneid snaps size overlap" << std::endl; + } + + for (std::vector<clone_info_t>::iterator 
ci = ls.clones.begin(); + ci != ls.clones.end(); ++ci) { + + if (formatter) formatter->open_object_section("clone"); + + if (ci->cloneid == librados::SNAP_HEAD) { + if (formatter) + formatter->dump_string("id", "head"); + else + cout << "head"; + } else { + if (formatter) + formatter->dump_unsigned("id", ci->cloneid); + else + cout << ci->cloneid; + } + + if (formatter) + formatter->open_array_section("snapshots"); + else + cout << "\t"; + + if (!formatter && ci->snaps.empty()) { + cout << "-"; + } + for (std::vector<snap_t>::const_iterator snapindex = ci->snaps.begin(); + snapindex != ci->snaps.end(); ++snapindex) { + + map<snap_t,string>::iterator si; + + if (formatter || pretty_format) si = snamemap.find(*snapindex); + + if (formatter) { + formatter->open_object_section("snapshot"); + formatter->dump_unsigned("id", *snapindex); + if (si != snamemap.end()) + formatter->dump_string("name", si->second); + formatter->close_section(); //snapshot + } else { + if (snapindex != ci->snaps.begin()) cout << ","; + if (!pretty_format || (si == snamemap.end())) + cout << *snapindex; + else + cout << si->second << "(" << *snapindex << ")"; + } + } + + if (formatter) { + formatter->close_section(); //Snapshots + formatter->dump_unsigned("size", ci->size); + } else { + cout << "\t" << ci->size; + } + + if (ci->cloneid != librados::SNAP_HEAD) { + if (formatter) + formatter->open_array_section("overlaps"); + else + cout << "\t["; + + for (std::vector< std::pair<uint64_t,uint64_t> >::iterator ovi = ci->overlap.begin(); + ovi != ci->overlap.end(); ++ovi) { + if (formatter) { + formatter->open_object_section("section"); + formatter->dump_unsigned("start", ovi->first); + formatter->dump_unsigned("length", ovi->second); + formatter->close_section(); //section + } else { + if (ovi != ci->overlap.begin()) cout << ","; + cout << ovi->first << "~" << ovi->second; + } + } + if (formatter) + formatter->close_section(); //overlaps + else + cout << "]" << std::endl; + } + if (formatter) 
formatter->close_section(); //clone + } + if (formatter) { + formatter->close_section(); //clones + formatter->close_section(); //object + formatter->flush(cout); + } else { + cout << std::endl; + } + } else if (strcmp(nargs[0], "list-inconsistent-pg") == 0) { + if (!formatter) { + formatter = std::make_unique<JSONFormatter>(pretty_format); + } + ret = do_get_inconsistent_pg_cmd(nargs, rados, *formatter); + } else if (strcmp(nargs[0], "list-inconsistent-obj") == 0) { + if (!formatter) { + formatter = std::make_unique<JSONFormatter>(pretty_format); + } + ret = do_get_inconsistent_cmd<inconsistent_obj_t>(nargs, rados, *formatter); + } else if (strcmp(nargs[0], "list-inconsistent-snapset") == 0) { + if (!formatter) { + formatter = std::make_unique<JSONFormatter>(pretty_format); + } + ret = do_get_inconsistent_cmd<inconsistent_snapset_t>(nargs, rados, *formatter); + } else if (strcmp(nargs[0], "cache-flush") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + if (with_clones) { + snap_set_t ls; + io_ctx.snap_set_read(LIBRADOS_SNAP_DIR); + ret = io_ctx.list_snaps(*obj_name, &ls); + if (ret < 0) { + cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + for (std::vector<clone_info_t>::iterator ci = ls.clones.begin(); + ci != ls.clones.end(); ++ci) { + if (snapid != CEPH_NOSNAP && ci->cloneid > snapid) + break; + io_ctx.snap_set_read(ci->cloneid); + ret = do_cache_flush(io_ctx, *obj_name); + if (ret < 0) { + cerr << "error from cache-flush " << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + } else { + ret = do_cache_flush(io_ctx, *obj_name); + if (ret < 0) { + cerr << "error from cache-flush " << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + } else if (strcmp(nargs[0], "cache-try-flush") == 0) { + if (!pool_name || 
(nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + if (with_clones) { + snap_set_t ls; + io_ctx.snap_set_read(LIBRADOS_SNAP_DIR); + ret = io_ctx.list_snaps(*obj_name, &ls); + if (ret < 0) { + cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + for (std::vector<clone_info_t>::iterator ci = ls.clones.begin(); + ci != ls.clones.end(); ++ci) { + if (snapid != CEPH_NOSNAP && ci->cloneid > snapid) + break; + io_ctx.snap_set_read(ci->cloneid); + ret = do_cache_try_flush(io_ctx, *obj_name); + if (ret < 0) { + cerr << "error from cache-flush " << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + } else { + ret = do_cache_try_flush(io_ctx, *obj_name); + if (ret < 0) { + cerr << "error from cache-flush " << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + } else if (strcmp(nargs[0], "cache-evict") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + if (with_clones) { + snap_set_t ls; + io_ctx.snap_set_read(LIBRADOS_SNAP_DIR); + ret = io_ctx.list_snaps(*obj_name, &ls); + if (ret < 0) { + cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + for (std::vector<clone_info_t>::iterator ci = ls.clones.begin(); + ci != ls.clones.end(); ++ci) { + if (snapid != CEPH_NOSNAP && ci->cloneid > snapid) + break; + io_ctx.snap_set_read(ci->cloneid); + ret = do_cache_evict(io_ctx, *obj_name); + if (ret < 0) { + cerr << "error from cache-flush " << prettify(*obj_name) << ": " + << cpp_strerror(ret) << std::endl; + return 1; + } + } + } else { + ret = do_cache_evict(io_ctx, *obj_name); + if (ret < 0) { + cerr << "error from cache-flush " << prettify(*obj_name) << ": " + << cpp_strerror(ret) << 
std::endl; + return 1; + } + } + } else if (strcmp(nargs[0], "cache-flush-evict-all") == 0) { + if (!pool_name) { + usage(cerr); + return 1; + } + ret = do_cache_flush_evict_all(io_ctx, true); + if (ret < 0) { + cerr << "cache-flush-evict-all finished with errors" << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "cache-try-flush-evict-all") == 0) { + if (!pool_name) { + usage(cerr); + return 1; + } + ret = do_cache_flush_evict_all(io_ctx, false); + if (ret < 0) { + cerr << "cache-try-flush-evict-all finished with errors" << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "set-redirect") == 0) { + if (!pool_name) { + usage(cerr); + return 1; + } + + const char *target = target_pool_name; + if (!target) + target = pool_name; + + const char *target_obj; + if (nargs.size() < 3) { + if (strcmp(target, pool_name) == 0) { + cerr << "cannot copy object into itself" << std::endl; + return 1; + } + target_obj = nargs[1]; + } else { + target_obj = nargs[2]; + } + + IoCtx target_ctx; + ret = rados.ioctx_create(target, target_ctx); + if (target_oloc.size()) { + target_ctx.locator_set_key(target_oloc); + } + if (target_nspace.size()) { + target_ctx.set_namespace(target_nspace); + } + + ObjectWriteOperation op; + if (with_reference) { + op.set_redirect(target_obj, target_ctx, 0, CEPH_OSD_OP_FLAG_WITH_REFERENCE); + } else { + op.set_redirect(target_obj, target_ctx, 0); + } + ret = io_ctx.operate(nargs[1], &op); + if (ret < 0) { + cerr << "error set-redirect " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "set-chunk") == 0) { + if (!pool_name) { + usage(cerr); + return 1; + } + + const char *target = target_pool_name; + if (!target) + target = pool_name; + + uint64_t offset; + uint64_t length; + uint64_t tgt_offset; + string tgt_oid; + if (nargs.size() < 6) { + usage(cerr); + return 1; + } else { + char* endptr = NULL; + offset = strtoull(nargs[2], 
&endptr, 10); + if (*endptr) { + cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl; + return 1; + } + length = strtoull(nargs[3], &endptr, 10); + if (*endptr) { + cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl; + return 1; + } + tgt_oid = string(nargs[4]); + tgt_offset = strtoull(nargs[5], &endptr, 10); + if (*endptr) { + cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl; + return 1; + } + } + + IoCtx target_ctx; + ret = rados.ioctx_create(target, target_ctx); + ObjectWriteOperation op; + if (with_reference) { + op.set_chunk(offset, length, target_ctx, tgt_oid, tgt_offset, CEPH_OSD_OP_FLAG_WITH_REFERENCE); + } else { + op.set_chunk(offset, length, target_ctx, tgt_oid, tgt_offset); + } + ret = io_ctx.operate(nargs[1], &op); + if (ret < 0) { + cerr << "error set-chunk " << pool_name << "/" << nargs[1] << " " << " offset " << offset + << " length " << length << " target_pool " << target + << "tgt_offset: " << tgt_offset << " : " << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "tier-promote") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + ObjectWriteOperation op; + op.tier_promote(); + ret = io_ctx.operate(*obj_name, &op); + if (ret < 0) { + cerr << "error tier-promote " << pool_name << "/" << prettify(*obj_name) << " : " + << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "unset-manifest") == 0) { + if (!pool_name || (nargs.size() < 2 && !obj_name)) { + usage(cerr); + return 1; + } + if (!obj_name) { + obj_name = nargs[1]; + } + ObjectWriteOperation op; + op.unset_manifest(); + ret = io_ctx.operate(*obj_name, &op); + if (ret < 0) { + cerr << "error unset-manifest " << pool_name << "/" << prettify(*obj_name) << " : " + << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "export") == 0) { + // export [filename] + if (!pool_name 
|| nargs.size() > 2) { + usage(cerr); + return 1; + } + + int file_fd; + if (nargs.size() < 2 || std::string(nargs[1]) == "-") { + file_fd = STDOUT_FILENO; + } else { + file_fd = open(nargs[1], O_WRONLY|O_CREAT|O_TRUNC, 0666); + if (file_fd < 0) { + cerr << "Error opening '" << nargs[1] << "': " + << cpp_strerror(file_fd) << std::endl; + return 1; + } + } + + ret = PoolDump(file_fd).dump(&io_ctx); + + if (file_fd != STDIN_FILENO) { + VOID_TEMP_FAILURE_RETRY(::close(file_fd)); + } + + if (ret < 0) { + cerr << "error from export: " + << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (strcmp(nargs[0], "import") == 0) { + // import [--no-overwrite] [--dry-run] <filename | - > + if (!pool_name || nargs.size() > 4 || nargs.size() < 2) { + usage(cerr); + return 1; + } + + // Last arg is the filename + std::string const filename = nargs[nargs.size() - 1]; + + // All other args may be flags + bool dry_run = false; + bool no_overwrite = false; + for (unsigned i = 1; i < nargs.size() - 1; ++i) { + std::string arg(nargs[i]); + + if (arg == std::string("--no-overwrite")) { + no_overwrite = true; + } else if (arg == std::string("--dry-run")) { + dry_run = true; + } else { + std::cerr << "Invalid argument '" << arg << "'" << std::endl; + return 1; + } + } + + int file_fd; + if (filename == "-") { + file_fd = STDIN_FILENO; + } else { + file_fd = open(filename.c_str(), O_RDONLY); + if (file_fd < 0) { + cerr << "Error opening '" << filename << "': " + << cpp_strerror(file_fd) << std::endl; + return 1; + } + } + + ret = RadosImport(file_fd, 0, dry_run).import(io_ctx, no_overwrite); + + if (file_fd != STDIN_FILENO) { + VOID_TEMP_FAILURE_RETRY(::close(file_fd)); + } + + if (ret < 0) { + cerr << "error from import: " + << cpp_strerror(ret) << std::endl; + return 1; + } + } else { + cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl; + ret = -EINVAL; + } + + if (ret < 0) + cerr << "error " << (-ret) << ": " << cpp_strerror(ret) << 
std::endl; + + return (ret < 0) ? 1 : 0; +} + +int main(int argc, const char **argv) +{ + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(cout); + exit(0); + } + + std::map < std::string, std::string > opts; + std::string val; + + // Necessary to support usage of -f for formatting, + // since global_init will remove the -f using ceph + // argparse procedures. + for (auto j = args.begin(); j != args.end(); ++j) { + if (strcmp(*j, "--") == 0) { + break; + } else if ((j+1) == args.end()) { + // This can't be a formatting call (no format arg) + break; + } else if (strcmp(*j, "-f") == 0) { + val = *(j+1); + unique_ptr<Formatter> formatter(Formatter::create(val.c_str())); + + if (formatter) { + j = args.erase(j); + opts["format"] = val; + + j = args.erase(j); + break; + } + } + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + std::vector<const char*>::iterator i; + for (i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_flag(args, i, "--force-full", (char*)NULL)) { + opts["force-full"] = "true"; + } else if (ceph_argparse_flag(args, i, "-d", "--delete-after", (char*)NULL)) { + opts["delete-after"] = "true"; + } else if (ceph_argparse_flag(args, i, "-C", "--create", "--create-pool", + (char*)NULL)) { + opts["create"] = "true"; + } else if (ceph_argparse_flag(args, i, "--pretty-format", (char*)NULL)) { + opts["pretty-format"] = "true"; + } else if (ceph_argparse_flag(args, i, "--show-time", (char*)NULL)) { + opts["show-time"] = "true"; + } else if (ceph_argparse_flag(args, i, "--no-cleanup", (char*)NULL)) { + opts["no-cleanup"] = "true"; + } else if (ceph_argparse_flag(args, i, "--no-hints", (char*)NULL)) { + opts["no-hints"] = "true"; + } else if 
(ceph_argparse_flag(args, i, "--reuse-bench", (char*)NULL)) { + opts["reuse-bench"] = "true"; + } else if (ceph_argparse_flag(args, i, "--no-verify", (char*)NULL)) { + opts["no-verify"] = "true"; + } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) { + opts["run-name"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) { + opts["prefix"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) { + opts["pool"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--target-pool", (char*)NULL)) { + opts["target_pool"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--object-locator" , (char *)NULL)) { + opts["object_locator"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--target-locator" , (char *)NULL)) { + opts["target_locator"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--target-nspace" , (char *)NULL)) { + opts["target_nspace"] = val; +#ifdef WITH_LIBRADOSSTRIPER + } else if (ceph_argparse_flag(args, i, "--striper" , (char *)NULL)) { + opts["striper"] = "true"; +#endif + } else if (ceph_argparse_witharg(args, i, &val, "-t", "--concurrent-ios", (char*)NULL)) { + opts["concurrent-ios"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--block-size", (char*)NULL)) { + opts["block-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "-b", (char*)NULL)) { + opts["block-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--object-size", (char*)NULL)) { + opts["object-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) { + opts["max-objects"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--offset", (char*)NULL)) { + opts["offset"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "-O", (char*)NULL)) { + opts["object-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "-s", "--snap", (char*)NULL)) { + opts["snap"] = val; + } else if 
(ceph_argparse_witharg(args, i, &val, "-S", "--snapid", (char*)NULL)) { + opts["snapid"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--min-object-size", (char*)NULL)) { + opts["min-object-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-object-size", (char*)NULL)) { + opts["max-object-size"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--min-op-len", (char*)NULL)) { + opts["min-op-len"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-op-len", (char*)NULL)) { + opts["max-op-len"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-ops", (char*)NULL)) { + opts["max-ops"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-backlog", (char*)NULL)) { + opts["max-backlog"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--target-throughput", (char*)NULL)) { + opts["target-throughput"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--offset-align", (char*)NULL)) { + opts["offset_align"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--read-percent", (char*)NULL)) { + opts["read-percent"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--num-objects", (char*)NULL)) { + opts["num-objects"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--run-length", (char*)NULL)) { + opts["run-length"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--workers", (char*)NULL)) { + opts["workers"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) { + opts["format"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--lock-tag", (char*)NULL)) { + opts["lock-tag"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--lock-cookie", (char*)NULL)) { + opts["lock-cookie"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--lock-description", (char*)NULL)) { + opts["lock-description"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--lock-duration", (char*)NULL)) { + 
opts["lock-duration"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--lock-type", (char*)NULL)) { + opts["lock-type"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "-N", "--namespace", (char*)NULL)) { + opts["namespace"] = val; + } else if (ceph_argparse_flag(args, i, "--all", (char*)NULL)) { + opts["all"] = "true"; + } else if (ceph_argparse_flag(args, i, "--default", (char*)NULL)) { + opts["default"] = "true"; + } else if (ceph_argparse_witharg(args, i, &val, "-o", "--output", (char*)NULL)) { + opts["output"] = val; + } else if (ceph_argparse_flag(args, i, "--write-omap", (char*)NULL)) { + opts["write-dest-omap"] = "true"; + } else if (ceph_argparse_flag(args, i, "--write-object", (char*)NULL)) { + opts["write-dest-obj"] = "true"; + } else if (ceph_argparse_flag(args, i, "--write-xattr", (char*)NULL)) { + opts["write-dest-xattr"] = "true"; + } else if (ceph_argparse_flag(args, i, "--with-clones", (char*)NULL)) { + opts["with-clones"] = "true"; + } else if (ceph_argparse_witharg(args, i, &val, "--omap-key-file", (char*)NULL)) { + opts["omap-key-file"] = val; + } else if (ceph_argparse_witharg(args, i, &val, "--obj-name-file", (char*)NULL)) { + opts["obj-name-file"] = val; + } else if (ceph_argparse_flag(args, i, "--with-reference", (char*)NULL)) { + opts["with-reference"] = "true"; + } else if (ceph_argparse_witharg(args, i, &val, "--pgid", (char*)NULL)) { + opts["pgid"] = val; + } else { + if (val[0] == '-') + usage_exit(); + ++i; + } + } + + if (args.empty()) { + cerr << "rados: you must give an action. 
/*
 * Render a byte buffer as lowercase hexadecimal text.
 *
 * buf - bytes to convert; exactly len bytes are read
 * len - number of bytes in buf (nothing is read when len <= 0)
 * str - destination; receives two characters per input byte followed by a
 *       terminating NUL, so it must hold at least 2 * len + 1 characters
 */
void buf_to_hex(const unsigned char *buf, int len, char *str)
{
  static const char hex_digits[] = "0123456789abcdef";

  char *out = str;
  *out = '\0';  // result stays a valid empty string when there is no input
  for (int pos = 0; pos < len; ++pos) {
    const unsigned char byte = buf[pos];
    out[0] = hex_digits[byte >> 4];
    out[1] = hex_digits[byte & 0x0f];
    out[2] = '\0';  // keep the output terminated after every byte
    out += 2;
  }
}
map<ACLID, ACLFlags>::iterator iter = acls_map.find(id); + + if (iter == acls_map.end()) + return -ENOENT; + + *flags = iter->second; + + return 0; +} + +void ObjectACLs::set_acl(ACLID& id, ACLFlags flags) +{ + acls_map[id] = flags; +} + + + +class ACLEntity +{ + string name; + map<ACLID, ACLEntity> groups; +}; + +typedef map<ACLID, ACLEntity> tACLIDEntityMap; + +static map<ACLID, ACLEntity> users; +static map<ACLID, ACLEntity> groups; + +void get_user(ACLID& aclid, ACLEntity *entity) +{ + //users.find(aclid); +} + + + + + +int main(int argc, const char **argv) +{ + Rados rados; + if (rados.init(NULL) < 0) { + cerr << "couldn't initialize rados!" << std::endl; + exit(1); + } + if (rados.conf_read_file(NULL)) { + cerr << "couldn't read Ceph configuration file!" << std::endl; + exit(1); + } + if (rados.connect() < 0) { + cerr << "couldn't connect to cluster!" << std::endl; + exit(1); + } + + time_t tm; + bufferlist bl, bl2; + char buf[128]; + + time(&tm); + snprintf(buf, 128, "%s", ctime(&tm)); + bl.append(buf, strlen(buf)); + + const char *oid = "bar"; + + IoCtx io_ctx; + int r = rados.ioctx_create("data", io_ctx); + cout << "open io_ctx result = " << r << " pool = " << io_ctx.get_pool_name() << std::endl; + + ACLID id; + + snprintf(id.id, sizeof(id.id), "%.8x", 0x1234); + cout << "id=" << id.id << std::endl; + + r = io_ctx.exec(oid, "acl", "get", bl, bl2); + cout << "exec(acl get) returned " << r + << " len=" << bl2.length() << std::endl; + ObjectACLs oa; + if (r >= 0) { + auto iter = bl2.cbegin(); + oa.decode(iter); + } + + oa.set_acl(id, ACL_RD); + bl.clear(); + oa.encode(bl); + r = io_ctx.exec(oid, "acl", "set", bl, bl2); + cout << "exec(acl set) returned " << r + << " len=" << bl2.length() << std::endl; + + const unsigned char *md5 = (const unsigned char *)bl2.c_str(); + char md5_str[bl2.length()*2 + 1]; + buf_to_hex(md5, bl2.length(), md5_str); + cout << "md5 result=" << md5_str << std::endl; + + int size = io_ctx.read(oid, bl2, 128, 0); + cout << "read 
result=" << bl2.c_str() << std::endl; + cout << "size=" << size << std::endl; + + return 0; +} + diff --git a/src/tools/rbd/ArgumentTypes.cc b/src/tools/rbd/ArgumentTypes.cc new file mode 100644 index 00000000..ae5f9fd7 --- /dev/null +++ b/src/tools/rbd/ArgumentTypes.cc @@ -0,0 +1,515 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/rbd/features.h" +#include "common/config_proxy.h" +#include "common/strtol.h" +#include "common/Formatter.h" +#include "global/global_context.h" +#include <iostream> +#include <boost/tokenizer.hpp> + +namespace rbd { +namespace argument_types { + +namespace po = boost::program_options; + +const std::map<uint64_t, std::string> ImageFeatures::FEATURE_MAPPING = { + {RBD_FEATURE_LAYERING, RBD_FEATURE_NAME_LAYERING}, + {RBD_FEATURE_STRIPINGV2, RBD_FEATURE_NAME_STRIPINGV2}, + {RBD_FEATURE_EXCLUSIVE_LOCK, RBD_FEATURE_NAME_EXCLUSIVE_LOCK}, + {RBD_FEATURE_OBJECT_MAP, RBD_FEATURE_NAME_OBJECT_MAP}, + {RBD_FEATURE_FAST_DIFF, RBD_FEATURE_NAME_FAST_DIFF}, + {RBD_FEATURE_DEEP_FLATTEN, RBD_FEATURE_NAME_DEEP_FLATTEN}, + {RBD_FEATURE_JOURNALING, RBD_FEATURE_NAME_JOURNALING}, + {RBD_FEATURE_DATA_POOL, RBD_FEATURE_NAME_DATA_POOL}, + {RBD_FEATURE_OPERATIONS, RBD_FEATURE_NAME_OPERATIONS}, + {RBD_FEATURE_MIGRATING, RBD_FEATURE_NAME_MIGRATING}, +}; + +Format::Formatter Format::create_formatter(bool pretty) const { + if (value == "json") { + return Formatter(new JSONFormatter(pretty)); + } else if (value == "xml") { + return Formatter(new XMLFormatter(pretty)); + } + return Formatter(); +} + +std::string get_name_prefix(ArgumentModifier modifier) { + switch (modifier) { + case ARGUMENT_MODIFIER_SOURCE: + return SOURCE_PREFIX; + case ARGUMENT_MODIFIER_DEST: + return DEST_PREFIX; + default: + return ""; + } +} + +std::string get_description_prefix(ArgumentModifier modifier) { + switch 
(modifier) { + case ARGUMENT_MODIFIER_SOURCE: + return "source "; + case ARGUMENT_MODIFIER_DEST: + return "destination "; + default: + return ""; + } +} + +void add_pool_option(po::options_description *opt, + ArgumentModifier modifier, + const std::string &desc_suffix) { + std::string name = POOL_NAME + ",p"; + std::string description = "pool name"; + switch (modifier) { + case ARGUMENT_MODIFIER_NONE: + break; + case ARGUMENT_MODIFIER_SOURCE: + description = "source " + description; + break; + case ARGUMENT_MODIFIER_DEST: + name = DEST_POOL_NAME; + description = "destination " + description; + break; + } + description += desc_suffix; + + // TODO add validator + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_namespace_option(boost::program_options::options_description *opt, + ArgumentModifier modifier) { + std::string name = NAMESPACE_NAME; + std::string description = "namespace name"; + switch (modifier) { + case ARGUMENT_MODIFIER_NONE: + break; + case ARGUMENT_MODIFIER_SOURCE: + description = "source " + description; + break; + case ARGUMENT_MODIFIER_DEST: + name = DEST_NAMESPACE_NAME; + description = "destination " + description; + break; + } + + // TODO add validator + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_image_option(po::options_description *opt, + ArgumentModifier modifier, + const std::string &desc_suffix) { + std::string name = IMAGE_NAME; + std::string description = "image name"; + switch (modifier) { + case ARGUMENT_MODIFIER_NONE: + break; + case ARGUMENT_MODIFIER_SOURCE: + description = "source " + description; + break; + case ARGUMENT_MODIFIER_DEST: + name = DEST_IMAGE_NAME; + description = "destination " + description; + break; + } + description += desc_suffix; + + // TODO add validator + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_image_id_option(po::options_description *opt, + const 
std::string &desc_suffix) { + std::string name = IMAGE_ID; + std::string description = "image id"; + description += desc_suffix; + + // TODO add validator + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_snap_option(po::options_description *opt, + ArgumentModifier modifier) { + + std::string name = SNAPSHOT_NAME; + std::string description = "snapshot name"; + switch (modifier) { + case ARGUMENT_MODIFIER_NONE: + break; + case ARGUMENT_MODIFIER_DEST: + name = DEST_SNAPSHOT_NAME; + description = "destination " + description; + break; + case ARGUMENT_MODIFIER_SOURCE: + description = "source " + description; + break; + } + + // TODO add validator + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_snap_id_option(po::options_description *opt) { + opt->add_options() + (SNAPSHOT_ID.c_str(), po::value<uint64_t>(), "snapshot id"); +} + +void add_pool_options(boost::program_options::options_description *pos, + boost::program_options::options_description *opt, + bool namespaces_supported) { + opt->add_options() + ((POOL_NAME + ",p").c_str(), po::value<std::string>(), "pool name"); + if (namespaces_supported) { + add_namespace_option(opt, ARGUMENT_MODIFIER_NONE); + pos->add_options() + ("pool-spec", "pool specification\n" + "(example: <pool-name>[/<namespace>]"); + } else { + pos->add_options() + ("pool-name", "pool name"); + } +} + +void add_image_spec_options(po::options_description *pos, + po::options_description *opt, + ArgumentModifier modifier) { + pos->add_options() + ((get_name_prefix(modifier) + IMAGE_SPEC).c_str(), + (get_description_prefix(modifier) + "image specification\n" + + "(example: [<pool-name>/[<namespace>/]]<image-name>)").c_str()); + add_pool_option(opt, modifier); + add_namespace_option(opt, modifier); + add_image_option(opt, modifier); +} + +void add_snap_spec_options(po::options_description *pos, + po::options_description *opt, + ArgumentModifier 
modifier) { + pos->add_options() + ((get_name_prefix(modifier) + SNAPSHOT_SPEC).c_str(), + (get_description_prefix(modifier) + "snapshot specification\n" + + "(example: [<pool-name>/[<namespace>/]]<image-name>@<snapshot-name>)").c_str()); + add_pool_option(opt, modifier); + add_namespace_option(opt, modifier); + add_image_option(opt, modifier); + add_snap_option(opt, modifier); +} + +void add_image_or_snap_spec_options(po::options_description *pos, + po::options_description *opt, + ArgumentModifier modifier) { + pos->add_options() + ((get_name_prefix(modifier) + IMAGE_OR_SNAPSHOT_SPEC).c_str(), + (get_description_prefix(modifier) + "image or snapshot specification\n" + + "(example: [<pool-name>/[<namespace>/]]<image-name>[@<snap-name>])").c_str()); + add_pool_option(opt, modifier); + add_namespace_option(opt, modifier); + add_image_option(opt, modifier); + add_snap_option(opt, modifier); +} + +void add_create_image_options(po::options_description *opt, + bool include_format) { + // TODO get default image format from conf + if (include_format) { + opt->add_options() + (IMAGE_FORMAT.c_str(), po::value<ImageFormat>(), + "image format [1 (deprecated) or 2]") + (IMAGE_NEW_FORMAT.c_str(), + po::value<ImageNewFormat>()->zero_tokens(), + "use image format 2\n(deprecated)"); + } + + opt->add_options() + (IMAGE_ORDER.c_str(), po::value<ImageOrder>(), + "object order [12 <= order <= 25]") + (IMAGE_OBJECT_SIZE.c_str(), po::value<ImageObjectSize>(), + "object size in B/K/M [4K <= object size <= 32M]") + (IMAGE_FEATURES.c_str(), po::value<ImageFeatures>()->composing(), + ("image features\n" + get_short_features_help(true)).c_str()) + (IMAGE_SHARED.c_str(), po::bool_switch(), "shared image") + (IMAGE_STRIPE_UNIT.c_str(), po::value<ImageObjectSize>(), "stripe unit in B/K/M") + (IMAGE_STRIPE_COUNT.c_str(), po::value<uint64_t>(), "stripe count") + (IMAGE_DATA_POOL.c_str(), po::value<std::string>(), "data pool"); + + add_create_journal_options(opt); +} + +void 
add_create_journal_options(po::options_description *opt) { + opt->add_options() + (JOURNAL_SPLAY_WIDTH.c_str(), po::value<uint64_t>(), + "number of active journal objects") + (JOURNAL_OBJECT_SIZE.c_str(), po::value<JournalObjectSize>(), + "size of journal objects [4K <= size <= 64M]") + (JOURNAL_POOL.c_str(), po::value<std::string>(), + "pool for journal objects"); +} + +void add_size_option(boost::program_options::options_description *opt) { + opt->add_options() + ((IMAGE_SIZE + ",s").c_str(), po::value<ImageSize>()->required(), + "image size (in M/G/T) [default: M]"); +} + +void add_sparse_size_option(boost::program_options::options_description *opt) { + opt->add_options() + (IMAGE_SPARSE_SIZE.c_str(), po::value<ImageObjectSize>(), + "sparse size in B/K/M [default: 4K]"); +} + +void add_path_options(boost::program_options::options_description *pos, + boost::program_options::options_description *opt, + const std::string &description) { + pos->add_options() + (PATH_NAME.c_str(), po::value<std::string>(), description.c_str()); + opt->add_options() + (PATH.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_limit_option(po::options_description *opt) { + std::string description = "maximum allowed snapshot count"; + + opt->add_options() + (LIMIT.c_str(), po::value<uint64_t>(), description.c_str()); +} + +void add_no_progress_option(boost::program_options::options_description *opt) { + opt->add_options() + (NO_PROGRESS.c_str(), po::bool_switch(), "disable progress output"); +} + +void add_format_options(boost::program_options::options_description *opt) { + opt->add_options() + (FORMAT.c_str(), po::value<Format>(), "output format (plain, json, or xml) [default: plain]") + (PRETTY_FORMAT.c_str(), po::bool_switch(), + "pretty formatting (json and xml)"); +} + +void add_verbose_option(boost::program_options::options_description *opt) { + opt->add_options() + (VERBOSE.c_str(), po::bool_switch(), "be verbose"); +} + +void 
add_no_error_option(boost::program_options::options_description *opt) { + opt->add_options() + (NO_ERROR.c_str(), po::bool_switch(), "continue after error"); +} + +void add_export_format_option(boost::program_options::options_description *opt) { + opt->add_options() + ("export-format", po::value<ExportFormat>(), "format of image file"); +} + +void add_flatten_option(boost::program_options::options_description *opt) { + opt->add_options() + (IMAGE_FLATTEN.c_str(), po::bool_switch(), + "fill clone with parent data (make it independent)"); +} + +std::string get_short_features_help(bool append_suffix) { + std::ostringstream oss; + bool first_feature = true; + oss << "["; + for (auto &pair : ImageFeatures::FEATURE_MAPPING) { + if ((pair.first & RBD_FEATURES_IMPLICIT_ENABLE) != 0ULL) { + // hide implicitly enabled features from list + continue; + } else if (!append_suffix && (pair.first & RBD_FEATURES_MUTABLE) == 0ULL) { + // hide non-mutable features for the 'rbd feature XYZ' command + continue; + } + + if (!first_feature) { + oss << ", "; + } + first_feature = false; + + std::string suffix; + if (append_suffix) { + if ((pair.first & rbd::utils::get_rbd_default_features(g_ceph_context)) != 0) { + suffix += "+"; + } + if ((pair.first & RBD_FEATURES_MUTABLE) != 0) { + suffix += "*"; + } else if ((pair.first & RBD_FEATURES_DISABLE_ONLY) != 0) { + suffix += "-"; + } + if (!suffix.empty()) { + suffix = "(" + suffix + ")"; + } + } + oss << pair.second << suffix; + } + oss << "]"; + return oss.str(); +} + +std::string get_long_features_help() { + std::ostringstream oss; + oss << "Image Features:" << std::endl + << " (*) supports enabling/disabling on existing images" << std::endl + << " (-) supports disabling-only on existing images" << std::endl + << " (+) enabled by default for new images if features not specified" + << std::endl; + return oss.str(); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ImageSize *target_type, int) { + 
po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + std::string parse_error; + uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + + //NOTE: We can remove below given three lines of code once all applications, + //which use this CLI will adopt B/K/M/G/T/P/E with size value + if (isdigit(*s.rbegin())) { + size = size << 20; // Default MB to Bytes + } + v = boost::any(size); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ImageOrder *target_type, int dummy) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + try { + uint64_t order = boost::lexical_cast<uint64_t>(s); + if (order >= 12 && order <= 25) { + v = boost::any(order); + return; + } + } catch (const boost::bad_lexical_cast &) { + } + throw po::validation_error(po::validation_error::invalid_option_value); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ImageObjectSize *target_type, int dummy) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + std::string parse_error; + uint64_t objectsize = strict_iecstrtoll(s.c_str(), &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(objectsize); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ImageFormat *target_type, int dummy) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + try { + uint32_t format = boost::lexical_cast<uint32_t>(s); + if (format == 1 || format == 2) { + v = boost::any(format); + return; + } + } catch (const boost::bad_lexical_cast &) { + } + throw 
po::validation_error(po::validation_error::invalid_option_value); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ImageNewFormat *target_type, int dummy) { + std::cout << "rbd: --new-format is deprecated, use --image-format" + << std::endl; + v = boost::any(true); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ImageFeatures *target_type, int) { + if (v.empty()) { + v = boost::any(static_cast<uint64_t>(0)); + } + + uint64_t &features = boost::any_cast<uint64_t &>(v); + for (auto &value : values) { + boost::char_separator<char> sep(","); + boost::tokenizer<boost::char_separator<char> > tok(value, sep); + for (auto &token : tok) { + bool matched = false; + for (auto &it : ImageFeatures::FEATURE_MAPPING) { + if (token == it.second) { + features |= it.first; + matched = true; + break; + } + } + + if (!matched) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + } + } +} + +void validate(boost::any& v, const std::vector<std::string>& values, + Format *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + if (s == "plain" || s == "json" || s == "xml") { + v = boost::any(Format(s)); + } else { + throw po::validation_error(po::validation_error::invalid_option_value); + } +} + +void validate(boost::any& v, const std::vector<std::string>& values, + JournalObjectSize *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + std::string parse_error; + uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error); + if (parse_error.empty() && (size >= (1 << 12)) && (size <= (1 << 26))) { + v = boost::any(size); + return; + } + throw po::validation_error(po::validation_error::invalid_option_value); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + ExportFormat *target_type, int) { + 
po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + std::string parse_error; + uint64_t format = strict_iecstrtoll(s.c_str(), &parse_error); + if (!parse_error.empty() || (format != 1 && format != 2)) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + + v = boost::any(format); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + Secret *target_type, int) { + std::cerr << "rbd: --secret is deprecated, use --keyfile" << std::endl; + + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + g_conf().set_val_or_die("keyfile", s.c_str()); + v = boost::any(s); +} + +} // namespace argument_types +} // namespace rbd diff --git a/src/tools/rbd/ArgumentTypes.h b/src/tools/rbd/ArgumentTypes.h new file mode 100644 index 00000000..23bb02b9 --- /dev/null +++ b/src/tools/rbd/ArgumentTypes.h @@ -0,0 +1,218 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_ARGUMENT_TYPES_H +#define CEPH_RBD_ARGUMENT_TYPES_H + +#include "include/int_types.h" +#include <set> +#include <string> +#include <vector> +#include <boost/any.hpp> +#include <boost/program_options.hpp> +#include <boost/shared_ptr.hpp> + +namespace ceph { class Formatter; } + +namespace rbd { +namespace argument_types { + +enum ArgumentModifier { + ARGUMENT_MODIFIER_NONE, + ARGUMENT_MODIFIER_SOURCE, + ARGUMENT_MODIFIER_DEST +}; + +enum SpecFormat { + SPEC_FORMAT_IMAGE, + SPEC_FORMAT_SNAPSHOT, + SPEC_FORMAT_IMAGE_OR_SNAPSHOT +}; + +static const std::string SOURCE_PREFIX("source-"); +static const std::string DEST_PREFIX("dest-"); + +// positional arguments +static const std::string POSITIONAL_COMMAND_SPEC("positional-command-spec"); +static const std::string POSITIONAL_ARGUMENTS("positional-arguments"); +static const std::string IMAGE_SPEC("image-spec"); +static const std::string 
SNAPSHOT_SPEC("snap-spec"); +static const std::string IMAGE_OR_SNAPSHOT_SPEC("image-or-snap-spec"); +static const std::string PATH_NAME("path-name"); +static const std::string IMAGE_ID("image-id"); + +// optional arguments +static const std::string CONFIG_PATH("conf"); +static const std::string POOL_NAME("pool"); +static const std::string DEST_POOL_NAME("dest-pool"); +static const std::string NAMESPACE_NAME("namespace"); +static const std::string DEST_NAMESPACE_NAME("dest-namespace"); +static const std::string IMAGE_NAME("image"); +static const std::string DEST_IMAGE_NAME("dest"); +static const std::string SNAPSHOT_NAME("snap"); +static const std::string SNAPSHOT_ID("snap-id"); +static const std::string DEST_SNAPSHOT_NAME("dest-snap"); +static const std::string PATH("path"); +static const std::string FROM_SNAPSHOT_NAME("from-snap"); +static const std::string WHOLE_OBJECT("whole-object"); + +static const std::string IMAGE_FORMAT("image-format"); +static const std::string IMAGE_NEW_FORMAT("new-format"); +static const std::string IMAGE_ORDER("order"); +static const std::string IMAGE_OBJECT_SIZE("object-size"); +static const std::string IMAGE_FEATURES("image-feature"); +static const std::string IMAGE_SHARED("image-shared"); +static const std::string IMAGE_SIZE("size"); +static const std::string IMAGE_STRIPE_UNIT("stripe-unit"); +static const std::string IMAGE_STRIPE_COUNT("stripe-count"); +static const std::string IMAGE_DATA_POOL("data-pool"); +static const std::string IMAGE_SPARSE_SIZE("sparse-size"); +static const std::string IMAGE_THICK_PROVISION("thick-provision"); +static const std::string IMAGE_FLATTEN("flatten"); + +static const std::string JOURNAL_OBJECT_SIZE("journal-object-size"); +static const std::string JOURNAL_SPLAY_WIDTH("journal-splay-width"); +static const std::string JOURNAL_POOL("journal-pool"); + +static const std::string NO_PROGRESS("no-progress"); +static const std::string FORMAT("format"); +static const std::string PRETTY_FORMAT("pretty-format"); 
// Wrapper that stores a copy of a value of type ValueT in a public `value`
// member.  The constructor is intentionally not `explicit`, so a ValueT
// converts implicitly to the wrapper.
template <typename ValueT>
struct TypedValue {
  ValueT value;

  TypedValue(const ValueT& initial)
    : value(initial) {
  }
};
boost::program_options::options_description *opt, + bool namespaces_supported); + +void add_image_spec_options(boost::program_options::options_description *pos, + boost::program_options::options_description *opt, + ArgumentModifier modifier); + +void add_snap_spec_options(boost::program_options::options_description *pos, + boost::program_options::options_description *opt, + ArgumentModifier modifier); + +void add_image_or_snap_spec_options( + boost::program_options::options_description *pos, + boost::program_options::options_description *opt, + ArgumentModifier modifier); + +void add_create_image_options(boost::program_options::options_description *opt, + bool include_format); + +void add_create_journal_options( + boost::program_options::options_description *opt); + +void add_size_option(boost::program_options::options_description *opt); + +void add_sparse_size_option(boost::program_options::options_description *opt); + +void add_path_options(boost::program_options::options_description *pos, + boost::program_options::options_description *opt, + const std::string &description); + +void add_limit_option(boost::program_options::options_description *opt); + +void add_no_progress_option(boost::program_options::options_description *opt); + +void add_format_options(boost::program_options::options_description *opt); + +void add_verbose_option(boost::program_options::options_description *opt); + +void add_no_error_option(boost::program_options::options_description *opt); + +void add_flatten_option(boost::program_options::options_description *opt); + +std::string get_short_features_help(bool append_suffix); +std::string get_long_features_help(); + +void validate(boost::any& v, const std::vector<std::string>& values, + ExportFormat *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + ImageSize *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + ImageOrder *target_type, int); +void 
validate(boost::any& v, const std::vector<std::string>& values, + ImageObjectSize *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + ImageFormat *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + ImageNewFormat *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + ImageFeatures *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + Format *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + JournalObjectSize *target_type, int); +void validate(boost::any& v, const std::vector<std::string>& values, + Secret *target_type, int); + + +std::ostream &operator<<(std::ostream &os, const ImageFeatures &features); + +} // namespace argument_types +} // namespace rbd + +#endif // CEPH_RBD_ARGUMENT_TYPES_H diff --git a/src/tools/rbd/CMakeLists.txt b/src/tools/rbd/CMakeLists.txt new file mode 100644 index 00000000..0e38a033 --- /dev/null +++ b/src/tools/rbd/CMakeLists.txt @@ -0,0 +1,65 @@ +set(CURSES_NEED_NCURSES TRUE) +find_package(Curses REQUIRED) + +set(rbd_srcs + rbd.cc + ArgumentTypes.cc + IndentStream.cc + MirrorDaemonServiceInfo.cc + OptionPrinter.cc + Shell.cc + Utils.cc + action/Bench.cc + action/Children.cc + action/Clone.cc + action/Config.cc + action/Copy.cc + action/Create.cc + action/Device.cc + action/Diff.cc + action/DiskUsage.cc + action/Export.cc + action/Feature.cc + action/Flatten.cc + action/Ggate.cc + action/Group.cc + action/ImageMeta.cc + action/Import.cc + action/Info.cc + action/Journal.cc + action/Kernel.cc + action/List.cc + action/Lock.cc + action/MergeDiff.cc + action/Migration.cc + action/MirrorPool.cc + action/MirrorImage.cc + action/Namespace.cc + action/Nbd.cc + action/ObjectMap.cc + action/Perf.cc + action/Pool.cc + action/Remove.cc + action/Rename.cc + action/Resize.cc + action/Snap.cc + action/Sparsify.cc + action/Status.cc + action/Trash.cc + 
action/Watch.cc) + +add_executable(rbd ${rbd_srcs} + $<TARGET_OBJECTS:common_texttable_obj>) +set_target_properties(rbd PROPERTIES OUTPUT_NAME rbd) +target_link_libraries(rbd librbd librados + cls_journal_client cls_rbd_client + rbd_types + journal + ceph-common global ${CURSES_LIBRARIES} + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) +if(WITH_KRBD) + target_link_libraries(rbd + krbd) +endif() + +install(TARGETS rbd DESTINATION bin) diff --git a/src/tools/rbd/IndentStream.cc b/src/tools/rbd/IndentStream.cc new file mode 100644 index 00000000..83591a8c --- /dev/null +++ b/src/tools/rbd/IndentStream.cc @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/IndentStream.h" + +namespace rbd { + +int IndentBuffer::overflow (int c) { + if (traits_type::eq_int_type(traits_type::eof(), c)) { + return traits_type::not_eof(c); + } + + int r; + switch (c) { + case '\n': + m_buffer += c; + flush_line(); + r = m_streambuf->sputn(m_buffer.c_str(), m_buffer.size()); + m_buffer.clear(); + return r; + case '\t': + // convert tab to single space and fall-through + c = ' '; + default: + if (m_indent + m_buffer.size() >= m_line_length) { + size_t word_offset = m_buffer.find_last_of(m_delim); + bool space_delim = (m_delim == " "); + if (word_offset == std::string::npos && !space_delim) { + word_offset = m_buffer.find_last_of(" "); + } + + if (word_offset != std::string::npos) { + flush_line(); + m_streambuf->sputn(m_buffer.c_str(), word_offset); + m_buffer = std::string(m_buffer, + word_offset + (space_delim ? 
1 : 0)); + } else { + flush_line(); + m_streambuf->sputn(m_buffer.c_str(), m_buffer.size()); + m_buffer.clear(); + } + m_streambuf->sputc('\n'); + } + m_buffer += c; + return c; + } +} + +void IndentBuffer::flush_line() { + if (m_initial_offset >= m_indent) { + m_initial_offset = 0; + m_streambuf->sputc('\n'); + } + + m_streambuf->sputn(m_indent_prefix.c_str(), m_indent - m_initial_offset); + m_initial_offset = 0; +} + +} // namespace rbd diff --git a/src/tools/rbd/IndentStream.h b/src/tools/rbd/IndentStream.h new file mode 100644 index 00000000..85ccc85b --- /dev/null +++ b/src/tools/rbd/IndentStream.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_INDENT_STREAM_H +#define CEPH_RBD_INDENT_STREAM_H + +#include "include/int_types.h" +#include <iostream> +#include <streambuf> +#include <iomanip> + +namespace rbd { + +class IndentBuffer : public std::streambuf { +public: + IndentBuffer(size_t indent, size_t initial_offset, size_t line_length, + std::streambuf *streambuf) + : m_indent(indent), m_initial_offset(initial_offset), + m_line_length(line_length), m_streambuf(streambuf), + m_delim(" "), m_indent_prefix(m_indent, ' ') { + } + + void set_delimiter(const std::string &delim) { + m_delim = delim; + } + +protected: + int overflow (int c) override; + +private: + size_t m_indent; + size_t m_initial_offset; + size_t m_line_length; + std::streambuf *m_streambuf; + + std::string m_delim; + std::string m_indent_prefix; + std::string m_buffer; + + void flush_line(); +}; + +class IndentStream : public std::ostream { +public: + IndentStream(size_t indent, size_t initial_offset, size_t line_length, + std::ostream &os) + : std::ostream(&m_indent_buffer), + m_indent_buffer(indent, initial_offset, line_length, os.rdbuf()) { + } + + void set_delimiter(const std::string &delim) { + m_indent_buffer.set_delimiter(delim); + } +private: + IndentBuffer m_indent_buffer; +}; + +} // namespace rbd + 
+#endif // CEPH_RBD_INDENT_STREAM_ITERATOR_H diff --git a/src/tools/rbd/MirrorDaemonServiceInfo.cc b/src/tools/rbd/MirrorDaemonServiceInfo.cc new file mode 100644 index 00000000..4870c1b2 --- /dev/null +++ b/src/tools/rbd/MirrorDaemonServiceInfo.cc @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_json.h" +#include "common/errno.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "tools/rbd/MirrorDaemonServiceInfo.h" + +#include <boost/scope_exit.hpp> +#include <iostream> + +namespace rbd { + +int MirrorDaemonServiceInfo::init() { + + std::string cmd = "{\"prefix\": \"service dump\"}"; + + bufferlist in_bl; + bufferlist out_bl; + int r = librados::Rados(m_io_ctx).mgr_command(cmd, in_bl, &out_bl, nullptr); + if (r < 0) { + std::cerr << "rbd: failed to get service dump: " << cpp_strerror(r) + << std::endl; + return r; + } + + bool json_valid = false; + json_spirit::mValue json_root; + if (json_spirit::read(out_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_obj(); + if (json_obj.count("services")) { + auto &services = json_obj["services"].get_obj(); + if (services.count("rbd-mirror")) { + auto &mirror_service = services["rbd-mirror"].get_obj(); + if (mirror_service.count("daemons")) { + for (auto &it : mirror_service["daemons"].get_obj()) { + if (it.second.type() != json_spirit::obj_type || + !it.second.get_obj().count("metadata")) { + continue; + } + auto &service_id = it.first; + auto &daemon_metadata = it.second.get_obj()["metadata"].get_obj(); + for (auto &iter : daemon_metadata) { + if (iter.second.type() != json_spirit::str_type) { + continue; + } + m_daemons_metadata[service_id][iter.first] = iter.second.get_str(); + } + } + } + } + } + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + std::cerr << "rbd: failed to parse service status" << std::endl; + return -EBADMSG; + } + + cmd = 
"{\"prefix\": \"service status\"}"; + + out_bl.clear(); + r = librados::Rados(m_io_ctx).mgr_command(cmd, in_bl, &out_bl, nullptr); + if (r < 0) { + std::cerr << "rbd: failed to get service status: " << cpp_strerror(r) + << std::endl; + return r; + } + + json_valid = false; + if (json_spirit::read(out_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_obj(); + if (json_obj.count("rbd-mirror")) { + auto &mirror_service = json_obj["rbd-mirror"].get_obj(); + for (auto &it : mirror_service) { + auto &service_id = it.first; + auto &daemon = it.second.get_obj(); + if (daemon.count("status") && + daemon["status"].get_obj().count("json")) { + auto& status_json_str = + daemon["status"].get_obj()["json"].get_str(); + json_spirit::mValue status_json_root; + if (json_spirit::read(status_json_str, status_json_root)) { + auto& status = status_json_root.get_obj(); + auto iter = status.find(stringify(m_io_ctx.get_id())); + if (iter != status.end() && + iter->second.get_obj().count("instance_id")) { + auto &instance_id = + iter->second.get_obj()["instance_id"].get_str(); + m_instance_id_to_service_id[instance_id] = service_id; + } + } + } + } + } + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + std::cerr << "rbd: failed to parse service status" << std::endl; + return -EBADMSG; + } + + return 0; +} + +std::string MirrorDaemonServiceInfo::get_description( + const std::string &instance_id) const { + if (!m_instance_id_to_service_id.count(instance_id)) { + return {}; + } + + auto service_id = m_instance_id_to_service_id.find(instance_id)->second; + + auto it = m_daemons_metadata.find(service_id); + if (it == m_daemons_metadata.end()) { + return service_id; + } + + auto &metadata = it->second; + auto iter = metadata.find("id"); + std::string description = (iter != metadata.end()) ? 
+ iter->second : service_id; + iter = metadata.find("hostname"); + if (iter != metadata.end()) { + description += " on " + iter->second; + } + + return description; +} + +void MirrorDaemonServiceInfo::dump( + const std::string &instance_id, + argument_types::Format::Formatter formatter) const { + formatter->open_object_section("daemon_service"); + BOOST_SCOPE_EXIT(formatter) { + formatter->close_section(); + } BOOST_SCOPE_EXIT_END; + + if (instance_id.empty() || + !m_instance_id_to_service_id.count(instance_id)) { + return; + } + + auto service_id = m_instance_id_to_service_id.find(instance_id)->second; + formatter->dump_string("service_id", service_id); + formatter->dump_string("instance_id", instance_id); + + auto it = m_daemons_metadata.find(service_id); + if (it == m_daemons_metadata.end()) { + return; + } + + auto &metadata = it->second; + auto iter = metadata.find("id"); + if (iter != metadata.end()) { + formatter->dump_string("daemon_id", iter->second); + } + iter = metadata.find("hostname"); + if (iter != metadata.end()) { + formatter->dump_string("hostname", iter->second); + } +} + +} // namespace rbd + diff --git a/src/tools/rbd/MirrorDaemonServiceInfo.h b/src/tools/rbd/MirrorDaemonServiceInfo.h new file mode 100644 index 00000000..7c3c3856 --- /dev/null +++ b/src/tools/rbd/MirrorDaemonServiceInfo.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H +#define CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H + +#include "include/rados/librados_fwd.hpp" +#include "tools/rbd/ArgumentTypes.h" + +#include <string> +#include <map> + +namespace rbd { + +class MirrorDaemonServiceInfo { +public: + MirrorDaemonServiceInfo(librados::IoCtx &io_ctx) : m_io_ctx(io_ctx) { + } + + int init(); + + std::string get_description(const std::string &instance_id) const; + void dump(const std::string &instance_id, + argument_types::Format::Formatter formatter) const; + +private: 
+ librados::IoCtx &m_io_ctx; + std::map<std::string, std::string> m_instance_id_to_service_id; + std::map<std::string, std::map<std::string, std::string>> m_daemons_metadata; +}; + +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H diff --git a/src/tools/rbd/OptionPrinter.cc b/src/tools/rbd/OptionPrinter.cc new file mode 100644 index 00000000..14affb0b --- /dev/null +++ b/src/tools/rbd/OptionPrinter.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/OptionPrinter.h" +#include "tools/rbd/IndentStream.h" + +namespace rbd { + +namespace po = boost::program_options; + +const std::string OptionPrinter::POSITIONAL_ARGUMENTS("Positional arguments"); +const std::string OptionPrinter::OPTIONAL_ARGUMENTS("Optional arguments"); + +const size_t OptionPrinter::MAX_DESCRIPTION_OFFSET; + +OptionPrinter::OptionPrinter(const OptionsDescription &positional, + const OptionsDescription &optional) + : m_positional(positional), m_optional(optional) { +} + +void OptionPrinter::print_short(std::ostream &os, size_t initial_offset) { + size_t name_width = std::min(initial_offset, MAX_DESCRIPTION_OFFSET) + 1; + + IndentStream indent_stream(name_width, initial_offset, LINE_WIDTH, os); + indent_stream.set_delimiter("["); + for (size_t i = 0; i < m_optional.options().size(); ++i) { + bool required = m_optional.options()[i]->semantic()->is_required(); + if (!required) { + indent_stream << "["; + } + indent_stream << "--" << m_optional.options()[i]->long_name(); + if (m_optional.options()[i]->semantic()->max_tokens() != 0) { + indent_stream << " <" << m_optional.options()[i]->long_name() << ">"; + } + if (!required) { + indent_stream << "]"; + } + indent_stream << " "; + } + + if (m_optional.options().size() > 0 || m_positional.options().size() == 0) { + indent_stream << std::endl; + } + + if (m_positional.options().size() > 0) { + indent_stream.set_delimiter(" "); + for (size_t i = 0; 
i < m_positional.options().size(); ++i) { + indent_stream << "<" << m_positional.options()[i]->long_name() << "> "; + if (m_positional.options()[i]->semantic()->max_tokens() > 1) { + indent_stream << "[<" << m_positional.options()[i]->long_name() + << "> ...]"; + break; + } + } + indent_stream << std::endl; + } +} + +void OptionPrinter::print_detailed(std::ostream &os) { + std::string indent_prefix(2, ' '); + size_t name_width = compute_name_width(indent_prefix.size()); + + if (m_positional.options().size() > 0) { + std::cout << POSITIONAL_ARGUMENTS << std::endl; + for (size_t i = 0; i < m_positional.options().size(); ++i) { + std::stringstream ss; + ss << indent_prefix << "<" << m_positional.options()[i]->long_name() + << ">"; + + std::cout << ss.str(); + IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os); + indent_stream << m_positional.options()[i]->description() << std::endl; + } + std::cout << std::endl; + } + + if (m_optional.options().size() > 0) { + std::cout << OPTIONAL_ARGUMENTS << std::endl; + for (size_t i = 0; i < m_optional.options().size(); ++i) { + std::stringstream ss; + ss << indent_prefix + << m_optional.options()[i]->format_name() << " " + << m_optional.options()[i]->format_parameter(); + + std::cout << ss.str(); + IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os); + indent_stream << m_optional.options()[i]->description() << std::endl; + } + std::cout << std::endl; + } +} + +size_t OptionPrinter::compute_name_width(size_t indent) { + size_t width = MIN_NAME_WIDTH; + std::vector<OptionsDescription> descs = {m_positional, m_optional}; + for (size_t desc_idx = 0; desc_idx < descs.size(); ++desc_idx) { + const OptionsDescription &desc = descs[desc_idx]; + for (size_t opt_idx = 0; opt_idx < desc.options().size(); ++opt_idx) { + size_t name_width = desc.options()[opt_idx]->format_name().size() + + desc.options()[opt_idx]->format_parameter().size() + + 1; + width = std::max(width, name_width); + } + } + width 
+= indent; + width = std::min(width, MAX_DESCRIPTION_OFFSET) + 1; + return width; +} + +} // namespace rbd diff --git a/src/tools/rbd/OptionPrinter.h b/src/tools/rbd/OptionPrinter.h new file mode 100644 index 00000000..e18a5f88 --- /dev/null +++ b/src/tools/rbd/OptionPrinter.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_OPTION_PRINTER_H +#define CEPH_RBD_OPTION_PRINTER_H + +#include "include/int_types.h" +#include <string> +#include <vector> +#include <boost/program_options.hpp> + +namespace rbd { + +class OptionPrinter { +public: + typedef boost::program_options::options_description OptionsDescription; + + static const std::string POSITIONAL_ARGUMENTS; + static const std::string OPTIONAL_ARGUMENTS; + + static const size_t LINE_WIDTH = 80; + static const size_t MIN_NAME_WIDTH = 20; + static const size_t MAX_DESCRIPTION_OFFSET = LINE_WIDTH / 2; + + OptionPrinter(const OptionsDescription &positional, + const OptionsDescription &optional); + + void print_short(std::ostream &os, size_t initial_offset); + void print_detailed(std::ostream &os); + +private: + const OptionsDescription &m_positional; + const OptionsDescription &m_optional; + + size_t compute_name_width(size_t indent); +}; + +} // namespace rbd + +#endif // CEPH_RBD_OPTION_PRINTER_H diff --git a/src/tools/rbd/Shell.cc b/src/tools/rbd/Shell.cc new file mode 100644 index 00000000..9993c691 --- /dev/null +++ b/src/tools/rbd/Shell.cc @@ -0,0 +1,432 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/Shell.h" +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/IndentStream.h" +#include "tools/rbd/OptionPrinter.h" +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/stringify.h" +#include <algorithm> +#include <iostream> +#include <set> + 
+namespace rbd { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +static const std::string APP_NAME("rbd"); +static const std::string HELP_SPEC("help"); +static const std::string BASH_COMPLETION_SPEC("bash-completion"); + +boost::intrusive_ptr<CephContext> global_init( + int argc, const char **argv, std::vector<std::string> *command_args, + std::vector<std::string> *global_init_args) { + std::vector<const char*> cmd_args; + argv_to_vec(argc, argv, cmd_args); + std::vector<const char*> args(cmd_args); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_MON_CONFIG); + + *command_args = {args.begin(), args.end()}; + + // Scan command line arguments for ceph global init args (those are + // filtered out from args vector by global_init). + + auto cursor = args.begin(); + for (auto &arg : cmd_args) { + auto iter = cursor; + for (; iter != args.end(); iter++) { + if (*iter == arg) { + break; + } + } + if (iter == args.end()) { + // filtered out by global_init + global_init_args->push_back(arg); + } else { + cursor = ++iter; + } + } + + return cct; +} + +std::string format_command_spec(const Shell::CommandSpec &spec) { + return joinify<std::string>(spec.begin(), spec.end(), " "); +} + +std::string format_alias_spec(const Shell::CommandSpec &spec, + const Shell::CommandSpec &alias_spec) { + auto spec_it = spec.begin(); + auto alias_it = alias_spec.begin(); + int level = 0; + while (spec_it != spec.end() && alias_it != alias_spec.end() && + *spec_it == *alias_it) { + spec_it++; + alias_it++; + level++; + } + ceph_assert(spec_it != spec.end() && alias_it != alias_spec.end()); + + if (level < 2) { + return joinify<std::string>(alias_spec.begin(), alias_spec.end(), " "); + } else { + return "... 
" + joinify<std::string>(alias_it, alias_spec.end(), " "); + } +} + +std::string format_command_name(const Shell::CommandSpec &spec, + const Shell::CommandSpec &alias_spec) { + std::string name = format_command_spec(spec); + if (!alias_spec.empty()) { + name += " (" + format_alias_spec(spec, alias_spec) + ")"; + } + return name; +} + +std::string format_option_suffix( + const boost::shared_ptr<po::option_description> &option) { + std::string suffix; + if (option->semantic()->max_tokens() != 0) { + if (option->description().find("path") != std::string::npos || + option->description().find("file") != std::string::npos) { + suffix += " path"; + } else if (option->description().find("host") != std::string::npos) { + suffix += " host"; + } else { + suffix += " arg"; + } + } + return suffix; +} + +} // anonymous namespace + +std::vector<Shell::Action *>& Shell::get_actions() { + static std::vector<Action *> actions; + + return actions; +} + +std::set<std::string>& Shell::get_switch_arguments() { + static std::set<std::string> switch_arguments; + + return switch_arguments; +} + +int Shell::execute(int argc, const char **argv) { + std::vector<std::string> arguments; + std::vector<std::string> ceph_global_init_args; + auto cct = global_init(argc, argv, &arguments, &ceph_global_init_args); + + std::vector<std::string> command_spec; + get_command_spec(arguments, &command_spec); + bool is_alias = true; + + if (command_spec.empty() || command_spec == CommandSpec({"help"})) { + // list all available actions + print_help(); + return 0; + } else if (command_spec[0] == HELP_SPEC) { + // list help for specific action + command_spec.erase(command_spec.begin()); + Action *action = find_action(command_spec, NULL, &is_alias); + if (action == NULL) { + print_unknown_action(command_spec); + return EXIT_FAILURE; + } else { + print_action_help(action, is_alias); + return 0; + } + } else if (command_spec[0] == BASH_COMPLETION_SPEC) { + command_spec.erase(command_spec.begin()); + 
print_bash_completion(command_spec); + return 0; + } + + CommandSpec *matching_spec; + Action *action = find_action(command_spec, &matching_spec, &is_alias); + if (action == NULL) { + print_unknown_action(command_spec); + return EXIT_FAILURE; + } + + po::variables_map vm; + try { + po::options_description positional_opts; + po::options_description command_opts; + (*action->get_arguments)(&positional_opts, &command_opts); + + // dynamically allocate options for our command (e.g. snap list) and + // its associated positional arguments + po::options_description argument_opts; + argument_opts.add_options() + (at::POSITIONAL_COMMAND_SPEC.c_str(), + po::value<std::vector<std::string> >()->required(), "") + (at::POSITIONAL_ARGUMENTS.c_str(), + po::value<std::vector<std::string> >(), ""); + + po::positional_options_description positional_options; + positional_options.add(at::POSITIONAL_COMMAND_SPEC.c_str(), + matching_spec->size()); + if (!positional_opts.options().empty()) { + int max_count = positional_opts.options().size(); + if (positional_opts.options().back()->semantic()->max_tokens() > 1) + max_count = -1; + positional_options.add(at::POSITIONAL_ARGUMENTS.c_str(), max_count); + } + + po::options_description group_opts; + group_opts.add(command_opts) + .add(argument_opts); + + po::store(po::command_line_parser(arguments) + .style(po::command_line_style::default_style & + ~po::command_line_style::allow_guessing) + .options(group_opts) + .positional(positional_options) + .run(), vm); + + if (vm[at::POSITIONAL_COMMAND_SPEC].as<std::vector<std::string> >() != + *matching_spec) { + std::cerr << "rbd: failed to parse command" << std::endl; + return EXIT_FAILURE; + } + + int r = (*action->execute)(vm, ceph_global_init_args); + if (r != 0) { + return std::abs(r); + } + } catch (po::required_option& e) { + std::cerr << "rbd: " << e.what() << std::endl; + return EXIT_FAILURE; + } catch (po::too_many_positional_options_error& e) { + std::cerr << "rbd: too many arguments" << 
std::endl; + return EXIT_FAILURE; + } catch (po::error& e) { + std::cerr << "rbd: " << e.what() << std::endl; + return EXIT_FAILURE; + } + + return 0; +} + +void Shell::get_command_spec(const std::vector<std::string> &arguments, + std::vector<std::string> *command_spec) { + for (size_t i = 0; i < arguments.size(); ++i) { + std::string arg(arguments[i]); + if (arg == "-h" || arg == "--help") { + *command_spec = {HELP_SPEC}; + return; + } else if (arg == "--") { + // all arguments after a double-dash are positional + if (i + 1 < arguments.size()) { + command_spec->insert(command_spec->end(), + arguments.data() + i + 1, + arguments.data() + arguments.size()); + } + return; + } else if (arg[0] == '-') { + // if the option is not a switch, skip its value + if (arg.size() >= 2 && + (arg[1] == '-' || + get_switch_arguments().count(arg.substr(1, 1)) == 0) && + (arg[1] != '-' || + get_switch_arguments().count(arg.substr(2, std::string::npos)) == 0) && + at::SWITCH_ARGUMENTS.count(arg.substr(2, std::string::npos)) == 0 && + arg.find('=') == std::string::npos) { + ++i; + } + } else { + command_spec->push_back(arg); + } + } +} + +Shell::Action *Shell::find_action(const CommandSpec &command_spec, + CommandSpec **matching_spec, bool *is_alias) { + for (size_t i = 0; i < get_actions().size(); ++i) { + Action *action = get_actions()[i]; + if (action->command_spec.size() <= command_spec.size()) { + if (std::includes(action->command_spec.begin(), + action->command_spec.end(), + command_spec.begin(), + command_spec.begin() + action->command_spec.size())) { + if (matching_spec != NULL) { + *matching_spec = &action->command_spec; + } + *is_alias = false; + return action; + } + } + if (!action->alias_command_spec.empty() && + action->alias_command_spec.size() <= command_spec.size()) { + if (std::includes(action->alias_command_spec.begin(), + action->alias_command_spec.end(), + command_spec.begin(), + command_spec.begin() + + action->alias_command_spec.size())) { + if (matching_spec != 
NULL) { + *matching_spec = &action->alias_command_spec; + } + *is_alias = true; + return action; + } + } + } + return NULL; +} + +void Shell::get_global_options(po::options_description *opts) { + opts->add_options() + ((at::CONFIG_PATH + ",c").c_str(), po::value<std::string>(), "path to cluster configuration") + ("cluster", po::value<std::string>(), "cluster name") + ("id", po::value<std::string>(), "client id (without 'client.' prefix)") + ("user", po::value<std::string>(), "client id (without 'client.' prefix)") + ("name,n", po::value<std::string>(), "client name") + ("mon_host,m", po::value<std::string>(), "monitor host") + ("secret", po::value<at::Secret>(), "path to secret key (deprecated)") + ("keyfile,K", po::value<std::string>(), "path to secret key") + ("keyring,k", po::value<std::string>(), "path to keyring"); +} + +void Shell::print_help() { + std::cout << "usage: " << APP_NAME << " <command> ..." + << std::endl << std::endl + << "Command-line interface for managing Ceph RBD images." 
+ << std::endl << std::endl; + + std::vector<Action *> actions(get_actions()); + std::sort(actions.begin(), actions.end(), + [](Action *lhs, Action *rhs) { return lhs->command_spec < + rhs->command_spec; }); + + std::cout << OptionPrinter::POSITIONAL_ARGUMENTS << ":" << std::endl + << " <command>" << std::endl; + + // since the commands have spaces, we have to build our own formatter + std::string indent(4, ' '); + size_t name_width = OptionPrinter::MIN_NAME_WIDTH; + for (size_t i = 0; i < actions.size(); ++i) { + Action *action = actions[i]; + std::string name = format_command_name(action->command_spec, + action->alias_command_spec); + name_width = std::max(name_width, name.size()); + } + name_width += indent.size(); + name_width = std::min(name_width, OptionPrinter::MAX_DESCRIPTION_OFFSET) + 1; + + for (size_t i = 0; i < actions.size(); ++i) { + Action *action = actions[i]; + if (!action->visible) + continue; + std::stringstream ss; + ss << indent + << format_command_name(action->command_spec, action->alias_command_spec); + + std::cout << ss.str(); + if (!action->description.empty()) { + IndentStream indent_stream(name_width, ss.str().size(), + OptionPrinter::LINE_WIDTH, + std::cout); + indent_stream << action->description << std::endl; + } else { + std::cout << std::endl; + } + } + + po::options_description global_opts(OptionPrinter::OPTIONAL_ARGUMENTS); + get_global_options(&global_opts); + std::cout << std::endl << global_opts << std::endl + << "See '" << APP_NAME << " help <command>' for help on a specific " + << "command." << std::endl; + } + +void Shell::print_action_help(Action *action, bool is_alias) { + std::stringstream ss; + ss << "usage: " << APP_NAME << " " + << format_command_spec(is_alias ? 
action->alias_command_spec : action->command_spec); + std::cout << ss.str(); + + po::options_description positional; + po::options_description options; + (*action->get_arguments)(&positional, &options); + + OptionPrinter option_printer(positional, options); + option_printer.print_short(std::cout, ss.str().size()); + + if (!action->description.empty()) { + std::cout << std::endl << action->description << std::endl; + } + + std::cout << std::endl; + option_printer.print_detailed(std::cout); + + if (!action->help.empty()) { + std::cout << action->help << std::endl; + } +} + +void Shell::print_unknown_action(const std::vector<std::string> &command_spec) { + std::cerr << "error: unknown option '" + << joinify<std::string>(command_spec.begin(), + command_spec.end(), " ") << "'" + << std::endl << std::endl; + print_help(); +} + +void Shell::print_bash_completion(const CommandSpec &command_spec) { + + bool is_alias = true; + + Action *action = find_action(command_spec, NULL, &is_alias); + po::options_description global_opts; + get_global_options(&global_opts); + print_bash_completion_options(global_opts); + + if (action != nullptr) { + po::options_description positional_opts; + po::options_description command_opts; + (*action->get_arguments)(&positional_opts, &command_opts); + print_bash_completion_options(command_opts); + } else { + std::cout << "|help"; + for (size_t i = 0; i < get_actions().size(); ++i) { + Action *action = get_actions()[i]; + std::cout << "|" + << joinify<std::string>(action->command_spec.begin(), + action->command_spec.end(), " "); + if (!action->alias_command_spec.empty()) { + std::cout << "|" + << joinify<std::string>(action->alias_command_spec.begin(), + action->alias_command_spec.end(), + " "); + } + } + } + std::cout << "|" << std::endl; +} + +void Shell::print_bash_completion_options(const po::options_description &ops) { + for (size_t i = 0; i < ops.options().size(); ++i) { + auto option = ops.options()[i]; + std::string 
long_name(option->canonical_display_name(0)); + std::string short_name(option->canonical_display_name( + po::command_line_style::allow_dash_for_short)); + + std::cout << "|--" << long_name << format_option_suffix(option); + if (long_name != short_name) { + std::cout << "|" << short_name << format_option_suffix(option); + } + } +} + +} // namespace rbd diff --git a/src/tools/rbd/Shell.h b/src/tools/rbd/Shell.h new file mode 100644 index 00000000..fe3dee46 --- /dev/null +++ b/src/tools/rbd/Shell.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_SHELL_H +#define CEPH_RBD_SHELL_H + +#include "include/int_types.h" +#include <set> +#include <string> +#include <vector> +#include <boost/program_options.hpp> + +namespace rbd { + +class Shell { +public: + typedef std::vector<std::string> CommandSpec; + + struct Action { + typedef void (*GetArguments)(boost::program_options::options_description *, + boost::program_options::options_description *); + typedef int (*Execute)(const boost::program_options::variables_map &, + const std::vector<std::string> &); + + CommandSpec command_spec; + CommandSpec alias_command_spec; + const std::string description; + const std::string help; + GetArguments get_arguments; + Execute execute; + bool visible; + + template <typename Args, typename Execute> + Action(const std::initializer_list<std::string> &command_spec, + const std::initializer_list<std::string> &alias_command_spec, + const std::string &description, const std::string &help, + Args args, Execute execute, bool visible = true) + : command_spec(command_spec), alias_command_spec(alias_command_spec), + description(description), help(help), get_arguments(args), + execute(execute), visible(visible) { + Shell::get_actions().push_back(this); + } + + }; + + struct SwitchArguments { + SwitchArguments(const std::initializer_list<std::string> &arguments) { + 
Shell::get_switch_arguments().insert(arguments.begin(), arguments.end()); + } + }; + + int execute(int argc, const char **argv); + +private: + static std::vector<Action *>& get_actions(); + static std::set<std::string>& get_switch_arguments(); + + void get_command_spec(const std::vector<std::string> &arguments, + std::vector<std::string> *command_spec); + Action *find_action(const CommandSpec &command_spec, + CommandSpec **matching_spec, bool *is_alias); + + void get_global_options(boost::program_options::options_description *opts); + + void print_help(); + void print_action_help(Action *action, bool is_alias); + void print_unknown_action(const CommandSpec &command_spec); + + void print_bash_completion(const CommandSpec &command_spec); + void print_bash_completion_options( + const boost::program_options::options_description &ops); +}; + +} // namespace rbd + +#endif // CEPH_RBD_SHELL_H diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc new file mode 100644 index 00000000..d4f50022 --- /dev/null +++ b/src/tools/rbd/Utils.cc @@ -0,0 +1,907 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/Utils.h" +#include "include/ceph_assert.h" +#include "include/Context.h" +#include "include/encoding.h" +#include "common/common_init.h" +#include "include/stringify.h" +#include "include/rbd/features.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "global/global_context.h" +#include <iostream> +#include <regex> +#include <boost/algorithm/string.hpp> +#include <boost/lexical_cast.hpp> + +namespace rbd { +namespace utils { + +namespace at = argument_types; +namespace po = boost::program_options; + +int ProgressContext::update_progress(uint64_t offset, uint64_t total) { + if (progress) { + int pc = total ? 
(offset * 100ull / total) : 0; + if (pc != last_pc) { + cerr << "\r" << operation << ": " + << pc << "% complete..."; + cerr.flush(); + last_pc = pc; + } + } + return 0; +} + +void ProgressContext::finish() { + if (progress) { + cerr << "\r" << operation << ": 100% complete...done." << std::endl; + } +} + +void ProgressContext::fail() { + if (progress) { + cerr << "\r" << operation << ": " << last_pc << "% complete...failed." + << std::endl; + } +} + +void aio_context_callback(librbd::completion_t completion, void *arg) +{ + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast<librbd::RBD::AioCompletion*>(completion); + Context *context = reinterpret_cast<Context *>(arg); + context->complete(aio_completion->get_return_value()); + aio_completion->release(); +} + +int read_string(int fd, unsigned max, std::string *out) { + char buf[4]; + + int r = safe_read_exact(fd, buf, 4); + if (r < 0) + return r; + + bufferlist bl; + bl.append(buf, 4); + auto p = bl.cbegin(); + uint32_t len; + decode(len, p); + if (len > max) + return -EINVAL; + + char sbuf[len]; + r = safe_read_exact(fd, sbuf, len); + if (r < 0) + return r; + out->assign(sbuf, len); + return len; +} + +int extract_spec(const std::string &spec, std::string *pool_name, + std::string *namespace_name, std::string *name, + std::string *snap_name, SpecValidation spec_validation) { + if (!g_ceph_context->_conf.get_val<bool>("rbd_validate_names")) { + spec_validation = SPEC_VALIDATION_NONE; + } + + std::regex pattern; + switch (spec_validation) { + case SPEC_VALIDATION_FULL: + // disallow "/" and "@" in all names + pattern = "^(?:([^/@]+)/(?:([^/@]+)/)?)?([^/@]+)(?:@([^/@]+))?$"; + break; + case SPEC_VALIDATION_SNAP: + // disallow "/" and "@" in snap name + pattern = "^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$"; + break; + case SPEC_VALIDATION_NONE: + // relaxed pattern assumes pool is before first "/", + // namespace is before second "/", and snap name is after first "@" + pattern = 
"^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@(.+))?$"; + break; + default: + ceph_abort(); + break; + } + + std::smatch match; + if (!std::regex_match(spec, match, pattern)) { + std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl; + return -EINVAL; + } + + if (match[1].matched) { + if (pool_name != nullptr) { + *pool_name = match[1]; + } else { + std::cerr << "rbd: pool name specified for a command that doesn't use it" + << std::endl; + return -EINVAL; + } + } + + if (match[2].matched) { + if (namespace_name != nullptr) { + *namespace_name = match[2]; + } else { + std::cerr << "rbd: namespace name specified for a command that doesn't " + << "use it" << std::endl; + return -EINVAL; + } + } + + if (name != nullptr) { + *name = match[3]; + } + + if (match[4].matched) { + if (snap_name != nullptr) { + *snap_name = match[4]; + } else { + std::cerr << "rbd: snapshot name specified for a command that doesn't " + << "use it" << std::endl; + return -EINVAL; + } + } + return 0; +} + +std::string get_positional_argument(const po::variables_map &vm, size_t index) { + if (vm.count(at::POSITIONAL_ARGUMENTS) == 0) { + return ""; + } + + const std::vector<std::string> &args = + boost::any_cast<std::vector<std::string> >( + vm[at::POSITIONAL_ARGUMENTS].value()); + if (index < args.size()) { + return args[index]; + } + return ""; +} + +std::string get_default_pool_name() { + return g_ceph_context->_conf.get_val<std::string>("rbd_default_pool"); +} + +int get_pool_and_namespace_names( + const boost::program_options::variables_map &vm, + bool default_empty_pool_name, bool validate_pool_name, + std::string* pool_name, std::string* namespace_name, size_t *arg_index) { + if (namespace_name != nullptr && vm.count(at::NAMESPACE_NAME)) { + *namespace_name = vm[at::NAMESPACE_NAME].as<std::string>(); + } + + if (vm.count(at::POOL_NAME)) { + *pool_name = vm[at::POOL_NAME].as<std::string>(); + } else { + *pool_name = get_positional_argument(vm, *arg_index); + if (!pool_name->empty()) { + 
if (namespace_name != nullptr) { + auto slash_pos = pool_name->find_last_of('/'); + if (slash_pos != std::string::npos) { + *namespace_name = pool_name->substr(slash_pos + 1); + } + *pool_name = pool_name->substr(0, slash_pos); + } + ++(*arg_index); + } + } + + if (default_empty_pool_name && pool_name->empty()) { + *pool_name = get_default_pool_name(); + } + + if (!g_ceph_context->_conf.get_val<bool>("rbd_validate_names")) { + validate_pool_name = false; + } + + if (validate_pool_name && + pool_name->find_first_of("/@") != std::string::npos) { + std::cerr << "rbd: invalid pool '" << *pool_name << "'" << std::endl; + return -EINVAL; + } else if (namespace_name != nullptr && + namespace_name->find_first_of("/@") != std::string::npos) { + std::cerr << "rbd: invalid namespace '" << *namespace_name << "'" + << std::endl; + return -EINVAL; + } + + return 0; +} + +int get_pool_image_id(const po::variables_map &vm, + size_t *spec_arg_index, + std::string *pool_name, + std::string *namespace_name, + std::string *image_id) { + + if (vm.count(at::POOL_NAME) && pool_name != nullptr) { + *pool_name = vm[at::POOL_NAME].as<std::string>(); + } + if (vm.count(at::NAMESPACE_NAME) && namespace_name != nullptr) { + *namespace_name = vm[at::NAMESPACE_NAME].as<std::string>(); + } + if (vm.count(at::IMAGE_ID) && image_id != nullptr) { + *image_id = vm[at::IMAGE_ID].as<std::string>(); + } + + int r; + if (image_id != nullptr && spec_arg_index != nullptr && image_id->empty()) { + std::string spec = get_positional_argument(vm, (*spec_arg_index)++); + if (!spec.empty()) { + r = extract_spec(spec, pool_name, namespace_name, image_id, nullptr, + SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + } + } + + if (pool_name != nullptr && pool_name->empty()) { + *pool_name = get_default_pool_name(); + } + + if (image_id != nullptr && image_id->empty()) { + std::cerr << "rbd: image id was not specified" << std::endl; + return -EINVAL; + } + + return 0; +} + +int 
get_pool_image_snapshot_names(const po::variables_map &vm, + at::ArgumentModifier mod, + size_t *spec_arg_index, + std::string *pool_name, + std::string *namespace_name, + std::string *image_name, + std::string *snap_name, + bool image_name_required, + SnapshotPresence snapshot_presence, + SpecValidation spec_validation) { + std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ? + at::DEST_POOL_NAME : at::POOL_NAME); + std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ? + at::DEST_IMAGE_NAME : at::IMAGE_NAME); + return get_pool_generic_snapshot_names(vm, mod, spec_arg_index, pool_key, + pool_name, namespace_name, image_key, + "image", image_name, snap_name, + image_name_required, snapshot_presence, + spec_validation); +} + +int get_pool_generic_snapshot_names(const po::variables_map &vm, + at::ArgumentModifier mod, + size_t *spec_arg_index, + const std::string& pool_key, + std::string *pool_name, + std::string *namespace_name, + const std::string& generic_key, + const std::string& generic_key_desc, + std::string *generic_name, + std::string *snap_name, + bool generic_name_required, + SnapshotPresence snapshot_presence, + SpecValidation spec_validation) { + std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ? + at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME); + std::string snap_key = (mod == at::ARGUMENT_MODIFIER_DEST ? 
+ at::DEST_SNAPSHOT_NAME : at::SNAPSHOT_NAME); + + if (vm.count(pool_key) && pool_name != nullptr) { + *pool_name = vm[pool_key].as<std::string>(); + } + if (vm.count(namespace_key) && namespace_name != nullptr) { + *namespace_name = vm[namespace_key].as<std::string>(); + } + if (vm.count(generic_key) && generic_name != nullptr) { + *generic_name = vm[generic_key].as<std::string>(); + } + if (vm.count(snap_key) && snap_name != nullptr) { + *snap_name = vm[snap_key].as<std::string>(); + } + + int r; + if ((generic_key == at::IMAGE_NAME || generic_key == at::DEST_IMAGE_NAME) && + generic_name != nullptr && !generic_name->empty()) { + // despite the separate pool and snapshot name options, + // we can also specify them via the image option + std::string image_name_copy(*generic_name); + r = extract_spec(image_name_copy, pool_name, namespace_name, generic_name, + snap_name, spec_validation); + if (r < 0) { + return r; + } + } + + if (generic_name != nullptr && spec_arg_index != nullptr && + generic_name->empty()) { + std::string spec = get_positional_argument(vm, (*spec_arg_index)++); + if (!spec.empty()) { + r = extract_spec(spec, pool_name, namespace_name, generic_name, snap_name, + spec_validation); + if (r < 0) { + return r; + } + } + } + + if (pool_name != nullptr && pool_name->empty()) { + *pool_name = get_default_pool_name(); + } + + if (generic_name != nullptr && generic_name_required && + generic_name->empty()) { + std::string prefix = at::get_description_prefix(mod); + std::cerr << "rbd: " + << (mod == at::ARGUMENT_MODIFIER_DEST ? 
prefix : std::string()) + << generic_key_desc << " name was not specified" << std::endl; + return -EINVAL; + } + + std::regex pattern("^[^@/]+?$"); + if (spec_validation == SPEC_VALIDATION_FULL) { + // validate pool name while creating/renaming/copying/cloning/importing/etc + if ((pool_name != nullptr) && !std::regex_match (*pool_name, pattern)) { + std::cerr << "rbd: invalid pool name '" << *pool_name << "'" << std::endl; + return -EINVAL; + } + } + + if (namespace_name != nullptr && !namespace_name->empty() && + !std::regex_match (*namespace_name, pattern)) { + std::cerr << "rbd: invalid namespace name '" << *namespace_name << "'" + << std::endl; + return -EINVAL; + } + + if (snap_name != nullptr) { + r = validate_snapshot_name(mod, *snap_name, snapshot_presence, + spec_validation); + if (r < 0) { + return r; + } + } + return 0; +} + +int validate_snapshot_name(at::ArgumentModifier mod, + const std::string &snap_name, + SnapshotPresence snapshot_presence, + SpecValidation spec_validation) { + std::string prefix = at::get_description_prefix(mod); + switch (snapshot_presence) { + case SNAPSHOT_PRESENCE_PERMITTED: + break; + case SNAPSHOT_PRESENCE_NONE: + if (!snap_name.empty()) { + std::cerr << "rbd: " + << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string()) + << "snapshot name specified for a command that doesn't use it" + << std::endl; + return -EINVAL; + } + break; + case SNAPSHOT_PRESENCE_REQUIRED: + if (snap_name.empty()) { + std::cerr << "rbd: " + << (mod == at::ARGUMENT_MODIFIER_DEST ? 
prefix : std::string()) + << "snapshot name was not specified" << std::endl; + return -EINVAL; + } + break; + } + + if (spec_validation == SPEC_VALIDATION_SNAP) { + // disallow "/" and "@" in snap name + std::regex pattern("^[^@/]*?$"); + if (!std::regex_match (snap_name, pattern)) { + std::cerr << "rbd: invalid snap name '" << snap_name << "'" << std::endl; + return -EINVAL; + } + } + return 0; +} + +int get_image_options(const boost::program_options::variables_map &vm, + bool get_format, librbd::ImageOptions *opts) { + uint64_t order = 0, stripe_unit = 0, stripe_count = 0, object_size = 0; + uint64_t features = 0, features_clear = 0; + std::string data_pool; + bool order_specified = true; + bool features_specified = false; + bool features_clear_specified = false; + bool stripe_specified = false; + + if (vm.count(at::IMAGE_ORDER)) { + order = vm[at::IMAGE_ORDER].as<uint64_t>(); + std::cerr << "rbd: --order is deprecated, use --object-size" + << std::endl; + } else if (vm.count(at::IMAGE_OBJECT_SIZE)) { + object_size = vm[at::IMAGE_OBJECT_SIZE].as<uint64_t>(); + order = std::round(std::log2(object_size)); + } else { + order_specified = false; + } + + if (vm.count(at::IMAGE_FEATURES)) { + features = vm[at::IMAGE_FEATURES].as<uint64_t>(); + features_specified = true; + } else { + features = get_rbd_default_features(g_ceph_context); + } + + if (vm.count(at::IMAGE_STRIPE_UNIT)) { + stripe_unit = vm[at::IMAGE_STRIPE_UNIT].as<uint64_t>(); + stripe_specified = true; + } + + if (vm.count(at::IMAGE_STRIPE_COUNT)) { + stripe_count = vm[at::IMAGE_STRIPE_COUNT].as<uint64_t>(); + stripe_specified = true; + } + + if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) { + if (features_specified) { + features &= ~RBD_FEATURES_SINGLE_CLIENT; + } else { + features_clear |= RBD_FEATURES_SINGLE_CLIENT; + features_clear_specified = true; + } + } + + if (vm.count(at::IMAGE_DATA_POOL)) { + data_pool = vm[at::IMAGE_DATA_POOL].as<std::string>(); + } + + if (get_format) { + 
uint64_t format = 0; + bool format_specified = false; + if (vm.count(at::IMAGE_NEW_FORMAT)) { + format = 2; + format_specified = true; + } else if (vm.count(at::IMAGE_FORMAT)) { + format = vm[at::IMAGE_FORMAT].as<uint32_t>(); + format_specified = true; + } + if (format == 1) { + std::cerr << "rbd: image format 1 is deprecated" << std::endl; + } + + if (features_specified && features != 0) { + if (format_specified && format == 1) { + std::cerr << "rbd: features not allowed with format 1; " + << "use --image-format 2" << std::endl; + return -EINVAL; + } else { + format = 2; + format_specified = true; + } + } + + if ((stripe_unit || stripe_count) && + (stripe_unit != (1ull << order) && stripe_count != 1)) { + if (format_specified && format == 1) { + std::cerr << "rbd: non-default striping not allowed with format 1; " + << "use --image-format 2" << std::endl; + return -EINVAL; + } else { + format = 2; + format_specified = true; + } + } + + if (!data_pool.empty()) { + if (format_specified && format == 1) { + std::cerr << "rbd: data pool not allowed with format 1; " + << "use --image-format 2" << std::endl; + return -EINVAL; + } else { + format = 2; + format_specified = true; + } + } + + if (format_specified) { + int r = g_conf().set_val("rbd_default_format", stringify(format)); + ceph_assert(r == 0); + opts->set(RBD_IMAGE_OPTION_FORMAT, format); + } + } + + if (order_specified) + opts->set(RBD_IMAGE_OPTION_ORDER, order); + if (features_specified) + opts->set(RBD_IMAGE_OPTION_FEATURES, features); + if (features_clear_specified) { + opts->set(RBD_IMAGE_OPTION_FEATURES_CLEAR, features_clear); + } + if (stripe_specified) { + opts->set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + opts->set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + if (!data_pool.empty()) { + opts->set(RBD_IMAGE_OPTION_DATA_POOL, data_pool); + } + int r = get_journal_options(vm, opts); + if (r < 0) { + return r; + } + + r = get_flatten_option(vm, opts); + if (r < 0) { + return r; + } + + return 0; 
+} + +int get_journal_options(const boost::program_options::variables_map &vm, + librbd::ImageOptions *opts) { + + if (vm.count(at::JOURNAL_OBJECT_SIZE)) { + uint64_t size = vm[at::JOURNAL_OBJECT_SIZE].as<uint64_t>(); + uint64_t order = 12; + while ((1ULL << order) < size) { + order++; + } + opts->set(RBD_IMAGE_OPTION_JOURNAL_ORDER, order); + + int r = g_conf().set_val("rbd_journal_order", stringify(order)); + ceph_assert(r == 0); + } + if (vm.count(at::JOURNAL_SPLAY_WIDTH)) { + opts->set(RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, + vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>()); + + int r = g_conf().set_val("rbd_journal_splay_width", + stringify( + vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>())); + ceph_assert(r == 0); + } + if (vm.count(at::JOURNAL_POOL)) { + opts->set(RBD_IMAGE_OPTION_JOURNAL_POOL, + vm[at::JOURNAL_POOL].as<std::string>()); + + int r = g_conf().set_val("rbd_journal_pool", + vm[at::JOURNAL_POOL].as<std::string>()); + ceph_assert(r == 0); + } + + return 0; +} + +int get_flatten_option(const boost::program_options::variables_map &vm, + librbd::ImageOptions *opts) { + if (vm.count(at::IMAGE_FLATTEN) && vm[at::IMAGE_FLATTEN].as<bool>()) { + uint64_t flatten = 1; + opts->set(RBD_IMAGE_OPTION_FLATTEN, flatten); + } + return 0; +} + +int get_image_size(const boost::program_options::variables_map &vm, + uint64_t *size) { + if (vm.count(at::IMAGE_SIZE) == 0) { + std::cerr << "rbd: must specify --size <M/G/T>" << std::endl; + return -EINVAL; + } + + *size = vm[at::IMAGE_SIZE].as<uint64_t>(); + return 0; +} + +int get_path(const boost::program_options::variables_map &vm, + size_t *arg_index, std::string *path) { + if (vm.count(at::PATH)) { + *path = vm[at::PATH].as<std::string>(); + } else { + *path = get_positional_argument(vm, *arg_index); + if (!path->empty()) { + ++(*arg_index); + } + } + + if (path->empty()) { + std::cerr << "rbd: path was not specified" << std::endl; + return -EINVAL; + } + return 0; +} + +int get_formatter(const po::variables_map &vm, + 
at::Format::Formatter *formatter) { + if (vm.count(at::FORMAT)) { + bool pretty = vm[at::PRETTY_FORMAT].as<bool>(); + *formatter = vm[at::FORMAT].as<at::Format>().create_formatter(pretty); + if (*formatter == nullptr && pretty) { + std::cerr << "rbd: --pretty-format only works when --format " + << "is json or xml" << std::endl; + return -EINVAL; + } else if (*formatter != nullptr && !pretty) { + formatter->get()->enable_line_break(); + } + } else if (vm[at::PRETTY_FORMAT].as<bool>()) { + std::cerr << "rbd: --pretty-format only works when --format " + << "is json or xml" << std::endl; + return -EINVAL; + } + return 0; +} + +void init_context() { + g_conf().set_val_or_die("rbd_cache_writethrough_until_flush", "false"); + g_conf().apply_changes(nullptr); + common_init_finish(g_ceph_context); +} + +int init_rados(librados::Rados *rados) { + init_context(); + + int r = rados->init_with_context(g_ceph_context); + if (r < 0) { + std::cerr << "rbd: couldn't initialize rados!" << std::endl; + return r; + } + + r = rados->connect(); + if (r < 0) { + std::cerr << "rbd: couldn't connect to the cluster!" << std::endl; + return r; + } + + return 0; +} + +int init(const std::string &pool_name, const std::string& namespace_name, + librados::Rados *rados, librados::IoCtx *io_ctx) { + init_context(); + + int r = init_rados(rados); + if (r < 0) { + return r; + } + + r = init_io_ctx(*rados, pool_name, namespace_name, io_ctx); + if (r < 0) { + return r; + } + return 0; +} + +int init_io_ctx(librados::Rados &rados, const std::string &pool_name, + const std::string& namespace_name, librados::IoCtx *io_ctx) { + int r = rados.ioctx_create(pool_name.c_str(), *io_ctx); + if (r < 0) { + if (r == -ENOENT && pool_name == get_default_pool_name()) { + std::cerr << "rbd: error opening default pool " + << "'" << pool_name << "'" << std::endl + << "Ensure that the default pool has been created or specify " + << "an alternate pool name." 
<< std::endl; + } else { + std::cerr << "rbd: error opening pool '" << pool_name << "': " + << cpp_strerror(r) << std::endl; + } + return r; + } + + return set_namespace(namespace_name, io_ctx); +} + +int set_namespace(const std::string& namespace_name, librados::IoCtx *io_ctx) { + if (!namespace_name.empty()) { + librbd::RBD rbd; + bool exists = false; + int r = rbd.namespace_exists(*io_ctx, namespace_name.c_str(), &exists); + if (r < 0) { + std::cerr << "rbd: error asserting namespace: " + << cpp_strerror(r) << std::endl; + return r; + } + if (!exists) { + std::cerr << "rbd: namespace '" << namespace_name << "' does not exist." + << std::endl; + return -ENOENT; + } + } + io_ctx->set_namespace(namespace_name); + return 0; +} + +void disable_cache() { + g_conf().set_val_or_die("rbd_cache", "false"); +} + +int open_image(librados::IoCtx &io_ctx, const std::string &image_name, + bool read_only, librbd::Image *image) { + int r; + librbd::RBD rbd; + if (read_only) { + r = rbd.open_read_only(io_ctx, *image, image_name.c_str(), NULL); + } else { + r = rbd.open(io_ctx, *image, image_name.c_str()); + } + + if (r < 0) { + std::cerr << "rbd: error opening image " << image_name << ": " + << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int open_image_by_id(librados::IoCtx &io_ctx, const std::string &image_id, + bool read_only, librbd::Image *image) { + int r; + librbd::RBD rbd; + if (read_only) { + r = rbd.open_by_id_read_only(io_ctx, *image, image_id.c_str(), NULL); + } else { + r = rbd.open_by_id(io_ctx, *image, image_id.c_str()); + } + + if (r < 0) { + std::cerr << "rbd: error opening image with id " << image_id << ": " + << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int init_and_open_image(const std::string &pool_name, + const std::string &namespace_name, + const std::string &image_name, + const std::string &image_id, + const std::string &snap_name, bool read_only, + librados::Rados *rados, librados::IoCtx *io_ctx, + librbd::Image 
*image) { + int r = init(pool_name, namespace_name, rados, io_ctx); + if (r < 0) { + return r; + } + + if (image_id.empty()) { + r = open_image(*io_ctx, image_name, read_only, image); + } else { + r = open_image_by_id(*io_ctx, image_id, read_only, image); + } + if (r < 0) { + return r; + } + + if (!snap_name.empty()) { + r = snap_set(*image, snap_name); + if (r < 0) { + return r; + } + } + return 0; +} + +int snap_set(librbd::Image &image, const std::string &snap_name) { + int r = image.snap_set(snap_name.c_str()); + if (r < 0) { + std::cerr << "error setting snapshot context: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void calc_sparse_extent(const bufferptr &bp, + size_t sparse_size, + size_t buffer_offset, + uint64_t buffer_length, + size_t *write_length, + bool *zeroed) { + if (sparse_size == 0) { + // sparse writes are disabled -- write the full extent + ceph_assert(buffer_offset == 0); + *write_length = buffer_length; + *zeroed = false; + return; + } + + *write_length = 0; + size_t original_offset = buffer_offset; + while (buffer_offset < buffer_length) { + size_t extent_size = std::min<size_t>( + sparse_size, buffer_length - buffer_offset); + + bufferptr extent(bp, buffer_offset, extent_size); + + bool extent_is_zero = extent.is_zero(); + if (original_offset == buffer_offset) { + *zeroed = extent_is_zero; + } else if (*zeroed != extent_is_zero) { + ceph_assert(*write_length > 0); + return; + } + + buffer_offset += extent_size; + *write_length += extent_size; + } +} + +std::string image_id(librbd::Image& image) { + std::string id; + int r = image.get_id(&id); + if (r < 0) { + return std::string(); + } + return id; +} + +std::string mirror_image_state(librbd::mirror_image_state_t state) { + switch (state) { + case RBD_MIRROR_IMAGE_DISABLING: + return "disabling"; + case RBD_MIRROR_IMAGE_ENABLED: + return "enabled"; + case RBD_MIRROR_IMAGE_DISABLED: + return "disabled"; + default: + return "unknown"; + } +} + +std::string 
mirror_image_status_state(librbd::mirror_image_status_state_t state) { + switch (state) { + case MIRROR_IMAGE_STATUS_STATE_UNKNOWN: + return "unknown"; + case MIRROR_IMAGE_STATUS_STATE_ERROR: + return "error"; + case MIRROR_IMAGE_STATUS_STATE_SYNCING: + return "syncing"; + case MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY: + return "starting_replay"; + case MIRROR_IMAGE_STATUS_STATE_REPLAYING: + return "replaying"; + case MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY: + return "stopping_replay"; + case MIRROR_IMAGE_STATUS_STATE_STOPPED: + return "stopped"; + default: + return "unknown (" + stringify(static_cast<uint32_t>(state)) + ")"; + } +} + +std::string mirror_image_status_state(librbd::mirror_image_status_t status) { + return (status.up ? "up+" : "down+") + + mirror_image_status_state(status.state); +} + +std::string timestr(time_t t) { + struct tm tm; + + localtime_r(&t, &tm); + + char buf[32]; + strftime(buf, sizeof(buf), "%F %T", &tm); + + return buf; +} + +uint64_t get_rbd_default_features(CephContext* cct) { + auto features = cct->_conf.get_val<std::string>("rbd_default_features"); + return boost::lexical_cast<uint64_t>(features); +} + +bool is_not_user_snap_namespace(librbd::Image* image, + const librbd::snap_info_t &snap_info) +{ + librbd::snap_namespace_type_t namespace_type; + int r = image->snap_get_namespace_type(snap_info.id, &namespace_type); + if (r < 0) { + return false; + } + return namespace_type != RBD_SNAP_NAMESPACE_TYPE_USER; +} + +} // namespace utils +} // namespace rbd diff --git a/src/tools/rbd/Utils.h b/src/tools/rbd/Utils.h new file mode 100644 index 00000000..81ea2c71 --- /dev/null +++ b/src/tools/rbd/Utils.h @@ -0,0 +1,204 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_UTILS_H +#define CEPH_RBD_UTILS_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "tools/rbd/ArgumentTypes.h" +#include <string> 
+#include <boost/program_options.hpp> + +namespace rbd { +namespace utils { + +namespace detail { + +template <typename T, void(T::*MF)(int)> +void aio_completion_callback(librbd::completion_t completion, + void *arg) { + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast<librbd::RBD::AioCompletion*>(completion); + + // complete the AIO callback in separate thread context + T *t = reinterpret_cast<T *>(arg); + int r = aio_completion->get_return_value(); + aio_completion->release(); + + (t->*MF)(r); +} + +} // namespace detail + +static const std::string RBD_DIFF_BANNER ("rbd diff v1\n"); +static const size_t RBD_DEFAULT_SPARSE_SIZE = 4096; + +static const std::string RBD_IMAGE_BANNER_V2 ("rbd image v2\n"); +static const std::string RBD_IMAGE_DIFFS_BANNER_V2 ("rbd image diffs v2\n"); +static const std::string RBD_DIFF_BANNER_V2 ("rbd diff v2\n"); + +#define RBD_DIFF_FROM_SNAP 'f' +#define RBD_DIFF_TO_SNAP 't' +#define RBD_DIFF_IMAGE_SIZE 's' +#define RBD_DIFF_WRITE 'w' +#define RBD_DIFF_ZERO 'z' +#define RBD_DIFF_END 'e' + +#define RBD_SNAP_PROTECTION_STATUS 'p' + +#define RBD_EXPORT_IMAGE_ORDER 'O' +#define RBD_EXPORT_IMAGE_FEATURES 'T' +#define RBD_EXPORT_IMAGE_STRIPE_UNIT 'U' +#define RBD_EXPORT_IMAGE_STRIPE_COUNT 'C' +#define RBD_EXPORT_IMAGE_META 'M' +#define RBD_EXPORT_IMAGE_END 'E' + +enum SnapshotPresence { + SNAPSHOT_PRESENCE_NONE, + SNAPSHOT_PRESENCE_PERMITTED, + SNAPSHOT_PRESENCE_REQUIRED +}; + +enum SpecValidation { + SPEC_VALIDATION_FULL, + SPEC_VALIDATION_SNAP, + SPEC_VALIDATION_NONE +}; + +struct ProgressContext : public librbd::ProgressContext { + const char *operation; + bool progress; + int last_pc; + + ProgressContext(const char *o, bool no_progress) + : operation(o), progress(!no_progress), last_pc(0) { + } + + int update_progress(uint64_t offset, uint64_t total) override; + void finish(); + void fail(); +}; + +template <typename T, void(T::*MF)(int)> +librbd::RBD::AioCompletion *create_aio_completion(T *t) { + return new 
librbd::RBD::AioCompletion( + t, &detail::aio_completion_callback<T, MF>); +} + +void aio_context_callback(librbd::completion_t completion, void *arg); + +int read_string(int fd, unsigned max, std::string *out); + +int extract_spec(const std::string &spec, std::string *pool_name, + std::string *namespace_name, std::string *name, + std::string *snap_name, SpecValidation spec_validation); + +std::string get_positional_argument( + const boost::program_options::variables_map &vm, size_t index); + +std::string get_default_pool_name(); +int get_pool_and_namespace_names( + const boost::program_options::variables_map &vm, + bool default_empty_pool_name, bool validate_pool_name, + std::string* pool_name, std::string* namespace_name, size_t *arg_index); + +int get_pool_image_snapshot_names( + const boost::program_options::variables_map &vm, + argument_types::ArgumentModifier mod, size_t *spec_arg_index, + std::string *pool_name, std::string *namespace_name, + std::string *image_name, std::string *snap_name, bool image_name_required, + SnapshotPresence snapshot_presence, SpecValidation spec_validation); + +int get_pool_generic_snapshot_names( + const boost::program_options::variables_map &vm, + argument_types::ArgumentModifier mod, size_t *spec_arg_index, + const std::string& pool_key, std::string *pool_name, + std::string *namespace_name, const std::string& generic_key, + const std::string& generic_key_desc, std::string *generic_name, + std::string *snap_name, bool generic_name_required, + SnapshotPresence snapshot_presence, SpecValidation spec_validation); + +int get_pool_image_id(const boost::program_options::variables_map &vm, + size_t *spec_arg_index, + std::string *pool_name, + std::string *namespace_name, + std::string *image_id); + +int validate_snapshot_name(argument_types::ArgumentModifier mod, + const std::string &snap_name, + SnapshotPresence snapshot_presence, + SpecValidation spec_validation); + +int get_image_options(const boost::program_options::variables_map 
&vm, + bool get_format, librbd::ImageOptions* opts); + +int get_journal_options(const boost::program_options::variables_map &vm, + librbd::ImageOptions *opts); + +int get_flatten_option(const boost::program_options::variables_map &vm, + librbd::ImageOptions *opts); + +int get_image_size(const boost::program_options::variables_map &vm, + uint64_t *size); + +int get_path(const boost::program_options::variables_map &vm, + size_t *arg_index, std::string *path); + +int get_formatter(const boost::program_options::variables_map &vm, + argument_types::Format::Formatter *formatter); + +void init_context(); + +int init_rados(librados::Rados *rados); + +int init(const std::string &pool_name, const std::string& namespace_name, + librados::Rados *rados, librados::IoCtx *io_ctx); +int init_io_ctx(librados::Rados &rados, const std::string &pool_name, + const std::string& namespace_name, librados::IoCtx *io_ctx); +int set_namespace(const std::string& namespace_name, librados::IoCtx *io_ctx); + +void disable_cache(); + +int open_image(librados::IoCtx &io_ctx, const std::string &image_name, + bool read_only, librbd::Image *image); + +int open_image_by_id(librados::IoCtx &io_ctx, const std::string &image_id, + bool read_only, librbd::Image *image); + +int init_and_open_image(const std::string &pool_name, + const std::string &namespace_name, + const std::string &image_name, + const std::string &image_id, + const std::string &snap_name, bool read_only, + librados::Rados *rados, librados::IoCtx *io_ctx, + librbd::Image *image); + +int snap_set(librbd::Image &image, const std::string &snap_name); + +void calc_sparse_extent(const bufferptr &bp, + size_t sparse_size, + size_t buffer_offset, + uint64_t length, + size_t *write_length, + bool *zeroed); + +bool is_not_user_snap_namespace(librbd::Image* image, + const librbd::snap_info_t &snap_info); + +std::string image_id(librbd::Image& image); + +std::string mirror_image_state(librbd::mirror_image_state_t mirror_image_state); +std::string 
mirror_image_status_state(librbd::mirror_image_status_state_t state); +std::string mirror_image_status_state(librbd::mirror_image_status_t status); + +std::string timestr(time_t t); + +// duplicate here to not include librbd_internal lib +uint64_t get_rbd_default_features(CephContext* cct); + +} // namespace utils +} // namespace rbd + +#endif // CEPH_RBD_UTILS_H diff --git a/src/tools/rbd/action/Bench.cc b/src/tools/rbd/action/Bench.cc new file mode 100644 index 00000000..27843c7b --- /dev/null +++ b/src/tools/rbd/action/Bench.cc @@ -0,0 +1,539 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "common/strtol.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "global/signal_handler.h" +#include <atomic> +#include <chrono> +#include <iostream> +#include <boost/accumulators/accumulators.hpp> +#include <boost/accumulators/statistics/stats.hpp> +#include <boost/accumulators/statistics/rolling_sum.hpp> +#include <boost/program_options.hpp> + +using namespace std::chrono; + +static std::atomic<bool> terminating; +static void handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + terminating = true; +} + +namespace rbd { +namespace action { +namespace bench { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +enum io_type_t { + IO_TYPE_READ = 0, + IO_TYPE_WRITE, + IO_TYPE_RW, + + IO_TYPE_NUM, +}; + +struct IOType {}; +struct Size {}; +struct IOPattern {}; + +void validate(boost::any& v, const std::vector<std::string>& values, + Size *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + std::string parse_error; + uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error); + if (!parse_error.empty()) { + throw 
po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(size); +} + +void validate(boost::any& v, const std::vector<std::string>& values, + IOPattern *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + if (s == "rand") { + v = boost::any(true); + } else if (s == "seq") { + v = boost::any(false); + } else { + throw po::validation_error(po::validation_error::invalid_option_value); + } +} + +io_type_t get_io_type(string io_type_string) { + if (io_type_string == "read") + return IO_TYPE_READ; + else if (io_type_string == "write") + return IO_TYPE_WRITE; + else if (io_type_string == "readwrite" || io_type_string == "rw") + return IO_TYPE_RW; + else + return IO_TYPE_NUM; +} + +void validate(boost::any& v, const std::vector<std::string>& values, + IOType *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + io_type_t io_type = get_io_type(s); + if (io_type >= IO_TYPE_NUM) + throw po::validation_error(po::validation_error::invalid_option_value); + else + v = boost::any(io_type); +} + +} // anonymous namespace + +static void rbd_bencher_completion(void *c, void *pc); +struct rbd_bencher; + +struct bencher_completer { + rbd_bencher *bencher; + bufferlist *bl; + +public: + bencher_completer(rbd_bencher *bencher, bufferlist *bl) + : bencher(bencher), bl(bl) + { } + + ~bencher_completer() + { + if (bl) + delete bl; + } +}; + +struct rbd_bencher { + librbd::Image *image; + Mutex lock; + Cond cond; + int in_flight; + io_type_t io_type; + uint64_t io_size; + bufferlist write_bl; + + explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size) + : image(i), + lock("rbd_bencher::lock"), + in_flight(0), + io_type(io_type), + io_size(io_size) + { + if (io_type == IO_TYPE_WRITE || io_type == IO_TYPE_RW) { + bufferptr bp(io_size); + memset(bp.c_str(), rand() & 0xff, io_size); + 
write_bl.push_back(bp); + } + } + + void start_io(int max, uint64_t off, uint64_t len, int op_flags, bool read_flag) + { + { + Mutex::Locker l(lock); + in_flight++; + } + + librbd::RBD::AioCompletion *c; + if (read_flag) { + bufferlist *read_bl = new bufferlist(); + c = new librbd::RBD::AioCompletion((void *)(new bencher_completer(this, read_bl)), + rbd_bencher_completion); + image->aio_read2(off, len, *read_bl, c, op_flags); + } else { + c = new librbd::RBD::AioCompletion((void *)(new bencher_completer(this, NULL)), + rbd_bencher_completion); + image->aio_write2(off, len, write_bl, c, op_flags); + } + } + + int wait_for(int max, bool interrupt_on_terminating) { + Mutex::Locker l(lock); + while (in_flight > max && !(terminating && interrupt_on_terminating)) { + utime_t dur; + dur.set_from_double(.2); + cond.WaitInterval(lock, dur); + } + + return terminating ? -EINTR : 0; + } + +}; + +void rbd_bencher_completion(void *vc, void *pc) +{ + librbd::RBD::AioCompletion *c = (librbd::RBD::AioCompletion *)vc; + bencher_completer *bc = static_cast<bencher_completer *>(pc); + rbd_bencher *b = bc->bencher; + //cout << "complete " << c << std::endl; + int ret = c->get_return_value(); + if (b->io_type == IO_TYPE_WRITE && ret != 0) { + cout << "write error: " << cpp_strerror(ret) << std::endl; + exit(ret < 0 ? -ret : ret); + } else if (b->io_type == IO_TYPE_READ && (unsigned int)ret != b->io_size) { + cout << "read error: " << cpp_strerror(ret) << std::endl; + exit(ret < 0 ? 
-ret : ret); + } + b->lock.Lock(); + b->in_flight--; + b->cond.Signal(); + b->lock.Unlock(); + c->release(); + delete bc; +} + +bool should_read(uint64_t read_proportion) +{ + uint64_t rand_num = rand() % 100; + + if (rand_num < read_proportion) + return true; + else + return false; +} + +int do_bench(librbd::Image& image, io_type_t io_type, + uint64_t io_size, uint64_t io_threads, + uint64_t io_bytes, bool random, uint64_t read_proportion) +{ + uint64_t size = 0; + image.size(&size); + if (io_size > size) { + std::cerr << "rbd: io-size " << byte_u_t(io_size) << " " + << "larger than image size " << byte_u_t(size) << std::endl; + return -EINVAL; + } + + if (io_size > std::numeric_limits<uint32_t>::max()) { + std::cerr << "rbd: io-size should be less than 4G" << std::endl; + return -EINVAL; + } + + int r = image.flush(); + if (r < 0 && (r != -EROFS || io_type != IO_TYPE_READ)) { + std::cerr << "rbd: failed to flush: " << cpp_strerror(r) << std::endl; + return r; + } + + rbd_bencher b(&image, io_type, io_size); + + std::cout << "bench " + << " type " << (io_type == IO_TYPE_READ ? "read" : + io_type == IO_TYPE_WRITE ? "write" : "readwrite") + << (io_type == IO_TYPE_RW ? " read:write=" + + to_string(read_proportion) + ":" + to_string(100 - read_proportion) : "") + << " io_size " << io_size + << " io_threads " << io_threads + << " bytes " << io_bytes + << " pattern " << (random ? 
"random" : "sequential") + << std::endl; + + srand(time(NULL) % (unsigned long) -1); + + coarse_mono_time start = coarse_mono_clock::now(); + chrono::duration<double> last = chrono::duration<double>::zero(); + unsigned ios = 0; + + vector<uint64_t> thread_offset; + uint64_t i; + uint64_t start_pos; + + uint64_t unit_len = size/io_size/io_threads; + // disturb all thread's offset + for (i = 0; i < io_threads; i++) { + if (random) { + start_pos = (rand() % (size / io_size)) * io_size; + } else { + start_pos = unit_len * i * io_size; + } + thread_offset.push_back(start_pos); + } + + const int WINDOW_SIZE = 5; + typedef boost::accumulators::accumulator_set< + double, boost::accumulators::stats< + boost::accumulators::tag::rolling_sum> > RollingSum; + + RollingSum time_acc( + boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE); + RollingSum ios_acc( + boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE); + RollingSum off_acc( + boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE); + uint64_t cur_ios = 0; + uint64_t cur_off = 0; + + int op_flags; + if (random) { + op_flags = LIBRADOS_OP_FLAG_FADVISE_RANDOM; + } else { + op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL; + } + + printf(" SEC OPS OPS/SEC BYTES/SEC\n"); + uint64_t off; + int read_ops = 0; + int write_ops = 0; + + for (off = 0; off < io_bytes; ) { + // Issue I/O + i = 0; + int r = 0; + while (i < io_threads && off < io_bytes) { + bool read_flag = should_read(read_proportion); + + r = b.wait_for(io_threads - 1, true); + if (r < 0) { + break; + } + b.start_io(io_threads, thread_offset[i], io_size, op_flags, read_flag); + + ++i; + ++ios; + off += io_size; + + ++cur_ios; + cur_off += io_size; + + if (read_flag) + read_ops++; + else + write_ops++; + } + + if (r < 0) { + break; + } + + // Set the thread_offsets of next I/O + for (i = 0; i < io_threads; ++i) { + if (random) { + thread_offset[i] = (rand() % (size / io_size)) * io_size; + continue; + } + if (off < 
(io_size * unit_len * io_threads) ) { + thread_offset[i] += io_size; + } else { + // thread_offset is adjusted to the chunks unassigned to threads. + thread_offset[i] = off + (i * io_size); + } + if (thread_offset[i] + io_size > size) + thread_offset[i] = unit_len * i * io_size; + } + + coarse_mono_time now = coarse_mono_clock::now(); + chrono::duration<double> elapsed = now - start; + if (last == chrono::duration<double>::zero()) { + last = elapsed; + } else if ((int)elapsed.count() != (int)last.count()) { + time_acc((elapsed - last).count()); + ios_acc(static_cast<double>(cur_ios)); + off_acc(static_cast<double>(cur_off)); + cur_ios = 0; + cur_off = 0; + + double time_sum = boost::accumulators::rolling_sum(time_acc); + printf("%5d %8d %8.2lf %8.2lf\n", + (int)elapsed.count(), + (int)(ios - io_threads), + boost::accumulators::rolling_sum(ios_acc) / time_sum, + boost::accumulators::rolling_sum(off_acc) / time_sum); + last = elapsed; + } + } + b.wait_for(0, false); + + if (io_type != IO_TYPE_READ) { + r = image.flush(); + if (r < 0) { + std::cerr << "rbd: failed to flush at the end: " << cpp_strerror(r) + << std::endl; + } + } + + coarse_mono_time now = coarse_mono_clock::now(); + chrono::duration<double> elapsed = now - start; + + printf("elapsed: %5d ops: %8d ops/sec: %8.2lf bytes/sec: %8.2lf\n", + (int)elapsed.count(), ios, (double)ios / elapsed.count(), + (double)off / elapsed.count()); + + if (io_type == IO_TYPE_RW) { + printf("read_ops: %5d read_ops/sec: %8.2lf read_bytes/sec: %8.2lf\n", + read_ops, (double)read_ops / elapsed.count(), + (double)read_ops * io_size / elapsed.count()); + + printf("write_ops: %5d write_ops/sec: %8.2lf write_bytes/sec: %8.2lf\n", + write_ops, (double)write_ops / elapsed.count(), + (double)write_ops * io_size / elapsed.count()); + } + + return 0; +} + +void add_bench_common_options(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); 
+ + options->add_options() + ("io-size", po::value<Size>(), "IO size (in B/K/M/G/T) [default: 4K]") + ("io-threads", po::value<uint32_t>(), "ios in flight [default: 16]") + ("io-total", po::value<Size>(), "total size for IO (in B/K/M/G/T) [default: 1G]") + ("io-pattern", po::value<IOPattern>(), "IO pattern (rand or seq) [default: seq]") + ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]"); +} + +void get_arguments_for_write(po::options_description *positional, + po::options_description *options) { + add_bench_common_options(positional, options); +} + +void get_arguments_for_bench(po::options_description *positional, + po::options_description *options) { + add_bench_common_options(positional, options); + + options->add_options() + ("io-type", po::value<IOType>()->required(), "IO type (read , write, or readwrite(rw))"); +} + +int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + utils::SnapshotPresence snap_presence = utils::SNAPSHOT_PRESENCE_NONE; + if (bench_io_type == IO_TYPE_READ) + snap_presence = utils::SNAPSHOT_PRESENCE_PERMITTED; + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, snap_presence, utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + uint64_t bench_io_size; + if (vm.count("io-size")) { + bench_io_size = vm["io-size"].as<uint64_t>(); + } else { + bench_io_size = 4096; + } + if (bench_io_size == 0) { + std::cerr << "rbd: --io-size should be greater than zero." 
<< std::endl; + return -EINVAL; + } + + uint32_t bench_io_threads; + if (vm.count("io-threads")) { + bench_io_threads = vm["io-threads"].as<uint32_t>(); + } else { + bench_io_threads = 16; + } + if (bench_io_threads == 0) { + std::cerr << "rbd: --io-threads should be greater than zero." << std::endl; + return -EINVAL; + } + + uint64_t bench_bytes; + if (vm.count("io-total")) { + bench_bytes = vm["io-total"].as<uint64_t>(); + } else { + bench_bytes = 1 << 30; + } + + bool bench_random; + if (vm.count("io-pattern")) { + bench_random = vm["io-pattern"].as<bool>(); + } else { + bench_random = false; + } + + uint64_t bench_read_proportion; + if (bench_io_type == IO_TYPE_READ) { + bench_read_proportion = 100; + } else if (bench_io_type == IO_TYPE_WRITE) { + bench_read_proportion = 0; + } else { + if (vm.count("rw-mix-read")) { + bench_read_proportion = vm["rw-mix-read"].as<uint64_t>(); + } else { + bench_read_proportion = 50; + } + + if (bench_read_proportion > 100) { + std::cerr << "rbd: --rw-mix-read should not be larger than 100." 
<< std::endl; + return -EINVAL; + } + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + r = do_bench(image, bench_io_type, bench_io_size, bench_io_threads, + bench_bytes, bench_random, bench_read_proportion); + + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); + + if (r < 0) { + std::cerr << "bench failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int execute_for_write(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::cerr << "rbd: bench-write is deprecated, use rbd bench --io-type write ..." << std::endl; + return bench_execute(vm, IO_TYPE_WRITE); +} + +int execute_for_bench(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + io_type_t bench_io_type; + if (vm.count("io-type")) { + bench_io_type = vm["io-type"].as<io_type_t>(); + } else { + std::cerr << "rbd: --io-type must be specified." << std::endl; + return -EINVAL; + } + + return bench_execute(vm, bench_io_type); +} + +Shell::Action action_write( + {"bench-write"}, {}, "Simple write benchmark. 
(Deprecated, please use `rbd bench --io-type write` instead.)", + "", &get_arguments_for_write, &execute_for_write, false); + +Shell::Action action_bench( + {"bench"}, {}, "Simple benchmark.", "", &get_arguments_for_bench, &execute_for_bench); + +} // namespace bench +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Children.cc b/src/tools/rbd/action/Children.cc new file mode 100644 index 00000000..f459e92b --- /dev/null +++ b/src/tools/rbd/action/Children.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace children { + +namespace at = argument_types; +namespace po = boost::program_options; + +int do_list_children(librados::IoCtx &io_ctx, librbd::Image &image, + bool all_flag, bool descendants_flag, Formatter *f) +{ + std::vector<librbd::linked_image_spec_t> children; + librbd::RBD rbd; + int r; + if (descendants_flag) { + r = image.list_descendants(&children); + } else { + r = image.list_children3(&children); + } + if (r < 0) + return r; + + if (f) + f->open_array_section("children"); + + for (auto& child : children) { + bool trash = child.trash; + if (f) { + if (all_flag) { + f->open_object_section("child"); + f->dump_string("pool", child.pool_name); + f->dump_string("pool_namespace", child.pool_namespace); + f->dump_string("image", child.image_name); + f->dump_string("id", child.image_id); + f->dump_bool("trash", child.trash); + f->close_section(); + } else if (!trash) { + f->open_object_section("child"); + f->dump_string("pool", child.pool_name); + f->dump_string("pool_namespace", child.pool_namespace); + f->dump_string("image", child.image_name); + f->close_section(); + } + } else if (all_flag || !trash) 
{ + if (child.pool_name.empty()) { + std::cout << "(child missing " << child.pool_id << "/"; + } else { + std::cout << child.pool_name << "/"; + } + if (!child.pool_namespace.empty()) { + std::cout << child.pool_namespace << "/"; + } + if (child.image_name.empty()) { + std::cout << child.image_id << ")"; + } else { + std::cout << child.image_name; + if (trash) { + std::cout << " (trash " << child.image_id << ")"; + } + } + std::cout << std::endl; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } + + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + at::add_snap_id_option(options); + options->add_options() + ("all,a", po::bool_switch(), "list all children (include trash)"); + options->add_options() + ("descendants", po::bool_switch(), "include all descendants"); + at::add_format_options(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + uint64_t snap_id = LIBRADOS_SNAP_HEAD; + if (vm.count(at::SNAPSHOT_ID)) { + snap_id = vm[at::SNAPSHOT_ID].as<uint64_t>(); + } + + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + if (snap_id != LIBRADOS_SNAP_HEAD && !snap_name.empty()) { + std::cerr << "rbd: trying to access snapshot using both name and id." 
+ << std::endl; + return -EINVAL; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + if (!snap_name.empty()) { + r = image.snap_set(snap_name.c_str()); + } else if (snap_id != LIBRADOS_SNAP_HEAD) { + r = image.snap_set_by_id(snap_id); + } + if (r == -ENOENT) { + std::cerr << "rbd: snapshot does not exist." << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: error setting snapshot: " << cpp_strerror(r) + << std::endl; + return r; + } + + r = do_list_children(io_ctx, image, vm["all"].as<bool>(), + vm["descendants"].as<bool>(), formatter.get()); + if (r < 0) { + std::cerr << "rbd: listing children failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"children"}, {}, "Display children of an image or its snapshot.", "", + &get_arguments, &execute); + +} // namespace children +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Clone.cc b/src/tools/rbd/action/Clone.cc new file mode 100644 index 00000000..6406c957 --- /dev/null +++ b/src/tools/rbd/action/Clone.cc @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace clone { + +namespace at = argument_types; +namespace po = boost::program_options; + +int do_clone(librbd::RBD &rbd, librados::IoCtx &p_ioctx, + const char *p_name, const char *p_snapname, + librados::IoCtx &c_ioctx, const char *c_name, + librbd::ImageOptions& opts) { + return 
rbd.clone3(p_ioctx, p_name, p_snapname, c_ioctx, c_name, opts); +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); + at::add_create_image_options(options, false); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string dst_pool_name; + std::string dst_namespace_name; + std::string dst_image_name; + std::string dst_snap_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, + &dst_namespace_name, &dst_image_name, &dst_snap_name, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librbd::ImageOptions opts; + r = utils::get_image_options(vm, false, &opts); + if (r < 0) { + return r; + } + opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2)); + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librados::IoCtx dst_io_ctx; + r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = do_clone(rbd, io_ctx, image_name.c_str(), snap_name.c_str(), dst_io_ctx, + dst_image_name.c_str(), opts); + if (r == -EXDEV) { + std::cerr << "rbd: clone v2 required for cross-namespace clones." 
+ << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: clone error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"clone"}, {}, "Clone a snapshot into a CoW child image.", + at::get_long_features_help(), &get_arguments, &execute); + +} // namespace clone +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Config.cc b/src/tools/rbd/action/Config.cc new file mode 100644 index 00000000..2868c7ad --- /dev/null +++ b/src/tools/rbd/action/Config.cc @@ -0,0 +1,890 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "common/ceph_context.h" +#include "common/ceph_json.h" +#include "common/escape.h" +#include "common/errno.h" +#include "common/options.h" +#include "global/global_context.h" +#include "include/stringify.h" + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" + +#include <iostream> + +#include <boost/algorithm/string/predicate.hpp> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace config { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +const std::string METADATA_CONF_PREFIX = "conf_"; +const uint32_t MAX_KEYS = 64; + +void add_config_entity_option( + boost::program_options::options_description *positional) { + positional->add_options() + ("config-entity", "config entity (global, client, client.<id>)"); +} + +void add_pool_option(boost::program_options::options_description *positional) { + positional->add_options() + ("pool-name", "pool name"); +} + +void add_key_option(po::options_description *positional) { + positional->add_options() + ("key", "config key"); +} + +int get_config_entity(const po::variables_map &vm, std::string *config_entity) { + *config_entity = utils::get_positional_argument(vm, 0); + + if (*config_entity 
!= "global" && *config_entity != "client" && + !boost::starts_with(*config_entity, ("client."))) { + std::cerr << "rbd: invalid config entity: " << *config_entity + << " (must be global, client or client.<id>)" << std::endl; + return -EINVAL; + } + + return 0; +} + +int get_pool(const po::variables_map &vm, std::string *pool_name) { + *pool_name = utils::get_positional_argument(vm, 0); + if (pool_name->empty()) { + std::cerr << "rbd: pool name was not specified" << std::endl; + return -EINVAL; + } + + return 0; +} + +int get_key(const po::variables_map &vm, size_t *arg_index, + std::string *key) { + *key = utils::get_positional_argument(vm, *arg_index); + if (key->empty()) { + std::cerr << "rbd: config key was not specified" << std::endl; + return -EINVAL; + } else { + ++(*arg_index); + } + + if (!boost::starts_with(*key, "rbd_")) { + std::cerr << "rbd: not rbd option: " << *key << std::endl; + return -EINVAL; + } + + std::string value; + int r = g_ceph_context->_conf.get_val(key->c_str(), &value); + if (r < 0) { + std::cerr << "rbd: invalid config key: " << *key << std::endl; + return -EINVAL; + } + + return 0; +} + +std::ostream& operator<<(std::ostream& os, + const librbd::config_source_t& source) { + switch (source) { + case RBD_CONFIG_SOURCE_CONFIG: + os << "config"; + break; + case RBD_CONFIG_SOURCE_POOL: + os << "pool"; + break; + case RBD_CONFIG_SOURCE_IMAGE: + os << "image"; + break; + default: + os << "unknown (" << static_cast<uint32_t>(source) << ")"; + break; + } + return os; +} + +int config_global_list( + librados::Rados &rados, const std::string &config_entity, + std::map<std::string, std::pair<std::string, std::string>> *options) { + bool client_id_config_entity = + boost::starts_with(config_entity, ("client.")); + std::string cmd = + "{" + "\"prefix\": \"config dump\", " + "\"format\": \"json\" " + "}"; + bufferlist in_bl; + bufferlist out_bl; + std::string ss; + int r = rados.mon_command(cmd, in_bl, &out_bl, &ss); + if (r < 0) { + std::cerr << 
"rbd: error reading config: " << ss << std::endl; + return r; + } + + json_spirit::mValue json_root; + if (!json_spirit::read(out_bl.to_str(), json_root)) { + std::cerr << "rbd: error parsing config dump" << std::endl; + return -EINVAL; + } + + try { + auto &json_array = json_root.get_array(); + for (auto& e : json_array) { + auto &json_obj = e.get_obj(); + std::string section; + std::string name; + std::string value; + + for (auto &pairs : json_obj) { + if (pairs.first == "section") { + section = pairs.second.get_str(); + } else if (pairs.first == "name") { + name = pairs.second.get_str(); + } else if (pairs.first == "value") { + value = pairs.second.get_str(); + } + } + + if (!boost::starts_with(name, "rbd_")) { + continue; + } + if (section != "global" && section != "client" && + (!client_id_config_entity || section != config_entity)) { + continue; + } + if (config_entity == "global" && section != "global") { + continue; + } + auto it = options->find(name); + if (it == options->end()) { + (*options)[name] = {value, section}; + continue; + } + if (section == "client") { + if (it->second.second == "global") { + it->second = {value, section}; + } + } else if (client_id_config_entity) { + it->second = {value, section}; + } + } + } catch (std::runtime_error &e) { + std::cerr << "rbd: error parsing config dump: " << e.what() << std::endl; + return -EINVAL; + } + + return 0; +} + +} // anonymous namespace + +void get_global_get_arguments(po::options_description *positional, + po::options_description *options) { + add_config_entity_option(positional); + add_key_option(positional); +} + +int execute_global_get(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string config_entity; + int r = get_config_entity(vm, &config_entity); + if (r < 0) { + return r; + } + + std::string key; + size_t arg_index = 1; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + r = 
utils::init_rados(&rados); + if (r < 0) { + return r; + } + + std::map<std::string, std::pair<std::string, std::string>> options; + r = config_global_list(rados, config_entity, &options); + if (r < 0) { + return r; + } + + auto it = options.find(key); + + if (it == options.end() || it->second.second != config_entity) { + std::cerr << "rbd: " << key << " is not set" << std::endl; + return -ENOENT; + } + + std::cout << it->second.first << std::endl; + return 0; +} + +void get_global_set_arguments(po::options_description *positional, + po::options_description *options) { + add_config_entity_option(positional); + add_key_option(positional); + positional->add_options() + ("value", "config value"); +} + +int execute_global_set(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string config_entity; + int r = get_config_entity(vm, &config_entity); + if (r < 0) { + return r; + } + + std::string key; + size_t arg_index = 1; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + r = utils::init_rados(&rados); + if (r < 0) { + return r; + } + + std::string value = utils::get_positional_argument(vm, 2); + std::string cmd = + "{" + "\"prefix\": \"config set\", " + "\"who\": \"" + stringify(json_stream_escaper(config_entity)) + "\", " + "\"name\": \"" + key + "\", " + "\"value\": \"" + stringify(json_stream_escaper(value)) + "\"" + "}"; + bufferlist in_bl; + std::string ss; + r = rados.mon_command(cmd, in_bl, nullptr, &ss); + if (r < 0) { + std::cerr << "rbd: error setting " << key << ": " << ss << std::endl; + return r; + } + + return 0; +} + +void get_global_remove_arguments(po::options_description *positional, + po::options_description *options) { + add_config_entity_option(positional); + add_key_option(positional); +} + +int execute_global_remove( + const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string config_entity; + int r = 
get_config_entity(vm, &config_entity); + if (r < 0) { + return r; + } + + std::string key; + size_t arg_index = 1; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + r = utils::init_rados(&rados); + if (r < 0) { + return r; + } + + std::string cmd = + "{" + "\"prefix\": \"config rm\", " + "\"who\": \"" + stringify(json_stream_escaper(config_entity)) + "\", " + "\"name\": \"" + key + "\"" + "}"; + bufferlist in_bl; + std::string ss; + r = rados.mon_command(cmd, in_bl, nullptr, &ss); + if (r < 0) { + std::cerr << "rbd: error removing " << key << ": " << ss << std::endl; + return r; + } + + return 0; +} + +void get_global_list_arguments(po::options_description *positional, + po::options_description *options) { + add_config_entity_option(positional); + at::add_format_options(options); +} + +int execute_global_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string config_entity; + int r = get_config_entity(vm, &config_entity); + if (r < 0) { + return r; + } + + at::Format::Formatter f; + r = utils::get_formatter(vm, &f); + if (r < 0) { + return r; + } + + librados::Rados rados; + r = utils::init_rados(&rados); + if (r < 0) { + return r; + } + + std::map<std::string, std::pair<std::string, std::string>> options; + r = config_global_list(rados, config_entity, &options); + if (r < 0) { + return r; + } + + if (options.empty() && !f) { + return 0; + } + + TextTable tbl; + + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Section", TextTable::LEFT, TextTable::LEFT); + } + + for (const auto &it : options) { + if (f) { + f->open_object_section("option"); + f->dump_string("name", it.first); + f->dump_string("value", it.second.first); + f->dump_string("section", it.second.second); + f->close_section(); + } else { + tbl << it.first 
<< it.second.first << it.second.second + << TextTable::endrow; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + std::cout << tbl; + } + + return 0; +} + +void get_pool_get_arguments(po::options_description *positional, + po::options_description *options) { + add_pool_option(positional); + add_key_option(positional); +} + +int execute_pool_get(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + int r = get_pool(vm, &pool_name); + if (r < 0) { + return r; + } + + std::string key; + size_t arg_index = 1; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + std::string value; + + r = rbd.pool_metadata_get(io_ctx, METADATA_CONF_PREFIX + key, &value); + if (r < 0) { + if (r == -ENOENT) { + std::cerr << "rbd: " << key << " is not set" << std::endl; + } else { + std::cerr << "rbd: failed to get " << key << ": " << cpp_strerror(r) + << std::endl; + } + return r; + } + + std::cout << value << std::endl; + return 0; +} + +void get_pool_set_arguments(po::options_description *positional, + po::options_description *options) { + add_pool_option(positional); + add_key_option(positional); + positional->add_options() + ("value", "config value"); +} + +int execute_pool_set(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + int r = get_pool(vm, &pool_name); + if (r < 0) { + return r; + } + + std::string key; + size_t arg_index = 1; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + std::string value = utils::get_positional_argument(vm, 2); + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.pool_metadata_set(io_ctx, 
METADATA_CONF_PREFIX + key, value); + if (r < 0) { + std::cerr << "rbd: failed to set " << key << ": " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +void get_pool_remove_arguments(po::options_description *positional, + po::options_description *options) { + add_pool_option(positional); + add_key_option(positional); +} + +int execute_pool_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + int r = get_pool(vm, &pool_name); + if (r < 0) { + return r; + } + + std::string key; + size_t arg_index = 1; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.pool_metadata_remove(io_ctx, METADATA_CONF_PREFIX + key); + if (r < 0) { + std::cerr << "rbd: failed to remove " << key << ": " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +void get_pool_list_arguments(po::options_description *positional, + po::options_description *options) { + add_pool_option(positional); + at::add_format_options(options); +} + +int execute_pool_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + int r = get_pool(vm, &pool_name); + if (r < 0) { + return r; + } + + at::Format::Formatter f; + r = utils::get_formatter(vm, &f); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + TextTable tbl; + librbd::RBD rbd; + std::vector<librbd::config_option_t> options; + + r = rbd.config_list(io_ctx, &options); + if (r < 0) { + std::cerr << "rbd: failed to list config: " << cpp_strerror(r) << std::endl; + return r; + } + + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT); 
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Source", TextTable::LEFT, TextTable::LEFT); + } + + for (auto &option : options) { + if (f) { + f->open_object_section("option"); + f->dump_string("name", option.name); + f->dump_string("value", option.value); + f->dump_stream("source") << option.source; + f->close_section(); + } else { + std::ostringstream source; + source << option.source; + tbl << option.name << option.value << source.str() << TextTable::endrow; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + std::cout << tbl; + } + + return 0; +} + +void get_image_get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_key_option(positional); +} + +int execute_image_get(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string key; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + std::string value; + + r = image.metadata_get(METADATA_CONF_PREFIX + key, &value); + if (r < 0) { + if (r == -ENOENT) { + std::cerr << "rbd: " << key << " is not set" << std::endl; + } else { + std::cerr << "rbd: failed to get " << key << ": " << cpp_strerror(r) + << std::endl; + } + return r; + } + + std::cout << value << std::endl; + return 0; +} 
+ +void get_image_set_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_key_option(positional); + positional->add_options() + ("value", "config value"); +} + +int execute_image_set(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string key; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + std::string value = utils::get_positional_argument(vm, arg_index); + if (value.empty()) { + std::cerr << "rbd: image config value was not specified" << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = image.metadata_set(METADATA_CONF_PREFIX + key, value); + if (r < 0) { + std::cerr << "rbd: failed to set " << key << ": " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +void get_image_remove_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_key_option(positional); +} + +int execute_image_remove( + const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, 
at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string key; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = image.metadata_remove(METADATA_CONF_PREFIX + key); + if (r < 0) { + std::cerr << "rbd: failed to remove " << key << ": " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +void get_image_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_format_options(options); +} + +int execute_image_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + at::Format::Formatter f; + r = utils::get_formatter(vm, &f); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + TextTable tbl; + std::vector<librbd::config_option_t> options; + + r = image.config_list(&options); + if (r < 0) { + std::cerr << "rbd: failed to list config: " << cpp_strerror(r) << std::endl; + return r; + } + + if (options.empty()) { + if (f == 
nullptr) { + std::cout << "There are no values" << std::endl; + } + return 0; + } + + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Source", TextTable::LEFT, TextTable::LEFT); + } + + for (auto &option : options) { + if (f) { + f->open_object_section("option"); + f->dump_string("name", option.name); + f->dump_string("value", option.value); + f->dump_stream("source") << option.source; + f->close_section(); + } else { + std::ostringstream source; + source << option.source; + tbl << option.name << option.value << source.str() << TextTable::endrow; + } + } + + if (f == nullptr) { + bool single = (options.size() == 1); + std::cout << "There " << (single ? "is" : "are") << " " << options.size() + << " " << (single ? "value" : "values") << ":" << std::endl; + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + std::cout << tbl; + } + + return 0; +} + +Shell::Action action_global_get( + {"config", "global", "get"}, {}, + "Get a global-level configuration override.", "", + &get_global_get_arguments, &execute_global_get); +Shell::Action action_global_set( + {"config", "global", "set"}, {}, + "Set a global-level configuration override.", "", + &get_global_set_arguments, &execute_global_set); +Shell::Action action_global_remove( + {"config", "global", "remove"}, {"config", "global", "rm"}, + "Remove a global-level configuration override.", "", + &get_global_remove_arguments, &execute_global_remove); +Shell::Action action_global_list( + {"config", "global", "list"}, {"config", "global", "ls"}, + "List global-level configuration overrides.", "", + &get_global_list_arguments, &execute_global_list); + +Shell::Action action_pool_get( + {"config", "pool", "get"}, {}, "Get a pool-level configuration override.", "", + &get_pool_get_arguments, &execute_pool_get); +Shell::Action action_pool_set( + {"config", 
"pool", "set"}, {}, "Set a pool-level configuration override.", "", + &get_pool_set_arguments, &execute_pool_set); +Shell::Action action_pool_remove( + {"config", "pool", "remove"}, {"config", "pool", "rm"}, + "Remove a pool-level configuration override.", "", + &get_pool_remove_arguments, &execute_pool_remove); +Shell::Action action_pool_list( + {"config", "pool", "list"}, {"config", "pool", "ls"}, + "List pool-level configuration overrides.", "", + &get_pool_list_arguments, &execute_pool_list); + +Shell::Action action_image_get( + {"config", "image", "get"}, {}, "Get an image-level configuration override.", + "", &get_image_get_arguments, &execute_image_get); +Shell::Action action_image_set( + {"config", "image", "set"}, {}, "Set an image-level configuration override.", + "", &get_image_set_arguments, &execute_image_set); +Shell::Action action_image_remove( + {"config", "image", "remove"}, {"config", "image", "rm"}, + "Remove an image-level configuration override.", "", + &get_image_remove_arguments, &execute_image_remove); +Shell::Action action_image_list( + {"config", "image", "list"}, {"config", "image", "ls"}, + "List image-level configuration overrides.", "", + &get_image_list_arguments, &execute_image_list); + +} // namespace config +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Copy.cc b/src/tools/rbd/action/Copy.cc new file mode 100644 index 00000000..9a248437 --- /dev/null +++ b/src/tools/rbd/action/Copy.cc @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace copy { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp, + const char *destname, 
librbd::ImageOptions& opts, + bool no_progress, + size_t sparse_size) +{ + utils::ProgressContext pc("Image copy", no_progress); + int r = src.copy_with_progress4(dest_pp, destname, opts, pc, sparse_size); + if (r < 0){ + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_SOURCE); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); + at::add_create_image_options(options, false); + at::add_sparse_size_option(options); + at::add_no_progress_option(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string dst_pool_name; + std::string dst_namespace_name; + std::string dst_image_name; + std::string dst_snap_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, + &dst_namespace_name, &dst_image_name, &dst_snap_name, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librbd::ImageOptions opts; + r = utils::get_image_options(vm, false, &opts); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + librados::IoCtx dst_io_ctx; + r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, 
&dst_io_ctx); + if (r < 0) { + return r; + } + + size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE; + if (vm.count(at::IMAGE_SPARSE_SIZE)) { + sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>(); + } + r = do_copy(image, dst_io_ctx, dst_image_name.c_str(), opts, + vm[at::NO_PROGRESS].as<bool>(), sparse_size); + if (r < 0) { + std::cerr << "rbd: copy failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"copy"}, {"cp"}, "Copy src image to dest.", at::get_long_features_help(), + &get_arguments, &execute); + +static int do_deep_copy(librbd::Image &src, librados::IoCtx& dest_pp, + const char *destname, librbd::ImageOptions& opts, + bool no_progress) +{ + utils::ProgressContext pc("Image deep copy", no_progress); + int r = src.deep_copy_with_progress(dest_pp, destname, opts, pc); + if (r < 0){ + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_arguments_deep(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_SOURCE); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); + at::add_create_image_options(options, false); + at::add_flatten_option(options); + at::add_no_progress_option(options); +} + +int execute_deep(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string dst_pool_name; + std::string dst_namespace_name; + std::string dst_image_name; + std::string dst_snap_name; + r = utils::get_pool_image_snapshot_names( + vm, 
at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, + &dst_namespace_name, &dst_image_name, &dst_snap_name, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librbd::ImageOptions opts; + r = utils::get_image_options(vm, false, &opts); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + librados::IoCtx dst_io_ctx; + r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx); + if (r < 0) { + return r; + } + + r = do_deep_copy(image, dst_io_ctx, dst_image_name.c_str(), opts, + vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: deep copy failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action_deep( + {"deep", "copy"}, {"deep", "cp"}, "Deep copy src image to dest.", + at::get_long_features_help(), &get_arguments_deep, &execute_deep); + +} // namespace copy +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Create.cc b/src/tools/rbd/action/Create.cc new file mode 100644 index 00000000..99efa0b5 --- /dev/null +++ b/src/tools/rbd/action/Create.cc @@ -0,0 +1,264 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> +#include "common/Cond.h" +#include "common/Mutex.h" + +namespace rbd { +namespace action { +namespace create { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx, + const char *imgname, uint64_t size, + librbd::ImageOptions& opts) { + return rbd.create4(io_ctx, imgname, size, 
opts); +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_create_image_options(options, true); + options->add_options() + (at::IMAGE_THICK_PROVISION.c_str(), po::bool_switch(), "fully allocate storage and zero image"); + at::add_size_option(options); + at::add_no_progress_option(options); +} + +void thick_provision_writer_completion(rbd_completion_t, void *); + +struct thick_provision_writer { + librbd::Image *image; + Mutex lock; + Cond cond; + bufferlist bl; + uint64_t chunk_size; + const int block_size; + uint64_t concurr; + struct { + uint64_t in_flight; + int io_error; + } io_status; + + // Constructor + explicit thick_provision_writer(librbd::Image *i, librbd::ImageOptions &o) + : image(i), + lock("thick_provision_writer::lock"), + block_size(512) // 512 Bytes + { + // If error cases occur, the code is aborted, because + // constructor cannot return error value. 
+ ceph_assert(g_ceph_context != nullptr); + bl.append_zero(block_size); + + librbd::image_info_t info; + int r = image->stat(info, sizeof(info)); + ceph_assert(r >= 0); + uint64_t order; + if (info.order == 0) { + order = g_conf().get_val<uint64_t>("rbd_default_order"); + } else { + order = info.order; + } + chunk_size = (1ull << order); + if (image->get_stripe_unit() < chunk_size) { + chunk_size = image->get_stripe_unit(); + } + + concurr = g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"); + io_status.in_flight = 0; + io_status.io_error = 0; + } + + int start_io(uint64_t write_offset) + { + { + Mutex::Locker l(lock); + io_status.in_flight++; + if (io_status.in_flight > concurr) { + io_status.in_flight--; + return -EINVAL; + } + } + + librbd::RBD::AioCompletion *c; + c = new librbd::RBD::AioCompletion(this, thick_provision_writer_completion); + int r; + r = image->aio_writesame(write_offset, chunk_size, bl, c, LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL); + if (r < 0) { + Mutex::Locker l(lock); + io_status.io_error = r; + } + return r; + } + + int wait_for(uint64_t max) { + Mutex::Locker l(lock); + int r = io_status.io_error; + + while (io_status.in_flight > max) { + utime_t dur; + dur.set_from_double(.2); + cond.WaitInterval(lock, dur); + } + return r; + } +}; + +void thick_provision_writer_completion(rbd_completion_t rc, void *pc) { + librbd::RBD::AioCompletion *ac = (librbd::RBD::AioCompletion *)rc; + thick_provision_writer *tc = static_cast<thick_provision_writer *>(pc); + + int r = ac->get_return_value(); + tc->lock.Lock(); + if (r < 0 && tc->io_status.io_error >= 0) { + tc->io_status.io_error = r; + } + tc->io_status.in_flight--; + tc->cond.Signal(); + tc->lock.Unlock(); + ac->release(); +} + +int write_data(librbd::Image &image, librbd::ImageOptions &opts, + bool no_progress) { + uint64_t image_size; + int r = 0; + utils::ProgressContext pc("Thick provisioning", no_progress); + + if (image.size(&image_size) != 0) { + return -EINVAL; + } + + 
thick_provision_writer tpw(&image, opts); + uint64_t off; + uint64_t i; + for (off = 0; off < image_size;) { + i = 0; + while (i < tpw.concurr && off < image_size) { + tpw.wait_for(tpw.concurr - 1); + r = tpw.start_io(off); + if (r != 0) { + goto err_writesame; + } + ++i; + off += tpw.chunk_size; + if(off > image_size) { + off = image_size; + } + pc.update_progress(off, image_size); + } + } + + tpw.wait_for(0); + r = image.flush(); + if (r < 0) { + std::cerr << "rbd: failed to flush at the end: " << cpp_strerror(r) + << std::endl; + goto err_writesame; + } + pc.finish(); + + return r; + +err_writesame: + tpw.wait_for(0); + pc.fail(); + + return r; +} + +int thick_write(const std::string &image_name,librados::IoCtx &io_ctx, + librbd::ImageOptions &opts, bool no_progress) { + int r = 0; + librbd::Image image; + + // To prevent writesame from discarding data, thick_write sets + // the rbd_discard_on_zeroed_write_same option to false. + ceph_assert(g_ceph_context != nullptr); + r = g_conf().set_val("rbd_discard_on_zeroed_write_same", "false"); + ceph_assert(r == 0); + r = utils::open_image(io_ctx, image_name, false, &image); + if (r < 0) { + return r; + } + + r = write_data(image, opts, no_progress); + + image.close(); + + return r; +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librbd::ImageOptions opts; + r = utils::get_image_options(vm, true, &opts); + if (r < 0) { + return r; + } + + uint64_t size; + r = utils::get_image_size(vm, &size); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = 
utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = do_create(rbd, io_ctx, image_name.c_str(), size, opts); + if (!namespace_name.empty() && r == -ENOENT) { + std::cerr << "rbd: namespace not found - it must be created with " + << "'rbd namespace create' before creating an image." + << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl; + return r; + } + + if (vm.count(at::IMAGE_THICK_PROVISION) && vm[at::IMAGE_THICK_PROVISION].as<bool>()) { + r = thick_write(image_name, io_ctx, opts, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: image created but error encountered during thick provisioning: " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +Shell::Action action( + {"create"}, {}, "Create an empty image.", at::get_long_features_help(), + &get_arguments, &execute); + +} // namespace create +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Device.cc b/src/tools/rbd/action/Device.cc new file mode 100644 index 00000000..3fdf2ef5 --- /dev/null +++ b/src/tools/rbd/action/Device.cc @@ -0,0 +1,185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "acconfig.h" +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" + +#include <boost/program_options.hpp> + +#include "include/ceph_assert.h" + +namespace rbd { +namespace action { + +namespace at = argument_types; +namespace po = boost::program_options; + +#define DECLARE_DEVICE_OPERATIONS(ns) \ + namespace ns { \ + int execute_list(const po::variables_map &vm, \ + const std::vector<std::string> &ceph_global_args); \ + int execute_map(const po::variables_map &vm, \ + const std::vector<std::string> &ceph_global_args); \ + int execute_unmap(const po::variables_map &vm, \ + const std::vector<std::string> &ceph_global_args); \ + } + 
+DECLARE_DEVICE_OPERATIONS(ggate); +DECLARE_DEVICE_OPERATIONS(kernel); +DECLARE_DEVICE_OPERATIONS(nbd); + +namespace device { + +namespace { + +struct DeviceOperations { + int (*execute_list)(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_args); + int (*execute_map)(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_args); + int (*execute_unmap)(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_args); +}; + +const DeviceOperations ggate_operations = { + ggate::execute_list, + ggate::execute_map, + ggate::execute_unmap, +}; + +const DeviceOperations krbd_operations = { + kernel::execute_list, + kernel::execute_map, + kernel::execute_unmap, +}; + +const DeviceOperations nbd_operations = { + nbd::execute_list, + nbd::execute_map, + nbd::execute_unmap, +}; + +enum device_type_t { + DEVICE_TYPE_GGATE, + DEVICE_TYPE_KRBD, + DEVICE_TYPE_NBD, +}; + +struct DeviceType {}; + +void validate(boost::any& v, const std::vector<std::string>& values, + DeviceType *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + if (s == "ggate") { + v = boost::any(DEVICE_TYPE_GGATE); + } else if (s == "krbd") { + v = boost::any(DEVICE_TYPE_KRBD); + } else if (s == "nbd") { + v = boost::any(DEVICE_TYPE_NBD); + } else { + throw po::validation_error(po::validation_error::invalid_option_value); + } +} + +void add_device_type_option(po::options_description *options) { + options->add_options() + ("device-type,t", po::value<DeviceType>(), + "device type [ggate, krbd (default), nbd]"); +} + +void add_device_specific_options(po::options_description *options) { + options->add_options() + ("options,o", po::value<std::vector<std::string>>(), + "device specific options"); +} + +device_type_t get_device_type(const po::variables_map &vm) { + if (vm.count("device-type")) { + return vm["device-type"].as<device_type_t>(); + } + return DEVICE_TYPE_KRBD; +} 
+ +const DeviceOperations *get_device_operations(const po::variables_map &vm) { + switch (get_device_type(vm)) { + case DEVICE_TYPE_GGATE: + return &ggate_operations; + case DEVICE_TYPE_KRBD: + return &krbd_operations; + case DEVICE_TYPE_NBD: + return &nbd_operations; + default: + ceph_abort(); + return nullptr; + } +} + +} // anonymous namespace + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + add_device_type_option(options); + at::add_format_options(options); +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return (*get_device_operations(vm)->execute_list)(vm, ceph_global_init_args); +} + +void get_map_arguments(po::options_description *positional, + po::options_description *options) { + add_device_type_option(options); + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + options->add_options() + ("read-only", po::bool_switch(), "map read-only") + ("exclusive", po::bool_switch(), "disable automatic exclusive lock transitions"); + add_device_specific_options(options); +} + +int execute_map(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return (*get_device_operations(vm)->execute_map)(vm, ceph_global_init_args); +} + +void get_unmap_arguments(po::options_description *positional, + po::options_description *options) { + add_device_type_option(options); + positional->add_options() + ("image-or-snap-or-device-spec", + "image, snapshot, or device specification\n" + "[<pool-name>/]<image-name>[@<snapshot-name>] or <device-path>"); + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE); + add_device_specific_options(options); +} + +int execute_unmap(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return 
(*get_device_operations(vm)->execute_unmap)(vm, ceph_global_init_args); +} + +Shell::SwitchArguments switched_arguments({"read-only", "exclusive"}); +Shell::Action action_list( + {"device", "list"}, {"showmapped"}, "List mapped rbd images.", "", + &get_list_arguments, &execute_list); +// yet another alias for list command +Shell::Action action_ls( + {"device", "ls"}, {}, "List mapped rbd images.", "", + &get_list_arguments, &execute_list, false); + +Shell::Action action_map( + {"device", "map"}, {"map"}, "Map an image to a block device.", "", + &get_map_arguments, &execute_map); + +Shell::Action action_unmap( + {"device", "unmap"}, {"unmap"}, "Unmap a rbd device.", "", + &get_unmap_arguments, &execute_unmap); + +} // namespace device +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Diff.cc b/src/tools/rbd/action/Diff.cc new file mode 100644 index 00000000..3729469c --- /dev/null +++ b/src/tools/rbd/action/Diff.cc @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace diff { + +namespace at = argument_types; +namespace po = boost::program_options; + +struct output_method { + output_method() : f(NULL), t(NULL), empty(true) {} + Formatter *f; + TextTable *t; + bool empty; +}; + +static int diff_cb(uint64_t ofs, size_t len, int exists, void *arg) +{ + output_method *om = static_cast<output_method *>(arg); + om->empty = false; + if (om->f) { + om->f->open_object_section("extent"); + om->f->dump_unsigned("offset", ofs); + om->f->dump_unsigned("length", len); + om->f->dump_string("exists", exists ? 
"true" : "false"); + om->f->close_section(); + } else { + ceph_assert(om->t); + *(om->t) << ofs << len << (exists ? "data" : "zero") << TextTable::endrow; + } + return 0; +} + +static int do_diff(librbd::Image& image, const char *fromsnapname, + bool whole_object, Formatter *f) +{ + int r; + librbd::image_info_t info; + + r = image.stat(info, sizeof(info)); + if (r < 0) + return r; + + output_method om; + if (f) { + om.f = f; + f->open_array_section("extents"); + } else { + om.t = new TextTable(); + om.t->define_column("Offset", TextTable::LEFT, TextTable::LEFT); + om.t->define_column("Length", TextTable::LEFT, TextTable::LEFT); + om.t->define_column("Type", TextTable::LEFT, TextTable::LEFT); + } + + r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object, + diff_cb, &om); + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + if (!om.empty) + std::cout << *om.t; + delete om.t; + } + return r; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + options->add_options() + (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(), + "snapshot starting point") + (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object"); + at::add_format_options(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string from_snap_name; + if (vm.count(at::FROM_SNAPSHOT_NAME)) { + from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>(); + } + + bool 
diff_whole_object = vm[at::WHOLE_OBJECT].as<bool>(); + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_diff(image, from_snap_name.empty() ? nullptr : from_snap_name.c_str(), + diff_whole_object, formatter.get()); + if (r < 0) { + std::cerr << "rbd: diff error: " << cpp_strerror(r) << std::endl; + return -r; + } + return 0; +} + +Shell::SwitchArguments switched_arguments({at::WHOLE_OBJECT}); +Shell::Action action( + {"diff"}, {}, + "Print extents that differ since a previous snap, or image creation.", "", + &get_arguments, &execute); + +} // namespace diff +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/DiskUsage.cc b/src/tools/rbd/action/DiskUsage.cc new file mode 100644 index 00000000..649f39a7 --- /dev/null +++ b/src/tools/rbd/action/DiskUsage.cc @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/types.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <algorithm> +#include <iostream> +#include <boost/bind.hpp> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace disk_usage { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int disk_usage_callback(uint64_t offset, size_t len, int exists, + void *arg) { + uint64_t *used_size = reinterpret_cast<uint64_t *>(arg); + if (exists) { + (*used_size) += len; + } + return 0; +} + +static int compute_image_disk_usage(const std::string& name, + const std::string& 
snap_name, + const std::string& from_snap_name, + librbd::Image &image, uint64_t size, + bool exact, TextTable& tbl, Formatter *f, + uint64_t *used_size) { + const char* from = NULL; + if (!from_snap_name.empty()) { + from = from_snap_name.c_str(); + } + + uint64_t flags; + int r = image.get_flags(&flags); + if (r < 0) { + std::cerr << "rbd: failed to retrieve image flags: " << cpp_strerror(r) + << std::endl; + return r; + } + if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) { + std::cerr << "warning: fast-diff map is invalid for " << name + << (snap_name.empty() ? "" : "@" + snap_name) << ". " + << "operation may be slow." << std::endl; + } + + *used_size = 0; + r = image.diff_iterate2(from, 0, size, false, !exact, + &disk_usage_callback, used_size); + if (r < 0) { + std::cerr << "rbd: failed to iterate diffs: " << cpp_strerror(r) + << std::endl; + return r; + } + + if (f) { + f->open_object_section("image"); + f->dump_string("name", name); + if (!snap_name.empty()) { + f->dump_string("snapshot", snap_name); + } + f->dump_unsigned("provisioned_size", size); + f->dump_unsigned("used_size" , *used_size); + f->close_section(); + } else { + std::string full_name = name; + if (!snap_name.empty()) { + full_name += "@" + snap_name; + } + tbl << full_name + << stringify(byte_u_t(size)) + << stringify(byte_u_t(*used_size)) + << TextTable::endrow; + } + return 0; +} + +static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx, + const char *imgname, const char *snapname, + const char *from_snapname, bool exact, Formatter *f) { + std::vector<librbd::image_spec_t> images; + int r = rbd.list2(io_ctx, &images); + if (r == -ENOENT) { + r = 0; + } else if (r < 0) { + return r; + } + + TextTable tbl; + if (f) { + f->open_object_section("stats"); + f->open_array_section("images"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("PROVISIONED", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("USED", TextTable::LEFT, 
TextTable::RIGHT); + } + + uint32_t count = 0; + uint64_t used_size = 0; + uint64_t total_prov = 0; + uint64_t total_used = 0; + uint64_t snap_id = CEPH_NOSNAP; + uint64_t from_id = CEPH_NOSNAP; + bool found = false; + for (auto& image_spec : images) { + if (imgname != NULL && image_spec.name != imgname) { + continue; + } + found = true; + + librbd::Image image; + r = rbd.open_read_only(io_ctx, image, image_spec.name.c_str(), NULL); + if (r < 0) { + if (r != -ENOENT) { + std::cerr << "rbd: error opening " << image_spec.name << ": " + << cpp_strerror(r) << std::endl; + } + continue; + } + + uint64_t features; + r = image.features(&features); + if (r < 0) { + std::cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r) + << std::endl; + goto out; + } + if ((features & RBD_FEATURE_FAST_DIFF) == 0) { + std::cerr << "warning: fast-diff map is not enabled for " + << image_spec.name << ". " << "operation may be slow." + << std::endl; + } + + librbd::image_info_t info; + if (image.stat(info, sizeof(info)) < 0) { + r = -EINVAL; + goto out; + } + + std::vector<librbd::snap_info_t> snap_list; + r = image.snap_list(snap_list); + if (r < 0) { + std::cerr << "rbd: error opening " << image_spec.name << " snapshots: " + << cpp_strerror(r) << std::endl; + continue; + } + + snap_list.erase(remove_if(snap_list.begin(), + snap_list.end(), + boost::bind(utils::is_not_user_snap_namespace, &image, _1)), + snap_list.end()); + + bool found_from_snap = (from_snapname == nullptr); + bool found_snap = (snapname == nullptr); + bool found_from = (from_snapname == nullptr); + std::string last_snap_name; + std::sort(snap_list.begin(), snap_list.end(), + boost::bind(&librbd::snap_info_t::id, _1) < + boost::bind(&librbd::snap_info_t::id, _2)); + if (!found_snap || !found_from) { + for (auto &snap_info : snap_list) { + if (!found_snap && snap_info.name == snapname) { + snap_id = snap_info.id; + found_snap = true; + } + if (!found_from && snap_info.name == from_snapname) { + from_id = 
snap_info.id; + found_from = true; + } + if (found_snap && found_from) { + break; + } + } + } + if ((snapname != nullptr && snap_id == CEPH_NOSNAP) || + (from_snapname != nullptr && from_id == CEPH_NOSNAP)) { + std::cerr << "specified snapshot is not found." << std::endl; + return -ENOENT; + } + if (snap_id != CEPH_NOSNAP && from_id != CEPH_NOSNAP) { + if (from_id == snap_id) { + // no diskusage. + return 0; + } + if (from_id >= snap_id) { + return -EINVAL; + } + } + + for (std::vector<librbd::snap_info_t>::const_iterator snap = + snap_list.begin(); snap != snap_list.end(); ++snap) { + librbd::Image snap_image; + r = rbd.open_read_only(io_ctx, snap_image, image_spec.name.c_str(), + snap->name.c_str()); + if (r < 0) { + std::cerr << "rbd: error opening snapshot " << image_spec.name << "@" + << snap->name << ": " << cpp_strerror(r) << std::endl; + goto out; + } + + if (imgname == nullptr || found_from_snap || + (found_from_snap && snapname != nullptr && snap->name == snapname)) { + r = compute_image_disk_usage(image_spec.name, snap->name, + last_snap_name, snap_image, snap->size, + exact, tbl, f, &used_size); + if (r < 0) { + goto out; + } + + if (snapname != NULL) { + total_prov += snap->size; + } + total_used += used_size; + ++count; + } + + if (!found_from_snap && from_snapname != nullptr && + snap->name == from_snapname) { + found_from_snap = true; + } + if (snapname != nullptr && snap->name == snapname) { + break; + } + last_snap_name = snap->name; + } + + if (snapname == NULL) { + r = compute_image_disk_usage(image_spec.name, "", last_snap_name, image, + info.size, exact, tbl, f, &used_size); + if (r < 0) { + goto out; + } + total_prov += info.size; + total_used += used_size; + ++count; + } + } + if (imgname != nullptr && !found) { + std::cerr << "specified image " << imgname << " is not found." 
<< std::endl; + return -ENOENT; + } + +out: + if (f) { + f->close_section(); + if (imgname == NULL) { + f->dump_unsigned("total_provisioned_size", total_prov); + f->dump_unsigned("total_used_size", total_used); + } + f->close_section(); + f->flush(std::cout); + } else if (!images.empty()) { + if (count > 1) { + tbl << "<TOTAL>" + << stringify(byte_u_t(total_prov)) + << stringify(byte_u_t(total_used)) + << TextTable::endrow; + } + std::cout << tbl; + } + + return r < 0 ? r : 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + at::add_format_options(options); + options->add_options() + (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(), + "snapshot starting point") + ("exact", po::bool_switch(), "compute exact disk usage (slow)"); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, vm.count(at::FROM_SNAPSHOT_NAME), + utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string from_snap_name; + if (vm.count(at::FROM_SNAPSHOT_NAME)) { + from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>(); + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + utils::disable_cache(); + + librbd::RBD rbd; + r = do_disk_usage(rbd, io_ctx, + image_name.empty() ? nullptr: image_name.c_str(), + snap_name.empty() ? 
nullptr : snap_name.c_str(), + from_snap_name.empty() ? nullptr : from_snap_name.c_str(), + vm["exact"].as<bool>(), formatter.get()); + if (r < 0) { + std::cerr << "rbd: du failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"disk-usage"}, {"du"}, "Show disk usage stats for pool, image or snapshot.", + "", &get_arguments, &execute); + +} // namespace disk_usage +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Export.cc b/src/tools/rbd/action/Export.cc new file mode 100644 index 00000000..b5b82f4c --- /dev/null +++ b/src/tools/rbd/action/Export.cc @@ -0,0 +1,651 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/Context.h" +#include "common/errno.h" +#include "common/Throttle.h" +#include "include/encoding.h" +#include <iostream> +#include <fcntl.h> +#include <stdlib.h> +#include <boost/program_options.hpp> +#include <boost/scope_exit.hpp> + +namespace rbd { +namespace action { +namespace export_full { + +struct ExportDiffContext { + librbd::Image *image; + int fd; + int export_format; + uint64_t totalsize; + utils::ProgressContext pc; + OrderedThrottle throttle; + + ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops, + bool no_progress, int eformat) : + image(i), fd(f), export_format(eformat), totalsize(t), pc("Exporting image", no_progress), + throttle(max_ops, true) { + } +}; + +class C_ExportDiff : public Context { +public: + C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length, + bool exists, int export_format) + : m_export_diff_context(edc), m_offset(offset), m_length(length), + m_exists(exists), m_export_format(export_format) { + } + + int send() { + if (m_export_diff_context->throttle.pending_error()) { + return 
m_export_diff_context->throttle.wait_for_ret(); + } + + C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this); + if (m_exists) { + librbd::RBD::AioCompletion *aio_completion = + new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback); + + int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + int r = m_export_diff_context->image->aio_read2( + m_offset, m_length, m_read_data, aio_completion, op_flags); + if (r < 0) { + aio_completion->release(); + ctx->complete(r); + } + } else { + ctx->complete(0); + } + return 0; + } + + static int export_diff_cb(uint64_t offset, size_t length, int exists, + void *arg) { + ExportDiffContext *edc = reinterpret_cast<ExportDiffContext *>(arg); + + C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists, edc->export_format); + return context->send(); + } + +protected: + void finish(int r) override { + if (r >= 0) { + if (m_exists) { + m_exists = !m_read_data.is_zero(); + } + r = write_extent(m_export_diff_context, m_offset, m_length, m_exists, m_export_format); + if (r == 0 && m_exists) { + r = m_read_data.write_fd(m_export_diff_context->fd); + } + } + m_export_diff_context->throttle.end_op(r); + } + +private: + ExportDiffContext *m_export_diff_context; + uint64_t m_offset; + uint64_t m_length; + bool m_exists; + int m_export_format; + bufferlist m_read_data; + + static int write_extent(ExportDiffContext *edc, uint64_t offset, + uint64_t length, bool exists, int export_format) { + // extent + bufferlist bl; + __u8 tag = exists ? 
RBD_DIFF_WRITE : RBD_DIFF_ZERO; + uint64_t len = 0; + encode(tag, bl); + if (export_format == 2) { + if (tag == RBD_DIFF_WRITE) + len = 8 + 8 + length; + else + len = 8 + 8; + encode(len, bl); + } + encode(offset, bl); + encode(length, bl); + int r = bl.write_fd(edc->fd); + + edc->pc.update_progress(offset, edc->totalsize); + return r; + } +}; + + +int do_export_diff_fd(librbd::Image& image, const char *fromsnapname, + const char *endsnapname, bool whole_object, + int fd, bool no_progress, int export_format) +{ + int r; + librbd::image_info_t info; + + r = image.stat(info, sizeof(info)); + if (r < 0) + return r; + + { + // header + bufferlist bl; + if (export_format == 1) + bl.append(utils::RBD_DIFF_BANNER); + else + bl.append(utils::RBD_DIFF_BANNER_V2); + + __u8 tag; + uint64_t len = 0; + if (fromsnapname) { + tag = RBD_DIFF_FROM_SNAP; + encode(tag, bl); + std::string from(fromsnapname); + if (export_format == 2) { + len = from.length() + 4; + encode(len, bl); + } + encode(from, bl); + } + + if (endsnapname) { + tag = RBD_DIFF_TO_SNAP; + encode(tag, bl); + std::string to(endsnapname); + if (export_format == 2) { + len = to.length() + 4; + encode(len, bl); + } + encode(to, bl); + } + + if (endsnapname && export_format == 2) { + tag = RBD_SNAP_PROTECTION_STATUS; + encode(tag, bl); + bool is_protected = false; + r = image.snap_is_protected(endsnapname, &is_protected); + if (r < 0) { + return r; + } + len = 8; + encode(len, bl); + encode(is_protected, bl); + } + + tag = RBD_DIFF_IMAGE_SIZE; + encode(tag, bl); + uint64_t endsize = info.size; + if (export_format == 2) { + len = 8; + encode(len, bl); + } + encode(endsize, bl); + + r = bl.write_fd(fd); + if (r < 0) { + return r; + } + } + ExportDiffContext edc(&image, fd, info.size, + g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), + no_progress, export_format); + r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object, + &C_ExportDiff::export_diff_cb, (void *)&edc); + if (r < 0) { + goto out; 
+ } + + r = edc.throttle.wait_for_ret(); + if (r < 0) { + goto out; + } + + { + __u8 tag = RBD_DIFF_END; + bufferlist bl; + encode(tag, bl); + r = bl.write_fd(fd); + } + +out: + if (r < 0) + edc.pc.fail(); + else + edc.pc.finish(); + + return r; +} + +int do_export_diff(librbd::Image& image, const char *fromsnapname, + const char *endsnapname, bool whole_object, + const char *path, bool no_progress) +{ + int r; + int fd; + + if (strcmp(path, "-") == 0) + fd = STDOUT_FILENO; + else + fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (fd < 0) + return -errno; + + r = do_export_diff_fd(image, fromsnapname, endsnapname, whole_object, fd, no_progress, 1); + + if (fd != 1) + close(fd); + if (r < 0 && fd != 1) { + remove(path); + } + + return r; +} + + +namespace at = argument_types; +namespace po = boost::program_options; + +void get_arguments_diff(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_SOURCE); + at::add_path_options(positional, options, + "export file (or '-' for stdout)"); + options->add_options() + (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(), + "snapshot starting point") + (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object"); + at::add_no_progress_option(options); +} + +int execute_diff(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string path; + r = utils::get_path(vm, &arg_index, &path); + if (r < 0) { + return r; + } + + std::string from_snap_name; + if 
(vm.count(at::FROM_SNAPSHOT_NAME)) { + from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>(); + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_export_diff(image, + from_snap_name.empty() ? nullptr : from_snap_name.c_str(), + snap_name.empty() ? nullptr : snap_name.c_str(), + vm[at::WHOLE_OBJECT].as<bool>(), path.c_str(), + vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: export-diff error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::SwitchArguments switched_arguments({at::WHOLE_OBJECT}); +Shell::Action action_diff( + {"export-diff"}, {}, "Export incremental diff to file.", "", + &get_arguments_diff, &execute_diff); + +class C_Export : public Context +{ +public: + C_Export(OrderedThrottle &ordered_throttle, librbd::Image &image, + uint64_t fd_offset, uint64_t offset, uint64_t length, int fd) + : m_throttle(ordered_throttle), m_image(image), m_dest_offset(fd_offset), + m_offset(offset), m_length(length), m_fd(fd) + { + } + + void send() + { + auto ctx = m_throttle.start_op(this); + auto aio_completion = new librbd::RBD::AioCompletion( + ctx, &utils::aio_context_callback); + int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + int r = m_image.aio_read2(m_offset, m_length, m_bufferlist, + aio_completion, op_flags); + if (r < 0) { + cerr << "rbd: error requesting read from source image" << std::endl; + aio_completion->release(); + m_throttle.end_op(r); + } + } + + void finish(int r) override + { + BOOST_SCOPE_EXIT((&m_throttle) (&r)) + { + m_throttle.end_op(r); + } BOOST_SCOPE_EXIT_END + + if (r < 0) { + cerr << "rbd: error reading from source image at offset " + << m_offset << ": " << cpp_strerror(r) << std::endl; + return; + } + + ceph_assert(m_bufferlist.length() == 
static_cast<size_t>(r)); + if (m_fd != STDOUT_FILENO) { + if (m_bufferlist.is_zero()) { + return; + } + + uint64_t chkret = lseek64(m_fd, m_dest_offset, SEEK_SET); + if (chkret != m_dest_offset) { + cerr << "rbd: error seeking destination image to offset " + << m_dest_offset << std::endl; + r = -errno; + return; + } + } + + r = m_bufferlist.write_fd(m_fd); + if (r < 0) { + cerr << "rbd: error writing to destination image at offset " + << m_dest_offset << std::endl; + } + } + +private: + OrderedThrottle &m_throttle; + librbd::Image &m_image; + bufferlist m_bufferlist; + uint64_t m_dest_offset; + uint64_t m_offset; + uint64_t m_length; + int m_fd; +}; + +const uint32_t MAX_KEYS = 64; + +static int do_export_v2(librbd::Image& image, librbd::image_info_t &info, int fd, + uint64_t period, int max_concurrent_ops, utils::ProgressContext &pc) +{ + int r = 0; + // header + bufferlist bl; + bl.append(utils::RBD_IMAGE_BANNER_V2); + + __u8 tag; + uint64_t length; + // encode order + tag = RBD_EXPORT_IMAGE_ORDER; + length = 8; + encode(tag, bl); + encode(length, bl); + encode(uint64_t(info.order), bl); + + // encode features + tag = RBD_EXPORT_IMAGE_FEATURES; + uint64_t features; + image.features(&features); + length = 8; + encode(tag, bl); + encode(length, bl); + encode(features, bl); + + // encode stripe_unit and stripe_count + tag = RBD_EXPORT_IMAGE_STRIPE_UNIT; + uint64_t stripe_unit; + stripe_unit = image.get_stripe_unit(); + length = 8; + encode(tag, bl); + encode(length, bl); + encode(stripe_unit, bl); + + tag = RBD_EXPORT_IMAGE_STRIPE_COUNT; + uint64_t stripe_count; + stripe_count = image.get_stripe_count(); + length = 8; + encode(tag, bl); + encode(length, bl); + encode(stripe_count, bl); + + //retrieve metadata of image + std::map<std::string, string> imagemetas; + std::string last_key; + bool more_results = true; + while (more_results) { + std::map<std::string, bufferlist> pairs; + r = image.metadata_list(last_key, MAX_KEYS, &pairs); + if (r < 0) { + std::cerr << 
"failed to retrieve metadata of image : " << cpp_strerror(r) + << std::endl; + return r; + } + + if (!pairs.empty()) { + last_key = pairs.rbegin()->first; + + for (auto kv : pairs) { + std::string key = kv.first; + std::string val(kv.second.c_str(), kv.second.length()); + imagemetas[key] = val; + } + } + more_results = (pairs.size() == MAX_KEYS); + } + + //encode imageMeta key and value + for (std::map<std::string, string>::iterator it = imagemetas.begin(); + it != imagemetas.end(); ++it) { + string key = it->first; + string value = it->second; + + tag = RBD_EXPORT_IMAGE_META; + length = key.length() + value.length() + 4 * 2; + encode(tag, bl); + encode(length, bl); + encode(key, bl); + encode(value, bl); + } + + // encode end tag + tag = RBD_EXPORT_IMAGE_END; + encode(tag, bl); + + // write bl to fd. + r = bl.write_fd(fd); + if (r < 0) { + return r; + } + + // header for snapshots + bl.clear(); + bl.append(utils::RBD_IMAGE_DIFFS_BANNER_V2); + + std::vector<librbd::snap_info_t> snaps; + r = image.snap_list(snaps); + if (r < 0) { + return r; + } + + uint64_t diff_num = snaps.size() + 1; + encode(diff_num, bl); + + r = bl.write_fd(fd); + if (r < 0) { + return r; + } + + const char *last_snap = NULL; + for (size_t i = 0; i < snaps.size(); ++i) { + utils::snap_set(image, snaps[i].name.c_str()); + r = do_export_diff_fd(image, last_snap, snaps[i].name.c_str(), false, fd, true, 2); + if (r < 0) { + return r; + } + pc.update_progress(i, snaps.size() + 1); + last_snap = snaps[i].name.c_str(); + } + utils::snap_set(image, std::string("")); + r = do_export_diff_fd(image, last_snap, nullptr, false, fd, true, 2); + if (r < 0) { + return r; + } + pc.update_progress(snaps.size() + 1, snaps.size() + 1); + return r; +} + +static int do_export_v1(librbd::Image& image, librbd::image_info_t &info, + int fd, uint64_t period, int max_concurrent_ops, + utils::ProgressContext &pc) +{ + int r = 0; + size_t file_size = 0; + OrderedThrottle throttle(max_concurrent_ops, false); + for 
(uint64_t offset = 0; offset < info.size; offset += period) { + if (throttle.pending_error()) { + break; + } + + uint64_t length = min(period, info.size - offset); + C_Export *ctx = new C_Export(throttle, image, file_size + offset, offset, + length, fd); + ctx->send(); + + pc.update_progress(offset, info.size); + } + + file_size += info.size; + r = throttle.wait_for_ret(); + if (fd != 1) { + if (r >= 0) { + r = ftruncate(fd, file_size); + if (r < 0) + return r; + + uint64_t chkret = lseek64(fd, file_size, SEEK_SET); + if (chkret != file_size) + r = errno; + } + } + return r; +} + +static int do_export(librbd::Image& image, const char *path, bool no_progress, + int export_format) +{ + librbd::image_info_t info; + int64_t r = image.stat(info, sizeof(info)); + if (r < 0) + return r; + + int fd; + int max_concurrent_ops = g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"); + bool to_stdout = (strcmp(path, "-") == 0); + if (to_stdout) { + fd = STDOUT_FILENO; + } else { + fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (fd < 0) { + return -errno; + } +#ifdef HAVE_POSIX_FADVISE + posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + } + + utils::ProgressContext pc("Exporting image", no_progress); + uint64_t period = image.get_stripe_count() * (1ull << info.order); + + if (export_format == 1) + r = do_export_v1(image, info, fd, period, max_concurrent_ops, pc); + else + r = do_export_v2(image, info, fd, period, max_concurrent_ops, pc); + + if (r < 0) + pc.fail(); + else + pc.finish(); + if (!to_stdout) + close(fd); + return r; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_SOURCE); + at::add_path_options(positional, options, + "export file (or '-' for stdout)"); + at::add_no_progress_option(options); + at::add_export_format_option(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> 
&ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string path; + r = utils::get_path(vm, &arg_index, &path); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + int format = 1; + if (vm.count("export-format")) + format = vm["export-format"].as<uint64_t>(); + + r = do_export(image, path.c_str(), vm[at::NO_PROGRESS].as<bool>(), format); + if (r < 0) { + std::cerr << "rbd: export error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"export"}, {}, "Export image to file.", "", &get_arguments, &execute); + +} // namespace export_full +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Feature.cc b/src/tools/rbd/action/Feature.cc new file mode 100644 index 00000000..13a7b6ea --- /dev/null +++ b/src/tools/rbd/action/Feature.cc @@ -0,0 +1,116 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/stringify.h" +#include "common/errno.h" +#include <iostream> +#include <map> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace feature { + +namespace at = argument_types; +namespace po = boost::program_options; + +void get_arguments(po::options_description *positional, + po::options_description *options, bool enabled) { + 
at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + positional->add_options() + ("features", po::value<at::ImageFeatures>()->multitoken(), + ("image features\n" + at::get_short_features_help(false)).c_str()); + if (enabled) { + at::add_create_journal_options(options); + } +} + +void get_arguments_disable(po::options_description *positional, + po::options_description *options) { + get_arguments(positional, options, false); +} + +void get_arguments_enable(po::options_description *positional, + po::options_description *options) { + get_arguments(positional, options, true); +} + +int execute(const po::variables_map &vm, bool enabled) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librbd::ImageOptions opts; + r = utils::get_journal_options(vm, &opts); + if (r < 0) { + return r; + } + + std::vector<std::string> feature_names; + if (vm.count(at::POSITIONAL_ARGUMENTS)) { + const std::vector<std::string> &args = + vm[at::POSITIONAL_ARGUMENTS].as<std::vector<std::string> >(); + feature_names.insert(feature_names.end(), args.begin() + arg_index, + args.end()); + } + + if (feature_names.empty()) { + std::cerr << "rbd: at least one feature name must be specified" + << std::endl; + return -EINVAL; + } + + boost::any features_any(static_cast<uint64_t>(0)); + at::ImageFeatures image_features; + at::validate(features_any, feature_names, &image_features, 0); + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = 
image.update_features(boost::any_cast<uint64_t>(features_any), enabled); + if (r < 0) { + std::cerr << "rbd: failed to update image features: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +int execute_disable(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return execute(vm, false); +} + +int execute_enable(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return execute(vm, true); +} + +Shell::Action action_disable( + {"feature", "disable"}, {}, "Disable the specified image feature.", "", + &get_arguments_disable, &execute_disable); +Shell::Action action_enable( + {"feature", "enable"}, {}, "Enable the specified image feature.", "", + &get_arguments_enable, &execute_enable); + +} // namespace feature +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Flatten.cc b/src/tools/rbd/action/Flatten.cc new file mode 100644 index 00000000..ec4e837a --- /dev/null +++ b/src/tools/rbd/action/Flatten.cc @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace flatten { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_flatten(librbd::Image& image, bool no_progress) +{ + utils::ProgressContext pc("Image flatten", no_progress); + int r = image.flatten_with_progress(pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute(const po::variables_map &vm, + const 
std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_flatten(image, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: flatten error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"flatten"}, {}, "Fill clone with parent data (make it independent).", "", + &get_arguments, &execute); + +} // namespace flatten +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Ggate.cc b/src/tools/rbd/action/Ggate.cc new file mode 100644 index 00000000..61f77be2 --- /dev/null +++ b/src/tools/rbd/action/Ggate.cc @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/param.h> +#include <errno.h> +#include <unistd.h> + +#include "include/stringify.h" +#include "common/SubProcess.h" + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" + +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/predicate.hpp> +#include <boost/program_options.hpp> + +#include <iostream> + +namespace rbd { +namespace action { +namespace ggate { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int call_ggate_cmd(const po::variables_map &vm, + const std::vector<std::string> &args, + const std::vector<std::string> &ceph_global_args) { + SubProcess 
/**
 * Expand comma-separated option strings into "--<name>" arguments.
 *
 * Each entry of @p options is split on ','; every non-empty token is
 * appended to @p args prefixed with "--".  Empty tokens (from "", "a,,b" or
 * a trailing comma) are skipped so that a bare "--" is never passed on to
 * the child command.
 *
 * @param options  raw option strings
 * @param args     argument vector to append to
 * @return always 0
 */
int parse_options(const std::vector<std::string> &options,
                  std::vector<std::string> *args) {
  for (const auto &opts : options) {
    std::string::size_type begin = 0;
    while (begin < opts.size()) {
      std::string::size_type end = opts.find(',', begin);
      if (end == std::string::npos) {
        end = opts.size();
      }
      if (end > begin) {
        args->push_back("--" + opts.substr(begin, end - begin));
      }
      begin = end + 1;
    }
  }

  return 0;
}
args.push_back("--pretty-format"); + } + + return call_ggate_cmd(vm, args, ceph_global_init_args); +} + +int execute_map(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { +#if !defined(__FreeBSD__) + std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl; + return -EOPNOTSUPP; +#endif + std::vector<std::string> args; + + args.push_back("map"); + std::string img; + int r = get_image_or_snap_spec(vm, &img); + if (r < 0) { + return r; + } + args.push_back(img); + + if (vm["read-only"].as<bool>()) { + args.push_back("--read-only"); + } + + if (vm["exclusive"].as<bool>()) { + args.push_back("--exclusive"); + } + + if (vm.count("options")) { + r = parse_options(vm["options"].as<std::vector<std::string>>(), &args); + if (r < 0) { + return r; + } + } + + return call_ggate_cmd(vm, args, ceph_global_init_args); +} + +int execute_unmap(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { +#if !defined(__FreeBSD__) + std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl; + return -EOPNOTSUPP; +#endif + std::string device_name = utils::get_positional_argument(vm, 0); + if (!boost::starts_with(device_name, "/dev/")) { + device_name.clear(); + } + + std::string image_name; + if (device_name.empty()) { + int r = get_image_or_snap_spec(vm, &image_name); + if (r < 0) { + return r; + } + } + + if (device_name.empty() && image_name.empty()) { + std::cerr << "rbd: unmap requires either image name or device path" + << std::endl; + return -EINVAL; + } + + std::vector<std::string> args; + + args.push_back("unmap"); + args.push_back(device_name.empty() ? 
image_name : device_name); + + if (vm.count("options")) { + int r = parse_options(vm["options"].as<std::vector<std::string>>(), &args); + if (r < 0) { + return r; + } + } + + return call_ggate_cmd(vm, args, ceph_global_init_args); +} + +} // namespace ggate +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Group.cc b/src/tools/rbd/action/Group.cc new file mode 100644 index 00000000..8554ae3b --- /dev/null +++ b/src/tools/rbd/action/Group.cc @@ -0,0 +1,904 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <iostream> + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_types.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" + +namespace rbd { +namespace action { +namespace group { + +namespace at = argument_types; +namespace po = boost::program_options; + +static const std::string GROUP_SPEC("group-spec"); +static const std::string GROUP_SNAP_SPEC("group-snap-spec"); + +static const std::string GROUP_NAME("group"); +static const std::string DEST_GROUP_NAME("dest-group"); + +static const std::string GROUP_POOL_NAME("group-" + at::POOL_NAME); +static const std::string IMAGE_POOL_NAME("image-" + at::POOL_NAME); + +void add_group_option(po::options_description *opt, + at::ArgumentModifier modifier) { + std::string name = GROUP_NAME; + std::string description = at::get_description_prefix(modifier) + "group name"; + switch (modifier) { + case at::ARGUMENT_MODIFIER_NONE: + case at::ARGUMENT_MODIFIER_SOURCE: + break; + case at::ARGUMENT_MODIFIER_DEST: + name = DEST_GROUP_NAME; + break; + } + + // TODO add validator + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_prefixed_pool_option(po::options_description *opt, + const std::string &prefix) { + std::string name = prefix + "-" + 
at::POOL_NAME; + std::string description = prefix + " pool name"; + + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_prefixed_namespace_option(po::options_description *opt, + const std::string &prefix) { + std::string name = prefix + "-" + at::NAMESPACE_NAME; + std::string description = prefix + " namespace name"; + + opt->add_options() + (name.c_str(), po::value<std::string>(), description.c_str()); +} + +void add_group_spec_options(po::options_description *pos, + po::options_description *opt, + at::ArgumentModifier modifier, + bool snap) { + at::add_pool_option(opt, modifier); + at::add_namespace_option(opt, modifier); + add_group_option(opt, modifier); + if (!snap) { + pos->add_options() + ((get_name_prefix(modifier) + GROUP_SPEC).c_str(), + (get_description_prefix(modifier) + "group specification\n" + + "(example: [<pool-name>/[<namespace>/]]<group-name>)").c_str()); + } else { + add_snap_option(opt, modifier); + pos->add_options() + ((get_name_prefix(modifier) + GROUP_SNAP_SPEC).c_str(), + (get_description_prefix(modifier) + "group specification\n" + + "(example: [<pool-name>/[<namespace>/]]<group-name>@<snap-name>)").c_str()); + } +} + +int execute_create(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + + std::string pool_name; + std::string namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + librbd::RBD rbd; + r = rbd.group_create(io_ctx, group_name.c_str()); + if (r < 0) { + std::cerr << "rbd: create error: " << 
cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + Formatter *f = formatter.get(); + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + std::vector<std::string> names; + r = rbd.group_list(io_ctx, &names); + if (r < 0) + return r; + + if (f) + f->open_array_section("groups"); + for (auto i : names) { + if (f) + f->dump_string("name", i); + else + std::cout << i << std::endl; + } + if (f) { + f->close_section(); + f->flush(std::cout); + } + + return 0; +} + +int execute_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + + std::string pool_name; + std::string namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + librbd::RBD rbd; + + r = rbd.group_remove(io_ctx, group_name.c_str()); + if (r < 0) { + std::cerr << "rbd: remove error: " << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_rename(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index 
= 0; + + std::string pool_name; + std::string namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + std::string dest_pool_name; + std::string dest_namespace_name; + std::string dest_group_name; + + r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, at::DEST_POOL_NAME, + &dest_pool_name, &dest_namespace_name, DEST_GROUP_NAME, "group", + &dest_group_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + if (pool_name != dest_pool_name) { + std::cerr << "rbd: group rename across pools not supported" << std::endl + << "source pool: " << pool_name << ", dest pool: " + << dest_pool_name << std::endl; + return -EINVAL; + } else if (namespace_name != dest_namespace_name) { + std::cerr << "rbd: group rename across namespaces not supported" + << std::endl + << "source namespace: " << namespace_name << ", dest namespace: " + << dest_namespace_name << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.group_rename(io_ctx, group_name.c_str(), + dest_group_name.c_str()); + + if (r < 0) { + std::cerr << "rbd: failed to rename group: " + << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_add(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + // Parse group data. 
+ std::string group_pool_name; + std::string group_namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME, + &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + std::string image_pool_name; + std::string image_namespace_name; + std::string image_name; + + r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME, + &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image", + &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + if (group_namespace_name != image_namespace_name) { + std::cerr << "rbd: group and image namespace must match." << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx cg_io_ctx; + r = utils::init(group_pool_name, group_namespace_name, &rados, &cg_io_ctx); + if (r < 0) { + return r; + } + + librados::IoCtx image_io_ctx; + r = utils::init(image_pool_name, group_namespace_name, &rados, &image_io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.group_image_add(cg_io_ctx, group_name.c_str(), + image_io_ctx, image_name.c_str()); + if (r < 0) { + std::cerr << "rbd: add image error: " << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_remove_image(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + + std::string group_pool_name; + std::string group_namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME, + &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + 
if (r < 0) { + return r; + } + + std::string image_pool_name; + std::string image_namespace_name; + std::string image_name; + std::string image_id; + + if (vm.count(at::IMAGE_ID)) { + image_id = vm[at::IMAGE_ID].as<std::string>(); + } + + r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME, + &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image", + &image_name, nullptr, image_id.empty(), utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + if (group_namespace_name != image_namespace_name) { + std::cerr << "rbd: group and image namespace must match." << std::endl; + return -EINVAL; + } else if (!image_id.empty() && !image_name.empty()) { + std::cerr << "rbd: trying to access image using both name and id. " + << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx cg_io_ctx; + r = utils::init(group_pool_name, group_namespace_name, &rados, &cg_io_ctx); + if (r < 0) { + return r; + } + + librados::IoCtx image_io_ctx; + r = utils::init(image_pool_name, group_namespace_name, &rados, &image_io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + if (image_id.empty()) { + r = rbd.group_image_remove(cg_io_ctx, group_name.c_str(), + image_io_ctx, image_name.c_str()); + } else { + r = rbd.group_image_remove_by_id(cg_io_ctx, group_name.c_str(), + image_io_ctx, image_id.c_str()); + } + if (r < 0) { + std::cerr << "rbd: remove image error: " << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_list_images(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, + 
utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + Formatter *f = formatter.get(); + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + std::vector<librbd::group_image_info_t> images; + + r = rbd.group_image_list(io_ctx, group_name.c_str(), &images, + sizeof(librbd::group_image_info_t)); + + if (r == -ENOENT) + r = 0; + + if (r < 0) + return r; + + std::sort(images.begin(), images.end(), + [](const librbd::group_image_info_t &lhs, + const librbd::group_image_info_t &rhs) { + if (lhs.pool != rhs.pool) { + return lhs.pool < rhs.pool; + } + return lhs.name < rhs.name; + } + ); + + if (f) + f->open_array_section("images"); + + for (auto image : images) { + std::string image_name = image.name; + int state = image.state; + std::string state_string; + if (RBD_GROUP_IMAGE_STATE_INCOMPLETE == state) { + state_string = "incomplete"; + } + + std::string pool_name = ""; + + librados::Rados rados(io_ctx); + librados::IoCtx pool_io_ctx; + r = rados.ioctx_create2(image.pool, pool_io_ctx); + if (r < 0) { + pool_name = "<missing image pool " + stringify(image.pool) + ">"; + } else { + pool_name = pool_io_ctx.get_pool_name(); + } + + if (f) { + f->open_object_section("image"); + f->dump_string("image", image_name); + f->dump_string("pool", pool_name); + f->dump_string("namespace", io_ctx.get_namespace()); + f->dump_int("state", state); + f->close_section(); + } else { + std::cout << pool_name << "/"; + if (!io_ctx.get_namespace().empty()) { + std::cout << io_ctx.get_namespace() << "/"; + } + std::cout << image_name << " " << state_string << std::endl; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } + + return 0; +} + +int execute_group_snap_create(const po::variables_map &vm, + const 
std::vector<std::string> &global_args) { + size_t arg_index = 0; + + std::string pool_name; + std::string namespace_name; + std::string group_name; + std::string snap_name; + + // a snapshot name is mandatory here (SNAPSHOT_PRESENCE_REQUIRED) + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true, + utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librados::IoCtx io_ctx; + librados::Rados rados; + + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.group_snap_create(io_ctx, group_name.c_str(), snap_name.c_str()); + if (r < 0) { + // report the failure like the snap remove/rename actions do instead + // of returning without any diagnostic + std::cerr << "rbd: failed to create group snapshot: " + << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_group_snap_remove(const po::variables_map &vm, + const std::vector<std::string> &global_args) { + size_t arg_index = 0; + + std::string pool_name; + std::string namespace_name; + std::string group_name; + std::string snap_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true, + utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librados::IoCtx io_ctx; + librados::Rados rados; + + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.group_snap_remove(io_ctx, group_name.c_str(), snap_name.c_str()); + if (r < 0) { + std::cerr << "rbd: failed to remove group snapshot: " + << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_group_snap_rename(const po::variables_map &vm, + const std::vector<std::string> &global_args) { + size_t arg_index = 0; + + std::string pool_name; + std::string namespace_name; + std::string group_name; + std::string source_snap_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, 
at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, &source_snap_name, true, + utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + std::string dest_snap_name; + if (vm.count(at::DEST_SNAPSHOT_NAME)) { + dest_snap_name = vm[at::DEST_SNAPSHOT_NAME].as<std::string>(); + } + + if (dest_snap_name.empty()) { + dest_snap_name = utils::get_positional_argument(vm, arg_index++); + } + + if (dest_snap_name.empty()) { + std::cerr << "rbd: destination snapshot name was not specified" + << std::endl; + return -EINVAL; + } + + r = utils::validate_snapshot_name(at::ARGUMENT_MODIFIER_DEST, dest_snap_name, + utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_SNAP); + if (r < 0) { + return r; + } + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.group_snap_rename(io_ctx, group_name.c_str(), + source_snap_name.c_str(), dest_snap_name.c_str()); + + if (r < 0) { + std::cerr << "rbd: failed to rename group snapshot: " + << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_group_snap_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string group_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + Formatter *f = formatter.get(); + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + 
if (r < 0) { + return r; + } + + librbd::RBD rbd; + std::vector<librbd::group_snap_info_t> snaps; + + r = rbd.group_snap_list(io_ctx, group_name.c_str(), &snaps, + sizeof(librbd::group_snap_info_t)); + + if (r == -ENOENT) { + r = 0; + } + if (r < 0) { + return r; + } + + TextTable t; + if (f) { + f->open_array_section("group_snaps"); + } else { + t.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + t.define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + } + + for (auto i : snaps) { + std::string snap_name = i.name; + int state = i.state; + std::string state_string; + if (RBD_GROUP_SNAP_STATE_INCOMPLETE == state) { + state_string = "incomplete"; + } else { + state_string = "ok"; + } + if (r < 0) { + return r; + } + if (f) { + f->open_object_section("group_snap"); + f->dump_string("snapshot", snap_name); + f->dump_string("state", state_string); + f->close_section(); + } else { + t << snap_name << state_string << TextTable::endrow; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else if (snaps.size()) { + std::cout << t; + } + return 0; +} + +int execute_group_snap_rollback(const po::variables_map &vm, + const std::vector<std::string> &global_args) { + size_t arg_index = 0; + + std::string group_name; + std::string namespace_name; + std::string pool_name; + std::string snap_name; + + int r = utils::get_pool_generic_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, + &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true, + utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librados::IoCtx io_ctx; + librados::Rados rados; + + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + utils::ProgressContext pc("Rolling back to group snapshot", + vm[at::NO_PROGRESS].as<bool>()); + r = rbd.group_snap_rollback_with_progress(io_ctx, group_name.c_str(), + snap_name.c_str(), pc); + if 
(r < 0) { + pc.fail(); + std::cerr << "rbd: rollback group to snapshot failed: " + << cpp_strerror(r) << std::endl; + return r; + } + + pc.finish(); + return 0; +} + +void get_create_arguments(po::options_description *positional, + po::options_description *options) { + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + false); +} + +void get_remove_arguments(po::options_description *positional, + po::options_description *options) { + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + false); +} + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); + at::add_format_options(options); +} + +void get_rename_arguments(po::options_description *positional, + po::options_description *options) { + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE, + false); + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST, + false); +} + +void get_add_arguments(po::options_description *positional, + po::options_description *options) { + positional->add_options() + (GROUP_SPEC.c_str(), + "group specification\n" + "(example: [<pool-name>/[<namespace>/]]<group-name>)"); + + add_prefixed_pool_option(options, "group"); + add_prefixed_namespace_option(options, "group"); + add_group_option(options, at::ARGUMENT_MODIFIER_NONE); + + positional->add_options() + (at::IMAGE_SPEC.c_str(), + "image specification\n" + "(example: [<pool-name>/[<namespace>/]]<image-name>)"); + + add_prefixed_pool_option(options, "image"); + add_prefixed_namespace_option(options, "image"); + at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE); + + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, + " unless overridden"); +} + +void get_remove_image_arguments(po::options_description *positional, + po::options_description *options) { + positional->add_options() + (GROUP_SPEC.c_str(), + "group specification\n" + "(example: 
[<pool-name>/[<namespace>/]]<group-name>)"); + + add_prefixed_pool_option(options, "group"); + add_prefixed_namespace_option(options, "group"); + add_group_option(options, at::ARGUMENT_MODIFIER_NONE); + + positional->add_options() + (at::IMAGE_SPEC.c_str(), + "image specification\n" + "(example: [<pool-name>/[<namespace>/]]<image-name>)"); + + add_prefixed_pool_option(options, "image"); + add_prefixed_namespace_option(options, "image"); + at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE); + + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, + " unless overridden"); + at::add_image_id_option(options); +} + +void get_list_images_arguments(po::options_description *positional, + po::options_description *options) { + at::add_format_options(options); + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + false); +} + +void get_group_snap_create_arguments(po::options_description *positional, + po::options_description *options) { + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + true); +} + +void get_group_snap_remove_arguments(po::options_description *positional, + po::options_description *options) { + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + true); +} + +void get_group_snap_rename_arguments(po::options_description *positional, + po::options_description *options) { + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + true); + + positional->add_options() + (at::DEST_SNAPSHOT_NAME.c_str(), + "destination snapshot name\n(example: <snapshot-name>)"); + at::add_snap_option(options, at::ARGUMENT_MODIFIER_DEST); +} + +void get_group_snap_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_format_options(options); + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + false); +} + +void get_group_snap_rollback_arguments(po::options_description *positional, + po::options_description *options) { + 
at::add_no_progress_option(options); + add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE, + true); +} + +Shell::Action action_create( + {"group", "create"}, {}, "Create a group.", + "", &get_create_arguments, &execute_create); +Shell::Action action_remove( + {"group", "remove"}, {"group", "rm"}, "Delete a group.", + "", &get_remove_arguments, &execute_remove); +Shell::Action action_list( + {"group", "list"}, {"group", "ls"}, "List rbd groups.", + "", &get_list_arguments, &execute_list); +Shell::Action action_rename( + {"group", "rename"}, {}, "Rename a group within pool.", + "", &get_rename_arguments, &execute_rename); +Shell::Action action_add( + {"group", "image", "add"}, {}, "Add an image to a group.", + "", &get_add_arguments, &execute_add); +Shell::Action action_remove_image( + {"group", "image", "remove"}, {"group", "image", "rm"}, + "Remove an image from a group.", "", + &get_remove_image_arguments, &execute_remove_image); +Shell::Action action_list_images( + {"group", "image", "list"}, {"group", "image", "ls"}, + "List images in a group.", "", + &get_list_images_arguments, &execute_list_images); +Shell::Action action_group_snap_create( + {"group", "snap", "create"}, {}, "Make a snapshot of a group.", + "", &get_group_snap_create_arguments, &execute_group_snap_create); +Shell::Action action_group_snap_remove( + {"group", "snap", "remove"}, {"group", "snap", "rm"}, + "Remove a snapshot from a group.", + "", &get_group_snap_remove_arguments, &execute_group_snap_remove); +Shell::Action action_group_snap_rename( + {"group", "snap", "rename"}, {}, "Rename group's snapshot.", + "", &get_group_snap_rename_arguments, &execute_group_snap_rename); +Shell::Action action_group_snap_list( + {"group", "snap", "list"}, {"group", "snap", "ls"}, + "List snapshots of a group.", + "", &get_group_snap_list_arguments, &execute_group_snap_list); +Shell::Action action_group_snap_rollback( + {"group", "snap", "rollback"}, {}, + "Rollback group to snapshot.", + 
"", &get_group_snap_rollback_arguments, &execute_group_snap_rollback); + +} // namespace group +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/ImageMeta.cc b/src/tools/rbd/action/ImageMeta.cc new file mode 100644 index 00000000..20c4555d --- /dev/null +++ b/src/tools/rbd/action/ImageMeta.cc @@ -0,0 +1,345 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace image_meta { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +void add_key_option(po::options_description *positional) { + positional->add_options() + ("key", "image meta key"); +} + +int get_key(const po::variables_map &vm, size_t *arg_index, + std::string *key) { + *key = utils::get_positional_argument(vm, *arg_index); + if (key->empty()) { + std::cerr << "rbd: metadata key was not specified" << std::endl; + return -EINVAL; + } else { + ++(*arg_index); + } + return 0; +} + +const uint32_t MAX_KEYS = 64; + +} // anonymous namespace + +static int do_metadata_list(librbd::Image& image, Formatter *f) +{ + int r; + TextTable tbl; + + size_t count = 0; + std::string last_key; + bool more_results = true; + while (more_results) { + std::map<std::string, bufferlist> pairs; + r = image.metadata_list(last_key, MAX_KEYS, &pairs); + if (r < 0) { + std::cerr << "failed to list metadata of image : " << cpp_strerror(r) + << std::endl; + return r; + } + + more_results = (pairs.size() == MAX_KEYS); + if (!pairs.empty()) { + if (count == 0) { + if (f) { + f->open_object_section("metadatas"); + } else { + tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Value", TextTable::LEFT, 
TextTable::LEFT); + } + } + + // remember where this page ended so the next metadata_list() call + // continues from there + last_key = pairs.rbegin()->first; + count += pairs.size(); + + // iterate by reference to avoid copying every key/bufferlist pair; + // kept non-const because bufferlist::c_str() is presumably a + // non-const call (TODO confirm) + for (auto &kv : pairs) { + std::string val(kv.second.c_str(), kv.second.length()); + if (f) { + // pass the std::string itself so the explicit length is kept + // (val.c_str() would stop at the first embedded NUL) + f->dump_string(kv.first.c_str(), val); + } else { + tbl << kv.first << val << TextTable::endrow; + } + } + } + } + + if (f == nullptr) { + bool single = (count == 1); + std::cout << "There " << (single ? "is" : "are") << " " << count << " " + << (single ? "metadatum" : "metadata") << " on this image" + << (count == 0 ? "." : ":") << std::endl; + } + + if (count > 0) { + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + std::cout << std::endl << tbl; + } + } + return 0; +} + +static int do_metadata_set(librbd::Image& image, std::string &key, + std::string &value) +{ + int r = image.metadata_set(key, value); + if (r < 0) { + std::cerr << "failed to set metadata " << key << " of image : " + << cpp_strerror(r) << std::endl; + } + return r; +} + +static int do_metadata_remove(librbd::Image& image, std::string &key) +{ + int r = image.metadata_remove(key); + if (r == -ENOENT) { + std::cerr << "rbd: no existing metadata key " << key << " of image : " + << cpp_strerror(r) << std::endl; + } else if(r < 0) { + std::cerr << "failed to remove metadata " << key << " of image : " + << cpp_strerror(r) << std::endl; + } + return r; +} + +static int do_metadata_get(librbd::Image& image, std::string &key) +{ + std::string s; + int r = image.metadata_get(key, &s); + if (r < 0) { + std::cerr << "failed to get metadata " << key << " of image : " + << cpp_strerror(r) << std::endl; + return r; + } + std::cout << s << std::endl; + return r; +} + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_format_options(options); +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t 
arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_metadata_list(image, formatter.get()); + if (r < 0) { + std::cerr << "rbd: listing metadata failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_key_option(positional); +} + +int execute_get(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string key; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_metadata_get(image, key); + if (r < 0) { + std::cerr << "rbd: getting metadata failed: " << 
cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_set_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_key_option(positional); + positional->add_options() + ("value", "image meta value"); +} + +int execute_set(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string key; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + std::string value = utils::get_positional_argument(vm, arg_index); + if (value.empty()) { + std::cerr << "rbd: metadata value was not specified" << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_metadata_set(image, key, value); + if (r < 0) { + std::cerr << "rbd: setting metadata failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_remove_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_key_option(positional); +} + +int execute_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + 
vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string key; + r = get_key(vm, &arg_index, &key); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_metadata_remove(image, key); + if (r < 0) { + std::cerr << "rbd: removing metadata failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +Shell::Action action_list( + {"image-meta", "list"}, {"image-meta", "ls"}, "Image metadata list keys with values.", "", + &get_list_arguments, &execute_list); +Shell::Action action_get( + {"image-meta", "get"}, {}, + "Image metadata get the value associated with the key.", "", + &get_get_arguments, &execute_get); +Shell::Action action_set( + {"image-meta", "set"}, {}, "Image metadata set key with value.", "", + &get_set_arguments, &execute_set); +Shell::Action action_remove( + {"image-meta", "remove"}, {"image-meta", "rm"}, + "Image metadata remove the key and value associated.", "", + &get_remove_arguments, &execute_remove); + +} // namespace image_meta +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Import.cc b/src/tools/rbd/action/Import.cc new file mode 100644 index 00000000..7397d926 --- /dev/null +++ b/src/tools/rbd/action/Import.cc @@ -0,0 +1,1037 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/Context.h" +#include "common/blkdev.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Throttle.h" +#include "include/compat.h" +#include "include/encoding.h" +#include 
"common/debug.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include <iostream> +#include <boost/program_options.hpp> +#include <boost/scoped_ptr.hpp> +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd + +namespace rbd { +namespace action { +namespace import { + +struct ImportDiffContext { + librbd::Image *image; + int fd; + size_t size; + utils::ProgressContext pc; + OrderedThrottle throttle; + uint64_t last_offset; + + ImportDiffContext(librbd::Image *image, int fd, size_t size, bool no_progress) + : image(image), fd(fd), size(size), pc("Importing image diff", no_progress), + throttle((fd == STDIN_FILENO) ? 1 : + g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), + false), + last_offset(0) { + } + + void update_size(size_t new_size) + { + if (fd == STDIN_FILENO) { + size = new_size; + } + } + + void update_progress(uint64_t off) + { + if (size) { + pc.update_progress(off, size); + last_offset = off; + } + } + + void update_progress() + { + uint64_t off = last_offset; + if (fd != STDIN_FILENO) { + off = lseek(fd, 0, SEEK_CUR); + } + + update_progress(off); + } + + void finish(int r) + { + if (r < 0) { + pc.fail(); + } else { + pc.finish(); + } + } +}; + +class C_ImportDiff : public Context { +public: + C_ImportDiff(ImportDiffContext *idiffctx, bufferlist data, uint64_t offset, + uint64_t length, bool write_zeroes) + : m_idiffctx(idiffctx), m_data(data), m_offset(offset), m_length(length), + m_write_zeroes(write_zeroes) { + // use block offset (stdin) or import file position to report + // progress. 
+ if (m_idiffctx->fd == STDIN_FILENO) { + m_prog_offset = offset; + } else { + m_prog_offset = lseek(m_idiffctx->fd, 0, SEEK_CUR); + } + } + + int send() + { + if (m_idiffctx->throttle.pending_error()) { + return m_idiffctx->throttle.wait_for_ret(); + } + + C_OrderedThrottle *ctx = m_idiffctx->throttle.start_op(this); + librbd::RBD::AioCompletion *aio_completion = + new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback); + + int r; + if (m_write_zeroes) { + r = m_idiffctx->image->aio_write_zeroes(m_offset, m_length, + aio_completion, 0U, + LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + } else { + r = m_idiffctx->image->aio_write2(m_offset, m_length, m_data, + aio_completion, + LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + } + + if (r < 0) { + aio_completion->release(); + ctx->complete(r); + } + + return r; + } + + void finish(int r) override + { + m_idiffctx->update_progress(m_prog_offset); + m_idiffctx->throttle.end_op(r); + } + +private: + ImportDiffContext *m_idiffctx; + bufferlist m_data; + uint64_t m_offset; + uint64_t m_length; + bool m_write_zeroes; + uint64_t m_prog_offset; +}; + +static int do_image_snap_from(ImportDiffContext *idiffctx) +{ + int r; + string from; + r = utils::read_string(idiffctx->fd, 4096, &from); // 4k limit to make sure we don't get a garbage string + if (r < 0) { + std::cerr << "rbd: failed to decode start snap name" << std::endl; + return r; + } + + bool exists; + r = idiffctx->image->snap_exists2(from.c_str(), &exists); + if (r < 0) { + std::cerr << "rbd: failed to query start snap state" << std::endl; + return r; + } + + if (!exists) { + std::cerr << "start snapshot '" << from + << "' does not exist in the image, aborting" << std::endl; + return -EINVAL; + } + + idiffctx->update_progress(); + return 0; +} + +static int do_image_snap_to(ImportDiffContext *idiffctx, std::string *tosnap) +{ + int r; + string to; + r = utils::read_string(idiffctx->fd, 4096, &to); // 4k limit to make sure we don't get a garbage string + if (r < 0) { + std::cerr 
<< "rbd: failed to decode end snap name" << std::endl; + return r; + } + + bool exists; + r = idiffctx->image->snap_exists2(to.c_str(), &exists); + if (r < 0) { + std::cerr << "rbd: failed to query end snap state" << std::endl; + return r; + } + + if (exists) { + std::cerr << "end snapshot '" << to << "' already exists, aborting" + << std::endl; + return -EEXIST; + } + + *tosnap = to; + idiffctx->update_progress(); + + return 0; +} + +static int get_snap_protection_status(ImportDiffContext *idiffctx, + bool *is_protected) +{ + int r; + char buf[sizeof(__u8)]; + r = safe_read_exact(idiffctx->fd, buf, sizeof(buf)); + if (r < 0) { + std::cerr << "rbd: failed to decode snap protection status" << std::endl; + return r; + } + + *is_protected = (buf[0] != 0); + idiffctx->update_progress(); + + return 0; +} + +static int do_image_resize(ImportDiffContext *idiffctx) +{ + int r; + char buf[sizeof(uint64_t)]; + uint64_t end_size; + r = safe_read_exact(idiffctx->fd, buf, sizeof(buf)); + if (r < 0) { + std::cerr << "rbd: failed to decode image size" << std::endl; + return r; + } + + bufferlist bl; + bl.append(buf, sizeof(buf)); + auto p = bl.cbegin(); + decode(end_size, p); + + uint64_t cur_size; + idiffctx->image->size(&cur_size); + if (cur_size != end_size) { + idiffctx->image->resize(end_size); + } + + idiffctx->update_size(end_size); + idiffctx->update_progress(); + return 0; +} + +static int do_image_io(ImportDiffContext *idiffctx, bool write_zeroes, + size_t sparse_size) +{ + int r; + char buf[16]; + r = safe_read_exact(idiffctx->fd, buf, sizeof(buf)); + if (r < 0) { + std::cerr << "rbd: failed to decode IO length" << std::endl; + return r; + } + + bufferlist bl; + bl.append(buf, sizeof(buf)); + auto p = bl.cbegin(); + + uint64_t image_offset, buffer_length; + decode(image_offset, p); + decode(buffer_length, p); + + if (!write_zeroes) { + bufferptr bp = buffer::create(buffer_length); + r = safe_read_exact(idiffctx->fd, bp.c_str(), buffer_length); + if (r < 0) { + 
std::cerr << "rbd: failed to decode write data" << std::endl; + return r; + } + + size_t buffer_offset = 0; + while (buffer_offset < buffer_length) { + size_t write_length = 0; + bool zeroed = false; + utils::calc_sparse_extent(bp, sparse_size, buffer_offset, buffer_length, + &write_length, &zeroed); + ceph_assert(write_length > 0); + + bufferlist write_bl; + if (!zeroed) { + bufferptr write_ptr(bp, buffer_offset, write_length); + write_bl.push_back(write_ptr); + ceph_assert(write_bl.length() == write_length); + } + + C_ImportDiff *ctx = new C_ImportDiff(idiffctx, write_bl, + image_offset + buffer_offset, + write_length, zeroed); + r = ctx->send(); + if (r < 0) { + return r; + } + + buffer_offset += write_length; + } + } else { + bufferlist data; + C_ImportDiff *ctx = new C_ImportDiff(idiffctx, data, image_offset, + buffer_length, true); + return ctx->send(); + } + return r; +} + +static int validate_banner(int fd, std::string banner) +{ + int r; + char buf[banner.size() + 1]; + memset(buf, 0, sizeof(buf)); + r = safe_read_exact(fd, buf, banner.size()); + if (r < 0) { + std::cerr << "rbd: failed to decode diff banner" << std::endl; + return r; + } + + buf[banner.size()] = '\0'; + if (strcmp(buf, banner.c_str())) { + std::cerr << "rbd: invalid or unexpected diff banner" << std::endl; + return -EINVAL; + } + + return 0; +} + +static int skip_tag(int fd, uint64_t length) +{ + int r; + + if (fd == STDIN_FILENO) { + // read the appending data out to skip this tag. 
+ char buf[4096]; + uint64_t len = min<uint64_t>(length, sizeof(buf)); + while (len > 0) { + r = safe_read_exact(fd, buf, len); + if (r < 0) { + std::cerr << "rbd: failed to decode skipped tag data" << std::endl; + return r; + } + length -= len; + len = min<uint64_t>(length, sizeof(buf)); + } + } else { + // lseek to skip this tag + off64_t offs = lseek64(fd, length, SEEK_CUR); + if (offs < 0) { + return -errno; + } + } + + return 0; +} + +static int read_tag(int fd, __u8 end_tag, int format, __u8 *tag, uint64_t *readlen) +{ + int r; + __u8 read_tag; + + r = safe_read_exact(fd, &read_tag, sizeof(read_tag)); + if (r < 0) { + std::cerr << "rbd: failed to decode tag" << std::endl; + return r; + } + + *tag = read_tag; + if (read_tag != end_tag && format == 2) { + char buf[sizeof(uint64_t)]; + r = safe_read_exact(fd, buf, sizeof(buf)); + if (r < 0) { + std::cerr << "rbd: failed to decode tag length" << std::endl; + return r; + } + + bufferlist bl; + bl.append(buf, sizeof(buf)); + auto p = bl.cbegin(); + decode(*readlen, p); + } + + return 0; +} + +int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd, + bool no_progress, int format, size_t sparse_size) +{ + int r; + + uint64_t size = 0; + bool from_stdin = (fd == STDIN_FILENO); + if (!from_stdin) { + struct stat stat_buf; + r = ::fstat(fd, &stat_buf); + if (r < 0) { + std::cerr << "rbd: failed to stat specified diff file" << std::endl; + return r; + } + size = (uint64_t)stat_buf.st_size; + } + + r = validate_banner(fd, (format == 1 ? 
utils::RBD_DIFF_BANNER : + utils::RBD_DIFF_BANNER_V2)); + if (r < 0) { + return r; + } + + // begin image import + std::string tosnap; + bool is_protected = false; + ImportDiffContext idiffctx(&image, fd, size, no_progress); + while (r == 0) { + __u8 tag; + uint64_t length = 0; + + r = read_tag(fd, RBD_DIFF_END, format, &tag, &length); + if (r < 0 || tag == RBD_DIFF_END) { + break; + } + + if (tag == RBD_DIFF_FROM_SNAP) { + r = do_image_snap_from(&idiffctx); + } else if (tag == RBD_DIFF_TO_SNAP) { + r = do_image_snap_to(&idiffctx, &tosnap); + } else if (tag == RBD_SNAP_PROTECTION_STATUS) { + r = get_snap_protection_status(&idiffctx, &is_protected); + } else if (tag == RBD_DIFF_IMAGE_SIZE) { + r = do_image_resize(&idiffctx); + } else if (tag == RBD_DIFF_WRITE || tag == RBD_DIFF_ZERO) { + r = do_image_io(&idiffctx, (tag == RBD_DIFF_ZERO), sparse_size); + } else { + std::cerr << "unrecognized tag byte " << (int)tag << " in stream; skipping" + << std::endl; + r = skip_tag(fd, length); + } + } + + int temp_r = idiffctx.throttle.wait_for_ret(); + r = (r < 0) ? 
r : temp_r; // preserve original error + if (r == 0 && tosnap.length()) { + r = idiffctx.image->snap_create(tosnap.c_str()); + if (r == 0 && is_protected) { + r = idiffctx.image->snap_protect(tosnap.c_str()); + } + } + + idiffctx.finish(r); + return r; +} + +int do_import_diff(librados::Rados &rados, librbd::Image &image, + const char *path, bool no_progress, size_t sparse_size) +{ + int r; + int fd; + + if (strcmp(path, "-") == 0) { + fd = STDIN_FILENO; + } else { + fd = open(path, O_RDONLY); + if (fd < 0) { + r = -errno; + std::cerr << "rbd: error opening " << path << std::endl; + return r; + } + } + r = do_import_diff_fd(rados, image, fd, no_progress, 1, sparse_size); + + if (fd != 0) + close(fd); + return r; +} + +namespace at = argument_types; +namespace po = boost::program_options; + +void get_arguments_diff(po::options_description *positional, + po::options_description *options) { + at::add_path_options(positional, options, + "import file (or '-' for stdin)"); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_sparse_size_option(options); + at::add_no_progress_option(options); +} + +int execute_diff(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string path; + size_t arg_index = 0; + int r = utils::get_path(vm, &arg_index, &path); + if (r < 0) { + return r; + } + + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE; + if (vm.count(at::IMAGE_SPARSE_SIZE)) { + sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>(); + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = 
utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_import_diff(rados, image, path.c_str(), + vm[at::NO_PROGRESS].as<bool>(), sparse_size); + if (r == -EDOM) { + r = -EBADMSG; + } + if (r < 0) { + cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action_diff( + {"import-diff"}, {}, "Import an incremental diff.", "", &get_arguments_diff, + &execute_diff); + +class C_Import : public Context { +public: + C_Import(SimpleThrottle &simple_throttle, librbd::Image &image, + bufferlist &bl, uint64_t offset) + : m_throttle(simple_throttle), m_image(image), + m_aio_completion( + new librbd::RBD::AioCompletion(this, &utils::aio_context_callback)), + m_bufferlist(bl), m_offset(offset) + { + } + + void send() + { + m_throttle.start_op(); + + int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist, + m_aio_completion, op_flags); + if (r < 0) { + std::cerr << "rbd: error requesting write to destination image" + << std::endl; + m_aio_completion->release(); + m_throttle.end_op(r); + } + } + + void finish(int r) override + { + if (r < 0) { + std::cerr << "rbd: error writing to destination image at offset " + << m_offset << ": " << cpp_strerror(r) << std::endl; + } + m_throttle.end_op(r); + } + +private: + SimpleThrottle &m_throttle; + librbd::Image &m_image; + librbd::RBD::AioCompletion *m_aio_completion; + bufferlist m_bufferlist; + uint64_t m_offset; +}; + +static int decode_and_set_image_option(int fd, uint64_t imageopt, librbd::ImageOptions& opts) +{ + int r; + char buf[sizeof(uint64_t)]; + + r = safe_read_exact(fd, buf, sizeof(buf)); + if (r < 0) { + std::cerr << "rbd: failed to decode image option" << std::endl; + return r; + } + + bufferlist bl; + bl.append(buf, sizeof(buf)); + auto it = bl.cbegin(); + + 
uint64_t val; + decode(val, it); + + if (opts.get(imageopt, &val) != 0) { + opts.set(imageopt, val); + } + + return 0; +} + +static int do_import_metadata(int import_format, librbd::Image& image, + const std::map<std::string, std::string> &imagemetas) +{ + int r = 0; + + //v1 format + if (import_format == 1) { + return 0; + } + + for (std::map<std::string, std::string>::const_iterator it = imagemetas.begin(); + it != imagemetas.end(); ++it) { + r = image.metadata_set(it->first, it->second); + if (r < 0) + return r; + } + + return 0; +} + +static int decode_imagemeta(int fd, uint64_t length, std::map<std::string, std::string>* imagemetas) +{ + int r; + string key; + string value; + + r = utils::read_string(fd, length, &key); + if (r < 0) { + std::cerr << "rbd: failed to decode metadata key" << std::endl; + return r; + } + + r = utils::read_string(fd, length, &value); + if (r < 0) { + std::cerr << "rbd: failed to decode metadata value" << std::endl; + return r; + } + + (*imagemetas)[key] = value; + return 0; +} + +static int do_import_header(int fd, int import_format, librbd::ImageOptions& opts, + std::map<std::string, std::string>* imagemetas) +{ + // There is no header in v1 image. + if (import_format == 1) { + return 0; + } + + int r; + r = validate_banner(fd, utils::RBD_IMAGE_BANNER_V2); + if (r < 0) { + return r; + } + + // As V1 format for image is already deprecated, import image in V2 by default. 
+ uint64_t image_format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &image_format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, image_format); + } + + while (r == 0) { + __u8 tag; + uint64_t length = 0; + r = read_tag(fd, RBD_EXPORT_IMAGE_END, image_format, &tag, &length); + if (r < 0 || tag == RBD_EXPORT_IMAGE_END) { + break; + } + + if (tag == RBD_EXPORT_IMAGE_ORDER) { + r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_ORDER, opts); + } else if (tag == RBD_EXPORT_IMAGE_FEATURES) { + r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_FEATURES, opts); + } else if (tag == RBD_EXPORT_IMAGE_STRIPE_UNIT) { + r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_UNIT, opts); + } else if (tag == RBD_EXPORT_IMAGE_STRIPE_COUNT) { + r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_COUNT, opts); + } else if (tag == RBD_EXPORT_IMAGE_META) { + r = decode_imagemeta(fd, length, imagemetas); + } else { + std::cerr << "rbd: invalid tag in image properties zone: " << tag << "Skip it." 
+ << std::endl; + r = skip_tag(fd, length); + } + } + + return r; +} + +static int do_import_v2(librados::Rados &rados, int fd, librbd::Image &image, + uint64_t size, size_t imgblklen, + utils::ProgressContext &pc, size_t sparse_size) +{ + int r = 0; + r = validate_banner(fd, utils::RBD_IMAGE_DIFFS_BANNER_V2); + if (r < 0) { + return r; + } + + char buf[sizeof(uint64_t)]; + r = safe_read_exact(fd, buf, sizeof(buf)); + if (r < 0) { + std::cerr << "rbd: failed to decode diff count" << std::endl; + return r; + } + bufferlist bl; + bl.append(buf, sizeof(buf)); + auto p = bl.cbegin(); + uint64_t diff_num; + decode(diff_num, p); + for (size_t i = 0; i < diff_num; i++) { + r = do_import_diff_fd(rados, image, fd, true, 2, sparse_size); + if (r < 0) { + pc.fail(); + std::cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl; + return r; + } + pc.update_progress(i + 1, diff_num); + } + + return r; +} + +static int do_import_v1(int fd, librbd::Image &image, uint64_t size, + size_t imgblklen, utils::ProgressContext &pc, + size_t sparse_size) +{ + int r = 0; + size_t reqlen = imgblklen; // amount requested from read + ssize_t readlen; // amount received from one read + size_t blklen = 0; // amount accumulated from reads to fill blk + char *p = new char[imgblklen]; + uint64_t image_pos = 0; + bool from_stdin = (fd == STDIN_FILENO); + boost::scoped_ptr<SimpleThrottle> throttle; + + if (from_stdin) { + throttle.reset(new SimpleThrottle(1, false)); + } else { + throttle.reset(new SimpleThrottle( + g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), false)); + } + + reqlen = min<uint64_t>(reqlen, size); + // loop body handles 0 return, as we may have a block to flush + while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) { + if (throttle->pending_error()) { + break; + } + + blklen += readlen; + // if read was short, try again to fill the block before writing + if (readlen && ((size_t)readlen < reqlen)) { + reqlen -= readlen; + continue; + } + if 
(!from_stdin) + pc.update_progress(image_pos, size); + + bufferptr blkptr(p, blklen); + // resize output image by binary expansion as we go for stdin + if (from_stdin && (image_pos + (size_t)blklen) > size) { + size *= 2; + r = image.resize(size); + if (r < 0) { + std::cerr << "rbd: can't resize image during import" << std::endl; + goto out; + } + } + + // write as much as we got; perhaps less than imgblklen + // but skip writing zeros to create sparse images + size_t buffer_offset = 0; + while (buffer_offset < blklen) { + size_t write_length = 0; + bool zeroed = false; + utils::calc_sparse_extent(blkptr, sparse_size, buffer_offset, blklen, + &write_length, &zeroed); + + if (!zeroed) { + bufferlist write_bl; + bufferptr write_ptr(blkptr, buffer_offset, write_length); + write_bl.push_back(write_ptr); + ceph_assert(write_bl.length() == write_length); + + C_Import *ctx = new C_Import(*throttle, image, write_bl, + image_pos + buffer_offset); + ctx->send(); + } + + buffer_offset += write_length; + } + + // done with whole block, whether written or not + image_pos += blklen; + if (!from_stdin && image_pos >= size) + break; + // if read had returned 0, we're at EOF and should quit + if (readlen == 0) + break; + blklen = 0; + reqlen = imgblklen; + } + r = throttle->wait_for_ret(); + if (r < 0) { + goto out; + } + + if (fd == STDIN_FILENO) { + r = image.resize(image_pos); + if (r < 0) { + std::cerr << "rbd: final image resize failed" << std::endl; + goto out; + } + } +out: + delete[] p; + return r; +} + +static int do_import(librados::Rados &rados, librbd::RBD &rbd, + librados::IoCtx& io_ctx, const char *imgname, + const char *path, librbd::ImageOptions& opts, + bool no_progress, int import_format, size_t sparse_size) +{ + int fd, r; + struct stat stat_buf; + utils::ProgressContext pc("Importing image", no_progress); + std::map<std::string, std::string> imagemetas; + + ceph_assert(imgname); + + uint64_t order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + order = 
g_conf().get_val<uint64_t>("rbd_default_order"); + } + + // try to fill whole imgblklen blocks for sparsification + size_t imgblklen = 1 << order; + librbd::Image image; + uint64_t size = 0; + + bool from_stdin = !strcmp(path, "-"); + if (from_stdin) { + fd = STDIN_FILENO; + size = 1ULL << order; + } else { + if ((fd = open(path, O_RDONLY)) < 0) { + r = -errno; + std::cerr << "rbd: error opening " << path << std::endl; + goto done2; + } + + if ((fstat(fd, &stat_buf)) < 0) { + r = -errno; + std::cerr << "rbd: stat error " << path << std::endl; + goto done; + } + if (S_ISDIR(stat_buf.st_mode)) { + r = -EISDIR; + std::cerr << "rbd: cannot import a directory" << std::endl; + goto done; + } + if (stat_buf.st_size) + size = (uint64_t)stat_buf.st_size; + + if (!size) { + int64_t bdev_size = 0; + BlkDev blkdev(fd); + r = blkdev.get_size(&bdev_size); + if (r < 0) { + std::cerr << "rbd: unable to get size of file/block device" + << std::endl; + goto done; + } + ceph_assert(bdev_size >= 0); + size = (uint64_t) bdev_size; + } +#ifdef HAVE_POSIX_FADVISE + posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + } + + r = do_import_header(fd, import_format, opts, &imagemetas); + if (r < 0) { + std::cerr << "rbd: import header failed." 
<< std::endl; + goto done; + } + + r = rbd.create4(io_ctx, imgname, size, opts); + if (r < 0) { + std::cerr << "rbd: image creation failed" << std::endl; + goto done; + } + + r = rbd.open(io_ctx, image, imgname); + if (r < 0) { + std::cerr << "rbd: failed to open image" << std::endl; + goto err; + } + + r = do_import_metadata(import_format, image, imagemetas); + if (r < 0) { + std::cerr << "rbd: failed to import image-meta" << std::endl; + goto err; + } + + if (import_format == 1) { + r = do_import_v1(fd, image, size, imgblklen, pc, sparse_size); + } else { + r = do_import_v2(rados, fd, image, size, imgblklen, pc, sparse_size); + } + if (r < 0) { + std::cerr << "rbd: failed to import image" << std::endl; + image.close(); + goto err; + } + + r = image.close(); +err: + if (r < 0) + rbd.remove(io_ctx, imgname); +done: + if (r < 0) + pc.fail(); + else + pc.finish(); + if (!from_stdin) + close(fd); +done2: + return r; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_path_options(positional, options, + "import file (or '-' for stdin)"); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); + at::add_create_image_options(options, true); + at::add_sparse_size_option(options); + at::add_no_progress_option(options); + at::add_export_format_option(options); + + // TODO legacy rbd allowed import to accept both 'image'/'dest' and + // 'pool'/'dest-pool' + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, " (deprecated)"); + at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, " (deprecated)"); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string path; + size_t arg_index = 0; + int r = utils::get_path(vm, &arg_index, &path); + if (r < 0) { + return r; + } + + // odd check to support legacy / deprecated behavior of import + std::string deprecated_pool_name; + if (vm.count(at::POOL_NAME)) { + deprecated_pool_name = 
vm[at::POOL_NAME].as<std::string>(); + std::cerr << "rbd: --pool is deprecated for import, use --dest-pool" + << std::endl; + } + + std::string deprecated_image_name; + if (vm.count(at::IMAGE_NAME)) { + deprecated_image_name = vm[at::IMAGE_NAME].as<std::string>(); + std::cerr << "rbd: --image is deprecated for import, use --dest" + << std::endl; + } else { + deprecated_image_name = path.substr(path.find_last_of("/") + 1); + } + + std::string deprecated_snap_name; + r = utils::extract_spec(deprecated_image_name, &deprecated_pool_name, + nullptr, &deprecated_image_name, + &deprecated_snap_name, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE; + if (vm.count(at::IMAGE_SPARSE_SIZE)) { + sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>(); + } + + std::string pool_name = deprecated_pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name = deprecated_snap_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, false, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + if (image_name.empty()) { + image_name = deprecated_image_name; + } + + librbd::ImageOptions opts; + r = utils::get_image_options(vm, true, &opts); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + int format = 1; + if (vm.count("export-format")) + format = vm["export-format"].as<uint64_t>(); + + librbd::RBD rbd; + r = do_import(rados, rbd, io_ctx, image_name.c_str(), path.c_str(), + opts, vm[at::NO_PROGRESS].as<bool>(), format, sparse_size); + if (r < 0) { + std::cerr << "rbd: import failed: " << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +Shell::Action action( + {"import"}, {}, "Import image from file.", 
at::get_long_features_help(), + &get_arguments, &execute); + +} // namespace import +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Info.cc b/src/tools/rbd/action/Info.cc new file mode 100644 index 00000000..5adacb92 --- /dev/null +++ b/src/tools/rbd/action/Info.cc @@ -0,0 +1,459 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/types.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include <iostream> +#include <boost/program_options.hpp> + +#include "common/Clock.h" + +namespace rbd { +namespace action { +namespace info { + +namespace at = argument_types; +namespace po = boost::program_options; + +static void format_bitmask(Formatter *f, const std::string &name, + const std::map<uint64_t, std::string>& mapping, + uint64_t bitmask) +{ + int count = 0; + std::string group_name(name + "s"); + if (f == NULL) { + std::cout << "\t" << group_name << ": "; + } else { + f->open_array_section(group_name.c_str()); + } + for (std::map<uint64_t, std::string>::const_iterator it = mapping.begin(); + it != mapping.end(); ++it) { + if ((it->first & bitmask) == 0) { + continue; + } + + if (f == NULL) { + if (count++ > 0) { + std::cout << ", "; + } + std::cout << it->second; + } else { + f->dump_string(name.c_str(), it->second); + } + } + if (f == NULL) { + std::cout << std::endl; + } else { + f->close_section(); + } +} + +static void format_features(Formatter *f, uint64_t features) +{ + format_bitmask(f, "feature", at::ImageFeatures::FEATURE_MAPPING, features); +} + +static void format_op_features(Formatter *f, uint64_t op_features) +{ + static std::map<uint64_t, std::string> mapping = { + {RBD_OPERATION_FEATURE_CLONE_PARENT, RBD_OPERATION_FEATURE_NAME_CLONE_PARENT}, + {RBD_OPERATION_FEATURE_CLONE_CHILD, 
RBD_OPERATION_FEATURE_NAME_CLONE_CHILD}, + {RBD_OPERATION_FEATURE_GROUP, RBD_OPERATION_FEATURE_NAME_GROUP}, + {RBD_OPERATION_FEATURE_SNAP_TRASH, RBD_OPERATION_FEATURE_NAME_SNAP_TRASH}}; + format_bitmask(f, "op_feature", mapping, op_features); +} + +static void format_flags(Formatter *f, uint64_t flags) +{ + std::map<uint64_t, std::string> mapping = { + {RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid"}, + {RBD_FLAG_FAST_DIFF_INVALID, "fast diff invalid"}}; + format_bitmask(f, "flag", mapping, flags); +} + +void format_timestamp(struct timespec timestamp, std::string ×tamp_str) { + if(timestamp.tv_sec != 0) { + time_t ts = timestamp.tv_sec; + timestamp_str = ctime(&ts); + timestamp_str = timestamp_str.substr(0, timestamp_str.length() - 1); + } +} + +static int do_show_info(librados::IoCtx &io_ctx, librbd::Image& image, + const std::string &snapname, Formatter *f) +{ + librbd::image_info_t info; + uint8_t old_format; + uint64_t overlap, features, flags, snap_limit; + bool snap_protected = false; + librbd::mirror_image_info_t mirror_image; + std::vector<librbd::snap_info_t> snaps; + int r; + + std::string imgname; + r = image.get_name(&imgname); + if (r < 0) + return r; + + r = image.snap_list(snaps); + if (r < 0) + return r; + + r = image.stat(info, sizeof(info)); + if (r < 0) + return r; + + r = image.old_format(&old_format); + if (r < 0) + return r; + + std::string imgid; + if (!old_format) { + r = image.get_id(&imgid); + if (r < 0) + return r; + } + + std::string data_pool; + if (!old_format) { + int64_t data_pool_id = image.get_data_pool_id(); + if (data_pool_id != io_ctx.get_id()) { + librados::Rados rados(io_ctx); + librados::IoCtx data_io_ctx; + r = rados.ioctx_create2(data_pool_id, data_io_ctx); + if (r < 0) { + data_pool = "<missing data pool " + stringify(data_pool_id) + ">"; + } else { + data_pool = data_io_ctx.get_pool_name(); + } + } + } + + r = image.overlap(&overlap); + if (r < 0) + return r; + + r = image.features(&features); + if (r < 0) + return r; 
+ + uint64_t op_features; + r = image.get_op_features(&op_features); + if (r < 0) { + return r; + } + + r = image.get_flags(&flags); + if (r < 0) { + return r; + } + + if (!snapname.empty()) { + r = image.snap_is_protected(snapname.c_str(), &snap_protected); + if (r < 0) + return r; + } + + if (features & RBD_FEATURE_JOURNALING) { + r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image)); + if (r < 0) { + return r; + } + } + + r = image.snap_get_limit(&snap_limit); + if (r < 0) + return r; + + std::string prefix = image.get_block_name_prefix(); + + librbd::group_info_t group_info; + r = image.get_group(&group_info, sizeof(group_info)); + if (r < 0) { + return r; + } + + std::string group_string = ""; + if (RBD_GROUP_INVALID_POOL != group_info.pool) { + std::string group_pool; + librados::Rados rados(io_ctx); + librados::IoCtx group_io_ctx; + r = rados.ioctx_create2(group_info.pool, group_io_ctx); + if (r < 0) { + group_pool = "<missing group pool " + stringify(group_info.pool) + ">"; + } else { + group_pool = group_io_ctx.get_pool_name(); + } + + group_string = group_pool + "/"; + if (!io_ctx.get_namespace().empty()) { + group_string += io_ctx.get_namespace() + "/"; + } + group_string += group_info.name; + } + + struct timespec create_timestamp; + image.get_create_timestamp(&create_timestamp); + + std::string create_timestamp_str = ""; + format_timestamp(create_timestamp, create_timestamp_str); + + struct timespec access_timestamp; + image.get_access_timestamp(&access_timestamp); + + std::string access_timestamp_str = ""; + format_timestamp(access_timestamp, access_timestamp_str); + + struct timespec modify_timestamp; + image.get_modify_timestamp(&modify_timestamp); + + std::string modify_timestamp_str = ""; + format_timestamp(modify_timestamp, modify_timestamp_str); + + if (f) { + f->open_object_section("image"); + f->dump_string("name", imgname); + f->dump_string("id", imgid); + f->dump_unsigned("size", info.size); + f->dump_unsigned("objects", 
info.num_objs); + f->dump_int("order", info.order); + f->dump_unsigned("object_size", info.obj_size); + f->dump_int("snapshot_count", snaps.size()); + if (!data_pool.empty()) { + f->dump_string("data_pool", data_pool); + } + f->dump_string("block_name_prefix", prefix); + f->dump_int("format", (old_format ? 1 : 2)); + } else { + std::cout << "rbd image '" << imgname << "':\n" + << "\tsize " << byte_u_t(info.size) << " in " + << info.num_objs << " objects" + << std::endl + << "\torder " << info.order + << " (" << byte_u_t(info.obj_size) << " objects)" + << std::endl + << "\tsnapshot_count: " << snaps.size() + << std::endl; + if (!imgid.empty()) { + std::cout << "\tid: " << imgid << std::endl; + } + if (!data_pool.empty()) { + std::cout << "\tdata_pool: " << data_pool << std::endl; + } + std::cout << "\tblock_name_prefix: " << prefix + << std::endl + << "\tformat: " << (old_format ? "1" : "2") + << std::endl; + } + + if (!old_format) { + format_features(f, features); + format_op_features(f, op_features); + format_flags(f, flags); + } + + if (!group_string.empty()) { + if (f) { + f->dump_string("group", group_string); + } else { + std::cout << "\tgroup: " << group_string + << std::endl; + } + } + + if (!create_timestamp_str.empty()) { + if (f) { + f->dump_string("create_timestamp", create_timestamp_str); + } else { + std::cout << "\tcreate_timestamp: " << create_timestamp_str + << std::endl; + } + } + + if (!access_timestamp_str.empty()) { + if (f) { + f->dump_string("access_timestamp", access_timestamp_str); + } else { + std::cout << "\taccess_timestamp: " << access_timestamp_str + << std::endl; + } + } + + if (!modify_timestamp_str.empty()) { + if (f) { + f->dump_string("modify_timestamp", modify_timestamp_str); + } else { + std::cout << "\tmodify_timestamp: " << modify_timestamp_str + << std::endl; + } + } + + // snapshot info, if present + if (!snapname.empty()) { + if (f) { + f->dump_string("protected", snap_protected ? 
"true" : "false"); + } else { + std::cout << "\tprotected: " << (snap_protected ? "True" : "False") + << std::endl; + } + } + + if (snap_limit < UINT64_MAX) { + if (f) { + f->dump_unsigned("snapshot_limit", snap_limit); + } else { + std::cout << "\tsnapshot_limit: " << snap_limit << std::endl; + } + } + + // parent info, if present + librbd::linked_image_spec_t parent_image_spec; + librbd::snap_spec_t parent_snap_spec; + if ((image.get_parent(&parent_image_spec, &parent_snap_spec) == 0) && + (parent_image_spec.image_name.length() > 0)) { + if (f) { + f->open_object_section("parent"); + f->dump_string("pool", parent_image_spec.pool_name); + f->dump_string("pool_namespace", parent_image_spec.pool_namespace); + f->dump_string("image", parent_image_spec.image_name); + f->dump_string("id", parent_image_spec.image_id); + f->dump_string("snapshot", parent_snap_spec.name); + f->dump_bool("trash", parent_image_spec.trash); + f->dump_unsigned("overlap", overlap); + f->close_section(); + } else { + std::cout << "\tparent: " << parent_image_spec.pool_name << "/"; + if (!parent_image_spec.pool_namespace.empty()) { + std::cout << parent_image_spec.pool_namespace << "/"; + } + std::cout << parent_image_spec.image_name << "@" + << parent_snap_spec.name; + if (parent_image_spec.trash) { + std::cout << " (trash " << parent_image_spec.image_id << ")"; + } + std::cout << std::endl; + std::cout << "\toverlap: " << byte_u_t(overlap) << std::endl; + } + } + + // striping info, if feature is set + if (features & RBD_FEATURE_STRIPINGV2) { + if (f) { + f->dump_unsigned("stripe_unit", image.get_stripe_unit()); + f->dump_unsigned("stripe_count", image.get_stripe_count()); + } else { + std::cout << "\tstripe unit: " << byte_u_t(image.get_stripe_unit()) + << std::endl + << "\tstripe count: " << image.get_stripe_count() << std::endl; + } + } + + if (features & RBD_FEATURE_JOURNALING) { + if (f) { + f->dump_string("journal", utils::image_id(image)); + } else { + std::cout << "\tjournal: " << 
utils::image_id(image) << std::endl;
+    }
+  }
+
+  if (features & RBD_FEATURE_JOURNALING) {
+    if (f) {
+      f->open_object_section("mirroring");
+      f->dump_string("state",
+          utils::mirror_image_state(mirror_image.state));
+      if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+        f->dump_string("global_id", mirror_image.global_id);
+        f->dump_bool("primary", mirror_image.primary);
+      }
+      f->close_section();
+    } else {
+      std::cout << "\tmirroring state: "
+                << utils::mirror_image_state(mirror_image.state) << std::endl;
+      if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+        std::cout << "\tmirroring global id: " << mirror_image.global_id
+                  << std::endl
+                  << "\tmirroring primary: "
+                  << (mirror_image.primary ? "true" : "false") <<std::endl;
+      }
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+
+  return 0;
+}
+
+// Declares the command line accepted by the action: an image-or-snapshot
+// spec, the image id option and the output format options.
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_id_option(options);
+  at::add_format_options(options);
+}
+
+// Entry point of the action: resolves the pool/namespace/image/snapshot names
+// (or the image id), opens the image and prints it through do_show_info().
+// Returns 0 on success or the negative error code of the first failing step.
+int execute(const po::variables_map &vm,
+            const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  std::string image_id;
+
+  if (vm.count(at::IMAGE_ID)) {
+    image_id = vm[at::IMAGE_ID].as<std::string>();
+  }
+
+  // NOTE(review): image_id.empty() presumably makes the image name mandatory
+  // only when no id was supplied -- confirm against
+  // utils::get_pool_image_snapshot_names().
+  int r = utils::get_pool_image_snapshot_names(
+      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+      &image_name, &snap_name, image_id.empty(),
+      utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  // name and id are mutually exclusive ways of selecting the image
+  if (!image_id.empty() && !image_name.empty()) {
+    std::cerr << "rbd: trying to access image using both name and id. "
+              << std::endl;
+    return -EINVAL;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name,
+                                 image_id, snap_name, true, &rados, &io_ctx,
+                                 &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_info(io_ctx, image, snap_name, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: info: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Registers the "info" command together with its argument and execute
+// callbacks.
+Shell::Action action(
+  {"info"}, {}, "Show information about image size, striping, etc.", "",
+  &get_arguments, &execute);
+
+} // namespace info
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Journal.cc b/src/tools/rbd/action/Journal.cc
new file mode 100644
index 00000000..d3a54f94
--- /dev/null
+++ b/src/tools/rbd/action/Journal.cc
@@ -0,0 +1,1254 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/Cond.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/stringify.h"
+#include <fstream>
+#include <sstream>
+#include <boost/program_options.hpp>
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/journal/cls_journal_client.h"
+
+#include "journal/Journaler.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include "journal/Settings.h"
+#include "librbd/journal/Types.h"
+
+namespace rbd {
+namespace action {
+namespace journal {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Names of the journal related command-line arguments.
+static const std::string JOURNAL_SPEC("journal-spec");
+static const std::string JOURNAL_NAME("journal");
+static const std::string
DEST_JOURNAL_NAME("dest-journal");
+
+// Adds the journal name option to `opt`.  The option is called "dest-journal"
+// for the DEST modifier and "journal" otherwise; the description is prefixed
+// according to the modifier.
+void add_journal_option(po::options_description *opt,
+                        at::ArgumentModifier modifier) {
+  std::string name = JOURNAL_NAME;
+  std::string description = at::get_description_prefix(modifier) +
+                            "journal name";
+  switch (modifier) {
+  case at::ARGUMENT_MODIFIER_NONE:
+  case at::ARGUMENT_MODIFIER_SOURCE:
+    break;
+  case at::ARGUMENT_MODIFIER_DEST:
+    name = DEST_JOURNAL_NAME;
+    break;
+  }
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Adds the positional journal spec to `pos` and the pool, namespace, image
+// and journal named options to `opt`.
+void add_journal_spec_options(po::options_description *pos,
+                              po::options_description *opt,
+                              at::ArgumentModifier modifier) {
+
+  pos->add_options()
+    ((get_name_prefix(modifier) + JOURNAL_SPEC).c_str(),
+     (get_description_prefix(modifier) + "journal specification\n" +
+      "(example: [<pool-name>/[<namespace>/]]<journal-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_namespace_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_journal_option(opt, modifier);
+}
+
+// Resolves the pool, namespace and journal names for a journal command.
+//
+// Values are taken, in order, from the named options, from a spec embedded in
+// the journal or image option, and from the next positional argument (the
+// index pointed to by `spec_arg_index` is advanced when it is consulted).  An
+// empty pool name falls back to the default pool.  When only an image was
+// given, the journal name is looked up from that image.
+//
+// Output pointers may be nullptr, in which case that value is not stored.
+// Returns 0 on success or a negative error code.
+int get_pool_journal_names(const po::variables_map &vm,
+                           at::ArgumentModifier mod,
+                           size_t *spec_arg_index,
+                           std::string *pool_name,
+                           std::string *namespace_name,
+                           std::string *journal_name) {
+  // the DEST modifier reads the "dest-" flavoured option names
+  std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_POOL_NAME : at::POOL_NAME);
+  std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME);
+  std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+  std::string journal_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    DEST_JOURNAL_NAME : JOURNAL_NAME);
+
+  if (vm.count(pool_key) && pool_name != nullptr) {
+    *pool_name = vm[pool_key].as<std::string>();
+  }
+  if (vm.count(namespace_key) && namespace_name != nullptr) {
+    *namespace_name = vm[namespace_key].as<std::string>();
+  }
+  if (vm.count(journal_key) && journal_name != nullptr) {
+    *journal_name = vm[journal_key].as<std::string>();
+  }
+
+  std::string image_name;
+  if (vm.count(image_key)) {
+    image_name = vm[image_key].as<std::string>();
+  }
+
+  int r;
+  if (journal_name != nullptr && !journal_name->empty()) {
+    // despite the separate pool option,
+    // we can also specify them via the journal option
+    std::string journal_name_copy(*journal_name);
+    r = extract_spec(journal_name_copy, pool_name, namespace_name, journal_name,
+                     nullptr, utils::SPEC_VALIDATION_FULL);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (!image_name.empty()) {
+    // despite the separate pool option,
+    // we can also specify them via the image option
+    std::string image_name_copy(image_name);
+    r = extract_spec(image_name_copy, pool_name, namespace_name, &image_name,
+                     nullptr, utils::SPEC_VALIDATION_NONE);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // no journal named so far: consume the next positional argument as a spec
+  if (journal_name != nullptr && spec_arg_index != nullptr &&
+      journal_name->empty()) {
+    std::string spec = utils::get_positional_argument(vm, (*spec_arg_index)++);
+    if (!spec.empty()) {
+      r = extract_spec(spec, pool_name, namespace_name, journal_name, nullptr,
+                       utils::SPEC_VALIDATION_FULL);
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+
+  if (pool_name != nullptr && pool_name->empty()) {
+    *pool_name = utils::get_default_pool_name();
+  }
+
+  if (pool_name != nullptr && namespace_name != nullptr &&
+      journal_name != nullptr && journal_name->empty() && !image_name.empty()) {
+    // Try to get journal name from image info.
+    librados::Rados rados;
+    librados::IoCtx io_ctx;
+    librbd::Image image;
+    // reuses the function-level `r` rather than shadowing it
+    r = utils::init_and_open_image(*pool_name, *namespace_name, image_name,
+                                   "", "", true, &rados, &io_ctx, &image);
+    if (r < 0) {
+      std::cerr << "rbd: failed to open image " << image_name
+                << " to get journal name: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+
+    uint64_t features;
+    r = image.features(&features);
+    if (r < 0) {
+      return r;
+    }
+    if ((features & RBD_FEATURE_JOURNALING) == 0) {
+      std::cerr << "rbd: journaling is not enabled for image " << image_name
+                << std::endl;
+      return -EINVAL;
+    }
+    // the journal of an image is named after the image id
+    *journal_name = utils::image_id(image);
+  }
+
+  if (journal_name != nullptr && journal_name->empty()) {
+    std::string prefix = at::get_description_prefix(mod);
+    std::cerr << "rbd: "
+              << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+              << "journal was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Prints the immutable metadata of journal `journal_id` (header object name,
+// object name prefix, order, splay width and, when set, the object pool),
+// through `f` when a formatter is given and as plain text otherwise.
+// Returns 0 or the negative error of the metadata fetch; a failed pool name
+// lookup is only reported, it does not fail the command.
+static int do_show_journal_info(librados::Rados& rados, librados::IoCtx& io_ctx,
+                                const std::string& journal_id, Formatter *f)
+{
+  int r;
+  C_SaferCond cond;
+
+  std::string header_oid = ::journal::Journaler::header_oid(journal_id);
+  std::string object_oid_prefix = ::journal::Journaler::object_oid_prefix(
+    io_ctx.get_id(), journal_id);
+  uint8_t order;
+  uint8_t splay_width;
+  int64_t pool_id;
+
+  cls::journal::client::get_immutable_metadata(io_ctx, header_oid, &order,
+                                               &splay_width, &pool_id, &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "failed to get journal metadata: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  std::string object_pool_name;
+  if (pool_id >= 0) {
+    r = rados.pool_reverse_lookup(pool_id, &object_pool_name);
+    if (r < 0) {
+      std::cerr << "error looking up pool name for pool_id=" << pool_id << ": "
+                << cpp_strerror(r) << std::endl;
+    }
+  }
+
+  if (f) {
+    f->open_object_section("journal");
+    f->dump_string("journal_id", journal_id);
+    f->dump_string("header_oid", header_oid);
+    f->dump_string("object_oid_prefix", object_oid_prefix);
+    f->dump_int("order", order);
+    f->dump_int("splay_width", splay_width);
+    if (!object_pool_name.empty()) {
+      f->dump_string("object_pool", object_pool_name);
+    }
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    std::cout << "rbd journal '" << journal_id << "':" << std::endl;
+    std::cout << "\theader_oid: " << header_oid << std::endl;
+    std::cout << "\tobject_oid_prefix: " << object_oid_prefix << std::endl;
+    // uint8_t would be streamed as a character, hence the casts
+    std::cout << "\torder: " << static_cast<int>(order) << " ("
+              << byte_u_t(1ull << order) << " objects)"<< std::endl;
+    std::cout << "\tsplay_width: " << static_cast<int>(splay_width) << std::endl;
+    if (!object_pool_name.empty()) {
+      std::cout << "\tobject_pool: " << object_pool_name << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Prints the mutable metadata of journal `journal_id` (minimum and active
+// object set plus the registered clients), through `f` when a formatter is
+// given and as plain text otherwise.  Returns 0 or the negative error of the
+// metadata fetch.
+static int do_show_journal_status(librados::IoCtx& io_ctx,
+                                  const std::string& journal_id, Formatter *f)
+{
+  int r;
+
+  C_SaferCond cond;
+  uint64_t minimum_set;
+  uint64_t active_set;
+  std::set<cls::journal::Client> registered_clients;
+  std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+  cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+                                             &active_set, &registered_clients,
+                                             &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "warning: failed to get journal metadata" << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("status");
+    f->dump_unsigned("minimum_set", minimum_set);
+    f->dump_unsigned("active_set", active_set);
+    f->open_array_section("registered_clients");
+    for (const auto &c : registered_clients) {
+      f->open_object_section("client");
+      c.dump(f);
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    std::cout << "minimum_set: " << minimum_set << std::endl;
+    std::cout << "active_set: " << active_set << std::endl;
+    std::cout << "registered clients: " << std::endl;
+    for (const auto &c : registered_clients) {
+      std::cout << "\t" << c << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Resets journal `journal_id` by turning the journaling feature of the owning
+// image off and on again.  Returns 0 or the negative error of the first
+// failing step.
+static int do_reset_journal(librados::IoCtx& io_ctx,
+                            const std::string& journal_id)
+{
+  // disable/re-enable journaling to delete/re-create the journal
+  // to properly handle mirroring constraints
+  std::string image_name;
+  int r = librbd::cls_client::dir_get_name(&io_ctx, RBD_DIRECTORY, journal_id,
+                                           &image_name);
+  if (r < 0) {
+    std::cerr << "failed to locate journal's image: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  librbd::Image image;
+  r = utils::open_image(io_ctx, image_name, false, &image);
+  if (r < 0) {
+    std::cerr << "failed to open image: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  r = image.update_features(RBD_FEATURE_JOURNALING, false);
+  if (r < 0) {
+    std::cerr << "failed to disable image journaling: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  r = image.update_features(RBD_FEATURE_JOURNALING, true);
+  if (r < 0) {
+    std::cerr << "failed to re-enable image journaling: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Flags clients of journal `journal_id` as disconnected and notifies watchers
+// of the journal header.  With a non-empty `client_id` only that client is
+// affected, otherwise every registered client except the one with the empty
+// id.  Returns 0 on success, -ENOENT when no client matched, or the negative
+// error of the failing call.
+static int do_disconnect_journal_client(librados::IoCtx& io_ctx,
+                                        const std::string& journal_id,
+                                        const std::string& client_id)
+{
+  int r;
+
+  C_SaferCond cond;
+  uint64_t minimum_set;
+  uint64_t active_set;
+  std::set<cls::journal::Client> registered_clients;
+  std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+  cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+                                             &active_set, &registered_clients,
+                                             &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "warning: failed to get journal metadata" << std::endl;
+    return r;
+  }
+
+  // NOTE(review): the empty id presumably belongs to the image's own journal
+  // client, which must stay connected -- confirm against librbd.
+  static const std::string IMAGE_CLIENT_ID("");
+
+  bool found = false;
+  for (auto &c : registered_clients) {
+    if (c.id == IMAGE_CLIENT_ID || (!client_id.empty() && client_id != c.id)) {
+      continue;
+    }
+    r = cls::journal::client::client_update_state(io_ctx, oid, c.id,
+                                  cls::journal::CLIENT_STATE_DISCONNECTED);
+    if (r < 0) {
+      std::cerr << "warning: failed to disconnect client " << c.id << ": "
+                << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    std::cout << "client " << c.id << " disconnected" << std::endl;
+    found = true;
+  }
+
+  if (!found) {
+    if (!client_id.empty()) {
+      std::cerr << "warning: client " << client_id << " is not registered"
+                << std::endl;
+    } else {
+      std::cerr << "no registered clients to disconnect" << std::endl;
+    }
+    return -ENOENT;
+  }
+
+  bufferlist bl;
+  r = io_ctx.notify2(oid, bl, 5000, nullptr);
+  if (r < 0) {
+    std::cerr << "warning: failed to notify state change: "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Thin wrapper around ::journal::Journaler that registers the client during
+// init() and unregisters it during shut_down(), reporting failures on stderr.
+class Journaler : public ::journal::Journaler {
+public:
+  Journaler(librados::IoCtx& io_ctx, const std::string& journal_id,
+            const std::string &client_id) :
+    ::journal::Journaler(io_ctx, journal_id, client_id, {}) {
+  }
+
+  // Registers the client and initializes the journaler.  Returns 0 or a
+  // negative error code; the client is unregistered again if initialization
+  // fails.
+  int init() {
+    int r;
+
+    // TODO register with librbd payload
+    r = register_client(bufferlist());
+    if (r < 0) {
+      std::cerr << "failed to register client: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+
+    C_SaferCond cond;
+
+    ::journal::Journaler::init(&cond);
+    r = cond.wait();
+    if (r < 0) {
+      std::cerr << "failed to initialize journal: " << cpp_strerror(r)
+                << std::endl;
+      (void) unregister_client();
+      return r;
+    }
+
+    return 0;
+  }
+
+  // Unregisters the client, then shuts the journaler down regardless of the
+  // outcome.  Returns the result of the unregistration.
+  int shut_down() {
+    int r = unregister_client();
+    if (r < 0) {
+      std::cerr << "rbd: failed to unregister journal client: "
+                << cpp_strerror(r) << std::endl;
+    }
+    ::journal::Journaler::shut_down();
+
+    return r;
+  }
+};
+
+// Base class for tools that replay a journal and handle each entry through
+// process_entry().
+class JournalPlayer {
+public:
+  JournalPlayer(librados::IoCtx& io_ctx, const std::string& journal_id,
+                const std::string &client_id) :
+    m_journaler(io_ctx, journal_id, client_id),
+    m_cond(),
+    m_r(0) {
+  }
+
+  virtual ~JournalPlayer() {}
+
+  virtual int exec() {
+    int r;
+
+    r = m_journaler.init();
+    if (r
< 0) {
+      return r;
+    }
+
+    ReplayHandler replay_handler(this);
+
+    m_journaler.start_replay(&replay_handler);
+
+    // completed by handle_replay_complete() through stop_replay()
+    r = m_cond.wait();
+    if (r < 0) {
+      std::cerr << "rbd: failed to process journal: " << cpp_strerror(r)
+                << std::endl;
+      // keep the first error that was recorded
+      if (m_r == 0) {
+        m_r = r;
+      }
+    }
+    return m_r;
+  }
+
+  int shut_down() {
+    return m_journaler.shut_down();
+  }
+
+protected:
+  // Forwards the journaler's replay callbacks to the owning JournalPlayer.
+  struct ReplayHandler : public ::journal::ReplayHandler {
+    JournalPlayer *journal;
+    explicit ReplayHandler(JournalPlayer *_journal) : journal(_journal) {}
+
+    // no-ops: the handler is a local of exec()
+    void get() override {}
+    void put() override {}
+
+    void handle_entries_available() override {
+      journal->handle_replay_ready();
+    }
+    void handle_complete(int r) override {
+      journal->handle_replay_complete(r);
+    }
+  };
+
+  // Pops entries until none is available or process_entry() fails.
+  void handle_replay_ready() {
+    int r = 0;
+    while (true) {
+      ::journal::ReplayEntry replay_entry;
+      uint64_t tag_id;
+      if (!m_journaler.try_pop_front(&replay_entry, &tag_id)) {
+        break;
+      }
+
+      r = process_entry(replay_entry, tag_id);
+      if (r < 0) {
+        break;
+      }
+    }
+  }
+
+  // Handles one replayed entry; a negative return stops the current batch.
+  virtual int process_entry(::journal::ReplayEntry replay_entry,
+                            uint64_t tag_id) = 0;
+
+  // Records the first replay error and stops the replay, which completes
+  // m_cond.
+  void handle_replay_complete(int r) {
+    if (m_r == 0 && r < 0) {
+      m_r = r;
+    }
+    m_journaler.stop_replay(&m_cond);
+  }
+
+  Journaler m_journaler;
+  C_SaferCond m_cond;
+  int m_r;  // first error seen, 0 if none
+};
+
+// Decodes `data` into `event_entry` and, when `verbose` is set, dumps the
+// decoded entry as JSON to stdout.  Returns 0 or -EINVAL on a decode failure.
+static int inspect_entry(bufferlist& data,
+                         librbd::journal::EventEntry& event_entry,
+                         bool verbose) {
+  try {
+    auto it = data.cbegin();
+    decode(event_entry, it);
+  } catch (const buffer::error &err) {
+    std::cerr << "failed to decode event entry: " << err.what() << std::endl;
+    return -EINVAL;
+  }
+  if (verbose) {
+    JSONFormatter f(true);
+    f.open_object_section("event_entry");
+    event_entry.dump(&f);
+    f.close_section();
+    f.flush(std::cout);
+  }
+  return 0;
+}
+
+// Replays a journal as client "INSPECT", decodes every entry and prints a
+// summary of the number of entries and decode errors.
+class JournalInspector : public JournalPlayer {
+public:
+  JournalInspector(librados::IoCtx& io_ctx, const std::string& journal_id,
+                   bool verbose) :
+    JournalPlayer(io_ctx, journal_id, "INSPECT"),
+    m_verbose(verbose),
+    m_s() {
+  }
+
+  int exec() override {
+    int r = JournalPlayer::exec();
+    m_s.print();
+    return r;
+  }
+
+private:
+  struct Stats {
+    Stats() : total(0), error(0) {}
+
+    void print() {
+      std::cout << "Summary:" << std::endl
+                << " " << total << " entries inspected, " << error << " errors"
+                << std::endl;
+    }
+
+    int total;
+    int error;
+  };
+
+  // Counts the entry and tries to decode it.  A decode failure is recorded in
+  // m_r and the statistics, but 0 is returned so the inspection continues.
+  int process_entry(::journal::ReplayEntry replay_entry,
+                    uint64_t tag_id) override {
+    m_s.total++;
+    if (m_verbose) {
+      std::cout << "Entry: tag_id=" << tag_id << ", commit_tid="
+                << replay_entry.get_commit_tid() << std::endl;
+    }
+    bufferlist data = replay_entry.get_data();
+    librbd::journal::EventEntry event_entry;
+    int r = inspect_entry(data, event_entry, m_verbose);
+    if (r < 0) {
+      m_r = r;
+      m_s.error++;
+    }
+    return 0;
+  }
+
+  bool m_verbose;
+  Stats m_s;
+};
+
+// Runs a JournalInspector over `journal_id` and always shuts it down.
+// Returns the inspection error if there was one, else the shut down result.
+static int do_inspect_journal(librados::IoCtx& io_ctx,
+                              const std::string& journal_id,
+                              bool verbose) {
+  JournalInspector inspector(io_ctx, journal_id, verbose);
+  int r = inspector.exec();
+  if (r < 0) {
+    inspector.shut_down();
+    return r;
+  }
+
+  r = inspector.shut_down();
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+// One journal entry as stored in an export file; dump()/decode_json() map it
+// to and from the JSON keys "tag_id", "commit_tid", "type" and "entry".
+struct ExportEntry {
+  uint64_t tag_id;
+  uint64_t commit_tid;
+  int type;
+  bufferlist entry;
+
+  ExportEntry() : tag_id(0), commit_tid(0), type(0), entry() {}
+
+  ExportEntry(uint64_t tag_id, uint64_t commit_tid, int type,
+              const bufferlist& entry)
+    : tag_id(tag_id), commit_tid(commit_tid), type(type), entry(entry) {
+  }
+
+  void dump(Formatter *f) const {
+    ::encode_json("tag_id", tag_id, f);
+    ::encode_json("commit_tid", commit_tid, f);
+    ::encode_json("type", type, f);
+    ::encode_json("entry", entry, f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("tag_id", tag_id, obj);
+    JSONDecoder::decode_json("commit_tid", commit_tid, obj);
+    JSONDecoder::decode_json("type", type, obj);
+    JSONDecoder::decode_json("entry", entry, obj);
+  }
+};
+
+// Replays a journal as client "EXPORT" and writes every entry to the file
+// descriptor `fd`, one "<size> <json>" record per line after a
+// "# journal_id: ..." header line.
+class JournalExporter : public JournalPlayer {
+public:
+  JournalExporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+                  int fd, bool no_error, bool verbose) :
+    JournalPlayer(io_ctx, journal_id, "EXPORT"),
+    m_journal_id(journal_id),
+    m_fd(fd),
+    m_no_error(no_error),
+    m_verbose(verbose),
+    m_s() {
+  }
+
+  int exec() override {
+    std::string header("# journal_id: " + m_journal_id + "\n");
+    int r;
+    r = safe_write(m_fd, header.c_str(), header.size());
+    if (r < 0) {
+      std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+    r = JournalPlayer::exec();
+    m_s.print();
+    return r;
+  }
+
+private:
+  struct Stats {
+    Stats() : total(0), error(0) {}
+
+    void print() {
+      std::cout << total << " entries processed, " << error << " errors"
+                << std::endl;
+    }
+
+    int total;
+    int error;
+  };
+
+  // Decodes the entry to learn its event type and appends it to the export
+  // file.  An undecodable entry is counted and skipped; it stops the batch
+  // unless m_no_error is set.
+  int process_entry(::journal::ReplayEntry replay_entry,
+                    uint64_t tag_id) override {
+    m_s.total++;
+    int type = -1;
+    bufferlist entry = replay_entry.get_data();
+    librbd::journal::EventEntry event_entry;
+    int r = inspect_entry(entry, event_entry, m_verbose);
+    if (r < 0) {
+      m_s.error++;
+      m_r = r;
+      return m_no_error ?
0 : r;
+    } else {
+      type = event_entry.get_event_type();
+    }
+    ExportEntry export_entry(tag_id, replay_entry.get_commit_tid(), type,
+                             entry);
+    JSONFormatter f;
+    ::encode_json("event_entry", export_entry, &f);
+    std::ostringstream oss;
+    f.flush(oss);
+    std::string objstr = oss.str();
+    // record layout: "<json size> <json>\n"
+    std::string header = stringify(objstr.size()) + " ";
+    r = safe_write(m_fd, header.c_str(), header.size());
+    if (r == 0) {
+      r = safe_write(m_fd, objstr.c_str(), objstr.size());
+    }
+    if (r == 0) {
+      r = safe_write(m_fd, "\n", 1);
+    }
+    if (r < 0) {
+      std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+                << std::endl;
+      m_s.error++;
+      return r;
+    }
+    return 0;
+  }
+
+  std::string m_journal_id;
+  int m_fd;
+  bool m_no_error;
+  bool m_verbose;
+  Stats m_s;
+};
+
+// Exports journal `journal_id` to `path`, or to stdout when `path` is "-".
+// The file must not exist yet (it is created with O_EXCL).  `no_error` and
+// `verbose` are handed to the JournalExporter.  Returns 0 on success or a
+// negative error code from opening, exporting, closing or shutting down, in
+// that order of precedence.
+static int do_export_journal(librados::IoCtx& io_ctx,
+                             const std::string& journal_id,
+                             const std::string& path,
+                             bool no_error, bool verbose) {
+  int r;
+  int fd;
+  bool to_stdout = path == "-";
+  if (to_stdout) {
+    fd = STDOUT_FILENO;
+  } else {
+    fd = open(path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0644);
+    if (fd < 0) {
+      r = -errno;
+      std::cerr << "rbd: error creating " << path << std::endl;
+      return r;
+    }
+#ifdef HAVE_POSIX_FADVISE
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+  }
+
+  JournalExporter exporter(io_ctx, journal_id, fd, no_error, verbose);
+  r = exporter.exec();
+
+  if (!to_stdout) {
+    // close() may be the only place a deferred write error is reported, so
+    // a failure here must not be taken for a successful export
+    if (close(fd) < 0 && r == 0) {
+      r = -errno;
+      std::cerr << "rbd: error closing " << path << ": " << cpp_strerror(r)
+                << std::endl;
+    }
+  }
+
+  int shut_down_r = exporter.shut_down();
+  if (r == 0 && shut_down_r < 0) {
+    r = shut_down_r;
+  }
+
+  return r;
+}
+
+// Reads an export file from a file descriptor and appends its entries to a
+// journal as client "IMPORT".
+class JournalImporter {
+public:
+  JournalImporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+                  int fd, bool no_error, bool verbose) :
+    m_journaler(io_ctx, journal_id, "IMPORT"),
+    m_fd(fd),
+    m_no_error(no_error),
+    m_verbose(verbose) {
+  }
+
+  bool read_entry(bufferlist& bl, int& r) {
+    // Entries are stored in the file using the following format:
+    //
+    //   # Optional comments
+    //   NNN {json encoded entry}
+    //   ...
+    //
+    // Where NNN is the encoded entry size.
+    //
+    // Returns true with the entry in `bl` and r == 0.  Returns false with
+    // r == 0 at end of input, or with r < 0 on a read or format error.
+    bl.clear();
+    char buf[80];
+    // Skip line feed and comments (lines started with #).
+    while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+      if (buf[0] == '\n') {
+        continue;
+      } else if (buf[0] == '#') {
+        while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+          if (buf[0] == '\n') {
+            break;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    if (r < 0) {
+      // -EDOM is what the exact read yields at end of input here
+      if (r == -EDOM) {
+        r = 0;
+      }
+      return false;
+    }
+    // Read entry size to buf.
+    // (isdigit() needs a value representable as unsigned char)
+    if (!isdigit(static_cast<unsigned char>(buf[0]))) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (digit expected)"
+                << std::endl;
+      return false;
+    }
+    bool size_terminated = false;
+    for (size_t i = 1; i < sizeof(buf); i++) {
+      r = safe_read_exact(m_fd, buf + i, 1);
+      if (r < 0) {
+        std::cerr << "rbd: error reading import data" << std::endl;
+        return false;
+      }
+      if (!isdigit(static_cast<unsigned char>(buf[i]))) {
+        if (buf[i] != ' ') {
+          r = -EINVAL;
+          std::cerr << "rbd: import data invalid format (space expected)"
+                    << std::endl;
+          return false;
+        }
+        buf[i] = '\0';
+        size_terminated = true;
+        break;
+      }
+    }
+    if (!size_terminated) {
+      // the buffer filled up with digits: it holds no terminating NUL and
+      // must not be parsed as a string
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (entry size too long)"
+                << std::endl;
+      return false;
+    }
+    // strtoull() saturates instead of overflowing; the round trip through
+    // int below rejects every size that read_fd()'s int result cannot match
+    const unsigned long long parsed_size = strtoull(buf, nullptr, 10);
+    if (parsed_size == 0) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (zero entry size)"
+                << std::endl;
+      return false;
+    }
+    const int entry_size = static_cast<int>(parsed_size);
+    if (entry_size <= 0 ||
+        static_cast<unsigned long long>(entry_size) != parsed_size) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (entry size out of range)"
+                << std::endl;
+      return false;
+    }
+    // Read entry.
+    r = bl.read_fd(m_fd, entry_size);
+    if (r < 0) {
+      std::cerr << "rbd: error reading import data: " << cpp_strerror(r)
+                << std::endl;
+      return false;
+    }
+    if (r != entry_size) {
+      std::cerr << "rbd: error reading import data: truncated"
+                << std::endl;
+      r = -EINVAL;
+      return false;
+    }
+    r = 0;
+    return true;
+  }
+
+  // Reads every entry from the input, validates it and appends it to the
+  // journal, then waits for the appends to complete.  Without m_no_error the
+  // first bad entry stops the import; with it bad entries are skipped.  In
+  // both cases the error is reported through the return value.
+  int exec() {
+    int r = m_journaler.init();
+    if (r < 0) {
+      return r;
+    }
+    m_journaler.start_append(0);
+
+    int r1 = 0;   // last per-entry or read error, kept across the flush
+    bufferlist bl;
+    int n = 0;
+    // incremented up front and decremented once the entry was appended
+    int error_count = 0;
+    while (read_entry(bl, r)) {
+      n++;
+      error_count++;
+      JSONParser p;
+      if (!p.parse(bl.c_str(), bl.length())) {
+        std::cerr << "rbd: error parsing input (entry " << n << ")"
+                  << std::endl;
+        r = -EINVAL;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      ExportEntry e;
+      try {
+        decode_json_obj(e, &p);
+      } catch (JSONDecoder::err& err) {
+        std::cerr << "rbd: error json decoding import data (entry " << n << "):"
+                  << err.message << std::endl;
+        r = -EINVAL;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      librbd::journal::EventEntry event_entry;
+      r = inspect_entry(e.entry, event_entry, m_verbose);
+      if (r < 0) {
+        std::cerr << "rbd: corrupted entry " << n << ": tag_tid=" << e.tag_id
+                  << ", commit_tid=" << e.commit_tid << std::endl;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      m_journaler.append(e.tag_id, e.entry);
+      error_count--;
+    }
+
+    // `r` is reused for the flush result below, so remember the error that
+    // ended the loop (a bad entry or a failed read) before it is overwritten
+    if (r < 0 && r1 == 0) {
+      r1 = r;
+    }
+
+    std::cout << n << " entries processed, " << error_count << " errors" << std::endl;
+
+    std::cout << "Waiting for journal append to complete..."
<< std::endl;
+
+    C_SaferCond cond;
+    m_journaler.stop_append(&cond);
+    r = cond.wait();
+
+    if (r < 0) {
+      std::cerr << "failed to append journal: " << cpp_strerror(r) << std::endl;
+    }
+
+    // a failed append takes precedence over an error recorded in r1
+    if (r1 < 0 && r == 0) {
+      r = r1;
+    }
+    return r;
+  }
+
+  int shut_down() {
+    return m_journaler.shut_down();
+  }
+
+private:
+  Journaler m_journaler;
+  int m_fd;          // input file descriptor, not owned
+  bool m_no_error;   // skip bad entries instead of stopping
+  bool m_verbose;    // passed to inspect_entry()
+};
+
+// Imports the entries stored in `path` (stdin when `path` is "-") into
+// journal `journal_id`.  `no_error` and `verbose` are handed to the
+// JournalImporter.  Returns 0 on success or a negative error code; an import
+// error takes precedence over a shut down error.
+static int do_import_journal(librados::IoCtx& io_ctx,
+                             const std::string& journal_id,
+                             const std::string& path,
+                             bool no_error, bool verbose) {
+  int r;
+
+  int fd;
+  bool from_stdin = path == "-";
+  if (from_stdin) {
+    fd = STDIN_FILENO;
+  } else {
+    if ((fd = open(path.c_str(), O_RDONLY)) < 0) {
+      r = -errno;
+      std::cerr << "rbd: error opening " << path << std::endl;
+      return r;
+    }
+#ifdef HAVE_POSIX_FADVISE
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+  }
+
+  JournalImporter importer(io_ctx, journal_id, fd, no_error, verbose);
+  r = importer.exec();
+
+  // stdin is not ours to close
+  if (!from_stdin) {
+    close(fd);
+  }
+
+  int shut_down_r = importer.shut_down();
+  if (r == 0 && shut_down_r < 0) {
+    r = shut_down_r;
+  }
+
+  return r;
+}
+
+// Arguments of "journal info": a journal spec and the format options.
+void get_info_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Handler of "journal info": resolves the journal, connects to the pool and
+// prints the journal's immutable metadata.  Returns 0 or a negative error
+// code.
+int execute_info(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r =
do_show_journal_info(rados, io_ctx, journal_name, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: journal info: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+
+}
+
+// Arguments of "journal status": a journal spec and the format options.
+void get_status_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Handler of "journal status": resolves the journal, connects to the pool and
+// prints the journal's mutable metadata.  Returns 0 or a negative error code.
+int execute_status(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_journal_status(io_ctx, journal_name, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: journal status: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Arguments of "journal reset": a journal spec.
+void get_reset_arguments(po::options_description *positional,
+                         po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+// Handler of "journal reset": resolves the journal, connects to the pool and
+// resets the journal.  Returns 0 or a negative error code.
+int execute_reset(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_reset_journal(io_ctx, journal_name);
+  if (r < 0) {
+    std::cerr << "rbd: journal reset: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Arguments of "journal client disconnect": a journal spec and the optional
+// "client-id" option.
+void get_client_disconnect_arguments(po::options_description *positional,
+                                     po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  options->add_options()
+    ("client-id", po::value<std::string>(),
+     "client ID (or leave unspecified to disconnect all)");
+}
+
+// Handler of "journal client disconnect": resolves the journal, connects to
+// the pool and disconnects the selected client (an empty id selects all).
+// Returns 0 or a negative error code.
+int execute_client_disconnect(const po::variables_map &vm,
+                              const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string client_id;
+  if (vm.count("client-id")) {
+    client_id = vm["client-id"].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_disconnect_journal_client(io_ctx, journal_name, client_id);
+  if (r < 0) {
+    std::cerr << "rbd: journal client disconnect: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Arguments of "journal inspect": a journal spec and the verbose option.
+void get_inspect_arguments(po::options_description *positional,
+                           po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_verbose_option(options);
+}
+
+// Handler of "journal inspect": resolves the journal, connects to the pool
+// and inspects the journal entries.  Returns 0 or a negative error code.
+int execute_inspect(const po::variables_map &vm,
+                    const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_inspect_journal(io_ctx, journal_name, vm[at::VERBOSE].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: journal inspect: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Arguments of "journal export": the source journal spec, the export path
+// and the verbose and no-error options.
+void get_export_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  add_journal_spec_options(positional, options,
+                           at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_path_options(positional, options,
+                       "export file (or '-' for stdout)");
+  at::add_verbose_option(options);
+  at::add_no_error_option(options);
+}
+
+// Handler of "journal export": resolves the source journal, then the path,
+// connects to the pool and exports the journal.  Returns 0 or a negative
+// error code.
+int execute_export(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string path;
+  r = utils::get_path(vm, &arg_index, &path);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_export_journal(io_ctx, journal_name, path, vm[at::NO_ERROR].as<bool>(),
+                        vm[at::VERBOSE].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: journal export: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Arguments of "journal import": the import path first, then the destination
+// journal spec, and the verbose and no-error options.
+void get_import_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_path_options(positional, options,
+                       "import file (or '-' for stdin)");
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_verbose_option(options);
+  at::add_no_error_option(options);
+}
+
+// Handler of "journal import": resolves the path, then the destination
+// journal (the positional order mirrors get_import_arguments()), connects to
+// the pool and imports the journal.  Returns 0 or a negative error code.
+int execute_import(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  std::string path;
+  size_t arg_index = 0;
+  int r = utils::get_path(vm, &arg_index, &path);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_DEST, &arg_index,
+                             &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_import_journal(io_ctx, journal_name, path, vm[at::NO_ERROR].as<bool>(),
+                        vm[at::VERBOSE].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: journal import: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Command registrations: each binds a "journal ..." command path to its
+// argument and execute callbacks.
+Shell::Action action_info(
+  {"journal", "info"}, {}, "Show information about image journal.", "",
+  &get_info_arguments, &execute_info);
+
+Shell::Action action_status(
+  {"journal", "status"}, {}, "Show status of image journal.", "",
+  &get_status_arguments, &execute_status);
+
+Shell::Action action_reset(
+  {"journal", "reset"}, {}, "Reset image journal.", "",
+  &get_reset_arguments, &execute_reset);
+
+Shell::Action action_inspect(
+  {"journal", "inspect"}, {}, "Inspect image journal for structural errors.", "",
+  &get_inspect_arguments, &execute_inspect);
+
+Shell::Action action_export(
+  {"journal", "export"}, {}, "Export image journal.", "",
+  &get_export_arguments, &execute_export);
+
+Shell::Action action_import(
+  {"journal", "import"}, {}, "Import image journal.", "",
+  &get_import_arguments, &execute_import);
+
+Shell::Action action_disconnect(
+  {"journal", "client", "disconnect"}, {},
+  "Flag image journal client as disconnected.", "",
+  &get_client_disconnect_arguments, &execute_client_disconnect);
+
+} // namespace journal
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc
new file mode 100644
index 00000000..dc0938eb
--- /dev/null
+++ b/src/tools/rbd/action/Kernel.cc
@@ -0,0 +1,561 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include
"acconfig.h" +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/krbd.h" +#include "include/stringify.h" +#include "include/uuid.h" +#include "common/config_proxy.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "common/strtol.h" +#include "common/Formatter.h" +#include "msg/msg_types.h" +#include "global/global_context.h" +#include <iostream> +#include <boost/algorithm/string/predicate.hpp> +#include <boost/scope_exit.hpp> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace kernel { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +std::map<std::string, std::string> map_options; // used for both map and unmap + +} // anonymous namespace + +static std::string map_option_uuid_cb(const char *value_char) +{ + uuid_d u; + if (!u.parse(value_char)) + return ""; + + return stringify(u); +} + +static std::string map_option_ip_cb(const char *value_char) +{ + entity_addr_t a; + const char *endptr; + if (!a.parse(value_char, &endptr) || + endptr != value_char + strlen(value_char)) { + return ""; + } + + return stringify(a.get_sockaddr()); +} + +static std::string map_option_int_cb(const char *value_char) +{ + std::string err; + int d = strict_strtol(value_char, 10, &err); + if (!err.empty() || d < 0) + return ""; + + return stringify(d); +} + +static std::string map_option_ms_mode_cb(const char *value_char) +{ + if (!strcmp(value_char, "legacy") || !strcmp(value_char, "crc") || + !strcmp(value_char, "secure") || !strcmp(value_char, "prefer-crc") || + !strcmp(value_char, "prefer-secure")) { + return value_char; + } + return ""; +} + +static void put_map_option(const std::string &key, const std::string &val) +{ + map_options[key] = val; +} + +static int put_map_option_value(const std::string &opt, const char *value_char, + std::string (*parse_cb)(const char *)) +{ + if (!value_char || *value_char == '\0') { + 
std::cerr << "rbd: " << opt << " option requires a value" << std::endl; + return -EINVAL; + } + + std::string value = parse_cb(value_char); + if (value.empty()) { + std::cerr << "rbd: invalid " << opt << " value '" << value_char << "'" + << std::endl; + return -EINVAL; + } + + put_map_option(opt, opt + "=" + value); + return 0; +} + +static int parse_map_options(const std::string &options_string) +{ + char *options = strdup(options_string.c_str()); + BOOST_SCOPE_EXIT(options) { + free(options); + } BOOST_SCOPE_EXIT_END; + + for (char *this_char = strtok(options, ", "); + this_char != NULL; + this_char = strtok(NULL, ",")) { + char *value_char; + + if ((value_char = strchr(this_char, '=')) != NULL) + *value_char++ = '\0'; + + if (!strcmp(this_char, "fsid")) { + if (put_map_option_value("fsid", value_char, map_option_uuid_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "ip")) { + if (put_map_option_value("ip", value_char, map_option_ip_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "share") || !strcmp(this_char, "noshare")) { + put_map_option("share", this_char); + } else if (!strcmp(this_char, "crc") || !strcmp(this_char, "nocrc")) { + put_map_option("crc", this_char); + } else if (!strcmp(this_char, "cephx_require_signatures") || + !strcmp(this_char, "nocephx_require_signatures")) { + put_map_option("cephx_require_signatures", this_char); + } else if (!strcmp(this_char, "tcp_nodelay") || + !strcmp(this_char, "notcp_nodelay")) { + put_map_option("tcp_nodelay", this_char); + } else if (!strcmp(this_char, "cephx_sign_messages") || + !strcmp(this_char, "nocephx_sign_messages")) { + put_map_option("cephx_sign_messages", this_char); + } else if (!strcmp(this_char, "mount_timeout")) { + if (put_map_option_value("mount_timeout", value_char, map_option_int_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "osd_request_timeout")) { + if (put_map_option_value("osd_request_timeout", value_char, map_option_int_cb)) + return -EINVAL; + } else if 
(!strcmp(this_char, "lock_timeout")) { + if (put_map_option_value("lock_timeout", value_char, map_option_int_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "osdkeepalive")) { + if (put_map_option_value("osdkeepalive", value_char, map_option_int_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "osd_idle_ttl")) { + if (put_map_option_value("osd_idle_ttl", value_char, map_option_int_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "rw") || !strcmp(this_char, "ro")) { + put_map_option("rw", this_char); + } else if (!strcmp(this_char, "queue_depth")) { + if (put_map_option_value("queue_depth", value_char, map_option_int_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "lock_on_read")) { + put_map_option("lock_on_read", this_char); + } else if (!strcmp(this_char, "exclusive")) { + put_map_option("exclusive", this_char); + } else if (!strcmp(this_char, "notrim")) { + put_map_option("notrim", this_char); + } else if (!strcmp(this_char, "abort_on_full")) { + put_map_option("abort_on_full", this_char); + } else if (!strcmp(this_char, "alloc_size")) { + if (put_map_option_value("alloc_size", value_char, map_option_int_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "ms_mode")) { + if (put_map_option_value("ms_mode", value_char, map_option_ms_mode_cb)) + return -EINVAL; + } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) { + put_map_option("udev", this_char); + } else { + std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl; + return -EINVAL; + } + } + + return 0; +} + +static int parse_unmap_options(const std::string &options_string) +{ + char *options = strdup(options_string.c_str()); + BOOST_SCOPE_EXIT(options) { + free(options); + } BOOST_SCOPE_EXIT_END; + + for (char *this_char = strtok(options, ", "); + this_char != NULL; + this_char = strtok(NULL, ",")) { + char *value_char; + + if ((value_char = strchr(this_char, '=')) != NULL) + *value_char++ = '\0'; + + if (!strcmp(this_char, "force")) { + 
put_map_option("force", this_char); + } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) { + put_map_option("udev", this_char); + } else { + std::cerr << "rbd: unknown unmap option '" << this_char << "'" << std::endl; + return -EINVAL; + } + } + + return 0; +} + +static int do_kernel_list(Formatter *f) { +#if defined(WITH_KRBD) + struct krbd_ctx *krbd; + int r; + + r = krbd_create_from_context(g_ceph_context, 0, &krbd); + if (r < 0) + return r; + + r = krbd_showmapped(krbd, f); + + krbd_destroy(krbd); + return r; +#else + std::cerr << "rbd: kernel device is not supported" << std::endl; + return -EOPNOTSUPP; +#endif +} + +static int get_unsupported_features(librbd::Image &image, + uint64_t *unsupported_features) +{ + char buf[20]; + uint64_t features, supported_features; + int r; + + r = safe_read_file("/sys/bus/rbd/", "supported_features", buf, + sizeof(buf) - 1); + if (r < 0) + return r; + + buf[r] = '\0'; + try { + supported_features = std::stoull(buf, nullptr, 16); + } catch (...) { + return -EINVAL; + } + + r = image.features(&features); + if (r < 0) + return r; + + *unsupported_features = features & ~supported_features; + return 0; +} + +/* + * hint user to check syslog for krbd related messages and provide suggestions + * based on errno return by krbd_map(). also note that even if some librbd calls + * fail, we at least dump the "try dmesg..." message to aid debugging. 
+ */ +static void print_error_description(const char *poolname, + const char *nspace_name, + const char *imgname, + const char *snapname, + int maperrno) +{ + int r; + uint8_t oldformat; + librados::Rados rados; + librados::IoCtx ioctx; + librbd::Image image; + + if (maperrno == -ENOENT) + goto done; + + r = utils::init_and_open_image(poolname, nspace_name, imgname, "", snapname, + true, &rados, &ioctx, &image); + if (r < 0) + goto done; + + r = image.old_format(&oldformat); + if (r < 0) + goto done; + + /* + * kernel returns -ENXIO when mapping a V2 image due to unsupported feature + * set - so, hint about that too... + */ + if (!oldformat && (maperrno == -ENXIO)) { + uint64_t unsupported_features; + bool need_terminate = true; + + std::cout << "RBD image feature set mismatch. "; + r = get_unsupported_features(image, &unsupported_features); + if (r == 0 && (unsupported_features & ~RBD_FEATURES_ALL) == 0) { + uint64_t immutable = RBD_FEATURES_ALL & ~(RBD_FEATURES_MUTABLE | + RBD_FEATURES_DISABLE_ONLY); + if (unsupported_features & immutable) { + std::cout << "This image cannot be mapped because the following " + << "immutable features are unsupported by the kernel:"; + unsupported_features &= immutable; + need_terminate = false; + } else { + std::cout << "You can disable features unsupported by the kernel " + << "with \"rbd feature disable "; + if (poolname != utils::get_default_pool_name() || *nspace_name) { + std::cout << poolname << "/"; + } + if (*nspace_name) { + std::cout << nspace_name << "/"; + } + std::cout << imgname; + } + } else { + std::cout << "Try disabling features unsupported by the kernel " + << "with \"rbd feature disable"; + unsupported_features = 0; + } + for (auto it : at::ImageFeatures::FEATURE_MAPPING) { + if (it.first & unsupported_features) { + std::cout << " " << it.second; + } + } + if (need_terminate) + std::cout << "\""; + std::cout << "." 
<< std::endl; + } + + done: + std::cout << "In some cases useful info is found in syslog - try \"dmesg | tail\"." << std::endl; +} + +static int do_kernel_map(const char *poolname, const char *nspace_name, + const char *imgname, const char *snapname) +{ +#if defined(WITH_KRBD) + struct krbd_ctx *krbd; + std::ostringstream oss; + uint32_t flags = 0; + char *devnode; + int r; + + for (auto it = map_options.begin(); it != map_options.end(); ) { + // for compatibility with < 3.7 kernels, assume that rw is on by + // default and omit it even if it was specified by the user + // (see ceph.git commit fb0f1986449b) + if (it->first == "rw" && it->second == "rw") { + it = map_options.erase(it); + } else if (it->first == "udev") { + if (it->second == "noudev") { + flags |= KRBD_CTX_F_NOUDEV; + } + it = map_options.erase(it); + } else { + if (it != map_options.begin()) + oss << ","; + oss << it->second; + ++it; + } + } + + r = krbd_create_from_context(g_ceph_context, flags, &krbd); + if (r < 0) + return r; + + r = krbd_is_mapped(krbd, poolname, nspace_name, imgname, snapname, &devnode); + if (r < 0) { + std::cerr << "rbd: warning: can't get image map information: " + << cpp_strerror(r) << std::endl; + } else if (r > 0) { + std::cerr << "rbd: warning: image already mapped as " << devnode + << std::endl; + free(devnode); + } + + r = krbd_map(krbd, poolname, nspace_name, imgname, snapname, + oss.str().c_str(), &devnode); + if (r < 0) { + print_error_description(poolname, nspace_name, imgname, snapname, r); + goto out; + } + + std::cout << devnode << std::endl; + + free(devnode); +out: + krbd_destroy(krbd); + return r; +#else + std::cerr << "rbd: kernel device is not supported" << std::endl; + return -EOPNOTSUPP; +#endif +} + +static int do_kernel_unmap(const char *dev, const char *poolname, + const char *nspace_name, const char *imgname, + const char *snapname) +{ +#if defined(WITH_KRBD) + struct krbd_ctx *krbd; + std::ostringstream oss; + uint32_t flags = 0; + int r; + + for 
(auto it = map_options.begin(); it != map_options.end(); ) { + if (it->first == "udev") { + if (it->second == "noudev") { + flags |= KRBD_CTX_F_NOUDEV; + } + it = map_options.erase(it); + } else { + if (it != map_options.begin()) + oss << ","; + oss << it->second; + ++it; + } + } + + r = krbd_create_from_context(g_ceph_context, flags, &krbd); + if (r < 0) + return r; + + if (dev) + r = krbd_unmap(krbd, dev, oss.str().c_str()); + else + r = krbd_unmap_by_spec(krbd, poolname, nspace_name, imgname, snapname, + oss.str().c_str()); + + krbd_destroy(krbd); + return r; +#else + std::cerr << "rbd: kernel device is not supported" << std::endl; + return -EOPNOTSUPP; +#endif +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + at::Format::Formatter formatter; + int r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + utils::init_context(); + + r = do_kernel_list(formatter.get()); + if (r < 0) { + std::cerr << "rbd: device list failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int execute_map(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string nspace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + // parse default options first so they can be overwritten by cli options + r = parse_map_options( + g_conf().get_val<std::string>("rbd_default_map_options")); + if (r < 0) { + std::cerr << "rbd: couldn't parse default map options" << std::endl; + return r; + } + + if (vm.count("options")) { + for (auto &options : vm["options"].as<std::vector<std::string>>()) { + r = parse_map_options(options); + if (r < 0) { + 
std::cerr << "rbd: couldn't parse map options" << std::endl; + return r; + } + } + } + + // parse options common to all device types after parsing krbd-specific + // options so that common options win (in particular "-o rw --read-only" + // should result in read-only mapping) + if (vm["read-only"].as<bool>()) { + put_map_option("rw", "ro"); + } + if (vm["exclusive"].as<bool>()) { + put_map_option("exclusive", "exclusive"); + } + + utils::init_context(); + + r = do_kernel_map(pool_name.c_str(), nspace_name.c_str(), image_name.c_str(), + snap_name.c_str()); + if (r < 0) { + std::cerr << "rbd: map failed: " << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int execute_unmap(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string device_name = utils::get_positional_argument(vm, 0); + if (!boost::starts_with(device_name, "/dev/")) { + device_name.clear(); + } + + size_t arg_index = 0; + std::string pool_name; + std::string nspace_name; + std::string image_name; + std::string snap_name; + int r; + if (device_name.empty()) { + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name, + &image_name, &snap_name, false, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + } + + if (device_name.empty() && image_name.empty()) { + std::cerr << "rbd: unmap requires either image name or device path" + << std::endl; + return -EINVAL; + } + + if (vm.count("options")) { + for (auto &options : vm["options"].as<std::vector<std::string>>()) { + r = parse_unmap_options(options); + if (r < 0) { + std::cerr << "rbd: couldn't parse unmap options" << std::endl; + return r; + } + } + } + + utils::init_context(); + + r = do_kernel_unmap(device_name.empty() ? 
nullptr : device_name.c_str(), + pool_name.c_str(), nspace_name.c_str(), + image_name.c_str(), snap_name.c_str()); + if (r < 0) { + std::cerr << "rbd: unmap failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +} // namespace kernel +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/List.cc b/src/tools/rbd/action/List.cc new file mode 100644 index 00000000..e6025418 --- /dev/null +++ b/src/tools/rbd/action/List.cc @@ -0,0 +1,340 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/Context.h" +#include "include/stringify.h" +#include "include/types.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <iostream> +#include <boost/bind.hpp> +#include <boost/program_options.hpp> +#include "global/global_context.h" + +namespace rbd { + +namespace action { +namespace list { + +namespace at = argument_types; +namespace po = boost::program_options; + +enum WorkerState { + STATE_IDLE = 0, + STATE_OPENED, + STATE_DONE +} ; + +struct WorkerEntry { + librbd::Image img; + librbd::RBD::AioCompletion* completion; + WorkerState state; + string name; + + WorkerEntry() { + state = STATE_IDLE; + completion = nullptr; + } +}; + + +int list_process_image(librados::Rados* rados, WorkerEntry* w, bool lflag, Formatter *f, TextTable &tbl) +{ + int r = 0; + librbd::image_info_t info; + std::string parent; + + // handle second-nth trips through loop + librbd::linked_image_spec_t parent_image_spec; + librbd::snap_spec_t parent_snap_spec; + r = w->img.get_parent(&parent_image_spec, &parent_snap_spec); + if (r < 0 && r != -ENOENT) { + return r; + } + + bool has_parent = false; + if (r != -ENOENT) { + parent = parent_image_spec.pool_name + "/"; + if (!parent_image_spec.pool_namespace.empty()) { + parent += 
parent_image_spec.pool_namespace + "/"; + } + parent += parent_image_spec.image_name + "@" + parent_snap_spec.name; + has_parent = true; + } + + if (w->img.stat(info, sizeof(info)) < 0) { + return -EINVAL; + } + + uint8_t old_format; + w->img.old_format(&old_format); + + std::list<librbd::locker_t> lockers; + bool exclusive; + r = w->img.list_lockers(&lockers, &exclusive, NULL); + if (r < 0) + return r; + std::string lockstr; + if (!lockers.empty()) { + lockstr = (exclusive) ? "excl" : "shr"; + } + + if (f) { + f->open_object_section("image"); + f->dump_string("image", w->name); + f->dump_unsigned("size", info.size); + if (has_parent) { + f->open_object_section("parent"); + f->dump_string("pool", parent_image_spec.pool_name); + f->dump_string("pool_namespace", parent_image_spec.pool_namespace); + f->dump_string("image", parent_image_spec.image_name); + f->dump_string("snapshot", parent_snap_spec.name); + f->close_section(); + } + f->dump_int("format", old_format ? 1 : 2); + if (!lockers.empty()) + f->dump_string("lock_type", exclusive ? "exclusive" : "shared"); + f->close_section(); + } else { + tbl << w->name + << stringify(byte_u_t(info.size)) + << parent + << ((old_format) ? 
'1' : '2') + << "" // protect doesn't apply to images + << lockstr + << TextTable::endrow; + } + + std::vector<librbd::snap_info_t> snaplist; + if (w->img.snap_list(snaplist) >= 0 && !snaplist.empty()) { + snaplist.erase(remove_if(snaplist.begin(), + snaplist.end(), + boost::bind(utils::is_not_user_snap_namespace, &w->img, _1)), + snaplist.end()); + for (std::vector<librbd::snap_info_t>::iterator s = snaplist.begin(); + s != snaplist.end(); ++s) { + bool is_protected; + bool has_parent = false; + parent.clear(); + w->img.snap_set(s->name.c_str()); + r = w->img.snap_is_protected(s->name.c_str(), &is_protected); + if (r < 0) + return r; + if (w->img.get_parent(&parent_image_spec, &parent_snap_spec) >= 0) { + parent = parent_image_spec.pool_name + "/"; + if (!parent_image_spec.pool_namespace.empty()) { + parent += parent_image_spec.pool_namespace + "/"; + } + parent += parent_image_spec.image_name + "@" + parent_snap_spec.name; + has_parent = true; + } + if (f) { + f->open_object_section("snapshot"); + f->dump_string("image", w->name); + f->dump_string("snapshot", s->name); + f->dump_unsigned("size", s->size); + if (has_parent) { + f->open_object_section("parent"); + f->dump_string("pool", parent_image_spec.pool_name); + f->dump_string("pool_namespace", parent_image_spec.pool_namespace); + f->dump_string("image", parent_image_spec.image_name); + f->dump_string("snapshot", parent_snap_spec.name); + f->close_section(); + } + f->dump_int("format", old_format ? 1 : 2); + f->dump_string("protected", is_protected ? "true" : "false"); + f->close_section(); + } else { + tbl << w->name + "@" + s->name + << stringify(byte_u_t(s->size)) + << parent + << ((old_format) ? '1' : '2') + << (is_protected ? 
"yes" : "") + << "" // locks don't apply to snaps + << TextTable::endrow; + } + } + } + + return 0; +} + +int do_list(const std::string &pool_name, const std::string& namespace_name, + bool lflag, int threads, Formatter *f) { + std::vector<WorkerEntry*> workers; + std::vector<librbd::image_spec_t> images; + librados::Rados rados; + librbd::RBD rbd; + librados::IoCtx ioctx; + + if (threads < 1) { + threads = 1; + } + if (threads > 32) { + threads = 32; + } + + int r = utils::init(pool_name, namespace_name, &rados, &ioctx); + if (r < 0) { + return r; + } + + utils::disable_cache(); + + r = rbd.list2(ioctx, &images); + if (r < 0) + return r; + + if (!lflag) { + if (f) + f->open_array_section("images"); + for (auto& image : images) { + if (f) + f->dump_string("name", image.name); + else + std::cout << image.name << std::endl; + } + if (f) { + f->close_section(); + f->flush(std::cout); + } + return 0; + } + + TextTable tbl; + + if (f) { + f->open_array_section("images"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("FMT", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("PROT", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("LOCK", TextTable::LEFT, TextTable::LEFT); + } + + for (size_t left = 0; left < std::min<size_t>(threads, images.size()); + left++) { + workers.push_back(new WorkerEntry()); + } + + auto i = images.begin(); + while (true) { + size_t workers_idle = 0; + for (auto comp : workers) { + switch (comp->state) { + case STATE_DONE: + comp->completion->wait_for_complete(); + comp->state = STATE_IDLE; + comp->completion->release(); + comp->completion = nullptr; + // we want it to fall through in this case + case STATE_IDLE: + if (i == images.end()) { + workers_idle++; + continue; + } + comp->name = i->name; + comp->completion = new librbd::RBD::AioCompletion(nullptr, 
nullptr); + r = rbd.aio_open_read_only(ioctx, comp->img, i->name.c_str(), nullptr, + comp->completion); + i++; + comp->state = STATE_OPENED; + break; + case STATE_OPENED: + comp->completion->wait_for_complete(); + // image might disappear between rbd.list() and rbd.open(); ignore + // that, warn about other possible errors (EPERM, say, for opening + // an old-format image, because you need execute permission for the + // class method) + r = comp->completion->get_return_value(); + comp->completion->release(); + if (r < 0) { + std::cerr << "rbd: error opening " << comp->name << ": " + << cpp_strerror(r) << std::endl; + + // in any event, continue to next image + comp->state = STATE_IDLE; + continue; + } + r = list_process_image(&rados, comp, lflag, f, tbl); + if (r < 0) { + std::cerr << "rbd: error processing image " << comp->name << ": " + << cpp_strerror(r) << std::endl; + } + comp->completion = new librbd::RBD::AioCompletion(nullptr, nullptr); + r = comp->img.aio_close(comp->completion); + comp->state = STATE_DONE; + break; + } + } + if (workers_idle == workers.size()) { + break; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else if (!images.empty()) { + std::cout << tbl; + } + + rados.shutdown(); + + for (auto comp : workers) { + delete comp; + } + + return r < 0 ? 
r : 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + options->add_options() + ("long,l", po::bool_switch(), "long listing format"); + at::add_pool_options(positional, options, true); + at::add_format_options(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + r = do_list(pool_name, namespace_name, vm["long"].as<bool>(), + g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), + formatter.get()); + if (r < 0) { + std::cerr << "rbd: listing images failed: " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +Shell::SwitchArguments switched_arguments({"long", "l"}); +Shell::Action action( + {"list"}, {"ls"}, "List rbd images.", "", &get_arguments, &execute); + +} // namespace list +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Lock.cc b/src/tools/rbd/action/Lock.cc new file mode 100644 index 00000000..754cb384 --- /dev/null +++ b/src/tools/rbd/action/Lock.cc @@ -0,0 +1,279 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace lock { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +void add_id_option(po::options_description *positional) { + positional->add_options() + ("lock-id", "unique 
lock id"); +} + +int get_id(const po::variables_map &vm, size_t *arg_index, + std::string *id) { + *id = utils::get_positional_argument(vm, *arg_index); + if (id->empty()) { + std::cerr << "rbd: lock id was not specified" << std::endl; + return -EINVAL; + } else { + ++(*arg_index); + } + return 0; +} + +} // anonymous namespace + +static int do_lock_list(librbd::Image& image, Formatter *f) +{ + std::list<librbd::locker_t> lockers; + bool exclusive; + std::string tag; + TextTable tbl; + int r; + + r = image.list_lockers(&lockers, &exclusive, &tag); + if (r < 0) + return r; + + if (f) { + f->open_array_section("locks"); + } else { + tbl.define_column("Locker", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("Address", TextTable::LEFT, TextTable::LEFT); + } + + if (lockers.size()) { + bool one = (lockers.size() == 1); + + if (!f) { + std::cout << "There " << (one ? "is " : "are ") << lockers.size() + << (exclusive ? " exclusive" : " shared") + << " lock" << (one ? 
"" : "s") << " on this image.\n"; + if (!exclusive) + std::cout << "Lock tag: " << tag << "\n"; + } + + for (std::list<librbd::locker_t>::const_iterator it = lockers.begin(); + it != lockers.end(); ++it) { + if (f) { + f->open_object_section("lock"); + f->dump_string("id", it->cookie); + f->dump_string("locker", it->client); + f->dump_string("address", it->address); + f->close_section(); + } else { + tbl << it->client << it->cookie << it->address << TextTable::endrow; + } + } + if (!f) + std::cout << tbl; + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } + return 0; +} + +static int do_lock_add(librbd::Image& image, const char *cookie, + const char *tag) +{ + if (tag) + return image.lock_shared(cookie, tag); + else + return image.lock_exclusive(cookie); +} + +static int do_lock_remove(librbd::Image& image, const char *client, + const char *cookie) +{ + return image.break_lock(client, cookie); +} + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_format_options(options); +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_lock_list(image, 
formatter.get()); + if (r < 0) { + std::cerr << "rbd: listing locks failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +void get_add_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_id_option(positional); + options->add_options() + ("shared", po::value<std::string>(), "shared lock tag"); +} + +int execute_add(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string lock_cookie; + r = get_id(vm, &arg_index, &lock_cookie); + if (r < 0) { + return r; + } + + std::string lock_tag; + if (vm.count("shared")) { + lock_tag = vm["shared"].as<std::string>(); + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_lock_add(image, lock_cookie.c_str(), + lock_tag.empty() ? 
nullptr : lock_tag.c_str()); + if (r < 0) { + if (r == -EBUSY || r == -EEXIST) { + if (!lock_tag.empty()) { + std::cerr << "rbd: lock is already held by someone else" + << " with a different tag" << std::endl; + } else { + std::cerr << "rbd: lock is already held by someone else" << std::endl; + } + } else { + std::cerr << "rbd: taking lock failed: " << cpp_strerror(r) << std::endl; + } + return r; + } + return 0; +} + +void get_remove_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + add_id_option(positional); + positional->add_options() + ("locker", "locker client"); +} + +int execute_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string lock_cookie; + r = get_id(vm, &arg_index, &lock_cookie); + if (r < 0) { + return r; + } + + std::string lock_client = utils::get_positional_argument(vm, arg_index); + if (lock_client.empty()) { + std::cerr << "rbd: locker was not specified" << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_lock_remove(image, lock_client.c_str(), lock_cookie.c_str()); + if (r < 0) { + std::cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action_list( + {"lock", "list"}, {"lock", "ls"}, "Show locks held on an image.", "", 
+ &get_list_arguments, &execute_list); +Shell::Action action_add( + {"lock", "add"}, {}, "Take a lock on an image.", "", + &get_add_arguments, &execute_add); +Shell::Action action_remove( + {"lock", "remove"}, {"lock", "rm"}, "Release a lock on an image.", "", + &get_remove_arguments, &execute_remove); + +} // namespace lock +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/MergeDiff.cc b/src/tools/rbd/action/MergeDiff.cc new file mode 100644 index 00000000..406b23b4 --- /dev/null +++ b/src/tools/rbd/action/MergeDiff.cc @@ -0,0 +1,454 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#define _LARGEFILE64_SOURCE +#include <sys/types.h> +#include <unistd.h> + +#include "include/compat.h" +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/safe_io.h" +#include "common/debug.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd + +namespace rbd { +namespace action { +namespace merge_diff { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int parse_diff_header(int fd, __u8 *tag, string *from, string *to, uint64_t *size) +{ + int r; + + {//header + char buf[utils::RBD_DIFF_BANNER.size() + 1]; + r = safe_read_exact(fd, buf, utils::RBD_DIFF_BANNER.size()); + if (r < 0) + return r; + + buf[utils::RBD_DIFF_BANNER.size()] = '\0'; + if (strcmp(buf, utils::RBD_DIFF_BANNER.c_str())) { + std::cerr << "invalid banner '" << buf << "', expected '" + << utils::RBD_DIFF_BANNER << "'" << std::endl; + return -EINVAL; + } + } + + while (true) { + r = safe_read_exact(fd, tag, 1); + if (r < 0) + return r; + + if (*tag == RBD_DIFF_FROM_SNAP) { + r = utils::read_string(fd, 4096, from); // 4k limit to make sure we don't get a garbage string + if (r < 0) + return r; + dout(2) << " from snap " << *from << 
dendl; + } else if (*tag == RBD_DIFF_TO_SNAP) { + r = utils::read_string(fd, 4096, to); // 4k limit to make sure we don't get a garbage string + if (r < 0) + return r; + dout(2) << " to snap " << *to << dendl; + } else if (*tag == RBD_DIFF_IMAGE_SIZE) { + char buf[8]; + r = safe_read_exact(fd, buf, 8); + if (r < 0) + return r; + + bufferlist bl; + bl.append(buf, 8); + auto p = bl.cbegin(); + decode(*size, p); + } else { + break; + } + } + + return 0; +} + +static int parse_diff_body(int fd, __u8 *tag, uint64_t *offset, uint64_t *length) +{ + int r; + + if (!(*tag)) { + r = safe_read_exact(fd, tag, 1); + if (r < 0) + return r; + } + + if (*tag == RBD_DIFF_END) { + offset = 0; + length = 0; + return 0; + } + + if (*tag != RBD_DIFF_WRITE && *tag != RBD_DIFF_ZERO) + return -ENOTSUP; + + char buf[16]; + r = safe_read_exact(fd, buf, 16); + if (r < 0) + return r; + + bufferlist bl; + bl.append(buf, 16); + auto p = bl.cbegin(); + decode(*offset, p); + decode(*length, p); + + if (!(*length)) + return -ENOTSUP; + + return 0; +} + +/* + * fd: the diff file to read from + * pd: the diff file to be written into + */ +static int accept_diff_body(int fd, int pd, __u8 tag, uint64_t offset, uint64_t length) +{ + if (tag == RBD_DIFF_END) + return 0; + + bufferlist bl; + encode(tag, bl); + encode(offset, bl); + encode(length, bl); + int r; + r = bl.write_fd(pd); + if (r < 0) + return r; + + if (tag == RBD_DIFF_WRITE) { + bufferptr bp = buffer::create(length); + r = safe_read_exact(fd, bp.c_str(), length); + if (r < 0) + return r; + bufferlist data; + data.append(bp); + r = data.write_fd(pd); + if (r < 0) + return r; + } + + return 0; +} + +/* + * Merge two diff files into one single file + * Note: It does not do the merging work if + * either of the source diff files is stripped, + * since which complicates the process and is + * rarely used + */ +static int do_merge_diff(const char *first, const char *second, + const char *path, bool no_progress) +{ + utils::ProgressContext 
pc("Merging image diff", no_progress); + int fd = -1, sd = -1, pd = -1, r; + + string f_from, f_to; + string s_from, s_to; + uint64_t f_size = 0; + uint64_t s_size = 0; + uint64_t pc_size; + + __u8 f_tag = 0, s_tag = 0; + uint64_t f_off = 0, f_len = 0; + uint64_t s_off = 0, s_len = 0; + bool f_end = false, s_end = false; + + bool first_stdin = !strcmp(first, "-"); + if (first_stdin) { + fd = STDIN_FILENO; + } else { + fd = open(first, O_RDONLY); + if (fd < 0) { + r = -errno; + std::cerr << "rbd: error opening " << first << std::endl; + goto done; + } + } + + sd = open(second, O_RDONLY); + if (sd < 0) { + r = -errno; + std::cerr << "rbd: error opening " << second << std::endl; + goto done; + } + + if (strcmp(path, "-") == 0) { + pd = 1; + } else { + pd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (pd < 0) { + r = -errno; + std::cerr << "rbd: error create " << path << std::endl; + goto done; + } + } + + //We just handle the case like 'banner, [ftag], [ttag], stag, [wztag]*,etag', + // and the (offset,length) in wztag must be ascending order. 
+ r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size); + if (r < 0) { + std::cerr << "rbd: failed to parse first diff header" << std::endl; + goto done; + } + + r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size); + if (r < 0) { + std::cerr << "rbd: failed to parse second diff header" << std::endl; + goto done; + } + + if (f_to != s_from) { + r = -EINVAL; + std::cerr << "The first TO snapshot must be equal with the second FROM " + << "snapshot, aborting" << std::endl; + goto done; + } + + { + // header + bufferlist bl; + bl.append(utils::RBD_DIFF_BANNER); + + __u8 tag; + if (f_from.size()) { + tag = RBD_DIFF_FROM_SNAP; + encode(tag, bl); + encode(f_from, bl); + } + + if (s_to.size()) { + tag = RBD_DIFF_TO_SNAP; + encode(tag, bl); + encode(s_to, bl); + } + + tag = RBD_DIFF_IMAGE_SIZE; + encode(tag, bl); + encode(s_size, bl); + + r = bl.write_fd(pd); + if (r < 0) { + std::cerr << "rbd: failed to write merged diff header" << std::endl; + goto done; + } + } + if (f_size > s_size) + pc_size = f_size << 1; + else + pc_size = s_size << 1; + + //data block + while (!f_end || !s_end) { + // progress through input + pc.update_progress(f_off + s_off, pc_size); + + if (!f_end && !f_len) { + uint64_t last_off = f_off; + + r = parse_diff_body(fd, &f_tag, &f_off, &f_len); + dout(2) << "first diff data chunk: tag=" << f_tag << ", " + << "off=" << f_off << ", " + << "len=" << f_len << dendl; + if (r < 0) { + std::cerr << "rbd: failed to read first diff data chunk header" + << std::endl; + goto done; + } + + if (f_tag == RBD_DIFF_END) { + f_end = true; + f_tag = RBD_DIFF_ZERO; + f_off = f_size; + if (f_size < s_size) + f_len = s_size - f_size; + else + f_len = 0; + } + + if (last_off > f_off) { + r = -ENOTSUP; + std::cerr << "rbd: out-of-order offset from first diff (" + << last_off << " > " << f_off << ")" << std::endl; + goto done; + } + } + + if (!s_end && !s_len) { + uint64_t last_off = s_off; + + r = parse_diff_body(sd, &s_tag, &s_off, &s_len); + dout(2) << 
"second diff data chunk: tag=" << s_tag << ", " + << "off=" << s_off << ", " + << "len=" << s_len << dendl; + if (r < 0) { + std::cerr << "rbd: failed to read second diff data chunk header" + << std::endl; + goto done; + } + + if (s_tag == RBD_DIFF_END) { + s_end = true; + s_off = s_size; + if (s_size < f_size) + s_len = f_size - s_size; + else + s_len = 0; + } + + if (last_off > s_off) { + r = -ENOTSUP; + std::cerr << "rbd: out-of-order offset from second diff (" + << last_off << " > " << s_off << ")" << std::endl; + goto done; + } + } + + if (f_off < s_off && f_len) { + uint64_t delta = s_off - f_off; + if (delta > f_len) + delta = f_len; + r = accept_diff_body(fd, pd, f_tag, f_off, delta); + if (r < 0) { + std::cerr << "rbd: failed to merge diff chunk" << std::endl; + goto done; + } + f_off += delta; + f_len -= delta; + + if (!f_len) { + f_tag = 0; + continue; + } + } + ceph_assert(f_off >= s_off); + + if (f_off < s_off + s_len && f_len) { + uint64_t delta = s_off + s_len - f_off; + if (delta > f_len) + delta = f_len; + if (f_tag == RBD_DIFF_WRITE) { + if (first_stdin) { + bufferptr bp = buffer::create(delta); + r = safe_read_exact(fd, bp.c_str(), delta); + } else { + off64_t l = lseek64(fd, delta, SEEK_CUR); + r = l < 0 ? 
-errno : 0; + } + if (r < 0) { + std::cerr << "rbd: failed to skip first diff data" << std::endl; + goto done; + } + } + f_off += delta; + f_len -= delta; + + if (!f_len) { + f_tag = 0; + continue; + } + } + ceph_assert(f_off >= s_off + s_len); + if (s_len) { + r = accept_diff_body(sd, pd, s_tag, s_off, s_len); + if (r < 0) { + std::cerr << "rbd: failed to merge diff chunk" << std::endl; + goto done; + } + s_off += s_len; + s_len = 0; + s_tag = 0; + } else { + ceph_assert(f_end && s_end); + } + continue; + } + + {//tail + __u8 tag = RBD_DIFF_END; + bufferlist bl; + encode(tag, bl); + r = bl.write_fd(pd); + } + +done: + if (pd > 2) + close(pd); + if (sd > 2) + close(sd); + if (fd > 2) + close(fd); + + if(r < 0) { + pc.fail(); + if (pd > 2) + unlink(path); + } else + pc.finish(); + + return r; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + positional->add_options() + ("diff1-path", "path to first diff (or '-' for stdin)") + ("diff2-path", "path to second diff"); + at::add_path_options(positional, options, + "path to merged diff (or '-' for stdout)"); + at::add_no_progress_option(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string first_diff = utils::get_positional_argument(vm, 0); + if (first_diff.empty()) { + std::cerr << "rbd: first diff was not specified" << std::endl; + return -EINVAL; + } + + std::string second_diff = utils::get_positional_argument(vm, 1); + if (second_diff.empty()) { + std::cerr << "rbd: second diff was not specified" << std::endl; + return -EINVAL; + } + + std::string path; + size_t arg_index = 2; + int r = utils::get_path(vm, &arg_index, &path); + if (r < 0) { + return r; + } + + r = do_merge_diff(first_diff.c_str(), second_diff.c_str(), path.c_str(), + vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + cerr << "rbd: merge-diff error" << std::endl; + return -r; + } + + return 0; +} + +Shell::Action action( + 
{"merge-diff"}, {}, "Merge two diff exports together.", "", + &get_arguments, &execute); + +} // namespace merge_diff +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Migration.cc b/src/tools/rbd/action/Migration.cc new file mode 100644 index 00000000..bb05e376 --- /dev/null +++ b/src/tools/rbd/action/Migration.cc @@ -0,0 +1,338 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" + +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace migration { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_prepare(librados::IoCtx& io_ctx, const std::string &image_name, + librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name, + librbd::ImageOptions& opts) { + int r = librbd::RBD().migration_prepare(io_ctx, image_name.c_str(), + dest_io_ctx, dest_image_name.c_str(), + opts); + if (r < 0) { + std::cerr << "rbd: preparing migration failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +static int do_execute(librados::IoCtx& io_ctx, const std::string &image_name, + bool no_progress) { + utils::ProgressContext pc("Image migration", no_progress); + int r = librbd::RBD().migration_execute_with_progress(io_ctx, + image_name.c_str(), pc); + if (r < 0) { + pc.fail(); + std::cerr << "rbd: migration failed: " << cpp_strerror(r) << std::endl; + return r; + } + pc.finish(); + return 0; +} + +static int do_abort(librados::IoCtx& io_ctx, const std::string &image_name, + bool no_progress) { + utils::ProgressContext pc("Abort image migration", no_progress); + int r = librbd::RBD().migration_abort_with_progress(io_ctx, + image_name.c_str(), pc); + if (r < 0) { + pc.fail(); + std::cerr << "rbd: aborting migration failed: " << cpp_strerror(r) + << std::endl; + 
return r; + } + pc.finish(); + return 0; +} + +static int do_commit(librados::IoCtx& io_ctx, const std::string &image_name, + bool force, bool no_progress) { + librbd::image_migration_status_t migration_status; + int r = librbd::RBD().migration_status(io_ctx, image_name.c_str(), + &migration_status, + sizeof(migration_status)); + if (r < 0) { + std::cerr << "rbd: getting migration status failed: " << cpp_strerror(r) + << std::endl; + return r; + } + + librados::IoCtx dst_io_ctx; + r = librados::Rados(io_ctx).ioctx_create2(migration_status.dest_pool_id, dst_io_ctx); + if (r < 0) { + std::cerr << "rbd: accessing source pool id=" + << migration_status.dest_pool_id << " failed: " + << cpp_strerror(r) << std::endl; + return r; + } + + r = utils::set_namespace(migration_status.dest_pool_namespace, &dst_io_ctx); + if (r < 0) { + return r; + } + + librbd::Image image; + r = utils::open_image_by_id(dst_io_ctx, migration_status.dest_image_id, + true, &image); + if (r < 0) { + return r; + } + + std::vector<librbd::linked_image_spec_t> children; + r = image.list_descendants(&children); + if (r < 0) { + std::cerr << "rbd: listing descendants failed: " << cpp_strerror(r) + << std::endl; + return r; + } + + if (children.size() > 0) { + std::cerr << "rbd: the image has " + << (children.size() == 1 ? "a descendant" : "descendants") << ": " + << std::endl; + for (auto& child : children) { + std::cerr << " " << child.pool_name << "/"; + if (!child.pool_namespace.empty()) { + std::cerr << child.pool_namespace << "/"; + } + std::cerr << child.image_name; + if (child.trash) { + std::cerr << " (trash " << child.image_id << ")"; + } + std::cerr << std::endl; + } + std::cerr << "Warning: in-use, read-only descendant images" + << " will not detect the parent update." << std::endl; + if (force) { + std::cerr << "Proceeding anyway due to force flag set." << std::endl; + } else { + std::cerr << "Ensure no descendant images are opened read-only" + << " and run again with force flag." 
<< std::endl; + return -EBUSY; + } + } + + utils::ProgressContext pc("Commit image migration", no_progress); + r = librbd::RBD().migration_commit_with_progress(io_ctx, image_name.c_str(), + pc); + if (r < 0) { + pc.fail(); + std::cerr << "rbd: committing migration failed: " << cpp_strerror(r) + << std::endl; + return r; + } + pc.finish(); + return 0; +} + +void get_prepare_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); + at::add_create_image_options(options, true); + at::add_flatten_option(options); +} + +int execute_prepare(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + io_ctx.set_osdmap_full_try(); + + std::string dest_pool_name; + std::string dest_namespace_name; + std::string dest_image_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dest_pool_name, + &dest_namespace_name, &dest_image_name, nullptr, false, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + librbd::ImageOptions opts; + r = utils::get_image_options(vm, true, &opts); + if (r < 0) { + return r; + } + + librados::IoCtx dest_io_ctx; + if (!dest_pool_name.empty()) { + r = utils::init_io_ctx(rados, dest_pool_name, dest_namespace_name, + &dest_io_ctx); + if (r < 0) { + return r; + 
} + } + + r = do_prepare(io_ctx, image_name, dest_pool_name.empty() ? io_ctx : + dest_io_ctx, dest_image_name, opts); + if (r < 0) { + return r; + } + + return 0; +} + +void get_execute_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute_execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + io_ctx.set_osdmap_full_try(); + + r = do_execute(io_ctx, image_name, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + return r; + } + + return 0; +} + +void get_abort_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute_abort(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + 
io_ctx.set_osdmap_full_try(); + + r = do_abort(io_ctx, image_name, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + return r; + } + + return 0; +} + +void get_commit_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); + options->add_options() + ("force", po::bool_switch(), "proceed even if the image has children"); +} + +int execute_commit(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + io_ctx.set_osdmap_full_try(); + + r = do_commit(io_ctx, image_name, vm["force"].as<bool>(), + vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + return r; + } + + return 0; +} + +Shell::Action action_prepare( + {"migration", "prepare"}, {}, "Prepare image migration.", + at::get_long_features_help(), &get_prepare_arguments, &execute_prepare); + +Shell::Action action_execute( + {"migration", "execute"}, {}, "Execute image migration.", "", + &get_execute_arguments, &execute_execute); + +Shell::Action action_abort( + {"migration", "abort"}, {}, "Cancel interrupted image migration.", "", + &get_abort_arguments, &execute_abort); + +Shell::Action action_commit( + {"migration", "commit"}, {}, "Commit image migration.", "", + &get_commit_arguments, &execute_commit); + +} // namespace migration +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/MirrorImage.cc 
b/src/tools/rbd/action/MirrorImage.cc new file mode 100644 index 00000000..a250b694 --- /dev/null +++ b/src/tools/rbd/action/MirrorImage.cc @@ -0,0 +1,360 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/MirrorDaemonServiceInfo.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/stringify.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "global/global_context.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace mirror_image { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +int validate_mirroring_enabled(librbd::Image& image) { + librbd::mirror_image_info_t mirror_image; + int r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image)); + if (r < 0) { + std::cerr << "rbd: failed to retrieve mirror mode: " + << cpp_strerror(r) << std::endl; + return r; + } + + if (mirror_image.state != RBD_MIRROR_IMAGE_ENABLED) { + std::cerr << "rbd: mirroring not enabled on the image" << std::endl; + return -EINVAL; + } + return 0; +} + +} // anonymous namespace + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); +} + +void get_arguments_disable(po::options_description *positional, + po::options_description *options) { + options->add_options() + ("force", po::bool_switch(), "disable even if not primary"); + 
at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); +} + +int execute_enable_disable(const po::variables_map &vm, bool enable, + bool force) { + size_t arg_index = 0; + std::string pool_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, "", image_name, "", "", false, + &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = enable ? image.mirror_image_enable() : image.mirror_image_disable(force); + if (r < 0) { + return r; + } + + std::cout << (enable ? "Mirroring enabled" : "Mirroring disabled") + << std::endl; + + return 0; +} + +int execute_disable(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return execute_enable_disable(vm, false, vm["force"].as<bool>()); +} + +int execute_enable(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + return execute_enable_disable(vm, true, false); +} + +void get_arguments_promote(po::options_description *positional, + po::options_description *options) { + options->add_options() + ("force", po::bool_switch(), "promote even if not cleanly demoted by remote cluster"); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); +} + +int execute_promote(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr, + &image_name, &snap_name, true, 
utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + bool force = vm["force"].as<bool>(); + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, "", image_name, "", "", false, + &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(image); + if (r < 0) { + return r; + } + + r = image.mirror_image_promote(force); + if (r < 0) { + std::cerr << "rbd: error promoting image to primary" << std::endl; + return r; + } + + std::cout << "Image promoted to primary" << std::endl; + return 0; +} + +int execute_demote(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, "", image_name, "", "", false, + &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(image); + if (r < 0) { + return r; + } + + r = image.mirror_image_demote(); + if (r < 0) { + std::cerr << "rbd: error demoting image to non-primary" << std::endl; + return r; + } + + std::cout << "Image demoted to non-primary" << std::endl; + return 0; +} + +int execute_resync(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr, + &image_name, &snap_name, 
true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, "", image_name, "", "", false, + &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(image); + if (r < 0) { + return r; + } + + r = image.mirror_image_resync(); + if (r < 0) { + std::cerr << "rbd: error flagging image resync" << std::endl; + return r; + } + + std::cout << "Flagged image for resync from primary" << std::endl; + return 0; +} + +void get_status_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_format_options(options); +} + +int execute_status(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + at::Format::Formatter formatter; + int r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + size_t arg_index = 0; + std::string pool_name; + std::string image_name; + std::string snap_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, "", image_name, "", "", false, + &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(image); + if (r < 0) { + return r; + } + + librbd::mirror_image_status_t status; + r = image.mirror_image_get_status(&status, sizeof(status)); + if (r < 0) { + std::cerr << "rbd: failed to get status for image " << image_name << ": " + << cpp_strerror(r) << std::endl; + return r; + } + + std::string 
instance_id; + MirrorDaemonServiceInfo daemon_service_info(io_ctx); + + if (status.up) { + r = image.mirror_image_get_instance_id(&instance_id); + if (r == -EOPNOTSUPP) { + std::cerr << "rbd: newer release of Ceph OSDs required to map image " + << "to rbd-mirror daemon instance" << std::endl; + // not fatal + } else if (r < 0 && r != -ENOENT) { + std::cerr << "rbd: failed to get service id for image " + << image_name << ": " << cpp_strerror(r) << std::endl; + // not fatal + } else if (!instance_id.empty()) { + daemon_service_info.init(); + } + } + + std::string state = utils::mirror_image_status_state(status); + std::string last_update = ( + status.last_update == 0 ? "" : utils::timestr(status.last_update)); + + if (formatter != nullptr) { + formatter->open_object_section("image"); + formatter->dump_string("name", image_name); + formatter->dump_string("global_id", status.info.global_id); + formatter->dump_string("state", state); + formatter->dump_string("description", status.description); + daemon_service_info.dump(instance_id, formatter); + formatter->dump_string("last_update", last_update); + formatter->close_section(); // image + formatter->flush(std::cout); + } else { + std::cout << image_name << ":\n" + << " global_id: " << status.info.global_id << "\n" + << " state: " << state << "\n" + << " description: " << status.description << "\n"; + if (!instance_id.empty()) { + std::cout << " service: " << + daemon_service_info.get_description(instance_id) << "\n"; + } + std::cout << " last_update: " << last_update << std::endl; + } + + return 0; +} + +Shell::Action action_enable( + {"mirror", "image", "enable"}, {}, + "Enable RBD mirroring for an image.", "", + &get_arguments, &execute_enable); +Shell::Action action_disable( + {"mirror", "image", "disable"}, {}, + "Disable RBD mirroring for an image.", "", + &get_arguments_disable, &execute_disable); +Shell::Action action_promote( + {"mirror", "image", "promote"}, {}, + "Promote an image to primary for RBD 
mirroring.", "", + &get_arguments_promote, &execute_promote); +Shell::Action action_demote( + {"mirror", "image", "demote"}, {}, + "Demote an image to non-primary for RBD mirroring.", "", + &get_arguments, &execute_demote); +Shell::Action action_resync( + {"mirror", "image", "resync"}, {}, + "Force resync to primary image for RBD mirroring.", "", + &get_arguments, &execute_resync); +Shell::Action action_status( + {"mirror", "image", "status"}, {}, + "Show RBD mirroring status for an image.", "", + &get_status_arguments, &execute_status); + +} // namespace mirror_image +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/MirrorPool.cc b/src/tools/rbd/action/MirrorPool.cc new file mode 100644 index 00000000..ff7c3031 --- /dev/null +++ b/src/tools/rbd/action/MirrorPool.cc @@ -0,0 +1,1537 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/MirrorDaemonServiceInfo.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/Context.h" +#include "include/stringify.h" +#include "include/rbd/librbd.hpp" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "common/Throttle.h" +#include "global/global_context.h" +#include <fstream> +#include <functional> +#include <iostream> +#include <regex> +#include <set> +#include <boost/program_options.hpp> +#include "include/ceph_assert.h" + +#include <atomic> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd::action::MirrorPool: " + +namespace rbd { +namespace action { +namespace mirror_pool { + +namespace at = argument_types; +namespace po = boost::program_options; + +static const std::string ALL_NAME("all"); +static const std::string SITE_NAME("site-name"); + 
+namespace { + +void add_site_name_optional(po::options_description *options) { + options->add_options() + (SITE_NAME.c_str(), po::value<std::string>(), "local site name"); +} + +int set_site_name(librados::Rados& rados, const std::string& site_name) { + librbd::RBD rbd; + int r = rbd.mirror_site_name_set(rados, site_name); + if (r == -EOPNOTSUPP) { + std::cerr << "rbd: cluster does not support site names" << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: failed to set site name" << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +struct MirrorPeerDirection {}; + +void validate(boost::any& v, const std::vector<std::string>& values, + MirrorPeerDirection *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + if (s == "rx-only") { + v = boost::any(RBD_MIRROR_PEER_DIRECTION_RX); + } else if (s == "rx-tx") { + v = boost::any(RBD_MIRROR_PEER_DIRECTION_RX_TX); + } else { + throw po::validation_error(po::validation_error::invalid_option_value); + } +} + +int validate_mirroring_enabled(librados::IoCtx& io_ctx) { + librbd::RBD rbd; + rbd_mirror_mode_t mirror_mode; + int r = rbd.mirror_mode_get(io_ctx, &mirror_mode); + if (r < 0) { + std::cerr << "rbd: failed to retrieve mirror mode: " + << cpp_strerror(r) << std::endl; + return r; + } + + if (mirror_mode == RBD_MIRROR_MODE_DISABLED) { + std::cerr << "rbd: mirroring not enabled on the pool" << std::endl; + return -EINVAL; + } + return 0; +} + +int validate_uuid(const std::string &uuid) { + std::regex pattern("^[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}$", + std::regex::icase); + std::smatch match; + if (!std::regex_match(uuid, match, pattern)) { + std::cerr << "rbd: invalid uuid '" << uuid << "'" << std::endl; + return -EINVAL; + } + return 0; +} + +int read_key_file(std::string path, std::string* key) { + std::ifstream key_file; + key_file.open(path); + if (key_file.fail()) { + std::cerr 
<< "rbd: failed to open " << path << std::endl; + return -EINVAL; + } + + std::getline(key_file, *key); + if (key_file.bad()) { + std::cerr << "rbd: failed to read key from " << path << std::endl; + return -EINVAL; + } + + key_file.close(); + return 0; +} + +void add_uuid_option(po::options_description *positional) { + positional->add_options() + ("uuid", po::value<std::string>(), "peer uuid"); +} + +int get_uuid(const po::variables_map &vm, size_t arg_index, + std::string *uuid) { + *uuid = utils::get_positional_argument(vm, arg_index); + if (uuid->empty()) { + std::cerr << "rbd: must specify peer uuid" << std::endl; + return -EINVAL; + } + return validate_uuid(*uuid); +} + +int get_remote_cluster_spec(const po::variables_map &vm, + const std::string &spec, + std::string *remote_client_name, + std::string *remote_cluster, + std::map<std::string, std::string>* attributes) { + if (vm.count("remote-client-name")) { + *remote_client_name = vm["remote-client-name"].as<std::string>(); + } + if (vm.count("remote-cluster")) { + *remote_cluster = vm["remote-cluster"].as<std::string>(); + } + if (vm.count("remote-mon-host")) { + (*attributes)["mon_host"] = vm["remote-mon-host"].as<std::string>(); + } + if (vm.count("remote-key-file")) { + std::string key; + int r = read_key_file(vm["remote-key-file"].as<std::string>(), &key); + if (r < 0) { + return r; + } + (*attributes)["key"] = key; + } + + if (!spec.empty()) { + std::regex pattern("^(?:(client\\.[^@]+)@)?([^/@]+)$"); + std::smatch match; + if (!std::regex_match(spec, match, pattern)) { + std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl; + return -EINVAL; + } + if (match[1].matched) { + *remote_client_name = match[1]; + } + *remote_cluster = match[2]; + } + + if (remote_cluster->empty()) { + std::cerr << "rbd: remote cluster was not specified" << std::endl; + return -EINVAL; + } + return 0; +} + +int set_peer_config_key(librados::IoCtx& io_ctx, const std::string& peer_uuid, + std::map<std::string, 
std::string>&& attributes) { + librbd::RBD rbd; + int r = rbd.mirror_peer_set_attributes(io_ctx, peer_uuid, attributes); + if (r == -EPERM) { + std::cerr << "rbd: permission denied attempting to set peer " + << "config-key secrets in the monitor" << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: failed to update mirroring peer config: " + << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int get_peer_config_key(librados::IoCtx& io_ctx, const std::string& peer_uuid, + std::map<std::string, std::string>* attributes) { + librbd::RBD rbd; + int r = rbd.mirror_peer_get_attributes(io_ctx, peer_uuid, attributes); + if (r == -ENOENT) { + return r; + } else if (r == -EPERM) { + std::cerr << "rbd: permission denied attempting to access peer " + << "config-key secrets from the monitor" << std::endl; + return r; + } else if (r == -EINVAL) { + std::cerr << "rbd: corrupt mirroring peer config" << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: error reading mirroring peer config: " + << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +int update_peer_config_key(librados::IoCtx& io_ctx, + const std::string& peer_uuid, + const std::string& key, + const std::string& value) { + std::map<std::string, std::string> attributes; + int r = get_peer_config_key(io_ctx, peer_uuid, &attributes); + if (r == -ENOENT) { + return set_peer_config_key(io_ctx, peer_uuid, {{key, value}}); + } else if (r < 0) { + return r; + } + + if (value.empty()) { + attributes.erase(key); + } else { + attributes[key] = value; + } + return set_peer_config_key(io_ctx, peer_uuid, std::move(attributes)); +} + +int format_mirror_peers(librados::IoCtx& io_ctx, + at::Format::Formatter formatter, + const std::vector<librbd::mirror_peer_t> &peers, + bool config_key) { + TextTable tbl; + if (formatter != nullptr) { + formatter->open_array_section("peers"); + } else { + std::cout << "Peers: "; + if (peers.empty()) { + std::cout << "none" << std::endl; + } 
else { + tbl.define_column("", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("UUID", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("CLIENT", TextTable::LEFT, TextTable::LEFT); + if (config_key) { + tbl.define_column("MON_HOST", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("KEY", TextTable::LEFT, TextTable::LEFT); + } + } + } + + for (auto &peer : peers) { + std::map<std::string, std::string> attributes; + if (config_key) { + int r = get_peer_config_key(io_ctx, peer.uuid, &attributes); + if (r < 0 && r != -ENOENT) { + return r; + } + } + + if (formatter != nullptr) { + formatter->open_object_section("peer"); + formatter->dump_string("uuid", peer.uuid); + formatter->dump_string("cluster_name", peer.cluster_name); + formatter->dump_string("client_name", peer.client_name); + for (auto& pair : attributes) { + formatter->dump_string(pair.first.c_str(), pair.second); + } + formatter->close_section(); + } else { + tbl << " " + << peer.uuid + << peer.cluster_name + << peer.client_name; + if (config_key) { + tbl << attributes["mon_host"] + << attributes["key"]; + } + tbl << TextTable::endrow; + } + } + + if (formatter != nullptr) { + formatter->close_section(); + } else { + std::cout << std::endl << tbl; + } + return 0; +} + +class ImageRequestBase { +public: + void send() { + dout(20) << this << " " << __func__ << ": image_name=" << m_image_name + << dendl; + + auto ctx = new FunctionContext([this](int r) { + handle_finalize(r); + }); + + // will pause here until slots are available + m_finalize_ctx = m_throttle.start_op(ctx); + + open_image(); + } + +protected: + ImageRequestBase(librados::IoCtx &io_ctx, OrderedThrottle &throttle, + const std::string &image_name) + : m_io_ctx(io_ctx), m_throttle(throttle), m_image_name(image_name) { + } + virtual ~ImageRequestBase() { + } + + virtual bool skip_get_info() const { + return false; + } + virtual void get_info(librbd::Image &image, 
librbd::mirror_image_info_t *info, + librbd::RBD::AioCompletion *aio_comp) { + image.aio_mirror_image_get_info(info, sizeof(librbd::mirror_image_info_t), + aio_comp); + } + + virtual bool skip_action(const librbd::mirror_image_info_t &info) const { + return false; + } + virtual void execute_action(librbd::Image &image, + librbd::RBD::AioCompletion *aio_comp) = 0; + virtual void handle_execute_action(int r) { + dout(20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + std::cerr << "rbd: failed to " << get_action_type() << " image " + << m_image_name << ": " << cpp_strerror(r) << std::endl; + m_ret_val = r; + } + + close_image(); + } + + virtual void finalize_action() { + } + virtual std::string get_action_type() const = 0; + +private: + /** + * @verbatim + * + * <start> + * | + * v + * OPEN_IMAGE + * | + * v + * GET_INFO + * | + * v + * EXECUTE_ACTION + * | + * v + * CLOSE_IMAGE + * | + * v + * FINALIZE_ACTION + * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + OrderedThrottle &m_throttle; + const std::string m_image_name; + + librbd::Image m_image; + Context *m_finalize_ctx = nullptr; + + librbd::mirror_image_info_t m_mirror_image_info; + + int m_ret_val = 0; + + void open_image() { + dout(20) << this << " " << __func__ << dendl; + + librbd::RBD rbd; + auto aio_completion = utils::create_aio_completion< + ImageRequestBase, &ImageRequestBase::handle_open_image>(this); + rbd.aio_open(m_io_ctx, m_image, m_image_name.c_str(), nullptr, + aio_completion); + } + + void handle_open_image(int r) { + dout(20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + std::cerr << "rbd: failed to open image " + << m_image_name << ": " << cpp_strerror(r) << std::endl; + m_finalize_ctx->complete(r); + return; + } + + get_info(); + } + + void get_info() { + if (skip_get_info()) { + execute_action(); + return; + } + dout(20) << this << " " << __func__ << dendl; + + auto aio_completion = 
utils::create_aio_completion< + ImageRequestBase, &ImageRequestBase::handle_get_info>(this); + get_info(m_image, &m_mirror_image_info, aio_completion); + } + + void handle_get_info(int r) { + dout(20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r == -ENOENT) { + close_image(); + return; + } else if (r < 0) { + std::cerr << "rbd: failed to retrieve mirror image info for " + << m_image_name << ": " << cpp_strerror(r) << std::endl; + m_ret_val = r; + close_image(); + return; + } + + execute_action(); + } + + void execute_action() { + if (skip_action(m_mirror_image_info)) { + close_image(); + return; + } + dout(20) << this << " " << __func__ << dendl; + + auto aio_completion = utils::create_aio_completion< + ImageRequestBase, &ImageRequestBase::handle_execute_action>(this); + execute_action(m_image, aio_completion); + } + + void close_image() { + dout(20) << this << " " << __func__ << dendl; + + auto aio_completion = utils::create_aio_completion< + ImageRequestBase, &ImageRequestBase::handle_close_image>(this); + m_image.aio_close(aio_completion); + } + + void handle_close_image(int r) { + dout(20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + std::cerr << "rbd: failed to close image " + << m_image_name << ": " << cpp_strerror(r) << std::endl; + } + + m_finalize_ctx->complete(r); + } + + void handle_finalize(int r) { + dout(20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r == 0 && m_ret_val < 0) { + r = m_ret_val; + } + if (r >= 0) { + finalize_action(); + } + m_throttle.end_op(r); + delete this; + } + +}; + +class PromoteImageRequest : public ImageRequestBase { +public: + PromoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle, + const std::string &image_name, std::atomic<unsigned> *counter, + bool force) + : ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter), + m_force(force) { + } + +protected: + bool skip_action(const librbd::mirror_image_info_t &info) const override { + return 
(info.state != RBD_MIRROR_IMAGE_ENABLED || info.primary); + } + + void execute_action(librbd::Image &image, + librbd::RBD::AioCompletion *aio_comp) override { + image.aio_mirror_image_promote(m_force, aio_comp); + } + + void handle_execute_action(int r) override { + if (r >= 0) { + (*m_counter)++; + } + ImageRequestBase::handle_execute_action(r); + } + + std::string get_action_type() const override { + return "promote"; + } + +private: + std::atomic<unsigned> *m_counter = nullptr; + bool m_force; +}; + +class DemoteImageRequest : public ImageRequestBase { +public: + DemoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle, + const std::string &image_name, std::atomic<unsigned> *counter) + : ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter) { + } + +protected: + bool skip_action(const librbd::mirror_image_info_t &info) const override { + return (info.state != RBD_MIRROR_IMAGE_ENABLED || !info.primary); + } + + void execute_action(librbd::Image &image, + librbd::RBD::AioCompletion *aio_comp) override { + image.aio_mirror_image_demote(aio_comp); + } + void handle_execute_action(int r) override { + if (r >= 0) { + (*m_counter)++; + } + ImageRequestBase::handle_execute_action(r); + } + + std::string get_action_type() const override { + return "demote"; + } + +private: + std::atomic<unsigned> *m_counter = nullptr; +}; + +class StatusImageRequest : public ImageRequestBase { +public: + StatusImageRequest( + librados::IoCtx &io_ctx, OrderedThrottle &throttle, + const std::string &image_name, + const std::map<std::string, std::string> &instance_ids, + const MirrorDaemonServiceInfo &daemon_service_info, + at::Format::Formatter formatter) + : ImageRequestBase(io_ctx, throttle, image_name), + m_instance_ids(instance_ids), m_daemon_service_info(daemon_service_info), + m_formatter(formatter) { + } + +protected: + bool skip_get_info() const override { + return true; + } + + void execute_action(librbd::Image &image, + librbd::RBD::AioCompletion 
*aio_comp) override { + image.get_id(&m_image_id); + image.aio_mirror_image_get_status(&m_mirror_image_status, + sizeof(m_mirror_image_status), aio_comp); + } + + void finalize_action() override { + if (m_mirror_image_status.info.global_id.empty()) { + return; + } + + std::string state = utils::mirror_image_status_state(m_mirror_image_status); + std::string instance_id = (m_mirror_image_status.up && + m_instance_ids.count(m_image_id)) ? + m_instance_ids.find(m_image_id)->second : ""; + std::string last_update = ( + m_mirror_image_status.last_update == 0 ? + "" : utils::timestr(m_mirror_image_status.last_update)); + + if (m_formatter != nullptr) { + m_formatter->open_object_section("image"); + m_formatter->dump_string("name", m_mirror_image_status.name); + m_formatter->dump_string("global_id", + m_mirror_image_status.info.global_id); + m_formatter->dump_string("state", state); + m_formatter->dump_string("description", + m_mirror_image_status.description); + m_daemon_service_info.dump(instance_id, m_formatter); + m_formatter->dump_string("last_update", last_update); + m_formatter->close_section(); // image + } else { + std::cout << "\n" << m_mirror_image_status.name << ":\n" + << " global_id: " + << m_mirror_image_status.info.global_id << "\n" + << " state: " << state << "\n" + << " description: " + << m_mirror_image_status.description << "\n"; + if (!instance_id.empty()) { + std::cout << " service: " + << m_daemon_service_info.get_description(instance_id) << "\n"; + } + std::cout << " last_update: " << last_update << std::endl; + } + } + + std::string get_action_type() const override { + return "status"; + } + +private: + const std::map<std::string, std::string> &m_instance_ids; + const MirrorDaemonServiceInfo &m_daemon_service_info; + at::Format::Formatter m_formatter; + std::string m_image_id; + librbd::mirror_image_status_t m_mirror_image_status; +}; + +template <typename RequestT> +class ImageRequestAllocator { +public: + template <class... 
Args> + RequestT *operator()(librados::IoCtx &io_ctx, OrderedThrottle &throttle, + const std::string &image_name, Args&&... args) { + return new RequestT(io_ctx, throttle, image_name, + std::forward<Args>(args)...); + } +}; + +template <typename RequestT> +class ImageRequestGenerator { +public: + template <class... Args> + ImageRequestGenerator(librados::IoCtx &io_ctx, Args&&... args) + : m_io_ctx(io_ctx), + m_factory(std::bind(ImageRequestAllocator<RequestT>(), + std::ref(m_io_ctx), std::ref(m_throttle), + std::placeholders::_1, std::forward<Args>(args)...)), + m_throttle(g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), + true) { + } + + int execute() { + // use the alphabetical list of image names for pool-level + // mirror image operations + librbd::RBD rbd; + int r = rbd.list2(m_io_ctx, &m_images); + if (r < 0 && r != -ENOENT) { + std::cerr << "rbd: failed to list images within pool" << std::endl; + return r; + } + + for (auto &image : m_images) { + auto request = m_factory(image.name); + request->send(); + } + + return m_throttle.wait_for_ret(); + } +private: + typedef std::function<RequestT*(const std::string&)> Factory; + + librados::IoCtx &m_io_ctx; + Factory m_factory; + + OrderedThrottle m_throttle; + + std::vector<librbd::image_spec_t> m_images; + +}; + +} // anonymous namespace + +void get_peer_bootstrap_create_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + options->add_options() + (SITE_NAME.c_str(), po::value<std::string>(), "local site name"); +} + +int execute_peer_bootstrap_create( + const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); 
+ if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + if (vm.count(SITE_NAME)) { + r = set_site_name(rados, vm[SITE_NAME].as<std::string>()); + if (r < 0) { + return r; + } + } + + librbd::RBD rbd; + std::string token; + r = rbd.mirror_peer_bootstrap_create(io_ctx, &token); + if (r == -EEXIST) { + std::cerr << "rbd: mismatch with pre-existing RBD mirroring peer user caps" + << std::endl; + } else if (r < 0) { + std::cerr << "rbd: failed to create mirroring bootstrap token: " + << cpp_strerror(r) << std::endl; + return r; + } + + std::cout << token << std::endl; + return 0; +} + +void get_peer_bootstrap_import_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + options->add_options() + (SITE_NAME.c_str(), po::value<std::string>(), "local site name"); + positional->add_options() + ("token-path", po::value<std::string>(), + "bootstrap token file (or '-' for stdin)"); + options->add_options() + ("token-path", po::value<std::string>(), + "bootstrap token file (or '-' for stdin)") + ("direction", po::value<MirrorPeerDirection>(), + "mirroring direction (rx-only, rx-tx)\n" + "[default: rx-tx]"); +} + +int execute_peer_bootstrap_import( + const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + std::string token_path; + if (vm.count("token-path")) { + token_path = vm["token-path"].as<std::string>(); + } else { + token_path = utils::get_positional_argument(vm, arg_index++); + } + + if (token_path.empty()) { + std::cerr << "rbd: token path was not specified" << std::endl; + return -EINVAL; + } + + rbd_mirror_peer_direction_t mirror_peer_direction = + RBD_MIRROR_PEER_DIRECTION_RX_TX; + if (vm.count("direction")) { + 
mirror_peer_direction = vm["direction"].as<rbd_mirror_peer_direction_t>(); + } + + int fd = STDIN_FILENO; + if (token_path != "-") { + fd = open(token_path.c_str(), O_RDONLY); + if (fd < 0) { + r = -errno; + std::cerr << "rbd: error opening " << token_path << std::endl; + return r; + } + } + + char token[1024]; + memset(token, 0, sizeof(token)); + r = safe_read(fd, token, sizeof(token) - 1); + if (fd != STDIN_FILENO) { + VOID_TEMP_FAILURE_RETRY(close(fd)); + } + + if (r < 0) { + std::cerr << "rbd: error reading token file: " << cpp_strerror(r) + << std::endl; + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + if (vm.count(SITE_NAME)) { + r = set_site_name(rados, vm[SITE_NAME].as<std::string>()); + if (r < 0) { + return r; + } + } + + librbd::RBD rbd; + r = rbd.mirror_peer_bootstrap_import(io_ctx, mirror_peer_direction, token); + if (r == -ENOSYS) { + std::cerr << "rbd: mirroring is not enabled on remote peer" << std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: failed to import peer bootstrap token" << std::endl; + return r; + } + + return 0; +} + +void get_peer_add_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + positional->add_options() + ("remote-cluster-spec", "remote cluster spec\n" + "(example: [<client name>@]<cluster name>)"); + options->add_options() + ("remote-client-name", po::value<std::string>(), "remote client name") + ("remote-cluster", po::value<std::string>(), "remote cluster name") + ("remote-mon-host", po::value<std::string>(), "remote mon host(s)") + ("remote-key-file", po::value<std::string>(), + "path to file containing remote key"); +} + +int execute_peer_add(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = 
utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + std::string remote_client_name = g_ceph_context->_conf->name.to_str(); + std::string remote_cluster; + std::map<std::string, std::string> attributes; + r = get_remote_cluster_spec( + vm, utils::get_positional_argument(vm, arg_index), + &remote_client_name, &remote_cluster, &attributes); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + // TODO: temporary restriction to prevent adding multiple peers + // until rbd-mirror daemon can properly handle the scenario + librbd::RBD rbd; + std::vector<librbd::mirror_peer_t> mirror_peers; + r = rbd.mirror_peer_list(io_ctx, &mirror_peers); + if (r < 0) { + std::cerr << "rbd: failed to list mirror peers" << std::endl; + return r; + } + if (!mirror_peers.empty()) { + std::cerr << "rbd: multiple peers are not currently supported" << std::endl; + return -EINVAL; + } + + std::string uuid; + r = rbd.mirror_peer_add(io_ctx, &uuid, remote_cluster, remote_client_name); + if (r < 0) { + std::cerr << "rbd: error adding mirror peer" << std::endl; + return r; + } + + if (!attributes.empty()) { + r = set_peer_config_key(io_ctx, uuid, std::move(attributes)); + if (r < 0) { + return r; + } + } + + std::cout << uuid << std::endl; + return 0; +} + +void get_peer_remove_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + add_uuid_option(positional); +} + +int execute_peer_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 
0) { + return r; + } + + std::string uuid; + r = get_uuid(vm, arg_index, &uuid); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.mirror_peer_remove(io_ctx, uuid); + if (r < 0) { + std::cerr << "rbd: error removing mirror peer" << std::endl; + return r; + } + return 0; +} + +void get_peer_set_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + add_uuid_option(positional); + positional->add_options() + ("key", "peer parameter [client, cluster, mon-host, key-file]") + ("value", "new value for specified key"); +} + +int execute_peer_set(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + std::string uuid; + r = get_uuid(vm, arg_index++, &uuid); + if (r < 0) { + return r; + } + + std::set<std::string> valid_keys{{"client", "cluster", "mon-host", + "key-file"}}; + std::string key = utils::get_positional_argument(vm, arg_index++); + if (valid_keys.find(key) == valid_keys.end()) { + std::cerr << "rbd: must specify "; + for (auto& valid_key : valid_keys) { + std::cerr << "'" << valid_key << "'"; + if (&valid_key != &(*valid_keys.rbegin())) { + std::cerr << ", "; + } + } + std::cerr << " key." << std::endl; + return -EINVAL; + } + + std::string value = utils::get_positional_argument(vm, arg_index++); + if (value.empty() && (key == "client" || key == "cluster")) { + std::cerr << "rbd: must specify new " << key << " value." 
<< std::endl; + /* an empty client/cluster value is invalid: stop here rather than continuing with it */ + return -EINVAL; + } else if (key == "key-file") { + key = "key"; + r = read_key_file(value, &value); + if (r < 0) { + return r; + } + } else if (key == "mon-host") { + key = "mon_host"; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + if (key == "client") { + r = rbd.mirror_peer_set_client(io_ctx, uuid.c_str(), value.c_str()); + } else if (key == "cluster") { + r = rbd.mirror_peer_set_cluster(io_ctx, uuid.c_str(), value.c_str()); + } else { + r = update_peer_config_key(io_ctx, uuid, key, value); + if (r == -ENOENT) { + std::cerr << "rbd: mirror peer " << uuid << " does not exist" + << std::endl; + } + } + + if (r < 0) { + return r; + } + return 0; +} + +/* Declare the arguments of the pool-level disable command (pool only). */ +void get_disable_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); +} + +/* Declare the arguments of the pool-level enable command: pool, positional mode and optional site name. */ +void get_enable_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + positional->add_options() + ("mode", "mirror mode [image or pool]"); + add_site_name_optional(options); +} + +/* Switch the pool's mirror mode to next_mirror_mode. When the pool is already in that mode nothing is changed and 0 is returned; the "already ..." notice is printed unless ignore_no_update is true. */ +int execute_enable_disable(librados::IoCtx& io_ctx, + rbd_mirror_mode_t next_mirror_mode, + const std::string &mode, bool ignore_no_update) { + librbd::RBD rbd; + rbd_mirror_mode_t current_mirror_mode; + int r = rbd.mirror_mode_get(io_ctx, &current_mirror_mode); + if (r < 0) { + std::cerr << "rbd: failed to retrieve mirror mode: " + << cpp_strerror(r) << std::endl; + return r; + } + + if (current_mirror_mode == next_mirror_mode) { + if (!ignore_no_update) { + if (mode == "disabled") { + std::cout << "rbd: mirroring is already " << mode << std::endl; + } else { + std::cout << "rbd: mirroring is already configured for " + << mode << " mode" << std::endl; + } + } + return 0; + } else if 
(next_mirror_mode == RBD_MIRROR_MODE_IMAGE && + current_mirror_mode == RBD_MIRROR_MODE_POOL) { + std::cout << "note: changing mirroring mode from pool to image" + << std::endl; + } else if (next_mirror_mode == RBD_MIRROR_MODE_POOL && + current_mirror_mode == RBD_MIRROR_MODE_IMAGE) { + std::cout << "note: changing mirroring mode from image to pool" + << std::endl; + } + + r = rbd.mirror_mode_set(io_ctx, next_mirror_mode); + if (r < 0) { + return r; + } + return 0; +} + +int execute_disable(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + + // TODO support namespaces + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + return execute_enable_disable(io_ctx, RBD_MIRROR_MODE_DISABLED, "disabled", + false); +} + +int execute_enable(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + rbd_mirror_mode_t mirror_mode; + std::string mode = utils::get_positional_argument(vm, arg_index++); + if (mode == "image") { + mirror_mode = RBD_MIRROR_MODE_IMAGE; + } else if (mode == "pool") { + mirror_mode = RBD_MIRROR_MODE_POOL; + } else { + std::cerr << "rbd: must specify 'image' or 'pool' mode." 
<< std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + + // TODO support namespaces + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + bool updated = false; + if (vm.count(SITE_NAME)) { + librbd::RBD rbd; + + auto site_name = vm[SITE_NAME].as<std::string>(); + std::string original_site_name; + r = rbd.mirror_site_name_get(rados, &original_site_name); + updated = (r >= 0 && site_name != original_site_name); + + r = set_site_name(rados, site_name); + if (r < 0) { + return r; + } + } + + return execute_enable_disable(io_ctx, mirror_mode, mode, updated); +} + +void get_info_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + at::add_format_options(options); + options->add_options() + (ALL_NAME.c_str(), po::bool_switch(), "list all attributes"); +} + +int execute_info(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + rbd_mirror_mode_t mirror_mode; + r = rbd.mirror_mode_get(io_ctx, &mirror_mode); + if (r < 0) { + return r; + } + + std::string site_name; + r = rbd.mirror_site_name_get(rados, &site_name); + if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + std::vector<librbd::mirror_peer_t> mirror_peers; + r = rbd.mirror_peer_list(io_ctx, &mirror_peers); + if (r < 0) { + return r; + } + + std::string mirror_mode_desc; + switch (mirror_mode) { + case RBD_MIRROR_MODE_DISABLED: + mirror_mode_desc = "disabled"; + 
break; + case RBD_MIRROR_MODE_IMAGE: + mirror_mode_desc = "image"; + break; + case RBD_MIRROR_MODE_POOL: + mirror_mode_desc = "pool"; + break; + default: + mirror_mode_desc = "unknown"; + break; + } + + if (formatter != nullptr) { + formatter->open_object_section("mirror"); + formatter->dump_string("mode", mirror_mode_desc); + } else { + std::cout << "Mode: " << mirror_mode_desc << std::endl; + } + + if (mirror_mode != RBD_MIRROR_MODE_DISABLED) { + if (formatter != nullptr) { + formatter->dump_string("site_name", site_name); + } else { + std::cout << "Site Name: " << site_name << std::endl; + } + + r = format_mirror_peers(io_ctx, formatter, mirror_peers, + vm[ALL_NAME].as<bool>()); + if (r < 0) { + return r; + } + } + if (formatter != nullptr) { + formatter->close_section(); + formatter->flush(std::cout); + } + return 0; +} + +void get_status_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + at::add_format_options(options); + at::add_verbose_option(options); +} + +int execute_status(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + bool verbose = vm[at::VERBOSE].as<bool>(); + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + + std::map<librbd::mirror_image_status_state_t, int> states; + r = rbd.mirror_image_status_summary(io_ctx, &states); + if (r < 0) { + std::cerr << "rbd: failed to get status summary for mirrored images: " + << 
cpp_strerror(r) << std::endl; + return r; + } + + if (formatter != nullptr) { + formatter->open_object_section("status"); + } + + enum Health {Ok = 0, Warning = 1, Error = 2} health = Ok; + const char *names[] = {"OK", "WARNING", "ERROR"}; + int total = 0; + + for (auto &it : states) { + auto &state = it.first; + if (health < Warning && + (state != MIRROR_IMAGE_STATUS_STATE_REPLAYING && + state != MIRROR_IMAGE_STATUS_STATE_STOPPED)) { + health = Warning; + } + if (health < Error && + state == MIRROR_IMAGE_STATUS_STATE_ERROR) { + health = Error; + } + total += it.second; + } + + if (formatter != nullptr) { + formatter->open_object_section("summary"); + formatter->dump_string("health", names[health]); + formatter->open_object_section("states"); + for (auto &it : states) { + std::string state_name = utils::mirror_image_status_state(it.first); + formatter->dump_int(state_name.c_str(), it.second); + } + formatter->close_section(); // states + formatter->close_section(); // summary + } else { + std::cout << "health: " << names[health] << std::endl; + std::cout << "images: " << total << " total" << std::endl; + for (auto &it : states) { + std::cout << " " << it.second << " " + << utils::mirror_image_status_state(it.first) << std::endl; + } + } + + int ret = 0; + + if (verbose) { + if (formatter != nullptr) { + formatter->open_array_section("images"); + } + + std::map<std::string, std::string> instance_ids; + MirrorDaemonServiceInfo daemon_service_info(io_ctx); + + std::string start_image_id; + while (true) { + std::map<std::string, std::string> ids; + r = rbd.mirror_image_instance_id_list(io_ctx, start_image_id, 1024, &ids); + if (r < 0) { + if (r == -EOPNOTSUPP) { + std::cerr << "rbd: newer release of Ceph OSDs required to map image " + << "to rbd-mirror daemon instance" << std::endl; + } else { + std::cerr << "rbd: failed to get instance id list: " + << cpp_strerror(r) << std::endl; + } + // not fatal + break; + } + if (ids.empty()) { + break; + } + 
instance_ids.insert(ids.begin(), ids.end()); + start_image_id = ids.rbegin()->first; + } + + if (!instance_ids.empty()) { + daemon_service_info.init(); + } + + ImageRequestGenerator<StatusImageRequest> generator( + io_ctx, instance_ids, daemon_service_info, formatter); + ret = generator.execute(); + + if (formatter != nullptr) { + formatter->close_section(); // images + } + } + + if (formatter != nullptr) { + formatter->close_section(); // status + formatter->flush(std::cout); + } + + return ret; +} + +void get_promote_arguments(po::options_description *positional, + po::options_description *options) { + options->add_options() + ("force", po::bool_switch(), + "promote even if not cleanly demoted by remote cluster"); + at::add_pool_options(positional, options, false); +} + +int execute_promote(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + utils::disable_cache(); + + std::atomic<unsigned> counter = { 0 }; + ImageRequestGenerator<PromoteImageRequest> generator(io_ctx, &counter, + vm["force"].as<bool>()); + r = generator.execute(); + + std::cout << "Promoted " << counter.load() << " mirrored images" << std::endl; + return r; +} + +void get_demote_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); +} + +int execute_demote(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + 
nullptr, &arg_index); + if (r < 0) { + return r; + } + + // TODO support namespaces + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + r = validate_mirroring_enabled(io_ctx); + if (r < 0) { + return r; + } + + utils::disable_cache(); + + std::atomic<unsigned> counter { 0 }; + ImageRequestGenerator<DemoteImageRequest> generator(io_ctx, &counter); + r = generator.execute(); + + std::cout << "Demoted " << counter.load() << " mirrored images" << std::endl; + return r; +} + +Shell::Action action_bootstrap_create( + {"mirror", "pool", "peer", "bootstrap", "create"}, {}, + "Create a peer bootstrap token to import in a remote cluster", "", + &get_peer_bootstrap_create_arguments, &execute_peer_bootstrap_create); +Shell::Action action_bootstreap_import( + {"mirror", "pool", "peer", "bootstrap", "import"}, {}, + "Import a peer bootstrap token created from a remote cluster", "", + &get_peer_bootstrap_import_arguments, &execute_peer_bootstrap_import); + +Shell::Action action_add( + {"mirror", "pool", "peer", "add"}, {}, + "Add a mirroring peer to a pool.", "", + &get_peer_add_arguments, &execute_peer_add); +Shell::Action action_remove( + {"mirror", "pool", "peer", "remove"}, {}, + "Remove a mirroring peer from a pool.", "", + &get_peer_remove_arguments, &execute_peer_remove); +Shell::Action action_set( + {"mirror", "pool", "peer", "set"}, {}, + "Update mirroring peer settings.", "", + &get_peer_set_arguments, &execute_peer_set); + +Shell::Action action_disable( + {"mirror", "pool", "disable"}, {}, + "Disable RBD mirroring by default within a pool.", "", + &get_disable_arguments, &execute_disable); +Shell::Action action_enable( + {"mirror", "pool", "enable"}, {}, + "Enable RBD mirroring by default within a pool.", "", + &get_enable_arguments, &execute_enable); +Shell::Action action_info( + {"mirror", "pool", "info"}, {}, + "Show information about the pool mirroring configuration.", {}, + 
&get_info_arguments, &execute_info); +Shell::Action action_status( + {"mirror", "pool", "status"}, {}, + "Show status for all mirrored images in the pool.", {}, + &get_status_arguments, &execute_status); +Shell::Action action_promote( + {"mirror", "pool", "promote"}, {}, + "Promote all non-primary images in the pool.", {}, + &get_promote_arguments, &execute_promote); +Shell::Action action_demote( + {"mirror", "pool", "demote"}, {}, + "Demote all primary images in the pool.", {}, + &get_demote_arguments, &execute_demote); + +} // namespace mirror_pool +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Namespace.cc b/src/tools/rbd/action/Namespace.cc new file mode 100644 index 00000000..746ab40c --- /dev/null +++ b/src/tools/rbd/action/Namespace.cc @@ -0,0 +1,191 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <algorithm> +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace ns { + +namespace at = argument_types; +namespace po = boost::program_options; + +void get_create_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); +} + +int execute_create(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + if (namespace_name.empty()) { + std::cerr << "rbd: namespace name was not specified" << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r 
= utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.namespace_create(io_ctx, namespace_name.c_str()); + if (r < 0) { + std::cerr << "rbd: failed to created namespace: " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +void get_remove_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); +} + +int execute_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + if (namespace_name.empty()) { + std::cerr << "rbd: namespace name was not specified" << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.namespace_remove(io_ctx, namespace_name.c_str()); + if (r == -EBUSY) { + std::cerr << "rbd: namespace contains images which must be deleted first." + << std::endl; + return r; + } else if (r == -ENOENT) { + std::cerr << "rbd: namespace does not exist." 
<< std::endl; + return r; + } else if (r < 0) { + std::cerr << "rbd: failed to remove namespace: " << cpp_strerror(r) + << std::endl; + return r; + } + + return 0; +} + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + at::add_format_options(options); +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + std::vector<std::string> names; + r = rbd.namespace_list(io_ctx, &names); + if (r < 0 && r != -ENOENT) { + std::cerr << "rbd: failed to list namespaces: " << cpp_strerror(r) + << std::endl; + return r; + } + + std::sort(names.begin(), names.end()); + + TextTable tbl; + if (formatter) { + formatter->open_array_section("namespaces"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + } + + for (auto& name : names) { + if (formatter) { + formatter->open_object_section("namespace"); + formatter->dump_string("name", name); + formatter->close_section(); + } else { + tbl << name << TextTable::endrow; + } + } + + if (formatter) { + formatter->close_section(); + formatter->flush(std::cout); + } else if (!names.empty()) { + std::cout << tbl; + } + + return 0; +} + +Shell::Action action_create( + {"namespace", "create"}, {}, + "Create an RBD image namespace.", "", + &get_create_arguments, &execute_create); + +Shell::Action action_remove( + {"namespace", "remove"}, {"namespace", "rm"}, + "Remove an RBD image namespace.", "", + &get_remove_arguments, 
&execute_remove); + +Shell::Action action_list( + {"namespace", "list"}, {"namespace", "ls"}, "List RBD image namespaces.", "", + &get_list_arguments, &execute_list); + +} // namespace ns +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Nbd.cc b/src/tools/rbd/action/Nbd.cc new file mode 100644 index 00000000..5c55adea --- /dev/null +++ b/src/tools/rbd/action/Nbd.cc @@ -0,0 +1,286 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/stringify.h" +#include "common/SubProcess.h" +#include <iostream> +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/predicate.hpp> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace nbd { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int call_nbd_cmd(const po::variables_map &vm, + const std::vector<std::string> &args, + const std::vector<std::string> &ceph_global_init_args) { + char exe_path[PATH_MAX]; + ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path, + sizeof(exe_path) - 1); + if (exe_path_bytes < 0) { + strcpy(exe_path, "rbd-nbd"); + } else { + if (snprintf(exe_path + exe_path_bytes, + sizeof(exe_path) - exe_path_bytes, + "-nbd") < 0) { + return -EOVERFLOW; + } + } + + SubProcess process(exe_path, SubProcess::KEEP, SubProcess::KEEP, SubProcess::KEEP); + + for (auto &arg : ceph_global_init_args) { + process.add_cmd_arg(arg.c_str()); + } + + for (auto &arg : args) { + process.add_cmd_arg(arg.c_str()); + } + + if (process.spawn()) { + std::cerr << "rbd: failed to run rbd-nbd: " << process.err() << std::endl; + return -EINVAL; + } else if (process.join()) { + std::cerr << "rbd: rbd-nbd failed with error: " << process.err() << std::endl; + return -EINVAL; + } + + return 0; +} + +int get_image_or_snap_spec(const po::variables_map &vm, 
std::string *spec) { + size_t arg_index = 0; + std::string pool_name; + std::string nspace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + spec->append(pool_name); + spec->append("/"); + if (!nspace_name.empty()) { + spec->append(nspace_name); + spec->append("/"); + } + spec->append(image_name); + if (!snap_name.empty()) { + spec->append("@"); + spec->append(snap_name); + } + + return 0; +} + +int parse_options(const std::vector<std::string> &options, + std::vector<std::string> *args) { + for (auto &opts : options) { + std::vector<std::string> args_; + boost::split(args_, opts, boost::is_any_of(",")); + for (auto &o : args_) { + args->push_back("--" + o); + } + } + + return 0; +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { +#if defined(__FreeBSD__) + std::cerr << "rbd: nbd device is not supported" << std::endl; + return -EOPNOTSUPP; +#endif + std::vector<std::string> args; + + args.push_back("list-mapped"); + + if (vm.count("format")) { + args.push_back("--format"); + args.push_back(vm["format"].as<at::Format>().value); + } + if (vm["pretty-format"].as<bool>()) { + args.push_back("--pretty-format"); + } + + return call_nbd_cmd(vm, args, ceph_global_init_args); +} + +int execute_map(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { +#if defined(__FreeBSD__) + std::cerr << "rbd: nbd device is not supported" << std::endl; + return -EOPNOTSUPP; +#endif + std::vector<std::string> args; + + args.push_back("map"); + std::string img; + int r = get_image_or_snap_spec(vm, &img); + if (r < 0) { + return r; + } + args.push_back(img); + + if (vm["read-only"].as<bool>()) { + args.push_back("--read-only"); + } + + if 
(vm["exclusive"].as<bool>()) { + args.push_back("--exclusive"); + } + + if (vm.count("options")) { + r = parse_options(vm["options"].as<std::vector<std::string>>(), &args); + if (r < 0) { + return r; + } + } + + return call_nbd_cmd(vm, args, ceph_global_init_args); +} + +int execute_unmap(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { +#if defined(__FreeBSD__) + std::cerr << "rbd: nbd device is not supported" << std::endl; + return -EOPNOTSUPP; +#endif + std::string device_name = utils::get_positional_argument(vm, 0); + if (!boost::starts_with(device_name, "/dev/")) { + device_name.clear(); + } + + std::string image_name; + if (device_name.empty()) { + int r = get_image_or_snap_spec(vm, &image_name); + if (r < 0) { + return r; + } + } + + if (device_name.empty() && image_name.empty()) { + std::cerr << "rbd: unmap requires either image name or device path" + << std::endl; + return -EINVAL; + } + + std::vector<std::string> args; + + args.push_back("unmap"); + args.push_back(device_name.empty() ? 
image_name : device_name); + + if (vm.count("options")) { + int r = parse_options(vm["options"].as<std::vector<std::string>>(), &args); + if (r < 0) { + return r; + } + } + + return call_nbd_cmd(vm, args, ceph_global_init_args); +} + +void get_list_arguments_deprecated(po::options_description *positional, + po::options_description *options) { + at::add_format_options(options); +} + +int execute_list_deprecated(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_args) { + std::cerr << "rbd: 'nbd list' command is deprecated, " + << "use 'device list -t nbd' instead" << std::endl; + return execute_list(vm, ceph_global_args); +} + +void get_map_arguments_deprecated(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + options->add_options() + ("read-only", po::bool_switch(), "map read-only") + ("exclusive", po::bool_switch(), "forbid writes by other clients") + ("device", po::value<std::string>(), "specify nbd device") + ("nbds_max", po::value<std::string>(), "override module param nbds_max") + ("max_part", po::value<std::string>(), "override module param max_part") + ("timeout", po::value<std::string>(), "set nbd request timeout (seconds)"); +} + +int execute_map_deprecated(const po::variables_map &vm_deprecated, + const std::vector<std::string> &ceph_global_args) { + std::cerr << "rbd: 'nbd map' command is deprecated, " + << "use 'device map -t nbd' instead" << std::endl; + + po::options_description options; + options.add_options() + ("options,o", po::value<std::vector<std::string>>() + ->default_value(std::vector<std::string>(), ""), ""); + + po::variables_map vm = vm_deprecated; + po::store(po::command_line_parser({}).options(options).run(), vm); + + std::vector<std::string> opts; + if (vm_deprecated.count("device")) { + opts.push_back("device=" + vm_deprecated["device"].as<std::string>()); + } + if (vm_deprecated.count("nbds_max")) 
{ + opts.push_back("nbds_max=" + vm_deprecated["nbds_max"].as<std::string>()); + } + if (vm_deprecated.count("max_part")) { + opts.push_back("max_part=" + vm_deprecated["max_part"].as<std::string>()); + } + if (vm_deprecated.count("timeout")) { + opts.push_back("timeout=" + vm_deprecated["timeout"].as<std::string>()); + } + + vm.at("options").value() = boost::any(opts); + + return execute_map(vm, ceph_global_args); +} + +void get_unmap_arguments_deprecated(po::options_description *positional, + po::options_description *options) { + positional->add_options() + ("image-or-snap-or-device-spec", + "image, snapshot, or device specification\n" + "[<pool-name>/]<image-name>[@<snapshot-name>] or <device-path>"); + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE); +} + +int execute_unmap_deprecated(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_args) { + std::cerr << "rbd: 'nbd unmap' command is deprecated, " + << "use 'device unmap -t nbd' instead" << std::endl; + return execute_unmap(vm, ceph_global_args); +} + +Shell::SwitchArguments switched_arguments({"read-only", "exclusive"}); + +Shell::Action action_show_deprecated( + {"nbd", "list"}, {"nbd", "ls"}, "List the nbd devices already used.", "", + &get_list_arguments_deprecated, &execute_list_deprecated, false); + +Shell::Action action_map_deprecated( + {"nbd", "map"}, {}, "Map image to a nbd device.", "", + &get_map_arguments_deprecated, &execute_map_deprecated, false); + +Shell::Action action_unmap_deprecated( + {"nbd", "unmap"}, {}, "Unmap a nbd device.", "", + &get_unmap_arguments_deprecated, &execute_unmap_deprecated, false); + +} // namespace nbd +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/ObjectMap.cc b/src/tools/rbd/action/ObjectMap.cc new file mode 100644 index 00000000..40ee2d47 --- /dev/null +++ 
b/src/tools/rbd/action/ObjectMap.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace object_map { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_object_map_rebuild(librbd::Image &image, bool no_progress) +{ + utils::ProgressContext pc("Object Map Rebuild", no_progress); + int r = image.rebuild_object_map(pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_rebuild_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute_rebuild(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_object_map_rebuild(image, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: rebuilding object map failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +static int do_object_map_check(librbd::Image &image, bool no_progress) +{ + 
utils::ProgressContext pc("Object Map Check", no_progress); + int r = image.check_object_map(pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_check_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_or_snap_spec_options(positional, options, + at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute_check(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_object_map_check(image, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: checking object map failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +Shell::Action action_rebuild( + {"object-map", "rebuild"}, {}, "Rebuild an invalid object map.", "", + &get_rebuild_arguments, &execute_rebuild); +Shell::Action action_check( + {"object-map", "check"}, {}, "Verify the object map is correct.", "", + &get_check_arguments, &execute_check); + +} // namespace object_map +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Perf.cc b/src/tools/rbd/action/Perf.cc new file mode 100644 index 00000000..8f76e85f --- /dev/null +++ b/src/tools/rbd/action/Perf.cc @@ -0,0 +1,699 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/stringify.h" +#include "common/ceph_context.h" +#include "common/ceph_json.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "global/global_context.h" +#include <ncurses.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/types.h> +#include <iostream> +#include <vector> +#include <boost/algorithm/string.hpp> +#include <boost/assign.hpp> +#include <boost/bimap.hpp> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace perf { + +namespace at = argument_types; +namespace po = boost::program_options; + +namespace { + +enum class StatDescriptor { + WRITE_OPS = 0, + READ_OPS, + WRITE_BYTES, + READ_BYTES, + WRITE_LATENCY, + READ_LATENCY +}; + +typedef boost::bimap<StatDescriptor, std::string> StatDescriptors; + +static const StatDescriptors STAT_DESCRIPTORS = + boost::assign::list_of<StatDescriptors::relation> + (StatDescriptor::WRITE_OPS, "write_ops") + (StatDescriptor::READ_OPS, "read_ops") + (StatDescriptor::WRITE_BYTES, "write_bytes") + (StatDescriptor::READ_BYTES, "read_bytes") + (StatDescriptor::WRITE_LATENCY, "write_latency") + (StatDescriptor::READ_LATENCY, "read_latency"); + +std::ostream& operator<<(std::ostream& os, const StatDescriptor& val) { + auto it = STAT_DESCRIPTORS.left.find(val); + if (it == STAT_DESCRIPTORS.left.end()) { + os << "unknown (" << static_cast<int>(val) << ")"; + } else { + os << it->second; + } + return os; +} + +void validate(boost::any& v, const std::vector<std::string>& values, + StatDescriptor *target_type, int) { + po::validators::check_first_occurrence(v); + std::string s = po::validators::get_single_string(values); + boost::replace_all(s, "_", " "); + boost::replace_all(s, "-", "_"); + + auto it = STAT_DESCRIPTORS.right.find(s); + if (it == STAT_DESCRIPTORS.right.end()) { + throw 
po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(it->second); +} + +struct ImageStat { + ImageStat(const std::string& pool_name, const std::string& pool_namespace, + const std::string& image_name) + : pool_name(pool_name), pool_namespace(pool_namespace), + image_name(image_name) { + stats.resize(STAT_DESCRIPTORS.size()); + } + + std::string pool_name; + std::string pool_namespace; + std::string image_name; + std::vector<double> stats; +}; + +typedef std::vector<ImageStat> ImageStats; + +typedef std::pair<std::string, std::string> SpecPair; + +std::string format_pool_spec(const std::string& pool, + const std::string& pool_namespace) { + std::string pool_spec{pool}; + if (!pool_namespace.empty()) { + pool_spec += "/" + pool_namespace; + } + return pool_spec; +} + +int query_iostats(librados::Rados& rados, const std::string& pool_spec, + StatDescriptor sort_by, ImageStats* image_stats, + std::ostream& err_os) { + auto sort_by_str = STAT_DESCRIPTORS.left.find(sort_by)->second; + + std::string cmd = R"( + { + "prefix": "rbd perf image stats", + "pool_spec": ")" + pool_spec + R"(", + "sort_by": ")" + sort_by_str + R"(", + "format": "json" + }")"; + + bufferlist in_bl; + bufferlist out_bl; + std::string outs; + int r = rados.mgr_command(cmd, in_bl, &out_bl, &outs); + if (r == -EOPNOTSUPP) { + err_os << "rbd: 'rbd_support' mgr module is not enabled." + << std::endl << std::endl + << "Use 'ceph mgr module enable rbd_support' to enable." 
+ << std::endl; + return r; + } else if (r < 0) { + err_os << "rbd: mgr command failed: " << cpp_strerror(r); + if (!outs.empty()) { + err_os << ": " << outs; + } + err_os << std::endl; + return r; + } + + json_spirit::mValue json_root; + if (!json_spirit::read(out_bl.to_str(), json_root)) { + err_os << "rbd: error parsing perf stats" << std::endl; + return -EINVAL; + } + + image_stats->clear(); + try { + auto& root = json_root.get_obj(); + + // map JSON stat descriptor order to our internal order + std::map<uint32_t, uint32_t> json_to_internal_stats; + auto& json_stat_descriptors = root["stat_descriptors"].get_array(); + for (size_t idx = 0; idx < json_stat_descriptors.size(); ++idx) { + auto it = STAT_DESCRIPTORS.right.find( + json_stat_descriptors[idx].get_str()); + if (it == STAT_DESCRIPTORS.right.end()) { + continue; + } + json_to_internal_stats[idx] = static_cast<uint32_t>(it->second); + } + + // cache a mapping from pool descriptors back to pool-specs + std::map<std::string, SpecPair> json_to_internal_pools; + auto& pool_descriptors = root["pool_descriptors"].get_obj(); + for (auto& pool : pool_descriptors) { + auto& pool_spec = pool.second.get_str(); + auto pos = pool_spec.rfind("/"); + + SpecPair pair{pool_spec.substr(0, pos), ""}; + if (pos != std::string::npos) { + pair.second = pool_spec.substr(pos + 1); + } + + json_to_internal_pools[pool.first] = pair; + } + + auto& stats = root["stats"].get_array(); + for (auto& stat : stats) { + auto& stat_obj = stat.get_obj(); + if (!stat_obj.empty()) { + auto& image_spec = stat_obj.begin()->first; + + auto pos = image_spec.find("/"); + SpecPair pair{image_spec.substr(0, pos), ""}; + if (pos != std::string::npos) { + pair.second = image_spec.substr(pos + 1); + } + + const auto pool_it = json_to_internal_pools.find(pair.first); + if (pool_it == json_to_internal_pools.end()) { + continue; + } + + image_stats->emplace_back( + pool_it->second.first, pool_it->second.second, pair.second); + + auto& image_stat = 
image_stats->back(); + auto& data = stat_obj.begin()->second.get_array(); + for (auto& indexes : json_to_internal_stats) { + image_stat.stats[indexes.second] = data[indexes.first].get_real(); + } + } + } + } catch (std::runtime_error &e) { + err_os << "rbd: error parsing perf stats: " << e.what() << std::endl; + return -EINVAL; + } + + return 0; +} + +void format_stat(StatDescriptor stat_descriptor, double stat, + std::ostream& os) { + switch (stat_descriptor) { + case StatDescriptor::WRITE_OPS: + case StatDescriptor::READ_OPS: + os << si_u_t(stat) << "/s"; + break; + case StatDescriptor::WRITE_BYTES: + case StatDescriptor::READ_BYTES: + os << byte_u_t(stat) << "/s"; + break; + case StatDescriptor::WRITE_LATENCY: + case StatDescriptor::READ_LATENCY: + os << std::fixed << std::setprecision(2); + if (stat >= 1000000000) { + os << (stat / 1000000000) << " s"; + } else if (stat >= 1000000) { + os << (stat / 1000000) << " ms"; + } else if (stat >= 1000) { + os << (stat / 1000) << " us"; + } else { + os << stat << " ns"; + } + break; + default: + ceph_assert(false); + break; + } +} + +} // anonymous namespace + +namespace iostat { + +struct Iterations {}; + +void validate(boost::any& v, const std::vector<std::string>& values, + Iterations *target_type, int) { + po::validators::check_first_occurrence(v); + auto& s = po::validators::get_single_string(values); + + try { + auto iterations = boost::lexical_cast<uint32_t>(s); + if (iterations > 0) { + v = boost::any(iterations); + return; + } + } catch (const boost::bad_lexical_cast &) { + } + throw po::validation_error(po::validation_error::invalid_option_value); +} + +void format(const ImageStats& image_stats, Formatter* f, bool global_search) { + TextTable tbl; + if (f) { + f->open_array_section("images"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + for (auto& stat : STAT_DESCRIPTORS.left) { + std::string title; + switch (stat.first) { + case StatDescriptor::WRITE_OPS: + title = "WR "; + 
break; + case StatDescriptor::READ_OPS: + title = "RD "; + break; + case StatDescriptor::WRITE_BYTES: + title = "WR_BYTES "; + break; + case StatDescriptor::READ_BYTES: + title = "RD_BYTES "; + break; + case StatDescriptor::WRITE_LATENCY: + title = "WR_LAT "; + break; + case StatDescriptor::READ_LATENCY: + title = "RD_LAT "; + break; + default: + ceph_assert(false); + break; + } + tbl.define_column(title, TextTable::RIGHT, TextTable::RIGHT); + } + } + + for (auto& image_stat : image_stats) { + if (f) { + f->open_object_section("image"); + f->dump_string("pool", image_stat.pool_name); + f->dump_string("pool_namespace", image_stat.pool_namespace); + f->dump_string("image", image_stat.image_name); + for (auto& pair : STAT_DESCRIPTORS.left) { + f->dump_float(pair.second.c_str(), + image_stat.stats[static_cast<size_t>(pair.first)]); + } + f->close_section(); + } else { + std::string name; + if (global_search) { + name += image_stat.pool_name + "/"; + if (!image_stat.pool_namespace.empty()) { + name += image_stat.pool_namespace + "/"; + } + } + name += image_stat.image_name; + + tbl << name; + for (auto& pair : STAT_DESCRIPTORS.left) { + std::stringstream str; + format_stat(pair.first, + image_stat.stats[static_cast<size_t>(pair.first)], str); + str << ' '; + tbl << str.str(); + } + tbl << TextTable::endrow; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + std::cout << tbl << std::endl; + } +} + +} // namespace iostat + +namespace iotop { + +class MainWindow { +public: + MainWindow(librados::Rados& rados, const std::string& pool_spec) + : m_rados(rados), m_pool_spec(pool_spec) { + initscr(); + curs_set(0); + cbreak(); + noecho(); + keypad(stdscr, TRUE); + nodelay(stdscr, TRUE); + + init_columns(); + } + + int run() { + redraw(); + + int r = 0; + std::stringstream err_str; + while (true) { + r = query_iostats(m_rados, m_pool_spec, m_sort_by, &m_image_stats, + err_str); + if (r < 0) { + break; + return r; + } + + redraw(); + 
wait_for_key_or_delay(); + + int ch = getch(); + if (ch == 'q' || ch == 'Q') { + break; + } else if (ch == '<' || ch == KEY_LEFT) { + auto it = STAT_DESCRIPTORS.left.find(m_sort_by); + if (it != STAT_DESCRIPTORS.left.begin()) { + m_sort_by = (--it)->first; + } + } else if (ch == '>' || ch == KEY_RIGHT) { + auto it = STAT_DESCRIPTORS.left.find(m_sort_by); + if (it != STAT_DESCRIPTORS.left.end() && + ++it != STAT_DESCRIPTORS.left.end()) { + m_sort_by = it->first; + } + } + } + + endwin(); + + if (r < 0) { + std::cerr << err_str.str() << std::endl; + } + return r; + } + +private: + static const size_t STAT_COLUMN_WIDTH = 12; + + librados::Rados& m_rados; + std::string m_pool_spec; + + ImageStats m_image_stats; + StatDescriptor m_sort_by = StatDescriptor::WRITE_OPS; + + bool m_pending_win_opened = false; + WINDOW* m_pending_win = nullptr; + + int m_height = 1; + int m_width = 1; + + std::map<StatDescriptor, std::string> m_columns; + + void init_columns() { + m_columns.clear(); + for (auto& pair : STAT_DESCRIPTORS.left) { + std::string title; + switch (pair.first) { + case StatDescriptor::WRITE_OPS: + title = "WRITES OPS"; + break; + case StatDescriptor::READ_OPS: + title = "READS OPS"; + break; + case StatDescriptor::WRITE_BYTES: + title = "WRITE BYTES"; + break; + case StatDescriptor::READ_BYTES: + title = "READ BYTES"; + break; + case StatDescriptor::WRITE_LATENCY: + title = "WRITE LAT"; + break; + case StatDescriptor::READ_LATENCY: + title = "READ LAT"; + break; + default: + ceph_assert(false); + break; + } + m_columns[pair.first] = (title); + } + } + + void redraw() { + getmaxyx(stdscr, m_height, m_width); + + redraw_main_window(); + redraw_pending_window(); + + doupdate(); + } + + void redraw_main_window() { + werase(stdscr); + mvhline(0, 0, ' ' | A_REVERSE, m_width); + + // print header for all metrics + int remaining_cols = m_width; + std::stringstream str; + for (auto& pair : m_columns) { + int attr = A_REVERSE; + std::string title; + if (pair.first == 
m_sort_by) { + title += '>'; + attr |= A_BOLD; + } else { + title += ' '; + } + title += pair.second; + + str.str(""); + str << std::right << std::setfill(' ') + << std::setw(STAT_COLUMN_WIDTH) + << title << ' '; + + attrset(attr); + addstr(str.str().c_str()); + remaining_cols -= title.size(); + } + + attrset(A_REVERSE); + addstr("IMAGE"); + attrset(A_NORMAL); + + // print each image (one per line) + int row = 1; + int remaining_lines = m_height - 1; + for (auto& image_stat : m_image_stats) { + if (remaining_lines <= 0) { + break; + } + --remaining_lines; + + move(row++, 0); + for (auto& pair : m_columns) { + str.str(""); + format_stat(pair.first, + image_stat.stats[static_cast<size_t>(pair.first)], str); + auto value = str.str().substr(0, STAT_COLUMN_WIDTH); + + str.str(""); + str << std::right << std::setfill(' ') + << std::setw(STAT_COLUMN_WIDTH) + << value << ' '; + addstr(str.str().c_str()); + } + + std::string image; + if (m_pool_spec.empty()) { + image = format_pool_spec(image_stat.pool_name, + image_stat.pool_namespace) + "/"; + } + image += image_stat.image_name; + addstr(image.substr(0, remaining_cols).c_str()); + } + + wnoutrefresh(stdscr); + } + + void redraw_pending_window() { + // draw a "please by patient" window while waiting + const char* msg = "Waiting for initial stats"; + int height = 5; + int width = strlen(msg) + 4;; + int starty = (m_height - height) / 2; + int startx = (m_width - width) / 2; + + if (m_image_stats.empty() && !m_pending_win_opened) { + m_pending_win_opened = true; + m_pending_win = newwin(height, width, starty, startx); + } + + if (m_pending_win != nullptr) { + if (m_image_stats.empty()) { + box(m_pending_win, 0 , 0); + mvwaddstr(m_pending_win, 2, 2, msg); + wnoutrefresh(m_pending_win); + } else { + delwin(m_pending_win); + m_pending_win = nullptr; + } + } + } + + void wait_for_key_or_delay() { + fd_set fds; + FD_ZERO(&fds); + FD_SET(STDIN_FILENO, &fds); + + // no point to refreshing faster than the stats period + struct 
timeval tval; + tval.tv_sec = std::min<uint32_t>( + 10, g_conf().get_val<int64_t>("mgr_stats_period")); + tval.tv_usec = 0; + + select(STDIN_FILENO + 1, &fds, NULL, NULL, &tval); + } +}; + +} // namespace iotop + + +void get_arguments_iostat(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); + options->add_options() + ("iterations", po::value<iostat::Iterations>(), + "iterations of metric collection [> 0]") + ("sort-by", po::value<StatDescriptor>()->default_value(StatDescriptor::WRITE_OPS), + "sort-by IO metric " + "(write-ops, read-ops, write-bytes, read-bytes, write-latency, read-latency) " + "[default: write-ops]"); + at::add_format_options(options); +} + +int execute_iostat(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool; + std::string pool_namespace; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, false, false, &pool, + &pool_namespace, &arg_index); + if (r < 0) { + return r; + } + + uint32_t iterations = 0; + if (vm.count("iterations")) { + iterations = vm["iterations"].as<uint32_t>(); + } + auto sort_by = vm["sort-by"].as<StatDescriptor>(); + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + auto f = formatter.get(); + if (iterations > 1 && f != nullptr) { + std::cerr << "rbd: specifing iterations is not valid with formatted output" + << std::endl; + return -EINVAL; + } + + librados::Rados rados; + r = utils::init_rados(&rados); + if (r < 0) { + return r; + } + + r = rados.wait_for_latest_osdmap(); + if (r < 0) { + std::cerr << "rbd: failed to retrieve OSD map" << std::endl; + return r; + } + + std::string pool_spec = format_pool_spec(pool, pool_namespace); + + // no point to refreshing faster than the stats period + auto delay = std::min<uint32_t>(10, g_conf().get_val<int64_t>("mgr_stats_period")); + + ImageStats image_stats; + uint32_t 
count = 0; + bool printed_notice = false; + while (count++ < iterations || iterations == 0) { + r = query_iostats(rados, pool_spec, sort_by, &image_stats, std::cerr); + if (r < 0) { + return r; + } + + if (count == 1 && image_stats.empty()) { + count = 0; + if (!printed_notice) { + std::cerr << "rbd: waiting for initial image stats" + << std::endl << std::endl;; + printed_notice = true; + } + } else { + iostat::format(image_stats, f, pool_spec.empty()); + if (f != nullptr) { + break; + } + } + + sleep(delay); + } + + return 0; +} + +void get_arguments_iotop(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); +} + +int execute_iotop(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool; + std::string pool_namespace; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, false, false, &pool, + &pool_namespace, &arg_index); + if (r < 0) { + return r; + } + + librados::Rados rados; + r = utils::init_rados(&rados); + if (r < 0) { + return r; + } + + r = rados.wait_for_latest_osdmap(); + if (r < 0) { + std::cerr << "rbd: failed to retrieve OSD map" << std::endl; + return r; + } + + iotop::MainWindow mainWindow(rados, format_pool_spec(pool, pool_namespace)); + r = mainWindow.run(); + if (r < 0) { + return r; + } + + return 0; +} + +Shell::Action stat_action( + {"perf", "image", "iostat"}, {}, "Display image IO statistics.", "", + &get_arguments_iostat, &execute_iostat); +Shell::Action top_action( + {"perf", "image", "iotop"}, {}, "Display a top-like IO monitor.", "", + &get_arguments_iotop, &execute_iotop); + +} // namespace perf +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Pool.cc b/src/tools/rbd/action/Pool.cc new file mode 100644 index 00000000..f1718eb1 --- /dev/null +++ b/src/tools/rbd/action/Pool.cc @@ -0,0 +1,162 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// 
vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace pool { + +namespace at = argument_types; +namespace po = boost::program_options; + +void get_arguments_init(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, false); + options->add_options() + ("force", po::bool_switch(), + "force initialize pool for RBD use if registered by another application"); +} + +int execute_init(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + nullptr, &arg_index); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, "", &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = rbd.pool_init(io_ctx, vm["force"].as<bool>()); + if (r == -EOPNOTSUPP) { + std::cerr << "rbd: luminous or later release required." << std::endl; + } else if (r == -EPERM) { + std::cerr << "rbd: pool already registered to a different application." 
+ << std::endl; + } else if (r < 0) { + std::cerr << "rbd: error registered application: " << cpp_strerror(r) + << std::endl; + } + + return 0; +} + +void get_arguments_stats(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); + at::add_format_options(options); +} + +int execute_stats(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + uint64_t image_count; + uint64_t provisioned_bytes; + uint64_t snap_count; + uint64_t trash_count; + uint64_t trash_provisioned_bytes; + uint64_t trash_snap_count; + + librbd::PoolStats pool_stats; + pool_stats.add(RBD_POOL_STAT_OPTION_IMAGES, &image_count); + pool_stats.add(RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES, + &provisioned_bytes); + pool_stats.add(RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snap_count); + pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_IMAGES, &trash_count); + pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES, + &trash_provisioned_bytes); + pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &trash_snap_count); + + r = rbd.pool_stats_get(io_ctx, &pool_stats); + if (r < 0) { + std::cerr << "rbd: failed to query pool stats: " << cpp_strerror(r) + << std::endl; + return r; + } + + if (formatter) { + formatter->open_object_section("stats"); + formatter->open_object_section("images"); + formatter->dump_unsigned("count", image_count); + formatter->dump_unsigned("provisioned_bytes", provisioned_bytes); + 
formatter->dump_unsigned("snap_count", snap_count); + formatter->close_section(); + formatter->open_object_section("trash"); + formatter->dump_unsigned("count", trash_count); + formatter->dump_unsigned("provisioned_bytes", trash_provisioned_bytes); + formatter->dump_unsigned("snap_count", trash_snap_count); + formatter->close_section(); + formatter->close_section(); + formatter->flush(std::cout); + } else { + std::cout << "Total Images: " << image_count; + if (trash_count > 0) { + std::cout << " (" << trash_count << " in trash)"; + } + std::cout << std::endl; + + std::cout << "Total Snapshots: " << snap_count; + if (trash_count > 0) { + std::cout << " (" << trash_snap_count << " in trash)"; + } + std::cout << std::endl; + + std::cout << "Provisioned Size: " << byte_u_t(provisioned_bytes); + if (trash_count > 0) { + std::cout << " (" << byte_u_t(trash_provisioned_bytes) << " in trash)"; + } + std::cout << std::endl; + } + + return 0; +} + +Shell::Action init_action( + {"pool", "init"}, {}, "Initialize pool for use by RBD.", "", + &get_arguments_init, &execute_init); +Shell::Action stat_action( + {"pool", "stats"}, {}, "Display pool statistics.", + "Note: legacy v1 images are not included in stats", + &get_arguments_stats, &execute_stats); + +} // namespace pool +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Remove.cc b/src/tools/rbd/action/Remove.cc new file mode 100644 index 00000000..337d42be --- /dev/null +++ b/src/tools/rbd/action/Remove.cc @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "include/stringify.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace remove { + +namespace { + +bool is_auto_delete_snapshot(librbd::Image* image, + const librbd::snap_info_t 
&snap_info) { + librbd::snap_namespace_type_t namespace_type; + int r = image->snap_get_namespace_type(snap_info.id, &namespace_type); + if (r < 0) { + return false; + } + + switch (namespace_type) { + case RBD_SNAP_NAMESPACE_TYPE_TRASH: + return true; + default: + return false; + } +} + +} // anonymous namespace + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_delete(librbd::RBD &rbd, librados::IoCtx& io_ctx, + const char *imgname, bool no_progress) +{ + utils::ProgressContext pc("Removing image", no_progress); + int r = rbd.remove_with_progress(io_ctx, imgname, pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + io_ctx.set_osdmap_full_try(); + + librbd::RBD rbd; + r = do_delete(rbd, io_ctx, image_name.c_str(), + vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + if (r == -ENOTEMPTY) { + librbd::Image image; + std::vector<librbd::snap_info_t> snaps; + int image_r = utils::open_image(io_ctx, image_name, true, &image); + if (image_r >= 0) { + image_r = image.snap_list(snaps); + } + if (image_r >= 0) { + snaps.erase(std::remove_if(snaps.begin(), snaps.end(), + 
[&image](const librbd::snap_info_t& snap) { + return is_auto_delete_snapshot(&image, + snap); + }), + snaps.end()); + } + + if (!snaps.empty()) { + std::cerr << "rbd: image has snapshots - these must be deleted" + << " with 'rbd snap purge' before the image can be removed." + << std::endl; + } else { + std::cerr << "rbd: image has snapshots with linked clones - these must " + << "be deleted or flattened before the image can be removed." + << std::endl; + } + } else if (r == -EBUSY) { + std::cerr << "rbd: error: image still has watchers" + << std::endl + << "This means the image is still open or the client using " + << "it crashed. Try again after closing/unmapping it or " + << "waiting 30s for the crashed client to timeout." + << std::endl; + } else if (r == -EMLINK) { + librbd::Image image; + int image_r = utils::open_image(io_ctx, image_name, true, &image); + librbd::group_info_t group_info; + if (image_r == 0) { + image_r = image.get_group(&group_info, sizeof(group_info)); + } + if (image_r == 0) { + std::string pool_name = ""; + librados::Rados rados(io_ctx); + librados::IoCtx pool_io_ctx; + image_r = rados.ioctx_create2(group_info.pool, pool_io_ctx); + if (image_r < 0) { + pool_name = "<missing group pool " + stringify(group_info.pool) + ">"; + } else { + pool_name = pool_io_ctx.get_pool_name(); + } + std::cerr << "rbd: error: image belongs to a group " + << pool_name << "/"; + if (!io_ctx.get_namespace().empty()) { + std::cerr << io_ctx.get_namespace() << "/"; + } + std::cerr << group_info.name; + } else + std::cerr << "rbd: error: image belongs to a group"; + + std::cerr << std::endl + << "Remove the image from the group and try again." 
+ << std::endl; + image.close(); + } else { + std::cerr << "rbd: delete error: " << cpp_strerror(r) << std::endl; + } + return r; + } + return 0; +} + +Shell::Action action( + {"remove"}, {"rm"}, "Delete an image.", "", &get_arguments, &execute); + +} // namespace remove +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Rename.cc b/src/tools/rbd/action/Rename.cc new file mode 100644 index 00000000..b4954bcb --- /dev/null +++ b/src/tools/rbd/action/Rename.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace rename { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_rename(librbd::RBD &rbd, librados::IoCtx& io_ctx, + const char *imgname, const char *destname) +{ + int r = rbd.rename(io_ctx, imgname, destname); + if (r < 0) + return r; + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE); + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + std::string dst_image_name; + std::string dst_snap_name; + std::string dst_pool_name = pool_name; + std::string 
dst_namespace_name = namespace_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, + &dst_namespace_name, &dst_image_name, &dst_snap_name, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + if (r < 0) { + return r; + } + + if (pool_name != dst_pool_name) { + std::cerr << "rbd: mv/rename across pools not supported" << std::endl + << "source pool: " << pool_name << " dest pool: " << dst_pool_name + << std::endl; + return -EINVAL; + } else if (namespace_name != dst_namespace_name) { + std::cerr << "rbd: mv/rename across namespaces not supported" << std::endl + << "source namespace: " << namespace_name << " dest namespace: " + << dst_namespace_name << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + librbd::RBD rbd; + r = do_rename(rbd, io_ctx, image_name.c_str(), dst_image_name.c_str()); + if (r < 0) { + std::cerr << "rbd: rename error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"rename"}, {"mv"}, "Rename image within pool.", "", &get_arguments, + &execute); + +} // namespace rename +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Resize.cc b/src/tools/rbd/action/Resize.cc new file mode 100644 index 00000000..60c16429 --- /dev/null +++ b/src/tools/rbd/action/Resize.cc @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace resize { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_resize(librbd::Image& image, uint64_t size, bool allow_shrink, bool 
no_progress) +{ + utils::ProgressContext pc("Resizing image", no_progress); + int r = image.resize2(size, allow_shrink, pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_size_option(options); + options->add_options() + ("allow-shrink", po::bool_switch(), "permit shrinking"); + at::add_no_progress_option(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + uint64_t size; + r = utils::get_image_size(vm, &size); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", + snap_name, false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + librbd::image_info_t info; + r = image.stat(info, sizeof(info)); + if (r < 0) { + std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl; + return r; + } + + if (info.size == size) { + std::cerr << "rbd: new size is equal to original size " << std::endl; + return -EINVAL; + } + + if (info.size > size && !vm["allow-shrink"].as<bool>()) { + r = -EINVAL; + } else { + r = do_resize(image, size, vm["allow-shrink"].as<bool>(), vm[at::NO_PROGRESS].as<bool>()); + } + + if (r < 0) { + if (r == -EINVAL && !vm["allow-shrink"].as<bool>()) { + std::cerr << "rbd: shrinking an image is only allowed with the " + << "--allow-shrink flag" << std::endl; + 
return r; + } + std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::SwitchArguments switched_arguments({"allow-shrink"}); +Shell::Action action( + {"resize"}, {}, "Resize (expand or shrink) image.", "", &get_arguments, + &execute); + +} // namespace resize +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Snap.cc b/src/tools/rbd/action/Snap.cc new file mode 100644 index 00000000..70cf62da --- /dev/null +++ b/src/tools/rbd/action/Snap.cc @@ -0,0 +1,889 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/types.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include <iostream> +#include <boost/program_options.hpp> +#include <boost/bind.hpp> + +namespace rbd { +namespace action { +namespace snap { + +static const std::string ALL_NAME("all"); + +namespace at = argument_types; +namespace po = boost::program_options; + +int do_list_snaps(librbd::Image& image, Formatter *f, bool all_snaps, librados::Rados& rados) +{ + std::vector<librbd::snap_info_t> snaps; + TextTable t; + int r; + + r = image.snap_list(snaps); + if (r < 0) { + std::cerr << "rbd: unable to list snapshots" << std::endl; + return r; + } + + if (!all_snaps) { + snaps.erase(remove_if(snaps.begin(), + snaps.end(), + boost::bind(utils::is_not_user_snap_namespace, &image, _1)), + snaps.end()); + } + + if (f) { + f->open_array_section("snapshots"); + } else { + t.define_column("SNAPID", TextTable::LEFT, TextTable::RIGHT); + t.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + t.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT); + t.define_column("PROTECTED", TextTable::LEFT, TextTable::LEFT); + t.define_column("TIMESTAMP", TextTable::LEFT, TextTable::RIGHT); + if 
(all_snaps) { + t.define_column("NAMESPACE", TextTable::LEFT, TextTable::LEFT); + } + } + + std::list<std::pair<int64_t, std::string>> pool_list; + rados.pool_list2(pool_list); + std::map<int64_t, std::string> pool_map(pool_list.begin(), pool_list.end()); + + for (std::vector<librbd::snap_info_t>::iterator s = snaps.begin(); + s != snaps.end(); ++s) { + struct timespec timestamp; + bool snap_protected = false; + image.snap_get_timestamp(s->id, ×tamp); + string tt_str = ""; + if(timestamp.tv_sec != 0) { + time_t tt = timestamp.tv_sec; + tt_str = ctime(&tt); + tt_str = tt_str.substr(0, tt_str.length() - 1); + } + + librbd::snap_namespace_type_t snap_namespace; + r = image.snap_get_namespace_type(s->id, &snap_namespace); + if (r < 0) { + std::cerr << "rbd: unable to retrieve snap namespace" << std::endl; + return r; + } + + std::string snap_namespace_name = "Unknown"; + switch (snap_namespace) { + case RBD_SNAP_NAMESPACE_TYPE_USER: + snap_namespace_name = "user"; + break; + case RBD_SNAP_NAMESPACE_TYPE_GROUP: + snap_namespace_name = "group"; + break; + case RBD_SNAP_NAMESPACE_TYPE_TRASH: + snap_namespace_name = "trash"; + break; + } + + int get_trash_res = -ENOENT; + std::string trash_original_name; + int get_group_res = -ENOENT; + librbd::snap_group_namespace_t group_snap; + if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_GROUP) { + get_group_res = image.snap_get_group_namespace(s->id, &group_snap, + sizeof(group_snap)); + } else if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_TRASH) { + get_trash_res = image.snap_get_trash_namespace( + s->id, &trash_original_name); + } + + std::string protected_str = ""; + if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_USER) { + r = image.snap_is_protected(s->name.c_str(), &snap_protected); + if (r < 0) { + std::cerr << "rbd: unable to retrieve snap protection" << std::endl; + return r; + } + } + + if (f) { + protected_str = snap_protected ? 
"true" : "false"; + f->open_object_section("snapshot"); + f->dump_unsigned("id", s->id); + f->dump_string("name", s->name); + f->dump_unsigned("size", s->size); + f->dump_string("protected", protected_str); + f->dump_string("timestamp", tt_str); + if (all_snaps) { + f->open_object_section("namespace"); + f->dump_string("type", snap_namespace_name); + if (get_group_res == 0) { + std::string pool_name = pool_map[group_snap.group_pool]; + f->dump_string("pool", pool_name); + f->dump_string("group", group_snap.group_name); + f->dump_string("group snap", group_snap.group_snap_name); + } else if (get_trash_res == 0) { + f->dump_string("original_name", trash_original_name); + } + f->close_section(); + } + f->close_section(); + } else { + protected_str = snap_protected ? "yes" : ""; + t << s->id << s->name << stringify(byte_u_t(s->size)) << protected_str << tt_str; + + if (all_snaps) { + ostringstream oss; + oss << snap_namespace_name; + + if (get_group_res == 0) { + std::string pool_name = pool_map[group_snap.group_pool]; + oss << " (" << pool_name << "/" + << group_snap.group_name << "@" + << group_snap.group_snap_name << ")"; + } else if (get_trash_res == 0) { + oss << " (" << trash_original_name << ")"; + } + + t << oss.str(); + } + t << TextTable::endrow; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else if (snaps.size()) { + std::cout << t; + } + + return 0; +} + +int do_add_snap(librbd::Image& image, const char *snapname) +{ + int r = image.snap_create(snapname); + if (r < 0) + return r; + + return 0; +} + +int do_remove_snap(librbd::Image& image, const char *snapname, bool force, + bool no_progress) +{ + uint32_t flags = force? 
RBD_SNAP_REMOVE_FORCE : 0; + int r = 0; + utils::ProgressContext pc("Removing snap", no_progress); + + r = image.snap_remove2(snapname, flags, pc); + if (r < 0) { + pc.fail(); + return r; + } + + pc.finish(); + return 0; +} + +int do_rollback_snap(librbd::Image& image, const char *snapname, + bool no_progress) +{ + utils::ProgressContext pc("Rolling back to snapshot", no_progress); + int r = image.snap_rollback_with_progress(snapname, pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +int do_purge_snaps(librbd::Image& image, bool no_progress) +{ + utils::ProgressContext pc("Removing all snapshots", no_progress); + std::vector<librbd::snap_info_t> snaps; + bool is_protected = false; + int r = image.snap_list(snaps); + if (r < 0) { + pc.fail(); + return r; + } else if (0 == snaps.size()) { + return 0; + } else { + list<std::string> protect; + snaps.erase(remove_if(snaps.begin(), + snaps.end(), + boost::bind(utils::is_not_user_snap_namespace, &image, _1)), + snaps.end()); + for (auto it = snaps.begin(); it != snaps.end();) { + r = image.snap_is_protected(it->name.c_str(), &is_protected); + if (r < 0) { + pc.fail(); + return r; + } else if (is_protected == true) { + protect.push_back(it->name.c_str()); + snaps.erase(it); + } else { + ++it; + } + } + + if (!protect.empty()) { + std::cout << "rbd: error removing snapshot(s) '" << protect << "', which " + << (1 == protect.size() ? "is" : "are") + << " protected - these must be unprotected with " + << "`rbd snap unprotect`." 
+ << std::endl; + } + for (size_t i = 0; i < snaps.size(); ++i) { + r = image.snap_remove(snaps[i].name.c_str()); + if (r < 0) { + pc.fail(); + return r; + } + pc.update_progress(i + 1, snaps.size() + protect.size()); + } + + if (!protect.empty()) { + pc.fail(); + } else if (snaps.size() > 0) { + pc.finish(); + } + + return 0; + } +} + +int do_protect_snap(librbd::Image& image, const char *snapname) +{ + int r = image.snap_protect(snapname); + if (r < 0) + return r; + + return 0; +} + +int do_unprotect_snap(librbd::Image& image, const char *snapname) +{ + int r = image.snap_unprotect(snapname); + if (r < 0) + return r; + + return 0; +} + +int do_set_limit(librbd::Image& image, uint64_t limit) +{ + return image.snap_set_limit(limit); +} + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_id_option(options); + at::add_format_options(options); + + std::string name = ALL_NAME + ",a"; + + options->add_options() + (name.c_str(), po::bool_switch(), "list snapshots from all namespaces"); +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + std::string image_id; + + if (vm.count(at::IMAGE_ID)) { + image_id = vm[at::IMAGE_ID].as<std::string>(); + } + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, image_id.empty(), + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + if (!image_id.empty() && !image_name.empty()) { + std::cerr << "rbd: trying to access image using both name and id. 
" + << std::endl; + return -EINVAL; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, + image_id, "", true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + bool all_snaps = vm[ALL_NAME].as<bool>(); + r = do_list_snaps(image, formatter.get(), all_snaps, rados); + if (r < 0) { + cerr << "rbd: failed to list snapshots: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_create_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); +} + +int execute_create(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_SNAP); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_add_snap(image, snap_name.c_str()); + if (r < 0) { + cerr << "rbd: failed to create snapshot: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_remove_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_id_option(options); + at::add_snap_id_option(options); + at::add_no_progress_option(options); + + 
options->add_options() + ("force", po::bool_switch(), "flatten children and unprotect snapshot if needed."); +} + +int execute_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + std::string image_id; + uint64_t snap_id = CEPH_NOSNAP; + bool force = vm["force"].as<bool>(); + bool no_progress = vm[at::NO_PROGRESS].as<bool>(); + + if (vm.count(at::IMAGE_ID)) { + image_id = vm[at::IMAGE_ID].as<std::string>(); + } + if (vm.count(at::SNAPSHOT_ID)) { + snap_id = vm[at::SNAPSHOT_ID].as<uint64_t>(); + } + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, image_id.empty(), + (snap_id == CEPH_NOSNAP ? utils::SNAPSHOT_PRESENCE_REQUIRED : + utils::SNAPSHOT_PRESENCE_PERMITTED), + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + if (!image_id.empty() && !image_name.empty()) { + std::cerr << "rbd: trying to access image using both name and id." + << std::endl; + return -EINVAL; + } else if (!snap_name.empty() && snap_id != CEPH_NOSNAP) { + std::cerr << "rbd: trying to access snapshot using both name and id." + << std::endl; + return -EINVAL; + } else if ((force || no_progress) && snap_id != CEPH_NOSNAP) { + std::cerr << "rbd: force and no-progress options not permitted when " + << "removing by id." 
<< std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + io_ctx.set_osdmap_full_try(); + if (image_id.empty()) { + r = utils::open_image(io_ctx, image_name, false, &image); + } else { + r = utils::open_image_by_id(io_ctx, image_id, false, &image); + } + if (r < 0) { + return r; + } + + if (!snap_name.empty()) { + r = do_remove_snap(image, snap_name.c_str(), force, no_progress); + } else { + r = image.snap_remove_by_id(snap_id); + } + + if (r < 0) { + if (r == -EBUSY) { + std::cerr << "rbd: snapshot " + << (snap_name.empty() ? std::string("id ") + stringify(snap_id) : + std::string("'") + snap_name + "'") + << " is protected from removal." << std::endl; + } else { + std::cerr << "rbd: failed to remove snapshot: " << cpp_strerror(r) + << std::endl; + } + return r; + } + return 0; +} + +void get_purge_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_id_option(options); + at::add_no_progress_option(options); +} + +int execute_purge(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + std::string image_id; + + if (vm.count(at::IMAGE_ID)) { + image_id = vm[at::IMAGE_ID].as<std::string>(); + } + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, image_id.empty(), + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + if (!image_id.empty() && !image_name.empty()) { + std::cerr << "rbd: trying to access image using both name and id. 
" + << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + io_ctx.set_osdmap_full_try(); + if (image_id.empty()) { + r = utils::open_image(io_ctx, image_name, false, &image); + } else { + r = utils::open_image_by_id(io_ctx, image_id, false, &image); + } + if (r < 0) { + return r; + } + + r = do_purge_snaps(image, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + if (r != -EBUSY) { + std::cerr << "rbd: removing snaps failed: " << cpp_strerror(r) + << std::endl; + } + return r; + } + return 0; +} + +void get_rollback_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); +} + +int execute_rollback(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_rollback_snap(image, snap_name.c_str(), + vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: rollback failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +void get_protect_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); +} + +int 
execute_protect(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + bool is_protected = false; + r = image.snap_is_protected(snap_name.c_str(), &is_protected); + if (r < 0) { + std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r) + << std::endl; + return r; + } else if (is_protected) { + std::cerr << "rbd: snap is already protected" << std::endl; + return -EBUSY; + } + + r = do_protect_snap(image, snap_name.c_str()); + if (r < 0) { + std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_unprotect_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_id_option(options); +} + +int execute_unprotect(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + std::string image_id; + + if (vm.count(at::IMAGE_ID)) { + image_id = vm[at::IMAGE_ID].as<std::string>(); + } + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, image_id.empty(), + utils::SNAPSHOT_PRESENCE_REQUIRED, 
utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + if (!image_id.empty() && !image_name.empty()) { + std::cerr << "rbd: trying to access image using both name and id. " + << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + io_ctx.set_osdmap_full_try(); + if (image_id.empty()) { + r = utils::open_image(io_ctx, image_name, false, &image); + } else { + r = utils::open_image_by_id(io_ctx, image_id, false, &image); + } + if (r < 0) { + return r; + } + + bool is_protected = false; + r = image.snap_is_protected(snap_name.c_str(), &is_protected); + if (r < 0) { + std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r) + << std::endl; + return r; + } else if (!is_protected) { + std::cerr << "rbd: snap is already unprotected" << std::endl; + return -EINVAL; + } + + r = do_unprotect_snap(image, snap_name.c_str()); + if (r < 0) { + std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_set_limit_arguments(po::options_description *pos, + po::options_description *opt) { + at::add_image_spec_options(pos, opt, at::ARGUMENT_MODIFIER_NONE); + at::add_limit_option(opt); +} + +int execute_set_limit(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + uint64_t limit; + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + if (vm.count(at::LIMIT)) { + limit = vm[at::LIMIT].as<uint64_t>(); + } else { + std::cerr << "rbd: must specify --limit <num>" << std::endl; + return -ERANGE; 
+ } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_set_limit(image, limit); + if (r < 0) { + std::cerr << "rbd: setting snapshot limit failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_clear_limit_arguments(po::options_description *pos, + po::options_description *opt) { + at::add_image_spec_options(pos, opt, at::ARGUMENT_MODIFIER_NONE); +} + +int execute_clear_limit(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_set_limit(image, UINT64_MAX); + if (r < 0) { + std::cerr << "rbd: clearing snapshot limit failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +void get_rename_arguments(po::options_description *positional, + po::options_description *options) { + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE); + at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST); +} + +int execute_rename(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string src_snap_name; + int r = 
utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name, + &image_name, &src_snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return -r; + } + + std::string dest_pool_name(pool_name); + std::string dest_namespace_name(namespace_name); + std::string dest_image_name; + std::string dest_snap_name; + r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dest_pool_name, + &dest_namespace_name, &dest_image_name, &dest_snap_name, true, + utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_SNAP); + if (r < 0) { + return -r; + } + + if (pool_name != dest_pool_name) { + std::cerr << "rbd: source and destination pool must be the same" + << std::endl; + return -EINVAL; + } else if (namespace_name != dest_namespace_name) { + std::cerr << "rbd: source and destination namespace must be the same" + << std::endl; + return -EINVAL; + } else if (image_name != dest_image_name) { + std::cerr << "rbd: source and destination image name must be the same" + << std::endl; + return -EINVAL; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = image.snap_rename(src_snap_name.c_str(), dest_snap_name.c_str()); + if (r < 0) { + std::cerr << "rbd: renaming snap failed: " << cpp_strerror(r) + << std::endl; + return r; + } + return 0; +} + +Shell::Action action_list( + {"snap", "list"}, {"snap", "ls"}, "Dump list of image snapshots.", "", + &get_list_arguments, &execute_list); +Shell::Action action_create( + {"snap", "create"}, {"snap", "add"}, "Create a snapshot.", "", + &get_create_arguments, &execute_create); +Shell::Action action_remove( + {"snap", "remove"}, {"snap", "rm"}, "Delete a snapshot.", "", + &get_remove_arguments, &execute_remove); +Shell::Action 
action_purge( + {"snap", "purge"}, {}, "Delete all unprotected snapshots.", "", + &get_purge_arguments, &execute_purge); +Shell::Action action_rollback( + {"snap", "rollback"}, {"snap", "revert"}, "Rollback image to snapshot.", "", + &get_rollback_arguments, &execute_rollback); +Shell::Action action_protect( + {"snap", "protect"}, {}, "Prevent a snapshot from being deleted.", "", + &get_protect_arguments, &execute_protect); +Shell::Action action_unprotect( + {"snap", "unprotect"}, {}, "Allow a snapshot to be deleted.", "", + &get_unprotect_arguments, &execute_unprotect); +Shell::Action action_set_limit( + {"snap", "limit", "set"}, {}, "Limit the number of snapshots.", "", + &get_set_limit_arguments, &execute_set_limit); +Shell::Action action_clear_limit( + {"snap", "limit", "clear"}, {}, "Remove snapshot limit.", "", + &get_clear_limit_arguments, &execute_clear_limit); +Shell::Action action_rename( + {"snap", "rename"}, {}, "Rename a snapshot.", "", + &get_rename_arguments, &execute_rename); + +} // namespace snap +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Sparsify.cc b/src/tools/rbd/action/Sparsify.cc new file mode 100644 index 00000000..a345f920 --- /dev/null +++ b/src/tools/rbd/action/Sparsify.cc @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace sparsify { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_sparsify(librbd::Image& image, size_t sparse_size, + bool no_progress) +{ + utils::ProgressContext pc("Image sparsify", no_progress); + int r = image.sparsify_with_progress(sparse_size, pc); + if (r < 0) { + pc.fail(); + return r; + } + pc.finish(); + return 0; +} + +void 
get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_no_progress_option(options); + at::add_sparse_size_option(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + false, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE; + if (vm.count(at::IMAGE_SPARSE_SIZE)) { + sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>(); + } + + r = do_sparsify(image, sparse_size, vm[at::NO_PROGRESS].as<bool>()); + if (r < 0) { + std::cerr << "rbd: sparsify error: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"sparsify"}, {}, + "Reclaim space for zeroed image extents.", "", + &get_arguments, &execute); + +} // namespace sparsify +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Status.cc b/src/tools/rbd/action/Status.cc new file mode 100644 index 00000000..0a599e7f --- /dev/null +++ b/src/tools/rbd/action/Status.cc @@ -0,0 +1,214 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/rbd_types.h" +#include "include/stringify.h" +#include "common/errno.h" +#include 
"common/Formatter.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace status { + +namespace at = argument_types; +namespace po = boost::program_options; + +static int do_show_status(librados::IoCtx& io_ctx, const std::string &image_name, + librbd::Image &image, Formatter *f) +{ + int r; + std::list<librbd::image_watcher_t> watchers; + + r = image.list_watchers(watchers); + if (r < 0) + return r; + + uint64_t features; + r = image.features(&features); + if (r < 0) { + return r; + } + + librbd::image_migration_status_t migration_status; + std::string source_pool_name; + std::string dest_pool_name; + std::string migration_state; + if ((features & RBD_FEATURE_MIGRATING) != 0) { + r = librbd::RBD().migration_status(io_ctx, image_name.c_str(), + &migration_status, + sizeof(migration_status)); + if (r < 0) { + std::cerr << "rbd: getting migration status failed: " << cpp_strerror(r) + << std::endl; + // not fatal + } else { + librados::IoCtx src_io_ctx; + r = librados::Rados(io_ctx).ioctx_create2(migration_status.source_pool_id, src_io_ctx); + if (r < 0) { + source_pool_name = stringify(migration_status.source_pool_id); + } else { + source_pool_name = src_io_ctx.get_pool_name(); + } + + librados::IoCtx dst_io_ctx; + r = librados::Rados(io_ctx).ioctx_create2(migration_status.dest_pool_id, dst_io_ctx); + if (r < 0) { + dest_pool_name = stringify(migration_status.dest_pool_id); + } else { + dest_pool_name = dst_io_ctx.get_pool_name(); + } + + switch (migration_status.state) { + case RBD_IMAGE_MIGRATION_STATE_ERROR: + migration_state = "error"; + break; + case RBD_IMAGE_MIGRATION_STATE_PREPARING: + migration_state = "preparing"; + break; + case RBD_IMAGE_MIGRATION_STATE_PREPARED: + migration_state = "prepared"; + break; + case RBD_IMAGE_MIGRATION_STATE_EXECUTING: + migration_state = "executing"; + break; + case RBD_IMAGE_MIGRATION_STATE_EXECUTED: + migration_state = "executed"; + break; + case 
RBD_IMAGE_MIGRATION_STATE_ABORTING: + migration_state = "aborting"; + break; + default: + migration_state = "unknown"; + } + } + } + + if (f) + f->open_object_section("status"); + + if (f) { + f->open_array_section("watchers"); + for (auto &watcher : watchers) { + f->open_object_section("watcher"); + f->dump_string("address", watcher.addr); + f->dump_unsigned("client", watcher.id); + f->dump_unsigned("cookie", watcher.cookie); + f->close_section(); + } + f->close_section(); // watchers + if (!migration_state.empty()) { + f->open_object_section("migration"); + f->dump_string("source_pool_name", source_pool_name); + f->dump_string("source_pool_namespace", + migration_status.source_pool_namespace); + f->dump_string("source_image_name", migration_status.source_image_name); + f->dump_string("source_image_id", migration_status.source_image_id); + f->dump_string("dest_pool_name", dest_pool_name); + f->dump_string("dest_pool_namespace", + migration_status.dest_pool_namespace); + f->dump_string("dest_image_name", migration_status.dest_image_name); + f->dump_string("dest_image_id", migration_status.dest_image_id); + f->dump_string("state", migration_state); + f->dump_string("state_description", migration_status.state_description); + f->close_section(); // migration + } + } else { + if (watchers.size()) { + std::cout << "Watchers:" << std::endl; + for (auto &watcher : watchers) { + std::cout << "\twatcher=" << watcher.addr << " client." 
<< watcher.id + << " cookie=" << watcher.cookie << std::endl; + } + } else { + std::cout << "Watchers: none" << std::endl; + } + if (!migration_state.empty()) { + if (!migration_status.source_pool_namespace.empty()) { + source_pool_name += ("/" + migration_status.source_pool_namespace); + } + if (!migration_status.dest_pool_namespace.empty()) { + dest_pool_name += ("/" + migration_status.dest_pool_namespace); + } + + std::cout << "Migration:" << std::endl; + std::cout << "\tsource: " << source_pool_name << "/" + << migration_status.source_image_name; + if (!migration_status.source_image_id.empty()) { + std::cout << " (" << migration_status.source_image_id << ")"; + } + std::cout << std::endl; + std::cout << "\tdestination: " << dest_pool_name << "/" + << migration_status.dest_image_name << " (" + << migration_status.dest_image_id << ")" << std::endl; + std::cout << "\tstate: " << migration_state; + if (!migration_status.state_description.empty()) { + std::cout << " (" << migration_status.state_description << ")"; + } + std::cout << std::endl; + } + } + + if (f) { + f->close_section(); // status + f->flush(std::cout); + } + + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + at::add_format_options(options); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + 
librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = do_show_status(io_ctx, image_name, image, formatter.get()); + if (r < 0) { + std::cerr << "rbd: show status failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"status"}, {}, "Show the status of this image.", "", &get_arguments, + &execute); + +} // namespace status +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Trash.cc b/src/tools/rbd/action/Trash.cc new file mode 100644 index 00000000..327b20ba --- /dev/null +++ b/src/tools/rbd/action/Trash.cc @@ -0,0 +1,525 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "common/Clock.h" +#include <iostream> +#include <sstream> +#include <boost/program_options.hpp> +#include <boost/bind.hpp> + +namespace rbd { +namespace action { +namespace trash { + +namespace at = argument_types; +namespace po = boost::program_options; + +//Optional arguments used only by this set of commands (rbd trash *) +static const std::string EXPIRES_AT("expires-at"); +static const std::string EXPIRED_BEFORE("expired-before"); +static const std::string THRESHOLD("threshold"); + +static bool is_not_trash_user(const librbd::trash_image_info_t &trash_info) { + return trash_info.source != RBD_TRASH_IMAGE_SOURCE_USER; +} + +void get_move_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); + options->add_options() + (EXPIRES_AT.c_str(), po::value<std::string>()->default_value("now"), + "set the expiration time of an image so it can be purged when it is stale"); +} + +int execute_move(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + utime_t now = ceph_clock_now(); + utime_t exp_time = now; + std::string expires_at; + if (vm.find(EXPIRES_AT) != vm.end()) { + 
expires_at = vm[EXPIRES_AT].as<std::string>(); + r = utime_t::invoke_date(expires_at, &exp_time); + if (r < 0) { + std::cerr << "rbd: error calling /bin/date: " << cpp_strerror(r) + << std::endl; + return r; + } + } + + time_t dt = (exp_time - now).sec(); + if(dt < 0) { + std::cerr << "rbd: cannot use a date in the past as an expiration date" + << std::endl; + return -EINVAL; + } + + librbd::RBD rbd; + r = rbd.trash_move(io_ctx, image_name.c_str(), dt); + if (r < 0) { + std::cerr << "rbd: deferred delete error: " << cpp_strerror(r) + << std::endl; + } + + return r; +} + +void get_remove_arguments(po::options_description *positional, + po::options_description *options) { + positional->add_options() + (at::IMAGE_ID.c_str(), "image id\n(example: [<pool-name>/[<namespace>/]]<image-id>)"); + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_id_option(options); + + at::add_no_progress_option(options); + options->add_options() + ("force", po::bool_switch(), "force remove of non-expired delayed images"); +} + +int execute_remove(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_id; + int r = utils::get_pool_image_id(vm, &arg_index, &pool_name, &namespace_name, + &image_id); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + io_ctx.set_osdmap_full_try(); + librbd::RBD rbd; + + utils::ProgressContext pc("Removing image", vm[at::NO_PROGRESS].as<bool>()); + r = rbd.trash_remove_with_progress(io_ctx, image_id.c_str(), + vm["force"].as<bool>(), pc); + if (r < 0) { + if (r == -ENOTEMPTY) { + std::cerr << "rbd: image has snapshots - these must be deleted" + << " with 'rbd snap purge' before the image can be removed." 
+ << std::endl; + } else if (r == -EBUSY) { + std::cerr << "rbd: error: image still has watchers" + << std::endl + << "This means the image is still open or the client using " + << "it crashed. Try again after closing/unmapping it or " + << "waiting 30s for the crashed client to timeout." + << std::endl; + } else if (r == -EMLINK) { + std::cerr << std::endl + << "Remove the image from the group and try again." + << std::endl; + } else if (r == -EPERM) { + std::cerr << std::endl + << "Deferment time has not expired, please use --force if you " + << "really want to remove the image" + << std::endl; + } else { + std::cerr << "rbd: remove error: " << cpp_strerror(r) << std::endl; + } + pc.fail(); + return r; + } + + pc.finish(); + + return r; +} + +std::string delete_status(time_t deferment_end_time) { + time_t now = time(nullptr); + + std::string time_str = ctime(&deferment_end_time); + time_str = time_str.substr(0, time_str.length() - 1); + + std::stringstream ss; + if (now < deferment_end_time) { + ss << "protected until " << time_str; + } else { + ss << "expired at " << time_str; + } + + return ss.str(); +} + +int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool long_flag, + bool all_flag, Formatter *f) { + std::vector<librbd::trash_image_info_t> trash_entries; + int r = rbd.trash_list(io_ctx, trash_entries); + if (r < 0) { + return r; + } + + if (!all_flag) { + trash_entries.erase(remove_if(trash_entries.begin(), + trash_entries.end(), + boost::bind(is_not_trash_user, _1)), + trash_entries.end()); + } + + if (!long_flag) { + if (f) { + f->open_array_section("trash"); + } + for (const auto& entry : trash_entries) { + if (f) { + f->open_object_section("image"); + f->dump_string("id", entry.id); + f->dump_string("name", entry.name); + f->close_section(); + } else { + std::cout << entry.id << " " << entry.name << std::endl; + } + } + if (f) { + f->close_section(); + f->flush(std::cout); + } + return 0; + } + + TextTable tbl; + + if (f) { + 
f->open_array_section("trash"); + } else { + tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DELETED_AT", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("STATUS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT); + } + + for (const auto& entry : trash_entries) { + librbd::Image im; + + r = rbd.open_by_id_read_only(io_ctx, im, entry.id.c_str(), NULL); + // image might disappear between rbd.list() and rbd.open(); ignore + // that, warn about other possible errors (EPERM, say, for opening + // an old-format image, because you need execute permission for the + // class method) + if (r < 0) { + if (r != -ENOENT) { + std::cerr << "rbd: error opening " << entry.id << ": " + << cpp_strerror(r) << std::endl; + } + // in any event, continue to next image + continue; + } + + std::string del_source; + switch (entry.source) { + case RBD_TRASH_IMAGE_SOURCE_USER: + del_source = "USER"; + break; + case RBD_TRASH_IMAGE_SOURCE_MIRRORING: + del_source = "MIRRORING"; + break; + case RBD_TRASH_IMAGE_SOURCE_MIGRATION: + del_source = "MIGRATION"; + break; + case RBD_TRASH_IMAGE_SOURCE_REMOVING: + del_source = "REMOVING"; + break; + } + + std::string time_str = ctime(&entry.deletion_time); + time_str = time_str.substr(0, time_str.length() - 1); + + bool has_parent = false; + std::string parent; + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + r = im.get_parent(&parent_image, &parent_snap); + if (r == -ENOENT) { + r = 0; + } else if (r < 0) { + return r; + } else { + parent = parent_image.pool_name + "/"; + if (!parent_image.pool_namespace.empty()) { + parent += parent_image.pool_namespace + "/"; + } + parent += parent_image.image_name + "@" + parent_snap.name; + has_parent = true; + } + + if (f) { + f->open_object_section("image"); + 
f->dump_string("id", entry.id); + f->dump_string("name", entry.name); + f->dump_string("source", del_source); + f->dump_string("deleted_at", time_str); + f->dump_string("status", + delete_status(entry.deferment_end_time)); + if (has_parent) { + f->open_object_section("parent"); + f->dump_string("pool", parent_image.pool_name); + f->dump_string("pool_namespace", parent_image.pool_namespace); + f->dump_string("image", parent_image.image_name); + f->dump_string("snapshot", parent_snap.name); + f->close_section(); + } + f->close_section(); + } else { + tbl << entry.id + << entry.name + << del_source + << time_str + << delete_status(entry.deferment_end_time); + if (has_parent) + tbl << parent; + tbl << TextTable::endrow; + } + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else if (!trash_entries.empty()) { + std::cout << tbl; + } + + return r < 0 ? r : 0; +} + +void get_list_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); + options->add_options() + ("all,a", po::bool_switch(), "list images from all sources"); + options->add_options() + ("long,l", po::bool_switch(), "long listing format"); + at::add_format_options(options); +} + +int execute_list(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + at::Format::Formatter formatter; + r = utils::get_formatter(vm, &formatter); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + utils::disable_cache(); + + librbd::RBD rbd; + r = do_list(rbd, io_ctx, vm["long"].as<bool>(), vm["all"].as<bool>(), + formatter.get()); + if (r < 0) { + std::cerr << 
"rbd: trash list: " << cpp_strerror(r) << std::endl; + return r; + } + + return 0; +} + +void get_purge_arguments(po::options_description *positional, + po::options_description *options) { + at::add_pool_options(positional, options, true); + at::add_no_progress_option(options); + + options->add_options() + (EXPIRED_BEFORE.c_str(), po::value<std::string>()->value_name("date"), + "purges images that expired before the given date"); + options->add_options() + (THRESHOLD.c_str(), po::value<float>(), + "purges images until the current pool data usage is reduced to X%, " + "value range: 0.0-1.0"); +} + +int execute_purge (const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + std::string pool_name; + std::string namespace_name; + size_t arg_index = 0; + int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name, + &namespace_name, &arg_index); + if (r < 0) { + return r; + } + + utils::disable_cache(); + + librbd::RBD rbd; + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + io_ctx.set_osdmap_full_try(); + + float threshold = -1; + time_t expire_ts = 0; + + if (vm.find(THRESHOLD) != vm.end()) { + threshold = vm[THRESHOLD].as<float>(); + } else { + if (vm.find(EXPIRED_BEFORE) != vm.end()) { + utime_t new_time; + r = utime_t::invoke_date(vm[EXPIRED_BEFORE].as<std::string>(), &new_time); + if (r < 0) { + std::cerr << "rbd: error calling /bin/date: " << cpp_strerror(r) + << std::endl; + return r; + } + expire_ts = new_time.sec(); + } + } + + utils::ProgressContext pc("Removing images", vm[at::NO_PROGRESS].as<bool>()); + r = rbd.trash_purge_with_progress(io_ctx, expire_ts, threshold, pc); + if (r < 0) { + pc.fail(); + } else { + pc.finish(); + } + + return 0; +} + +void get_restore_arguments(po::options_description *positional, + po::options_description *options) { + positional->add_options() + (at::IMAGE_ID.c_str(), "image 
id\n(example: [<pool-name>/]<image-id>)"); + at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE); + at::add_image_id_option(options); + at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, ""); +} + +int execute_restore(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_id; + int r = utils::get_pool_image_id(vm, &arg_index, &pool_name, &namespace_name, + &image_id); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + r = utils::init(pool_name, namespace_name, &rados, &io_ctx); + if (r < 0) { + return r; + } + + std::string name; + if (vm.find(at::IMAGE_NAME) != vm.end()) { + name = vm[at::IMAGE_NAME].as<std::string>(); + } + + librbd::RBD rbd; + r = rbd.trash_restore(io_ctx, image_id.c_str(), name.c_str()); + if (r < 0) { + if (r == -ENOENT) { + std::cerr << "rbd: error: image does not exist in trash" + << std::endl; + } else if (r == -EEXIST) { + std::cerr << "rbd: error: an image with the same name already exists, " + << "try again with with a different name" + << std::endl; + } else { + std::cerr << "rbd: restore error: " << cpp_strerror(r) << std::endl; + } + return r; + } + + return r; +} + + +Shell::Action action_move( + {"trash", "move"}, {"trash", "mv"}, "Move an image to the trash.", "", + &get_move_arguments, &execute_move); + +Shell::Action action_remove( + {"trash", "remove"}, {"trash", "rm"}, "Remove an image from trash.", "", + &get_remove_arguments, &execute_remove); + +Shell::Action action_purge( + {"trash", "purge"}, {}, "Remove all expired images from trash.", "", + &get_purge_arguments, &execute_purge); + +Shell::SwitchArguments switched_arguments({"long", "l"}); +Shell::Action action_list( + {"trash", "list"}, {"trash", "ls"}, "List trash images.", "", + &get_list_arguments, &execute_list); + 
+Shell::Action action_restore( + {"trash", "restore"}, {}, "Restore an image from trash.", "", + &get_restore_arguments, &execute_restore); + +} // namespace trash +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/action/Watch.cc b/src/tools/rbd/action/Watch.cc new file mode 100644 index 00000000..65f0f93d --- /dev/null +++ b/src/tools/rbd/action/Watch.cc @@ -0,0 +1,149 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/ArgumentTypes.h" +#include "tools/rbd/Shell.h" +#include "tools/rbd/Utils.h" +#include "include/rbd_types.h" +#include "librbd/WatchNotifyTypes.h" +#include "common/errno.h" +#include <iostream> +#include <boost/program_options.hpp> + +namespace rbd { +namespace action { +namespace watch { + +namespace at = argument_types; +namespace po = boost::program_options; + +class RbdWatchCtx : public librados::WatchCtx2 { +public: + RbdWatchCtx(librados::IoCtx& io_ctx, const char *image_name, + const std::string &header_oid) + : m_io_ctx(io_ctx), m_image_name(image_name), m_header_oid(header_oid) + { + } + + ~RbdWatchCtx() override {} + + void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) override { + using namespace librbd::watch_notify; + NotifyMessage notify_message; + if (bl.length() == 0) { + notify_message = NotifyMessage(HeaderUpdatePayload()); + } else { + try { + auto iter = bl.cbegin(); + notify_message.decode(iter); + } catch (const buffer::error &err) { + std::cerr << "rbd: failed to decode image notification" << std::endl; + } + } + + std::cout << m_image_name << " received notification: notify_id=" + << notify_id << ", cookie=" << cookie << ", notifier_id=" + << notifier_id << ", bl.length=" << bl.length() << ", notify_op=" + << notify_message.get_notify_op() << std::endl; + bufferlist reply; + m_io_ctx.notify_ack(m_header_oid, notify_id, cookie, reply); + } + + void handle_error(uint64_t cookie, int 
err) override { + std::cerr << m_image_name << " received error: cookie=" << cookie << ", " + << "err=" << cpp_strerror(err) << std::endl; + } +private: + librados::IoCtx m_io_ctx; + const char *m_image_name; + std::string m_header_oid; +}; + +static int do_watch(librados::IoCtx& pp, librbd::Image &image, + const char *imgname) +{ + uint8_t old_format; + int r = image.old_format(&old_format); + if (r < 0) { + std::cerr << "failed to query format" << std::endl; + return r; + } + + std::string header_oid; + if (old_format != 0) { + header_oid = std::string(imgname) + RBD_SUFFIX; + } else { + std::string id; + r = image.get_id(&id); + if (r < 0) { + return r; + } + + header_oid = RBD_HEADER_PREFIX + id; + } + + uint64_t cookie; + RbdWatchCtx ctx(pp, imgname, header_oid); + r = pp.watch2(header_oid, &cookie, &ctx); + if (r < 0) { + std::cerr << "rbd: watch failed" << std::endl; + return r; + } + + std::cout << "press enter to exit..." << std::endl; + getchar(); + + r = pp.unwatch2(cookie); + if (r < 0) { + std::cerr << "rbd: unwatch failed" << std::endl; + return r; + } + return 0; +} + +void get_arguments(po::options_description *positional, + po::options_description *options) { + at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE); +} + +int execute(const po::variables_map &vm, + const std::vector<std::string> &ceph_global_init_args) { + size_t arg_index = 0; + std::string pool_name; + std::string namespace_name; + std::string image_name; + std::string snap_name; + int r = utils::get_pool_image_snapshot_names( + vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name, + &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE, + utils::SPEC_VALIDATION_NONE); + if (r < 0) { + return r; + } + + librados::Rados rados; + librados::IoCtx io_ctx; + librbd::Image image; + r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "", + true, &rados, &io_ctx, &image); + if (r < 0) { + return r; + } + + r = 
do_watch(io_ctx, image, image_name.c_str()); + if (r < 0) { + std::cerr << "rbd: watch failed: " << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +Shell::Action action( + {"watch"}, {}, "Watch events on image.", "", &get_arguments, &execute); + +} // namespace watch +} // namespace action +} // namespace rbd diff --git a/src/tools/rbd/rbd.cc b/src/tools/rbd/rbd.cc new file mode 100644 index 00000000..a8c59d57 --- /dev/null +++ b/src/tools/rbd/rbd.cc @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd/Shell.h" + +int main(int argc, const char **argv) +{ + rbd::Shell shell; + return shell.execute(argc, argv); +} diff --git a/src/tools/rbd_ggate/CMakeLists.txt b/src/tools/rbd_ggate/CMakeLists.txt new file mode 100644 index 00000000..5c5572c4 --- /dev/null +++ b/src/tools/rbd_ggate/CMakeLists.txt @@ -0,0 +1,9 @@ +add_executable(rbd-ggate + Driver.cc + Server.cc + Watcher.cc + debug.cc + ggate_drv.c + main.cc) +target_link_libraries(rbd-ggate geom librbd librados global) +install(TARGETS rbd-ggate DESTINATION bin) diff --git a/src/tools/rbd_ggate/Driver.cc b/src/tools/rbd_ggate/Driver.cc new file mode 100644 index 00000000..752ef56f --- /dev/null +++ b/src/tools/rbd_ggate/Driver.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <stdlib.h> + +#include "common/debug.h" +#include "common/errno.h" +#include "Driver.h" +#include "Request.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd::ggate::Driver: " << this \ + << " " << __func__ << ": " + +namespace rbd { +namespace ggate { + +int Driver::load() { + + return ggate_drv_load(); +} + +int Driver::kill(const std::string &devname) { + + int r = ggate_drv_kill(devname.c_str()); + + return r; +} + +int Driver::list(std::map<std::string, DevInfo> *devices) { 
+ size_t size = 1024; + ggate_drv_info *devs = nullptr; + int r; + + while (size <= 1024 * 1024) { + devs = static_cast<ggate_drv_info *>( + realloc(static_cast<void *>(devs), size * sizeof(*devs))); + r = ggate_drv_list(devs, &size); + if (r != -ERANGE) { + break; + } + } + if (r < 0) { + goto free; + } + + devices->clear(); + for (size_t i = 0; i < size; i++) { + auto &dev = devs[i]; + (*devices)[dev.id] = {dev.name, dev.info}; + } + +free: + free(devs); + + return r; +} + +Driver::Driver(const std::string &devname, size_t sectorsize, size_t mediasize, + bool readonly, const std::string &info) + : m_devname(devname), m_sectorsize(sectorsize), m_mediasize(mediasize), + m_readonly(readonly), m_info(info) { +} + +int Driver::init() { + dout(20) << dendl; + + char name[PATH_MAX]; + size_t namelen; + + if (m_devname.empty()) { + name[0] = '\0'; + namelen = PATH_MAX; + } else { + namelen = m_devname.size(); + if (namelen >= PATH_MAX) { + return -ENAMETOOLONG; + } + strncpy(name, m_devname.c_str(), namelen + 1); + } + + int r = ggate_drv_create(name, namelen, m_sectorsize, m_mediasize, m_readonly, + m_info.c_str(), &m_drv); + if (r < 0) { + return r; + } + + if (m_devname.empty()) { + m_devname = name; + } + + return 0; +} + +std::string Driver::get_devname() const { + dout(30) << m_devname << dendl; + + return m_devname; +} + +void Driver::shut_down() { + dout(20) << dendl; + + ggate_drv_destroy(m_drv); +} + +int Driver::resize(size_t newsize) { + dout(20) << "newsize=" << newsize << dendl; + + int r = ggate_drv_resize(m_drv, newsize); + if (r < 0) { + return r; + } + + m_mediasize = newsize; + return 0; +} + +int Driver::recv(Request **req) { + dout(20) << dendl; + + ggate_drv_req_t req_; + + int r = ggate_drv_recv(m_drv, &req_); + if (r < 0) { + return r; + } + + *req = new Request(req_); + + dout(20) << "req=" << *req << dendl; + + if (ggate_drv_req_cmd(req_) == GGATE_DRV_CMD_WRITE) { + bufferptr ptr(buffer::claim_malloc( + ggate_drv_req_length(req_), + 
static_cast<char *>(ggate_drv_req_release_buf(req_)))); + (*req)->bl.push_back(ptr); + } + + return 0; +} + +int Driver::send(Request *req) { + dout(20) << "req=" << req << dendl; + + if (ggate_drv_req_cmd(req->req) == GGATE_DRV_CMD_READ && + ggate_drv_req_error(req->req) == 0) { + ceph_assert(req->bl.length() == ggate_drv_req_length(req->req)); + // TODO: avoid copying? + req->bl.copy(0, ggate_drv_req_length(req->req), + static_cast<char *>(ggate_drv_req_buf(req->req))); + dout(20) << "copied resulting " << req->bl.length() << " bytes to " + << ggate_drv_req_buf(req->req) << dendl; + } + + int r = ggate_drv_send(m_drv, req->req); + + delete req; + return r; +} + +} // namespace ggate +} // namespace rbd diff --git a/src/tools/rbd_ggate/Driver.h b/src/tools/rbd_ggate/Driver.h new file mode 100644 index 00000000..50be72b9 --- /dev/null +++ b/src/tools/rbd_ggate/Driver.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_GGATE_DRIVER_H +#define CEPH_RBD_GGATE_DRIVER_H + +#include <map> +#include <string> + +#include "ggate_drv.h" + +namespace rbd { +namespace ggate { + +struct Request; + +class Driver { +public: + typedef std::pair<std::string, std::string> DevInfo; + static int load(); + static int kill(const std::string &devname); + static int list(std::map<std::string, DevInfo> *devices); + + Driver(const std::string &devname, size_t sectorsize, size_t mediasize, + bool readonly, const std::string &info); + + int init(); + void shut_down(); + + std::string get_devname() const; + + int recv(Request **req); + int send(Request *req); + + int resize(size_t newsize); + +private: + std::string m_devname; + size_t m_sectorsize; + size_t m_mediasize; + bool m_readonly; + std::string m_info; + ggate_drv_t m_drv = 0; +}; + +} // namespace ggate +} // namespace rbd + +#endif // CEPH_RBD_GGATE_DRIVER_H + diff --git a/src/tools/rbd_ggate/Request.h b/src/tools/rbd_ggate/Request.h new file mode 
100644 index 00000000..66f21985 --- /dev/null +++ b/src/tools/rbd_ggate/Request.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_GGATE_REQUEST_H +#define CEPH_RBD_GGATE_REQUEST_H + +#include "ggate_drv.h" + +namespace rbd { +namespace ggate { + +struct Request { + enum Command { + Unknown = 0, + Write = 1, + Read = 2, + Flush = 3, + Discard = 4, + }; + + ggate_drv_req_t req; + bufferlist bl; + + Request(ggate_drv_req_t req) : req(req) { + } + + uint64_t get_id() { + return ggate_drv_req_id(req); + } + + Command get_cmd() { + return static_cast<Command>(ggate_drv_req_cmd(req)); + } + + size_t get_length() { + return ggate_drv_req_length(req); + } + + uint64_t get_offset() { + return ggate_drv_req_offset(req); + } + + uint64_t get_error() { + return ggate_drv_req_error(req); + } + + void set_error(int error) { + ggate_drv_req_set_error(req, error); + } +}; + +} // namespace ggate +} // namespace rbd + +#endif // CEPH_RBD_GGATE_REQUEST_H diff --git a/src/tools/rbd_ggate/Server.cc b/src/tools/rbd_ggate/Server.cc new file mode 100644 index 00000000..3beeec3f --- /dev/null +++ b/src/tools/rbd_ggate/Server.cc @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" +#include "Driver.h" +#include "Server.h" +#include "Request.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd::ggate::Server: " << this \ + << " " << __func__ << ": " + +namespace rbd { +namespace ggate { + +Server::Server(Driver *drv, librbd::Image& image) + : m_drv(drv), m_image(image), m_lock("rbd::ggate::Server::m_lock"), + m_reader_thread(this, &Server::reader_entry), + m_writer_thread(this, &Server::writer_entry) { +} + +void Server::run() { + dout(10) << dendl; + + int r = start(); + ceph_assert(r == 0); + + dout(20) 
<< "entering run loop" << dendl; + + { + Mutex::Locker locker(m_lock); + while (!m_stopping) { + m_cond.WaitInterval(m_lock, utime_t(1, 0)); + } + } + + dout(20) << "exiting run loop" << dendl; + + stop(); +} + +int Server::start() { + dout(10) << dendl; + + m_reader_thread.create("rbd_reader"); + m_writer_thread.create("rbd_writer"); + return 0; +} + +void Server::stop() { + dout(10) << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_stopping); + } + + m_reader_thread.join(); + m_writer_thread.join(); + + wait_clean(); +} + +void Server::io_start(IOContext *ctx) { + dout(20) << ctx << dendl; + + Mutex::Locker locker(m_lock); + m_io_pending.push_back(&ctx->item); +} + +void Server::io_finish(IOContext *ctx) { + dout(20) << ctx << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(ctx->item.is_on_list()); + + ctx->item.remove_myself(); + m_io_finished.push_back(&ctx->item); + m_cond.Signal(); +} + +Server::IOContext *Server::wait_io_finish() { + dout(20) << dendl; + + Mutex::Locker locker(m_lock); + + while (m_io_finished.empty() && !m_stopping) { + m_cond.Wait(m_lock); + } + + if (m_io_finished.empty()) { + return nullptr; + } + + IOContext *ret = m_io_finished.front(); + m_io_finished.pop_front(); + + return ret; +} + +void Server::wait_clean() { + dout(20) << dendl; + + ceph_assert(!m_reader_thread.is_started()); + + Mutex::Locker locker(m_lock); + + while (!m_io_pending.empty()) { + m_cond.Wait(m_lock); + } + + while (!m_io_finished.empty()) { + std::unique_ptr<IOContext> free_ctx(m_io_finished.front()); + m_io_finished.pop_front(); + } +} + +void Server::aio_callback(librbd::completion_t cb, void *arg) { + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast<librbd::RBD::AioCompletion*>(cb); + + IOContext *ctx = reinterpret_cast<IOContext *>(arg); + int r = aio_completion->get_return_value(); + + ctx->server->handle_aio(ctx, r); + aio_completion->release(); +} + +void Server::handle_aio(IOContext *ctx, int r) { + dout(20) << ctx << 
": r=" << r << dendl; + + if (r == -EINVAL) { + // if shrinking an image, a pagecache writeback might reference + // extents outside of the range of the new image extents + dout(5) << "masking IO out-of-bounds error" << dendl; + ctx->req->bl.clear(); + r = 0; + } + + if (r < 0) { + ctx->req->set_error(-r); + } else if ((ctx->req->get_cmd() == Request::Read) && + r != static_cast<int>(ctx->req->get_length())) { + int pad_byte_count = static_cast<int> (ctx->req->get_length()) - r; + ctx->req->bl.append_zero(pad_byte_count); + dout(20) << ctx << ": pad byte count: " << pad_byte_count << dendl; + ctx->req->set_error(0); + } else { + ctx->req->set_error(0); + } + io_finish(ctx); +} + +void Server::reader_entry() { + dout(20) << dendl; + + while (!m_stopping) { + std::unique_ptr<IOContext> ctx(new IOContext(this)); + + dout(20) << "waiting for ggate request" << dendl; + + int r = m_drv->recv(&ctx->req); + if (r < 0) { + if (r != -ECANCELED) { + derr << "recv: " << cpp_strerror(r) << dendl; + } + Mutex::Locker locker(m_lock); + m_stopping = true; + m_cond.Signal(); + return; + } + + IOContext *pctx = ctx.release(); + + dout(20) << pctx << ": start: " << *pctx << dendl; + + io_start(pctx); + librbd::RBD::AioCompletion *c = + new librbd::RBD::AioCompletion(pctx, aio_callback); + switch (pctx->req->get_cmd()) + { + case rbd::ggate::Request::Write: + m_image.aio_write(pctx->req->get_offset(), pctx->req->get_length(), + pctx->req->bl, c); + break; + case rbd::ggate::Request::Read: + m_image.aio_read(pctx->req->get_offset(), pctx->req->get_length(), + pctx->req->bl, c); + break; + case rbd::ggate::Request::Flush: + m_image.aio_flush(c); + break; + case rbd::ggate::Request::Discard: + m_image.aio_discard(pctx->req->get_offset(), pctx->req->get_length(), c); + break; + default: + derr << pctx << ": invalid request command: " << pctx->req->get_cmd() + << dendl; + c->release(); + Mutex::Locker locker(m_lock); + m_stopping = true; + m_cond.Signal(); + return; + } + } + dout(20) << 
"terminated" << dendl; +} + +void Server::writer_entry() { + dout(20) << dendl; + + while (!m_stopping) { + dout(20) << "waiting for io request" << dendl; + + std::unique_ptr<IOContext> ctx(wait_io_finish()); + if (!ctx) { + dout(20) << "no io requests, terminating" << dendl; + return; + } + + dout(20) << ctx.get() << ": got: " << *ctx << dendl; + + int r = m_drv->send(ctx->req); + if (r < 0) { + derr << ctx.get() << ": send: " << cpp_strerror(r) << dendl; + Mutex::Locker locker(m_lock); + m_stopping = true; + m_cond.Signal(); + return; + } + dout(20) << ctx.get() << " finish" << dendl; + } + dout(20) << "terminated" << dendl; +} + +std::ostream &operator<<(std::ostream &os, const Server::IOContext &ctx) { + + os << "[" << ctx.req->get_id(); + + switch (ctx.req->get_cmd()) + { + case rbd::ggate::Request::Write: + os << " Write "; + break; + case rbd::ggate::Request::Read: + os << " Read "; + break; + case rbd::ggate::Request::Flush: + os << " Flush "; + break; + case rbd::ggate::Request::Discard: + os << " Discard "; + break; + default: + os << " Unknow(" << ctx.req->get_cmd() << ") "; + break; + } + + os << ctx.req->get_offset() << "~" << ctx.req->get_length() << " " + << ctx.req->get_error() << "]"; + + return os; +} + +} // namespace ggate +} // namespace rbd + diff --git a/src/tools/rbd_ggate/Server.h b/src/tools/rbd_ggate/Server.h new file mode 100644 index 00000000..8ed4f512 --- /dev/null +++ b/src/tools/rbd_ggate/Server.h @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_GGATE_SERVER_H +#define CEPH_RBD_GGATE_SERVER_H + +#include "include/rbd/librbd.hpp" +#include "include/xlist.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Thread.h" + +namespace rbd { +namespace ggate { + +class Driver; +struct Request; + +class Server { +public: + Server(Driver *drv, librbd::Image& image); + + void run(); + +private: + struct IOContext { + 
xlist<IOContext*>::item item; + Server *server; + Request *req = nullptr; + + IOContext(Server *server) : item(this), server(server) { + } + }; + + class ThreadHelper : public Thread { + public: + typedef void (Server::*entry_func)(); + + ThreadHelper(Server *server, entry_func func) + : server(server), func(func) { + } + + protected: + virtual void* entry() { + (server->*func)(); + return nullptr; + } + + private: + Server *server; + entry_func func; + }; + + friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx); + + Driver *m_drv; + librbd::Image &m_image; + + mutable Mutex m_lock; + Cond m_cond; + bool m_stopping = false; + ThreadHelper m_reader_thread, m_writer_thread; + xlist<IOContext*> m_io_pending; + xlist<IOContext*> m_io_finished; + + static void aio_callback(librbd::completion_t cb, void *arg); + + int start(); + void stop(); + + void reader_entry(); + void writer_entry(); + + void io_start(IOContext *ctx); + void io_finish(IOContext *ctx); + + IOContext *wait_io_finish(); + void wait_clean(); + + void handle_aio(IOContext *ctx, int r); +}; + +std::ostream &operator<<(std::ostream &os, const Server::IOContext &ctx); + +} // namespace ggate +} // namespace rbd + +#endif // CEPH_RBD_GGATE_SERVER_H diff --git a/src/tools/rbd_ggate/Watcher.cc b/src/tools/rbd_ggate/Watcher.cc new file mode 100644 index 00000000..57b3f960 --- /dev/null +++ b/src/tools/rbd_ggate/Watcher.cc @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" +#include "Driver.h" +#include "Watcher.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd::ggate::Watcher: " << this \ + << " " << __func__ << ": " + +namespace rbd { +namespace ggate { + +Watcher::Watcher(Driver *drv, librados::IoCtx &ioctx, librbd::Image &image, + size_t size) + : m_drv(drv), m_ioctx(ioctx), 
m_image(image), m_size(size) { +} + +void Watcher::handle_notify() { + dout(20) << dendl; + + librbd::image_info_t info; + + if (m_image.stat(info, sizeof(info)) == 0) { + size_t new_size = info.size; + + if (new_size != m_size) { + int r = m_drv->resize(new_size); + if (r < 0) { + derr << "resize failed: " << cpp_strerror(r) << dendl; + m_drv->shut_down(); + } + r = m_image.invalidate_cache(); + if (r < 0) { + derr << "invalidate rbd cache failed: " << cpp_strerror(r) << dendl; + m_drv->shut_down(); + } + m_size = new_size; + } + } +} + +} // namespace ggate +} // namespace rbd diff --git a/src/tools/rbd_ggate/Watcher.h b/src/tools/rbd_ggate/Watcher.h new file mode 100644 index 00000000..8f524b43 --- /dev/null +++ b/src/tools/rbd_ggate/Watcher.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_GGATE_WATCHER_H +#define CEPH_RBD_GGATE_WATCHER_H + +#include "include/rbd/librbd.hpp" + +namespace rbd { +namespace ggate { + +class Driver; + +class Watcher : public librbd::UpdateWatchCtx +{ +public: + Watcher(Driver *m_drv, librados::IoCtx &ioctx, librbd::Image &image, + size_t size); + + void handle_notify() override; + +private: + Driver *m_drv; + librados::IoCtx &m_ioctx; + librbd::Image &m_image; + size_t m_size; +}; + + +} // namespace ggate +} // namespace rbd + +#endif // CEPH_RBD_GGATE_WATCHER_H + diff --git a/src/tools/rbd_ggate/debug.cc b/src/tools/rbd_ggate/debug.cc new file mode 100644 index 00000000..b675ba5b --- /dev/null +++ b/src/tools/rbd_ggate/debug.cc @@ -0,0 +1,55 @@ +#include "common/debug.h" +#include "common/errno.h" +#include "debug.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd::ggate: " + +extern "C" void debugv(int level, const char *fmt, va_list ap) { + char *msg; + int saved_errno = errno; + + if (g_ceph_context == nullptr) { + return; + } + + vasprintf(&msg, fmt, ap); 
+ + dout(ceph::dout::need_dynamic(level)) << msg << dendl; + + free(msg); + errno = saved_errno; +} + +extern "C" void debug(int level, const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + debugv(level, fmt, ap); + va_end(ap); +} + +extern "C" void errx(const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + debugv(-1, fmt, ap); + va_end(ap); +} + +extern "C" void err(const char *fmt, ...) { + va_list ap; + char *msg; + int saved_errno = errno; + + va_start(ap, fmt); + vasprintf(&msg, fmt, ap); + va_end(ap); + errno = saved_errno; + + errx("%s: %s", msg, cpp_strerror(errno).c_str()); + + free(msg); +} diff --git a/src/tools/rbd_ggate/debug.h b/src/tools/rbd_ggate/debug.h new file mode 100644 index 00000000..da9b46a3 --- /dev/null +++ b/src/tools/rbd_ggate/debug.h @@ -0,0 +1,17 @@ +#ifndef CEPH_RBD_GGATE_DEBUG_H +#define CEPH_RBD_GGATE_DEBUG_H + +#ifdef __cplusplus +extern "C" { +#endif + +void debug(int level, const char *fmt, ...) __printflike(2, 3); +void debugv(int level, const char *fmt, va_list ap) __printflike(2, 0); +void err(const char *fmt, ...) __printflike(1, 2); +void errx(const char *fmt, ...) 
__printflike(1, 2); + +#ifdef __cplusplus +} +#endif + +#endif // CEPH_RBD_GGATE_DEBUG_H diff --git a/src/tools/rbd_ggate/ggate_drv.c b/src/tools/rbd_ggate/ggate_drv.c new file mode 100644 index 00000000..b1faccd2 --- /dev/null +++ b/src/tools/rbd_ggate/ggate_drv.c @@ -0,0 +1,379 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/disk.h> +#include <sys/linker.h> +#include <sys/queue.h> +#include <sys/stat.h> + +#include <geom/gate/g_gate.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <libgeom.h> + +#include "debug.h" +#include "ggate_drv.h" + +uint64_t ggate_drv_req_id(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + return ggio->gctl_seq; +} + +int ggate_drv_req_cmd(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + switch (ggio->gctl_cmd) { + case BIO_WRITE: + return GGATE_DRV_CMD_WRITE; + case BIO_READ: + return GGATE_DRV_CMD_READ; + case BIO_FLUSH: + return GGATE_DRV_CMD_FLUSH; + case BIO_DELETE: + return GGATE_DRV_CMD_DISCARD; + default: + return GGATE_DRV_CMD_UNKNOWN; + } +} + +uint64_t ggate_drv_req_offset(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + return ggio->gctl_offset; +} + +size_t ggate_drv_req_length(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + return ggio->gctl_length; +} + +void *ggate_drv_req_buf(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + return ggio->gctl_data; +} + +int ggate_drv_req_error(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + return ggio->gctl_error; +} + +void ggate_drv_req_set_error(ggate_drv_req_t req, int error) { + struct g_gate_ctl_io *ggio = 
(struct g_gate_ctl_io *)req; + + ggio->gctl_error = error; +} + +void *ggate_drv_req_release_buf(ggate_drv_req_t req) { + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + + void *data = ggio->gctl_data; + ggio->gctl_data = NULL; + + return data; +} + +struct ggate_drv { + int fd; + int unit; +}; + +int ggate_drv_load() { + if (modfind("g_gate") != -1) { + /* Present in kernel. */ + return 0; + } + + if (kldload("geom_gate") == -1 || modfind("g_gate") == -1) { + if (errno != EEXIST) { + err("failed to load geom_gate module"); + return -errno; + } + } + return 0; +} + +int ggate_drv_create(char *name, size_t namelen, size_t sectorsize, + size_t mediasize, bool readonly, const char *info, ggate_drv_t *drv_) { + struct ggate_drv *drv; + struct g_gate_ctl_create ggiocreate; + + debug(20, "%s: name=%s, sectorsize=%zd, mediasize=%zd, readonly=%d, info=%s", + __func__, name, sectorsize, mediasize, (int)readonly, info); + + if (*name != '\0') { + if (namelen > sizeof(ggiocreate.gctl_name) - 1) { + return -ENAMETOOLONG; + } + } + + /* + * We communicate with ggate via /dev/ggctl. Open it. + */ + int fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR); + if (fd == -1) { + err("failed to open /dev/" G_GATE_CTL_NAME); + return -errno; + } + + drv = calloc(1, sizeof(*drv)); + if (drv == NULL) { + errno = -ENOMEM; + goto fail_close; + } + + /* + * Create provider. + */ + memset(&ggiocreate, 0, sizeof(ggiocreate)); + ggiocreate.gctl_version = G_GATE_VERSION; + ggiocreate.gctl_mediasize = mediasize; + ggiocreate.gctl_sectorsize = sectorsize; + ggiocreate.gctl_flags = readonly ? 
G_GATE_FLAG_READONLY : 0; + ggiocreate.gctl_maxcount = 0; + ggiocreate.gctl_timeout = 0; + if (*name != '\0') { + ggiocreate.gctl_unit = G_GATE_NAME_GIVEN; + strlcpy(ggiocreate.gctl_name, name, sizeof(ggiocreate.gctl_name)); + } else { + ggiocreate.gctl_unit = G_GATE_UNIT_AUTO; + } + strlcpy(ggiocreate.gctl_info, info, sizeof(ggiocreate.gctl_info)); + if (ioctl(fd, G_GATE_CMD_CREATE, &ggiocreate) == -1) { + err("failed to create " G_GATE_PROVIDER_NAME " device"); + goto fail; + } + + debug(20, "%s: created, unit: %d, name: %s", __func__, ggiocreate.gctl_unit, + ggiocreate.gctl_name); + + drv->fd = fd; + drv->unit = ggiocreate.gctl_unit; + *drv_ = drv; + + if (*name == '\0') { + snprintf(name, namelen, "%s%d", G_GATE_PROVIDER_NAME, drv->unit); + } + + return 0; + +fail: + free(drv); +fail_close: + close(fd); + return -errno; +} + +void ggate_drv_destroy(ggate_drv_t drv_) { + struct ggate_drv *drv = (struct ggate_drv *)drv_; + struct g_gate_ctl_destroy ggiodestroy; + + debug(20, "%s %p", __func__, drv); + + memset(&ggiodestroy, 0, sizeof(ggiodestroy)); + ggiodestroy.gctl_version = G_GATE_VERSION; + ggiodestroy.gctl_unit = drv->unit; + ggiodestroy.gctl_force = 1; + + // Remember errno. + int rerrno = errno; + + int r = ioctl(drv->fd, G_GATE_CMD_DESTROY, &ggiodestroy); + if (r == -1) { + err("failed to destroy /dev/%s%d device", G_GATE_PROVIDER_NAME, + drv->unit); + } + // Restore errno. 
+ errno = rerrno; + + free(drv); +} + +int ggate_drv_resize(ggate_drv_t drv_, size_t newsize) { + struct ggate_drv *drv = (struct ggate_drv *)drv_; + + debug(20, "%s %p: newsize=%zd", __func__, drv, newsize); + + struct g_gate_ctl_modify ggiomodify; + + memset(&ggiomodify, 0, sizeof(ggiomodify)); + ggiomodify.gctl_version = G_GATE_VERSION; + ggiomodify.gctl_unit = drv->unit; + ggiomodify.gctl_modify = GG_MODIFY_MEDIASIZE; + ggiomodify.gctl_mediasize = newsize; + + int r = ioctl(drv->fd, G_GATE_CMD_MODIFY, &ggiomodify); + if (r == -1) { + r = -errno; + err("failed to resize /dev/%s%d device", G_GATE_PROVIDER_NAME, drv->unit); + } + return r; +} + +int ggate_drv_kill(const char *devname) { + debug(20, "%s %s", __func__, devname); + + int fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR); + if (fd == -1) { + err("failed to open /dev/" G_GATE_CTL_NAME); + return -errno; + } + + struct g_gate_ctl_destroy ggiodestroy; + memset(&ggiodestroy, 0, sizeof(ggiodestroy)); + ggiodestroy.gctl_version = G_GATE_VERSION; + ggiodestroy.gctl_unit = G_GATE_NAME_GIVEN; + ggiodestroy.gctl_force = 1; + + strlcpy(ggiodestroy.gctl_name, devname, sizeof(ggiodestroy.gctl_name)); + + int r = ioctl(fd, G_GATE_CMD_DESTROY, &ggiodestroy); + if (r == -1) { + r = -errno; + err("failed to destroy %s device", devname); + } + + close(fd); + return r; +} + +int ggate_drv_recv(ggate_drv_t drv_, ggate_drv_req_t *req) { + struct ggate_drv *drv = (struct ggate_drv *)drv_; + struct g_gate_ctl_io *ggio; + int error, r; + + debug(20, "%s", __func__); + + ggio = calloc(1, sizeof(*ggio)); + if (ggio == NULL) { + return -ENOMEM; + } + + ggio->gctl_version = G_GATE_VERSION; + ggio->gctl_unit = drv->unit; + ggio->gctl_data = malloc(MAXPHYS); + ggio->gctl_length = MAXPHYS; + + debug(20, "%s: waiting for request from kernel", __func__); + if (ioctl(drv->fd, G_GATE_CMD_START, ggio) == -1) { + err("%s: G_GATE_CMD_START failed", __func__); + return -errno; + } + + debug(20, "%s: got request from kernel: " + "unit=%d, seq=%ju, 
cmd=%u, offset=%ju, length=%ju, error=%d, data=%p", + __func__, ggio->gctl_unit, (uintmax_t)ggio->gctl_seq, ggio->gctl_cmd, + (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length, + ggio->gctl_error, ggio->gctl_data); + + error = ggio->gctl_error; + switch (error) { + case 0: + break; + case ECANCELED: + debug(10, "%s: canceled: exit gracefully", __func__); + r = -error; + goto fail; + case ENOMEM: + /* + * Buffer too small? Impossible, we allocate MAXPHYS + * bytes - request can't be bigger than that. + */ + /* FALLTHROUGH */ + case ENXIO: + default: + errno = error; + err("%s: G_GATE_CMD_START failed", __func__); + r = -error; + goto fail; + } + + *req = ggio; + return 0; + +fail: + free(ggio->gctl_data); + free(ggio); + return r; +} + +int ggate_drv_send(ggate_drv_t drv_, ggate_drv_req_t req) { + struct ggate_drv *drv = (struct ggate_drv *)drv_; + struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req; + int r = 0; + + debug(20, "%s: send request to kernel: " + "unit=%d, seq=%ju, cmd=%u, offset=%ju, length=%ju, error=%d, data=%p", + __func__, ggio->gctl_unit, (uintmax_t)ggio->gctl_seq, ggio->gctl_cmd, + (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length, + ggio->gctl_error, ggio->gctl_data); + + if (ioctl(drv->fd, G_GATE_CMD_DONE, ggio) == -1) { + err("%s: G_GATE_CMD_DONE failed", __func__); + r = -errno; + } + + free(ggio->gctl_data); + free(ggio); + return r; +} + +static const char * get_conf(struct ggeom *gp, const char *name) { + struct gconfig *conf; + + LIST_FOREACH(conf, &gp->lg_config, lg_config) { + if (strcmp(conf->lg_name, name) == 0) + return (conf->lg_val); + } + return ""; +} + +int ggate_drv_list(struct ggate_drv_info *info, size_t *size) { + struct gmesh mesh; + struct gclass *class; + struct ggeom *gp; + int r; + size_t max_size; + + r = geom_gettree(&mesh); + if (r != 0) { + return -errno; + } + + max_size = *size; + *size = 0; + + LIST_FOREACH(class, &mesh.lg_class, lg_class) { + if (strcmp(class->lg_name, G_GATE_CLASS_NAME) 
// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_RBD_GGATE_GGATE_DRV_H
#define CEPH_RBD_GGATE_GGATE_DRV_H

/*
 * System headers are included before the linkage block so that they are
 * never processed inside extern "C".
 */
#include <sys/param.h>

#include <limits.h>	/* NAME_MAX */
#include <stdbool.h>
#include <stddef.h>	/* size_t */
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handle of a created ggate device. */
typedef void *ggate_drv_t;
/* Opaque handle of a single I/O request received from the kernel. */
typedef void *ggate_drv_req_t;

/*
 * GGATE driver commands. They are mapped to GgateReq::Command.
 */
enum {
  GGATE_DRV_CMD_UNKNOWN = 0,
  GGATE_DRV_CMD_WRITE = 1,
  GGATE_DRV_CMD_READ = 2,
  GGATE_DRV_CMD_FLUSH = 3,
  GGATE_DRV_CMD_DISCARD = 4,
};

/* One entry produced by ggate_drv_list(). */
struct ggate_drv_info {
  char id[16];		/* value of the geom's "unit" config entry */
  char name[NAME_MAX];	/* geom name */
  char info[2048];	/* G_GATE_INFOSIZE */
};

/* Request accessors: read fields of a request returned by ggate_drv_recv(). */
uint64_t ggate_drv_req_id(ggate_drv_req_t req);
/* Returns one of the GGATE_DRV_CMD_* values. */
int ggate_drv_req_cmd(ggate_drv_req_t req);
void *ggate_drv_req_buf(ggate_drv_req_t req);
size_t ggate_drv_req_length(ggate_drv_req_t req);
uint64_t ggate_drv_req_offset(ggate_drv_req_t req);
int ggate_drv_req_error(ggate_drv_req_t req);

void ggate_drv_req_set_error(ggate_drv_req_t req, int error);
/*
 * Detach the data buffer from the request and return it.  The request no
 * longer references the buffer afterwards, so ggate_drv_send() will not
 * free it.
 */
void *ggate_drv_req_release_buf(ggate_drv_req_t req);

/*
 * Make sure the g_gate kernel module is available, loading geom_gate if
 * needed.  Returns 0 on success, a negative errno value otherwise.
 */
int ggate_drv_load(void);

/*
 * Create a ggate provider.  An empty name requests automatic unit selection;
 * the generated device name is then written back into name (bounded by
 * namelen).  On success the handle is stored in *drv and 0 is returned; on
 * failure the contract is a negative errno value.
 */
int ggate_drv_create(char *name, size_t namelen, size_t sectorsize,
    size_t mediasize, bool readonly, const char *info, ggate_drv_t *drv);
/*
 * Force-destroy the provider and free the handle.  errno is preserved
 * across the call.
 */
void ggate_drv_destroy(ggate_drv_t drv);

/*
 * Wait for the next request from the kernel.  Returns 0 and stores the
 * request in *req, or returns a negative errno value (-ECANCELED when the
 * kernel reports the request as canceled).
 */
int ggate_drv_recv(ggate_drv_t drv, ggate_drv_req_t *req);
/*
 * Report a completed request to the kernel.  The request and the buffer
 * still attached to it are freed whether or not the ioctl succeeds.
 * Returns 0 or a negative errno value.
 */
int ggate_drv_send(ggate_drv_t drv, ggate_drv_req_t req);

/* Change the provider's media size.  Returns 0 or a negative errno value. */
int ggate_drv_resize(ggate_drv_t drv, size_t newsize);

/* Force-destroy the provider named devname.  Returns 0 or negative errno. */
int ggate_drv_kill(const char *devname);
/*
 * List existing ggate geoms.  On entry *size is the capacity of the info
 * array; on return it holds the number of geoms found.  Returns -ERANGE
 * (without filling info) when more geoms exist than the array can hold.
 */
int ggate_drv_list(struct ggate_drv_info *info, size_t *size);

#ifdef __cplusplus
}
#endif

#endif // CEPH_RBD_GGATE_GGATE_DRV_H
+#include "common/debug.h" +#include "common/errno.h" +#include "global/global_init.h" +#include "global/signal_handler.h" + +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "include/stringify.h" + +#include "Driver.h" +#include "Server.h" +#include "Watcher.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd-ggate: " << __func__ << ": " + +static void usage() { + std::cout << "Usage: rbd-ggate [options] map <image-or-snap-spec> Map an image to ggate device\n" + << " unmap <device path> Unmap ggate device\n" + << " list List mapped ggate devices\n" + << "\n" + << "Map options:\n" + << " --device <device path> Specify ggate device path\n" + << " --read-only Map readonly\n" + << " --exclusive Forbid writes by other clients\n" + << "\n" + << "List options:\n" + << " --format plain|json|xml Output format (default: plain)\n" + << " --pretty-format Pretty formatting (json and xml)\n" + << std::endl; + generic_server_usage(); +} + +static std::string devpath, poolname, nsname, imgname, snapname; +static bool readonly = false; +static bool exclusive = false; + +static std::unique_ptr<rbd::ggate::Driver> drv; + +static void handle_signal(int signum) +{ + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + + ceph_assert(signum == SIGINT || signum == SIGTERM); + ceph_assert(drv); + + drv->shut_down(); +} + +static int do_map(int argc, const char *argv[]) +{ + int r; + + librados::Rados rados; + librbd::RBD rbd; + librados::IoCtx io_ctx; + librbd::Image image; + + librbd::image_info_t info; + std::string desc; + + Preforker forker; + + vector<const char*> args; + argv_to_vec(argc, argv, args); + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + g_ceph_context->_conf.set_val_or_die("pid_file", ""); + + if (global_init_prefork(g_ceph_context) >= 0) { + std::string 
err; + r = forker.prefork(err); + if (r < 0) { + std::cerr << err << std::endl; + return r; + } + if (forker.is_parent()) { + if (forker.parent_wait(err) != 0) { + return -ENXIO; + } + return 0; + } + global_init_postfork_start(g_ceph_context); + } + + common_init_finish(g_ceph_context); + global_init_chdir(g_ceph_context); + + if (poolname.empty()) { + poolname = g_ceph_context->_conf.get_val<std::string>("rbd_default_pool"); + } + + std::string devname = boost::starts_with(devpath, "/dev/") ? + devpath.substr(5) : devpath; + std::unique_ptr<rbd::ggate::Watcher> watcher; + uint64_t handle; + + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + goto done; + } + + r = rados.connect(); + if (r < 0) { + std::cerr << "rbd-ggate: failed to connect to cluster: " << cpp_strerror(r) + << std::endl; + goto done; + } + + r = rados.ioctx_create(poolname.c_str(), io_ctx); + if (r < 0) { + std::cerr << "rbd-ggate: failed to acces pool " << poolname << ": " + << cpp_strerror(r) << std::endl; + goto done; + } + + io_ctx.set_namespace(nsname); + + r = rbd.open(io_ctx, image, imgname.c_str()); + if (r < 0) { + std::cerr << "rbd-ggate: failed to open image " << imgname << ": " + << cpp_strerror(r) << std::endl; + goto done; + } + + if (exclusive) { + r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE); + if (r < 0) { + std::cerr << "rbd-ggate: failed to acquire exclusive lock: " + << cpp_strerror(r) << std::endl; + goto done; + } + } + + desc = "RBD " + poolname + "/" + (nsname.empty() ? 
"" : nsname + "/") + + imgname; + + if (!snapname.empty()) { + r = image.snap_set(snapname.c_str()); + if (r < 0) { + std::cerr << "rbd-ggate: failed to set snapshot " << snapname << ": " + << cpp_strerror(r) << std::endl; + goto done; + } + readonly = true; + desc += "@" + snapname; + } + + r = image.stat(info, sizeof(info)); + if (r < 0) { + std::cerr << "rbd-ggate: image stat failed: " << cpp_strerror(r) + << std::endl; + goto done; + } + + rbd::ggate::Driver::load(); + drv.reset(new rbd::ggate::Driver(devname, 512, info.size, readonly, desc)); + r = drv->init(); + if (r < 0) { + r = -errno; + std::cerr << "rbd-ggate: failed to create ggate device: " << cpp_strerror(r) + << std::endl; + goto done; + } + + watcher.reset(new rbd::ggate::Watcher(drv.get(), io_ctx, image, info.size)); + r = image.update_watch(watcher.get(), &handle); + if (r < 0) { + std::cerr << "rbd-ggate: failed to set watcher: " << cpp_strerror(r) + << std::endl; + drv->shut_down(); + goto done; + } + + std::cout << "/dev/" << drv->get_devname() << std::endl; + + if (g_conf()->daemonize) { + global_init_postfork_finish(g_ceph_context); + forker.daemonize(); + } + + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + rbd::ggate::Server(drv.get(), image).run(); + + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); + + r = image.update_unwatch(handle); + ceph_assert(r == 0); + +done: + image.close(); + io_ctx.close(); + rados.shutdown(); + + if (r < 0) { + std::cerr << "rbd-ggate: failed to map: " << cpp_strerror(r) << std::endl; + } + + forker.exit(r < 0 ? 
/**
 * Split an image spec of the form [pool/[namespace/]]image[@snap].
 *
 * Only the components present in @p imgpath are written; the remaining
 * output strings keep whatever value the caller put there (callers rely on
 * this to pre-seed defaults such as "-" for the snapshot column).
 *
 * @param imgpath  spec to parse
 * @param poolname out: pool name, if given
 * @param nsname   out: namespace, if given
 * @param imgname  out: image name (always set on success)
 * @param snapname out: snapshot name, if given
 * @return 0 on success, -EINVAL (after printing a diagnostic to stderr)
 *         when the spec does not match
 */
static int parse_imgpath(const std::string &imgpath, std::string *poolname,
                         std::string *nsname, std::string *imgname,
                         std::string *snapname) {
  // Compiling a std::regex is expensive and this function is called once per
  // listed device, so build the pattern a single time.
  static const std::regex pattern(
    "^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$");

  std::smatch match;
  if (!std::regex_match(imgpath, match, pattern)) {
    std::cerr << "rbd-ggate: invalid spec '" << imgpath << "'" << std::endl;
    return -EINVAL;
  }

  if (match[1].matched) {
    *poolname = match[1];
  }

  if (match[2].matched) {
    *nsname = match[2];
  }

  *imgname = match[3];

  if (match[4].matched) {
    *snapname = match[4];
  }

  return 0;
}
+ return false; + } + + std::map<std::string, rbd::ggate::Driver::DevInfo> devs; + r = rbd::ggate::Driver::list(&devs); + if (r < 0) { + return false; + } + + for (auto &it : devs) { + auto &name = it.second.first; + auto &info = it.second.second; + if (!boost::starts_with(info, "RBD ")) { + continue; + } + + std::string p, n, i, s; + parse_imgpath(info.substr(4), &p, &n, &i, &s); + if (p == poolname && n == nsname && i == imgname && s == snapname) { + *devname = name; + return true; + } + } + + return false; +} + +static int do_list(const std::string &format, bool pretty_format) +{ + rbd::ggate::Driver::load(); + + std::map<std::string, rbd::ggate::Driver::DevInfo> devs; + int r = rbd::ggate::Driver::list(&devs); + if (r < 0) { + return -r; + } + + std::unique_ptr<ceph::Formatter> f; + TextTable tbl; + + if (format == "json") { + f.reset(new JSONFormatter(pretty_format)); + } else if (format == "xml") { + f.reset(new XMLFormatter(pretty_format)); + } else if (!format.empty() && format != "plain") { + std::cerr << "rbd-ggate: invalid output format: " << format << std::endl; + return -EINVAL; + } + + if (f) { + f->open_array_section("devices"); + } else { + tbl.define_column("id", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("image", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("device", TextTable::LEFT, TextTable::LEFT); + } + + int count = 0; + + for (auto &it : devs) { + auto &id = it.first; + auto &name = it.second.first; + auto &info = it.second.second; + if (!boost::starts_with(info, "RBD ")) { + continue; + } + + std::string poolname; + std::string nsname; + std::string imgname; + std::string snapname(f ? 
"" : "-"); + parse_imgpath(info.substr(4), &poolname, &nsname, &imgname, &snapname); + + if (f) { + f->open_object_section("device"); + f->dump_string("id", id); + f->dump_string("pool", poolname); + f->dump_string("namespace", nsname); + f->dump_string("image", imgname); + f->dump_string("snap", snapname); + f->dump_string("device", "/dev/" + name); + f->close_section(); + } else { + tbl << id << poolname << nsname << imgname << snapname << "/dev/" + name + << TextTable::endrow; + } + count++; + } + + if (f) { + f->close_section(); // devices + f->flush(std::cout); + } else if (count > 0) { + std::cout << tbl; + } + + return 0; +} + +int main(int argc, const char *argv[]) { + int r; + enum { + None, + Connect, + Disconnect, + List + } cmd = None; + + vector<const char*> args; + + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + // filter out ceph config options + ConfigProxy{false}.parse_argv(args); + + std::string format; + bool pretty_format = false; + std::vector<const char*>::iterator i; + + for (i = args.begin(); i != args.end(); ) { + if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + usage(); + return 0; + } else if (ceph_argparse_witharg(args, i, &devpath, "--device", + (char *)NULL)) { + } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) { + readonly = true; + } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) { + exclusive = true; + } else if (ceph_argparse_witharg(args, i, &format, "--format", + (char *)NULL)) { + } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) { + pretty_format = true; + } else { + ++i; + } + } + + if (args.begin() != args.end()) { + if (strcmp(*args.begin(), "map") == 0) { + cmd = Connect; + } else if (strcmp(*args.begin(), "unmap") == 0) { + cmd = Disconnect; + } else if (strcmp(*args.begin(), "list") == 0) { + 
cmd = List; + } else { + cerr << "rbd-ggate: unknown command: " << *args.begin() << std::endl; + return EXIT_FAILURE; + } + args.erase(args.begin()); + } + + if (cmd == None) { + cerr << "rbd-ggate: must specify command" << std::endl; + return EXIT_FAILURE; + } + + switch (cmd) { + case Connect: + if (args.begin() == args.end()) { + cerr << "rbd-ggate: must specify image-or-snap-spec" << std::endl; + return EXIT_FAILURE; + } + if (parse_imgpath(*args.begin(), &poolname, &nsname, &imgname, + &snapname) < 0) { + return EXIT_FAILURE; + } + args.erase(args.begin()); + break; + case Disconnect: + if (args.begin() == args.end()) { + std::cerr << "rbd-ggate: must specify ggate device or image-or-snap-spec" + << std::endl; + return EXIT_FAILURE; + } + if (boost::starts_with(*args.begin(), "/dev/") || + !find_mapped_dev_by_spec(*args.begin(), &devpath)) { + devpath = *args.begin(); + } + args.erase(args.begin()); + break; + default: + break; + } + + if (args.begin() != args.end()) { + cerr << "rbd-ggate: unknown args: " << *args.begin() << std::endl; + return EXIT_FAILURE; + } + + switch (cmd) { + case Connect: + if (imgname.empty()) { + cerr << "rbd-ggate: image name was not specified" << std::endl; + return EXIT_FAILURE; + } + + r = do_map(argc, argv); + if (r < 0) + return EXIT_FAILURE; + break; + case Disconnect: + r = do_unmap(); + if (r < 0) + return EXIT_FAILURE; + break; + case List: + r = do_list(format, pretty_format); + if (r < 0) + return EXIT_FAILURE; + break; + default: + usage(); + return EXIT_FAILURE; + } + + return 0; +} diff --git a/src/tools/rbd_mirror/BaseRequest.h b/src/tools/rbd_mirror/BaseRequest.h new file mode 100644 index 00000000..5053eb83 --- /dev/null +++ b/src/tools/rbd_mirror/BaseRequest.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_BASE_REQUEST_H +#define CEPH_RBD_MIRROR_BASE_REQUEST_H + +#include "common/RefCountedObj.h" +#include 
"include/Context.h" + +namespace rbd { +namespace mirror { + +class BaseRequest : public RefCountedObject { +public: + BaseRequest(const std::string& name, CephContext *cct, Context *on_finish) + : RefCountedObject(cct, 1), m_name(name), m_cct(cct), + m_on_finish(on_finish) { + } + + virtual void send() = 0; + virtual void cancel() {} + +protected: + virtual void finish(int r) { + if (m_cct) { + lsubdout(m_cct, rbd_mirror, 20) << m_name << "::finish: r=" << r << dendl; + } + if (m_on_finish) { + m_on_finish->complete(r); + } + put(); + } + +private: + const std::string m_name; + CephContext *m_cct; + Context *m_on_finish; +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_BASE_REQUEST_H diff --git a/src/tools/rbd_mirror/CMakeLists.txt b/src/tools/rbd_mirror/CMakeLists.txt new file mode 100644 index 00000000..30106a86 --- /dev/null +++ b/src/tools/rbd_mirror/CMakeLists.txt @@ -0,0 +1,69 @@ +add_library(rbd_mirror_types STATIC + image_map/Types.cc + instance_watcher/Types.cc + leader_watcher/Types.cc) + +set(rbd_mirror_internal + ClusterWatcher.cc + ImageDeleter.cc + ImageMap.cc + ImageReplayer.cc + ImageSync.cc + ImageSyncThrottler.cc + InstanceReplayer.cc + InstanceWatcher.cc + Instances.cc + LeaderWatcher.cc + Mirror.cc + MirrorStatusWatcher.cc + PoolReplayer.cc + PoolWatcher.cc + ServiceDaemon.cc + Threads.cc + Types.cc + image_deleter/SnapshotPurgeRequest.cc + image_deleter/TrashMoveRequest.cc + image_deleter/TrashRemoveRequest.cc + image_deleter/TrashWatcher.cc + image_map/LoadRequest.cc + image_map/Policy.cc + image_map/SimplePolicy.cc + image_map/StateTransition.cc + image_map/UpdateRequest.cc + image_replayer/BootstrapRequest.cc + image_replayer/CloseImageRequest.cc + image_replayer/CreateImageRequest.cc + image_replayer/EventPreprocessor.cc + image_replayer/GetMirrorImageIdRequest.cc + image_replayer/IsPrimaryRequest.cc + image_replayer/OpenImageRequest.cc + image_replayer/OpenLocalImageRequest.cc + 
image_replayer/PrepareLocalImageRequest.cc + image_replayer/PrepareRemoteImageRequest.cc + image_replayer/ReplayStatusFormatter.cc + image_replayer/Utils.cc + image_sync/SyncPointCreateRequest.cc + image_sync/SyncPointPruneRequest.cc + pool_watcher/RefreshImagesRequest.cc + service_daemon/Types.cc) + +add_library(rbd_mirror_internal STATIC + ${rbd_mirror_internal}) + +add_executable(rbd-mirror + main.cc) +target_link_libraries(rbd-mirror + rbd_mirror_internal + rbd_mirror_types + rbd_api + rbd_internal + rbd_types + journal + librados + osdc + cls_rbd_client + cls_lock_client + cls_journal_client + global + ${ALLOC_LIBS}) +install(TARGETS rbd-mirror DESTINATION bin) diff --git a/src/tools/rbd_mirror/ClusterWatcher.cc b/src/tools/rbd_mirror/ClusterWatcher.cc new file mode 100644 index 00000000..54329de6 --- /dev/null +++ b/src/tools/rbd_mirror/ClusterWatcher.cc @@ -0,0 +1,223 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ClusterWatcher.h" +#include "include/stringify.h" +#include "common/ceph_json.h" +#include "common/debug.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/internal.h" +#include "librbd/api/Mirror.h" +#include "tools/rbd_mirror/ServiceDaemon.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ClusterWatcher:" << this << " " \ + << __func__ << ": " + +using std::list; +using std::map; +using std::set; +using std::string; +using std::unique_ptr; +using std::vector; + +using librados::Rados; +using librados::IoCtx; + +namespace rbd { +namespace mirror { + +ClusterWatcher::ClusterWatcher(RadosRef cluster, Mutex &lock, + ServiceDaemon<librbd::ImageCtx>* service_daemon) + : m_cluster(cluster), m_lock(lock), m_service_daemon(service_daemon) +{ +} + +const ClusterWatcher::PoolPeers& ClusterWatcher::get_pool_peers() const +{ + ceph_assert(m_lock.is_locked()); 
+ return m_pool_peers; +} + +void ClusterWatcher::refresh_pools() +{ + dout(20) << "enter" << dendl; + + PoolPeers pool_peers; + read_pool_peers(&pool_peers); + + Mutex::Locker l(m_lock); + m_pool_peers = pool_peers; + // TODO: perhaps use a workqueue instead, once we get notifications + // about config changes for existing pools +} + +void ClusterWatcher::read_pool_peers(PoolPeers *pool_peers) +{ + int r = m_cluster->wait_for_latest_osdmap(); + if (r < 0) { + derr << "error waiting for OSD map: " << cpp_strerror(r) << dendl; + return; + } + + list<pair<int64_t, string> > pools; + r = m_cluster->pool_list2(pools); + if (r < 0) { + derr << "error listing pools: " << cpp_strerror(r) << dendl; + return; + } + + std::set<int64_t> service_pool_ids; + for (auto& kv : pools) { + int64_t pool_id = kv.first; + auto& pool_name = kv.second; + int64_t base_tier; + r = m_cluster->pool_get_base_tier(pool_id, &base_tier); + if (r == -ENOENT) { + dout(10) << "pool " << pool_name << " no longer exists" << dendl; + continue; + } else if (r < 0) { + derr << "Error retrieving base tier for pool " << pool_name << dendl; + continue; + } + if (pool_id != base_tier) { + // pool is a cache; skip it + continue; + } + + IoCtx ioctx; + r = m_cluster->ioctx_create2(pool_id, ioctx); + if (r == -ENOENT) { + dout(10) << "pool " << pool_id << " no longer exists" << dendl; + continue; + } else if (r < 0) { + derr << "Error accessing pool " << pool_name << cpp_strerror(r) << dendl; + continue; + } + + cls::rbd::MirrorMode mirror_mode_internal; + r = librbd::cls_client::mirror_mode_get(&ioctx, &mirror_mode_internal); + if (r == 0 && mirror_mode_internal == cls::rbd::MIRROR_MODE_DISABLED) { + dout(10) << "mirroring is disabled for pool " << pool_name << dendl; + continue; + } + + service_pool_ids.insert(pool_id); + if (m_service_pools.find(pool_id) == m_service_pools.end()) { + m_service_pools[pool_id] = {}; + m_service_daemon->add_pool(pool_id, pool_name); + } + + if (r == -EPERM) { + dout(10) << 
"access denied querying pool " << pool_name << dendl; + m_service_pools[pool_id] = m_service_daemon->add_or_update_callout( + pool_id, m_service_pools[pool_id], + service_daemon::CALLOUT_LEVEL_WARNING, "access denied"); + continue; + } else if (r < 0) { + derr << "could not tell whether mirroring was enabled for " << pool_name + << " : " << cpp_strerror(r) << dendl; + m_service_pools[pool_id] = m_service_daemon->add_or_update_callout( + pool_id, m_service_pools[pool_id], + service_daemon::CALLOUT_LEVEL_WARNING, "mirroring mode query failed"); + continue; + } + + vector<librbd::mirror_peer_t> configs; + r = librbd::api::Mirror<>::peer_list(ioctx, &configs); + if (r < 0) { + derr << "error reading mirroring config for pool " << pool_name + << cpp_strerror(r) << dendl; + m_service_pools[pool_id] = m_service_daemon->add_or_update_callout( + pool_id, m_service_pools[pool_id], + service_daemon::CALLOUT_LEVEL_ERROR, "mirroring peer list failed"); + continue; + } + + std::vector<PeerSpec> peers{configs.begin(), configs.end()}; + for (auto& peer : peers) { + r = resolve_peer_config_keys(pool_id, pool_name, &peer); + if (r < 0) { + break; + } + } + + if (m_service_pools[pool_id] != service_daemon::CALLOUT_ID_NONE) { + m_service_daemon->remove_callout(pool_id, m_service_pools[pool_id]); + m_service_pools[pool_id] = service_daemon::CALLOUT_ID_NONE; + } + + pool_peers->emplace(pool_id, Peers{peers.begin(), peers.end()}); + } + + for (auto it = m_service_pools.begin(); it != m_service_pools.end(); ) { + auto current_it(it++); + if (service_pool_ids.find(current_it->first) == service_pool_ids.end()) { + m_service_daemon->remove_pool(current_it->first); + m_service_pools.erase(current_it->first); + } + } +} + +int ClusterWatcher::resolve_peer_config_keys(int64_t pool_id, + const std::string& pool_name, + PeerSpec* peer) { + dout(10) << "retrieving config-key: pool_id=" << pool_id << ", " + << "pool_name=" << pool_name << ", " + << "peer_uuid=" << peer->uuid << dendl; + + 
std::string cmd = + "{" + "\"prefix\": \"config-key get\", " + "\"key\": \"" RBD_MIRROR_PEER_CONFIG_KEY_PREFIX + stringify(pool_id) + + "/" + peer->uuid + "\"" + "}"; + + bufferlist in_bl; + bufferlist out_bl; + int r = m_cluster->mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -ENOENT || out_bl.length() == 0) { + return 0; + } else if (r < 0) { + derr << "error reading mirroring peer config for pool " << pool_name << ": " + << cpp_strerror(r) << dendl; + m_service_pools[pool_id] = m_service_daemon->add_or_update_callout( + pool_id, m_service_pools[pool_id], + service_daemon::CALLOUT_LEVEL_WARNING, + "mirroring peer config-key query failed"); + return r; + } + + bool json_valid = false; + json_spirit::mValue json_root; + if(json_spirit::read(out_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_obj(); + if (json_obj.count("mon_host")) { + peer->mon_host = json_obj["mon_host"].get_str(); + } + if (json_obj.count("key")) { + peer->key = json_obj["key"].get_str(); + } + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + derr << "error parsing mirroring peer config for pool " << pool_name << ", " + << "peer " << peer->uuid << dendl; + m_service_pools[pool_id] = m_service_daemon->add_or_update_callout( + pool_id, m_service_pools[pool_id], + service_daemon::CALLOUT_LEVEL_WARNING, + "mirroring peer config-key decode failed"); + } + + return 0; +} + +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/ClusterWatcher.h b/src/tools/rbd_mirror/ClusterWatcher.h new file mode 100644 index 00000000..e8430b47 --- /dev/null +++ b/src/tools/rbd_mirror/ClusterWatcher.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_CLUSTER_WATCHER_H +#define CEPH_RBD_MIRROR_CLUSTER_WATCHER_H + +#include <map> +#include <memory> +#include <set> + +#include "common/ceph_context.h" +#include "common/Mutex.h" +#include 
"common/Timer.h" +#include "include/rados/librados.hpp" +#include "tools/rbd_mirror/Types.h" +#include "tools/rbd_mirror/service_daemon/Types.h" +#include <unordered_map> + +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> class ServiceDaemon; + +/** + * Tracks mirroring configuration for pools in a single + * cluster. + */ +class ClusterWatcher { +public: + struct PeerSpecCompare { + bool operator()(const PeerSpec& lhs, const PeerSpec& rhs) const { + return (lhs.uuid < rhs.uuid); + } + }; + typedef std::set<PeerSpec, PeerSpecCompare> Peers; + typedef std::map<int64_t, Peers> PoolPeers; + + ClusterWatcher(RadosRef cluster, Mutex &lock, + ServiceDaemon<librbd::ImageCtx>* service_daemon); + ~ClusterWatcher() = default; + ClusterWatcher(const ClusterWatcher&) = delete; + ClusterWatcher& operator=(const ClusterWatcher&) = delete; + + // Caller controls frequency of calls + void refresh_pools(); + const PoolPeers& get_pool_peers() const; + +private: + typedef std::unordered_map<int64_t, service_daemon::CalloutId> ServicePools; + + RadosRef m_cluster; + Mutex &m_lock; + ServiceDaemon<librbd::ImageCtx>* m_service_daemon; + + ServicePools m_service_pools; + PoolPeers m_pool_peers; + + void read_pool_peers(PoolPeers *pool_peers); + + int resolve_peer_config_keys(int64_t pool_id, const std::string& pool_name, + PeerSpec* peer); +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_CLUSTER_WATCHER_H diff --git a/src/tools/rbd_mirror/ImageDeleter.cc b/src/tools/rbd_mirror/ImageDeleter.cc new file mode 100644 index 00000000..f4d928ca --- /dev/null +++ b/src/tools/rbd_mirror/ImageDeleter.cc @@ -0,0 +1,549 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser 
General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/rados/librados.hpp" +#include "common/Formatter.h" +#include "common/admin_socket.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "global/global_context.h" +#include "librbd/internal.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Operations.h" +#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/Utils.h" +#include "ImageDeleter.h" +#include "tools/rbd_mirror/Threads.h" +#include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h" +#include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h" +#include "tools/rbd_mirror/image_deleter/TrashWatcher.h" +#include <map> +#include <sstream> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror + +using std::string; +using std::stringstream; +using std::vector; +using std::pair; +using std::make_pair; + +using librados::IoCtx; +using namespace librbd; + +namespace rbd { +namespace mirror { + +namespace { + +class ImageDeleterAdminSocketCommand { +public: + virtual ~ImageDeleterAdminSocketCommand() {} + virtual bool call(Formatter *f, stringstream *ss) = 0; +}; + +template <typename I> +class StatusCommand : public ImageDeleterAdminSocketCommand { +public: + explicit StatusCommand(ImageDeleter<I> *image_del) : image_del(image_del) {} + + bool call(Formatter *f, stringstream *ss) override { + image_del->print_status(f, ss); + return true; + } + +private: + ImageDeleter<I> *image_del; +}; + +} // anonymous namespace + +template <typename I> +class ImageDeleterAdminSocketHook : public AdminSocketHook { +public: + ImageDeleterAdminSocketHook(CephContext *cct, const std::string& pool_name, + ImageDeleter<I> *image_del) : + admin_socket(cct->get_admin_socket()) { + + std::string command; 
+ int r; + + command = "rbd mirror deletion status " + pool_name; + r = admin_socket->register_command(command, command, this, + "get status for image deleter"); + if (r == 0) { + commands[command] = new StatusCommand<I>(image_del); + } + + } + + ~ImageDeleterAdminSocketHook() override { + for (Commands::const_iterator i = commands.begin(); i != commands.end(); + ++i) { + (void)admin_socket->unregister_command(i->first); + delete i->second; + } + } + + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + Commands::const_iterator i = commands.find(command); + ceph_assert(i != commands.end()); + Formatter *f = Formatter::create(format); + stringstream ss; + bool r = i->second->call(f, &ss); + delete f; + out.append(ss); + return r; + } + +private: + typedef std::map<std::string, ImageDeleterAdminSocketCommand*, + std::less<>> Commands; + AdminSocket *admin_socket; + Commands commands; +}; + +template <typename I> +ImageDeleter<I>::ImageDeleter(librados::IoCtx& local_io_ctx, + Threads<librbd::ImageCtx>* threads, + ServiceDaemon<librbd::ImageCtx>* service_daemon) + : m_local_io_ctx(local_io_ctx), m_threads(threads), + m_service_daemon(service_daemon), m_trash_listener(this), + m_lock(librbd::util::unique_lock_name("rbd::mirror::ImageDeleter::m_lock", + this)) { +} + +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ImageDeleter: " << " " \ + << __func__ << ": " + +template <typename I> +void ImageDeleter<I>::trash_move(librados::IoCtx& local_io_ctx, + const std::string& global_image_id, + bool resync, + ContextWQ* work_queue, Context* on_finish) { + dout(10) << "global_image_id=" << global_image_id << ", " + << "resync=" << resync << dendl; + + auto req = rbd::mirror::image_deleter::TrashMoveRequest<>::create( + local_io_ctx, global_image_id, resync, work_queue, on_finish); + req->send(); +} + +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ImageDeleter: " << this << " " \ + 
<< __func__ << ": " + +template <typename I> +void ImageDeleter<I>::init(Context* on_finish) { + dout(10) << dendl; + + m_asok_hook = new ImageDeleterAdminSocketHook<I>( + g_ceph_context, m_local_io_ctx.get_pool_name(), this); + + m_trash_watcher = image_deleter::TrashWatcher<I>::create(m_local_io_ctx, + m_threads, + m_trash_listener); + m_trash_watcher->init(on_finish); +} + +template <typename I> +void ImageDeleter<I>::shut_down(Context* on_finish) { + dout(10) << dendl; + + delete m_asok_hook; + m_asok_hook = nullptr; + + shut_down_trash_watcher(on_finish); +} + +template <typename I> +void ImageDeleter<I>::shut_down_trash_watcher(Context* on_finish) { + dout(10) << dendl; + ceph_assert(m_trash_watcher); + auto ctx = new FunctionContext([this, on_finish](int r) { + delete m_trash_watcher; + m_trash_watcher = nullptr; + + wait_for_ops(on_finish); + }); + m_trash_watcher->shut_down(ctx); +} + +template <typename I> +void ImageDeleter<I>::wait_for_ops(Context* on_finish) { + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + m_running = false; + cancel_retry_timer(); + } + + auto ctx = new FunctionContext([this, on_finish](int) { + cancel_all_deletions(on_finish); + }); + m_async_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void ImageDeleter<I>::cancel_all_deletions(Context* on_finish) { + { + Mutex::Locker locker(m_lock); + // wake up any external state machines waiting on deletions + ceph_assert(m_in_flight_delete_queue.empty()); + for (auto& queue : {&m_delete_queue, &m_retry_delete_queue}) { + for (auto& info : *queue) { + notify_on_delete(info->image_id, -ECANCELED); + } + queue->clear(); + } + } + on_finish->complete(0); +} + +template <typename I> +void ImageDeleter<I>::wait_for_deletion(const std::string& image_id, + bool scheduled_only, + Context* on_finish) { + dout(5) << "image_id=" << image_id << dendl; + + on_finish = new FunctionContext([this, on_finish](int r) { + 
m_threads->work_queue->queue(on_finish, r); + }); + + Mutex::Locker locker(m_lock); + auto del_info = find_delete_info(image_id); + if (!del_info && scheduled_only) { + // image not scheduled for deletion + on_finish->complete(0); + return; + } + + notify_on_delete(image_id, -ESTALE); + m_on_delete_contexts[image_id] = on_finish; +} + +template <typename I> +void ImageDeleter<I>::complete_active_delete(DeleteInfoRef* delete_info, + int r) { + dout(20) << "info=" << *delete_info << ", r=" << r << dendl; + Mutex::Locker locker(m_lock); + notify_on_delete((*delete_info)->image_id, r); + delete_info->reset(); +} + +template <typename I> +void ImageDeleter<I>::enqueue_failed_delete(DeleteInfoRef* delete_info, + int error_code, + double retry_delay) { + dout(20) << "info=" << *delete_info << ", r=" << error_code << dendl; + if (error_code == -EBLACKLISTED) { + // do not take m_lock here: complete_active_delete() acquires it itself, + // so holding it across the call would lock it twice on this thread + derr << "blacklisted while deleting local image" << dendl; + complete_active_delete(delete_info, error_code); + return; + } + + // record the failure and park the request on the retry queue + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + auto& delete_info_ref = *delete_info; + notify_on_delete(delete_info_ref->image_id, error_code); + delete_info_ref->error_code = error_code; + ++delete_info_ref->retries; + delete_info_ref->retry_time = ceph_clock_now(); + delete_info_ref->retry_time += retry_delay; + m_retry_delete_queue.push_back(delete_info_ref); + + schedule_retry_timer(); +} + +template <typename I> +typename ImageDeleter<I>::DeleteInfoRef +ImageDeleter<I>::find_delete_info(const std::string &image_id) { + ceph_assert(m_lock.is_locked()); + + // search the queues in place instead of copying all three deques + DeleteInfo delete_info{image_id}; + for (auto* queue : {&m_in_flight_delete_queue, + &m_retry_delete_queue, + &m_delete_queue}) { + auto it = std::find_if(queue->begin(), queue->end(), + [&delete_info](const DeleteInfoRef& ref) { + return delete_info == *ref; + }); + if (it != queue->end()) { + return *it; + } + } + return 
{}; +} + +template <typename I> +void ImageDeleter<I>::print_status(Formatter *f, stringstream *ss) { + dout(20) << dendl; + + if (f) { + f->open_object_section("image_deleter_status"); + f->open_array_section("delete_images_queue"); + } + + Mutex::Locker l(m_lock); + for (const auto& image : m_delete_queue) { + image->print_status(f, ss); + } + + if (f) { + f->close_section(); + f->open_array_section("failed_deletes_queue"); + } + + for (const auto& image : m_retry_delete_queue) { + image->print_status(f, ss, true); + } + + if (f) { + f->close_section(); + f->close_section(); + f->flush(*ss); + } +} + +template <typename I> +vector<string> ImageDeleter<I>::get_delete_queue_items() { + vector<string> items; + + Mutex::Locker l(m_lock); + for (const auto& del_info : m_delete_queue) { + items.push_back(del_info->image_id); + } + + return items; +} + +template <typename I> +vector<pair<string, int> > ImageDeleter<I>::get_failed_queue_items() { + vector<pair<string, int> > items; + + Mutex::Locker l(m_lock); + for (const auto& del_info : m_retry_delete_queue) { + items.push_back(make_pair(del_info->image_id, + del_info->error_code)); + } + + return items; +} + +template <typename I> +void ImageDeleter<I>::remove_images() { + dout(10) << dendl; + + auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct()); + uint64_t max_concurrent_deletions = cct->_conf.get_val<uint64_t>( + "rbd_mirror_concurrent_image_deletions"); + + Mutex::Locker locker(m_lock); + while (true) { + if (!m_running || m_delete_queue.empty() || + m_in_flight_delete_queue.size() >= max_concurrent_deletions) { + return; + } + + DeleteInfoRef delete_info = m_delete_queue.front(); + m_delete_queue.pop_front(); + + ceph_assert(delete_info); + remove_image(delete_info); + } +} + +template <typename I> +void ImageDeleter<I>::remove_image(DeleteInfoRef delete_info) { + dout(10) << "info=" << *delete_info << dendl; + ceph_assert(m_lock.is_locked()); + + m_in_flight_delete_queue.push_back(delete_info); + 
m_async_op_tracker.start_op(); + + auto ctx = new FunctionContext([this, delete_info](int r) { + handle_remove_image(delete_info, r); + m_async_op_tracker.finish_op(); + }); + + auto req = image_deleter::TrashRemoveRequest<I>::create( + m_local_io_ctx, delete_info->image_id, &delete_info->error_result, + m_threads->work_queue, ctx); + req->send(); +} + +template <typename I> +void ImageDeleter<I>::handle_remove_image(DeleteInfoRef delete_info, + int r) { + dout(10) << "info=" << *delete_info << ", r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_lock.is_locked()); + auto it = std::find(m_in_flight_delete_queue.begin(), + m_in_flight_delete_queue.end(), delete_info); + ceph_assert(it != m_in_flight_delete_queue.end()); + m_in_flight_delete_queue.erase(it); + } + + if (r < 0) { + if (delete_info->error_result == image_deleter::ERROR_RESULT_COMPLETE) { + complete_active_delete(&delete_info, r); + } else if (delete_info->error_result == + image_deleter::ERROR_RESULT_RETRY_IMMEDIATELY) { + enqueue_failed_delete(&delete_info, r, m_busy_interval); + } else { + auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct()); + double failed_interval = cct->_conf.get_val<double>( + "rbd_mirror_delete_retry_interval"); + enqueue_failed_delete(&delete_info, r, failed_interval); + } + } else { + complete_active_delete(&delete_info, 0); + } + + // process the next queued image to delete + remove_images(); +} + +template <typename I> +void ImageDeleter<I>::schedule_retry_timer() { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + if (!m_running || m_timer_ctx != nullptr || m_retry_delete_queue.empty()) { + return; + } + + dout(10) << dendl; + auto &delete_info = m_retry_delete_queue.front(); + m_timer_ctx = new FunctionContext([this](int r) { + handle_retry_timer(); + }); + m_threads->timer->add_event_at(delete_info->retry_time, m_timer_ctx); +} + +template <typename I> +void ImageDeleter<I>::cancel_retry_timer() { + 
dout(10) << dendl; + ceph_assert(m_threads->timer_lock.is_locked()); + if (m_timer_ctx != nullptr) { + bool canceled = m_threads->timer->cancel_event(m_timer_ctx); + m_timer_ctx = nullptr; + ceph_assert(canceled); + } +} + +template <typename I> +void ImageDeleter<I>::handle_retry_timer() { + dout(10) << dendl; + ceph_assert(m_threads->timer_lock.is_locked()); + Mutex::Locker locker(m_lock); + + ceph_assert(m_timer_ctx != nullptr); + m_timer_ctx = nullptr; + + ceph_assert(m_running); + ceph_assert(!m_retry_delete_queue.empty()); + + // move all ready-to-retry items back to main queue + utime_t now = ceph_clock_now(); + while (!m_retry_delete_queue.empty()) { + auto &delete_info = m_retry_delete_queue.front(); + if (delete_info->retry_time > now) { + break; + } + + m_delete_queue.push_back(delete_info); + m_retry_delete_queue.pop_front(); + } + + // schedule wake up for any future retries + schedule_retry_timer(); + + // start (concurrent) removal of images + m_async_op_tracker.start_op(); + auto ctx = new FunctionContext([this](int r) { + remove_images(); + m_async_op_tracker.finish_op(); + }); + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void ImageDeleter<I>::handle_trash_image(const std::string& image_id, + const utime_t& deferment_end_time) { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + auto del_info = find_delete_info(image_id); + if (del_info != nullptr) { + dout(20) << "image " << image_id << " " + << "was already scheduled for deletion" << dendl; + return; + } + + dout(10) << "image_id=" << image_id << ", " + << "deferment_end_time=" << deferment_end_time << dendl; + + del_info.reset(new DeleteInfo(image_id)); + del_info->retry_time = deferment_end_time; + m_retry_delete_queue.push_back(del_info); + + schedule_retry_timer(); +} + +template <typename I> +void ImageDeleter<I>::notify_on_delete(const std::string& image_id, + int r) { + dout(10) << "image_id=" << image_id << ", r=" << r << 
dendl; + auto it = m_on_delete_contexts.find(image_id); + if (it == m_on_delete_contexts.end()) { + return; + } + + it->second->complete(r); + m_on_delete_contexts.erase(it); +} + +template <typename I> +void ImageDeleter<I>::DeleteInfo::print_status(Formatter *f, stringstream *ss, + bool print_failure_info) { + if (f) { + f->open_object_section("delete_info"); + f->dump_string("image_id", image_id); + if (print_failure_info) { + f->dump_string("error_code", cpp_strerror(error_code)); + f->dump_int("retries", retries); + } + f->close_section(); + f->flush(*ss); + } else { + *ss << *this; + } +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::ImageDeleter<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/ImageDeleter.h b/src/tools/rbd_mirror/ImageDeleter.h new file mode 100644 index 00000000..8a17eb38 --- /dev/null +++ b/src/tools/rbd_mirror/ImageDeleter.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_H +#define CEPH_RBD_MIRROR_IMAGE_DELETER_H + +#include "include/utime.h" +#include "common/AsyncOpTracker.h" +#include "common/Mutex.h" +#include "tools/rbd_mirror/Types.h" +#include "tools/rbd_mirror/image_deleter/Types.h" +#include <atomic> +#include <deque> +#include <iosfwd> +#include <map> +#include <memory> +#include <vector> + +class AdminSocketHook; +class Context; +class ContextWQ; +class SafeTimer; +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> class ServiceDaemon; +template <typename> class Threads; + +namespace image_deleter { template <typename> struct TrashWatcher; } + +/** + * Manage deletion of non-primary images. + */ +template <typename ImageCtxT = librbd::ImageCtx> +class ImageDeleter { +public: + static ImageDeleter* create(librados::IoCtx& local_io_ctx, + Threads<librbd::ImageCtx>* threads, + ServiceDaemon<librbd::ImageCtx>* service_daemon) { + return new ImageDeleter(local_io_ctx, threads, service_daemon); + } + + ImageDeleter(librados::IoCtx& local_io_ctx, + Threads<librbd::ImageCtx>* threads, + ServiceDaemon<librbd::ImageCtx>* service_daemon); + + ImageDeleter(const ImageDeleter&) = delete; + ImageDeleter& operator=(const ImageDeleter&) = delete; + + static void trash_move(librados::IoCtx& local_io_ctx, + const std::string& global_image_id, bool resync, + ContextWQ* work_queue, Context* on_finish); + + void init(Context* on_finish); + void shut_down(Context* on_finish); + + void print_status(Formatter *f, std::stringstream *ss); + + // for testing purposes + void wait_for_deletion(const std::string &image_id, + bool scheduled_only, Context* on_finish); + + std::vector<std::string> get_delete_queue_items(); + std::vector<std::pair<std::string, int> > get_failed_queue_items(); + + inline void set_busy_timer_interval(double interval) { + m_busy_interval = interval; + } + +private: + struct TrashListener : public image_deleter::TrashListener { 
+ ImageDeleter *image_deleter; + + TrashListener(ImageDeleter *image_deleter) : image_deleter(image_deleter) { + } + + void handle_trash_image(const std::string& image_id, + const utime_t& deferment_end_time) override { + image_deleter->handle_trash_image(image_id, deferment_end_time); + } + }; + + struct DeleteInfo { + std::string image_id; + + image_deleter::ErrorResult error_result = {}; + int error_code = 0; + utime_t retry_time = {}; + int retries = 0; + + DeleteInfo(const std::string& image_id) + : image_id(image_id) { + } + + inline bool operator==(const DeleteInfo& delete_info) const { + return (image_id == delete_info.image_id); + } + + friend std::ostream& operator<<(std::ostream& os, DeleteInfo& delete_info) { + os << "[image_id=" << delete_info.image_id << "]"; + return os; + } + + void print_status(Formatter *f, std::stringstream *ss, + bool print_failure_info=false); + }; + typedef std::shared_ptr<DeleteInfo> DeleteInfoRef; + typedef std::deque<DeleteInfoRef> DeleteQueue; + typedef std::map<std::string, Context*> OnDeleteContexts; + + librados::IoCtx& m_local_io_ctx; + Threads<librbd::ImageCtx>* m_threads; + ServiceDaemon<librbd::ImageCtx>* m_service_daemon; + + image_deleter::TrashWatcher<ImageCtxT>* m_trash_watcher = nullptr; + TrashListener m_trash_listener; + + std::atomic<unsigned> m_running { 1 }; + + double m_busy_interval = 1; + + AsyncOpTracker m_async_op_tracker; + + Mutex m_lock; + DeleteQueue m_delete_queue; + DeleteQueue m_retry_delete_queue; + DeleteQueue m_in_flight_delete_queue; + + OnDeleteContexts m_on_delete_contexts; + + AdminSocketHook *m_asok_hook = nullptr; + + Context *m_timer_ctx = nullptr; + + bool process_image_delete(); + + void complete_active_delete(DeleteInfoRef* delete_info, int r); + void enqueue_failed_delete(DeleteInfoRef* delete_info, int error_code, + double retry_delay); + + DeleteInfoRef find_delete_info(const std::string &image_id); + + void remove_images(); + void remove_image(DeleteInfoRef delete_info); + void 
handle_remove_image(DeleteInfoRef delete_info, int r); + + void schedule_retry_timer(); + void cancel_retry_timer(); + void handle_retry_timer(); + + void handle_trash_image(const std::string& image_id, + const utime_t& deferment_end_time); + + void shut_down_trash_watcher(Context* on_finish); + void wait_for_ops(Context* on_finish); + void cancel_all_deletions(Context* on_finish); + + void notify_on_delete(const std::string& image_id, int r); + +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::ImageDeleter<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_H diff --git a/src/tools/rbd_mirror/ImageMap.cc b/src/tools/rbd_mirror/ImageMap.cc new file mode 100644 index 00000000..58fa5e03 --- /dev/null +++ b/src/tools/rbd_mirror/ImageMap.cc @@ -0,0 +1,601 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" + +#include "librbd/Utils.h" +#include "tools/rbd_mirror/Threads.h" + +#include "ImageMap.h" +#include "image_map/LoadRequest.h" +#include "image_map/SimplePolicy.h" +#include "image_map/UpdateRequest.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ImageMap: " << this << " " \ + << __func__ << ": " + +namespace rbd { +namespace mirror { + +using ::operator<<; +using image_map::Policy; + +using librbd::util::unique_lock_name; +using librbd::util::create_async_context_callback; + +template <typename I> +struct ImageMap<I>::C_NotifyInstance : public Context { + ImageMap* image_map; + std::string global_image_id; + bool acquire_release; + + C_NotifyInstance(ImageMap* image_map, const std::string& global_image_id, + bool acquire_release) + : image_map(image_map), global_image_id(global_image_id), + acquire_release(acquire_release) { + image_map->start_async_op(); 
+ } + + void finish(int r) override { + if (acquire_release) { + image_map->handle_peer_ack(global_image_id, r); + } else { + image_map->handle_peer_ack_remove(global_image_id, r); + } + image_map->finish_async_op(); + } +}; + +template <typename I> +ImageMap<I>::ImageMap(librados::IoCtx &ioctx, Threads<I> *threads, + const std::string& instance_id, + image_map::Listener &listener) + : m_ioctx(ioctx), m_threads(threads), m_instance_id(instance_id), + m_listener(listener), + m_lock(unique_lock_name("rbd::mirror::ImageMap::m_lock", this)) { +} + +template <typename I> +ImageMap<I>::~ImageMap() { + ceph_assert(m_async_op_tracker.empty()); + ceph_assert(m_timer_task == nullptr); + ceph_assert(m_rebalance_task == nullptr); +} + +template <typename I> +void ImageMap<I>::continue_action(const std::set<std::string> &global_image_ids, + int r) { + dout(20) << dendl; + + { + Mutex::Locker locker(m_lock); + if (m_shutting_down) { + return; + } + + for (auto const &global_image_id : global_image_ids) { + bool schedule = m_policy->finish_action(global_image_id, r); + if (schedule) { + schedule_action(global_image_id); + } + } + } + + schedule_update_task(); +} + +template <typename I> +void ImageMap<I>::handle_update_request( + const Updates &updates, + const std::set<std::string> &remove_global_image_ids, int r) { + dout(20) << "r=" << r << dendl; + + std::set<std::string> global_image_ids; + + global_image_ids.insert(remove_global_image_ids.begin(), + remove_global_image_ids.end()); + for (auto const &update : updates) { + global_image_ids.insert(update.global_image_id); + } + + continue_action(global_image_ids, r); +} + +template <typename I> +void ImageMap<I>::update_image_mapping(Updates&& map_updates, + std::set<std::string>&& map_removals) { + if (map_updates.empty() && map_removals.empty()) { + return; + } + + dout(5) << "updates=[" << map_updates << "], " + << "removes=[" << map_removals << "]" << dendl; + + Context *on_finish = new FunctionContext( + [this, 
map_updates, map_removals](int r) { + handle_update_request(map_updates, map_removals, r); + finish_async_op(); + }); + on_finish = create_async_context_callback(m_threads->work_queue, on_finish); + + // empty meta policy for now.. + image_map::PolicyMetaNone policy_meta; + + bufferlist bl; + encode(image_map::PolicyData(policy_meta), bl); + + // prepare update map + std::map<std::string, cls::rbd::MirrorImageMap> update_mapping; + for (auto const &update : map_updates) { + update_mapping.emplace( + update.global_image_id, cls::rbd::MirrorImageMap(update.instance_id, + update.mapped_time, bl)); + } + + start_async_op(); + image_map::UpdateRequest<I> *req = image_map::UpdateRequest<I>::create( + m_ioctx, std::move(update_mapping), std::move(map_removals), on_finish); + req->send(); +} + +template <typename I> +void ImageMap<I>::process_updates() { + dout(20) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_timer_task == nullptr); + + Updates map_updates; + std::set<std::string> map_removals; + Updates acquire_updates; + Updates release_updates; + + // gather updates by advancing the state machine + m_lock.Lock(); + for (auto const &global_image_id : m_global_image_ids) { + image_map::ActionType action_type = + m_policy->start_action(global_image_id); + image_map::LookupInfo info = m_policy->lookup(global_image_id); + + dout(15) << "global_image_id=" << global_image_id << ", " + << "action=" << action_type << ", " + << "instance=" << info.instance_id << dendl; + switch (action_type) { + case image_map::ACTION_TYPE_NONE: + continue; + case image_map::ACTION_TYPE_MAP_UPDATE: + ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID); + map_updates.emplace_back(global_image_id, info.instance_id, + info.mapped_time); + break; + case image_map::ACTION_TYPE_MAP_REMOVE: + map_removals.emplace(global_image_id); + break; + case image_map::ACTION_TYPE_ACQUIRE: + ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID); + 
acquire_updates.emplace_back(global_image_id, info.instance_id); + break; + case image_map::ACTION_TYPE_RELEASE: + ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID); + release_updates.emplace_back(global_image_id, info.instance_id); + break; + } + } + m_global_image_ids.clear(); + m_lock.Unlock(); + + // notify listener (acquire, release) and update on-disk map. note + // that its safe to process this outside m_lock as we still hold + // timer lock. + notify_listener_acquire_release_images(acquire_updates, release_updates); + update_image_mapping(std::move(map_updates), std::move(map_removals)); +} + +template <typename I> +void ImageMap<I>::schedule_update_task() { + Mutex::Locker timer_lock(m_threads->timer_lock); + schedule_update_task(m_threads->timer_lock); +} + +template <typename I> +void ImageMap<I>::schedule_update_task(const Mutex &timer_lock) { + ceph_assert(m_threads->timer_lock.is_locked()); + + schedule_rebalance_task(); + + if (m_timer_task != nullptr) { + return; + } + + { + Mutex::Locker locker(m_lock); + if (m_global_image_ids.empty()) { + return; + } + } + + m_timer_task = new FunctionContext([this](int r) { + ceph_assert(m_threads->timer_lock.is_locked()); + m_timer_task = nullptr; + + process_updates(); + }); + + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + double after = cct->_conf.get_val<double>("rbd_mirror_image_policy_update_throttle_interval"); + + dout(20) << "scheduling image check update (" << m_timer_task << ")" + << " after " << after << " second(s)" << dendl; + m_threads->timer->add_event_after(after, m_timer_task); +} + +template <typename I> +void ImageMap<I>::rebalance() { + ceph_assert(m_rebalance_task == nullptr); + + { + Mutex::Locker locker(m_lock); + if (m_async_op_tracker.empty() && m_global_image_ids.empty()){ + dout(20) << "starting rebalance" << dendl; + + std::set<std::string> remap_global_image_ids; + m_policy->add_instances({}, &remap_global_image_ids); + + for (auto const 
&global_image_id : remap_global_image_ids) { + schedule_action(global_image_id); + } + } + } + + schedule_update_task(m_threads->timer_lock); +} + +template <typename I> +void ImageMap<I>::schedule_rebalance_task() { + ceph_assert(m_threads->timer_lock.is_locked()); + + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + + // fetch the updated value of idle timeout for (re)scheduling + double resched_after = cct->_conf.get_val<double>( + "rbd_mirror_image_policy_rebalance_timeout"); + if (!resched_after) { + return; + } + + // cancel existing rebalance task if any before scheduling + if (m_rebalance_task != nullptr) { + m_threads->timer->cancel_event(m_rebalance_task); + } + + m_rebalance_task = new FunctionContext([this](int _) { + ceph_assert(m_threads->timer_lock.is_locked()); + m_rebalance_task = nullptr; + + rebalance(); + }); + + dout(20) << "scheduling rebalance (" << m_rebalance_task << ")" + << " after " << resched_after << " second(s)" << dendl; + m_threads->timer->add_event_after(resched_after, m_rebalance_task); +} + +template <typename I> +void ImageMap<I>::schedule_action(const std::string &global_image_id) { + dout(20) << "global_image_id=" << global_image_id << dendl; + ceph_assert(m_lock.is_locked()); + + m_global_image_ids.emplace(global_image_id); +} + +template <typename I> +void ImageMap<I>::notify_listener_acquire_release_images( + const Updates &acquire, const Updates &release) { + if (acquire.empty() && release.empty()) { + return; + } + + dout(5) << "acquire=[" << acquire << "], " + << "release=[" << release << "]" << dendl; + + for (auto const &update : acquire) { + m_listener.acquire_image( + update.global_image_id, update.instance_id, + create_async_context_callback( + m_threads->work_queue, + new C_NotifyInstance(this, update.global_image_id, true))); + } + + for (auto const &update : release) { + m_listener.release_image( + update.global_image_id, update.instance_id, + create_async_context_callback( + 
m_threads->work_queue, + new C_NotifyInstance(this, update.global_image_id, true))); + } +} + +template <typename I> +void ImageMap<I>::notify_listener_remove_images(const std::string &peer_uuid, + const Updates &remove) { + dout(5) << "peer_uuid=" << peer_uuid << ", " + << "remove=[" << remove << "]" << dendl; + + for (auto const &update : remove) { + m_listener.remove_image( + peer_uuid, update.global_image_id, update.instance_id, + create_async_context_callback( + m_threads->work_queue, + new C_NotifyInstance(this, update.global_image_id, false))); + } +} + +template <typename I> +void ImageMap<I>::handle_load(const std::map<std::string, + cls::rbd::MirrorImageMap> &image_mapping) { + dout(20) << dendl; + + { + Mutex::Locker locker(m_lock); + m_policy->init(image_mapping); + + for (auto& pair : image_mapping) { + schedule_action(pair.first); + } + } + schedule_update_task(); +} + +template <typename I> +void ImageMap<I>::handle_peer_ack_remove(const std::string &global_image_id, + int r) { + Mutex::Locker locker(m_lock); + dout(5) << "global_image_id=" << global_image_id << dendl; + + if (r < 0) { + derr << "failed to remove global_image_id=" << global_image_id << dendl; + } + + auto peer_it = m_peer_map.find(global_image_id); + if (peer_it == m_peer_map.end()) { + return; + } + + m_peer_map.erase(peer_it); +} + +template <typename I> +void ImageMap<I>::update_images_added( + const std::string &peer_uuid, + const std::set<std::string> &global_image_ids) { + dout(5) << "peer_uuid=" << peer_uuid << ", " + << "global_image_ids=[" << global_image_ids << "]" << dendl; + ceph_assert(m_lock.is_locked()); + + for (auto const &global_image_id : global_image_ids) { + auto result = m_peer_map[global_image_id].insert(peer_uuid); + if (result.second && m_peer_map[global_image_id].size() == 1) { + if (m_policy->add_image(global_image_id)) { + schedule_action(global_image_id); + } + } + } +} + +template <typename I> +void ImageMap<I>::update_images_removed( + const std::string 
&peer_uuid, + const std::set<std::string> &global_image_ids) { + dout(5) << "peer_uuid=" << peer_uuid << ", " + << "global_image_ids=[" << global_image_ids << "]" << dendl; + ceph_assert(m_lock.is_locked()); + + Updates to_remove; + for (auto const &global_image_id : global_image_ids) { + image_map::LookupInfo info = m_policy->lookup(global_image_id); + bool image_mapped = (info.instance_id != image_map::UNMAPPED_INSTANCE_ID); + + bool image_removed = image_mapped; + bool peer_removed = false; + auto peer_it = m_peer_map.find(global_image_id); + if (peer_it != m_peer_map.end()) { + auto& peer_set = peer_it->second; + peer_removed = peer_set.erase(peer_uuid); + image_removed = peer_removed && peer_set.empty(); + } + + if (image_mapped && peer_removed && !peer_uuid.empty()) { + // peer image has been deleted + to_remove.emplace_back(global_image_id, info.instance_id); + } + + if (image_mapped && image_removed) { + // local and peer images have been deleted + if (m_policy->remove_image(global_image_id)) { + schedule_action(global_image_id); + } + } + } + + if (!to_remove.empty()) { + // removal notification will be notified instantly. 
this is safe + // even after scheduling action for images as we still hold m_lock + notify_listener_remove_images(peer_uuid, to_remove); + } +} + +template <typename I> +void ImageMap<I>::update_instances_added( + const std::vector<std::string> &instance_ids) { + { + Mutex::Locker locker(m_lock); + if (m_shutting_down) { + return; + } + + std::vector<std::string> filtered_instance_ids; + filter_instance_ids(instance_ids, &filtered_instance_ids, false); + if (filtered_instance_ids.empty()) { + return; + } + + dout(20) << "instance_ids=" << filtered_instance_ids << dendl; + + std::set<std::string> remap_global_image_ids; + m_policy->add_instances(filtered_instance_ids, &remap_global_image_ids); + + for (auto const &global_image_id : remap_global_image_ids) { + schedule_action(global_image_id); + } + } + + schedule_update_task(); +} + +template <typename I> +void ImageMap<I>::update_instances_removed( + const std::vector<std::string> &instance_ids) { + { + Mutex::Locker locker(m_lock); + if (m_shutting_down) { + return; + } + + std::vector<std::string> filtered_instance_ids; + filter_instance_ids(instance_ids, &filtered_instance_ids, true); + if (filtered_instance_ids.empty()) { + return; + } + + dout(20) << "instance_ids=" << filtered_instance_ids << dendl; + + std::set<std::string> remap_global_image_ids; + m_policy->remove_instances(filtered_instance_ids, &remap_global_image_ids); + + for (auto const &global_image_id : remap_global_image_ids) { + schedule_action(global_image_id); + } + } + + schedule_update_task(); +} + +template <typename I> +void ImageMap<I>::update_images(const std::string &peer_uuid, + std::set<std::string> &&added_global_image_ids, + std::set<std::string> &&removed_global_image_ids) { + dout(5) << "peer_uuid=" << peer_uuid << ", " << "added_count=" + << added_global_image_ids.size() << ", " << "removed_count=" + << removed_global_image_ids.size() << dendl; + + { + Mutex::Locker locker(m_lock); + if (m_shutting_down) { + return; + } + + if 
(!removed_global_image_ids.empty()) { + update_images_removed(peer_uuid, removed_global_image_ids); + } + if (!added_global_image_ids.empty()) { + update_images_added(peer_uuid, added_global_image_ids); + } + } + + schedule_update_task(); +} + +template <typename I> +void ImageMap<I>::handle_peer_ack(const std::string &global_image_id, int r) { + dout (20) << "global_image_id=" << global_image_id << ", r=" << r + << dendl; + + continue_action({global_image_id}, r); +} + +template <typename I> +void ImageMap<I>::init(Context *on_finish) { + dout(20) << dendl; + + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + std::string policy_type = cct->_conf.get_val<string>("rbd_mirror_image_policy_type"); + + if (policy_type == "none" || policy_type == "simple") { + m_policy.reset(image_map::SimplePolicy::create(m_ioctx)); + } else { + ceph_abort(); // not really needed as such, but catch it. + } + + dout(20) << "mapping policy=" << policy_type << dendl; + + start_async_op(); + C_LoadMap *ctx = new C_LoadMap(this, on_finish); + image_map::LoadRequest<I> *req = image_map::LoadRequest<I>::create( + m_ioctx, &ctx->image_mapping, ctx); + req->send(); +} + +template <typename I> +void ImageMap<I>::shut_down(Context *on_finish) { + dout(20) << dendl; + + { + Mutex::Locker timer_lock(m_threads->timer_lock); + + { + Mutex::Locker locker(m_lock); + ceph_assert(!m_shutting_down); + + m_shutting_down = true; + m_policy.reset(); + } + + if (m_timer_task != nullptr) { + m_threads->timer->cancel_event(m_timer_task); + m_timer_task = nullptr; + } + if (m_rebalance_task != nullptr) { + m_threads->timer->cancel_event(m_rebalance_task); + m_rebalance_task = nullptr; + } + } + + wait_for_async_ops(on_finish); +} + +template <typename I> +void ImageMap<I>::filter_instance_ids( + const std::vector<std::string> &instance_ids, + std::vector<std::string> *filtered_instance_ids, bool removal) const { + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + 
std::string policy_type = cct->_conf.get_val<string>("rbd_mirror_image_policy_type"); + + if (policy_type != "none") { + *filtered_instance_ids = instance_ids; + return; + } + + if (removal) { + // propagate removals for external instances + for (auto& instance_id : instance_ids) { + if (instance_id != m_instance_id) { + filtered_instance_ids->push_back(instance_id); + } + } + } else if (std::find(instance_ids.begin(), instance_ids.end(), + m_instance_id) != instance_ids.end()) { + // propagate addition only for local instance + filtered_instance_ids->push_back(m_instance_id); + } +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::ImageMap<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/ImageMap.h b/src/tools/rbd_mirror/ImageMap.h new file mode 100644 index 00000000..283f55db --- /dev/null +++ b/src/tools/rbd_mirror/ImageMap.h @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_H +#define CEPH_RBD_MIRROR_IMAGE_MAP_H + +#include <vector> + +#include "common/Mutex.h" +#include "include/Context.h" +#include "common/AsyncOpTracker.h" +#include "cls/rbd/cls_rbd_types.h" +#include "include/rados/librados.hpp" + +#include "image_map/Policy.h" +#include "image_map/Types.h" + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +template <typename ImageCtxT = librbd::ImageCtx> +class ImageMap { +public: + static ImageMap *create(librados::IoCtx &ioctx, Threads<ImageCtxT> *threads, + const std::string& instance_id, + image_map::Listener &listener) { + return new ImageMap(ioctx, threads, instance_id, listener); + } + + ~ImageMap(); + + // init (load) the instance map from disk + void init(Context *on_finish); + + // shut down map operations + void shut_down(Context *on_finish); + + // update (add/remove) images + void update_images(const std::string &peer_uuid, + 
std::set<std::string> &&added_global_image_ids, + std::set<std::string> &&removed_global_image_ids); + + // add/remove instances + void update_instances_added(const std::vector<std::string> &instances); + void update_instances_removed(const std::vector<std::string> &instances); + +private: + struct C_NotifyInstance; + + ImageMap(librados::IoCtx &ioctx, Threads<ImageCtxT> *threads, + const std::string& instance_id, image_map::Listener &listener); + + struct Update { + std::string global_image_id; + std::string instance_id; + utime_t mapped_time; + + Update(const std::string &global_image_id, const std::string &instance_id, + utime_t mapped_time) + : global_image_id(global_image_id), + instance_id(instance_id), + mapped_time(mapped_time) { + } + Update(const std::string &global_image_id, const std::string &instance_id) + : Update(global_image_id, instance_id, ceph_clock_now()) { + } + + friend std::ostream& operator<<(std::ostream& os, + const Update& update) { + os << "{global_image_id=" << update.global_image_id << ", " + << "instance_id=" << update.instance_id << "}"; + return os; + } + + }; + typedef std::list<Update> Updates; + + // Lock ordering: m_threads->timer_lock, m_lock + + librados::IoCtx &m_ioctx; + Threads<ImageCtxT> *m_threads; + std::string m_instance_id; + image_map::Listener &m_listener; + + std::unique_ptr<image_map::Policy> m_policy; // our mapping policy + + Context *m_timer_task = nullptr; + Mutex m_lock; + bool m_shutting_down = false; + AsyncOpTracker m_async_op_tracker; + + // global_image_id -> registered peers ("" == local, remote otherwise) + std::map<std::string, std::set<std::string> > m_peer_map; + + std::set<std::string> m_global_image_ids; + + Context *m_rebalance_task = nullptr; + + struct C_LoadMap : Context { + ImageMap *image_map; + Context *on_finish; + + std::map<std::string, cls::rbd::MirrorImageMap> image_mapping; + + C_LoadMap(ImageMap *image_map, Context *on_finish) + : image_map(image_map), + on_finish(on_finish) { + } + + 
void finish(int r) override { + if (r == 0) { + image_map->handle_load(image_mapping); + } + + image_map->finish_async_op(); + on_finish->complete(r); + } + }; + + // async op-tracker helper routines + void start_async_op() { + m_async_op_tracker.start_op(); + } + void finish_async_op() { + m_async_op_tracker.finish_op(); + } + void wait_for_async_ops(Context *on_finish) { + m_async_op_tracker.wait_for_ops(on_finish); + } + + void handle_peer_ack(const std::string &global_image_id, int r); + void handle_peer_ack_remove(const std::string &global_image_id, int r); + + void handle_load(const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping); + void handle_update_request(const Updates &updates, + const std::set<std::string> &remove_global_image_ids, int r); + + // continue (retry or resume depending on state machine) processing + // current action. + void continue_action(const std::set<std::string> &global_image_ids, int r); + + // schedule an image for update + void schedule_action(const std::string &global_image_id); + + void schedule_update_task(); + void schedule_update_task(const Mutex &timer_lock); + void process_updates(); + void update_image_mapping(Updates&& map_updates, + std::set<std::string>&& map_removals); + + void rebalance(); + void schedule_rebalance_task(); + + void notify_listener_acquire_release_images(const Updates &acquire, const Updates &release); + void notify_listener_remove_images(const std::string &peer_uuid, const Updates &remove); + + void update_images_added(const std::string &peer_uuid, + const std::set<std::string> &global_image_ids); + void update_images_removed(const std::string &peer_uuid, + const std::set<std::string> &global_image_ids); + + void filter_instance_ids(const std::vector<std::string> &instance_ids, + std::vector<std::string> *filtered_instance_ids, + bool removal) const; + +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_H diff --git 
a/src/tools/rbd_mirror/ImageReplayer.cc b/src/tools/rbd_mirror/ImageReplayer.cc new file mode 100644 index 00000000..6c6ee2d5 --- /dev/null +++ b/src/tools/rbd_mirror/ImageReplayer.cc @@ -0,0 +1,1896 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "common/Formatter.h" +#include "common/admin_socket.h" +#include "common/debug.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "global/global_context.h" +#include "journal/Journaler.h" +#include "journal/ReplayHandler.h" +#include "journal/Settings.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/journal/Replay.h" +#include "ImageDeleter.h" +#include "ImageReplayer.h" +#include "Threads.h" +#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h" +#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h" +#include "tools/rbd_mirror/image_replayer/EventPreprocessor.h" +#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h" +#include "tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h" +#include "tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::" << *this << " " \ + << __func__ << ": " + +using std::map; +using std::string; +using std::unique_ptr; +using std::shared_ptr; +using std::vector; + +extern PerfCounters *g_perf_counters; + +namespace rbd { +namespace mirror { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; +using namespace rbd::mirror::image_replayer; + +template <typename I> +std::ostream &operator<<(std::ostream &os, 
+ const typename ImageReplayer<I>::State &state); + +namespace { + +template <typename I> +struct ReplayHandler : public ::journal::ReplayHandler { + ImageReplayer<I> *replayer; + ReplayHandler(ImageReplayer<I> *replayer) : replayer(replayer) {} + void get() override {} + void put() override {} + + void handle_entries_available() override { + replayer->handle_replay_ready(); + } + void handle_complete(int r) override { + std::stringstream ss; + if (r < 0) { + ss << "replay completed with error: " << cpp_strerror(r); + } + replayer->handle_replay_complete(r, ss.str()); + } +}; + +template <typename I> +class ImageReplayerAdminSocketCommand { +public: + ImageReplayerAdminSocketCommand(const std::string &desc, + ImageReplayer<I> *replayer) + : desc(desc), replayer(replayer) { + } + virtual ~ImageReplayerAdminSocketCommand() {} + virtual bool call(Formatter *f, stringstream *ss) = 0; + + std::string desc; + ImageReplayer<I> *replayer; + bool registered = false; +}; + +template <typename I> +class StatusCommand : public ImageReplayerAdminSocketCommand<I> { +public: + explicit StatusCommand(const std::string &desc, ImageReplayer<I> *replayer) + : ImageReplayerAdminSocketCommand<I>(desc, replayer) { + } + + bool call(Formatter *f, stringstream *ss) override { + this->replayer->print_status(f, ss); + return true; + } +}; + +template <typename I> +class StartCommand : public ImageReplayerAdminSocketCommand<I> { +public: + explicit StartCommand(const std::string &desc, ImageReplayer<I> *replayer) + : ImageReplayerAdminSocketCommand<I>(desc, replayer) { + } + + bool call(Formatter *f, stringstream *ss) override { + this->replayer->start(nullptr, true); + return true; + } +}; + +template <typename I> +class StopCommand : public ImageReplayerAdminSocketCommand<I> { +public: + explicit StopCommand(const std::string &desc, ImageReplayer<I> *replayer) + : ImageReplayerAdminSocketCommand<I>(desc, replayer) { + } + + bool call(Formatter *f, stringstream *ss) override { + 
this->replayer->stop(nullptr, true); + return true; + } +}; + +template <typename I> +class RestartCommand : public ImageReplayerAdminSocketCommand<I> { +public: + explicit RestartCommand(const std::string &desc, ImageReplayer<I> *replayer) + : ImageReplayerAdminSocketCommand<I>(desc, replayer) { + } + + bool call(Formatter *f, stringstream *ss) override { + this->replayer->restart(); + return true; + } +}; + +template <typename I> +class FlushCommand : public ImageReplayerAdminSocketCommand<I> { +public: + explicit FlushCommand(const std::string &desc, ImageReplayer<I> *replayer) + : ImageReplayerAdminSocketCommand<I>(desc, replayer) { + } + + bool call(Formatter *f, stringstream *ss) override { + this->replayer->flush(); + return true; + } +}; + +template <typename I> +class ImageReplayerAdminSocketHook : public AdminSocketHook { +public: + ImageReplayerAdminSocketHook(CephContext *cct, const std::string &name, + ImageReplayer<I> *replayer) + : admin_socket(cct->get_admin_socket()), + commands{{"rbd mirror flush " + name, + new FlushCommand<I>("flush rbd mirror " + name, replayer)}, + {"rbd mirror restart " + name, + new RestartCommand<I>("restart rbd mirror " + name, replayer)}, + {"rbd mirror start " + name, + new StartCommand<I>("start rbd mirror " + name, replayer)}, + {"rbd mirror status " + name, + new StatusCommand<I>("get status for rbd mirror " + name, replayer)}, + {"rbd mirror stop " + name, + new StopCommand<I>("stop rbd mirror " + name, replayer)}} { + } + + int register_commands() { + for (auto &it : commands) { + int r = admin_socket->register_command(it.first, it.first, this, + it.second->desc); + if (r < 0) { + return r; + } + it.second->registered = true; + } + return 0; + } + + ~ImageReplayerAdminSocketHook() override { + for (auto &it : commands) { + if (it.second->registered) { + admin_socket->unregister_command(it.first); + } + delete it.second; + } + commands.clear(); + } + + bool call(std::string_view command, const cmdmap_t& cmdmap, + 
std::string_view format, bufferlist& out) override { + auto i = commands.find(command); + ceph_assert(i != commands.end()); + Formatter *f = Formatter::create(format); + stringstream ss; + bool r = i->second->call(f, &ss); + delete f; + out.append(ss); + return r; + } + +private: + typedef std::map<std::string, ImageReplayerAdminSocketCommand<I>*, + std::less<>> Commands; + + AdminSocket *admin_socket; + Commands commands; +}; + +uint32_t calculate_replay_delay(const utime_t &event_time, + int mirroring_replay_delay) { + if (mirroring_replay_delay <= 0) { + return 0; + } + + utime_t now = ceph_clock_now(); + if (event_time + mirroring_replay_delay <= now) { + return 0; + } + + // ensure it is rounded up when converting to integer + return (event_time + mirroring_replay_delay - now) + 1; +} + +} // anonymous namespace + +template <typename I> +void ImageReplayer<I>::BootstrapProgressContext::update_progress( + const std::string &description, bool flush) +{ + const std::string desc = "bootstrapping, " + description; + replayer->set_state_description(0, desc); + if (flush) { + replayer->update_mirror_image_status(false, boost::none); + } +} + +template <typename I> +void ImageReplayer<I>::RemoteJournalerListener::handle_update( + ::journal::JournalMetadata *) { + FunctionContext *ctx = new FunctionContext([this](int r) { + replayer->handle_remote_journal_metadata_updated(); + }); + replayer->m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +ImageReplayer<I>::ImageReplayer(Threads<I> *threads, + InstanceWatcher<I> *instance_watcher, + RadosRef local, + const std::string &local_mirror_uuid, + int64_t local_pool_id, + const std::string &global_image_id) : + m_threads(threads), + m_instance_watcher(instance_watcher), + m_local(local), + m_local_mirror_uuid(local_mirror_uuid), + m_local_pool_id(local_pool_id), + m_global_image_id(global_image_id), m_local_image_name(global_image_id), + m_lock("rbd::mirror::ImageReplayer " + stringify(local_pool_id) + " " + 
+ global_image_id), + m_progress_cxt(this), + m_journal_listener(new JournalListener(this)), + m_remote_listener(this) +{ + // Register asok commands using a temporary "remote_pool_name/global_image_id" + // name. When the image name becomes known on start the asok commands will be + // re-registered using "remote_pool_name/remote_image_name" name. + + std::string pool_name; + int r = m_local->pool_reverse_lookup(m_local_pool_id, &pool_name); + if (r < 0) { + derr << "error resolving local pool " << m_local_pool_id + << ": " << cpp_strerror(r) << dendl; + pool_name = stringify(m_local_pool_id); + } + + m_name = pool_name + "/" + m_global_image_id; + register_admin_socket_hook(); +} + +template <typename I> +ImageReplayer<I>::~ImageReplayer() +{ + unregister_admin_socket_hook(); + ceph_assert(m_event_preprocessor == nullptr); + ceph_assert(m_replay_status_formatter == nullptr); + ceph_assert(m_local_image_ctx == nullptr); + ceph_assert(m_local_replay == nullptr); + ceph_assert(m_remote_journaler == nullptr); + ceph_assert(m_replay_handler == nullptr); + ceph_assert(m_on_start_finish == nullptr); + ceph_assert(m_on_stop_finish == nullptr); + ceph_assert(m_bootstrap_request == nullptr); + ceph_assert(m_in_flight_status_updates == 0); + + delete m_journal_listener; +} + +template <typename I> +image_replayer::HealthState ImageReplayer<I>::get_health_state() const { + Mutex::Locker locker(m_lock); + + if (!m_mirror_image_status_state) { + return image_replayer::HEALTH_STATE_OK; + } else if (*m_mirror_image_status_state == + cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING || + *m_mirror_image_status_state == + cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN) { + return image_replayer::HEALTH_STATE_WARNING; + } + return image_replayer::HEALTH_STATE_ERROR; +} + +template <typename I> +void ImageReplayer<I>::add_peer(const std::string &peer_uuid, + librados::IoCtx &io_ctx) { + Mutex::Locker locker(m_lock); + auto it = m_peers.find({peer_uuid}); + if (it == m_peers.end()) { + 
m_peers.insert({peer_uuid, io_ctx}); + } +} + +template <typename I> +void ImageReplayer<I>::set_state_description(int r, const std::string &desc) { + dout(10) << r << " " << desc << dendl; + + Mutex::Locker l(m_lock); + m_last_r = r; + m_state_desc = desc; +} + +template <typename I> +void ImageReplayer<I>::start(Context *on_finish, bool manual) +{ + dout(10) << "on_finish=" << on_finish << dendl; + + int r = 0; + { + Mutex::Locker locker(m_lock); + if (!is_stopped_()) { + derr << "already running" << dendl; + r = -EINVAL; + } else if (m_manual_stop && !manual) { + dout(5) << "stopped manually, ignoring start without manual flag" + << dendl; + r = -EPERM; + } else { + m_state = STATE_STARTING; + m_last_r = 0; + m_state_desc.clear(); + m_manual_stop = false; + m_delete_requested = false; + + if (on_finish != nullptr) { + ceph_assert(m_on_start_finish == nullptr); + m_on_start_finish = on_finish; + } + ceph_assert(m_on_stop_finish == nullptr); + } + } + + if (r < 0) { + if (on_finish) { + on_finish->complete(r); + } + return; + } + + m_local_ioctx.reset(new librados::IoCtx{}); + r = m_local->ioctx_create2(m_local_pool_id, *m_local_ioctx); + if (r < 0) { + m_local_ioctx.reset(); + + derr << "error opening ioctx for local pool " << m_local_pool_id + << ": " << cpp_strerror(r) << dendl; + on_start_fail(r, "error opening local pool"); + return; + } + + prepare_local_image(); +} + +template <typename I> +void ImageReplayer<I>::prepare_local_image() { + dout(10) << dendl; + + m_local_image_id = ""; + Context *ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_prepare_local_image>(this); + auto req = PrepareLocalImageRequest<I>::create( + *m_local_ioctx, m_global_image_id, &m_local_image_id, &m_local_image_name, + &m_local_image_tag_owner, m_threads->work_queue, ctx); + req->send(); +} + +template <typename I> +void ImageReplayer<I>::handle_prepare_local_image(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -ENOENT) { + dout(10) << "local 
image does not exist" << dendl; + } else if (r < 0) { + on_start_fail(r, "error preparing local image for replay"); + return; + } else { + reregister_admin_socket_hook(); + } + + // local image doesn't exist or is non-primary + prepare_remote_image(); +} + +template <typename I> +void ImageReplayer<I>::prepare_remote_image() { + dout(10) << dendl; + if (m_peers.empty()) { + // technically nothing to bootstrap, but it handles the status update + bootstrap(); + return; + } + + // TODO need to support multiple remote images + ceph_assert(!m_peers.empty()); + m_remote_image = {*m_peers.begin()}; + + auto cct = static_cast<CephContext *>(m_local->cct()); + journal::Settings journal_settings; + journal_settings.commit_interval = cct->_conf.get_val<double>( + "rbd_mirror_journal_commit_age"); + journal_settings.max_fetch_bytes = cct->_conf.get_val<Option::size_t>( + "rbd_mirror_journal_max_fetch_bytes"); + + Context *ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_prepare_remote_image>(this); + auto req = PrepareRemoteImageRequest<I>::create( + m_threads, m_remote_image.io_ctx, m_global_image_id, m_local_mirror_uuid, + m_local_image_id, journal_settings, &m_remote_image.mirror_uuid, + &m_remote_image.image_id, &m_remote_journaler, &m_client_state, + &m_client_meta, ctx); + req->send(); +} + +template <typename I> +void ImageReplayer<I>::handle_prepare_remote_image(int r) { + dout(10) << "r=" << r << dendl; + + ceph_assert(r < 0 ? 
m_remote_journaler == nullptr : m_remote_journaler != nullptr); + if (r < 0 && !m_local_image_id.empty() && + m_local_image_tag_owner == librbd::Journal<>::LOCAL_MIRROR_UUID) { + // local image is primary -- fall-through + } else if (r == -ENOENT) { + dout(10) << "remote image does not exist" << dendl; + + // TODO need to support multiple remote images + if (m_remote_image.image_id.empty() && !m_local_image_id.empty() && + m_local_image_tag_owner == m_remote_image.mirror_uuid) { + // local image exists and is non-primary and linked to the missing + // remote image + + m_delete_requested = true; + on_start_fail(0, "remote image no longer exists"); + } else { + on_start_fail(-ENOENT, "remote image does not exist"); + } + return; + } else if (r < 0) { + on_start_fail(r, "error retrieving remote image id"); + return; + } + + bootstrap(); +} + +template <typename I> +void ImageReplayer<I>::bootstrap() { + dout(10) << dendl; + + if (!m_local_image_id.empty() && + m_local_image_tag_owner == librbd::Journal<>::LOCAL_MIRROR_UUID) { + dout(5) << "local image is primary" << dendl; + on_start_fail(0, "local image is primary"); + return; + } else if (m_peers.empty()) { + dout(5) << "no peer clusters" << dendl; + on_start_fail(-ENOENT, "no peer clusters"); + return; + } + + BootstrapRequest<I> *request = nullptr; + { + Mutex::Locker locker(m_lock); + if (on_start_interrupted(m_lock)) { + return; + } + + auto ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_bootstrap>(this); + request = BootstrapRequest<I>::create( + m_threads, *m_local_ioctx, m_remote_image.io_ctx, m_instance_watcher, + &m_local_image_ctx, m_local_image_id, m_remote_image.image_id, + m_global_image_id, m_local_mirror_uuid, m_remote_image.mirror_uuid, + m_remote_journaler, &m_client_state, &m_client_meta, ctx, + &m_resync_requested, &m_progress_cxt); + request->get(); + m_bootstrap_request = request; + } + + update_mirror_image_status(false, boost::none); + 
reschedule_update_status_task(10); + + request->send(); +} + +template <typename I> +void ImageReplayer<I>::handle_bootstrap(int r) { + dout(10) << "r=" << r << dendl; + { + Mutex::Locker locker(m_lock); + m_bootstrap_request->put(); + m_bootstrap_request = nullptr; + if (m_local_image_ctx) { + m_local_image_id = m_local_image_ctx->id; + } + } + + if (on_start_interrupted()) { + return; + } else if (r == -EREMOTEIO) { + m_local_image_tag_owner = ""; + dout(5) << "remote image is non-primary" << dendl; + on_start_fail(-EREMOTEIO, "remote image is non-primary"); + return; + } else if (r == -EEXIST) { + m_local_image_tag_owner = ""; + on_start_fail(r, "split-brain detected"); + return; + } else if (r < 0) { + on_start_fail(r, "error bootstrapping replay"); + return; + } else if (m_resync_requested) { + on_start_fail(0, "resync requested"); + return; + } + + ceph_assert(m_local_journal == nullptr); + { + RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock); + if (m_local_image_ctx->journal != nullptr) { + m_local_journal = m_local_image_ctx->journal; + m_local_journal->add_listener(m_journal_listener); + } + } + + if (m_local_journal == nullptr) { + on_start_fail(-EINVAL, "error accessing local journal"); + return; + } + + update_mirror_image_status(false, boost::none); + init_remote_journaler(); +} + +template <typename I> +void ImageReplayer<I>::init_remote_journaler() { + dout(10) << dendl; + + Context *ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_init_remote_journaler>(this); + m_remote_journaler->init(ctx); +} + +template <typename I> +void ImageReplayer<I>::handle_init_remote_journaler(int r) { + dout(10) << "r=" << r << dendl; + + if (on_start_interrupted()) { + return; + } else if (r < 0) { + derr << "failed to initialize remote journal: " << cpp_strerror(r) << dendl; + on_start_fail(r, "error initializing remote journal"); + return; + } + + m_remote_journaler->add_listener(&m_remote_listener); + + cls::journal::Client client; 
+ r = m_remote_journaler->get_cached_client(m_local_mirror_uuid, &client); + if (r < 0) { + derr << "error retrieving remote journal client: " << cpp_strerror(r) + << dendl; + on_start_fail(r, "error retrieving remote journal client"); + return; + } + + dout(5) << "image_id=" << m_local_image_id << ", " + << "client_meta.image_id=" << m_client_meta.image_id << ", " + << "client.state=" << client.state << dendl; + if (m_client_meta.image_id == m_local_image_id && + client.state != cls::journal::CLIENT_STATE_CONNECTED) { + dout(5) << "client flagged disconnected, stopping image replay" << dendl; + if (m_local_image_ctx->config.template get_val<bool>("rbd_mirroring_resync_after_disconnect")) { + m_resync_requested = true; + on_start_fail(-ENOTCONN, "disconnected: automatic resync"); + } else { + on_start_fail(-ENOTCONN, "disconnected"); + } + return; + } + + start_replay(); +} + +template <typename I> +void ImageReplayer<I>::start_replay() { + dout(10) << dendl; + + Context *start_ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_start_replay>(this); + m_local_journal->start_external_replay(&m_local_replay, start_ctx); +} + +template <typename I> +void ImageReplayer<I>::handle_start_replay(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + ceph_assert(m_local_replay == nullptr); + derr << "error starting external replay on local image " + << m_local_image_id << ": " << cpp_strerror(r) << dendl; + on_start_fail(r, "error starting replay on local image"); + return; + } + + m_replay_status_formatter = + ReplayStatusFormatter<I>::create(m_remote_journaler, m_local_mirror_uuid); + + Context *on_finish(nullptr); + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_STARTING); + m_state = STATE_REPLAYING; + std::swap(m_on_start_finish, on_finish); + } + + m_event_preprocessor = EventPreprocessor<I>::create( + *m_local_image_ctx, *m_remote_journaler, m_local_mirror_uuid, + &m_client_meta, m_threads->work_queue); + + 
update_mirror_image_status(true, boost::none); + reschedule_update_status_task(30); + + if (on_replay_interrupted()) { + return; + } + + { + CephContext *cct = static_cast<CephContext *>(m_local->cct()); + double poll_seconds = cct->_conf.get_val<double>( + "rbd_mirror_journal_poll_age"); + + Mutex::Locker locker(m_lock); + m_replay_handler = new ReplayHandler<I>(this); + m_remote_journaler->start_live_replay(m_replay_handler, poll_seconds); + + dout(10) << "m_remote_journaler=" << *m_remote_journaler << dendl; + } + + dout(10) << "start succeeded" << dendl; + if (on_finish != nullptr) { + dout(10) << "on finish complete, r=" << r << dendl; + on_finish->complete(r); + } +} + +template <typename I> +void ImageReplayer<I>::on_start_fail(int r, const std::string &desc) +{ + dout(10) << "r=" << r << dendl; + Context *ctx = new FunctionContext([this, r, desc](int _r) { + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_STARTING); + m_state = STATE_STOPPING; + if (r < 0 && r != -ECANCELED && r != -EREMOTEIO && r != -ENOENT) { + derr << "start failed: " << cpp_strerror(r) << dendl; + } else { + dout(10) << "start canceled" << dendl; + } + } + + set_state_description(r, desc); + if (m_local_ioctx) { + update_mirror_image_status(false, boost::none); + } + reschedule_update_status_task(-1); + shut_down(r); + }); + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +bool ImageReplayer<I>::on_start_interrupted() { + Mutex::Locker locker(m_lock); + return on_start_interrupted(m_lock); +} + +template <typename I> +bool ImageReplayer<I>::on_start_interrupted(Mutex& lock) { + ceph_assert(m_lock.is_locked()); + ceph_assert(m_state == STATE_STARTING); + if (!m_stop_requested) { + return false; + } + + on_start_fail(-ECANCELED, ""); + return true; +} + +template <typename I> +void ImageReplayer<I>::stop(Context *on_finish, bool manual, int r, + const std::string& desc) +{ + dout(10) << "on_finish=" << on_finish << ", manual=" << manual + << ", desc=" << 
desc << dendl; + + image_replayer::BootstrapRequest<I> *bootstrap_request = nullptr; + bool shut_down_replay = false; + bool running = true; + { + Mutex::Locker locker(m_lock); + + if (!is_running_()) { + running = false; + } else { + if (!is_stopped_()) { + if (m_state == STATE_STARTING) { + dout(10) << "canceling start" << dendl; + if (m_bootstrap_request != nullptr) { + bootstrap_request = m_bootstrap_request; + bootstrap_request->get(); + } + } else { + dout(10) << "interrupting replay" << dendl; + shut_down_replay = true; + } + /* the member is asserted empty, so after the swap below the local on_finish is null and ownership of the callback has moved to m_on_stop_finish */+ ceph_assert(m_on_stop_finish == nullptr); + std::swap(m_on_stop_finish, on_finish); + m_stop_requested = true; + m_manual_stop = manual; + } + } + } + + // avoid holding lock since bootstrap request will update status + if (bootstrap_request != nullptr) { + dout(10) << "canceling bootstrap" << dendl; + bootstrap_request->cancel(); + bootstrap_request->put(); + } + + if (!running) { + dout(20) << "not running" << dendl; + if (on_finish) { + on_finish->complete(-EINVAL); + } + return; + } + + if (shut_down_replay) { + on_stop_journal_replay(r, desc); + } else if (on_finish != nullptr) { + on_finish->complete(0); + } +} + /* on_stop_journal_replay: moves STATE_REPLAYING to STATE_STOPPING and does nothing in any other state. It then records (r, desc) as the state description, forces a status update, cancels the periodic status task and calls shut_down(0); note that r itself is not passed on to shut_down. */+template <typename I> +void ImageReplayer<I>::on_stop_journal_replay(int r, const std::string &desc) +{ + dout(10) << dendl; + + { + Mutex::Locker locker(m_lock); + if (m_state != STATE_REPLAYING) { + // might be invoked multiple times while stopping + return; + } + m_stop_requested = true; + m_state = STATE_STOPPING; + } + + set_state_description(r, desc); + update_mirror_image_status(true, boost::none); + reschedule_update_status_task(-1); + shut_down(0); +} + /* handle_replay_ready: fetches the next entry from the remote journaler. Returns without doing anything if a stop was requested or no entry could be popped. After a successful pop an op is started on m_event_replay_tracker; the stopping branch finishes that op itself. An entry whose tag tid equals the already validated tag goes straight to preprocess_entry(), anything else goes through replay_flush() first. */+template <typename I> +void ImageReplayer<I>::handle_replay_ready() +{ + dout(20) << dendl; + if (on_replay_interrupted()) { + return; + } + + if (!m_remote_journaler->try_pop_front(&m_replay_entry, &m_replay_tag_tid)) { + return; + } + + m_event_replay_tracker.start_op(); + + m_lock.Lock(); + bool stopping = (m_state == STATE_STOPPING); + m_lock.Unlock(); + + 
if (stopping) { + dout(10) << "stopping event replay" << dendl; + m_event_replay_tracker.finish_op(); + return; + } + + if (m_replay_tag_valid && m_replay_tag.tid == m_replay_tag_tid) { + preprocess_entry(); + return; + } + + replay_flush(); +} + /* restart: calls stop() and, from its completion, start(on_finish, true). A negative stop result is ignored on purpose. */+template <typename I> +void ImageReplayer<I>::restart(Context *on_finish) +{ + FunctionContext *ctx = new FunctionContext( + [this, on_finish](int r) { + if (r < 0) { + // Try start anyway. + } + start(on_finish, true); + }); + stop(ctx); +} + /* flush: synchronous flush. Blocks on a C_SaferCond until flush_local_replay() completes, then requests a non-forced status update. The flush result is not examined. */+template <typename I> +void ImageReplayer<I>::flush() +{ + dout(10) << dendl; + C_SaferCond ctx; + flush_local_replay(&ctx); + ctx.wait(); + + update_mirror_image_status(false, boost::none); +} + /* flush_local_replay: if the state is not STATE_REPLAYING, completes on_flush with 0 immediately. Otherwise flushes m_local_replay while holding m_lock; the result is delivered to handle_flush_local_replay. */+template <typename I> +void ImageReplayer<I>::flush_local_replay(Context* on_flush) +{ + m_lock.Lock(); + if (m_state != STATE_REPLAYING) { + m_lock.Unlock(); + on_flush->complete(0); + return; + } + + dout(15) << dendl; + auto ctx = new FunctionContext( + [this, on_flush](int r) { + handle_flush_local_replay(on_flush, r); + }); + m_local_replay->flush(ctx); + m_lock.Unlock(); +} + /* handle_flush_local_replay: on error logs and completes on_flush with r, otherwise continues with flush_commit_position. */+template <typename I> +void ImageReplayer<I>::handle_flush_local_replay(Context* on_flush, int r) +{ + dout(15) << "r=" << r << dendl; + if (r < 0) { + derr << "error flushing local replay: " << cpp_strerror(r) << dendl; + on_flush->complete(r); + return; + } + + flush_commit_position(on_flush); +} + /* flush_commit_position: if the state is not STATE_REPLAYING, completes on_flush with 0 immediately. Otherwise asks the remote journaler to flush its commit position while holding m_lock; the result is delivered to handle_flush_commit_position. */+template <typename I> +void ImageReplayer<I>::flush_commit_position(Context* on_flush) +{ + m_lock.Lock(); + if (m_state != STATE_REPLAYING) { + m_lock.Unlock(); + on_flush->complete(0); + return; + } + + dout(15) << dendl; + auto ctx = new FunctionContext( + [this, on_flush](int r) { + handle_flush_commit_position(on_flush, r); + }); + m_remote_journaler->flush_commit_position(ctx); + m_lock.Unlock(); +} + /* handle_flush_commit_position: logs a negative r and completes on_flush with r in every case. */+template <typename I> +void ImageReplayer<I>::handle_flush_commit_position(Context* on_flush, int r) +{ + dout(15) << "r=" << r << dendl; + if (r < 0) { + derr << "error flushing remote journal commit 
position: " + << cpp_strerror(r) << dendl; + } + + on_flush->complete(r); +} + /* on_replay_interrupted: samples m_stop_requested under m_lock. When it is set, on_stop_journal_replay() is called without arguments after the lock is dropped. Returns the sampled flag. */+template <typename I> +bool ImageReplayer<I>::on_replay_interrupted() +{ + bool shut_down; + { + Mutex::Locker locker(m_lock); + shut_down = m_stop_requested; + } + + if (shut_down) { + on_stop_journal_replay(); + } + return shut_down; +} + /* print_status: emits the replayer name and state while holding m_lock. With a formatter the values go into an object section that is flushed to *ss; without one they are written to *ss as plain text. */+template <typename I> +void ImageReplayer<I>::print_status(Formatter *f, stringstream *ss) +{ + dout(10) << dendl; + + Mutex::Locker l(m_lock); + + if (f) { + f->open_object_section("image_replayer"); + f->dump_string("name", m_name); + f->dump_string("state", to_string(m_state)); + f->close_section(); + f->flush(*ss); + } else { + *ss << m_name << ": state: " << to_string(m_state); + } +} + /* handle_replay_complete: logs a negative r, sets m_stop_requested under m_lock and passes (r, error_desc) to on_stop_journal_replay. */+template <typename I> +void ImageReplayer<I>::handle_replay_complete(int r, const std::string &error_desc) +{ + dout(10) << "r=" << r << dendl; + if (r < 0) { + derr << "replay encountered an error: " << cpp_strerror(r) << dendl; + } + + { + Mutex::Locker locker(m_lock); + m_stop_requested = true; + } + on_stop_journal_replay(r, error_desc); +} + /* replay_flush: moves STATE_REPLAYING to STATE_REPLAY_FLUSHING; in any other state it finishes the tracked op and returns. It then calls shut_down(false, ...) on m_local_replay. The chained callback stops the external replay, clears m_local_replay and, unless r is negative, starts a new external replay; either way the result ends up in handle_replay_flush. */+template <typename I> +void ImageReplayer<I>::replay_flush() { + dout(10) << dendl; + + bool interrupted = false; + { + Mutex::Locker locker(m_lock); + if (m_state != STATE_REPLAYING) { + dout(10) << "replay interrupted" << dendl; + interrupted = true; + } else { + m_state = STATE_REPLAY_FLUSHING; + } + } + + if (interrupted) { + m_event_replay_tracker.finish_op(); + return; + } + + // shut down the replay to flush all IO and ops and create a new + // replayer to handle the new tag epoch + Context *ctx = create_context_callback< + ImageReplayer<I>, &ImageReplayer<I>::handle_replay_flush>(this); + ctx = new FunctionContext([this, ctx](int r) { + m_local_image_ctx->journal->stop_external_replay(); + m_local_replay = nullptr; + + if (r < 0) { + ctx->complete(r); + return; + } + + m_local_journal->start_external_replay(&m_local_replay, ctx); + }); + m_local_replay->shut_down(false, ctx); +} + +template <typename I> +void 
ImageReplayer<I>::handle_replay_flush(int r) { /* completion of replay_flush(): asserts STATE_REPLAY_FLUSHING and restores STATE_REPLAYING, then either ends replay on error, returns if a stop was requested, or continues with get_remote_tag(). Both early exits finish the tracked op. */+ dout(10) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_REPLAY_FLUSHING); + m_state = STATE_REPLAYING; + } + + if (r < 0) { + derr << "replay flush encountered an error: " << cpp_strerror(r) << dendl; + m_event_replay_tracker.finish_op(); + handle_replay_complete(r, "replay flush encountered an error"); + return; + } else if (on_replay_interrupted()) { + m_event_replay_tracker.finish_op(); + return; + } + + get_remote_tag(); +} + /* get_remote_tag: asks the remote journaler for the tag m_replay_tag_tid; the tag is written to m_replay_tag and the result is delivered to handle_get_remote_tag. */+template <typename I> +void ImageReplayer<I>::get_remote_tag() { + dout(15) << "tag_tid: " << m_replay_tag_tid << dendl; + + Context *ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_get_remote_tag>(this); + m_remote_journaler->get_tag(m_replay_tag_tid, &m_replay_tag, ctx); +} + /* handle_get_remote_tag: decodes the tag payload into m_replay_tag_data, turning a buffer decode error into -EBADMSG. On any error the tracked op is finished and replay ends; otherwise the tag is marked valid and allocate_local_tag() runs. */+template <typename I> +void ImageReplayer<I>::handle_get_remote_tag(int r) { + dout(15) << "r=" << r << dendl; + + if (r == 0) { + try { + auto it = m_replay_tag.data.cbegin(); + decode(m_replay_tag_data, it); + } catch (const buffer::error &err) { + r = -EBADMSG; + } + } + + if (r < 0) { + derr << "failed to retrieve remote tag " << m_replay_tag_tid << ": " + << cpp_strerror(r) << dendl; + m_event_replay_tracker.finish_op(); + handle_replay_complete(r, "failed to retrieve remote tag"); + return; + } + + m_replay_tag_valid = true; + dout(15) << "decoded remote tag " << m_replay_tag_tid << ": " + << m_replay_tag_data << dendl; + + allocate_local_tag(); +} + /* allocate_local_tag: rewrites the mirror uuid of the remote tag and of its predecessor before allocating the local tag: LOCAL_MIRROR_UUID is replaced by the remote image uuid and m_local_mirror_uuid is replaced by LOCAL_MIRROR_UUID. ORPHAN_MIRROR_UUID is either skipped as a stale demotion event or sets m_stop_requested; in the latter case the tag is still allocated. */+template <typename I> +void ImageReplayer<I>::allocate_local_tag() { + dout(15) << dendl; + + std::string mirror_uuid = m_replay_tag_data.mirror_uuid; + if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) { + mirror_uuid = m_remote_image.mirror_uuid; + } else if (mirror_uuid == m_local_mirror_uuid) { + mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID; + } else if (mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) { + // handle possible edge condition where daemon can failover and + 
// the local image has already been promoted/demoted + auto local_tag_data = m_local_journal->get_tag_data(); + if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID && + (local_tag_data.predecessor.commit_valid && + local_tag_data.predecessor.mirror_uuid == + librbd::Journal<>::LOCAL_MIRROR_UUID)) { + dout(15) << "skipping stale demotion event" << dendl; + handle_process_entry_safe(m_replay_entry, m_replay_start_time, 0); + handle_replay_ready(); + return; + } else { + dout(5) << "encountered image demotion: stopping" << dendl; + Mutex::Locker locker(m_lock); + m_stop_requested = true; + } + } + /* the predecessor uuid gets the same remote-to-local translation as the tag uuid */+ librbd::journal::TagPredecessor predecessor(m_replay_tag_data.predecessor); + if (predecessor.mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) { + predecessor.mirror_uuid = m_remote_image.mirror_uuid; + } else if (predecessor.mirror_uuid == m_local_mirror_uuid) { + predecessor.mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID; + } + + dout(15) << "mirror_uuid=" << mirror_uuid << ", " + << "predecessor=" << predecessor << ", " + << "replay_tag_tid=" << m_replay_tag_tid << dendl; + Context *ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_allocate_local_tag>(this); + m_local_journal->allocate_tag(mirror_uuid, predecessor, ctx); +} + /* handle_allocate_local_tag: on error finishes the tracked op and ends replay with r, otherwise continues with preprocess_entry(). */+template <typename I> +void ImageReplayer<I>::handle_allocate_local_tag(int r) { + dout(15) << "r=" << r << ", " + << "tag_tid=" << m_local_journal->get_tag_tid() << dendl; + + if (r < 0) { + derr << "failed to allocate journal tag: " << cpp_strerror(r) << dendl; + m_event_replay_tracker.finish_op(); + handle_replay_complete(r, "failed to allocate journal tag"); + return; + } + + preprocess_entry(); +} + /* preprocess_entry: decodes the current replay entry into m_event_entry. A decode error ends replay. If calculate_replay_delay() returns 0 the entry is handled right away, otherwise handle_preprocess_entry_ready is scheduled on the timer. */+template <typename I> +void ImageReplayer<I>::preprocess_entry() { + dout(20) << "preprocessing entry tid=" << m_replay_entry.get_commit_tid() + << dendl; + + bufferlist data = m_replay_entry.get_data(); + auto it = data.cbegin(); + int r = m_local_replay->decode(&it, &m_event_entry); + if (r < 
0) { + derr << "failed to decode journal event" << dendl; + m_event_replay_tracker.finish_op(); + handle_replay_complete(r, "failed to decode journal event"); + return; + } + + uint32_t delay = calculate_replay_delay( + m_event_entry.timestamp, m_local_image_ctx->mirroring_replay_delay); + if (delay == 0) { + handle_preprocess_entry_ready(0); + return; + } + + dout(20) << "delaying replay by " << delay << " sec" << dendl; + /* delayed path: the timer callback asserts that timer_lock is held, clears the task pointer and re-queues the real work on the work queue */+ Mutex::Locker timer_locker(m_threads->timer_lock); + ceph_assert(m_delayed_preprocess_task == nullptr); + m_delayed_preprocess_task = new FunctionContext( + [this](int r) { + ceph_assert(m_threads->timer_lock.is_locked()); + m_delayed_preprocess_task = nullptr; + m_threads->work_queue->queue( + create_context_callback<ImageReplayer, + &ImageReplayer<I>::handle_preprocess_entry_ready>(this), 0); + }); + m_threads->timer->add_event_after(delay, m_delayed_preprocess_task); +} + /* handle_preprocess_entry_ready: r is asserted to be 0. Records m_replay_start_time, then either calls process_entry() directly or runs the event preprocessor first when it reports the entry requires it. */+template <typename I> +void ImageReplayer<I>::handle_preprocess_entry_ready(int r) { + dout(20) << "r=" << r << dendl; + ceph_assert(r == 0); + + m_replay_start_time = ceph_clock_now(); + if (!m_event_preprocessor->is_required(m_event_entry)) { + process_entry(); + return; + } + + Context *ctx = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_preprocess_entry_safe>(this); + m_event_preprocessor->preprocess(&m_event_entry, ctx); +} + /* handle_preprocess_entry_safe: on error finishes the tracked op and ends replay; -ECANCELED is reported with r=0 and the description lost exclusive lock. On success continues with process_entry(). */+template <typename I> +void ImageReplayer<I>::handle_preprocess_entry_safe(int r) { + dout(20) << "r=" << r << dendl; + + if (r < 0) { + m_event_replay_tracker.finish_op(); + + if (r == -ECANCELED) { + handle_replay_complete(0, "lost exclusive lock"); + } else { + derr << "failed to preprocess journal event" << dendl; + handle_replay_complete(r, "failed to preprocess journal event"); + } + return; + } + + process_entry(); +} + /* process_entry: submits m_event_entry to the local replay unless a stop was requested, in which case the tracked op is finished instead. */+template <typename I> +void ImageReplayer<I>::process_entry() { + dout(20) << "processing entry tid=" << m_replay_entry.get_commit_tid() + << dendl; + + // stop replaying events if stop has been 
requested + if (on_replay_interrupted()) { + m_event_replay_tracker.finish_op(); + return; + } + /* on_ready is routed to handle_process_entry_ready; on_commit is a C_ReplayCommitted that receives m_replay_entry by move together with the start time */+ Context *on_ready = create_context_callback< + ImageReplayer, &ImageReplayer<I>::handle_process_entry_ready>(this); + Context *on_commit = new C_ReplayCommitted(this, std::move(m_replay_entry), + m_replay_start_time); + + m_local_replay->process(m_event_entry, on_ready, on_commit); +} + /* handle_process_entry_ready: r is asserted to be 0. Picks up a changed local image name under snap_lock (requesting a status task reschedule if it changed) and then calls handle_replay_ready() for the next entry. */+template <typename I> +void ImageReplayer<I>::handle_process_entry_ready(int r) { + dout(20) << dendl; + ceph_assert(r == 0); + + bool update_status = false; + { + RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock); + if (m_local_image_name != m_local_image_ctx->name) { + m_local_image_name = m_local_image_ctx->name; + update_status = true; + } + } + + if (update_status) { + reschedule_update_status_task(0); + } + + // attempt to process the next event + handle_replay_ready(); +} + /* handle_process_entry_safe: called when an entry has been committed locally or failed to. A negative r ends replay, otherwise the entry is reported as committed to the remote journaler. In both cases the global perf counters are updated, and a work-queue callback updates the per-image counters under m_lock and finishes the tracked op. */+template <typename I> +void ImageReplayer<I>::handle_process_entry_safe(const ReplayEntry &replay_entry, + const utime_t &replay_start_time, + int r) { + dout(20) << "commit_tid=" << replay_entry.get_commit_tid() << ", r=" << r + << dendl; + + if (r < 0) { + derr << "failed to commit journal event: " << cpp_strerror(r) << dendl; + handle_replay_complete(r, "failed to commit journal event"); + } else { + ceph_assert(m_remote_journaler != nullptr); + m_remote_journaler->committed(replay_entry); + } + + auto bytes = replay_entry.get_data().length(); + auto latency = ceph_clock_now() - replay_start_time; + + if (g_perf_counters) { + g_perf_counters->inc(l_rbd_mirror_replay); + g_perf_counters->inc(l_rbd_mirror_replay_bytes, bytes); + g_perf_counters->tinc(l_rbd_mirror_replay_latency, latency); + } + + auto ctx = new FunctionContext( + [this, bytes, latency](int r) { + Mutex::Locker locker(m_lock); + if (m_perf_counters) { + m_perf_counters->inc(l_rbd_mirror_replay); + m_perf_counters->inc(l_rbd_mirror_replay_bytes, bytes); + m_perf_counters->tinc(l_rbd_mirror_replay_latency, latency); + } + 
m_event_replay_tracker.finish_op(); + }); + m_threads->work_queue->queue(ctx, 0); +} + /* update_mirror_image_status: under m_lock asks start_mirror_image_status_update(force, false) for permission; returns false if refused, otherwise queues the update with the optional state and returns true. */+template <typename I> +bool ImageReplayer<I>::update_mirror_image_status(bool force, + const OptionalState &state) { + dout(15) << dendl; + { + Mutex::Locker locker(m_lock); + if (!start_mirror_image_status_update(force, false)) { + return false; + } + } + + queue_mirror_image_status_update(state); + return true; +} + /* start_mirror_image_status_update: caller must hold m_lock (asserted). When force is false and the replayer is not stopped, the request is refused while not running, and deferred by setting m_update_status_requested if more updates are in flight than allowed (0, or 1 when restarting). Otherwise m_in_flight_status_updates is incremented and true is returned. */+template <typename I> +bool ImageReplayer<I>::start_mirror_image_status_update(bool force, + bool restarting) { + ceph_assert(m_lock.is_locked()); + + if (!force && !is_stopped_()) { + if (!is_running_()) { + dout(15) << "shut down in-progress: ignoring update" << dendl; + return false; + } else if (m_in_flight_status_updates > (restarting ? 1 : 0)) { + dout(15) << "already sending update" << dendl; + m_update_status_requested = true; + return false; + } + } + + ++m_in_flight_status_updates; + dout(15) << "in-flight updates=" << m_in_flight_status_updates << dendl; + return true; +} + /* finish_mirror_image_status_update: re-registers the admin socket hook, decrements m_in_flight_status_updates and, once it reaches zero, takes m_on_update_status_finish and completes it with 0 outside the lock. */+template <typename I> +void ImageReplayer<I>::finish_mirror_image_status_update() { + reregister_admin_socket_hook(); + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_in_flight_status_updates > 0); + if (--m_in_flight_status_updates > 0) { + dout(15) << "waiting on " << m_in_flight_status_updates << " in-flight " + << "updates" << dendl; + return; + } + + std::swap(on_finish, m_on_update_status_finish); + } + + dout(15) << dendl; + if (on_finish != nullptr) { + on_finish->complete(0); + } +} + /* queue_mirror_image_status_update: queues a two-step chain on the work queue: flush_local_replay() first, then send_mirror_status_update(state). */+template <typename I> +void ImageReplayer<I>::queue_mirror_image_status_update(const OptionalState &state) { + dout(15) << dendl; + + auto ctx = new FunctionContext( + [this, state](int r) { + send_mirror_status_update(state); + }); + + // ensure pending IO is flushed and the commit position is updated + // prior to updating the mirror status + ctx = new FunctionContext( + [this, ctx](int r) { + flush_local_replay(ctx); + }); + 
m_threads->work_queue->queue(ctx, 0); +} + /* send_mirror_status_update: copies the relevant members under m_lock, lets opt_state override the copied state, builds a cls::rbd::MirrorImageStatus from it and submits it asynchronously to the RBD_MIRRORING object; completion is delivered to handle_mirror_status_update. */+template <typename I> +void ImageReplayer<I>::send_mirror_status_update(const OptionalState &opt_state) { + State state; + std::string state_desc; + int last_r; + bool stopping_replay; + + OptionalMirrorImageStatusState mirror_image_status_state = + boost::make_optional(false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN); + image_replayer::BootstrapRequest<I>* bootstrap_request = nullptr; + { + Mutex::Locker locker(m_lock); + state = m_state; + state_desc = m_state_desc; + mirror_image_status_state = m_mirror_image_status_state; + last_r = m_last_r; + stopping_replay = (m_local_image_ctx != nullptr); + + if (m_bootstrap_request != nullptr) { + bootstrap_request = m_bootstrap_request; + bootstrap_request->get(); + } + } + /* is_syncing() is queried outside m_lock; the reference taken above keeps the request alive until put() */+ bool syncing = false; + if (bootstrap_request != nullptr) { + syncing = bootstrap_request->is_syncing(); + bootstrap_request->put(); + bootstrap_request = nullptr; + } + + if (opt_state) { + state = *opt_state; + } + + cls::rbd::MirrorImageStatus status; + status.up = true; + switch (state) { + case STATE_STARTING: + if (syncing) { + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING; + status.description = state_desc.empty() ? 
"syncing" : state_desc; + mirror_image_status_state = status.state; + } else { + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY; + status.description = "starting replay"; + } + break; + case STATE_REPLAYING: + case STATE_REPLAY_FLUSHING: + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_REPLAYING; + { /* if the formatter cannot supply a description right away this function returns early; on_req_finish re-enters it on success, routes -EAGAIN through handle_mirror_status_update, and does nothing for any other negative r */+ Context *on_req_finish = new FunctionContext( + [this](int r) { + dout(15) << "replay status ready: r=" << r << dendl; + if (r >= 0) { + send_mirror_status_update(boost::none); + } else if (r == -EAGAIN) { + // decrement in-flight status update counter + handle_mirror_status_update(r); + } + }); + + std::string desc; + ceph_assert(m_replay_status_formatter != nullptr); + if (!m_replay_status_formatter->get_or_send_update(&desc, + on_req_finish)) { + dout(15) << "waiting for replay status" << dendl; + return; + } + status.description = "replaying, " + desc; + mirror_image_status_state = boost::make_optional( + false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN); + } + break; + case STATE_STOPPING: /* reported as stopping only while stopping_replay is set; otherwise control falls through to the STATE_STOPPED handling */+ if (stopping_replay) { + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY; + status.description = state_desc.empty() ? "stopping replay" : state_desc; + break; + } + // FALLTHROUGH + case STATE_STOPPED: + if (last_r == -EREMOTEIO) { + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN; + status.description = state_desc; + mirror_image_status_state = status.state; + } else if (last_r < 0) { + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR; + status.description = state_desc; + mirror_image_status_state = status.state; + } else { + status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPED; + status.description = state_desc.empty() ? 
"stopped" : state_desc; + mirror_image_status_state = boost::none; + } + break; + default: + ceph_assert(!"invalid state"); + } + /* remember the computed sticky state for the next update */+ { + Mutex::Locker locker(m_lock); + m_mirror_image_status_state = mirror_image_status_state; + } + + // prevent the status from ping-ponging when failed replays are restarted + if (mirror_image_status_state && + *mirror_image_status_state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR) { + status.state = *mirror_image_status_state; + } + + dout(15) << "status=" << status << dendl; + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_image_status_set(&op, m_global_image_id, status); + + ceph_assert(m_local_ioctx); + librados::AioCompletion *aio_comp = create_rados_callback< + ImageReplayer<I>, &ImageReplayer<I>::handle_mirror_status_update>(this); + int r = m_local_ioctx->aio_operate(RBD_MIRRORING, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + /* handle_mirror_status_update: completion of a status write. Consumes m_update_status_requested; if an update was requested and the replayer is still running, a new update is started right away, otherwise the periodic task is re-armed while running. Finally the in-flight slot of the completed update is released. */+template <typename I> +void ImageReplayer<I>::handle_mirror_status_update(int r) { + dout(15) << "r=" << r << dendl; + + bool running = false; + bool started = false; + { + Mutex::Locker locker(m_lock); + bool update_status_requested = false; + std::swap(update_status_requested, m_update_status_requested); + + running = is_running_(); + if (running && update_status_requested) { + started = start_mirror_image_status_update(false, true); + } + } + + // if a deferred update is available, send it -- otherwise reschedule + // the timer task + if (started) { + queue_mirror_image_status_update(boost::none); + } else if (running) { + reschedule_update_status_task(0); + } + + // mark committed status update as no longer in-flight + finish_mirror_image_status_update(); +} + /* reschedule_update_status_task: cancels a pending periodic status task. A positive new_interval replaces the stored interval; a value of zero or more also schedules a new task while the replayer is running; a negative value only cancels. */+template <typename I> +void ImageReplayer<I>::reschedule_update_status_task(int new_interval) { + bool canceled_task = false; + { + Mutex::Locker locker(m_lock); + Mutex::Locker timer_locker(m_threads->timer_lock); + + if (m_update_status_task) { + dout(15) << "canceling existing status update task" << 
dendl; + + canceled_task = m_threads->timer->cancel_event(m_update_status_task); + m_update_status_task = nullptr; + } + + if (new_interval > 0) { + m_update_status_interval = new_interval; + } + /* start_mirror_image_status_update(true, false) reserves an in-flight slot for the scheduled task; a canceled task gives its slot back at the end of this function */+ if (new_interval >= 0 && is_running_() && + start_mirror_image_status_update(true, false)) { + m_update_status_task = new FunctionContext( + [this](int r) { + ceph_assert(m_threads->timer_lock.is_locked()); + m_update_status_task = nullptr; + + queue_mirror_image_status_update(boost::none); + }); + dout(15) << "scheduling status update task after " + << m_update_status_interval << " seconds" << dendl; + m_threads->timer->add_event_after(m_update_status_interval, + m_update_status_task); + } + } + + if (canceled_task) { + // decrement in-flight status update counter for canceled task + finish_mirror_image_status_update(); + } +} + /* shut_down: runs the teardown sequence; the state must already be STATE_STOPPING (asserted). Cancels a pending delayed preprocess task and the periodic status task. If status updates are still in flight it stores a callback in m_on_update_status_finish that calls shut_down(r) again and returns. Otherwise the teardown steps are chained as contexts and queued. */+template <typename I> +void ImageReplayer<I>::shut_down(int r) { + dout(10) << "r=" << r << dendl; + + bool canceled_delayed_preprocess_task = false; + { + Mutex::Locker timer_locker(m_threads->timer_lock); + if (m_delayed_preprocess_task != nullptr) { + canceled_delayed_preprocess_task = m_threads->timer->cancel_event( + m_delayed_preprocess_task); + ceph_assert(canceled_delayed_preprocess_task); + m_delayed_preprocess_task = nullptr; + } + } + if (canceled_delayed_preprocess_task) { + // wake up sleeping replay + m_event_replay_tracker.finish_op(); + } + + reschedule_update_status_task(-1); + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_state == STATE_STOPPING); + + // if status updates are in-flight, wait for them to complete + // before proceeding + if (m_in_flight_status_updates > 0) { + if (m_on_update_status_finish == nullptr) { + dout(15) << "waiting for in-flight status update" << dendl; + m_on_update_status_finish = new FunctionContext( + [this, r](int _r) { + shut_down(r); + }); + } + return; + } + } + + // NOTE: it's important to ensure that the local image is fully + // closed before attempting to close the remote journal in 
+ // case the remote cluster is unreachable + + // chain the shut down sequence (reverse order) /* every ctx = new FunctionContext(...) below captures the previous ctx and continues into it, so the context created last is the one that runs first */+ Context *ctx = new FunctionContext( + [this, r](int _r) { + if (m_local_ioctx) { + update_mirror_image_status(true, STATE_STOPPED); + } + handle_shut_down(r); + }); + + // close the remote journal + if (m_remote_journaler != nullptr) { + ctx = new FunctionContext([this, ctx](int r) { + delete m_remote_journaler; + m_remote_journaler = nullptr; + ctx->complete(0); + }); + ctx = new FunctionContext([this, ctx](int r) { + m_remote_journaler->remove_listener(&m_remote_listener); + m_remote_journaler->shut_down(ctx); + }); + } + + // stop the replay of remote journal events + if (m_replay_handler != nullptr) { + ctx = new FunctionContext([this, ctx](int r) { + delete m_replay_handler; + m_replay_handler = nullptr; + + m_event_replay_tracker.wait_for_ops(ctx); + }); + ctx = new FunctionContext([this, ctx](int r) { + m_remote_journaler->stop_replay(ctx); + }); + } + + // close the local image (release exclusive lock) + if (m_local_image_ctx) { + ctx = new FunctionContext([this, ctx](int r) { + CloseImageRequest<I> *request = CloseImageRequest<I>::create( + &m_local_image_ctx, ctx); + request->send(); + }); + } + + // shut down event replay into the local image + if (m_local_journal != nullptr) { + ctx = new FunctionContext([this, ctx](int r) { + m_local_journal = nullptr; + ctx->complete(0); + }); + if (m_local_replay != nullptr) { + ctx = new FunctionContext([this, ctx](int r) { + m_local_journal->stop_external_replay(); + m_local_replay = nullptr; + + EventPreprocessor<I>::destroy(m_event_preprocessor); + m_event_preprocessor = nullptr; + ctx->complete(0); + }); + } + ctx = new FunctionContext([this, ctx](int r) { + // blocks if listener notification is in-progress + m_local_journal->remove_listener(m_journal_listener); + ctx->complete(0); + }); + } + + // wait for all local in-flight replay events to complete + ctx = new FunctionContext([this, ctx](int r) { + if (r < 0) { 
+ derr << "error shutting down journal replay: " << cpp_strerror(r) + << dendl; + } + + m_event_replay_tracker.wait_for_ops(ctx); + }); + + // flush any local in-flight replay events + if (m_local_replay != nullptr) { + ctx = new FunctionContext([this, ctx](int r) { + m_local_replay->shut_down(true, ctx); + }); + } + + m_threads->work_queue->queue(ctx, 0); +} + /* handle_shut_down: last stage of the shutdown. Defers itself through m_on_update_status_finish while status updates are in flight. When a delete or resync was requested it clears m_local_image_id, moves the local image to the trash and re-enters itself from the completion. Otherwise it destroys the status formatter, switches to STATE_STOPPED and completes the pending start and stop callbacks. */+template <typename I> +void ImageReplayer<I>::handle_shut_down(int r) { + reschedule_update_status_task(-1); + + bool resync_requested = false; + bool delete_requested = false; + bool unregister_asok_hook = false; + { + Mutex::Locker locker(m_lock); + + // if status updates are in-flight, wait for them to complete + // before proceeding + if (m_in_flight_status_updates > 0) { + if (m_on_update_status_finish == nullptr) { + dout(15) << "waiting for in-flight status update" << dendl; + m_on_update_status_finish = new FunctionContext( + [this, r](int _r) { + handle_shut_down(r); + }); + } + return; + } + + if (m_delete_requested && !m_local_image_id.empty()) { + ceph_assert(m_remote_image.image_id.empty()); + dout(0) << "remote image no longer exists: scheduling deletion" << dendl; + unregister_asok_hook = true; + std::swap(delete_requested, m_delete_requested); + } + + std::swap(resync_requested, m_resync_requested); + if (delete_requested || resync_requested) { + m_local_image_id = ""; + } else if (m_last_r == -ENOENT && + m_local_image_id.empty() && m_remote_image.image_id.empty()) { + dout(0) << "mirror image no longer exists" << dendl; + unregister_asok_hook = true; + m_finished = true; + } + } + + if (unregister_asok_hook) { + unregister_admin_socket_hook(); + } + + if (delete_requested || resync_requested) { + dout(5) << "moving image to trash" << dendl; + auto ctx = new FunctionContext([this, r](int) { + handle_shut_down(r); + }); + ImageDeleter<I>::trash_move(*m_local_ioctx, m_global_image_id, + resync_requested, m_threads->work_queue, ctx); + return; + } + + dout(10) << "stop complete" << dendl; + 
ReplayStatusFormatter<I>::destroy(m_replay_status_formatter); + m_replay_status_formatter = nullptr; + + Context *on_start = nullptr; + Context *on_stop = nullptr; + { + Mutex::Locker locker(m_lock); + std::swap(on_start, m_on_start_finish); + std::swap(on_stop, m_on_stop_finish); + m_stop_requested = false; + ceph_assert(m_delayed_preprocess_task == nullptr); + ceph_assert(m_state == STATE_STOPPING); + m_state = STATE_STOPPED; + } + /* a pending start callback receives r, after which r is reset to 0, so a stop callback completing in the same pass reports 0 */+ if (on_start != nullptr) { + dout(10) << "on start finish complete, r=" << r << dendl; + on_start->complete(r); + r = 0; + } + if (on_stop != nullptr) { + dout(10) << "on stop finish complete, r=" << r << dendl; + on_stop->complete(r); + } +} + /* handle_remote_journal_metadata_updated: looks up the cached journal client registered under m_local_mirror_uuid while holding m_lock; returns silently when not running or when the lookup fails. If the client state is not CLIENT_STATE_CONNECTED, replay is stopped with -ENOTCONN. */+template <typename I> +void ImageReplayer<I>::handle_remote_journal_metadata_updated() { + dout(20) << dendl; + + cls::journal::Client client; + { + Mutex::Locker locker(m_lock); + if (!is_running_()) { + return; + } + + int r = m_remote_journaler->get_cached_client(m_local_mirror_uuid, &client); + if (r < 0) { + derr << "failed to retrieve client: " << cpp_strerror(r) << dendl; + return; + } + } + + if (client.state != cls::journal::CLIENT_STATE_CONNECTED) { + dout(0) << "client flagged disconnected, stopping image replay" << dendl; + stop(nullptr, false, -ENOTCONN, "disconnected"); + } +} + /* to_string: maps a State to its display name; values outside the known set are rendered through stringify(). */+template <typename I> +std::string ImageReplayer<I>::to_string(const State state) { + switch (state) { + case ImageReplayer<I>::STATE_STARTING: + return "Starting"; + case ImageReplayer<I>::STATE_REPLAYING: + return "Replaying"; + case ImageReplayer<I>::STATE_REPLAY_FLUSHING: + return "ReplayFlushing"; + case ImageReplayer<I>::STATE_STOPPING: + return "Stopping"; + case ImageReplayer<I>::STATE_STOPPED: + return "Stopped"; + default: + break; + } + return "Unknown(" + stringify(state) + ")"; +} + /* resync_image: sets m_resync_requested (without taking m_lock) and calls stop(on_finish). */+template <typename I> +void ImageReplayer<I>::resync_image(Context *on_finish) { + dout(10) << dendl; + + m_resync_requested = true; + stop(on_finish); +} + +template <typename I> +void 
ImageReplayer<I>::register_admin_socket_hook() { /* creates the admin socket hook and, if its commands register successfully, the per-image perf counters. Returns immediately when a hook already exists. On registration failure the hook is deleted. The registered asok hook message is logged before registration is attempted. */+ ImageReplayerAdminSocketHook<I> *asok_hook; + { + Mutex::Locker locker(m_lock); + if (m_asok_hook != nullptr) { + return; + } + + ceph_assert(m_perf_counters == nullptr); + + dout(15) << "registered asok hook: " << m_name << dendl; + asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name, + this); + int r = asok_hook->register_commands(); + if (r == 0) { + m_asok_hook = asok_hook; + + CephContext *cct = static_cast<CephContext *>(m_local->cct()); + auto prio = cct->_conf.get_val<int64_t>("rbd_mirror_perf_stats_prio"); + PerfCountersBuilder plb(g_ceph_context, "rbd_mirror_" + m_name, + l_rbd_mirror_first, l_rbd_mirror_last); + plb.add_u64_counter(l_rbd_mirror_replay, "replay", "Replays", "r", prio); + plb.add_u64_counter(l_rbd_mirror_replay_bytes, "replay_bytes", + "Replayed data", "rb", prio, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_rbd_mirror_replay_latency, "replay_latency", + "Replay latency", "rl", prio); + m_perf_counters = plb.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(m_perf_counters); + + return; + } + derr << "error registering admin socket commands" << dendl; + } + delete asok_hook; +} + /* unregister_admin_socket_hook: takes the hook and perf counter pointers out of the members under m_lock, then deletes the hook and removes and deletes the counters outside the lock. */+template <typename I> +void ImageReplayer<I>::unregister_admin_socket_hook() { + dout(15) << dendl; + + AdminSocketHook *asok_hook = nullptr; + PerfCounters *perf_counters = nullptr; + { + Mutex::Locker locker(m_lock); + std::swap(asok_hook, m_asok_hook); + std::swap(perf_counters, m_perf_counters); + } + delete asok_hook; + if (perf_counters != nullptr) { + g_ceph_context->get_perfcounters_collection()->remove(perf_counters); + delete perf_counters; + } +} + /* reregister_admin_socket_hook: rebuilds the name from the pool name and m_local_image_name. If a hook exists and the name is unchanged nothing happens; otherwise the hook is unregistered and registered again. m_local_ioctx is dereferenced without a null check. */+template <typename I> +void ImageReplayer<I>::reregister_admin_socket_hook() { + { + Mutex::Locker locker(m_lock); + auto name = m_local_ioctx->get_pool_name() + "/" + m_local_image_name; + if (m_asok_hook != nullptr && m_name == name) { + return; + } + m_name = name; + } + unregister_admin_socket_hook(); + 
register_admin_socket_hook(); +} + /* stream operator: prints the object address followed by the local pool id and the global image id. */+template <typename I> +std::ostream &operator<<(std::ostream &os, const ImageReplayer<I> &replayer) +{ + os << "ImageReplayer: " << &replayer << " [" << replayer.get_local_pool_id() + << "/" << replayer.get_global_image_id() << "]"; + return os; +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::ImageReplayer<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/ImageReplayer.h b/src/tools/rbd_mirror/ImageReplayer.h new file mode 100644 index 00000000..9af3e961 --- /dev/null +++ b/src/tools/rbd_mirror/ImageReplayer.h @@ -0,0 +1,438 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_H +#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_H + +#include "common/AsyncOpTracker.h" +#include "common/Mutex.h" +#include "common/WorkQueue.h" +#include "include/rados/librados.hpp" +#include "cls/journal/cls_journal_types.h" +#include "cls/rbd/cls_rbd_types.h" +#include "journal/JournalMetadataListener.h" +#include "journal/ReplayEntry.h" +#include "librbd/ImageCtx.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include "ProgressContext.h" +#include "tools/rbd_mirror/Types.h" +#include "tools/rbd_mirror/image_replayer/Types.h" + +#include <boost/noncopyable.hpp> +#include <boost/optional.hpp> + +#include <set> +#include <map> +#include <atomic> +#include <string> +#include <vector> + +class AdminSocketHook; +class PerfCounters; + +namespace journal { + +class Journaler; +class ReplayHandler; + +} + +namespace librbd { + +class ImageCtx; +namespace journal { template <typename> class Replay; } + +} + +namespace rbd { +namespace mirror { + +template <typename> struct InstanceWatcher; +template <typename> struct Threads; + +namespace image_replayer { template <typename> class BootstrapRequest; } +namespace image_replayer { template <typename> class EventPreprocessor; } +namespace image_replayer { template 
<typename> class ReplayStatusFormatter; } + +/** + * Replays changes from a remote cluster for a single image. + */ +template <typename ImageCtxT = librbd::ImageCtx> +class ImageReplayer { +public: + static ImageReplayer *create( + Threads<ImageCtxT> *threads, InstanceWatcher<ImageCtxT> *instance_watcher, + RadosRef local, const std::string &local_mirror_uuid, int64_t local_pool_id, + const std::string &global_image_id) { + return new ImageReplayer(threads, instance_watcher, local, + local_mirror_uuid, local_pool_id, global_image_id); + } + void destroy() { + delete this; + } + + ImageReplayer(Threads<ImageCtxT> *threads, + InstanceWatcher<ImageCtxT> *instance_watcher, + RadosRef local, const std::string &local_mirror_uuid, + int64_t local_pool_id, const std::string &global_image_id); + virtual ~ImageReplayer(); + ImageReplayer(const ImageReplayer&) = delete; + ImageReplayer& operator=(const ImageReplayer&) = delete; + + bool is_stopped() { Mutex::Locker l(m_lock); return is_stopped_(); } + bool is_running() { Mutex::Locker l(m_lock); return is_running_(); } + bool is_replaying() { Mutex::Locker l(m_lock); return is_replaying_(); } + + std::string get_name() { Mutex::Locker l(m_lock); return m_name; }; + void set_state_description(int r, const std::string &desc); + + // TODO temporary until policy handles release of image replayers + inline bool is_finished() const { + Mutex::Locker locker(m_lock); + return m_finished; + } + inline void set_finished(bool finished) { + Mutex::Locker locker(m_lock); + m_finished = finished; + } + + inline bool is_blacklisted() const { + Mutex::Locker locker(m_lock); + return (m_last_r == -EBLACKLISTED); + } + + image_replayer::HealthState get_health_state() const; + + void add_peer(const std::string &peer_uuid, librados::IoCtx &remote_io_ctx); + + inline int64_t get_local_pool_id() const { + return m_local_pool_id; + } + inline const std::string& get_global_image_id() const { + return m_global_image_id; + } + + void start(Context 
*on_finish = nullptr, bool manual = false); + void stop(Context *on_finish = nullptr, bool manual = false, + int r = 0, const std::string& desc = ""); + void restart(Context *on_finish = nullptr); + void flush(); + + void resync_image(Context *on_finish=nullptr); + + void print_status(Formatter *f, stringstream *ss); + + virtual void handle_replay_ready(); + virtual void handle_replay_complete(int r, const std::string &error_desc); + +protected: + /** + * @verbatim + * (error) + * <uninitialized> <------------------------------------ FAIL + * | ^ + * v * + * <starting> * + * | * + * v (error) * + * PREPARE_LOCAL_IMAGE * * * * * * * * * * * * * * * * * * + * | * + * v (error) * + * PREPARE_REMOTE_IMAGE * * * * * * * * * * * * * * * * * * + * | * + * v (error) * + * BOOTSTRAP_IMAGE * * * * * * * * * * * * * * * * * * * * + * | * + * v (error) * + * INIT_REMOTE_JOURNALER * * * * * * * * * * * * * * * * * + * | * + * v (error) * + * START_REPLAY * * * * * * * * * * * * * * * * * * * * * * + * | + * | /--------------------------------------------\ + * | | | + * v v (asok flush) | + * REPLAYING -------------> LOCAL_REPLAY_FLUSH | + * | \ | | + * | | v | + * | | FLUSH_COMMIT_POSITION | + * | | | | + * | | \--------------------/| + * | | | + * | | (entries available) | + * | \-----------> REPLAY_READY | + * | | | + * | | (skip if not | + * | v needed) (error) + * | REPLAY_FLUSH * * * * * * * * * + * | | | * + * | | (skip if not | * + * | v needed) (error) * + * | GET_REMOTE_TAG * * * * * * * * + * | | | * + * | | (skip if not | * + * | v needed) (error) * + * | ALLOCATE_LOCAL_TAG * * * * * * + * | | | * + * | v (error) * + * | PREPROCESS_ENTRY * * * * * * * + * | | | * + * | v (error) * + * | PROCESS_ENTRY * * * * * * * * * + * | | | * + * | \---------------------/ * + * v * + * REPLAY_COMPLETE < * * * * * * * * * * * * * * * * * * * + * | + * v + * JOURNAL_REPLAY_SHUT_DOWN + * | + * v + * LOCAL_IMAGE_CLOSE + * | + * v + * <stopped> + * + * @endverbatim + */ + + virtual 
void on_start_fail(int r, const std::string &desc); + virtual bool on_start_interrupted(); + virtual bool on_start_interrupted(Mutex& lock); + + virtual void on_stop_journal_replay(int r = 0, const std::string &desc = ""); + + bool on_replay_interrupted(); + +private: + typedef typename librbd::journal::TypeTraits<ImageCtxT>::ReplayEntry ReplayEntry; + + enum State { + STATE_UNKNOWN, + STATE_STARTING, + STATE_REPLAYING, + STATE_REPLAY_FLUSHING, + STATE_STOPPING, + STATE_STOPPED, + }; + + struct RemoteImage { + std::string mirror_uuid; + std::string image_id; + librados::IoCtx io_ctx; + + RemoteImage() { + } + RemoteImage(const Peer& peer) : io_ctx(peer.io_ctx) { + } + }; + + typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler; + typedef boost::optional<State> OptionalState; + typedef boost::optional<cls::rbd::MirrorImageStatusState> + OptionalMirrorImageStatusState; + + struct JournalListener : public librbd::journal::Listener { + ImageReplayer *img_replayer; + + JournalListener(ImageReplayer *img_replayer) + : img_replayer(img_replayer) { + } + + void handle_close() override { + img_replayer->on_stop_journal_replay(); + } + + void handle_promoted() override { + img_replayer->on_stop_journal_replay(0, "force promoted"); + } + + void handle_resync() override { + img_replayer->resync_image(); + } + }; + + class BootstrapProgressContext : public ProgressContext { + public: + BootstrapProgressContext(ImageReplayer<ImageCtxT> *replayer) : + replayer(replayer) { + } + + void update_progress(const std::string &description, + bool flush = true) override; + private: + ImageReplayer<ImageCtxT> *replayer; + }; + + Threads<ImageCtxT> *m_threads; + InstanceWatcher<ImageCtxT> *m_instance_watcher; + + Peers m_peers; + RemoteImage m_remote_image; + + RadosRef m_local; + std::string m_local_mirror_uuid; + int64_t m_local_pool_id; + std::string m_local_image_id; + std::string m_global_image_id; + std::string m_local_image_name; + std::string m_name; + + 
mutable Mutex m_lock; + State m_state = STATE_STOPPED; + std::string m_state_desc; + + OptionalMirrorImageStatusState m_mirror_image_status_state = + boost::make_optional(false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN); + int m_last_r = 0; + + BootstrapProgressContext m_progress_cxt; + + bool m_finished = false; + bool m_delete_requested = false; + bool m_resync_requested = false; + + image_replayer::EventPreprocessor<ImageCtxT> *m_event_preprocessor = nullptr; + image_replayer::ReplayStatusFormatter<ImageCtxT> *m_replay_status_formatter = + nullptr; + IoCtxRef m_local_ioctx; + ImageCtxT *m_local_image_ctx = nullptr; + std::string m_local_image_tag_owner; + + decltype(ImageCtxT::journal) m_local_journal = nullptr; + librbd::journal::Replay<ImageCtxT> *m_local_replay = nullptr; + Journaler* m_remote_journaler = nullptr; + ::journal::ReplayHandler *m_replay_handler = nullptr; + librbd::journal::Listener *m_journal_listener; + + Context *m_on_start_finish = nullptr; + Context *m_on_stop_finish = nullptr; + Context *m_update_status_task = nullptr; + int m_update_status_interval = 0; + librados::AioCompletion *m_update_status_comp = nullptr; + bool m_stop_requested = false; + bool m_manual_stop = false; + + AdminSocketHook *m_asok_hook = nullptr; + PerfCounters *m_perf_counters = nullptr; + + image_replayer::BootstrapRequest<ImageCtxT> *m_bootstrap_request = nullptr; + + uint32_t m_in_flight_status_updates = 0; + bool m_update_status_requested = false; + Context *m_on_update_status_finish = nullptr; + + cls::journal::ClientState m_client_state = + cls::journal::CLIENT_STATE_DISCONNECTED; + librbd::journal::MirrorPeerClientMeta m_client_meta; + + ReplayEntry m_replay_entry; + utime_t m_replay_start_time; + bool m_replay_tag_valid = false; + uint64_t m_replay_tag_tid = 0; + cls::journal::Tag m_replay_tag; + librbd::journal::TagData m_replay_tag_data; + librbd::journal::EventEntry m_event_entry; + AsyncOpTracker m_event_replay_tracker; + Context 
*m_delayed_preprocess_task = nullptr; + + struct RemoteJournalerListener : public ::journal::JournalMetadataListener { + ImageReplayer *replayer; + + RemoteJournalerListener(ImageReplayer *replayer) : replayer(replayer) { } + + void handle_update(::journal::JournalMetadata *) override; + } m_remote_listener; + + struct C_ReplayCommitted : public Context { + ImageReplayer *replayer; + ReplayEntry replay_entry; + utime_t replay_start_time; + + C_ReplayCommitted(ImageReplayer *replayer, + ReplayEntry &&replay_entry, + const utime_t &replay_start_time) + : replayer(replayer), replay_entry(std::move(replay_entry)), + replay_start_time(replay_start_time) { + } + void finish(int r) override { + replayer->handle_process_entry_safe(replay_entry, replay_start_time, r); + } + }; + + static std::string to_string(const State state); + + bool is_stopped_() const { + return m_state == STATE_STOPPED; + } + bool is_running_() const { + return !is_stopped_() && m_state != STATE_STOPPING && !m_stop_requested; + } + bool is_replaying_() const { + return (m_state == STATE_REPLAYING || + m_state == STATE_REPLAY_FLUSHING); + } + + void flush_local_replay(Context* on_flush); + void handle_flush_local_replay(Context* on_flush, int r); + + void flush_commit_position(Context* on_flush); + void handle_flush_commit_position(Context* on_flush, int r); + + bool update_mirror_image_status(bool force, const OptionalState &state); + bool start_mirror_image_status_update(bool force, bool restarting); + void finish_mirror_image_status_update(); + void queue_mirror_image_status_update(const OptionalState &state); + void send_mirror_status_update(const OptionalState &state); + void handle_mirror_status_update(int r); + void reschedule_update_status_task(int new_interval); + + void shut_down(int r); + void handle_shut_down(int r); + void handle_remote_journal_metadata_updated(); + + void prepare_local_image(); + void handle_prepare_local_image(int r); + + void prepare_remote_image(); + void 
handle_prepare_remote_image(int r); + + void bootstrap(); + void handle_bootstrap(int r); + + void init_remote_journaler(); + void handle_init_remote_journaler(int r); + + void start_replay(); + void handle_start_replay(int r); + + void replay_flush(); + void handle_replay_flush(int r); + + void get_remote_tag(); + void handle_get_remote_tag(int r); + + void allocate_local_tag(); + void handle_allocate_local_tag(int r); + + void preprocess_entry(); + void handle_preprocess_entry_ready(int r); + void handle_preprocess_entry_safe(int r); + + void process_entry(); + void handle_process_entry_ready(int r); + void handle_process_entry_safe(const ReplayEntry& replay_entry, + const utime_t &m_replay_start_time, int r); + + void register_admin_socket_hook(); + void unregister_admin_socket_hook(); + void reregister_admin_socket_hook(); +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::ImageReplayer<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_H diff --git a/src/tools/rbd_mirror/ImageSync.cc b/src/tools/rbd_mirror/ImageSync.cc new file mode 100644 index 00000000..929d75c2 --- /dev/null +++ b/src/tools/rbd_mirror/ImageSync.cc @@ -0,0 +1,481 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ImageSync.h" +#include "InstanceWatcher.h" +#include "ProgressContext.h" +#include "common/debug.h" +#include "common/Timer.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "librbd/DeepCopyRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/internal.h" +#include "librbd/journal/Types.h" +#include "tools/rbd_mirror/image_sync/SyncPointCreateRequest.h" +#include "tools/rbd_mirror/image_sync/SyncPointPruneRequest.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ImageSync: " \ + << this 
<< " " << __func__ + +namespace rbd { +namespace mirror { + +using namespace image_sync; +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; + +template <typename I> +class ImageSync<I>::ImageCopyProgressContext : public librbd::ProgressContext { +public: + ImageCopyProgressContext(ImageSync *image_sync) : image_sync(image_sync) { + } + + int update_progress(uint64_t object_no, uint64_t object_count) override { + image_sync->handle_copy_image_update_progress(object_no, object_count); + return 0; + } + + ImageSync *image_sync; +}; + +template <typename I> +ImageSync<I>::ImageSync(I *local_image_ctx, I *remote_image_ctx, + SafeTimer *timer, Mutex *timer_lock, + const std::string &mirror_uuid, Journaler *journaler, + MirrorPeerClientMeta *client_meta, + ContextWQ *work_queue, + InstanceWatcher<I> *instance_watcher, + Context *on_finish, ProgressContext *progress_ctx) + : BaseRequest("rbd::mirror::ImageSync", local_image_ctx->cct, on_finish), + m_local_image_ctx(local_image_ctx), m_remote_image_ctx(remote_image_ctx), + m_timer(timer), m_timer_lock(timer_lock), m_mirror_uuid(mirror_uuid), + m_journaler(journaler), m_client_meta(client_meta), + m_work_queue(work_queue), m_instance_watcher(instance_watcher), + m_progress_ctx(progress_ctx), + m_lock(unique_lock_name("ImageSync::m_lock", this)), + m_update_sync_point_interval(m_local_image_ctx->cct->_conf.template get_val<double>( + "rbd_mirror_sync_point_update_age")), m_client_meta_copy(*client_meta) { +} + +template <typename I> +ImageSync<I>::~ImageSync() { + ceph_assert(m_image_copy_request == nullptr); + ceph_assert(m_image_copy_prog_ctx == nullptr); + ceph_assert(m_update_sync_ctx == nullptr); +} + +template <typename I> +void ImageSync<I>::send() { + send_notify_sync_request(); +} + +template <typename I> +void ImageSync<I>::cancel() { + Mutex::Locker locker(m_lock); + + dout(10) << dendl; + + m_canceled = true; + + if 
(m_instance_watcher->cancel_sync_request(m_local_image_ctx->id)) { + return; + } + + if (m_image_copy_request != nullptr) { + m_image_copy_request->cancel(); + } +} + +template <typename I> +void ImageSync<I>::send_notify_sync_request() { + update_progress("NOTIFY_SYNC_REQUEST"); + + dout(10) << dendl; + + m_lock.Lock(); + if (m_canceled) { + m_lock.Unlock(); + BaseRequest::finish(-ECANCELED); + return; + } + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this)); + m_instance_watcher->notify_sync_request(m_local_image_ctx->id, ctx); + m_lock.Unlock(); +} + +template <typename I> +void ImageSync<I>::handle_notify_sync_request(int r) { + dout(10) << ": r=" << r << dendl; + + m_lock.Lock(); + if (r == 0 && m_canceled) { + r = -ECANCELED; + } + m_lock.Unlock(); + + if (r < 0) { + BaseRequest::finish(r); + return; + } + + send_prune_catch_up_sync_point(); +} + +template <typename I> +void ImageSync<I>::send_prune_catch_up_sync_point() { + update_progress("PRUNE_CATCH_UP_SYNC_POINT"); + + if (m_client_meta->sync_points.empty()) { + send_create_sync_point(); + return; + } + + dout(10) << dendl; + + // prune will remove sync points with missing snapshots and + // ensure we have a maximum of one sync point (in case we + // restarted) + Context *ctx = create_context_callback< + ImageSync<I>, &ImageSync<I>::handle_prune_catch_up_sync_point>(this); + SyncPointPruneRequest<I> *request = SyncPointPruneRequest<I>::create( + m_remote_image_ctx, false, m_journaler, m_client_meta, ctx); + request->send(); +} + +template <typename I> +void ImageSync<I>::handle_prune_catch_up_sync_point(int r) { + dout(10) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to prune catch-up sync point: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_create_sync_point(); +} + +template <typename I> +void ImageSync<I>::send_create_sync_point() { + 
update_progress("CREATE_SYNC_POINT"); + + // TODO: when support for disconnecting laggy clients is added, + // re-connect and create catch-up sync point + if (m_client_meta->sync_points.size() > 0) { + send_copy_image(); + return; + } + + dout(10) << dendl; + + Context *ctx = create_context_callback< + ImageSync<I>, &ImageSync<I>::handle_create_sync_point>(this); + SyncPointCreateRequest<I> *request = SyncPointCreateRequest<I>::create( + m_remote_image_ctx, m_mirror_uuid, m_journaler, m_client_meta, ctx); + request->send(); +} + +template <typename I> +void ImageSync<I>::handle_create_sync_point(int r) { + dout(10) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to create sync point: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + send_copy_image(); +} + +template <typename I> +void ImageSync<I>::send_copy_image() { + librados::snap_t snap_id_start = 0; + librados::snap_t snap_id_end; + librbd::deep_copy::ObjectNumber object_number; + int r = 0; + { + RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock); + ceph_assert(!m_client_meta->sync_points.empty()); + auto &sync_point = m_client_meta->sync_points.front(); + snap_id_end = m_remote_image_ctx->get_snap_id( + cls::rbd::UserSnapshotNamespace(), sync_point.snap_name); + if (snap_id_end == CEPH_NOSNAP) { + derr << ": failed to locate snapshot: " << sync_point.snap_name << dendl; + r = -ENOENT; + } else if (!sync_point.from_snap_name.empty()) { + snap_id_start = m_remote_image_ctx->get_snap_id( + cls::rbd::UserSnapshotNamespace(), sync_point.from_snap_name); + if (snap_id_start == CEPH_NOSNAP) { + derr << ": failed to locate from snapshot: " + << sync_point.from_snap_name << dendl; + r = -ENOENT; + } + } + object_number = sync_point.object_number; + } + if (r < 0) { + finish(r); + return; + } + + m_lock.Lock(); + if (m_canceled) { + m_lock.Unlock(); + finish(-ECANCELED); + return; + } + + dout(10) << dendl; + + Context *ctx = create_context_callback< + ImageSync<I>, 
&ImageSync<I>::handle_copy_image>(this); + m_image_copy_prog_ctx = new ImageCopyProgressContext(this); + m_image_copy_request = librbd::DeepCopyRequest<I>::create( + m_remote_image_ctx, m_local_image_ctx, snap_id_start, snap_id_end, + 0, false, object_number, m_work_queue, &m_client_meta->snap_seqs, + m_image_copy_prog_ctx, ctx); + m_image_copy_request->get(); + m_lock.Unlock(); + + update_progress("COPY_IMAGE"); + + m_image_copy_request->send(); +} + +template <typename I> +void ImageSync<I>::handle_copy_image(int r) { + dout(10) << ": r=" << r << dendl; + + { + Mutex::Locker timer_locker(*m_timer_lock); + Mutex::Locker locker(m_lock); + m_image_copy_request->put(); + m_image_copy_request = nullptr; + delete m_image_copy_prog_ctx; + m_image_copy_prog_ctx = nullptr; + if (r == 0 && m_canceled) { + r = -ECANCELED; + } + + if (m_update_sync_ctx != nullptr) { + m_timer->cancel_event(m_update_sync_ctx); + m_update_sync_ctx = nullptr; + } + + if (m_updating_sync_point) { + m_ret_val = r; + return; + } + } + + if (r == -ECANCELED) { + dout(10) << ": image copy canceled" << dendl; + finish(r); + return; + } else if (r < 0) { + derr << ": failed to copy image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_flush_sync_point(); +} + +template <typename I> +void ImageSync<I>::handle_copy_image_update_progress(uint64_t object_no, + uint64_t object_count) { + int percent = 100 * object_no / object_count; + update_progress("COPY_IMAGE " + stringify(percent) + "%"); + + Mutex::Locker locker(m_lock); + m_image_copy_object_no = object_no; + m_image_copy_object_count = object_count; + + if (m_update_sync_ctx == nullptr && !m_updating_sync_point) { + send_update_sync_point(); + } +} + +template <typename I> +void ImageSync<I>::send_update_sync_point() { + ceph_assert(m_lock.is_locked()); + + m_update_sync_ctx = nullptr; + + if (m_canceled) { + return; + } + + auto sync_point = &m_client_meta->sync_points.front(); + + if (m_client_meta->sync_object_count == 
m_image_copy_object_count && + sync_point->object_number && + (m_image_copy_object_no - 1) == sync_point->object_number.get()) { + // update sync point did not progress since last sync + return; + } + + m_updating_sync_point = true; + + m_client_meta_copy = *m_client_meta; + m_client_meta->sync_object_count = m_image_copy_object_count; + if (m_image_copy_object_no > 0) { + sync_point->object_number = m_image_copy_object_no - 1; + } + + CephContext *cct = m_local_image_ctx->cct; + ldout(cct, 20) << ": sync_point=" << *sync_point << dendl; + + bufferlist client_data_bl; + librbd::journal::ClientData client_data(*m_client_meta); + encode(client_data, client_data_bl); + + Context *ctx = create_context_callback< + ImageSync<I>, &ImageSync<I>::handle_update_sync_point>( + this); + m_journaler->update_client(client_data_bl, ctx); +} + +template <typename I> +void ImageSync<I>::handle_update_sync_point(int r) { + CephContext *cct = m_local_image_ctx->cct; + ldout(cct, 20) << ": r=" << r << dendl; + + if (r < 0) { + *m_client_meta = m_client_meta_copy; + lderr(cct) << ": failed to update client data: " << cpp_strerror(r) + << dendl; + } + + { + Mutex::Locker timer_locker(*m_timer_lock); + Mutex::Locker locker(m_lock); + m_updating_sync_point = false; + + if (m_image_copy_request != nullptr) { + m_update_sync_ctx = new FunctionContext( + [this](int r) { + Mutex::Locker locker(m_lock); + this->send_update_sync_point(); + }); + m_timer->add_event_after(m_update_sync_point_interval, + m_update_sync_ctx); + return; + } + } + + send_flush_sync_point(); +} + +template <typename I> +void ImageSync<I>::send_flush_sync_point() { + if (m_ret_val < 0) { + finish(m_ret_val); + return; + } + + update_progress("FLUSH_SYNC_POINT"); + + m_client_meta_copy = *m_client_meta; + m_client_meta->sync_object_count = m_image_copy_object_count; + auto sync_point = &m_client_meta->sync_points.front(); + if (m_image_copy_object_no > 0) { + sync_point->object_number = m_image_copy_object_no - 1; + } 
else { + sync_point->object_number = boost::none; + } + + dout(10) << ": sync_point=" << *sync_point << dendl; + + bufferlist client_data_bl; + librbd::journal::ClientData client_data(*m_client_meta); + encode(client_data, client_data_bl); + + Context *ctx = create_context_callback< + ImageSync<I>, &ImageSync<I>::handle_flush_sync_point>( + this); + m_journaler->update_client(client_data_bl, ctx); +} + +template <typename I> +void ImageSync<I>::handle_flush_sync_point(int r) { + dout(10) << ": r=" << r << dendl; + + if (r < 0) { + *m_client_meta = m_client_meta_copy; + + derr << ": failed to update client data: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + send_prune_sync_points(); +} + +template <typename I> +void ImageSync<I>::send_prune_sync_points() { + dout(10) << dendl; + + update_progress("PRUNE_SYNC_POINTS"); + + Context *ctx = create_context_callback< + ImageSync<I>, &ImageSync<I>::handle_prune_sync_points>(this); + SyncPointPruneRequest<I> *request = SyncPointPruneRequest<I>::create( + m_remote_image_ctx, true, m_journaler, m_client_meta, ctx); + request->send(); +} + +template <typename I> +void ImageSync<I>::handle_prune_sync_points(int r) { + dout(10) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to prune sync point: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_client_meta->sync_points.empty()) { + send_copy_image(); + return; + } + + finish(0); +} + +template <typename I> +void ImageSync<I>::update_progress(const std::string &description) { + dout(20) << ": " << description << dendl; + + if (m_progress_ctx) { + m_progress_ctx->update_progress("IMAGE_SYNC/" + description); + } +} + +template <typename I> +void ImageSync<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_instance_watcher->notify_sync_complete(m_local_image_ctx->id); + BaseRequest::finish(r); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::ImageSync<librbd::ImageCtx>; diff --git 
a/src/tools/rbd_mirror/ImageSync.h b/src/tools/rbd_mirror/ImageSync.h new file mode 100644 index 00000000..9e00c129 --- /dev/null +++ b/src/tools/rbd_mirror/ImageSync.h @@ -0,0 +1,160 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_SYNC_H +#define RBD_MIRROR_IMAGE_SYNC_H + +#include "include/int_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/journal/TypeTraits.h" +#include "librbd/journal/Types.h" +#include "common/Mutex.h" +#include "tools/rbd_mirror/BaseRequest.h" +#include <map> +#include <vector> + +class Context; +class ContextWQ; +namespace journal { class Journaler; } +namespace librbd { class ProgressContext; } +namespace librbd { template <typename> class DeepCopyRequest; } +namespace librbd { namespace journal { struct MirrorPeerClientMeta; } } + +namespace rbd { +namespace mirror { + +class ProgressContext; + +template <typename> class InstanceWatcher; + +template <typename ImageCtxT = librbd::ImageCtx> +class ImageSync : public BaseRequest { +public: + typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta; + + static ImageSync* create(ImageCtxT *local_image_ctx, + ImageCtxT *remote_image_ctx, + SafeTimer *timer, Mutex *timer_lock, + const std::string &mirror_uuid, + Journaler *journaler, + MirrorPeerClientMeta *client_meta, + ContextWQ *work_queue, + InstanceWatcher<ImageCtxT> *instance_watcher, + Context *on_finish, + ProgressContext *progress_ctx = nullptr) { + return new ImageSync(local_image_ctx, remote_image_ctx, timer, timer_lock, + mirror_uuid, journaler, client_meta, work_queue, + instance_watcher, on_finish, progress_ctx); + } + + ImageSync(ImageCtxT *local_image_ctx, ImageCtxT *remote_image_ctx, + SafeTimer *timer, Mutex *timer_lock, const std::string &mirror_uuid, + Journaler *journaler, MirrorPeerClientMeta *client_meta, + 
ContextWQ *work_queue, InstanceWatcher<ImageCtxT> *instance_watcher, + Context *on_finish, ProgressContext *progress_ctx = nullptr); + ~ImageSync() override; + + void send() override; + void cancel() override; + +protected: + void finish(int r) override; + +private: + /** + * @verbatim + * + * <start> + * | + * v + * NOTIFY_SYNC_REQUEST + * | + * v + * PRUNE_CATCH_UP_SYNC_POINT + * | + * v + * CREATE_SYNC_POINT (skip if already exists and + * | not disconnected) + * v + * COPY_IMAGE . . . . . . . . . . . . . . + * | . + * v . + * FLUSH_SYNC_POINT . + * | . (image sync canceled) + * v . + * PRUNE_SYNC_POINTS . + * | . + * v . + * <finish> < . . . . . . . . . . . . . . + * + * @endverbatim + */ + + typedef std::vector<librados::snap_t> SnapIds; + typedef std::map<librados::snap_t, SnapIds> SnapMap; + class ImageCopyProgressContext; + + ImageCtxT *m_local_image_ctx; + ImageCtxT *m_remote_image_ctx; + SafeTimer *m_timer; + Mutex *m_timer_lock; + std::string m_mirror_uuid; + Journaler *m_journaler; + MirrorPeerClientMeta *m_client_meta; + ContextWQ *m_work_queue; + InstanceWatcher<ImageCtxT> *m_instance_watcher; + ProgressContext *m_progress_ctx; + + SnapMap m_snap_map; + + Mutex m_lock; + bool m_canceled = false; + + librbd::DeepCopyRequest<ImageCtxT> *m_image_copy_request = nullptr; + librbd::ProgressContext *m_image_copy_prog_ctx = nullptr; + + bool m_updating_sync_point = false; + Context *m_update_sync_ctx = nullptr; + double m_update_sync_point_interval; + uint64_t m_image_copy_object_no = 0; + uint64_t m_image_copy_object_count = 0; + MirrorPeerClientMeta m_client_meta_copy; + + int m_ret_val = 0; + + void send_notify_sync_request(); + void handle_notify_sync_request(int r); + + void send_prune_catch_up_sync_point(); + void handle_prune_catch_up_sync_point(int r); + + void send_create_sync_point(); + void handle_create_sync_point(int r); + + void send_update_max_object_count(); + void handle_update_max_object_count(int r); + + void send_copy_image(); + void 
handle_copy_image(int r); + void handle_copy_image_update_progress(uint64_t object_no, + uint64_t object_count); + void send_update_sync_point(); + void handle_update_sync_point(int r); + + void send_flush_sync_point(); + void handle_flush_sync_point(int r); + + void send_prune_sync_points(); + void handle_prune_sync_points(int r); + + void update_progress(const std::string &description); +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::ImageSync<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_SYNC_H diff --git a/src/tools/rbd_mirror/ImageSyncThrottler.cc b/src/tools/rbd_mirror/ImageSyncThrottler.cc new file mode 100644 index 00000000..b395a012 --- /dev/null +++ b/src/tools/rbd_mirror/ImageSyncThrottler.cc @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "ImageSyncThrottler.h" +#include "common/Formatter.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ImageSyncThrottler:: " << this \ + << " " << __func__ << ": " + +namespace rbd { +namespace mirror { + +template <typename I> +ImageSyncThrottler<I>::ImageSyncThrottler(CephContext *cct) + : m_cct(cct), + m_lock(librbd::util::unique_lock_name("rbd::mirror::ImageSyncThrottler", + this)), + m_max_concurrent_syncs(cct->_conf.get_val<uint64_t>( + "rbd_mirror_concurrent_image_syncs")) { + dout(20) << "max_concurrent_syncs=" << m_max_concurrent_syncs << dendl; + m_cct->_conf.add_observer(this); +} + +template <typename I> +ImageSyncThrottler<I>::~ImageSyncThrottler() { + m_cct->_conf.remove_observer(this); + + Mutex::Locker locker(m_lock); + ceph_assert(m_inflight_ops.empty()); + ceph_assert(m_queue.empty()); +} + +template <typename I> +void ImageSyncThrottler<I>::start_op(const std::string &id, Context *on_start) { + dout(20) << "id=" << id << dendl; + + int r = 0; + { + Mutex::Locker locker(m_lock); + + if (m_inflight_ops.count(id) > 0) { + dout(20) << "duplicate for already started op " << id << dendl; + } else if (m_queued_ops.count(id) > 0) { + dout(20) << "duplicate for already queued op " << id << dendl; + std::swap(m_queued_ops[id], on_start); + r = -ENOENT; + } else if (m_max_concurrent_syncs == 0 || + m_inflight_ops.size() < m_max_concurrent_syncs) { + ceph_assert(m_queue.empty()); + m_inflight_ops.insert(id); + dout(20) << "ready to start sync for " << id << " [" + << m_inflight_ops.size() << "/" << m_max_concurrent_syncs << "]" + << dendl; + } else { + m_queue.push_back(id); + std::swap(m_queued_ops[id], on_start); + dout(20) << "image sync for " << id << " has been queued" << dendl; + } + } + + if (on_start != nullptr) { + on_start->complete(r); + } +} + 
+template <typename I> +bool ImageSyncThrottler<I>::cancel_op(const std::string &id) { + dout(20) << "id=" << id << dendl; + + Context *on_start = nullptr; + { + Mutex::Locker locker(m_lock); + auto it = m_queued_ops.find(id); + if (it != m_queued_ops.end()) { + dout(20) << "canceled queued sync for " << id << dendl; + m_queue.remove(id); + on_start = it->second; + m_queued_ops.erase(it); + } + } + + if (on_start == nullptr) { + return false; + } + + on_start->complete(-ECANCELED); + return true; +} + +template <typename I> +void ImageSyncThrottler<I>::finish_op(const std::string &id) { + dout(20) << "id=" << id << dendl; + + if (cancel_op(id)) { + return; + } + + Context *on_start = nullptr; + { + Mutex::Locker locker(m_lock); + + m_inflight_ops.erase(id); + + if (m_inflight_ops.size() < m_max_concurrent_syncs && !m_queue.empty()) { + auto id = m_queue.front(); + auto it = m_queued_ops.find(id); + ceph_assert(it != m_queued_ops.end()); + m_inflight_ops.insert(id); + dout(20) << "ready to start sync for " << id << " [" + << m_inflight_ops.size() << "/" << m_max_concurrent_syncs << "]" + << dendl; + on_start = it->second; + m_queued_ops.erase(it); + m_queue.pop_front(); + } + } + + if (on_start != nullptr) { + on_start->complete(0); + } +} + +template <typename I> +void ImageSyncThrottler<I>::drain(int r) { + dout(20) << dendl; + + std::map<std::string, Context *> queued_ops; + { + Mutex::Locker locker(m_lock); + std::swap(m_queued_ops, queued_ops); + m_queue.clear(); + m_inflight_ops.clear(); + } + + for (auto &it : queued_ops) { + it.second->complete(r); + } +} + +template <typename I> +void ImageSyncThrottler<I>::set_max_concurrent_syncs(uint32_t max) { + dout(20) << "max=" << max << dendl; + + std::list<Context *> ops; + { + Mutex::Locker locker(m_lock); + m_max_concurrent_syncs = max; + + // Start waiting ops in the case of available free slots + while ((m_max_concurrent_syncs == 0 || + m_inflight_ops.size() < m_max_concurrent_syncs) && + !m_queue.empty()) { + 
auto id = m_queue.front(); + m_inflight_ops.insert(id); + dout(20) << "ready to start sync for " << id << " [" + << m_inflight_ops.size() << "/" << m_max_concurrent_syncs << "]" + << dendl; + auto it = m_queued_ops.find(id); + ceph_assert(it != m_queued_ops.end()); + ops.push_back(it->second); + m_queued_ops.erase(it); + m_queue.pop_front(); + } + } + + for (const auto& ctx : ops) { + ctx->complete(0); + } +} + +template <typename I> +void ImageSyncThrottler<I>::print_status(Formatter *f, std::stringstream *ss) { + dout(20) << dendl; + + Mutex::Locker locker(m_lock); + + if (f) { + f->dump_int("max_parallel_syncs", m_max_concurrent_syncs); + f->dump_int("running_syncs", m_inflight_ops.size()); + f->dump_int("waiting_syncs", m_queue.size()); + f->flush(*ss); + } else { + *ss << "[ "; + *ss << "max_parallel_syncs=" << m_max_concurrent_syncs << ", "; + *ss << "running_syncs=" << m_inflight_ops.size() << ", "; + *ss << "waiting_syncs=" << m_queue.size() << " ]"; + } +} + +template <typename I> +const char** ImageSyncThrottler<I>::get_tracked_conf_keys() const { + static const char* KEYS[] = { + "rbd_mirror_concurrent_image_syncs", + NULL + }; + return KEYS; +} + +template <typename I> +void ImageSyncThrottler<I>::handle_conf_change(const ConfigProxy& conf, + const set<string> &changed) { + if (changed.count("rbd_mirror_concurrent_image_syncs")) { + set_max_concurrent_syncs(conf.get_val<uint64_t>("rbd_mirror_concurrent_image_syncs")); + } +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::ImageSyncThrottler<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/ImageSyncThrottler.h b/src/tools/rbd_mirror/ImageSyncThrottler.h new file mode 100644 index 00000000..c0cda61e --- /dev/null +++ b/src/tools/rbd_mirror/ImageSyncThrottler.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_SYNC_THROTTLER_H +#define RBD_MIRROR_IMAGE_SYNC_THROTTLER_H + +#include 
<list> +#include <map> +#include <set> +#include <sstream> +#include <string> +#include <utility> + +#include "common/Mutex.h" +#include "common/config_obs.h" + +class CephContext; +class Context; + +namespace ceph { class Formatter; } +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename ImageCtxT = librbd::ImageCtx> +class ImageSyncThrottler : public md_config_obs_t { +public: + static ImageSyncThrottler *create(CephContext *cct) { + return new ImageSyncThrottler(cct); + } + void destroy() { + delete this; + } + + ImageSyncThrottler(CephContext *cct); + ~ImageSyncThrottler() override; + + void set_max_concurrent_syncs(uint32_t max); + void start_op(const std::string &id, Context *on_start); + bool cancel_op(const std::string &id); + void finish_op(const std::string &id); + void drain(int r); + + void print_status(Formatter *f, std::stringstream *ss); + +private: + CephContext *m_cct; + Mutex m_lock; + uint32_t m_max_concurrent_syncs; + std::list<std::string> m_queue; + std::map<std::string, Context *> m_queued_ops; + std::set<std::string> m_inflight_ops; + + const char **get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) override; +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::ImageSyncThrottler<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_SYNC_THROTTLER_H diff --git a/src/tools/rbd_mirror/InstanceReplayer.cc b/src/tools/rbd_mirror/InstanceReplayer.cc new file mode 100644 index 00000000..c0086a48 --- /dev/null +++ b/src/tools/rbd_mirror/InstanceReplayer.cc @@ -0,0 +1,510 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/stringify.h" +#include "common/Timer.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "ImageReplayer.h" +#include "InstanceReplayer.h" +#include "ServiceDaemon.h" 
+#include "Threads.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::InstanceReplayer: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { + +namespace { + +const std::string SERVICE_DAEMON_ASSIGNED_COUNT_KEY("image_assigned_count"); +const std::string SERVICE_DAEMON_WARNING_COUNT_KEY("image_warning_count"); +const std::string SERVICE_DAEMON_ERROR_COUNT_KEY("image_error_count"); + +} // anonymous namespace + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template <typename I> +InstanceReplayer<I>::InstanceReplayer( + Threads<I> *threads, ServiceDaemon<I>* service_daemon, + RadosRef local_rados, const std::string &local_mirror_uuid, + int64_t local_pool_id) + : m_threads(threads), m_service_daemon(service_daemon), + m_local_rados(local_rados), m_local_mirror_uuid(local_mirror_uuid), + m_local_pool_id(local_pool_id), + m_lock("rbd::mirror::InstanceReplayer " + stringify(local_pool_id)) { +} + +template <typename I> +InstanceReplayer<I>::~InstanceReplayer() { + ceph_assert(m_image_state_check_task == nullptr); + ceph_assert(m_async_op_tracker.empty()); + ceph_assert(m_image_replayers.empty()); +} + +template <typename I> +bool InstanceReplayer<I>::is_blacklisted() const { + std::lock_guard locker{m_lock}; + return m_blacklisted; +} + +template <typename I> +int InstanceReplayer<I>::init() { + C_SaferCond init_ctx; + init(&init_ctx); + return init_ctx.wait(); +} + +template <typename I> +void InstanceReplayer<I>::init(Context *on_finish) { + dout(10) << dendl; + + Context *ctx = new FunctionContext( + [this, on_finish] (int r) { + { + Mutex::Locker timer_locker(m_threads->timer_lock); + schedule_image_state_check_task(); + } + on_finish->complete(0); + }); + + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void InstanceReplayer<I>::shut_down() { + C_SaferCond 
shut_down_ctx; + shut_down(&shut_down_ctx); + int r = shut_down_ctx.wait(); + ceph_assert(r == 0); +} + +template <typename I> +void InstanceReplayer<I>::shut_down(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_shut_down == nullptr); + m_on_shut_down = on_finish; + + Context *ctx = new FunctionContext( + [this] (int r) { + cancel_image_state_check_task(); + wait_for_ops(); + }); + + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void InstanceReplayer<I>::add_peer(std::string peer_uuid, + librados::IoCtx io_ctx) { + dout(10) << peer_uuid << dendl; + + Mutex::Locker locker(m_lock); + auto result = m_peers.insert(Peer(peer_uuid, io_ctx)).second; + ceph_assert(result); +} + +template <typename I> +void InstanceReplayer<I>::release_all(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + C_Gather *gather_ctx = new C_Gather(g_ceph_context, on_finish); + for (auto it = m_image_replayers.begin(); it != m_image_replayers.end(); + it = m_image_replayers.erase(it)) { + auto image_replayer = it->second; + auto ctx = gather_ctx->new_sub(); + ctx = new FunctionContext( + [image_replayer, ctx] (int r) { + image_replayer->destroy(); + ctx->complete(0); + }); + stop_image_replayer(image_replayer, ctx); + } + gather_ctx->activate(); +} + +template <typename I> +void InstanceReplayer<I>::acquire_image(InstanceWatcher<I> *instance_watcher, + const std::string &global_image_id, + Context *on_finish) { + dout(10) << "global_image_id=" << global_image_id << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_shut_down == nullptr); + + auto it = m_image_replayers.find(global_image_id); + if (it == m_image_replayers.end()) { + auto image_replayer = ImageReplayer<I>::create( + m_threads, instance_watcher, m_local_rados, + m_local_mirror_uuid, m_local_pool_id, global_image_id); + + dout(10) << global_image_id << ": creating replayer " << image_replayer + << dendl; + + it = 
m_image_replayers.insert(std::make_pair(global_image_id, + image_replayer)).first; + + // TODO only a single peer is currently supported + ceph_assert(m_peers.size() == 1); + auto peer = *m_peers.begin(); + image_replayer->add_peer(peer.peer_uuid, peer.io_ctx); + start_image_replayer(image_replayer); + } else { + // A duplicate acquire notification implies (1) connection hiccup or + // (2) new leader election. For the second case, restart the replayer to + // detect if the image has been deleted while the leader was offline + auto& image_replayer = it->second; + image_replayer->set_finished(false); + image_replayer->restart(); + } + + m_threads->work_queue->queue(on_finish, 0); +} + +template <typename I> +void InstanceReplayer<I>::release_image(const std::string &global_image_id, + Context *on_finish) { + dout(10) << "global_image_id=" << global_image_id << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_shut_down == nullptr); + + auto it = m_image_replayers.find(global_image_id); + if (it == m_image_replayers.end()) { + dout(5) << global_image_id << ": not found" << dendl; + m_threads->work_queue->queue(on_finish, 0); + return; + } + + auto image_replayer = it->second; + m_image_replayers.erase(it); + + on_finish = new FunctionContext( + [image_replayer, on_finish] (int r) { + image_replayer->destroy(); + on_finish->complete(0); + }); + stop_image_replayer(image_replayer, on_finish); +} + +template <typename I> +void InstanceReplayer<I>::remove_peer_image(const std::string &global_image_id, + const std::string &peer_mirror_uuid, + Context *on_finish) { + dout(10) << "global_image_id=" << global_image_id << ", " + << "peer_mirror_uuid=" << peer_mirror_uuid << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_shut_down == nullptr); + + auto it = m_image_replayers.find(global_image_id); + if (it != m_image_replayers.end()) { + // TODO only a single peer is currently supported, therefore + // we can just interrupt the current image replayer and 
+ // it will eventually detect that the peer image is missing and + // determine if a delete propagation is required. + auto image_replayer = it->second; + image_replayer->restart(); + } + m_threads->work_queue->queue(on_finish, 0); +} + +template <typename I> +void InstanceReplayer<I>::print_status(Formatter *f, stringstream *ss) { + dout(10) << dendl; + + if (!f) { + return; + } + + Mutex::Locker locker(m_lock); + + f->open_array_section("image_replayers"); + for (auto &kv : m_image_replayers) { + auto &image_replayer = kv.second; + image_replayer->print_status(f, ss); + } + f->close_section(); +} + +template <typename I> +void InstanceReplayer<I>::start() +{ + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + m_manual_stop = false; + + for (auto &kv : m_image_replayers) { + auto &image_replayer = kv.second; + image_replayer->start(nullptr, true); + } +} + +template <typename I> +void InstanceReplayer<I>::stop() +{ + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + m_manual_stop = true; + + for (auto &kv : m_image_replayers) { + auto &image_replayer = kv.second; + image_replayer->stop(nullptr, true); + } +} + +template <typename I> +void InstanceReplayer<I>::restart() +{ + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + m_manual_stop = false; + + for (auto &kv : m_image_replayers) { + auto &image_replayer = kv.second; + image_replayer->restart(); + } +} + +template <typename I> +void InstanceReplayer<I>::flush() +{ + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + for (auto &kv : m_image_replayers) { + auto &image_replayer = kv.second; + image_replayer->flush(); + } +} + +template <typename I> +void InstanceReplayer<I>::start_image_replayer( + ImageReplayer<I> *image_replayer) { + ceph_assert(m_lock.is_locked()); + + std::string global_image_id = image_replayer->get_global_image_id(); + if (!image_replayer->is_stopped()) { + return; + } else if (image_replayer->is_blacklisted()) { + derr << "global_image_id=" << global_image_id << 
": blacklisted detected " + << "during image replay" << dendl; + m_blacklisted = true; + return; + } else if (image_replayer->is_finished()) { + // TODO temporary until policy integrated + dout(5) << "removing image replayer for global_image_id=" + << global_image_id << dendl; + m_image_replayers.erase(image_replayer->get_global_image_id()); + image_replayer->destroy(); + return; + } else if (m_manual_stop) { + return; + } + + dout(10) << "global_image_id=" << global_image_id << dendl; + image_replayer->start(nullptr, false); +} + +template <typename I> +void InstanceReplayer<I>::queue_start_image_replayers() { + dout(10) << dendl; + + Context *ctx = create_context_callback< + InstanceReplayer, &InstanceReplayer<I>::start_image_replayers>(this); + m_async_op_tracker.start_op(); + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void InstanceReplayer<I>::start_image_replayers(int r) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + if (m_on_shut_down != nullptr) { + return; + } + + uint64_t image_count = 0; + uint64_t warning_count = 0; + uint64_t error_count = 0; + for (auto it = m_image_replayers.begin(); + it != m_image_replayers.end();) { + auto current_it(it); + ++it; + + ++image_count; + auto health_state = current_it->second->get_health_state(); + if (health_state == image_replayer::HEALTH_STATE_WARNING) { + ++warning_count; + } else if (health_state == image_replayer::HEALTH_STATE_ERROR) { + ++error_count; + } + + start_image_replayer(current_it->second); + } + + m_service_daemon->add_or_update_attribute( + m_local_pool_id, SERVICE_DAEMON_ASSIGNED_COUNT_KEY, image_count); + m_service_daemon->add_or_update_attribute( + m_local_pool_id, SERVICE_DAEMON_WARNING_COUNT_KEY, warning_count); + m_service_daemon->add_or_update_attribute( + m_local_pool_id, SERVICE_DAEMON_ERROR_COUNT_KEY, error_count); + + m_async_op_tracker.finish_op(); +} + +template <typename I> +void InstanceReplayer<I>::stop_image_replayer(ImageReplayer<I> 
*image_replayer, + Context *on_finish) { + dout(10) << image_replayer << " global_image_id=" + << image_replayer->get_global_image_id() << ", on_finish=" + << on_finish << dendl; + + if (image_replayer->is_stopped()) { + m_threads->work_queue->queue(on_finish, 0); + return; + } + + m_async_op_tracker.start_op(); + Context *ctx = create_async_context_callback( + m_threads->work_queue, new FunctionContext( + [this, image_replayer, on_finish] (int r) { + stop_image_replayer(image_replayer, on_finish); + m_async_op_tracker.finish_op(); + })); + + if (image_replayer->is_running()) { + image_replayer->stop(ctx, false); + } else { + int after = 1; + dout(10) << "scheduling image replayer " << image_replayer << " stop after " + << after << " sec (task " << ctx << ")" << dendl; + ctx = new FunctionContext( + [this, after, ctx] (int r) { + Mutex::Locker timer_locker(m_threads->timer_lock); + m_threads->timer->add_event_after(after, ctx); + }); + m_threads->work_queue->queue(ctx, 0); + } +} + +template <typename I> +void InstanceReplayer<I>::wait_for_ops() { + dout(10) << dendl; + + Context *ctx = create_context_callback< + InstanceReplayer, &InstanceReplayer<I>::handle_wait_for_ops>(this); + + m_async_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void InstanceReplayer<I>::handle_wait_for_ops(int r) { + dout(10) << "r=" << r << dendl; + + ceph_assert(r == 0); + + Mutex::Locker locker(m_lock); + stop_image_replayers(); +} + +template <typename I> +void InstanceReplayer<I>::stop_image_replayers() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_threads->work_queue, create_context_callback<InstanceReplayer<I>, + &InstanceReplayer<I>::handle_stop_image_replayers>(this)); + + C_Gather *gather_ctx = new C_Gather(g_ceph_context, ctx); + for (auto &it : m_image_replayers) { + stop_image_replayer(it.second, gather_ctx->new_sub()); + } + gather_ctx->activate(); +} + +template <typename I> +void 
InstanceReplayer<I>::handle_stop_image_replayers(int r) { + dout(10) << "r=" << r << dendl; + + ceph_assert(r == 0); + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + + for (auto &it : m_image_replayers) { + ceph_assert(it.second->is_stopped()); + it.second->destroy(); + } + m_image_replayers.clear(); + + ceph_assert(m_on_shut_down != nullptr); + std::swap(on_finish, m_on_shut_down); + } + on_finish->complete(r); +} + +template <typename I> +void InstanceReplayer<I>::cancel_image_state_check_task() { + Mutex::Locker timer_locker(m_threads->timer_lock); + + if (m_image_state_check_task == nullptr) { + return; + } + + dout(10) << m_image_state_check_task << dendl; + bool canceled = m_threads->timer->cancel_event(m_image_state_check_task); + ceph_assert(canceled); + m_image_state_check_task = nullptr; +} + +template <typename I> +void InstanceReplayer<I>::schedule_image_state_check_task() { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_image_state_check_task == nullptr); + + m_image_state_check_task = new FunctionContext( + [this](int r) { + ceph_assert(m_threads->timer_lock.is_locked()); + m_image_state_check_task = nullptr; + schedule_image_state_check_task(); + queue_start_image_replayers(); + }); + + auto cct = static_cast<CephContext *>(m_local_rados->cct()); + int after = cct->_conf.get_val<uint64_t>( + "rbd_mirror_image_state_check_interval"); + + dout(10) << "scheduling image state check after " << after << " sec (task " + << m_image_state_check_task << ")" << dendl; + m_threads->timer->add_event_after(after, m_image_state_check_task); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::InstanceReplayer<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/InstanceReplayer.h b/src/tools/rbd_mirror/InstanceReplayer.h new file mode 100644 index 00000000..efbdde02 --- /dev/null +++ b/src/tools/rbd_mirror/InstanceReplayer.h @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_INSTANCE_REPLAYER_H +#define RBD_MIRROR_INSTANCE_REPLAYER_H + +#include <map> +#include <sstream> + +#include "common/AsyncOpTracker.h" +#include "common/Formatter.h" +#include "common/Mutex.h" +#include "tools/rbd_mirror/Types.h" + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> class ImageReplayer; +template <typename> class InstanceWatcher; +template <typename> class ServiceDaemon; +template <typename> struct Threads; + +template <typename ImageCtxT = librbd::ImageCtx> +class InstanceReplayer { +public: + static InstanceReplayer* create( + Threads<ImageCtxT> *threads, + ServiceDaemon<ImageCtxT>* service_daemon, + RadosRef local_rados, const std::string &local_mirror_uuid, + int64_t local_pool_id) { + return new InstanceReplayer(threads, service_daemon, local_rados, + local_mirror_uuid, local_pool_id); + } + void destroy() { + delete this; + } + + InstanceReplayer(Threads<ImageCtxT> *threads, + ServiceDaemon<ImageCtxT>* service_daemon, + RadosRef local_rados, const std::string &local_mirror_uuid, + int64_t local_pool_id); + ~InstanceReplayer(); + + bool is_blacklisted() const; + + int init(); + void shut_down(); + + void init(Context *on_finish); + void shut_down(Context *on_finish); + + void add_peer(std::string peer_uuid, librados::IoCtx io_ctx); + + void acquire_image(InstanceWatcher<ImageCtxT> *instance_watcher, + const std::string &global_image_id, Context *on_finish); + void release_image(const std::string &global_image_id, Context *on_finish); + void remove_peer_image(const std::string &global_image_id, + const std::string &peer_mirror_uuid, + Context *on_finish); + + void release_all(Context *on_finish); + + void print_status(Formatter *f, stringstream *ss); + void start(); + void stop(); + void restart(); + void flush(); + +private: + /** + * @verbatim + * + * <uninitialized> <-------------------\ + * | (init) | (repeat for each + 
* v STOP_IMAGE_REPLAYER ---\ image replayer) + * SCHEDULE_IMAGE_STATE_CHECK_TASK ^ ^ | + * | | | | + * v (shut_down) | \---------/ + * <initialized> -----------------> WAIT_FOR_OPS + * + * @endverbatim + */ + + Threads<ImageCtxT> *m_threads; + ServiceDaemon<ImageCtxT>* m_service_daemon; + RadosRef m_local_rados; + std::string m_local_mirror_uuid; + int64_t m_local_pool_id; + + mutable Mutex m_lock; + AsyncOpTracker m_async_op_tracker; + std::map<std::string, ImageReplayer<ImageCtxT> *> m_image_replayers; + Peers m_peers; + Context *m_image_state_check_task = nullptr; + Context *m_on_shut_down = nullptr; + bool m_manual_stop = false; + bool m_blacklisted = false; + + void wait_for_ops(); + void handle_wait_for_ops(int r); + + void start_image_replayer(ImageReplayer<ImageCtxT> *image_replayer); + void queue_start_image_replayers(); + void start_image_replayers(int r); + + void stop_image_replayer(ImageReplayer<ImageCtxT> *image_replayer, + Context *on_finish); + + void stop_image_replayers(); + void handle_stop_image_replayers(int r); + + void schedule_image_state_check_task(); + void cancel_image_state_check_task(); +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::InstanceReplayer<librbd::ImageCtx>; + +#endif // RBD_MIRROR_INSTANCE_REPLAYER_H diff --git a/src/tools/rbd_mirror/InstanceWatcher.cc b/src/tools/rbd_mirror/InstanceWatcher.cc new file mode 100644 index 00000000..d9e1ba23 --- /dev/null +++ b/src/tools/rbd_mirror/InstanceWatcher.cc @@ -0,0 +1,1299 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "InstanceWatcher.h" +#include "include/stringify.h" +#include "common/debug.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ManagedLock.h" +#include "librbd/Utils.h" +#include "InstanceReplayer.h" +#include "ImageSyncThrottler.h" +#include "common/Cond.h" + +#define dout_context g_ceph_context +#define dout_subsys 
ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::InstanceWatcher: " + +namespace rbd { +namespace mirror { + +using namespace instance_watcher; + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; +using librbd::util::unique_lock_name; + +namespace { + +struct C_GetInstances : public Context { + std::vector<std::string> *instance_ids; + Context *on_finish; + bufferlist out_bl; + + C_GetInstances(std::vector<std::string> *instance_ids, Context *on_finish) + : instance_ids(instance_ids), on_finish(on_finish) { + } + + void finish(int r) override { + dout(10) << "C_GetInstances: " << this << " " << __func__ << ": r=" << r + << dendl; + + if (r == 0) { + auto it = out_bl.cbegin(); + r = librbd::cls_client::mirror_instances_list_finish(&it, instance_ids); + } else if (r == -ENOENT) { + r = 0; + } + on_finish->complete(r); + } +}; + +template <typename I> +struct C_RemoveInstanceRequest : public Context { + InstanceWatcher<I> instance_watcher; + Context *on_finish; + + C_RemoveInstanceRequest(librados::IoCtx &io_ctx, ContextWQ *work_queue, + const std::string &instance_id, Context *on_finish) + : instance_watcher(io_ctx, work_queue, nullptr, instance_id), + on_finish(on_finish) { + } + + void send() { + dout(10) << "C_RemoveInstanceRequest: " << this << " " << __func__ << dendl; + + instance_watcher.remove(this); + } + + void finish(int r) override { + dout(10) << "C_RemoveInstanceRequest: " << this << " " << __func__ << ": r=" + << r << dendl; + ceph_assert(r == 0); + + on_finish->complete(r); + } +}; + +} // anonymous namespace + +template <typename I> +struct InstanceWatcher<I>::C_NotifyInstanceRequest : public Context { + InstanceWatcher<I> *instance_watcher; + std::string instance_id; + uint64_t request_id; + bufferlist bl; + Context *on_finish; + bool send_to_leader; + std::unique_ptr<librbd::watcher::Notifier> notifier; + 
librbd::watcher::NotifyResponse response; + bool canceling = false; + + C_NotifyInstanceRequest(InstanceWatcher<I> *instance_watcher, + const std::string &instance_id, uint64_t request_id, + bufferlist &&bl, Context *on_finish) + : instance_watcher(instance_watcher), instance_id(instance_id), + request_id(request_id), bl(bl), on_finish(on_finish), + send_to_leader(instance_id.empty()) { + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": instance_watcher=" << instance_watcher << ", instance_id=" + << instance_id << ", request_id=" << request_id << dendl; + + ceph_assert(instance_watcher->m_lock.is_locked()); + + if (!send_to_leader) { + ceph_assert((!instance_id.empty())); + notifier.reset(new librbd::watcher::Notifier( + instance_watcher->m_work_queue, + instance_watcher->m_ioctx, + RBD_MIRROR_INSTANCE_PREFIX + instance_id)); + } + + instance_watcher->m_notify_op_tracker.start_op(); + auto result = instance_watcher->m_notify_ops.insert( + std::make_pair(instance_id, this)).second; + ceph_assert(result); + } + + void send() { + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << dendl; + + ceph_assert(instance_watcher->m_lock.is_locked()); + + if (canceling) { + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": canceling" << dendl; + instance_watcher->m_work_queue->queue(this, -ECANCELED); + return; + } + + if (send_to_leader) { + if (instance_watcher->m_leader_instance_id.empty()) { + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": suspending" << dendl; + instance_watcher->suspend_notify_request(this); + return; + } + + if (instance_watcher->m_leader_instance_id != instance_id) { + auto count = instance_watcher->m_notify_ops.erase( + std::make_pair(instance_id, this)); + ceph_assert(count > 0); + + instance_id = instance_watcher->m_leader_instance_id; + + auto result = instance_watcher->m_notify_ops.insert( + std::make_pair(instance_id, this)).second; + 
ceph_assert(result); + + notifier.reset(new librbd::watcher::Notifier( + instance_watcher->m_work_queue, + instance_watcher->m_ioctx, + RBD_MIRROR_INSTANCE_PREFIX + instance_id)); + } + } + + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": sending to " << instance_id << dendl; + notifier->notify(bl, &response, this); + } + + void cancel() { + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << dendl; + + ceph_assert(instance_watcher->m_lock.is_locked()); + + canceling = true; + instance_watcher->unsuspend_notify_request(this); + } + + void finish(int r) override { + dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << ": r=" + << r << dendl; + + if (r == 0 || r == -ETIMEDOUT) { + bool found = false; + for (auto &it : response.acks) { + auto &bl = it.second; + if (it.second.length() == 0) { + dout(5) << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": no payload in ack, ignoring" << dendl; + continue; + } + try { + auto iter = bl.cbegin(); + NotifyAckPayload ack; + decode(ack, iter); + if (ack.instance_id != instance_watcher->get_instance_id()) { + derr << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": ack instance_id (" << ack.instance_id << ") " + << "does not match, ignoring" << dendl; + continue; + } + if (ack.request_id != request_id) { + derr << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": ack request_id (" << ack.request_id << ") " + << "does not match, ignoring" << dendl; + continue; + } + r = ack.ret_val; + found = true; + break; + } catch (const buffer::error &err) { + derr << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": failed to decode ack: " << err.what() << dendl; + continue; + } + } + + if (!found) { + if (r == -ETIMEDOUT) { + derr << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": resending after timeout" << dendl; + Mutex::Locker locker(instance_watcher->m_lock); + send(); + return; + } else { + r = -EINVAL; 
+ } + } else { + if (r == -ESTALE && send_to_leader) { + derr << "C_NotifyInstanceRequest: " << this << " " << __func__ + << ": resending due to leader change" << dendl; + Mutex::Locker locker(instance_watcher->m_lock); + send(); + return; + } + } + } + + on_finish->complete(r); + + { + Mutex::Locker locker(instance_watcher->m_lock); + auto result = instance_watcher->m_notify_ops.erase( + std::make_pair(instance_id, this)); + ceph_assert(result > 0); + instance_watcher->m_notify_op_tracker.finish_op(); + } + + delete this; + } + + void complete(int r) override { + finish(r); + } +}; + +template <typename I> +struct InstanceWatcher<I>::C_SyncRequest : public Context { + InstanceWatcher<I> *instance_watcher; + std::string sync_id; + Context *on_start; + Context *on_complete = nullptr; + C_NotifyInstanceRequest *req = nullptr; + + C_SyncRequest(InstanceWatcher<I> *instance_watcher, + const std::string &sync_id, Context *on_start) + : instance_watcher(instance_watcher), sync_id(sync_id), + on_start(on_start) { + dout(10) << "C_SyncRequest: " << this << " " << __func__ << ": sync_id=" + << sync_id << dendl; + } + + void finish(int r) override { + dout(10) << "C_SyncRequest: " << this << " " << __func__ << ": r=" + << r << dendl; + + if (on_start != nullptr) { + instance_watcher->handle_notify_sync_request(this, r); + } else { + instance_watcher->handle_notify_sync_complete(this, r); + delete this; + } + } + + // called twice + void complete(int r) override { + finish(r); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::InstanceWatcher: " \ + << this << " " << __func__ << ": " +template <typename I> +void InstanceWatcher<I>::get_instances(librados::IoCtx &io_ctx, + std::vector<std::string> *instance_ids, + Context *on_finish) { + librados::ObjectReadOperation op; + librbd::cls_client::mirror_instances_list_start(&op); + C_GetInstances *ctx = new C_GetInstances(instance_ids, on_finish); + librados::AioCompletion *aio_comp = 
create_rados_callback(ctx); + + int r = io_ctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op, &ctx->out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void InstanceWatcher<I>::remove_instance(librados::IoCtx &io_ctx, + ContextWQ *work_queue, + const std::string &instance_id, + Context *on_finish) { + auto req = new C_RemoveInstanceRequest<I>(io_ctx, work_queue, instance_id, + on_finish); + req->send(); +} + +template <typename I> +InstanceWatcher<I> *InstanceWatcher<I>::create( + librados::IoCtx &io_ctx, ContextWQ *work_queue, + InstanceReplayer<I> *instance_replayer) { + return new InstanceWatcher<I>(io_ctx, work_queue, instance_replayer, + stringify(io_ctx.get_instance_id())); +} + +template <typename I> +InstanceWatcher<I>::InstanceWatcher(librados::IoCtx &io_ctx, + ContextWQ *work_queue, + InstanceReplayer<I> *instance_replayer, + const std::string &instance_id) + : Watcher(io_ctx, work_queue, RBD_MIRROR_INSTANCE_PREFIX + instance_id), + m_instance_replayer(instance_replayer), m_instance_id(instance_id), + m_lock(unique_lock_name("rbd::mirror::InstanceWatcher::m_lock", this)), + m_instance_lock(librbd::ManagedLock<I>::create( + m_ioctx, m_work_queue, m_oid, this, librbd::managed_lock::EXCLUSIVE, true, + m_cct->_conf.get_val<uint64_t>("rbd_blacklist_expire_seconds"))) { +} + +template <typename I> +InstanceWatcher<I>::~InstanceWatcher() { + ceph_assert(m_requests.empty()); + ceph_assert(m_notify_ops.empty()); + ceph_assert(m_notify_op_tracker.empty()); + ceph_assert(m_suspended_ops.empty()); + ceph_assert(m_inflight_sync_reqs.empty()); + ceph_assert(m_image_sync_throttler == nullptr); + m_instance_lock->destroy(); +} + +template <typename I> +int InstanceWatcher<I>::init() { + C_SaferCond init_ctx; + init(&init_ctx); + return init_ctx.wait(); +} + +template <typename I> +void InstanceWatcher<I>::init(Context *on_finish) { + dout(10) << "instance_id=" << m_instance_id << dendl; + + Mutex::Locker locker(m_lock); + + 
ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + m_ret_val = 0; + + register_instance(); +} + +template <typename I> +void InstanceWatcher<I>::shut_down() { + C_SaferCond shut_down_ctx; + shut_down(&shut_down_ctx); + int r = shut_down_ctx.wait(); + ceph_assert(r == 0); +} + +template <typename I> +void InstanceWatcher<I>::shut_down(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + m_ret_val = 0; + + release_lock(); +} + +template <typename I> +void InstanceWatcher<I>::remove(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + m_ret_val = 0; + + get_instance_locker(); +} + +template <typename I> +void InstanceWatcher<I>::notify_image_acquire( + const std::string &instance_id, const std::string &global_image_id, + Context *on_notify_ack) { + dout(10) << "instance_id=" << instance_id << ", global_image_id=" + << global_image_id << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_finish == nullptr); + + uint64_t request_id = ++m_request_seq; + bufferlist bl; + encode(NotifyMessage{ImageAcquirePayload{request_id, global_image_id}}, bl); + auto req = new C_NotifyInstanceRequest(this, instance_id, request_id, + std::move(bl), on_notify_ack); + req->send(); +} + +template <typename I> +void InstanceWatcher<I>::notify_image_release( + const std::string &instance_id, const std::string &global_image_id, + Context *on_notify_ack) { + dout(10) << "instance_id=" << instance_id << ", global_image_id=" + << global_image_id << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_finish == nullptr); + + uint64_t request_id = ++m_request_seq; + bufferlist bl; + encode(NotifyMessage{ImageReleasePayload{request_id, global_image_id}}, bl); + auto req = new C_NotifyInstanceRequest(this, instance_id, request_id, + std::move(bl), on_notify_ack); + req->send(); +} 
+ +template <typename I> +void InstanceWatcher<I>::notify_peer_image_removed( + const std::string &instance_id, const std::string &global_image_id, + const std::string &peer_mirror_uuid, Context *on_notify_ack) { + dout(10) << "instance_id=" << instance_id << ", " + << "global_image_id=" << global_image_id << ", " + << "peer_mirror_uuid=" << peer_mirror_uuid << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == nullptr); + + uint64_t request_id = ++m_request_seq; + bufferlist bl; + encode(NotifyMessage{PeerImageRemovedPayload{request_id, global_image_id, + peer_mirror_uuid}}, bl); + auto req = new C_NotifyInstanceRequest(this, instance_id, request_id, + std::move(bl), on_notify_ack); + req->send(); +} + +template <typename I> +void InstanceWatcher<I>::notify_sync_request(const std::string &sync_id, + Context *on_sync_start) { + dout(10) << "sync_id=" << sync_id << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_inflight_sync_reqs.count(sync_id) == 0); + + uint64_t request_id = ++m_request_seq; + + bufferlist bl; + encode(NotifyMessage{SyncRequestPayload{request_id, sync_id}}, bl); + + auto sync_ctx = new C_SyncRequest(this, sync_id, on_sync_start); + sync_ctx->req = new C_NotifyInstanceRequest(this, "", request_id, + std::move(bl), sync_ctx); + + m_inflight_sync_reqs[sync_id] = sync_ctx; + sync_ctx->req->send(); +} + +template <typename I> +bool InstanceWatcher<I>::cancel_sync_request(const std::string &sync_id) { + dout(10) << "sync_id=" << sync_id << dendl; + + Mutex::Locker locker(m_lock); + + auto it = m_inflight_sync_reqs.find(sync_id); + if (it == m_inflight_sync_reqs.end()) { + return false; + } + + auto sync_ctx = it->second; + + if (sync_ctx->on_start == nullptr) { + return false; + } + + ceph_assert(sync_ctx->req != nullptr); + sync_ctx->req->cancel(); + return true; +} + +template <typename I> +void InstanceWatcher<I>::notify_sync_start(const std::string &instance_id, + const std::string &sync_id) { + dout(10) << "sync_id=" 
<< sync_id << dendl; + + Mutex::Locker locker(m_lock); + + uint64_t request_id = ++m_request_seq; + + bufferlist bl; + encode(NotifyMessage{SyncStartPayload{request_id, sync_id}}, bl); + + auto ctx = new FunctionContext( + [this, sync_id] (int r) { + dout(10) << "finish: sync_id=" << sync_id << ", r=" << r << dendl; + Mutex::Locker locker(m_lock); + if (r != -ESTALE && m_image_sync_throttler != nullptr) { + m_image_sync_throttler->finish_op(sync_id); + } + }); + auto req = new C_NotifyInstanceRequest(this, instance_id, request_id, + std::move(bl), ctx); + req->send(); +} + +template <typename I> +void InstanceWatcher<I>::notify_sync_complete(const std::string &sync_id) { + Mutex::Locker locker(m_lock); + notify_sync_complete(m_lock, sync_id); +} + +template <typename I> +void InstanceWatcher<I>::notify_sync_complete(const Mutex&, + const std::string &sync_id) { + dout(10) << "sync_id=" << sync_id << dendl; + ceph_assert(m_lock.is_locked()); + + auto it = m_inflight_sync_reqs.find(sync_id); + ceph_assert(it != m_inflight_sync_reqs.end()); + + auto sync_ctx = it->second; + ceph_assert(sync_ctx->req == nullptr); + + m_inflight_sync_reqs.erase(it); + m_work_queue->queue(sync_ctx, 0); +} + +template <typename I> +void InstanceWatcher<I>::handle_notify_sync_request(C_SyncRequest *sync_ctx, + int r) { + dout(10) << "sync_id=" << sync_ctx->sync_id << ", r=" << r << dendl; + + Context *on_start = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(sync_ctx->req != nullptr); + ceph_assert(sync_ctx->on_start != nullptr); + + if (sync_ctx->req->canceling) { + r = -ECANCELED; + } + + std::swap(sync_ctx->on_start, on_start); + sync_ctx->req = nullptr; + + if (r == -ECANCELED) { + notify_sync_complete(m_lock, sync_ctx->sync_id); + } + } + + on_start->complete(r == -ECANCELED ? 
r : 0); +} + +template <typename I> +void InstanceWatcher<I>::handle_notify_sync_complete(C_SyncRequest *sync_ctx, + int r) { + dout(10) << "sync_id=" << sync_ctx->sync_id << ", r=" << r << dendl; + + if (sync_ctx->on_complete != nullptr) { + sync_ctx->on_complete->complete(r); + } +} + +template <typename I> +void InstanceWatcher<I>::print_sync_status(Formatter *f, stringstream *ss) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + if (m_image_sync_throttler != nullptr) { + m_image_sync_throttler->print_status(f, ss); + } +} + +template <typename I> +void InstanceWatcher<I>::handle_acquire_leader() { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_image_sync_throttler == nullptr); + m_image_sync_throttler = ImageSyncThrottler<I>::create(m_cct); + + m_leader_instance_id = m_instance_id; + unsuspend_notify_requests(); +} + +template <typename I> +void InstanceWatcher<I>::handle_release_leader() { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_image_sync_throttler != nullptr); + + m_leader_instance_id.clear(); + + m_image_sync_throttler->drain(-ESTALE); + m_image_sync_throttler->destroy(); + m_image_sync_throttler = nullptr; +} + +template <typename I> +void InstanceWatcher<I>::handle_update_leader( + const std::string &leader_instance_id) { + dout(10) << "leader_instance_id=" << leader_instance_id << dendl; + + Mutex::Locker locker(m_lock); + + m_leader_instance_id = leader_instance_id; + + if (!m_leader_instance_id.empty()) { + unsuspend_notify_requests(); + } +} + +template <typename I> +void InstanceWatcher<I>::cancel_notify_requests( + const std::string &instance_id) { + dout(10) << "instance_id=" << instance_id << dendl; + + Mutex::Locker locker(m_lock); + + for (auto op : m_notify_ops) { + if (op.first == instance_id && !op.second->send_to_leader) { + op.second->cancel(); + } + } +} + +template <typename I> +void InstanceWatcher<I>::register_instance() { + ceph_assert(m_lock.is_locked()); + + dout(10) 
<< dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_instances_add(&op, m_instance_id); + librados::AioCompletion *aio_comp = create_rados_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_register_instance>(this); + + int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void InstanceWatcher<I>::handle_register_instance(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + + if (r == 0) { + create_instance_object(); + return; + } + + derr << "error registering instance: " << cpp_strerror(r) << dendl; + + std::swap(on_finish, m_on_finish); + } + on_finish->complete(r); +} + + +template <typename I> +void InstanceWatcher<I>::create_instance_object() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + librados::ObjectWriteOperation op; + op.create(true); + + librados::AioCompletion *aio_comp = create_rados_callback< + InstanceWatcher<I>, + &InstanceWatcher<I>::handle_create_instance_object>(this); + int r = m_ioctx.aio_operate(m_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void InstanceWatcher<I>::handle_create_instance_object(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error creating " << m_oid << " object: " << cpp_strerror(r) + << dendl; + + m_ret_val = r; + unregister_instance(); + return; + } + + register_watch(); +} + +template <typename I> +void InstanceWatcher<I>::register_watch() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_register_watch>(this)); + + librbd::Watcher::register_watch(ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_register_watch(int r) { + dout(10) << "r=" << r << dendl; + 
+ Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error registering instance watcher for " << m_oid << " object: " + << cpp_strerror(r) << dendl; + + m_ret_val = r; + remove_instance_object(); + return; + } + + acquire_lock(); +} + +template <typename I> +void InstanceWatcher<I>::acquire_lock() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_acquire_lock>(this)); + + m_instance_lock->acquire_lock(ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_acquire_lock(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + + if (r < 0) { + + derr << "error acquiring instance lock: " << cpp_strerror(r) << dendl; + + m_ret_val = r; + unregister_watch(); + return; + } + + std::swap(on_finish, m_on_finish); + } + + on_finish->complete(r); +} + +template <typename I> +void InstanceWatcher<I>::release_lock() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_release_lock>(this)); + + m_instance_lock->shut_down(ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_release_lock(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error releasing instance lock: " << cpp_strerror(r) << dendl; + } + + unregister_watch(); +} + +template <typename I> +void InstanceWatcher<I>::unregister_watch() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_unregister_watch>(this)); + + librbd::Watcher::unregister_watch(ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_unregister_watch(int 
r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "error unregistering instance watcher for " << m_oid << " object: " + << cpp_strerror(r) << dendl; + } + + Mutex::Locker locker(m_lock); + remove_instance_object(); +} + +template <typename I> +void InstanceWatcher<I>::remove_instance_object() { + ceph_assert(m_lock.is_locked()); + + dout(10) << dendl; + + librados::ObjectWriteOperation op; + op.remove(); + + librados::AioCompletion *aio_comp = create_rados_callback< + InstanceWatcher<I>, + &InstanceWatcher<I>::handle_remove_instance_object>(this); + int r = m_ioctx.aio_operate(m_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void InstanceWatcher<I>::handle_remove_instance_object(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } + + if (r < 0) { + derr << "error removing " << m_oid << " object: " << cpp_strerror(r) + << dendl; + } + + Mutex::Locker locker(m_lock); + unregister_instance(); +} + +template <typename I> +void InstanceWatcher<I>::unregister_instance() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_instances_remove(&op, m_instance_id); + librados::AioCompletion *aio_comp = create_rados_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_unregister_instance>(this); + + int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void InstanceWatcher<I>::handle_unregister_instance(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "error unregistering instance: " << cpp_strerror(r) << dendl; + } + + Mutex::Locker locker(m_lock); + wait_for_notify_ops(); +} + +template <typename I> +void InstanceWatcher<I>::wait_for_notify_ops() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + for (auto op : m_notify_ops) { + op.second->cancel(); + } + + Context *ctx = 
create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_wait_for_notify_ops>(this)); + + m_notify_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_wait_for_notify_ops(int r) { + dout(10) << "r=" << r << dendl; + + ceph_assert(r == 0); + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + + ceph_assert(m_notify_ops.empty()); + + std::swap(on_finish, m_on_finish); + r = m_ret_val; + } + on_finish->complete(r); +} + +template <typename I> +void InstanceWatcher<I>::get_instance_locker() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_get_instance_locker>(this)); + + m_instance_lock->get_locker(&m_instance_locker, ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_get_instance_locker(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0) { + if (r != -ENOENT) { + derr << "error retrieving instance locker: " << cpp_strerror(r) << dendl; + } + remove_instance_object(); + return; + } + + break_instance_lock(); +} + +template <typename I> +void InstanceWatcher<I>::break_instance_lock() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + InstanceWatcher<I>, &InstanceWatcher<I>::handle_break_instance_lock>(this)); + + m_instance_lock->break_lock(m_instance_locker, true, ctx); +} + +template <typename I> +void InstanceWatcher<I>::handle_break_instance_lock(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0) { + if (r != -ENOENT) { + derr << "error breaking instance lock: " << cpp_strerror(r) << dendl; + } + remove_instance_object(); + return; + } + + remove_instance_object(); +} + +template <typename I> +void 
InstanceWatcher<I>::suspend_notify_request(C_NotifyInstanceRequest *req) { + dout(10) << req << dendl; + + ceph_assert(m_lock.is_locked()); + + auto result = m_suspended_ops.insert(req).second; + ceph_assert(result); +} + +template <typename I> +bool InstanceWatcher<I>::unsuspend_notify_request( + C_NotifyInstanceRequest *req) { + dout(10) << req << dendl; + + ceph_assert(m_lock.is_locked()); + + auto result = m_suspended_ops.erase(req); + if (result == 0) { + return false; + } + + req->send(); + return true; +} + +template <typename I> +void InstanceWatcher<I>::unsuspend_notify_requests() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + std::set<C_NotifyInstanceRequest *> suspended_ops; + std::swap(m_suspended_ops, suspended_ops); + + for (auto op : suspended_ops) { + op->send(); + } +} + +template <typename I> +Context *InstanceWatcher<I>::prepare_request(const std::string &instance_id, + uint64_t request_id, + C_NotifyAck *on_notify_ack) { + dout(10) << "instance_id=" << instance_id << ", request_id=" << request_id + << dendl; + + Mutex::Locker locker(m_lock); + + Context *ctx = nullptr; + Request request(instance_id, request_id); + auto it = m_requests.find(request); + + if (it != m_requests.end()) { + dout(10) << "duplicate for in-progress request" << dendl; + delete it->on_notify_ack; + m_requests.erase(it); + } else { + ctx = create_async_context_callback( + m_work_queue, new FunctionContext( + [this, instance_id, request_id] (int r) { + complete_request(instance_id, request_id, r); + })); + } + + request.on_notify_ack = on_notify_ack; + m_requests.insert(request); + return ctx; +} + +template <typename I> +void InstanceWatcher<I>::complete_request(const std::string &instance_id, + uint64_t request_id, int r) { + dout(10) << "instance_id=" << instance_id << ", request_id=" << request_id + << dendl; + + C_NotifyAck *on_notify_ack; + { + Mutex::Locker locker(m_lock); + Request request(instance_id, request_id); + auto it = 
m_requests.find(request); + ceph_assert(it != m_requests.end()); + on_notify_ack = it->on_notify_ack; + m_requests.erase(it); + } + + encode(NotifyAckPayload(instance_id, request_id, r), on_notify_ack->out); + on_notify_ack->complete(0); +} + +template <typename I> +void InstanceWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + dout(10) << "notify_id=" << notify_id << ", handle=" << handle << ", " + << "notifier_id=" << notifier_id << dendl; + + auto ctx = new C_NotifyAck(this, notify_id, handle); + + NotifyMessage notify_message; + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + derr << "error decoding image notification: " << err.what() << dendl; + ctx->complete(0); + return; + } + + apply_visitor(HandlePayloadVisitor(this, stringify(notifier_id), ctx), + notify_message.payload); +} + +template <typename I> +void InstanceWatcher<I>::handle_image_acquire( + const std::string &global_image_id, Context *on_finish) { + dout(10) << "global_image_id=" << global_image_id << dendl; + + auto ctx = new FunctionContext( + [this, global_image_id, on_finish] (int r) { + m_instance_replayer->acquire_image(this, global_image_id, on_finish); + m_notify_op_tracker.finish_op(); + }); + + m_notify_op_tracker.start_op(); + m_work_queue->queue(ctx, 0); +} + +template <typename I> +void InstanceWatcher<I>::handle_image_release( + const std::string &global_image_id, Context *on_finish) { + dout(10) << "global_image_id=" << global_image_id << dendl; + + auto ctx = new FunctionContext( + [this, global_image_id, on_finish] (int r) { + m_instance_replayer->release_image(global_image_id, on_finish); + m_notify_op_tracker.finish_op(); + }); + + m_notify_op_tracker.start_op(); + m_work_queue->queue(ctx, 0); +} + +template <typename I> +void InstanceWatcher<I>::handle_peer_image_removed( + const std::string &global_image_id, const std::string &peer_mirror_uuid, + Context 
*on_finish) { + dout(10) << "global_image_id=" << global_image_id << ", " + << "peer_mirror_uuid=" << peer_mirror_uuid << dendl; + + auto ctx = new FunctionContext( + [this, peer_mirror_uuid, global_image_id, on_finish] (int r) { + m_instance_replayer->remove_peer_image(global_image_id, + peer_mirror_uuid, on_finish); + m_notify_op_tracker.finish_op(); + }); + + m_notify_op_tracker.start_op(); + m_work_queue->queue(ctx, 0); +} + +template <typename I> +void InstanceWatcher<I>::handle_sync_request(const std::string &instance_id, + const std::string &sync_id, + Context *on_finish) { + dout(10) << "instance_id=" << instance_id << ", sync_id=" << sync_id << dendl; + + Mutex::Locker locker(m_lock); + + if (m_image_sync_throttler == nullptr) { + dout(10) << "sync request for non-leader" << dendl; + m_work_queue->queue(on_finish, -ESTALE); + return; + } + + Context *on_start = create_async_context_callback( + m_work_queue, new FunctionContext( + [this, instance_id, sync_id, on_finish] (int r) { + dout(10) << "handle_sync_request: finish: instance_id=" << instance_id + << ", sync_id=" << sync_id << ", r=" << r << dendl; + if (r == 0) { + notify_sync_start(instance_id, sync_id); + } + if (r == -ENOENT) { + r = 0; + } + on_finish->complete(r); + })); + m_image_sync_throttler->start_op(sync_id, on_start); +} + +template <typename I> +void InstanceWatcher<I>::handle_sync_start(const std::string &instance_id, + const std::string &sync_id, + Context *on_finish) { + dout(10) << "instance_id=" << instance_id << ", sync_id=" << sync_id << dendl; + + Mutex::Locker locker(m_lock); + + auto it = m_inflight_sync_reqs.find(sync_id); + if (it == m_inflight_sync_reqs.end()) { + dout(5) << "not found" << dendl; + m_work_queue->queue(on_finish, 0); + return; + } + + auto sync_ctx = it->second; + + if (sync_ctx->on_complete != nullptr) { + dout(5) << "duplicate request" << dendl; + m_work_queue->queue(sync_ctx->on_complete, -ESTALE); + } + + sync_ctx->on_complete = on_finish; +} + +template 
<typename I> +void InstanceWatcher<I>::handle_payload(const std::string &instance_id, + const ImageAcquirePayload &payload, + C_NotifyAck *on_notify_ack) { + dout(10) << "image_acquire: instance_id=" << instance_id << ", " + << "request_id=" << payload.request_id << dendl; + + auto on_finish = prepare_request(instance_id, payload.request_id, + on_notify_ack); + if (on_finish != nullptr) { + handle_image_acquire(payload.global_image_id, on_finish); + } +} + +template <typename I> +void InstanceWatcher<I>::handle_payload(const std::string &instance_id, + const ImageReleasePayload &payload, + C_NotifyAck *on_notify_ack) { + dout(10) << "image_release: instance_id=" << instance_id << ", " + << "request_id=" << payload.request_id << dendl; + + auto on_finish = prepare_request(instance_id, payload.request_id, + on_notify_ack); + if (on_finish != nullptr) { + handle_image_release(payload.global_image_id, on_finish); + } +} + +template <typename I> +void InstanceWatcher<I>::handle_payload(const std::string &instance_id, + const PeerImageRemovedPayload &payload, + C_NotifyAck *on_notify_ack) { + dout(10) << "remove_peer_image: instance_id=" << instance_id << ", " + << "request_id=" << payload.request_id << dendl; + + auto on_finish = prepare_request(instance_id, payload.request_id, + on_notify_ack); + if (on_finish != nullptr) { + handle_peer_image_removed(payload.global_image_id, payload.peer_mirror_uuid, + on_finish); + } +} + +template <typename I> +void InstanceWatcher<I>::handle_payload(const std::string &instance_id, + const SyncRequestPayload &payload, + C_NotifyAck *on_notify_ack) { + dout(10) << "sync_request: instance_id=" << instance_id << ", " + << "request_id=" << payload.request_id << dendl; + + auto on_finish = prepare_request(instance_id, payload.request_id, + on_notify_ack); + if (on_finish == nullptr) { + return; + } + + handle_sync_request(instance_id, payload.sync_id, on_finish); +} + +template <typename I> +void InstanceWatcher<I>::handle_payload(const 
std::string &instance_id, + const SyncStartPayload &payload, + C_NotifyAck *on_notify_ack) { + dout(10) << "sync_start: instance_id=" << instance_id << ", " + << "request_id=" << payload.request_id << dendl; + + auto on_finish = prepare_request(instance_id, payload.request_id, + on_notify_ack); + if (on_finish == nullptr) { + return; + } + + handle_sync_start(instance_id, payload.sync_id, on_finish); +} + +template <typename I> +void InstanceWatcher<I>::handle_payload(const std::string &instance_id, + const UnknownPayload &payload, + C_NotifyAck *on_notify_ack) { + dout(5) << "unknown: instance_id=" << instance_id << dendl; + + on_notify_ack->complete(0); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::InstanceWatcher<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/InstanceWatcher.h b/src/tools/rbd_mirror/InstanceWatcher.h new file mode 100644 index 00000000..5ec1aef0 --- /dev/null +++ b/src/tools/rbd_mirror/InstanceWatcher.h @@ -0,0 +1,264 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_INSTANCE_WATCHER_H +#define CEPH_RBD_MIRROR_INSTANCE_WATCHER_H + +#include <map> +#include <memory> +#include <set> +#include <string> +#include <vector> + +#include "common/AsyncOpTracker.h" +#include "librbd/Watcher.h" +#include "librbd/managed_lock/Types.h" +#include "tools/rbd_mirror/instance_watcher/Types.h" + +namespace librbd { + +class ImageCtx; +template <typename> class ManagedLock; + +} + +namespace rbd { +namespace mirror { + +template <typename> class ImageSyncThrottler; +template <typename> class InstanceReplayer; +template <typename> struct Threads; + +template <typename ImageCtxT = librbd::ImageCtx> +class InstanceWatcher : protected librbd::Watcher { + using librbd::Watcher::unregister_watch; // Silence overloaded virtual warning +public: + static void get_instances(librados::IoCtx &io_ctx, + std::vector<std::string> *instance_ids, + Context 
*on_finish); + static void remove_instance(librados::IoCtx &io_ctx, + ContextWQ *work_queue, + const std::string &instance_id, + Context *on_finish); + + static InstanceWatcher *create( + librados::IoCtx &io_ctx, ContextWQ *work_queue, + InstanceReplayer<ImageCtxT> *instance_replayer); + void destroy() { + delete this; + } + + InstanceWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue, + InstanceReplayer<ImageCtxT> *instance_replayer, + const std::string &instance_id); + ~InstanceWatcher() override; + + inline std::string &get_instance_id() { + return m_instance_id; + } + + int init(); + void shut_down(); + + void init(Context *on_finish); + void shut_down(Context *on_finish); + void remove(Context *on_finish); + + void notify_image_acquire(const std::string &instance_id, + const std::string &global_image_id, + Context *on_notify_ack); + void notify_image_release(const std::string &instance_id, + const std::string &global_image_id, + Context *on_notify_ack); + void notify_peer_image_removed(const std::string &instance_id, + const std::string &global_image_id, + const std::string &peer_mirror_uuid, + Context *on_notify_ack); + + void notify_sync_request(const std::string &sync_id, Context *on_sync_start); + bool cancel_sync_request(const std::string &sync_id); + void notify_sync_complete(const std::string &sync_id); + + void print_sync_status(Formatter *f, stringstream *ss); + + void cancel_notify_requests(const std::string &instance_id); + + void handle_acquire_leader(); + void handle_release_leader(); + void handle_update_leader(const std::string &leader_instance_id); + +private: + /** + * @verbatim + * + * BREAK_INSTANCE_LOCK -------\ + * ^ | + * | (error) | + * GET_INSTANCE_LOCKER * * *>| + * ^ (remove) | + * | | + * <uninitialized> <----------------+---- WAIT_FOR_NOTIFY_OPS + * | (init) ^ | ^ + * v (error) * | | + * REGISTER_INSTANCE * * * * * *|* *> UNREGISTER_INSTANCE + * | * | ^ + * v (error) * v | + * CREATE_INSTANCE_OBJECT * * * * * *> 
REMOVE_INSTANCE_OBJECT + * | * ^ + * v (error) * | + * REGISTER_WATCH * * * * * * * * * *> UNREGISTER_WATCH + * | * ^ + * v (error) * | + * ACQUIRE_LOCK * * * * * * * * * * * RELEASE_LOCK + * | ^ + * v (shut_down) | + * <watching> -------------------------------/ + * + * @endverbatim + */ + + struct C_NotifyInstanceRequest; + struct C_SyncRequest; + + typedef std::pair<std::string, std::string> Id; + + struct HandlePayloadVisitor : public boost::static_visitor<void> { + InstanceWatcher *instance_watcher; + std::string instance_id; + C_NotifyAck *on_notify_ack; + + HandlePayloadVisitor(InstanceWatcher *instance_watcher, + const std::string &instance_id, + C_NotifyAck *on_notify_ack) + : instance_watcher(instance_watcher), instance_id(instance_id), + on_notify_ack(on_notify_ack) { + } + + template <typename Payload> + inline void operator()(const Payload &payload) const { + instance_watcher->handle_payload(instance_id, payload, on_notify_ack); + } + }; + + struct Request { + std::string instance_id; + uint64_t request_id; + C_NotifyAck *on_notify_ack = nullptr; + + Request(const std::string &instance_id, uint64_t request_id) + : instance_id(instance_id), request_id(request_id) { + } + + inline bool operator<(const Request &rhs) const { + return instance_id < rhs.instance_id || + (instance_id == rhs.instance_id && request_id < rhs.request_id); + } + }; + + Threads<ImageCtxT> *m_threads; + InstanceReplayer<ImageCtxT> *m_instance_replayer; + std::string m_instance_id; + + mutable Mutex m_lock; + librbd::ManagedLock<ImageCtxT> *m_instance_lock; + Context *m_on_finish = nullptr; + int m_ret_val = 0; + std::string m_leader_instance_id; + librbd::managed_lock::Locker m_instance_locker; + std::set<std::pair<std::string, C_NotifyInstanceRequest *>> m_notify_ops; + AsyncOpTracker m_notify_op_tracker; + uint64_t m_request_seq = 0; + std::set<Request> m_requests; + std::set<C_NotifyInstanceRequest *> m_suspended_ops; + std::map<std::string, C_SyncRequest *> m_inflight_sync_reqs; 
+ ImageSyncThrottler<ImageCtxT> *m_image_sync_throttler = nullptr; + + void register_instance(); + void handle_register_instance(int r); + + void create_instance_object(); + void handle_create_instance_object(int r); + + void register_watch(); + void handle_register_watch(int r); + + void acquire_lock(); + void handle_acquire_lock(int r); + + void release_lock(); + void handle_release_lock(int r); + + void unregister_watch(); + void handle_unregister_watch(int r); + + void remove_instance_object(); + void handle_remove_instance_object(int r); + + void unregister_instance(); + void handle_unregister_instance(int r); + + void wait_for_notify_ops(); + void handle_wait_for_notify_ops(int r); + + void get_instance_locker(); + void handle_get_instance_locker(int r); + + void break_instance_lock(); + void handle_break_instance_lock(int r); + + void suspend_notify_request(C_NotifyInstanceRequest *req); + bool unsuspend_notify_request(C_NotifyInstanceRequest *req); + void unsuspend_notify_requests(); + + void notify_sync_complete(const Mutex& lock, const std::string &sync_id); + void handle_notify_sync_request(C_SyncRequest *sync_ctx, int r); + void handle_notify_sync_complete(C_SyncRequest *sync_ctx, int r); + + void notify_sync_start(const std::string &instance_id, + const std::string &sync_id); + + Context *prepare_request(const std::string &instance_id, uint64_t request_id, + C_NotifyAck *on_notify_ack); + void complete_request(const std::string &instance_id, uint64_t request_id, + int r); + + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; + + void handle_image_acquire(const std::string &global_image_id, + Context *on_finish); + void handle_image_release(const std::string &global_image_id, + Context *on_finish); + void handle_peer_image_removed(const std::string &global_image_id, + const std::string &peer_mirror_uuid, + Context *on_finish); + + void handle_sync_request(const std::string &instance_id, + const 
std::string &sync_id, Context *on_finish); + void handle_sync_start(const std::string &instance_id, + const std::string &sync_id, Context *on_finish); + + void handle_payload(const std::string &instance_id, + const instance_watcher::ImageAcquirePayload &payload, + C_NotifyAck *on_notify_ack); + void handle_payload(const std::string &instance_id, + const instance_watcher::ImageReleasePayload &payload, + C_NotifyAck *on_notify_ack); + void handle_payload(const std::string &instance_id, + const instance_watcher::PeerImageRemovedPayload &payload, + C_NotifyAck *on_notify_ack); + void handle_payload(const std::string &instance_id, + const instance_watcher::SyncRequestPayload &payload, + C_NotifyAck *on_notify_ack); + void handle_payload(const std::string &instance_id, + const instance_watcher::SyncStartPayload &payload, + C_NotifyAck *on_notify_ack); + void handle_payload(const std::string &instance_id, + const instance_watcher::UnknownPayload &payload, + C_NotifyAck *on_notify_ack); +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_INSTANCE_WATCHER_H diff --git a/src/tools/rbd_mirror/Instances.cc b/src/tools/rbd_mirror/Instances.cc new file mode 100644 index 00000000..b7a6cf11 --- /dev/null +++ b/src/tools/rbd_mirror/Instances.cc @@ -0,0 +1,359 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/stringify.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "InstanceWatcher.h" +#include "Instances.h" +#include "Threads.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::Instances: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using 
librbd::util::create_rados_callback; + +template <typename I> +Instances<I>::Instances(Threads<I> *threads, librados::IoCtx &ioctx, + const std::string& instance_id, + instances::Listener& listener) : + m_threads(threads), m_ioctx(ioctx), m_instance_id(instance_id), + m_listener(listener), m_cct(reinterpret_cast<CephContext *>(ioctx.cct())), + m_lock("rbd::mirror::Instances " + ioctx.get_pool_name()) { +} + +template <typename I> +Instances<I>::~Instances() { +} + +template <typename I> +void Instances<I>::init(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + get_instances(); +} + +template <typename I> +void Instances<I>::shut_down(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + + Context *ctx = new FunctionContext( + [this](int r) { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + cancel_remove_task(); + wait_for_ops(); + }); + + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void Instances<I>::unblock_listener() { + dout(5) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_listener_blocked); + m_listener_blocked = false; + + InstanceIds added_instance_ids; + for (auto& pair : m_instances) { + if (pair.second.state == INSTANCE_STATE_ADDING) { + added_instance_ids.push_back(pair.first); + } + } + + if (!added_instance_ids.empty()) { + m_threads->work_queue->queue( + new C_NotifyInstancesAdded(this, added_instance_ids), 0); + } +} + +template <typename I> +void Instances<I>::acked(const InstanceIds& instance_ids) { + dout(10) << "instance_ids=" << instance_ids << dendl; + + Mutex::Locker locker(m_lock); + if (m_on_finish != nullptr) { + dout(5) << "received on shut down, ignoring" << dendl; + return; + } + + Context *ctx = new C_HandleAcked(this, instance_ids); + m_threads->work_queue->queue(ctx, 0); +} + 
+template <typename I> +void Instances<I>::handle_acked(const InstanceIds& instance_ids) { + dout(5) << "instance_ids=" << instance_ids << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + if (m_on_finish != nullptr) { + dout(5) << "handled on shut down, ignoring" << dendl; + return; + } + + InstanceIds added_instance_ids; + auto time = ceph_clock_now(); + for (auto& instance_id : instance_ids) { + auto &instance = m_instances.insert( + std::make_pair(instance_id, Instance{})).first->second; + instance.acked_time = time; + if (instance.state == INSTANCE_STATE_ADDING) { + added_instance_ids.push_back(instance_id); + } + } + + schedule_remove_task(time); + if (!m_listener_blocked && !added_instance_ids.empty()) { + m_threads->work_queue->queue( + new C_NotifyInstancesAdded(this, added_instance_ids), 0); + } +} + +template <typename I> +void Instances<I>::notify_instances_added(const InstanceIds& instance_ids) { + Mutex::Locker locker(m_lock); + InstanceIds added_instance_ids; + for (auto& instance_id : instance_ids) { + auto it = m_instances.find(instance_id); + if (it != m_instances.end() && it->second.state == INSTANCE_STATE_ADDING) { + added_instance_ids.push_back(instance_id); + } + } + + if (added_instance_ids.empty()) { + return; + } + + dout(5) << "instance_ids=" << added_instance_ids << dendl; + m_lock.Unlock(); + m_listener.handle_added(added_instance_ids); + m_lock.Lock(); + + for (auto& instance_id : added_instance_ids) { + auto it = m_instances.find(instance_id); + if (it != m_instances.end() && it->second.state == INSTANCE_STATE_ADDING) { + it->second.state = INSTANCE_STATE_IDLE; + } + } +} + +template <typename I> +void Instances<I>::notify_instances_removed(const InstanceIds& instance_ids) { + dout(5) << "instance_ids=" << instance_ids << dendl; + m_listener.handle_removed(instance_ids); + + Mutex::Locker locker(m_lock); + for (auto& instance_id : instance_ids) { + m_instances.erase(instance_id); + } +} + 
+template <typename I> +void Instances<I>::list(std::vector<std::string> *instance_ids) { + dout(20) << dendl; + + Mutex::Locker locker(m_lock); + + for (auto it : m_instances) { + instance_ids->push_back(it.first); + } +} + + +template <typename I> +void Instances<I>::get_instances() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_context_callback< + Instances, &Instances<I>::handle_get_instances>(this); + + InstanceWatcher<I>::get_instances(m_ioctx, &m_instance_ids, ctx); +} + +template <typename I> +void Instances<I>::handle_get_instances(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + std::swap(on_finish, m_on_finish); + } + + if (r < 0) { + derr << "error retrieving instances: " << cpp_strerror(r) << dendl; + } else { + handle_acked(m_instance_ids); + } + on_finish->complete(r); +} + +template <typename I> +void Instances<I>::wait_for_ops() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_threads->work_queue, create_context_callback< + Instances, &Instances<I>::handle_wait_for_ops>(this)); + + m_async_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void Instances<I>::handle_wait_for_ops(int r) { + dout(10) << "r=" << r << dendl; + + ceph_assert(r == 0); + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + std::swap(on_finish, m_on_finish); + } + on_finish->complete(r); +} + +template <typename I> +void Instances<I>::remove_instances(const utime_t& time) { + ceph_assert(m_lock.is_locked()); + + InstanceIds instance_ids; + for (auto& instance_pair : m_instances) { + if (instance_pair.first == m_instance_id) { + continue; + } + auto& instance = instance_pair.second; + if (instance.state != INSTANCE_STATE_REMOVING && + instance.acked_time <= time) { + instance.state = INSTANCE_STATE_REMOVING; + instance_ids.push_back(instance_pair.first); + } + } + 
ceph_assert(!instance_ids.empty()); + + dout(10) << "instance_ids=" << instance_ids << dendl; + Context* ctx = new FunctionContext([this, instance_ids](int r) { + handle_remove_instances(r, instance_ids); + }); + ctx = create_async_context_callback(m_threads->work_queue, ctx); + + auto gather_ctx = new C_Gather(m_cct, ctx); + for (auto& instance_id : instance_ids) { + InstanceWatcher<I>::remove_instance(m_ioctx, m_threads->work_queue, + instance_id, gather_ctx->new_sub()); + } + + m_async_op_tracker.start_op(); + gather_ctx->activate(); +} + +template <typename I> +void Instances<I>::handle_remove_instances( + int r, const InstanceIds& instance_ids) { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + dout(10) << "r=" << r << ", instance_ids=" << instance_ids << dendl; + ceph_assert(r == 0); + + // fire removed notification now that instances have been blacklisted + m_threads->work_queue->queue( + new C_NotifyInstancesRemoved(this, instance_ids), 0); + + // reschedule the timer for the next batch + schedule_remove_task(ceph_clock_now()); + m_async_op_tracker.finish_op(); +} + +template <typename I> +void Instances<I>::cancel_remove_task() { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + + if (m_timer_task == nullptr) { + return; + } + + dout(10) << dendl; + + bool canceled = m_threads->timer->cancel_event(m_timer_task); + ceph_assert(canceled); + m_timer_task = nullptr; +} + +template <typename I> +void Instances<I>::schedule_remove_task(const utime_t& time) { + cancel_remove_task(); + if (m_on_finish != nullptr) { + dout(10) << "received on shut down, ignoring" << dendl; + return; + } + + int after = m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_heartbeat_interval") * + (1 + m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_missed_heartbeats") + + m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_acquire_attempts_before_break")); + + bool schedule = false; + utime_t 
oldest_time = time; + for (auto& instance : m_instances) { + if (instance.first == m_instance_id) { + continue; + } + if (instance.second.state == INSTANCE_STATE_REMOVING) { + // removal is already in-flight + continue; + } + + oldest_time = std::min(oldest_time, instance.second.acked_time); + schedule = true; + } + + if (!schedule) { + return; + } + + dout(10) << dendl; + + // schedule a time to fire when the oldest instance should be removed + m_timer_task = new FunctionContext( + [this, oldest_time](int r) { + ceph_assert(m_threads->timer_lock.is_locked()); + Mutex::Locker locker(m_lock); + m_timer_task = nullptr; + + remove_instances(oldest_time); + }); + + oldest_time += after; + m_threads->timer->add_event_at(oldest_time, m_timer_task); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::Instances<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/Instances.h b/src/tools/rbd_mirror/Instances.h new file mode 100644 index 00000000..dbfb16df --- /dev/null +++ b/src/tools/rbd_mirror/Instances.h @@ -0,0 +1,167 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_INSTANCES_H +#define CEPH_RBD_MIRROR_INSTANCES_H + +#include <map> +#include <vector> + +#include "include/buffer_fwd.h" +#include "include/rados/librados_fwd.hpp" +#include "common/AsyncOpTracker.h" +#include "common/Mutex.h" +#include "librbd/Watcher.h" +#include "tools/rbd_mirror/instances/Types.h" + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +template <typename ImageCtxT = librbd::ImageCtx> +class Instances { +public: + typedef std::vector<std::string> InstanceIds; + + static Instances *create(Threads<ImageCtxT> *threads, + librados::IoCtx &ioctx, + const std::string& instance_id, + instances::Listener& listener) { + return new Instances(threads, ioctx, instance_id, listener); + } + void destroy() { + delete this; + } + + 
Instances(Threads<ImageCtxT> *threads, librados::IoCtx &ioctx, + const std::string& instance_id, instances::Listener& listener); + virtual ~Instances(); + + void init(Context *on_finish); + void shut_down(Context *on_finish); + + void unblock_listener(); + + void acked(const InstanceIds& instance_ids); + + void list(std::vector<std::string> *instance_ids); + +private: + /** + * @verbatim + * + * <uninitialized> <---------------------\ + * | (init) ^ | + * v (error) * | + * GET_INSTANCES * * * * * WAIT_FOR_OPS + * | ^ + * v (shut_down) | + * <initialized> ------------------------/ + * . + * . (remove_instance) + * v + * REMOVE_INSTANCE + * + * @endverbatim + */ + + enum InstanceState { + INSTANCE_STATE_ADDING, + INSTANCE_STATE_IDLE, + INSTANCE_STATE_REMOVING + }; + + struct Instance { + utime_t acked_time{}; + InstanceState state = INSTANCE_STATE_ADDING; + }; + + struct C_NotifyBase : public Context { + Instances *instances; + InstanceIds instance_ids; + + C_NotifyBase(Instances *instances, const InstanceIds& instance_ids) + : instances(instances), instance_ids(instance_ids) { + instances->m_async_op_tracker.start_op(); + } + + void finish(int r) override { + execute(); + instances->m_async_op_tracker.finish_op(); + } + + virtual void execute() = 0; + }; + + struct C_HandleAcked : public C_NotifyBase { + C_HandleAcked(Instances *instances, const InstanceIds& instance_ids) + : C_NotifyBase(instances, instance_ids) { + } + + void execute() override { + this->instances->handle_acked(this->instance_ids); + } + }; + + struct C_NotifyInstancesAdded : public C_NotifyBase { + C_NotifyInstancesAdded(Instances *instances, + const InstanceIds& instance_ids) + : C_NotifyBase(instances, instance_ids) { + } + + void execute() override { + this->instances->notify_instances_added(this->instance_ids); + } + }; + + struct C_NotifyInstancesRemoved : public C_NotifyBase { + C_NotifyInstancesRemoved(Instances *instances, + const InstanceIds& instance_ids) + : C_NotifyBase(instances, 
instance_ids) { + } + + void execute() override { + this->instances->notify_instances_removed(this->instance_ids); + } + }; + + Threads<ImageCtxT> *m_threads; + librados::IoCtx &m_ioctx; + std::string m_instance_id; + instances::Listener& m_listener; + CephContext *m_cct; + + Mutex m_lock; + InstanceIds m_instance_ids; + std::map<std::string, Instance> m_instances; + Context *m_on_finish = nullptr; + AsyncOpTracker m_async_op_tracker; + + Context *m_timer_task = nullptr; + + bool m_listener_blocked = true; + + void handle_acked(const InstanceIds& instance_ids); + void notify_instances_added(const InstanceIds& instance_ids); + void notify_instances_removed(const InstanceIds& instance_ids); + + void get_instances(); + void handle_get_instances(int r); + + void wait_for_ops(); + void handle_wait_for_ops(int r); + + void remove_instances(const utime_t& time); + void handle_remove_instances(int r, const InstanceIds& instance_ids); + + void cancel_remove_task(); + void schedule_remove_task(const utime_t& time); +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_INSTANCES_H diff --git a/src/tools/rbd_mirror/LeaderWatcher.cc b/src/tools/rbd_mirror/LeaderWatcher.cc new file mode 100644 index 00000000..0d4bde6f --- /dev/null +++ b/src/tools/rbd_mirror/LeaderWatcher.cc @@ -0,0 +1,1145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LeaderWatcher.h" +#include "common/Timer.h" +#include "common/debug.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "include/stringify.h" +#include "librbd/Utils.h" +#include "librbd/watcher/Types.h" +#include "Threads.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::LeaderWatcher: " \ + << this << " " << __func__ << ": " +namespace rbd { +namespace mirror { + +using namespace leader_watcher; + +using 
librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +LeaderWatcher<I>::LeaderWatcher(Threads<I> *threads, librados::IoCtx &io_ctx, + leader_watcher::Listener *listener) + : Watcher(io_ctx, threads->work_queue, RBD_MIRROR_LEADER), + m_threads(threads), m_listener(listener), m_instances_listener(this), + m_lock("rbd::mirror::LeaderWatcher " + io_ctx.get_pool_name()), + m_notifier_id(librados::Rados(io_ctx).get_instance_id()), + m_instance_id(stringify(m_notifier_id)), + m_leader_lock(new LeaderLock(m_ioctx, m_work_queue, m_oid, this, true, + m_cct->_conf.get_val<uint64_t>( + "rbd_blacklist_expire_seconds"))) { +} + +template <typename I> +LeaderWatcher<I>::~LeaderWatcher() { + ceph_assert(m_status_watcher == nullptr); + ceph_assert(m_instances == nullptr); + ceph_assert(m_timer_task == nullptr); + + delete m_leader_lock; +} + +template <typename I> +std::string LeaderWatcher<I>::get_instance_id() { + return m_instance_id; +} + +template <typename I> +int LeaderWatcher<I>::init() { + C_SaferCond init_ctx; + init(&init_ctx); + return init_ctx.wait(); +} + +template <typename I> +void LeaderWatcher<I>::init(Context *on_finish) { + dout(10) << "notifier_id=" << m_notifier_id << dendl; + + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + + create_leader_object(); +} + +template <typename I> +void LeaderWatcher<I>::create_leader_object() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + librados::ObjectWriteOperation op; + op.create(false); + + librados::AioCompletion *aio_comp = create_rados_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_create_leader_object>(this); + int r = m_ioctx.aio_operate(m_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void LeaderWatcher<I>::handle_create_leader_object(int r) { + dout(10) << "r=" << r << dendl; + + 
Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + + if (r == 0) { + register_watch(); + return; + } + + derr << "error creating " << m_oid << " object: " << cpp_strerror(r) + << dendl; + + std::swap(on_finish, m_on_finish); + } + on_finish->complete(r); +} + +template <typename I> +void LeaderWatcher<I>::register_watch() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_register_watch>(this)); + + librbd::Watcher::register_watch(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_register_watch(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + if (r < 0) { + Mutex::Locker locker(m_lock); + derr << "error registering leader watcher for " << m_oid << " object: " + << cpp_strerror(r) << dendl; + ceph_assert(m_on_finish != nullptr); + std::swap(on_finish, m_on_finish); + } else { + Mutex::Locker locker(m_lock); + init_status_watcher(); + return; + } + + on_finish->complete(r); +} + +template <typename I> +void LeaderWatcher<I>::shut_down() { + C_SaferCond shut_down_ctx; + shut_down(&shut_down_ctx); + int r = shut_down_ctx.wait(); + ceph_assert(r == 0); +} + +template <typename I> +void LeaderWatcher<I>::shut_down(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + ceph_assert(m_on_shut_down_finish == nullptr); + m_on_shut_down_finish = on_finish; + cancel_timer_task(); + shut_down_leader_lock(); +} + +template <typename I> +void LeaderWatcher<I>::shut_down_leader_lock() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_shut_down_leader_lock>(this)); + + m_leader_lock->shut_down(ctx); +} + +template <typename I> +void 
LeaderWatcher<I>::handle_shut_down_leader_lock(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error shutting down leader lock: " << cpp_strerror(r) << dendl; + } + + shut_down_status_watcher(); +} + +template <typename I> +void LeaderWatcher<I>::unregister_watch() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_unregister_watch>(this)); + + librbd::Watcher::unregister_watch(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_unregister_watch(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "error unregistering leader watcher for " << m_oid << " object: " + << cpp_strerror(r) << dendl; + } + wait_for_tasks(); +} + +template <typename I> +void LeaderWatcher<I>::wait_for_tasks() { + dout(10) << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + schedule_timer_task("wait for tasks", 0, false, + &LeaderWatcher<I>::handle_wait_for_tasks, true); +} + +template <typename I> +void LeaderWatcher<I>::handle_wait_for_tasks() { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + ceph_assert(m_on_shut_down_finish != nullptr); + + ceph_assert(!m_timer_op_tracker.empty()); + m_timer_op_tracker.finish_op(); + + auto ctx = new FunctionContext([this](int r) { + Context *on_finish; + { + // ensure lock isn't held when completing shut down + Mutex::Locker locker(m_lock); + ceph_assert(m_on_shut_down_finish != nullptr); + on_finish = m_on_shut_down_finish; + } + on_finish->complete(0); + }); + m_work_queue->queue(ctx, 0); +} + +template <typename I> +bool LeaderWatcher<I>::is_blacklisted() const { + std::lock_guard locker{m_lock}; + return m_blacklisted; +} + +template <typename I> +bool LeaderWatcher<I>::is_leader() const { + Mutex::Locker 
locker(m_lock); + + return is_leader(m_lock); +} + +template <typename I> +bool LeaderWatcher<I>::is_leader(Mutex &lock) const { + ceph_assert(m_lock.is_locked()); + + bool leader = m_leader_lock->is_leader(); + dout(10) << leader << dendl; + return leader; +} + +template <typename I> +bool LeaderWatcher<I>::is_releasing_leader() const { + Mutex::Locker locker(m_lock); + + return is_releasing_leader(m_lock); +} + +template <typename I> +bool LeaderWatcher<I>::is_releasing_leader(Mutex &lock) const { + ceph_assert(m_lock.is_locked()); + + bool releasing = m_leader_lock->is_releasing_leader(); + dout(10) << releasing << dendl; + return releasing; +} + +template <typename I> +bool LeaderWatcher<I>::get_leader_instance_id(std::string *instance_id) const { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + if (is_leader(m_lock) || is_releasing_leader(m_lock)) { + *instance_id = m_instance_id; + return true; + } + + if (!m_locker.cookie.empty()) { + *instance_id = stringify(m_locker.entity.num()); + return true; + } + + return false; +} + +template <typename I> +void LeaderWatcher<I>::release_leader() { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + if (!is_leader(m_lock)) { + return; + } + + release_leader_lock(); +} + +template <typename I> +void LeaderWatcher<I>::list_instances(std::vector<std::string> *instance_ids) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + instance_ids->clear(); + if (m_instances != nullptr) { + m_instances->list(instance_ids); + } +} + +template <typename I> +void LeaderWatcher<I>::cancel_timer_task() { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + + if (m_timer_task == nullptr) { + return; + } + + dout(10) << m_timer_task << dendl; + bool canceled = m_threads->timer->cancel_event(m_timer_task); + ceph_assert(canceled); + m_timer_task = nullptr; +} + +template <typename I> +void LeaderWatcher<I>::schedule_timer_task(const std::string &name, + int delay_factor, bool 
leader, + TimerCallback timer_callback, + bool shutting_down) { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + + if (!shutting_down && m_on_shut_down_finish != nullptr) { + return; + } + + cancel_timer_task(); + + m_timer_task = new FunctionContext( + [this, leader, timer_callback](int r) { + ceph_assert(m_threads->timer_lock.is_locked()); + m_timer_task = nullptr; + + if (m_timer_op_tracker.empty()) { + Mutex::Locker locker(m_lock); + execute_timer_task(leader, timer_callback); + return; + } + + // old timer task is still running -- do not start next + // task until the previous task completes + if (m_timer_gate == nullptr) { + m_timer_gate = new C_TimerGate(this); + m_timer_op_tracker.wait_for_ops(m_timer_gate); + } + m_timer_gate->leader = leader; + m_timer_gate->timer_callback = timer_callback; + }); + + int after = delay_factor * m_cct->_conf.get_val<uint64_t>( + "rbd_mirror_leader_heartbeat_interval"); + + dout(10) << "scheduling " << name << " after " << after << " sec (task " + << m_timer_task << ")" << dendl; + m_threads->timer->add_event_after(after, m_timer_task); +} + +template <typename I> +void LeaderWatcher<I>::execute_timer_task(bool leader, + TimerCallback timer_callback) { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + ceph_assert(m_timer_op_tracker.empty()); + + if (is_leader(m_lock) != leader) { + return; + } + + m_timer_op_tracker.start_op(); + (this->*timer_callback)(); +} + +template <typename I> +void LeaderWatcher<I>::handle_post_acquire_leader_lock(int r, + Context *on_finish) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + if (r == -EAGAIN) { + dout(10) << "already locked" << dendl; + } else { + derr << "error acquiring leader lock: " << cpp_strerror(r) << dendl; + } + on_finish->complete(r); + return; + } + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + m_ret_val = 0; + + 
init_instances(); +} + +template <typename I> +void LeaderWatcher<I>::handle_pre_release_leader_lock(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + m_ret_val = 0; + + notify_listener(); +} + +template <typename I> +void LeaderWatcher<I>::handle_post_release_leader_lock(int r, + Context *on_finish) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + on_finish->complete(r); + return; + } + + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == nullptr); + m_on_finish = on_finish; + + notify_lock_released(); +} + +template <typename I> +void LeaderWatcher<I>::break_leader_lock() { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_timer_op_tracker.empty()); + + if (m_locker.cookie.empty()) { + get_locker(); + return; + } + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_break_leader_lock>(this)); + + m_leader_lock->break_lock(m_locker, true, ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_break_leader_lock(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + ceph_assert(!m_timer_op_tracker.empty()); + + if (m_leader_lock->is_shutdown()) { + dout(10) << "canceling due to shutdown" << dendl; + m_timer_op_tracker.finish_op(); + return; + } + + if (r < 0 && r != -ENOENT) { + derr << "error breaking leader lock: " << cpp_strerror(r) << dendl; + schedule_acquire_leader_lock(1); + m_timer_op_tracker.finish_op(); + return; + } + + m_locker = {}; + m_acquire_attempts = 0; + acquire_leader_lock(); +} + +template <typename I> +void LeaderWatcher<I>::schedule_get_locker(bool reset_leader, + uint32_t delay_factor) { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + 
+ if (reset_leader) { + m_locker = {}; + m_acquire_attempts = 0; + } + + schedule_timer_task("get locker", delay_factor, false, + &LeaderWatcher<I>::get_locker, false); +} + +template <typename I> +void LeaderWatcher<I>::get_locker() { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_timer_op_tracker.empty()); + + C_GetLocker *get_locker_ctx = new C_GetLocker(this); + Context *ctx = create_async_context_callback(m_work_queue, get_locker_ctx); + + m_leader_lock->get_locker(&get_locker_ctx->locker, ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_get_locker(int r, + librbd::managed_lock::Locker& locker) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker mutex_locker(m_lock); + ceph_assert(!m_timer_op_tracker.empty()); + + if (m_leader_lock->is_shutdown()) { + dout(10) << "canceling due to shutdown" << dendl; + m_timer_op_tracker.finish_op(); + return; + } + + if (is_leader(m_lock)) { + m_locker = {}; + m_timer_op_tracker.finish_op(); + return; + } + + if (r == -ENOENT) { + m_locker = {}; + m_acquire_attempts = 0; + acquire_leader_lock(); + return; + } else if (r < 0) { + derr << "error retrieving leader locker: " << cpp_strerror(r) << dendl; + schedule_get_locker(true, 1); + m_timer_op_tracker.finish_op(); + return; + } + + bool notify_listener = false; + if (m_locker != locker) { + m_locker = locker; + notify_listener = true; + if (m_acquire_attempts > 1) { + dout(10) << "new lock owner detected -- resetting heartbeat counter" + << dendl; + m_acquire_attempts = 0; + } + } + + if (m_acquire_attempts >= m_cct->_conf.get_val<uint64_t>( + "rbd_mirror_leader_max_acquire_attempts_before_break")) { + dout(0) << "breaking leader lock after " << m_acquire_attempts << " " + << "failed attempts to acquire" << dendl; + break_leader_lock(); + return; + } + + schedule_acquire_leader_lock(1); + + if (!notify_listener) { + 
m_timer_op_tracker.finish_op(); + return; + } + + auto ctx = new FunctionContext( + [this](int r) { + std::string instance_id; + if (get_leader_instance_id(&instance_id)) { + m_listener->update_leader_handler(instance_id); + } + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + m_timer_op_tracker.finish_op(); + }); + m_work_queue->queue(ctx, 0); +} + +template <typename I> +void LeaderWatcher<I>::schedule_acquire_leader_lock(uint32_t delay_factor) { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + + schedule_timer_task("acquire leader lock", + delay_factor * + m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_missed_heartbeats"), + false, &LeaderWatcher<I>::acquire_leader_lock, false); +} + +template <typename I> +void LeaderWatcher<I>::acquire_leader_lock() { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_timer_op_tracker.empty()); + + ++m_acquire_attempts; + dout(10) << "acquire_attempts=" << m_acquire_attempts << dendl; + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_acquire_leader_lock>(this)); + m_leader_lock->try_acquire_lock(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_acquire_leader_lock(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + ceph_assert(!m_timer_op_tracker.empty()); + + if (m_leader_lock->is_shutdown()) { + dout(10) << "canceling due to shutdown" << dendl; + m_timer_op_tracker.finish_op(); + return; + } + + if (r < 0) { + if (r == -EAGAIN) { + dout(10) << "already locked" << dendl; + } else { + derr << "error acquiring lock: " << cpp_strerror(r) << dendl; + } + + get_locker(); + return; + } + + m_locker = {}; + m_acquire_attempts = 0; + + if (m_ret_val) { + dout(5) << "releasing due to error on notify" << 
dendl; + release_leader_lock(); + m_timer_op_tracker.finish_op(); + return; + } + + notify_heartbeat(); +} + +template <typename I> +void LeaderWatcher<I>::release_leader_lock() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_release_leader_lock>(this)); + + m_leader_lock->release_lock(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_release_leader_lock(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error releasing lock: " << cpp_strerror(r) << dendl; + return; + } + + schedule_acquire_leader_lock(1); +} + +template <typename I> +void LeaderWatcher<I>::init_status_watcher() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_status_watcher == nullptr); + + m_status_watcher = MirrorStatusWatcher<I>::create(m_ioctx, m_work_queue); + + Context *ctx = create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_init_status_watcher>(this); + + m_status_watcher->init(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_init_status_watcher(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error initializing mirror status watcher: " << cpp_strerror(r) + << cpp_strerror(r) << dendl; + } else { + schedule_acquire_leader_lock(0); + } + + ceph_assert(m_on_finish != nullptr); + std::swap(on_finish, m_on_finish); + } + + on_finish->complete(r); +} + +template <typename I> +void LeaderWatcher<I>::shut_down_status_watcher() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_status_watcher != nullptr); + + Context *ctx = create_async_context_callback( + m_work_queue, 
create_context_callback<LeaderWatcher<I>, + &LeaderWatcher<I>::handle_shut_down_status_watcher>(this)); + + m_status_watcher->shut_down(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_shut_down_status_watcher(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + m_status_watcher->destroy(); + m_status_watcher = nullptr; + + if (r < 0) { + derr << "error shutting mirror status watcher down: " << cpp_strerror(r) + << dendl; + } + + unregister_watch(); +} + +template <typename I> +void LeaderWatcher<I>::init_instances() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_instances == nullptr); + + m_instances = Instances<I>::create(m_threads, m_ioctx, m_instance_id, + m_instances_listener); + + Context *ctx = create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_init_instances>(this); + + m_instances->init(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_init_instances(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + if (r < 0) { + Mutex::Locker locker(m_lock); + derr << "error initializing instances: " << cpp_strerror(r) << dendl; + m_instances->destroy(); + m_instances = nullptr; + + ceph_assert(m_on_finish != nullptr); + std::swap(m_on_finish, on_finish); + } else { + Mutex::Locker locker(m_lock); + notify_listener(); + return; + } + + on_finish->complete(r); +} + +template <typename I> +void LeaderWatcher<I>::shut_down_instances() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + ceph_assert(m_instances != nullptr); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback<LeaderWatcher<I>, + &LeaderWatcher<I>::handle_shut_down_instances>(this)); + + m_instances->shut_down(ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_shut_down_instances(int r) { + dout(10) << "r=" << r << dendl; + ceph_assert(r == 0); + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + + 
m_instances->destroy(); + m_instances = nullptr; + + ceph_assert(m_on_finish != nullptr); + std::swap(m_on_finish, on_finish); + } + on_finish->complete(r); +} + +template <typename I> +void LeaderWatcher<I>::notify_listener() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_async_context_callback( + m_work_queue, create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_listener>(this)); + + if (is_leader(m_lock)) { + ctx = new FunctionContext( + [this, ctx](int r) { + m_listener->post_acquire_handler(ctx); + }); + } else { + ctx = new FunctionContext( + [this, ctx](int r) { + m_listener->pre_release_handler(ctx); + }); + } + m_work_queue->queue(ctx, 0); +} + +template <typename I> +void LeaderWatcher<I>::handle_notify_listener(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker locker(m_lock); + + if (r < 0) { + derr << "error notifying listener: " << cpp_strerror(r) << dendl; + m_ret_val = r; + } + + if (is_leader(m_lock)) { + notify_lock_acquired(); + } else { + shut_down_instances(); + } +} + +template <typename I> +void LeaderWatcher<I>::notify_lock_acquired() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_lock_acquired>(this); + + bufferlist bl; + encode(NotifyMessage{LockAcquiredPayload{}}, bl); + + send_notify(bl, nullptr, ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_notify_lock_acquired(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + if (r < 0 && r != -ETIMEDOUT) { + derr << "error notifying leader lock acquired: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + } + + ceph_assert(m_on_finish != nullptr); + std::swap(m_on_finish, on_finish); + + if (m_ret_val == 0) { + // listener should be ready for instance add/remove events now + m_instances->unblock_listener(); + } + } + 
on_finish->complete(0); +} + +template <typename I> +void LeaderWatcher<I>::notify_lock_released() { + dout(10) << dendl; + + ceph_assert(m_lock.is_locked()); + + Context *ctx = create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_lock_released>(this); + + bufferlist bl; + encode(NotifyMessage{LockReleasedPayload{}}, bl); + + send_notify(bl, nullptr, ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_notify_lock_released(int r) { + dout(10) << "r=" << r << dendl; + + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + if (r < 0 && r != -ETIMEDOUT) { + derr << "error notifying leader lock released: " << cpp_strerror(r) + << dendl; + } + + ceph_assert(m_on_finish != nullptr); + std::swap(m_on_finish, on_finish); + } + on_finish->complete(r); +} + +template <typename I> +void LeaderWatcher<I>::notify_heartbeat() { + dout(10) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_lock.is_locked()); + ceph_assert(!m_timer_op_tracker.empty()); + + if (!is_leader(m_lock)) { + dout(5) << "not leader, canceling" << dendl; + m_timer_op_tracker.finish_op(); + return; + } + + Context *ctx = create_context_callback< + LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_heartbeat>(this); + + bufferlist bl; + encode(NotifyMessage{HeartbeatPayload{}}, bl); + + m_heartbeat_response.acks.clear(); + send_notify(bl, &m_heartbeat_response, ctx); +} + +template <typename I> +void LeaderWatcher<I>::handle_notify_heartbeat(int r) { + dout(10) << "r=" << r << dendl; + + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + ceph_assert(!m_timer_op_tracker.empty()); + + m_timer_op_tracker.finish_op(); + if (m_leader_lock->is_shutdown()) { + dout(10) << "canceling due to shutdown" << dendl; + return; + } else if (!is_leader(m_lock)) { + return; + } + + if (r < 0 && r != -ETIMEDOUT) { + derr << "error notifying heartbeat: " << cpp_strerror(r) + << ", releasing leader" << dendl; + 
release_leader_lock(); + return; + } + + dout(10) << m_heartbeat_response.acks.size() << " acks received, " + << m_heartbeat_response.timeouts.size() << " timed out" << dendl; + + std::vector<std::string> instance_ids; + for (auto &it: m_heartbeat_response.acks) { + uint64_t notifier_id = it.first.gid; + instance_ids.push_back(stringify(notifier_id)); + } + if (!instance_ids.empty()) { + m_instances->acked(instance_ids); + } + + schedule_timer_task("heartbeat", 1, true, + &LeaderWatcher<I>::notify_heartbeat, false); +} + +template <typename I> +void LeaderWatcher<I>::handle_heartbeat(Context *on_notify_ack) { + dout(10) << dendl; + + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + if (is_leader(m_lock)) { + dout(5) << "got another leader heartbeat, ignoring" << dendl; + } else { + cancel_timer_task(); + m_acquire_attempts = 0; + schedule_acquire_leader_lock(1); + } + } + + on_notify_ack->complete(0); +} + +template <typename I> +void LeaderWatcher<I>::handle_lock_acquired(Context *on_notify_ack) { + dout(10) << dendl; + + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + if (is_leader(m_lock)) { + dout(5) << "got another leader lock_acquired, ignoring" << dendl; + } else { + cancel_timer_task(); + schedule_get_locker(true, 0); + } + } + + on_notify_ack->complete(0); +} + +template <typename I> +void LeaderWatcher<I>::handle_lock_released(Context *on_notify_ack) { + dout(10) << dendl; + + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + if (is_leader(m_lock)) { + dout(5) << "got another leader lock_released, ignoring" << dendl; + } else { + cancel_timer_task(); + schedule_get_locker(true, 0); + } + } + + on_notify_ack->complete(0); +} + +template <typename I> +void LeaderWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + dout(10) << "notify_id=" << notify_id << ", handle=" << handle << ", " + 
<< "notifier_id=" << notifier_id << dendl; + + Context *ctx = new C_NotifyAck(this, notify_id, handle); + + if (notifier_id == m_notifier_id) { + dout(10) << "our own notification, ignoring" << dendl; + ctx->complete(0); + return; + } + + NotifyMessage notify_message; + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + derr << "error decoding image notification: " << err.what() << dendl; + ctx->complete(0); + return; + } + + apply_visitor(HandlePayloadVisitor(this, ctx), notify_message.payload); +} + +template <typename I> +void LeaderWatcher<I>::handle_rewatch_complete(int r) { + dout(5) << "r=" << r << dendl; + + if (r == -EBLACKLISTED) { + dout(1) << "blacklisted detected" << dendl; + m_blacklisted = true; + return; + } + + m_leader_lock->reacquire_lock(nullptr); +} + +template <typename I> +void LeaderWatcher<I>::handle_payload(const HeartbeatPayload &payload, + Context *on_notify_ack) { + dout(10) << "heartbeat" << dendl; + + handle_heartbeat(on_notify_ack); +} + +template <typename I> +void LeaderWatcher<I>::handle_payload(const LockAcquiredPayload &payload, + Context *on_notify_ack) { + dout(10) << "lock_acquired" << dendl; + + handle_lock_acquired(on_notify_ack); +} + +template <typename I> +void LeaderWatcher<I>::handle_payload(const LockReleasedPayload &payload, + Context *on_notify_ack) { + dout(10) << "lock_released" << dendl; + + handle_lock_released(on_notify_ack); +} + +template <typename I> +void LeaderWatcher<I>::handle_payload(const UnknownPayload &payload, + Context *on_notify_ack) { + dout(10) << "unknown" << dendl; + + on_notify_ack->complete(0); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::LeaderWatcher<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/LeaderWatcher.h b/src/tools/rbd_mirror/LeaderWatcher.h new file mode 100644 index 00000000..01ee0565 --- /dev/null +++ b/src/tools/rbd_mirror/LeaderWatcher.h @@ -0,0 +1,320 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_LEADER_WATCHER_H +#define CEPH_RBD_MIRROR_LEADER_WATCHER_H + +#include <list> +#include <memory> +#include <string> + +#include "common/AsyncOpTracker.h" +#include "librbd/ManagedLock.h" +#include "librbd/Watcher.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/watcher/Types.h" +#include "Instances.h" +#include "MirrorStatusWatcher.h" +#include "tools/rbd_mirror/instances/Types.h" +#include "tools/rbd_mirror/leader_watcher/Types.h" + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +template <typename ImageCtxT = librbd::ImageCtx> +class LeaderWatcher : protected librbd::Watcher { + using librbd::Watcher::unregister_watch; // Silence overloaded virtual warning +public: + static LeaderWatcher* create(Threads<ImageCtxT> *threads, + librados::IoCtx &io_ctx, + leader_watcher::Listener *listener) { + return new LeaderWatcher(threads, io_ctx, listener); + } + + LeaderWatcher(Threads<ImageCtxT> *threads, librados::IoCtx &io_ctx, + leader_watcher::Listener *listener); + ~LeaderWatcher() override; + + int init(); + void shut_down(); + + void init(Context *on_finish); + void shut_down(Context *on_finish); + + bool is_blacklisted() const; + bool is_leader() const; + bool is_releasing_leader() const; + bool get_leader_instance_id(std::string *instance_id) const; + void release_leader(); + void list_instances(std::vector<std::string> *instance_ids); + + std::string get_instance_id(); + +private: + /** + * @verbatim + * + * <uninitialized> <------------------------------ WAIT_FOR_TASKS + * | (init) ^ ^ + * v * | + * CREATE_OBJECT * * * * * (error) UNREGISTER_WATCH + * | * ^ + * v * | + * REGISTER_WATCH * * * * * SHUT_DOWN_STATUS_WATCHER + * | * ^ + * v * | + * INIT_STATUS_WATCHER * * SHUT_DOWN_LEADER_LOCK + * | | + * | (no leader heartbeat and acquire failed) | + * | BREAK_LOCK 
<-------------------------------------\ | + * | | (no leader heartbeat) | | (shut down) + * | | /----------------------------------------\ | | + * | | | (lock_released received) | | + * | | | /-------------------------------------\ | | + * | | | | (lock_acquired or | | | + * | | | | heartbeat received) | | | + * | | | | (ENOENT) /-----------\ | | | + * | | | | * * * * * * * * * * | | | | | + * v v v v v (error) * v | | | | + * ACQUIRE_LEADER_LOCK * * * * *> GET_LOCKER ---> <secondary> + * | * ^ + * ....|...................*.................... .....|..................... + * . v * . . | post_release . + * .INIT_INSTANCES * * * * * . .NOTIFY_LOCK_RELEASED . + * . | . .....^..................... + * . v . | + * .NOTIFY_LISTENER . RELEASE_LEADER_LOCK + * . | . ^ + * . v . .....|..................... + * .NOTIFY_LOCK_ACQUIRED . . | . + * . | post_acquire . .SHUT_DOWN_INSTANCES . + * ....|........................................ . ^ . + * v . | . + * <leader> -----------------------------------> .NOTIFY_LISTENER . + * (shut_down, release_leader, . pre_release . + * notify error) ........................... 
+ * @endverbatim + */ + + struct InstancesListener : public instances::Listener { + LeaderWatcher* leader_watcher; + + InstancesListener(LeaderWatcher* leader_watcher) + : leader_watcher(leader_watcher) { + } + + void handle_added(const InstanceIds& instance_ids) override { + leader_watcher->m_listener->handle_instances_added(instance_ids); + } + + void handle_removed(const InstanceIds& instance_ids) override { + leader_watcher->m_listener->handle_instances_removed(instance_ids); + } + }; + + class LeaderLock : public librbd::ManagedLock<ImageCtxT> { + public: + typedef librbd::ManagedLock<ImageCtxT> Parent; + + LeaderLock(librados::IoCtx& ioctx, ContextWQ *work_queue, + const std::string& oid, LeaderWatcher *watcher, + bool blacklist_on_break_lock, + uint32_t blacklist_expire_seconds) + : Parent(ioctx, work_queue, oid, watcher, librbd::managed_lock::EXCLUSIVE, + blacklist_on_break_lock, blacklist_expire_seconds), + watcher(watcher) { + } + + bool is_leader() const { + Mutex::Locker locker(Parent::m_lock); + return Parent::is_state_post_acquiring() || Parent::is_state_locked(); + } + + bool is_releasing_leader() const { + Mutex::Locker locker(Parent::m_lock); + return Parent::is_state_pre_releasing(); + } + + protected: + void post_acquire_lock_handler(int r, Context *on_finish) { + if (r == 0) { + // lock is owned at this point + Mutex::Locker locker(Parent::m_lock); + Parent::set_state_post_acquiring(); + } + watcher->handle_post_acquire_leader_lock(r, on_finish); + } + void pre_release_lock_handler(bool shutting_down, + Context *on_finish) { + watcher->handle_pre_release_leader_lock(on_finish); + } + void post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) { + watcher->handle_post_release_leader_lock(r, on_finish); + } + private: + LeaderWatcher *watcher; + }; + + struct HandlePayloadVisitor : public boost::static_visitor<void> { + LeaderWatcher *leader_watcher; + Context *on_notify_ack; + + HandlePayloadVisitor(LeaderWatcher 
*leader_watcher, Context *on_notify_ack) + : leader_watcher(leader_watcher), on_notify_ack(on_notify_ack) { + } + + template <typename Payload> + inline void operator()(const Payload &payload) const { + leader_watcher->handle_payload(payload, on_notify_ack); + } + }; + + struct C_GetLocker : public Context { + LeaderWatcher *leader_watcher; + librbd::managed_lock::Locker locker; + + C_GetLocker(LeaderWatcher *leader_watcher) + : leader_watcher(leader_watcher) { + } + + void finish(int r) override { + leader_watcher->handle_get_locker(r, locker); + } + }; + + typedef void (LeaderWatcher<ImageCtxT>::*TimerCallback)(); + + struct C_TimerGate : public Context { + LeaderWatcher *leader_watcher; + + bool leader = false; + TimerCallback timer_callback = nullptr; + + C_TimerGate(LeaderWatcher *leader_watcher) + : leader_watcher(leader_watcher) { + } + + void finish(int r) override { + leader_watcher->m_timer_gate = nullptr; + leader_watcher->execute_timer_task(leader, timer_callback); + } + }; + + Threads<ImageCtxT> *m_threads; + leader_watcher::Listener *m_listener; + + InstancesListener m_instances_listener; + mutable Mutex m_lock; + uint64_t m_notifier_id; + std::string m_instance_id; + LeaderLock *m_leader_lock; + Context *m_on_finish = nullptr; + Context *m_on_shut_down_finish = nullptr; + uint64_t m_acquire_attempts = 0; + int m_ret_val = 0; + MirrorStatusWatcher<ImageCtxT> *m_status_watcher = nullptr; + Instances<ImageCtxT> *m_instances = nullptr; + librbd::managed_lock::Locker m_locker; + + bool m_blacklisted = false; + + AsyncOpTracker m_timer_op_tracker; + Context *m_timer_task = nullptr; + C_TimerGate *m_timer_gate = nullptr; + + librbd::watcher::NotifyResponse m_heartbeat_response; + + bool is_leader(Mutex &m_lock) const; + bool is_releasing_leader(Mutex &m_lock) const; + + void cancel_timer_task(); + void schedule_timer_task(const std::string &name, + int delay_factor, bool leader, + TimerCallback callback, bool shutting_down); + void execute_timer_task(bool 
leader, TimerCallback timer_callback); + + void create_leader_object(); + void handle_create_leader_object(int r); + + void register_watch(); + void handle_register_watch(int r); + + void shut_down_leader_lock(); + void handle_shut_down_leader_lock(int r); + + void unregister_watch(); + void handle_unregister_watch(int r); + + void wait_for_tasks(); + void handle_wait_for_tasks(); + + void break_leader_lock(); + void handle_break_leader_lock(int r); + + void schedule_get_locker(bool reset_leader, uint32_t delay_factor); + void get_locker(); + void handle_get_locker(int r, librbd::managed_lock::Locker& locker); + + void schedule_acquire_leader_lock(uint32_t delay_factor); + void acquire_leader_lock(); + void handle_acquire_leader_lock(int r); + + void release_leader_lock(); + void handle_release_leader_lock(int r); + + void init_status_watcher(); + void handle_init_status_watcher(int r); + + void shut_down_status_watcher(); + void handle_shut_down_status_watcher(int r); + + void init_instances(); + void handle_init_instances(int r); + + void shut_down_instances(); + void handle_shut_down_instances(int r); + + void notify_listener(); + void handle_notify_listener(int r); + + void notify_lock_acquired(); + void handle_notify_lock_acquired(int r); + + void notify_lock_released(); + void handle_notify_lock_released(int r); + + void notify_heartbeat(); + void handle_notify_heartbeat(int r); + + void handle_post_acquire_leader_lock(int r, Context *on_finish); + void handle_pre_release_leader_lock(Context *on_finish); + void handle_post_release_leader_lock(int r, Context *on_finish); + + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; + + void handle_rewatch_complete(int r) override; + + void handle_heartbeat(Context *on_ack); + void handle_lock_acquired(Context *on_ack); + void handle_lock_released(Context *on_ack); + + void handle_payload(const leader_watcher::HeartbeatPayload &payload, + Context *on_notify_ack); 
+ void handle_payload(const leader_watcher::LockAcquiredPayload &payload, + Context *on_notify_ack); + void handle_payload(const leader_watcher::LockReleasedPayload &payload, + Context *on_notify_ack); + void handle_payload(const leader_watcher::UnknownPayload &payload, + Context *on_notify_ack); +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_LEADER_WATCHER_H diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc new file mode 100644 index 00000000..ef18a0b6 --- /dev/null +++ b/src/tools/rbd_mirror/Mirror.cc @@ -0,0 +1,448 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <signal.h> + +#include <boost/range/adaptor/map.hpp> + +#include "common/Formatter.h" +#include "common/admin_socket.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "Mirror.h" +#include "ServiceDaemon.h" +#include "Threads.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::Mirror: " << this << " " \ + << __func__ << ": " + +using std::list; +using std::map; +using std::set; +using std::string; +using std::unique_ptr; +using std::vector; + +using librados::Rados; +using librados::IoCtx; +using librbd::mirror_peer_t; + +namespace rbd { +namespace mirror { + +namespace { + +class MirrorAdminSocketCommand { +public: + virtual ~MirrorAdminSocketCommand() {} + virtual bool call(Formatter *f, stringstream *ss) = 0; +}; + +class StatusCommand : public MirrorAdminSocketCommand { +public: + explicit StatusCommand(Mirror *mirror) : mirror(mirror) {} + + bool call(Formatter *f, stringstream *ss) override { + mirror->print_status(f, ss); + return true; + } + +private: + Mirror *mirror; +}; + +class StartCommand : public MirrorAdminSocketCommand { +public: + explicit StartCommand(Mirror *mirror) : mirror(mirror) {} + + bool call(Formatter *f, 
stringstream *ss) override { + mirror->start(); + return true; + } + +private: + Mirror *mirror; +}; + +class StopCommand : public MirrorAdminSocketCommand { +public: + explicit StopCommand(Mirror *mirror) : mirror(mirror) {} + + bool call(Formatter *f, stringstream *ss) override { + mirror->stop(); + return true; + } + +private: + Mirror *mirror; +}; + +class RestartCommand : public MirrorAdminSocketCommand { +public: + explicit RestartCommand(Mirror *mirror) : mirror(mirror) {} + + bool call(Formatter *f, stringstream *ss) override { + mirror->restart(); + return true; + } + +private: + Mirror *mirror; +}; + +class FlushCommand : public MirrorAdminSocketCommand { +public: + explicit FlushCommand(Mirror *mirror) : mirror(mirror) {} + + bool call(Formatter *f, stringstream *ss) override { + mirror->flush(); + return true; + } + +private: + Mirror *mirror; +}; + +class LeaderReleaseCommand : public MirrorAdminSocketCommand { +public: + explicit LeaderReleaseCommand(Mirror *mirror) : mirror(mirror) {} + + bool call(Formatter *f, stringstream *ss) override { + mirror->release_leader(); + return true; + } + +private: + Mirror *mirror; +}; + +} // anonymous namespace + +class MirrorAdminSocketHook : public AdminSocketHook { +public: + MirrorAdminSocketHook(CephContext *cct, Mirror *mirror) : + admin_socket(cct->get_admin_socket()) { + std::string command; + int r; + + command = "rbd mirror status"; + r = admin_socket->register_command(command, command, this, + "get status for rbd mirror"); + if (r == 0) { + commands[command] = new StatusCommand(mirror); + } + + command = "rbd mirror start"; + r = admin_socket->register_command(command, command, this, + "start rbd mirror"); + if (r == 0) { + commands[command] = new StartCommand(mirror); + } + + command = "rbd mirror stop"; + r = admin_socket->register_command(command, command, this, + "stop rbd mirror"); + if (r == 0) { + commands[command] = new StopCommand(mirror); + } + + command = "rbd mirror restart"; + r = 
admin_socket->register_command(command, command, this, + "restart rbd mirror"); + if (r == 0) { + commands[command] = new RestartCommand(mirror); + } + + command = "rbd mirror flush"; + r = admin_socket->register_command(command, command, this, + "flush rbd mirror"); + if (r == 0) { + commands[command] = new FlushCommand(mirror); + } + + command = "rbd mirror leader release"; + r = admin_socket->register_command(command, command, this, + "release rbd mirror leader"); + if (r == 0) { + commands[command] = new LeaderReleaseCommand(mirror); + } + } + + ~MirrorAdminSocketHook() override { + for (Commands::const_iterator i = commands.begin(); i != commands.end(); + ++i) { + (void)admin_socket->unregister_command(i->first); + delete i->second; + } + } + + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + Commands::const_iterator i = commands.find(command); + ceph_assert(i != commands.end()); + Formatter *f = Formatter::create(format); + stringstream ss; + bool r = i->second->call(f, &ss); + delete f; + out.append(ss); + return r; + } + +private: + typedef std::map<std::string, MirrorAdminSocketCommand*, std::less<>> Commands; + + AdminSocket *admin_socket; + Commands commands; +}; + +Mirror::Mirror(CephContext *cct, const std::vector<const char*> &args) : + m_cct(cct), + m_args(args), + m_lock("rbd::mirror::Mirror"), + m_local(new librados::Rados()), + m_asok_hook(new MirrorAdminSocketHook(cct, this)) +{ + m_threads = + &(cct->lookup_or_create_singleton_object<Threads<librbd::ImageCtx>>( + "rbd_mirror::threads", false, cct)); + m_service_daemon.reset(new ServiceDaemon<>(m_cct, m_local, m_threads)); +} + +Mirror::~Mirror() +{ + delete m_asok_hook; +} + +void Mirror::handle_signal(int signum) +{ + dout(20) << signum << dendl; + + Mutex::Locker l(m_lock); + + switch (signum) { + case SIGHUP: + for (auto &it : m_pool_replayers) { + it.second->reopen_logs(); + } + g_ceph_context->reopen_logs(); + break; + + 
case SIGINT: + case SIGTERM: + m_stopping = true; + m_cond.Signal(); + break; + + default: + ceph_abort_msgf("unexpected signal %d", signum); + } +} + +int Mirror::init() +{ + int r = m_local->init_with_context(m_cct); + if (r < 0) { + derr << "could not initialize rados handle" << dendl; + return r; + } + + r = m_local->connect(); + if (r < 0) { + derr << "error connecting to local cluster" << dendl; + return r; + } + + r = m_service_daemon->init(); + if (r < 0) { + derr << "error registering service daemon: " << cpp_strerror(r) << dendl; + return r; + } + + m_local_cluster_watcher.reset(new ClusterWatcher(m_local, m_lock, + m_service_daemon.get())); + return r; +} + +void Mirror::run() +{ + dout(20) << "enter" << dendl; + while (!m_stopping) { + m_local_cluster_watcher->refresh_pools(); + Mutex::Locker l(m_lock); + if (!m_manual_stop) { + update_pool_replayers(m_local_cluster_watcher->get_pool_peers()); + } + m_cond.WaitInterval( + m_lock, + utime_t(m_cct->_conf.get_val<uint64_t>("rbd_mirror_pool_replayers_refresh_interval"), 0)); + } + + // stop all pool replayers in parallel + Mutex::Locker locker(m_lock); + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->stop(false); + } + dout(20) << "return" << dendl; +} + +void Mirror::print_status(Formatter *f, stringstream *ss) +{ + dout(20) << "enter" << dendl; + + Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + if (f) { + f->open_object_section("mirror_status"); + f->open_array_section("pool_replayers"); + }; + + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->print_status(f, ss); + } + + if (f) { + f->close_section(); + f->close_section(); + f->flush(*ss); + } +} + +void Mirror::start() +{ + dout(20) << "enter" << dendl; + Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + m_manual_stop = false; + + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->start(); + } +} + +void Mirror::stop() +{ + dout(20) << "enter" << dendl; + 
Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + m_manual_stop = true; + + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->stop(true); + } +} + +void Mirror::restart() +{ + dout(20) << "enter" << dendl; + Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + m_manual_stop = false; + + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->restart(); + } +} + +void Mirror::flush() +{ + dout(20) << "enter" << dendl; + Mutex::Locker l(m_lock); + + if (m_stopping || m_manual_stop) { + return; + } + + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->flush(); + } +} + +void Mirror::release_leader() +{ + dout(20) << "enter" << dendl; + Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + for (auto &pool_replayer : m_pool_replayers) { + pool_replayer.second->release_leader(); + } +} + +void Mirror::update_pool_replayers(const PoolPeers &pool_peers) +{ + dout(20) << "enter" << dendl; + ceph_assert(m_lock.is_locked()); + + // remove stale pool replayers before creating new pool replayers + for (auto it = m_pool_replayers.begin(); it != m_pool_replayers.end();) { + auto &peer = it->first.second; + auto pool_peer_it = pool_peers.find(it->first.first); + if (pool_peer_it == pool_peers.end() || + pool_peer_it->second.find(peer) == pool_peer_it->second.end()) { + dout(20) << "removing pool replayer for " << peer << dendl; + // TODO: make async + it->second->shut_down(); + it = m_pool_replayers.erase(it); + } else { + ++it; + } + } + + for (auto &kv : pool_peers) { + for (auto &peer : kv.second) { + PoolPeer pool_peer(kv.first, peer); + + auto pool_replayers_it = m_pool_replayers.find(pool_peer); + if (pool_replayers_it != m_pool_replayers.end()) { + auto& pool_replayer = pool_replayers_it->second; + if (pool_replayer->is_blacklisted()) { + derr << "restarting blacklisted pool replayer for " << peer << dendl; + // TODO: make async + pool_replayer->shut_down(); + 
pool_replayer->init(); + } else if (!pool_replayer->is_running()) { + derr << "restarting failed pool replayer for " << peer << dendl; + // TODO: make async + pool_replayer->shut_down(); + pool_replayer->init(); + } + } else { + dout(20) << "starting pool replayer for " << peer << dendl; + unique_ptr<PoolReplayer<>> pool_replayer(new PoolReplayer<>( + m_threads, m_service_daemon.get(), kv.first, peer, m_args)); + + // TODO: make async + pool_replayer->init(); + m_pool_replayers.emplace(pool_peer, std::move(pool_replayer)); + } + } + + // TODO currently only support a single peer + } +} + +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/Mirror.h b/src/tools/rbd_mirror/Mirror.h new file mode 100644 index 00000000..153c0bc5 --- /dev/null +++ b/src/tools/rbd_mirror/Mirror.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_H +#define CEPH_RBD_MIRROR_H + +#include "common/ceph_context.h" +#include "common/Mutex.h" +#include "include/rados/librados.hpp" +#include "ClusterWatcher.h" +#include "PoolReplayer.h" +#include "tools/rbd_mirror/Types.h" + +#include <set> +#include <map> +#include <memory> +#include <atomic> + +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> struct ServiceDaemon; +template <typename> struct Threads; +class MirrorAdminSocketHook; + +/** + * Contains the main loop and overall state for rbd-mirror. + * + * Sets up mirroring, and coordinates between noticing config + * changes and applying them. 
+ */ +class Mirror { +public: + Mirror(CephContext *cct, const std::vector<const char*> &args); + /* copy construction and copy assignment are disabled */ Mirror(const Mirror&) = delete; + Mirror& operator=(const Mirror&) = delete; + ~Mirror(); + + int init(); + void run(); + void handle_signal(int signum); + + /* the operations below are the targets of the "rbd mirror ..." admin socket commands registered by MirrorAdminSocketHook */ void print_status(Formatter *f, stringstream *ss); + void start(); + void stop(); + void restart(); + void flush(); + void release_leader(); + +private: + typedef ClusterWatcher::PoolPeers PoolPeers; + /* key of m_pool_replayers */ typedef std::pair<int64_t, PeerSpec> PoolPeer; + + /* must be called with m_lock held (asserted in the definition) */ void update_pool_replayers(const PoolPeers &pool_peers); + + CephContext *m_cct; + std::vector<const char*> m_args; + Threads<librbd::ImageCtx> *m_threads = nullptr; + /* taken by the public operations while they walk m_pool_replayers */ Mutex m_lock; + /* signalled by handle_signal(); run() waits on it between refreshes */ Cond m_cond; + RadosRef m_local; + std::unique_ptr<ServiceDaemon<librbd::ImageCtx>> m_service_daemon; + + // monitor local cluster for config changes in peers + std::unique_ptr<ClusterWatcher> m_local_cluster_watcher; + std::map<PoolPeer, std::unique_ptr<PoolReplayer<>>> m_pool_replayers; + /* set by handle_signal() on SIGINT or SIGTERM; ends the run() loop */ std::atomic<bool> m_stopping = { false }; + /* true after stop(), cleared by start() and restart(); while set, run() skips update_pool_replayers() and flush() does nothing */ bool m_manual_stop = false; + /* allocated in the constructor and deleted in the destructor */ MirrorAdminSocketHook *m_asok_hook; +}; + +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_H diff --git a/src/tools/rbd_mirror/MirrorStatusWatcher.cc b/src/tools/rbd_mirror/MirrorStatusWatcher.cc new file mode 100644 index 00000000..b935bc5c --- /dev/null +++ b/src/tools/rbd_mirror/MirrorStatusWatcher.cc @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MirrorStatusWatcher.h" +#include "common/debug.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::MirrorStatusWatcher: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { + +using librbd::util::create_rados_callback; + +template <typename I> 
+/* Constructs the watcher on the RBD_MIRRORING object of the given io context. */ MirrorStatusWatcher<I>::MirrorStatusWatcher(librados::IoCtx &io_ctx, + ContextWQ *work_queue) + : Watcher(io_ctx, work_queue, RBD_MIRRORING) { +} + +template <typename I> +MirrorStatusWatcher<I>::~MirrorStatusWatcher() { +} + +template <typename I> +/* Asynchronously issues mirror_image_status_remove_down against RBD_MIRRORING and, if that succeeds, registers the watch. on_finish is completed with the error when the removal fails, otherwise it is handed on to register_watch(). */ void MirrorStatusWatcher<I>::init(Context *on_finish) { + dout(20) << dendl; + + /* wrap the caller's callback: the lambda captures the original on_finish by value before the variable is rebound to the wrapper */ on_finish = new FunctionContext( + [this, on_finish] (int r) { + if (r < 0) { + derr << "error removing down statuses: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + register_watch(on_finish); + }); + + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_image_status_remove_down(&op); + librados::AioCompletion *aio_comp = create_rados_callback(on_finish); + + int r = m_ioctx.aio_operate(RBD_MIRRORING, aio_comp, &op); + /* submission itself is expected to succeed; the operation result arrives through the wrapper above */ ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +/* Shuts the watcher down by unregistering the watch; on_finish is passed straight to unregister_watch(). */ void MirrorStatusWatcher<I>::shut_down(Context *on_finish) { + dout(20) << dendl; + + unregister_watch(on_finish); +} + +template <typename I> +/* Notification handler: the payload in bl is not inspected, the notification is simply acknowledged with an empty bufferlist. */ void MirrorStatusWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, + bufferlist &bl) { + dout(20) << dendl; + + bufferlist out; + acknowledge_notify(notify_id, handle, out); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::MirrorStatusWatcher<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/MirrorStatusWatcher.h b/src/tools/rbd_mirror/MirrorStatusWatcher.h new file mode 100644 index 00000000..155f8cc8 --- /dev/null +++ b/src/tools/rbd_mirror/MirrorStatusWatcher.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H +#define CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H + +#include "librbd/Watcher.h" + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename ImageCtxT = librbd::ImageCtx> +class MirrorStatusWatcher : protected librbd::Watcher { +public: + 
static MirrorStatusWatcher *create(librados::IoCtx &io_ctx,
+                                     ContextWQ *work_queue) {
+    // heap-allocates the watcher; pair with destroy()
+    return new MirrorStatusWatcher(io_ctx, work_queue);
+  }
+  void destroy() {
+    delete this;
+  }
+
+  MirrorStatusWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue);
+  ~MirrorStatusWatcher() override;
+
+  // asynchronous start-up / tear-down; completion is reported via on_finish
+  void init(Context *on_finish);
+  void shut_down(Context *on_finish);
+
+protected:
+  void handle_notify(uint64_t notify_id, uint64_t handle,
+                     uint64_t notifier_id, bufferlist &bl) override;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
new file mode 100644
index 00000000..35d32eb5
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolReplayer.cc
@@ -0,0 +1,1133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PoolReplayer.h"
+#include <boost/bind.hpp>
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/code_environment.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "global/global_context.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/Watcher.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Mirror.h"
+#include "ImageMap.h"
+#include "InstanceReplayer.h"
+#include "InstanceWatcher.h"
+#include "LeaderWatcher.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PoolReplayer: " \
+                           << this << " " << __func__ << ": "
+
+using std::chrono::seconds;
+using std::map;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using librbd::cls_client::dir_get_name;
+using
librbd::util::create_async_context_callback;
+
+namespace rbd {
+namespace mirror {
+
+using ::operator<<;
+
+namespace {
+
+// attribute keys published through the service daemon
+const std::string SERVICE_DAEMON_INSTANCE_ID_KEY("instance_id");
+const std::string SERVICE_DAEMON_LEADER_KEY("leader");
+const std::string SERVICE_DAEMON_LOCAL_COUNT_KEY("image_local_count");
+const std::string SERVICE_DAEMON_REMOTE_COUNT_KEY("image_remote_count");
+
+// config options whose values init_rados() captures and later restores when
+// it is asked to strip cluster overrides
+const std::vector<std::string> UNIQUE_PEER_CONFIG_KEYS {
+  {"monmap", "mon_host", "mon_dns_srv_name", "key", "keyfile", "keyring"}};
+
+// Base class for one admin-socket command bound to a PoolReplayer.
+// call() runs the command and returns whether it succeeded.
+template <typename I>
+class PoolReplayerAdminSocketCommand {
+public:
+  // explicit, like every derived command's constructor, so a bare
+  // PoolReplayer pointer never converts into a command implicitly
+  explicit PoolReplayerAdminSocketCommand(PoolReplayer<I> *pool_replayer)
+    : pool_replayer(pool_replayer) {
+  }
+  virtual ~PoolReplayerAdminSocketCommand() {}
+  virtual bool call(Formatter *f, stringstream *ss) = 0;
+protected:
+  PoolReplayer<I> *pool_replayer;
+};
+
+// Forwards to PoolReplayer::print_status().
+template <typename I>
+class StatusCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit StatusCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  bool call(Formatter *f, stringstream *ss) override {
+    this->pool_replayer->print_status(f, ss);
+    return true;
+  }
+};
+
+// Forwards to PoolReplayer::start().
+template <typename I>
+class StartCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit StartCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  bool call(Formatter *f, stringstream *ss) override {
+    this->pool_replayer->start();
+    return true;
+  }
+};
+
+// Forwards to PoolReplayer::stop() as a manual stop.
+template <typename I>
+class StopCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit StopCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  bool call(Formatter *f, stringstream *ss) override {
+    this->pool_replayer->stop(true);
+    return true;
+  }
+};
+
+template <typename I>
+class RestartCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit RestartCommand(PoolReplayer<I>
*pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  bool call(Formatter *f, stringstream *ss) override {
+    this->pool_replayer->restart();
+    return true;
+  }
+};
+
+// Forwards to PoolReplayer::flush().
+template <typename I>
+class FlushCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit FlushCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  bool call(Formatter *f, stringstream *ss) override {
+    this->pool_replayer->flush();
+    return true;
+  }
+};
+
+// Forwards to PoolReplayer::release_leader().
+template <typename I>
+class LeaderReleaseCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit LeaderReleaseCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  bool call(Formatter *f, stringstream *ss) override {
+    this->pool_replayer->release_leader();
+    return true;
+  }
+};
+
+// Registers the "rbd mirror <verb> <name>" commands with the context's
+// admin socket. A command object is only stored when its registration
+// returned 0, so a failed registration leaves no entry in the map.
+template <typename I>
+class PoolReplayerAdminSocketHook : public AdminSocketHook {
+public:
+  PoolReplayerAdminSocketHook(CephContext *cct, const std::string &name,
+                              PoolReplayer<I> *pool_replayer)
+    : admin_socket(cct->get_admin_socket()) {
+    std::string command;
+    int r;
+
+    command = "rbd mirror status " + name;
+    r = admin_socket->register_command(command, command, this,
+                                       "get status for rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new StatusCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror start " + name;
+    r = admin_socket->register_command(command, command, this,
+                                       "start rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new StartCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror stop " + name;
+    r = admin_socket->register_command(command, command, this,
+                                       "stop rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new StopCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror restart " + name;
+    r = admin_socket->register_command(command, command, this,
+                                       "restart rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new
RestartCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror flush " + name;
+    r = admin_socket->register_command(command, command, this,
+                                       "flush rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new FlushCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror leader release " + name;
+    r = admin_socket->register_command(command, command, this,
+                                       "release rbd mirror leader " + name);
+    if (r == 0) {
+      commands[command] = new LeaderReleaseCommand<I>(pool_replayer);
+    }
+  }
+
+  // Unregisters every command that was stored and frees its handler.
+  ~PoolReplayerAdminSocketHook() override {
+    for (auto& entry : commands) {
+      (void)admin_socket->unregister_command(entry.first);
+      delete entry.second;
+    }
+  }
+
+  // Dispatches an admin-socket command to its handler and appends whatever
+  // the handler wrote to the string stream onto `out`. The command must be
+  // one of the stored ones (asserted). Returns the handler's result.
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+            std::string_view format, bufferlist& out) override {
+    auto i = commands.find(command);
+    ceph_assert(i != commands.end());
+    // own the formatter so it is released on every exit path, including an
+    // exception thrown by the handler
+    std::unique_ptr<Formatter> f(Formatter::create(format));
+    stringstream ss;
+    bool r = i->second->call(f.get(), &ss);
+    out.append(ss);
+    return r;
+  }
+
+private:
+  // std::less<> allows lookups by std::string_view without a temporary
+  typedef std::map<std::string, PoolReplayerAdminSocketCommand<I>*,
+                   std::less<>> Commands;
+
+  AdminSocket *admin_socket;
+  Commands commands;
+};
+
+} // anonymous namespace
+
+template <typename I>
+PoolReplayer<I>::PoolReplayer(Threads<I> *threads,
+                              ServiceDaemon<I>* service_daemon,
+                              int64_t local_pool_id, const PeerSpec &peer,
+                              const std::vector<const char*> &args) :
+  m_threads(threads),
+  m_service_daemon(service_daemon),
+  m_local_pool_id(local_pool_id),
+  m_peer(peer),
+  m_args(args),
+  m_lock(stringify("rbd::mirror::PoolReplayer ") + stringify(peer)),
+  m_local_pool_watcher_listener(this, true),
+  m_remote_pool_watcher_listener(this, false),
+  m_image_map_listener(this),
+  m_pool_replayer_thread(this),
+  m_leader_listener(this)
+{
+}
+
+// The admin socket hook is deleted before shut_down() runs.
+template <typename I>
+PoolReplayer<I>::~PoolReplayer()
+{
+  delete m_asok_hook;
+  shut_down();
+}
+
+template <typename I>
+bool PoolReplayer<I>::is_blacklisted() const {
+  Mutex::Locker locker(m_lock);
+
return m_blacklisted;
+}
+
+// True only when a leader watcher exists and reports leadership; read under
+// m_lock.
+template <typename I>
+bool PoolReplayer<I>::is_leader() const {
+  Mutex::Locker locker(m_lock);
+  return m_leader_watcher && m_leader_watcher->is_leader();
+}
+
+// Reports whether the pool replayer thread has been started.
+template <typename I>
+void PoolReplayer<I>::init()
+bool PoolReplayer<I>::is_running() const {
+  return m_pool_replayer_thread.is_started();
+}
service_daemon::CALLOUT_LEVEL_ERROR, + "unable to query local mirror uuid"); + return; + } + + r = m_remote_rados->ioctx_create(m_local_io_ctx.get_pool_name().c_str(), + m_remote_io_ctx); + if (r < 0) { + derr << "error accessing remote pool " << m_local_io_ctx.get_pool_name() + << ": " << cpp_strerror(r) << dendl; + m_callout_id = m_service_daemon->add_or_update_callout( + m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_WARNING, + "unable to access remote pool"); + return; + } + + dout(10) << "connected to " << m_peer << dendl; + + m_instance_replayer.reset(InstanceReplayer<I>::create( + m_threads, m_service_daemon, m_local_rados, local_mirror_uuid, + m_local_pool_id)); + m_instance_replayer->init(); + m_instance_replayer->add_peer(m_peer.uuid, m_remote_io_ctx); + + m_instance_watcher.reset(InstanceWatcher<I>::create( + m_local_io_ctx, m_threads->work_queue, m_instance_replayer.get())); + r = m_instance_watcher->init(); + if (r < 0) { + derr << "error initializing instance watcher: " << cpp_strerror(r) << dendl; + m_callout_id = m_service_daemon->add_or_update_callout( + m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR, + "unable to initialize instance messenger object"); + return; + } + m_service_daemon->add_or_update_attribute( + m_local_pool_id, SERVICE_DAEMON_INSTANCE_ID_KEY, + m_instance_watcher->get_instance_id()); + + m_leader_watcher.reset(LeaderWatcher<I>::create(m_threads, m_local_io_ctx, + &m_leader_listener)); + r = m_leader_watcher->init(); + if (r < 0) { + derr << "error initializing leader watcher: " << cpp_strerror(r) << dendl; + m_callout_id = m_service_daemon->add_or_update_callout( + m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR, + "unable to initialize leader messenger object"); + return; + } + + if (m_callout_id != service_daemon::CALLOUT_ID_NONE) { + m_service_daemon->remove_callout(m_local_pool_id, m_callout_id); + m_callout_id = service_daemon::CALLOUT_ID_NONE; + } + + 
m_pool_replayer_thread.create("pool replayer"); +} + +template <typename I> +void PoolReplayer<I>::shut_down() { + m_stopping = true; + { + Mutex::Locker l(m_lock); + m_cond.Signal(); + } + if (m_pool_replayer_thread.is_started()) { + m_pool_replayer_thread.join(); + } + if (m_leader_watcher) { + m_leader_watcher->shut_down(); + } + if (m_instance_watcher) { + m_instance_watcher->shut_down(); + } + if (m_instance_replayer) { + m_instance_replayer->shut_down(); + } + + m_leader_watcher.reset(); + m_instance_watcher.reset(); + m_instance_replayer.reset(); + + ceph_assert(!m_image_map); + ceph_assert(!m_image_deleter); + ceph_assert(!m_local_pool_watcher); + ceph_assert(!m_remote_pool_watcher); + m_local_rados.reset(); + m_remote_rados.reset(); +} + +template <typename I> +int PoolReplayer<I>::init_rados(const std::string &cluster_name, + const std::string &client_name, + const std::string &mon_host, + const std::string &key, + const std::string &description, + RadosRef *rados_ref, + bool strip_cluster_overrides) { + // NOTE: manually bootstrap a CephContext here instead of via + // the librados API to avoid mixing global singletons between + // the librados shared library and the daemon + // TODO: eliminate intermingling of global singletons within Ceph APIs + CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT); + if (client_name.empty() || !iparams.name.from_str(client_name)) { + derr << "error initializing cluster handle for " << description << dendl; + return -EINVAL; + } + + CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + cct->_conf->cluster = cluster_name; + + // librados::Rados::conf_read_file + int r = cct->_conf.parse_config_files(nullptr, nullptr, 0); + if (r < 0 && r != -ENOENT) { + // do not treat this as fatal, it might still be able to connect + derr << "could not read ceph conf for " << description << ": " + << cpp_strerror(r) << dendl; + } + + // preserve cluster-specific config 
settings before applying environment/cli + // overrides + std::map<std::string, std::string> config_values; + if (strip_cluster_overrides) { + // remote peer connections shouldn't apply cluster-specific + // configuration settings + for (auto& key : UNIQUE_PEER_CONFIG_KEYS) { + config_values[key] = cct->_conf.get_val<std::string>(key); + } + } + + cct->_conf.parse_env(cct->get_module_type()); + + // librados::Rados::conf_parse_env + std::vector<const char*> args; + r = cct->_conf.parse_argv(args); + if (r < 0) { + derr << "could not parse environment for " << description << ":" + << cpp_strerror(r) << dendl; + cct->put(); + return r; + } + cct->_conf.parse_env(cct->get_module_type()); + + if (!m_args.empty()) { + // librados::Rados::conf_parse_argv + args = m_args; + r = cct->_conf.parse_argv(args); + if (r < 0) { + derr << "could not parse command line args for " << description << ": " + << cpp_strerror(r) << dendl; + cct->put(); + return r; + } + } + + if (strip_cluster_overrides) { + // remote peer connections shouldn't apply cluster-specific + // configuration settings + for (auto& pair : config_values) { + auto value = cct->_conf.get_val<std::string>(pair.first); + if (pair.second != value) { + dout(0) << "reverting global config option override: " + << pair.first << ": " << value << " -> " << pair.second + << dendl; + cct->_conf.set_val_or_die(pair.first, pair.second); + } + } + } + + if (!g_ceph_context->_conf->admin_socket.empty()) { + cct->_conf.set_val_or_die("admin_socket", + "$run_dir/$name.$pid.$cluster.$cctid.asok"); + } + + if (!mon_host.empty()) { + r = cct->_conf.set_val("mon_host", mon_host); + if (r < 0) { + derr << "failed to set mon_host config for " << description << ": " + << cpp_strerror(r) << dendl; + cct->put(); + return r; + } + } + + if (!key.empty()) { + r = cct->_conf.set_val("key", key); + if (r < 0) { + derr << "failed to set key config for " << description << ": " + << cpp_strerror(r) << dendl; + cct->put(); + return r; + } + } + + 
// disable unnecessary librbd cache + cct->_conf.set_val_or_die("rbd_cache", "false"); + cct->_conf.apply_changes(nullptr); + cct->_conf.complain_about_parse_errors(cct); + + rados_ref->reset(new librados::Rados()); + + r = (*rados_ref)->init_with_context(cct); + ceph_assert(r == 0); + cct->put(); + + r = (*rados_ref)->connect(); + if (r < 0) { + derr << "error connecting to " << description << ": " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template <typename I> +void PoolReplayer<I>::run() +{ + dout(20) << "enter" << dendl; + + while (!m_stopping) { + std::string asok_hook_name = m_local_io_ctx.get_pool_name() + " " + + m_peer.cluster_name; + if (m_asok_hook_name != asok_hook_name || m_asok_hook == nullptr) { + m_asok_hook_name = asok_hook_name; + delete m_asok_hook; + + m_asok_hook = new PoolReplayerAdminSocketHook<I>(g_ceph_context, + m_asok_hook_name, this); + } + + Mutex::Locker locker(m_lock); + if (m_leader_watcher->is_blacklisted() || + m_instance_replayer->is_blacklisted() || + (m_local_pool_watcher && m_local_pool_watcher->is_blacklisted()) || + (m_remote_pool_watcher && m_remote_pool_watcher->is_blacklisted())) { + m_blacklisted = true; + m_stopping = true; + break; + } + + if (!m_stopping) { + m_cond.WaitInterval(m_lock, utime_t(1, 0)); + } + } + + m_instance_replayer->stop(); +} + +template <typename I> +void PoolReplayer<I>::reopen_logs() +{ + Mutex::Locker l(m_lock); + + if (m_local_rados) { + reinterpret_cast<CephContext *>(m_local_rados->cct())->reopen_logs(); + } + if (m_remote_rados) { + reinterpret_cast<CephContext *>(m_remote_rados->cct())->reopen_logs(); + } +} + +template <typename I> +void PoolReplayer<I>::print_status(Formatter *f, stringstream *ss) +{ + dout(20) << "enter" << dendl; + + if (!f) { + return; + } + + Mutex::Locker l(m_lock); + + f->open_object_section("pool_replayer_status"); + f->dump_stream("peer") << m_peer; + if (m_local_io_ctx.is_valid()) { + f->dump_string("pool", 
m_local_io_ctx.get_pool_name()); + f->dump_stream("instance_id") << m_instance_watcher->get_instance_id(); + } + + std::string state("running"); + if (m_manual_stop) { + state = "stopped (manual)"; + } else if (m_stopping) { + state = "stopped"; + } + f->dump_string("state", state); + + std::string leader_instance_id; + m_leader_watcher->get_leader_instance_id(&leader_instance_id); + f->dump_string("leader_instance_id", leader_instance_id); + + bool leader = m_leader_watcher->is_leader(); + f->dump_bool("leader", leader); + if (leader) { + std::vector<std::string> instance_ids; + m_leader_watcher->list_instances(&instance_ids); + f->open_array_section("instances"); + for (auto instance_id : instance_ids) { + f->dump_string("instance_id", instance_id); + } + f->close_section(); + } + + f->dump_string("local_cluster_admin_socket", + reinterpret_cast<CephContext *>(m_local_io_ctx.cct())->_conf. + get_val<std::string>("admin_socket")); + f->dump_string("remote_cluster_admin_socket", + reinterpret_cast<CephContext *>(m_remote_io_ctx.cct())->_conf. 
+ get_val<std::string>("admin_socket")); + + f->open_object_section("sync_throttler"); + m_instance_watcher->print_sync_status(f, ss); + f->close_section(); + + m_instance_replayer->print_status(f, ss); + + if (m_image_deleter) { + f->open_object_section("image_deleter"); + m_image_deleter->print_status(f, ss); + f->close_section(); + } + + f->close_section(); + f->flush(*ss); +} + +template <typename I> +void PoolReplayer<I>::start() +{ + dout(20) << "enter" << dendl; + + Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + m_manual_stop = false; + + if (m_instance_replayer) { + m_instance_replayer->start(); + } +} + +template <typename I> +void PoolReplayer<I>::stop(bool manual) +{ + dout(20) << "enter: manual=" << manual << dendl; + + Mutex::Locker l(m_lock); + if (!manual) { + m_stopping = true; + m_cond.Signal(); + return; + } else if (m_stopping) { + return; + } + + m_manual_stop = true; + + if (m_instance_replayer) { + m_instance_replayer->stop(); + } +} + +template <typename I> +void PoolReplayer<I>::restart() +{ + dout(20) << "enter" << dendl; + + Mutex::Locker l(m_lock); + + if (m_stopping) { + return; + } + + if (m_instance_replayer) { + m_instance_replayer->restart(); + } +} + +template <typename I> +void PoolReplayer<I>::flush() +{ + dout(20) << "enter" << dendl; + + Mutex::Locker l(m_lock); + + if (m_stopping || m_manual_stop) { + return; + } + + if (m_instance_replayer) { + m_instance_replayer->flush(); + } +} + +template <typename I> +void PoolReplayer<I>::release_leader() +{ + dout(20) << "enter" << dendl; + + Mutex::Locker l(m_lock); + + if (m_stopping || !m_leader_watcher) { + return; + } + + m_leader_watcher->release_leader(); +} + +template <typename I> +void PoolReplayer<I>::handle_update(const std::string &mirror_uuid, + ImageIds &&added_image_ids, + ImageIds &&removed_image_ids) { + if (m_stopping) { + return; + } + + dout(10) << "mirror_uuid=" << mirror_uuid << ", " + << "added_count=" << added_image_ids.size() << ", " + << 
"removed_count=" << removed_image_ids.size() << dendl; + Mutex::Locker locker(m_lock); + if (!m_leader_watcher->is_leader()) { + return; + } + + m_service_daemon->add_or_update_attribute( + m_local_pool_id, SERVICE_DAEMON_LOCAL_COUNT_KEY, + m_local_pool_watcher->get_image_count()); + if (m_remote_pool_watcher) { + m_service_daemon->add_or_update_attribute( + m_local_pool_id, SERVICE_DAEMON_REMOTE_COUNT_KEY, + m_remote_pool_watcher->get_image_count()); + } + + std::set<std::string> added_global_image_ids; + for (auto& image_id : added_image_ids) { + added_global_image_ids.insert(image_id.global_id); + } + + std::set<std::string> removed_global_image_ids; + for (auto& image_id : removed_image_ids) { + removed_global_image_ids.insert(image_id.global_id); + } + + m_image_map->update_images(mirror_uuid, + std::move(added_global_image_ids), + std::move(removed_global_image_ids)); +} + +template <typename I> +void PoolReplayer<I>::handle_post_acquire_leader(Context *on_finish) { + dout(10) << dendl; + + m_service_daemon->add_or_update_attribute(m_local_pool_id, + SERVICE_DAEMON_LEADER_KEY, true); + m_instance_watcher->handle_acquire_leader(); + init_image_map(on_finish); +} + +template <typename I> +void PoolReplayer<I>::handle_pre_release_leader(Context *on_finish) { + dout(10) << dendl; + + m_service_daemon->remove_attribute(m_local_pool_id, + SERVICE_DAEMON_LEADER_KEY); + m_instance_watcher->handle_release_leader(); + shut_down_image_deleter(on_finish); +} + +template <typename I> +void PoolReplayer<I>::init_image_map(Context *on_finish) { + dout(5) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(!m_image_map); + m_image_map.reset(ImageMap<I>::create(m_local_io_ctx, m_threads, + m_instance_watcher->get_instance_id(), + m_image_map_listener)); + + auto ctx = new FunctionContext([this, on_finish](int r) { + handle_init_image_map(r, on_finish); + }); + m_image_map->init(create_async_context_callback( + m_threads->work_queue, ctx)); +} + +template <typename I> 
+void PoolReplayer<I>::handle_init_image_map(int r, Context *on_finish) { + dout(5) << "r=" << r << dendl; + if (r < 0) { + derr << "failed to init image map: " << cpp_strerror(r) << dendl; + on_finish = new FunctionContext([on_finish, r](int) { + on_finish->complete(r); + }); + shut_down_image_map(on_finish); + return; + } + + init_local_pool_watcher(on_finish); +} + +template <typename I> +void PoolReplayer<I>::init_local_pool_watcher(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(!m_local_pool_watcher); + m_local_pool_watcher.reset(PoolWatcher<I>::create( + m_threads, m_local_io_ctx, m_local_pool_watcher_listener)); + + // ensure the initial set of local images is up-to-date + // after acquiring the leader role + auto ctx = new FunctionContext([this, on_finish](int r) { + handle_init_local_pool_watcher(r, on_finish); + }); + m_local_pool_watcher->init(create_async_context_callback( + m_threads->work_queue, ctx)); +} + +template <typename I> +void PoolReplayer<I>::handle_init_local_pool_watcher( + int r, Context *on_finish) { + dout(10) << "r=" << r << dendl; + if (r < 0) { + derr << "failed to retrieve local images: " << cpp_strerror(r) << dendl; + on_finish = new FunctionContext([on_finish, r](int) { + on_finish->complete(r); + }); + shut_down_pool_watchers(on_finish); + return; + } + + init_remote_pool_watcher(on_finish); +} + +template <typename I> +void PoolReplayer<I>::init_remote_pool_watcher(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(!m_remote_pool_watcher); + m_remote_pool_watcher.reset(PoolWatcher<I>::create( + m_threads, m_remote_io_ctx, m_remote_pool_watcher_listener)); + + auto ctx = new FunctionContext([this, on_finish](int r) { + handle_init_remote_pool_watcher(r, on_finish); + }); + m_remote_pool_watcher->init(create_async_context_callback( + m_threads->work_queue, ctx)); +} + +template <typename I> +void PoolReplayer<I>::handle_init_remote_pool_watcher( + 
int r, Context *on_finish) { + dout(10) << "r=" << r << dendl; + if (r == -ENOENT) { + // Technically nothing to do since the other side doesn't + // have mirroring enabled. Eventually the remote pool watcher will + // detect images (if mirroring is enabled), so no point propagating + // an error which would just busy-spin the state machines. + dout(0) << "remote peer does not have mirroring configured" << dendl; + } else if (r < 0) { + derr << "failed to retrieve remote images: " << cpp_strerror(r) << dendl; + on_finish = new FunctionContext([on_finish, r](int) { + on_finish->complete(r); + }); + shut_down_pool_watchers(on_finish); + return; + } + + init_image_deleter(on_finish); +} + +template <typename I> +void PoolReplayer<I>::init_image_deleter(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + ceph_assert(!m_image_deleter); + + on_finish = new FunctionContext([this, on_finish](int r) { + handle_init_image_deleter(r, on_finish); + }); + m_image_deleter.reset(ImageDeleter<I>::create(m_local_io_ctx, m_threads, + m_service_daemon)); + m_image_deleter->init(create_async_context_callback( + m_threads->work_queue, on_finish)); +} + +template <typename I> +void PoolReplayer<I>::handle_init_image_deleter(int r, Context *on_finish) { + dout(10) << "r=" << r << dendl; + if (r < 0) { + derr << "failed to init image deleter: " << cpp_strerror(r) << dendl; + on_finish = new FunctionContext([on_finish, r](int) { + on_finish->complete(r); + }); + shut_down_image_deleter(on_finish); + return; + } + + on_finish->complete(0); + + Mutex::Locker locker(m_lock); + m_cond.Signal(); +} + +template <typename I> +void PoolReplayer<I>::shut_down_image_deleter(Context* on_finish) { + dout(10) << dendl; + { + Mutex::Locker locker(m_lock); + if (m_image_deleter) { + Context *ctx = new FunctionContext([this, on_finish](int r) { + handle_shut_down_image_deleter(r, on_finish); + }); + ctx = create_async_context_callback(m_threads->work_queue, ctx); + + 
m_image_deleter->shut_down(ctx); + return; + } + } + shut_down_pool_watchers(on_finish); +} + +template <typename I> +void PoolReplayer<I>::handle_shut_down_image_deleter( + int r, Context* on_finish) { + dout(10) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_image_deleter); + m_image_deleter.reset(); + } + + shut_down_pool_watchers(on_finish); +} + +template <typename I> +void PoolReplayer<I>::shut_down_pool_watchers(Context *on_finish) { + dout(10) << dendl; + + { + Mutex::Locker locker(m_lock); + if (m_local_pool_watcher) { + Context *ctx = new FunctionContext([this, on_finish](int r) { + handle_shut_down_pool_watchers(r, on_finish); + }); + ctx = create_async_context_callback(m_threads->work_queue, ctx); + + auto gather_ctx = new C_Gather(g_ceph_context, ctx); + m_local_pool_watcher->shut_down(gather_ctx->new_sub()); + if (m_remote_pool_watcher) { + m_remote_pool_watcher->shut_down(gather_ctx->new_sub()); + } + gather_ctx->activate(); + return; + } + } + + on_finish->complete(0); +} + +template <typename I> +void PoolReplayer<I>::handle_shut_down_pool_watchers( + int r, Context *on_finish) { + dout(10) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_local_pool_watcher); + m_local_pool_watcher.reset(); + + if (m_remote_pool_watcher) { + m_remote_pool_watcher.reset(); + } + } + wait_for_update_ops(on_finish); +} + +template <typename I> +void PoolReplayer<I>::wait_for_update_ops(Context *on_finish) { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + + Context *ctx = new FunctionContext([this, on_finish](int r) { + handle_wait_for_update_ops(r, on_finish); + }); + ctx = create_async_context_callback(m_threads->work_queue, ctx); + + m_update_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void PoolReplayer<I>::handle_wait_for_update_ops(int r, Context *on_finish) { + dout(10) << "r=" << r << dendl; + ceph_assert(r == 0); + + shut_down_image_map(on_finish); +} + +template <typename I> 
+void PoolReplayer<I>::shut_down_image_map(Context *on_finish) { + dout(5) << dendl; + + { + Mutex::Locker locker(m_lock); + if (m_image_map) { + on_finish = new FunctionContext([this, on_finish](int r) { + handle_shut_down_image_map(r, on_finish); + }); + m_image_map->shut_down(create_async_context_callback( + m_threads->work_queue, on_finish)); + return; + } + } + + on_finish->complete(0); +} + +template <typename I> +void PoolReplayer<I>::handle_shut_down_image_map(int r, Context *on_finish) { + dout(5) << "r=" << r << dendl; + if (r < 0 && r != -EBLACKLISTED) { + derr << "failed to shut down image map: " << cpp_strerror(r) << dendl; + } + + Mutex::Locker locker(m_lock); + ceph_assert(m_image_map); + m_image_map.reset(); + + m_instance_replayer->release_all(on_finish); +} + +template <typename I> +void PoolReplayer<I>::handle_update_leader( + const std::string &leader_instance_id) { + dout(10) << "leader_instance_id=" << leader_instance_id << dendl; + + m_instance_watcher->handle_update_leader(leader_instance_id); +} + +template <typename I> +void PoolReplayer<I>::handle_acquire_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) { + dout(5) << "global_image_id=" << global_image_id << ", " + << "instance_id=" << instance_id << dendl; + + m_instance_watcher->notify_image_acquire(instance_id, global_image_id, + on_finish); +} + +template <typename I> +void PoolReplayer<I>::handle_release_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) { + dout(5) << "global_image_id=" << global_image_id << ", " + << "instance_id=" << instance_id << dendl; + + m_instance_watcher->notify_image_release(instance_id, global_image_id, + on_finish); +} + +template <typename I> +void PoolReplayer<I>::handle_remove_image(const std::string &mirror_uuid, + const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) { + ceph_assert(!mirror_uuid.empty()); + dout(5) << 
"mirror_uuid=" << mirror_uuid << ", " + << "global_image_id=" << global_image_id << ", " + << "instance_id=" << instance_id << dendl; + + m_instance_watcher->notify_peer_image_removed(instance_id, global_image_id, + mirror_uuid, on_finish); +} + +template <typename I> +void PoolReplayer<I>::handle_instances_added(const InstanceIds &instance_ids) { + dout(5) << "instance_ids=" << instance_ids << dendl; + Mutex::Locker locker(m_lock); + if (!m_leader_watcher->is_leader()) { + return; + } + + ceph_assert(m_image_map); + m_image_map->update_instances_added(instance_ids); +} + +template <typename I> +void PoolReplayer<I>::handle_instances_removed( + const InstanceIds &instance_ids) { + dout(5) << "instance_ids=" << instance_ids << dendl; + Mutex::Locker locker(m_lock); + if (!m_leader_watcher->is_leader()) { + return; + } + + ceph_assert(m_image_map); + m_image_map->update_instances_removed(instance_ids); +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::PoolReplayer<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/PoolReplayer.h b/src/tools/rbd_mirror/PoolReplayer.h new file mode 100644 index 00000000..43a4a0fc --- /dev/null +++ b/src/tools/rbd_mirror/PoolReplayer.h @@ -0,0 +1,303 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_POOL_REPLAYER_H +#define CEPH_RBD_MIRROR_POOL_REPLAYER_H + +#include "common/AsyncOpTracker.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/WorkQueue.h" +#include "include/rados/librados.hpp" + +#include "ClusterWatcher.h" +#include "LeaderWatcher.h" +#include "PoolWatcher.h" +#include "ImageDeleter.h" +#include "tools/rbd_mirror/Types.h" +#include "tools/rbd_mirror/image_map/Types.h" +#include "tools/rbd_mirror/leader_watcher/Types.h" +#include "tools/rbd_mirror/pool_watcher/Types.h" +#include "tools/rbd_mirror/service_daemon/Types.h" + +#include <set> +#include <map> +#include <memory> +#include <atomic> 
+#include <string> +#include <vector> + +class AdminSocketHook; + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> class ImageMap; +template <typename> class InstanceReplayer; +template <typename> class InstanceWatcher; +template <typename> class ServiceDaemon; +template <typename> struct Threads; + +/** + * Controls mirroring for a single remote cluster. + */ +template <typename ImageCtxT = librbd::ImageCtx> +class PoolReplayer { +public: + PoolReplayer(Threads<ImageCtxT> *threads, + ServiceDaemon<ImageCtxT>* service_daemon, + int64_t local_pool_id, const PeerSpec &peer, + const std::vector<const char*> &args); + ~PoolReplayer(); + PoolReplayer(const PoolReplayer&) = delete; + PoolReplayer& operator=(const PoolReplayer&) = delete; + + bool is_blacklisted() const; + bool is_leader() const; + bool is_running() const; + + void init(); + void shut_down(); + + void run(); + + void print_status(Formatter *f, stringstream *ss); + void start(); + void stop(bool manual); + void restart(); + void flush(); + void release_leader(); + void reopen_logs(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * INIT + * | + * v + * <follower> <-------------------------\ + * . | + * . | + * v (leader acquired) | + * INIT_IMAGE_MAP SHUT_DOWN_IMAGE_MAP + * | ^ + * v | + * INIT_LOCAL_POOL_WATCHER WAIT_FOR_NOTIFICATIONS + * | ^ + * v | + * INIT_REMOTE_POOL_WATCHER SHUT_DOWN_POOL_WATCHERS + * | ^ + * v | + * INIT_IMAGE_DELETER SHUT_DOWN_IMAGE_DELETER + * | ^ + * v . + * <leader> <-----------\ . + * . | . + * . (image update) | . + * . . > NOTIFY_INSTANCE_WATCHER . + * . . + * . (leader lost / shut down) . + * . . . . . . . . . . . . . . . . . . 
+ * + * @endverbatim + */ + + typedef std::vector<std::string> InstanceIds; + + struct PoolWatcherListener : public pool_watcher::Listener { + PoolReplayer *pool_replayer; + bool local; + + PoolWatcherListener(PoolReplayer *pool_replayer, bool local) + : pool_replayer(pool_replayer), local(local) { + } + + void handle_update(const std::string &mirror_uuid, + ImageIds &&added_image_ids, + ImageIds &&removed_image_ids) override { + pool_replayer->handle_update((local ? "" : mirror_uuid), + std::move(added_image_ids), + std::move(removed_image_ids)); + } + }; + + struct ImageMapListener : public image_map::Listener { + PoolReplayer *pool_replayer; + + ImageMapListener(PoolReplayer *pool_replayer) + : pool_replayer(pool_replayer) { + } + + void acquire_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) override { + pool_replayer->handle_acquire_image(global_image_id, instance_id, + on_finish); + } + + void release_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) override { + pool_replayer->handle_release_image(global_image_id, instance_id, + on_finish); + } + + void remove_image(const std::string &mirror_uuid, + const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) override { + pool_replayer->handle_remove_image(mirror_uuid, global_image_id, + instance_id, on_finish); + } + }; + + void handle_update(const std::string &mirror_uuid, + ImageIds &&added_image_ids, + ImageIds &&removed_image_ids); + + int init_rados(const std::string &cluster_name, + const std::string &client_name, + const std::string &mon_host, + const std::string &key, + const std::string &description, RadosRef *rados_ref, + bool strip_cluster_overrides); + + void handle_post_acquire_leader(Context *on_finish); + void handle_pre_release_leader(Context *on_finish); + + void init_image_map(Context *on_finish); + void handle_init_image_map(int r, Context *on_finish); + + void 
init_local_pool_watcher(Context *on_finish); + void handle_init_local_pool_watcher(int r, Context *on_finish); + + void init_remote_pool_watcher(Context *on_finish); + void handle_init_remote_pool_watcher(int r, Context *on_finish); + + void init_image_deleter(Context* on_finish); + void handle_init_image_deleter(int r, Context* on_finish); + + void shut_down_image_deleter(Context* on_finish); + void handle_shut_down_image_deleter(int r, Context* on_finish); + + void shut_down_pool_watchers(Context *on_finish); + void handle_shut_down_pool_watchers(int r, Context *on_finish); + + void wait_for_update_ops(Context *on_finish); + void handle_wait_for_update_ops(int r, Context *on_finish); + + void shut_down_image_map(Context *on_finish); + void handle_shut_down_image_map(int r, Context *on_finish); + + void handle_update_leader(const std::string &leader_instance_id); + + void handle_acquire_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish); + void handle_release_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish); + void handle_remove_image(const std::string &mirror_uuid, + const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish); + + void handle_instances_added(const InstanceIds &instance_ids); + void handle_instances_removed(const InstanceIds &instance_ids); + + Threads<ImageCtxT> *m_threads; + ServiceDaemon<ImageCtxT>* m_service_daemon; + int64_t m_local_pool_id = -1; + PeerSpec m_peer; + std::vector<const char*> m_args; + + mutable Mutex m_lock; + Cond m_cond; + std::atomic<bool> m_stopping = { false }; + bool m_manual_stop = false; + bool m_blacklisted = false; + + RadosRef m_local_rados; + RadosRef m_remote_rados; + + librados::IoCtx m_local_io_ctx; + librados::IoCtx m_remote_io_ctx; + + PoolWatcherListener m_local_pool_watcher_listener; + std::unique_ptr<PoolWatcher<ImageCtxT>> m_local_pool_watcher; + + PoolWatcherListener 
m_remote_pool_watcher_listener; + std::unique_ptr<PoolWatcher<ImageCtxT>> m_remote_pool_watcher; + + std::unique_ptr<InstanceReplayer<ImageCtxT>> m_instance_replayer; + std::unique_ptr<ImageDeleter<ImageCtxT>> m_image_deleter; + + ImageMapListener m_image_map_listener; + std::unique_ptr<ImageMap<ImageCtxT>> m_image_map; + + std::string m_asok_hook_name; + AdminSocketHook *m_asok_hook = nullptr; + + service_daemon::CalloutId m_callout_id = service_daemon::CALLOUT_ID_NONE; + + class PoolReplayerThread : public Thread { + PoolReplayer *m_pool_replayer; + public: + PoolReplayerThread(PoolReplayer *pool_replayer) + : m_pool_replayer(pool_replayer) { + } + void *entry() override { + m_pool_replayer->run(); + return 0; + } + } m_pool_replayer_thread; + + class LeaderListener : public leader_watcher::Listener { + public: + LeaderListener(PoolReplayer *pool_replayer) + : m_pool_replayer(pool_replayer) { + } + + protected: + void post_acquire_handler(Context *on_finish) override { + m_pool_replayer->handle_post_acquire_leader(on_finish); + } + + void pre_release_handler(Context *on_finish) override { + m_pool_replayer->handle_pre_release_leader(on_finish); + } + + void update_leader_handler( + const std::string &leader_instance_id) override { + m_pool_replayer->handle_update_leader(leader_instance_id); + } + + void handle_instances_added(const InstanceIds& instance_ids) override { + m_pool_replayer->handle_instances_added(instance_ids); + } + + void handle_instances_removed(const InstanceIds& instance_ids) override { + m_pool_replayer->handle_instances_removed(instance_ids); + } + + private: + PoolReplayer *m_pool_replayer; + } m_leader_listener; + + std::unique_ptr<LeaderWatcher<ImageCtxT>> m_leader_watcher; + std::unique_ptr<InstanceWatcher<ImageCtxT>> m_instance_watcher; + AsyncOpTracker m_update_op_tracker; +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::PoolReplayer<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_POOL_REPLAYER_H 
diff --git a/src/tools/rbd_mirror/PoolWatcher.cc b/src/tools/rbd_mirror/PoolWatcher.cc new file mode 100644 index 00000000..81810ea1 --- /dev/null +++ b/src/tools/rbd_mirror/PoolWatcher.cc @@ -0,0 +1,553 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/PoolWatcher.h" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/Utils.h" +#include "librbd/api/Image.h" +#include "librbd/api/Mirror.h" +#include "tools/rbd_mirror/Threads.h" +#include "tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h" +#include <boost/bind.hpp> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::PoolWatcher: " << this << " " \ + << __func__ << ": " + +using std::list; +using std::string; +using std::unique_ptr; +using std::vector; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +namespace rbd { +namespace mirror { + +template <typename I> +class PoolWatcher<I>::MirroringWatcher : public librbd::MirroringWatcher<I> { +public: + using ContextWQ = typename std::decay< + typename std::remove_pointer< + decltype(Threads<I>::work_queue)>::type>::type; + + MirroringWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue, + PoolWatcher *pool_watcher) + : librbd::MirroringWatcher<I>(io_ctx, work_queue), + m_pool_watcher(pool_watcher) { + } + + void handle_rewatch_complete(int r) override { + m_pool_watcher->handle_rewatch_complete(r); + } + + void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) override { + // invalidate all image state and refresh the pool contents + m_pool_watcher->schedule_refresh_images(5); + } + + void 
handle_image_updated(cls::rbd::MirrorImageState state, + const std::string &remote_image_id, + const std::string &global_image_id) override { + bool enabled = (state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED); + m_pool_watcher->handle_image_updated(remote_image_id, global_image_id, + enabled); + } + +private: + PoolWatcher *m_pool_watcher; +}; + +template <typename I> +PoolWatcher<I>::PoolWatcher(Threads<I> *threads, librados::IoCtx &remote_io_ctx, + pool_watcher::Listener &listener) + : m_threads(threads), m_remote_io_ctx(remote_io_ctx), m_listener(listener), + m_lock(librbd::util::unique_lock_name("rbd::mirror::PoolWatcher", this)) { + m_mirroring_watcher = new MirroringWatcher(m_remote_io_ctx, + m_threads->work_queue, this); +} + +template <typename I> +PoolWatcher<I>::~PoolWatcher() { + delete m_mirroring_watcher; +} + +template <typename I> +bool PoolWatcher<I>::is_blacklisted() const { + Mutex::Locker locker(m_lock); + return m_blacklisted; +} + +template <typename I> +void PoolWatcher<I>::init(Context *on_finish) { + dout(5) << dendl; + + { + Mutex::Locker locker(m_lock); + m_on_init_finish = on_finish; + + ceph_assert(!m_refresh_in_progress); + m_refresh_in_progress = true; + } + + // start async updates for mirror image directory + register_watcher(); +} + +template <typename I> +void PoolWatcher<I>::shut_down(Context *on_finish) { + dout(5) << dendl; + + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + ceph_assert(!m_shutting_down); + m_shutting_down = true; + if (m_timer_ctx != nullptr) { + m_threads->timer->cancel_event(m_timer_ctx); + m_timer_ctx = nullptr; + } + } + + // in-progress unregister tracked as async op + unregister_watcher(); + + m_async_op_tracker.wait_for_ops(on_finish); +} + +template <typename I> +void PoolWatcher<I>::register_watcher() { + { + Mutex::Locker locker(m_lock); + ceph_assert(m_image_ids_invalid); + ceph_assert(m_refresh_in_progress); + } + + // if the watch registration is in-flight, 
let the watcher + // handle the transition -- only (re-)register if it's not registered + if (!m_mirroring_watcher->is_unregistered()) { + refresh_images(); + return; + } + + // first time registering or the watch failed + dout(5) << dendl; + m_async_op_tracker.start_op(); + + Context *ctx = create_context_callback< + PoolWatcher, &PoolWatcher<I>::handle_register_watcher>(this); + m_mirroring_watcher->register_watch(ctx); +} + +template <typename I> +void PoolWatcher<I>::handle_register_watcher(int r) { + dout(5) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_image_ids_invalid); + ceph_assert(m_refresh_in_progress); + if (r < 0) { + m_refresh_in_progress = false; + } + } + + Context *on_init_finish = nullptr; + if (r >= 0) { + refresh_images(); + } else if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted" << dendl; + + Mutex::Locker locker(m_lock); + m_blacklisted = true; + std::swap(on_init_finish, m_on_init_finish); + } else if (r == -ENOENT) { + dout(5) << "mirroring directory does not exist" << dendl; + { + Mutex::Locker locker(m_lock); + std::swap(on_init_finish, m_on_init_finish); + } + + schedule_refresh_images(30); + } else { + derr << "unexpected error registering mirroring directory watch: " + << cpp_strerror(r) << dendl; + schedule_refresh_images(10); + } + + m_async_op_tracker.finish_op(); + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + } +} + +template <typename I> +void PoolWatcher<I>::unregister_watcher() { + dout(5) << dendl; + + m_async_op_tracker.start_op(); + Context *ctx = new FunctionContext([this](int r) { + dout(5) << "unregister_watcher: r=" << r << dendl; + if (r < 0) { + derr << "error unregistering watcher for " + << m_mirroring_watcher->get_oid() << " object: " << cpp_strerror(r) + << dendl; + } + m_async_op_tracker.finish_op(); + }); + + m_mirroring_watcher->unregister_watch(ctx); +} + +template <typename I> +void PoolWatcher<I>::refresh_images() { + dout(5) << dendl; 
+ + { + Mutex::Locker locker(m_lock); + ceph_assert(m_image_ids_invalid); + ceph_assert(m_refresh_in_progress); + + // clear all pending notification events since we need to perform + // a full image list refresh + m_pending_added_image_ids.clear(); + m_pending_removed_image_ids.clear(); + } + + m_async_op_tracker.start_op(); + m_refresh_image_ids.clear(); + Context *ctx = create_context_callback< + PoolWatcher, &PoolWatcher<I>::handle_refresh_images>(this); + auto req = pool_watcher::RefreshImagesRequest<I>::create(m_remote_io_ctx, + &m_refresh_image_ids, + ctx); + req->send(); +} + +template <typename I> +void PoolWatcher<I>::handle_refresh_images(int r) { + dout(5) << "r=" << r << dendl; + + bool retry_refresh = false; + Context *on_init_finish = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_image_ids_invalid); + ceph_assert(m_refresh_in_progress); + + if (r >= 0) { + m_pending_image_ids = std::move(m_refresh_image_ids); + } else if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted during image refresh" << dendl; + + m_blacklisted = true; + m_refresh_in_progress = false; + std::swap(on_init_finish, m_on_init_finish); + } else if (r == -ENOENT) { + dout(5) << "mirroring directory not found" << dendl; + m_pending_image_ids.clear(); + r = 0; + } else { + m_refresh_in_progress = false; + retry_refresh = true; + } + } + + if (retry_refresh) { + derr << "failed to retrieve mirroring directory: " << cpp_strerror(r) + << dendl; + schedule_refresh_images(10); + } else if (r >= 0) { + get_mirror_uuid(); + return; + } + + m_async_op_tracker.finish_op(); + if (on_init_finish != nullptr) { + ceph_assert(r == -EBLACKLISTED); + on_init_finish->complete(r); + } +} + +template <typename I> +void PoolWatcher<I>::get_mirror_uuid() { + dout(5) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_uuid_get_start(&op); + + m_out_bl.clear(); + librados::AioCompletion *aio_comp = create_rados_callback< + PoolWatcher, 
&PoolWatcher<I>::handle_get_mirror_uuid>(this); + int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void PoolWatcher<I>::handle_get_mirror_uuid(int r) { + dout(5) << "r=" << r << dendl; + + bool deferred_refresh = false; + bool retry_refresh = false; + Context *on_init_finish = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_image_ids_invalid); + ceph_assert(m_refresh_in_progress); + m_refresh_in_progress = false; + + m_pending_mirror_uuid = ""; + if (r >= 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_uuid_get_finish( + &it, &m_pending_mirror_uuid); + } + if (r >= 0 && m_pending_mirror_uuid.empty()) { + r = -ENOENT; + } + + if (m_deferred_refresh) { + // need to refresh -- skip the notification + deferred_refresh = true; + } else if (r >= 0) { + dout(10) << "mirror_uuid=" << m_pending_mirror_uuid << dendl; + m_image_ids_invalid = false; + std::swap(on_init_finish, m_on_init_finish); + schedule_listener(); + } else if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted during image refresh" << dendl; + + m_blacklisted = true; + std::swap(on_init_finish, m_on_init_finish); + } else if (r == -ENOENT) { + dout(5) << "mirroring uuid not found" << dendl; + std::swap(on_init_finish, m_on_init_finish); + retry_refresh = true; + } else { + retry_refresh = true; + } + } + + if (deferred_refresh) { + dout(5) << "scheduling deferred refresh" << dendl; + schedule_refresh_images(0); + } else if (retry_refresh) { + derr << "failed to retrieve mirror uuid: " << cpp_strerror(r) + << dendl; + schedule_refresh_images(10); + } + + m_async_op_tracker.finish_op(); + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + } +} + +template <typename I> +void PoolWatcher<I>::schedule_refresh_images(double interval) { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + if (m_shutting_down || 
m_refresh_in_progress || m_timer_ctx != nullptr) { + if (m_refresh_in_progress && !m_deferred_refresh) { + dout(5) << "deferring refresh until in-flight refresh completes" << dendl; + m_deferred_refresh = true; + } + return; + } + + m_image_ids_invalid = true; + m_timer_ctx = m_threads->timer->add_event_after( + interval, + new FunctionContext([this](int r) { + process_refresh_images(); + })); +} + +template <typename I> +void PoolWatcher<I>::handle_rewatch_complete(int r) { + dout(5) << "r=" << r << dendl; + + if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted" << dendl; + + Mutex::Locker locker(m_lock); + m_blacklisted = true; + return; + } else if (r == -ENOENT) { + dout(5) << "mirroring directory deleted" << dendl; + } else if (r < 0) { + derr << "unexpected error re-registering mirroring directory watch: " + << cpp_strerror(r) << dendl; + } + + schedule_refresh_images(5); +} + +template <typename I> +void PoolWatcher<I>::handle_image_updated(const std::string &remote_image_id, + const std::string &global_image_id, + bool enabled) { + dout(10) << "remote_image_id=" << remote_image_id << ", " + << "global_image_id=" << global_image_id << ", " + << "enabled=" << enabled << dendl; + + Mutex::Locker locker(m_lock); + ImageId image_id(global_image_id, remote_image_id); + m_pending_added_image_ids.erase(image_id); + m_pending_removed_image_ids.erase(image_id); + + if (enabled) { + m_pending_added_image_ids.insert(image_id); + schedule_listener(); + } else { + m_pending_removed_image_ids.insert(image_id); + schedule_listener(); + } +} + +template <typename I> +void PoolWatcher<I>::process_refresh_images() { + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_timer_ctx != nullptr); + m_timer_ctx = nullptr; + + { + Mutex::Locker locker(m_lock); + ceph_assert(!m_refresh_in_progress); + m_refresh_in_progress = true; + m_deferred_refresh = false; + } + + // execute outside of the timer's lock + m_async_op_tracker.start_op(); + Context *ctx 
= new FunctionContext([this](int r) { + register_watcher(); + m_async_op_tracker.finish_op(); + }); + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void PoolWatcher<I>::schedule_listener() { + ceph_assert(m_lock.is_locked()); + m_pending_updates = true; + if (m_shutting_down || m_image_ids_invalid || m_notify_listener_in_progress) { + return; + } + + dout(20) << dendl; + + m_async_op_tracker.start_op(); + Context *ctx = new FunctionContext([this](int r) { + notify_listener(); + m_async_op_tracker.finish_op(); + }); + + m_notify_listener_in_progress = true; + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void PoolWatcher<I>::notify_listener() { + dout(10) << dendl; + + std::string mirror_uuid; + ImageIds added_image_ids; + ImageIds removed_image_ids; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_notify_listener_in_progress); + + // if the mirror uuid is updated, treat it as the removal of all + // images in the pool + if (m_mirror_uuid != m_pending_mirror_uuid) { + if (!m_mirror_uuid.empty()) { + dout(0) << "mirror uuid updated:" + << "old=" << m_mirror_uuid << ", " + << "new=" << m_pending_mirror_uuid << dendl; + } + + mirror_uuid = m_mirror_uuid; + removed_image_ids = std::move(m_image_ids); + m_image_ids.clear(); + } + } + + if (!removed_image_ids.empty()) { + m_listener.handle_update(mirror_uuid, {}, std::move(removed_image_ids)); + removed_image_ids.clear(); + } + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_notify_listener_in_progress); + + // if the watch failed while we didn't own the lock, we are going + // to need to perform a full refresh + if (m_image_ids_invalid) { + m_notify_listener_in_progress = false; + return; + } + + // merge add/remove notifications into pending set (a given image + // can only be in one set or another) + for (auto &image_id : m_pending_removed_image_ids) { + dout(20) << "image_id=" << image_id << dendl; + m_pending_image_ids.erase(image_id); + } + + for (auto &image_id : 
m_pending_added_image_ids) { + dout(20) << "image_id=" << image_id << dendl; + m_pending_image_ids.erase(image_id); + m_pending_image_ids.insert(image_id); + } + m_pending_added_image_ids.clear(); + + // compute added/removed images + for (auto &image_id : m_image_ids) { + auto it = m_pending_image_ids.find(image_id); + if (it == m_pending_image_ids.end() || it->id != image_id.id) { + removed_image_ids.insert(image_id); + } + } + for (auto &image_id : m_pending_image_ids) { + auto it = m_image_ids.find(image_id); + if (it == m_image_ids.end() || it->id != image_id.id) { + added_image_ids.insert(image_id); + } + } + + m_pending_updates = false; + m_image_ids = m_pending_image_ids; + + m_mirror_uuid = m_pending_mirror_uuid; + mirror_uuid = m_mirror_uuid; + } + + m_listener.handle_update(mirror_uuid, std::move(added_image_ids), + std::move(removed_image_ids)); + + { + Mutex::Locker locker(m_lock); + m_notify_listener_in_progress = false; + if (m_pending_updates) { + schedule_listener(); + } + } +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::PoolWatcher<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/PoolWatcher.h b/src/tools/rbd_mirror/PoolWatcher.h new file mode 100644 index 00000000..1136a319 --- /dev/null +++ b/src/tools/rbd_mirror/PoolWatcher.h @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_H +#define CEPH_RBD_MIRROR_POOL_WATCHER_H + +#include <map> +#include <memory> +#include <set> +#include <string> + +#include "common/AsyncOpTracker.h" +#include "common/ceph_context.h" +#include "common/Mutex.h" +#include "include/rados/librados.hpp" +#include "tools/rbd_mirror/Types.h" +#include <boost/functional/hash.hpp> +#include <boost/optional.hpp> +#include "include/ceph_assert.h" +#include "tools/rbd_mirror/pool_watcher/Types.h" + +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template 
<typename> struct Threads; + +/** + * Keeps track of images that have mirroring enabled within all + * pools. + */ +template <typename ImageCtxT = librbd::ImageCtx> +class PoolWatcher { +public: + static PoolWatcher* create(Threads<ImageCtxT> *threads, + librados::IoCtx &remote_io_ctx, + pool_watcher::Listener &listener) { + return new PoolWatcher(threads, remote_io_ctx, listener); + } + + PoolWatcher(Threads<ImageCtxT> *threads, librados::IoCtx &remote_io_ctx, + pool_watcher::Listener &listener); + ~PoolWatcher(); + PoolWatcher(const PoolWatcher&) = delete; + PoolWatcher& operator=(const PoolWatcher&) = delete; + + bool is_blacklisted() const; + + void init(Context *on_finish = nullptr); + void shut_down(Context *on_finish); + + inline uint64_t get_image_count() const { + Mutex::Locker locker(m_lock); + return m_image_ids.size(); + } + +private: + /** + * @verbatim + * + * <start> + * | + * v + * INIT + * | + * v + * REGISTER_WATCHER + * | + * |/--------------------------------\ + * | | + * v | + * REFRESH_IMAGES | + * | | + * |/----------------------------\ | + * | | | + * v | | + * GET_MIRROR_UUID | | + * | | | + * v | | + * NOTIFY_LISTENER | | + * | | | + * v | | + * IDLE ---\ | | + * | | | | + * | |\---> IMAGE_UPDATED | | + * | | | | | + * | | v | | + * | | GET_IMAGE_NAME --/ | + * | | | + * | \----> WATCH_ERROR ---------/ + * v + * SHUT_DOWN + * | + * v + * UNREGISTER_WATCHER + * | + * v + * <finish> + * + * @endverbatim + */ + class MirroringWatcher; + + Threads<ImageCtxT> *m_threads; + librados::IoCtx m_remote_io_ctx; + pool_watcher::Listener &m_listener; + + ImageIds m_refresh_image_ids; + bufferlist m_out_bl; + + mutable Mutex m_lock; + + Context *m_on_init_finish = nullptr; + + ImageIds m_image_ids; + std::string m_mirror_uuid; + + bool m_pending_updates = false; + bool m_notify_listener_in_progress = false; + ImageIds m_pending_image_ids; + ImageIds m_pending_added_image_ids; + ImageIds m_pending_removed_image_ids; + + std::string m_pending_mirror_uuid; 
+ + MirroringWatcher *m_mirroring_watcher; + + Context *m_timer_ctx = nullptr; + + AsyncOpTracker m_async_op_tracker; + bool m_blacklisted = false; + bool m_shutting_down = false; + bool m_image_ids_invalid = true; + bool m_refresh_in_progress = false; + bool m_deferred_refresh = false; + + void register_watcher(); + void handle_register_watcher(int r); + void unregister_watcher(); + + void refresh_images(); + void handle_refresh_images(int r); + + void schedule_refresh_images(double interval); + void process_refresh_images(); + + void get_mirror_uuid(); + void handle_get_mirror_uuid(int r); + + void handle_rewatch_complete(int r); + void handle_image_updated(const std::string &remote_image_id, + const std::string &global_image_id, + bool enabled); + + void schedule_listener(); + void notify_listener(); + +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::PoolWatcher<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_POOL_WATCHER_H diff --git a/src/tools/rbd_mirror/ProgressContext.h b/src/tools/rbd_mirror/ProgressContext.h new file mode 100644 index 00000000..e4430ee6 --- /dev/null +++ b/src/tools/rbd_mirror/ProgressContext.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_PROGRESS_CONTEXT_H +#define RBD_MIRROR_PROGRESS_CONTEXT_H + +namespace rbd { +namespace mirror { + +class ProgressContext +{ +public: + virtual ~ProgressContext() {} + virtual void update_progress(const std::string &description, + bool flush = true) = 0; +}; + +} // namespace mirror +} // namespace rbd + +#endif // RBD_MIRROR_PROGRESS_CONTEXT_H diff --git a/src/tools/rbd_mirror/ServiceDaemon.cc b/src/tools/rbd_mirror/ServiceDaemon.cc new file mode 100644 index 00000000..f3b549b8 --- /dev/null +++ b/src/tools/rbd_mirror/ServiceDaemon.cc @@ -0,0 +1,251 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"tools/rbd_mirror/ServiceDaemon.h" +#include "include/Context.h" +#include "include/stringify.h" +#include "common/ceph_context.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/Timer.h" +#include "tools/rbd_mirror/Threads.h" +#include <sstream> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::ServiceDaemon: " << this << " " \ + << __func__ << ": " + +namespace rbd { +namespace mirror { + +namespace { + +const std::string RBD_MIRROR_AUTH_ID_PREFIX("rbd-mirror."); + +struct AttributeDumpVisitor : public boost::static_visitor<void> { + ceph::Formatter *f; + const std::string& name; + + AttributeDumpVisitor(ceph::Formatter *f, const std::string& name) + : f(f), name(name) { + } + + void operator()(bool val) const { + f->dump_bool(name.c_str(), val); + } + void operator()(uint64_t val) const { + f->dump_unsigned(name.c_str(), val); + } + void operator()(const std::string& val) const { + f->dump_string(name.c_str(), val); + } +}; + +} // anonymous namespace + +using namespace service_daemon; + +template <typename I> +ServiceDaemon<I>::ServiceDaemon(CephContext *cct, RadosRef rados, + Threads<I>* threads) + : m_cct(cct), m_rados(rados), m_threads(threads), + m_lock("rbd::mirror::ServiceDaemon") { + dout(20) << dendl; +} + +template <typename I> +ServiceDaemon<I>::~ServiceDaemon() { + dout(20) << dendl; + Mutex::Locker timer_locker(m_threads->timer_lock); + if (m_timer_ctx != nullptr) { + m_threads->timer->cancel_event(m_timer_ctx); + update_status(); + } +} + +template <typename I> +int ServiceDaemon<I>::init() { + dout(20) << dendl; + + std::string id = m_cct->_conf->name.get_id(); + if (id.find(RBD_MIRROR_AUTH_ID_PREFIX) == 0) { + id = id.substr(RBD_MIRROR_AUTH_ID_PREFIX.size()); + } + + std::string instance_id = stringify(m_rados->get_instance_id()); + std::map<std::string, std::string> 
service_metadata = { + {"id", id}, {"instance_id", instance_id}}; + int r = m_rados->service_daemon_register("rbd-mirror", instance_id, + service_metadata); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +void ServiceDaemon<I>::add_pool(int64_t pool_id, const std::string& pool_name) { + dout(20) << "pool_id=" << pool_id << ", pool_name=" << pool_name << dendl; + + { + Mutex::Locker locker(m_lock); + m_pools.insert({pool_id, {pool_name}}); + } + schedule_update_status(); +} + +template <typename I> +void ServiceDaemon<I>::remove_pool(int64_t pool_id) { + dout(20) << "pool_id=" << pool_id << dendl; + { + Mutex::Locker locker(m_lock); + m_pools.erase(pool_id); + } + schedule_update_status(); +} + +template <typename I> +uint64_t ServiceDaemon<I>::add_or_update_callout(int64_t pool_id, + uint64_t callout_id, + CalloutLevel callout_level, + const std::string& text) { + dout(20) << "pool_id=" << pool_id << ", " + << "callout_id=" << callout_id << ", " + << "callout_level=" << callout_level << ", " + << "text=" << text << dendl; + + { + Mutex::Locker locker(m_lock); + auto pool_it = m_pools.find(pool_id); + if (pool_it == m_pools.end()) { + return CALLOUT_ID_NONE; + } + + if (callout_id == CALLOUT_ID_NONE) { + callout_id = ++m_callout_id; + } + pool_it->second.callouts[callout_id] = {callout_level, text}; + } + + schedule_update_status(); + return callout_id; +} + +template <typename I> +void ServiceDaemon<I>::remove_callout(int64_t pool_id, uint64_t callout_id) { + dout(20) << "pool_id=" << pool_id << ", " + << "callout_id=" << callout_id << dendl; + + { + Mutex::Locker locker(m_lock); + auto pool_it = m_pools.find(pool_id); + if (pool_it == m_pools.end()) { + return; + } + pool_it->second.callouts.erase(callout_id); + } + + schedule_update_status(); +} + +template <typename I> +void ServiceDaemon<I>::add_or_update_attribute(int64_t pool_id, + const std::string& key, + const AttributeValue& value) { + dout(20) << "pool_id=" << pool_id << ", " + 
<< "key=" << key << ", " + << "value=" << value << dendl; + + { + Mutex::Locker locker(m_lock); + auto pool_it = m_pools.find(pool_id); + if (pool_it == m_pools.end()) { + return; + } + pool_it->second.attributes[key] = value; + } + + schedule_update_status(); +} + +template <typename I> +void ServiceDaemon<I>::remove_attribute(int64_t pool_id, + const std::string& key) { + dout(20) << "pool_id=" << pool_id << ", " + << "key=" << key << dendl; + + { + Mutex::Locker locker(m_lock); + auto pool_it = m_pools.find(pool_id); + if (pool_it == m_pools.end()) { + return; + } + pool_it->second.attributes.erase(key); + } + + schedule_update_status(); +} + +template <typename I> +void ServiceDaemon<I>::schedule_update_status() { + Mutex::Locker timer_locker(m_threads->timer_lock); + if (m_timer_ctx != nullptr) { + return; + } + + m_timer_ctx = new FunctionContext([this](int) { + m_timer_ctx = nullptr; + update_status(); + }); + m_threads->timer->add_event_after(1, m_timer_ctx); +} + +template <typename I> +void ServiceDaemon<I>::update_status() { + dout(20) << dendl; + ceph_assert(m_threads->timer_lock.is_locked()); + + ceph::JSONFormatter f; + { + Mutex::Locker locker(m_lock); + f.open_object_section("pools"); + for (auto& pool_pair : m_pools) { + f.open_object_section(stringify(pool_pair.first).c_str()); + f.dump_string("name", pool_pair.second.name); + f.open_object_section("callouts"); + for (auto& callout : pool_pair.second.callouts) { + f.open_object_section(stringify(callout.first).c_str()); + f.dump_string("level", stringify(callout.second.level).c_str()); + f.dump_string("text", callout.second.text.c_str()); + f.close_section(); + } + f.close_section(); // callouts + + for (auto& attribute : pool_pair.second.attributes) { + AttributeDumpVisitor attribute_dump_visitor(&f, attribute.first); + boost::apply_visitor(attribute_dump_visitor, attribute.second); + } + f.close_section(); // pool + } + f.close_section(); // pools + } + + std::stringstream ss; + f.flush(ss); + + 
int r = m_rados->service_daemon_update_status({{"json", ss.str()}}); + if (r < 0) { + derr << "failed to update service daemon status: " << cpp_strerror(r) + << dendl; + } +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/ServiceDaemon.h b/src/tools/rbd_mirror/ServiceDaemon.h new file mode 100644 index 00000000..1de7e20b --- /dev/null +++ b/src/tools/rbd_mirror/ServiceDaemon.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_H +#define CEPH_RBD_MIRROR_SERVICE_DAEMON_H + +#include "common/Mutex.h" +#include "tools/rbd_mirror/Types.h" +#include "tools/rbd_mirror/service_daemon/Types.h" +#include <map> +#include <string> + +struct CephContext; +struct Context; +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +template <typename ImageCtxT = librbd::ImageCtx> +class ServiceDaemon { +public: + ServiceDaemon(CephContext *cct, RadosRef rados, Threads<ImageCtxT>* threads); + ~ServiceDaemon(); + + int init(); + + void add_pool(int64_t pool_id, const std::string& pool_name); + void remove_pool(int64_t pool_id); + + uint64_t add_or_update_callout(int64_t pool_id, uint64_t callout_id, + service_daemon::CalloutLevel callout_level, + const std::string& text); + void remove_callout(int64_t pool_id, uint64_t callout_id); + + void add_or_update_attribute(int64_t pool_id, const std::string& key, + const service_daemon::AttributeValue& value); + void remove_attribute(int64_t pool_id, const std::string& key); + +private: + struct Callout { + service_daemon::CalloutLevel level; + std::string text; + + Callout() : level(service_daemon::CALLOUT_LEVEL_INFO) { + } + Callout(service_daemon::CalloutLevel level, const std::string& text) + : level(level), text(text) { + } + }; + typedef std::map<uint64_t, Callout> Callouts; 
+ typedef std::map<std::string, service_daemon::AttributeValue> Attributes; + + struct Pool { + std::string name; + Callouts callouts; + Attributes attributes; + + Pool(const std::string& name) : name(name) { + } + }; + + typedef std::map<int64_t, Pool> Pools; + + CephContext *m_cct; + RadosRef m_rados; + Threads<ImageCtxT>* m_threads; + + Mutex m_lock; + Pools m_pools; + uint64_t m_callout_id = service_daemon::CALLOUT_ID_NONE; + + Context* m_timer_ctx = nullptr; + + void schedule_update_status(); + void update_status(); +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_H diff --git a/src/tools/rbd_mirror/Threads.cc b/src/tools/rbd_mirror/Threads.cc new file mode 100644 index 00000000..ca0a8b0f --- /dev/null +++ b/src/tools/rbd_mirror/Threads.cc @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/Threads.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" + +namespace rbd { +namespace mirror { + +template <typename I> +Threads<I>::Threads(CephContext *cct) : timer_lock("Threads::timer_lock") { + thread_pool = new ThreadPool(cct, "Journaler::thread_pool", "tp_journal", + cct->_conf.get_val<uint64_t>("rbd_op_threads"), + "rbd_op_threads"); + thread_pool->start(); + + work_queue = new ContextWQ("Journaler::work_queue", + cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"), + thread_pool); + + timer = new SafeTimer(cct, timer_lock, true); + timer->init(); +} + +template <typename I> +Threads<I>::~Threads() { + { + Mutex::Locker timer_locker(timer_lock); + timer->shutdown(); + } + delete timer; + + work_queue->drain(); + delete work_queue; + + thread_pool->stop(); + delete thread_pool; +} + +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::Threads<librbd::ImageCtx>; diff --git 
a/src/tools/rbd_mirror/Threads.h b/src/tools/rbd_mirror/Threads.h new file mode 100644 index 00000000..f52e8837 --- /dev/null +++ b/src/tools/rbd_mirror/Threads.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_THREADS_H +#define CEPH_RBD_MIRROR_THREADS_H + +#include "common/Mutex.h" + +class CephContext; +class ContextWQ; +class SafeTimer; +class ThreadPool; + +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename ImageCtxT = librbd::ImageCtx> +struct Threads { + ThreadPool *thread_pool = nullptr; + ContextWQ *work_queue = nullptr; + + SafeTimer *timer = nullptr; + Mutex timer_lock; + + explicit Threads(CephContext *cct); + Threads(const Threads&) = delete; + Threads& operator=(const Threads&) = delete; + + ~Threads(); +}; + +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::Threads<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_THREADS_H diff --git a/src/tools/rbd_mirror/Types.cc b/src/tools/rbd_mirror/Types.cc new file mode 100644 index 00000000..74fe318e --- /dev/null +++ b/src/tools/rbd_mirror/Types.cc @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/Types.h" + +namespace rbd { +namespace mirror { + +std::ostream &operator<<(std::ostream &os, const ImageId &image_id) { + return os << "global id=" << image_id.global_id << ", " + << "id=" << image_id.id; +} + +std::ostream& operator<<(std::ostream& lhs, const PeerSpec &peer) { + return lhs << "uuid: " << peer.uuid + << " cluster: " << peer.cluster_name + << " client: " << peer.client_name; +} + +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/Types.h b/src/tools/rbd_mirror/Types.h new file mode 100644 index 00000000..ed3b9d8a --- /dev/null +++ b/src/tools/rbd_mirror/Types.h @@ -0,0 +1,123 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_TYPES_H +#define CEPH_RBD_MIRROR_TYPES_H + +#include <iostream> +#include <memory> +#include <set> +#include <string> +#include <vector> + +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" + +namespace rbd { +namespace mirror { + +// Performance counters +enum { + l_rbd_mirror_first = 27000, + l_rbd_mirror_replay, + l_rbd_mirror_replay_bytes, + l_rbd_mirror_replay_latency, + l_rbd_mirror_last, +}; + +typedef std::shared_ptr<librados::Rados> RadosRef; +typedef std::shared_ptr<librados::IoCtx> IoCtxRef; +typedef std::shared_ptr<librbd::Image> ImageRef; + +struct ImageId { + std::string global_id; + std::string id; + + explicit ImageId(const std::string &global_id) : global_id(global_id) { + } + ImageId(const std::string &global_id, const std::string &id) + : global_id(global_id), id(id) { + } + + inline bool operator==(const ImageId &rhs) const { + return (global_id == rhs.global_id && id == rhs.id); + } + inline bool operator<(const ImageId &rhs) const { + return global_id < rhs.global_id; + } +}; + +std::ostream &operator<<(std::ostream &, const ImageId &image_id); + +typedef std::set<ImageId> ImageIds; + +struct Peer { + std::string peer_uuid; + librados::IoCtx io_ctx; + + Peer() { + } + Peer(const std::string &peer_uuid) : peer_uuid(peer_uuid) { + } + Peer(const std::string &peer_uuid, librados::IoCtx& io_ctx) + : peer_uuid(peer_uuid), io_ctx(io_ctx) { + } + + inline bool operator<(const Peer &rhs) const { + return peer_uuid < rhs.peer_uuid; + } +}; + +typedef std::set<Peer> Peers; + +struct PeerSpec { + PeerSpec() = default; + PeerSpec(const std::string &uuid, const std::string &cluster_name, + const std::string &client_name) + : uuid(uuid), cluster_name(cluster_name), client_name(client_name) + { + } + PeerSpec(const librbd::mirror_peer_t &peer) : + uuid(peer.uuid), + cluster_name(peer.cluster_name), + 
client_name(peer.client_name) + { + } + + std::string uuid; + std::string cluster_name; + std::string client_name; + + /// optional config properties + std::string mon_host; + std::string key; + + bool operator==(const PeerSpec& rhs) const { + return (uuid == rhs.uuid && + cluster_name == rhs.cluster_name && + client_name == rhs.client_name && + mon_host == rhs.mon_host && + key == rhs.key); + } + bool operator<(const PeerSpec& rhs) const { + if (uuid != rhs.uuid) { + return uuid < rhs.uuid; + } else if (cluster_name != rhs.cluster_name) { + return cluster_name < rhs.cluster_name; + } else if (client_name != rhs.client_name) { + return client_name < rhs.client_name; + } else if (mon_host != rhs.mon_host) { // '!=' (not '<'): key breaks ties only when mon_host is equal, keeping a strict weak ordering + return mon_host < rhs.mon_host; + } else { + return key < rhs.key; + } + } +}; + +std::ostream& operator<<(std::ostream& lhs, const PeerSpec &peer); + +} // namespace mirror +} // namespace rbd + + +#endif // CEPH_RBD_MIRROR_TYPES_H diff --git a/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc new file mode 100644 index 00000000..a0e9fd90 --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc @@ -0,0 +1,290 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/journal/Policy.h" +#include "tools/rbd_mirror/image_deleter/Types.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_deleter::SnapshotPurgeRequest: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_deleter { + +using 
librbd::util::create_context_callback; + +template <typename I> +void SnapshotPurgeRequest<I>::send() { + open_image(); +} + +template <typename I> +void SnapshotPurgeRequest<I>::open_image() { + dout(10) << dendl; + m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, false); + + { + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + m_image_ctx->set_journal_policy(new JournalPolicy()); + } + + Context *ctx = create_context_callback< + SnapshotPurgeRequest<I>, &SnapshotPurgeRequest<I>::handle_open_image>( + this); + m_image_ctx->state->open(librbd::OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template <typename I> +void SnapshotPurgeRequest<I>::handle_open_image(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to open image '" << m_image_id << "': " << cpp_strerror(r) + << dendl; + m_image_ctx->destroy(); + m_image_ctx = nullptr; + + finish(r); + return; + } + + acquire_lock(); +} + +template <typename I> +void SnapshotPurgeRequest<I>::acquire_lock() { + dout(10) << dendl; + + m_image_ctx->owner_lock.get_read(); + if (m_image_ctx->exclusive_lock == nullptr) { + m_image_ctx->owner_lock.put_read(); + + derr << "exclusive lock not enabled" << dendl; + m_ret_val = -EINVAL; + close_image(); + return; + } + + m_image_ctx->exclusive_lock->acquire_lock(create_context_callback< + SnapshotPurgeRequest<I>, &SnapshotPurgeRequest<I>::handle_acquire_lock>( + this)); + m_image_ctx->owner_lock.put_read(); +} + +template <typename I> +void SnapshotPurgeRequest<I>::handle_acquire_lock(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to acquire exclusive lock: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + { + RWLock::RLocker snap_locker(m_image_ctx->snap_lock); + m_snaps = m_image_ctx->snaps; + } + snap_unprotect(); +} + +template <typename I> +void SnapshotPurgeRequest<I>::snap_unprotect() { + if (m_snaps.empty()) { + close_image(); + return; + } + + librados::snap_t snap_id = 
m_snaps.back(); + m_image_ctx->snap_lock.get_read(); + int r = m_image_ctx->get_snap_namespace(snap_id, &m_snap_namespace); + if (r < 0) { + m_image_ctx->snap_lock.put_read(); + + derr << "failed to get snap namespace: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + r = m_image_ctx->get_snap_name(snap_id, &m_snap_name); + if (r < 0) { + m_image_ctx->snap_lock.put_read(); + + derr << "failed to get snap name: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + bool is_protected; + r = m_image_ctx->is_snap_protected(snap_id, &is_protected); + if (r < 0) { + m_image_ctx->snap_lock.put_read(); + + derr << "failed to get snap protection status: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + close_image(); + return; + } + m_image_ctx->snap_lock.put_read(); + + if (!is_protected) { + snap_remove(); + return; + } + + dout(10) << "snap_id=" << snap_id << ", " + << "snap_namespace=" << m_snap_namespace << ", " + << "snap_name=" << m_snap_name << dendl; + + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + derr << "lost exclusive lock" << dendl; + m_ret_val = r; + close_image(); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_snap_unprotect(r); + finish_op_ctx->complete(0); + }); + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + m_image_ctx->operations->execute_snap_unprotect( + m_snap_namespace, m_snap_name.c_str(), ctx); +} + +template <typename I> +void SnapshotPurgeRequest<I>::handle_snap_unprotect(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -EBUSY) { + dout(10) << "snapshot in-use" << dendl; + m_ret_val = r; + close_image(); + return; + } else if (r < 0) { + derr << "failed to unprotect snapshot: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + { + // avoid the need to refresh to delete the newly unprotected snapshot + RWLock::RLocker snap_locker(m_image_ctx->snap_lock); + 
librados::snap_t snap_id = m_snaps.back(); + auto snap_info_it = m_image_ctx->snap_info.find(snap_id); + if (snap_info_it != m_image_ctx->snap_info.end()) { + snap_info_it->second.protection_status = + RBD_PROTECTION_STATUS_UNPROTECTED; + } + } + + snap_remove(); +} + +template <typename I> +void SnapshotPurgeRequest<I>::snap_remove() { + librados::snap_t snap_id = m_snaps.back(); + dout(10) << "snap_id=" << snap_id << ", " + << "snap_namespace=" << m_snap_namespace << ", " + << "snap_name=" << m_snap_name << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + derr << "lost exclusive lock" << dendl; + m_ret_val = r; + close_image(); + return; + } + + auto ctx = new FunctionContext([this, finish_op_ctx](int r) { + handle_snap_remove(r); + finish_op_ctx->complete(0); + }); + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + m_image_ctx->operations->execute_snap_remove( + m_snap_namespace, m_snap_name.c_str(), ctx); +} + +template <typename I> +void SnapshotPurgeRequest<I>::handle_snap_remove(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -EBUSY) { + dout(10) << "snapshot in-use" << dendl; + m_ret_val = r; + close_image(); + return; + } else if (r < 0) { + derr << "failed to remove snapshot: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + m_snaps.pop_back(); + snap_unprotect(); +} + +template <typename I> +void SnapshotPurgeRequest<I>::close_image() { + dout(10) << dendl; + + m_image_ctx->state->close(create_context_callback< + SnapshotPurgeRequest<I>, + &SnapshotPurgeRequest<I>::handle_close_image>(this)); +} + +template <typename I> +void SnapshotPurgeRequest<I>::handle_close_image(int r) { + dout(10) << "r=" << r << dendl; + + m_image_ctx->destroy(); + m_image_ctx = nullptr; + + if (r < 0) { + derr << "failed to close: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + finish(0); +} + +template <typename I> +void SnapshotPurgeRequest<I>::finish(int r) { + if 
(m_ret_val < 0) { + r = m_ret_val; + } + + m_on_finish->complete(r); + delete this; +} + +template <typename I> +Context *SnapshotPurgeRequest<I>::start_lock_op(int* r) { + RWLock::RLocker owner_locker(m_image_ctx->owner_lock); + return m_image_ctx->exclusive_lock->start_op(r); +} + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_deleter::SnapshotPurgeRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h new file mode 100644 index 00000000..b8b635fe --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h @@ -0,0 +1,104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H +#define CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include <string> +#include <vector> + +class Context; +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_deleter { + +template <typename ImageCtxT = librbd::ImageCtx> +class SnapshotPurgeRequest { +public: + static SnapshotPurgeRequest* create(librados::IoCtx &io_ctx, + const std::string &image_id, + Context *on_finish) { + return new SnapshotPurgeRequest(io_ctx, image_id, on_finish); + } + + SnapshotPurgeRequest(librados::IoCtx &io_ctx, const std::string &image_id, + Context *on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), m_on_finish(on_finish) { + } + + void send(); + +private: + /* + * @verbatim + * + * <start> + * | + * v + * OPEN_IMAGE + * | + * v + * ACQUIRE_LOCK + * | + * | (repeat for each snapshot) + * |/------------------------\ + * | | + * v (skip if not needed) | + * SNAP_UNPROTECT | + * | | + * v (skip if not needed) | + * SNAP_REMOVE -----------------/ + * | + * v + * CLOSE_IMAGE + * | + * v + * 
<finish> + * + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + std::string m_image_id; + Context *m_on_finish; + + ImageCtxT *m_image_ctx = nullptr; + int m_ret_val = 0; + + std::vector<librados::snap_t> m_snaps; + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + + void open_image(); + void handle_open_image(int r); + + void acquire_lock(); + void handle_acquire_lock(int r); + + void snap_unprotect(); + void handle_snap_unprotect(int r); + + void snap_remove(); + void handle_snap_remove(int r); + + void close_image(); + void handle_close_image(int r); + + void finish(int r); + + Context *start_lock_op(int* r); + +}; + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_deleter::SnapshotPurgeRequest<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H + diff --git a/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc new file mode 100644 index 00000000..92db22ca --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc @@ -0,0 +1,384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/TrashWatcher.h" +#include "librbd/Utils.h" +#include "librbd/journal/ResetRequest.h" +#include "librbd/trash/MoveRequest.h" +#include "tools/rbd_mirror/image_deleter/Types.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashMoveRequest: " \ + << this << " " << __func__ << ": " +namespace rbd 
{ +namespace mirror { +namespace image_deleter { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +void TrashMoveRequest<I>::send() { + get_mirror_image_id(); +} + +template <typename I> +void TrashMoveRequest<I>::get_mirror_image_id() { + dout(10) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id); + + auto aio_comp = create_rados_callback< + TrashMoveRequest<I>, + &TrashMoveRequest<I>::handle_get_mirror_image_id>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashMoveRequest<I>::handle_get_mirror_image_id(int r) { + dout(10) << "r=" << r << dendl; + + if (r == 0) { + auto bl_it = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_get_image_id_finish(&bl_it, + &m_image_id); + } + if (r == -ENOENT) { + dout(10) << "image " << m_global_image_id << " is not mirrored" << dendl; + finish(r); + return; + } else if (r < 0) { + derr << "error retrieving local id for image " << m_global_image_id << ": " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + get_tag_owner(); +} + +template <typename I> +void TrashMoveRequest<I>::get_tag_owner() { + dout(10) << dendl; + + auto ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_get_tag_owner>(this); + librbd::Journal<I>::get_tag_owner(m_io_ctx, m_image_id, &m_mirror_uuid, + m_op_work_queue, ctx); +} + +template <typename I> +void TrashMoveRequest<I>::handle_get_tag_owner(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + derr << "error retrieving image primary info for image " + << m_global_image_id << ": " << cpp_strerror(r) << dendl; + finish(r); + return; + } else if (r != -ENOENT) { + if (m_mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) { + dout(10) << "image " << 
m_global_image_id << " is local primary" << dendl; + finish(-EPERM); + return; + } else if (m_mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID && + !m_resync) { + dout(10) << "image " << m_global_image_id << " is orphaned" << dendl; + finish(-EPERM); + return; + } + } + + disable_mirror_image(); +} + +template <typename I> +void TrashMoveRequest<I>::disable_mirror_image() { + dout(10) << dendl; + + cls::rbd::MirrorImage mirror_image; + mirror_image.global_image_id = m_global_image_id; + mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING; + + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_image_set(&op, m_image_id, mirror_image); + + auto aio_comp = create_rados_callback< + TrashMoveRequest<I>, + &TrashMoveRequest<I>::handle_disable_mirror_image>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashMoveRequest<I>::handle_disable_mirror_image(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -ENOENT) { + dout(10) << "local image is not mirrored, aborting deletion." 
<< dendl; + finish(r); + return; + } else if (r == -EEXIST || r == -EINVAL) { + derr << "cannot disable mirroring for image " << m_global_image_id + << ": global_image_id has changed/reused: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } else if (r < 0) { + derr << "cannot disable mirroring for image " << m_global_image_id + << ": " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + reset_journal(); +} + +template <typename I> +void TrashMoveRequest<I>::reset_journal() { + dout(10) << dendl; + + // ensure that if the image is recovered any peers will split-brain + auto ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_reset_journal>(this); + auto req = librbd::journal::ResetRequest<I>::create( + m_io_ctx, m_image_id, librbd::Journal<>::IMAGE_CLIENT_ID, + librbd::Journal<>::LOCAL_MIRROR_UUID, m_op_work_queue, ctx); + req->send(); +} + +template <typename I> +void TrashMoveRequest<I>::handle_reset_journal(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + derr << "failed to reset journal: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + open_image(); +} + +template <typename I> +void TrashMoveRequest<I>::open_image() { + dout(10) << dendl; + + m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, false); + + { + // don't attempt to open the journal + RWLock::WLocker snap_locker(m_image_ctx->snap_lock); + m_image_ctx->set_journal_policy(new JournalPolicy()); + } + + Context *ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_open_image>(this); + m_image_ctx->state->open(librbd::OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template <typename I> +void TrashMoveRequest<I>::handle_open_image(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to open image: " << cpp_strerror(r) << dendl; + m_image_ctx->destroy(); + m_image_ctx = nullptr; + finish(r); + return; + } + + if (m_image_ctx->old_format) { + derr << "cannot move 
v1 image to trash" << dendl; + m_ret_val = -EINVAL; + close_image(); + return; + } + + acquire_lock(); +} + +template <typename I> +void TrashMoveRequest<I>::acquire_lock() { + m_image_ctx->owner_lock.get_read(); + if (m_image_ctx->exclusive_lock == nullptr) { + derr << "exclusive lock feature not enabled" << dendl; + m_image_ctx->owner_lock.put_read(); + m_ret_val = -EINVAL; + close_image(); + return; + } + + dout(10) << dendl; + + Context *ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_acquire_lock>(this); + m_image_ctx->exclusive_lock->block_requests(0); + m_image_ctx->exclusive_lock->acquire_lock(ctx); + m_image_ctx->owner_lock.put_read(); +} + +template <typename I> +void TrashMoveRequest<I>::handle_acquire_lock(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to acquire exclusive lock: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + trash_move(); +} + +template <typename I> +void TrashMoveRequest<I>::trash_move() { + dout(10) << dendl; + + utime_t delete_time{ceph_clock_now()}; + utime_t deferment_end_time{delete_time}; + deferment_end_time += + m_image_ctx->config.template get_val<uint64_t>("rbd_mirroring_delete_delay"); + + m_trash_image_spec = { + cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING, m_image_ctx->name, delete_time, + deferment_end_time}; + + Context *ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_trash_move>(this); + auto req = librbd::trash::MoveRequest<I>::create( + m_io_ctx, m_image_id, m_trash_image_spec, ctx); + req->send(); +} + +template <typename I> +void TrashMoveRequest<I>::handle_trash_move(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to move image to trash: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_image(); + return; + } + + m_moved_to_trash = true; + remove_mirror_image(); +} + +template <typename I> +void TrashMoveRequest<I>::remove_mirror_image() { + dout(10) 
<< dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_image_remove(&op, m_image_id); + + auto aio_comp = create_rados_callback< + TrashMoveRequest<I>, + &TrashMoveRequest<I>::handle_remove_mirror_image>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashMoveRequest<I>::handle_remove_mirror_image(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -ENOENT) { + dout(10) << "local image is not mirrored" << dendl; + } else if (r < 0) { + derr << "failed to remove mirror image state for " << m_global_image_id + << ": " << cpp_strerror(r) << dendl; + m_ret_val = r; + } + + close_image(); +} + +template <typename I> +void TrashMoveRequest<I>::close_image() { + dout(10) << dendl; + + Context *ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_close_image>(this); + m_image_ctx->state->close(ctx); +} + +template <typename I> +void TrashMoveRequest<I>::handle_close_image(int r) { + dout(10) << "r=" << r << dendl; + + m_image_ctx->destroy(); + m_image_ctx = nullptr; + + if (r < 0) { + derr << "failed to close image: " << cpp_strerror(r) << dendl; + } + + // don't send notification if we failed + if (!m_moved_to_trash) { + finish(0); + return; + } + + notify_trash_add(); +} + +template <typename I> +void TrashMoveRequest<I>::notify_trash_add() { + dout(10) << dendl; + + Context *ctx = create_context_callback< + TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_notify_trash_add>(this); + librbd::TrashWatcher<I>::notify_image_added(m_io_ctx, m_image_id, + m_trash_image_spec, ctx); +} + +template <typename I> +void TrashMoveRequest<I>::handle_notify_trash_add(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to notify trash watchers: " << cpp_strerror(r) << dendl; + } + + finish(0); +} + +template <typename I> +void TrashMoveRequest<I>::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + 
} + + dout(10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_deleter::TrashMoveRequest<librbd::ImageCtx>; + diff --git a/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h new file mode 100644 index 00000000..07b7432e --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H +#define CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H + +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include <boost/optional.hpp> +#include <string> + +struct Context; +class ContextWQ; +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_deleter { + +template <typename ImageCtxT = librbd::ImageCtx> +class TrashMoveRequest { +public: + static TrashMoveRequest* create(librados::IoCtx& io_ctx, + const std::string& global_image_id, + bool resync, ContextWQ* op_work_queue, + Context* on_finish) { + return new TrashMoveRequest(io_ctx, global_image_id, resync, op_work_queue, + on_finish); + } + + TrashMoveRequest(librados::IoCtx& io_ctx, const std::string& global_image_id, + bool resync, ContextWQ* op_work_queue, Context* on_finish) + : m_io_ctx(io_ctx), m_global_image_id(global_image_id), m_resync(resync), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + } + + void send(); + +private: + /* + * @verbatim + * + * <start> + * | + * v + * GET_MIRROR_IMAGE_ID + * | + * v + * GET_TAG_OWNER + * | + * v + * DISABLE_MIRROR_IMAGE + * | + * v + * RESET_JOURNAL + * | + * v + * OPEN_IMAGE + * | + * v + * ACQUIRE_LOCK + * | + * v + * TRASH_MOVE + * | + * v + * REMOVE_MIRROR_IMAGE + * | + * v + * CLOSE_IMAGE + * 
| + * v + * NOTIFY_TRASH_ADD + * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + std::string m_global_image_id; + bool m_resync; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + ceph::bufferlist m_out_bl; + std::string m_image_id; + std::string m_mirror_uuid; + cls::rbd::TrashImageSpec m_trash_image_spec; + ImageCtxT *m_image_ctx = nullptr; + int m_ret_val = 0; + bool m_moved_to_trash = false; + + void get_mirror_image_id(); + void handle_get_mirror_image_id(int r); + + void get_tag_owner(); + void handle_get_tag_owner(int r); + + void disable_mirror_image(); + void handle_disable_mirror_image(int r); + + void reset_journal(); + void handle_reset_journal(int r); + + void open_image(); + void handle_open_image(int r); + + void acquire_lock(); + void handle_acquire_lock(int r); + + void trash_move(); + void handle_trash_move(int r); + + void remove_mirror_image(); + void handle_remove_mirror_image(int r); + + void close_image(); + void handle_close_image(int r); + + void notify_trash_add(); + void handle_notify_trash_add(int r); + + void finish(int r); + +}; + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_deleter::TrashMoveRequest<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc new file mode 100644 index 00000000..e7c725dc --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h" +#include "include/ceph_assert.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include 
"librbd/TrashWatcher.h" +#include "librbd/Utils.h" +#include "librbd/trash/RemoveRequest.h" +#include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashRemoveRequest: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_deleter { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +void TrashRemoveRequest<I>::send() { + *m_error_result = ERROR_RESULT_RETRY; + + get_trash_image_spec(); +} + +template <typename I> +void TrashRemoveRequest<I>::get_trash_image_spec() { + dout(10) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::trash_get_start(&op, m_image_id); + + auto aio_comp = create_rados_callback< + TrashRemoveRequest<I>, + &TrashRemoveRequest<I>::handle_get_trash_image_spec>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashRemoveRequest<I>::handle_get_trash_image_spec(int r) { + dout(10) << "r=" << r << dendl; + + if (r == 0) { + auto bl_it = m_out_bl.cbegin(); + r = librbd::cls_client::trash_get_finish(&bl_it, &m_trash_image_spec); + } + + if (r == -ENOENT || (r >= 0 && m_trash_image_spec.source != + cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING)) { + dout(10) << "image id " << m_image_id << " not in mirroring trash" << dendl; + finish(0); + return; + } else if (r < 0) { + derr << "error getting image id " << m_image_id << " info from trash: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_trash_image_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL && + m_trash_image_spec.state != cls::rbd::TRASH_IMAGE_STATE_REMOVING) { + dout(10) << "image " << m_image_id << " is not in an expected trash state: " + << 
m_trash_image_spec.state << dendl; + *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY; + finish(-EBUSY); + return; + } + + set_trash_state(); +} + +template <typename I> +void TrashRemoveRequest<I>::set_trash_state() { + if (m_trash_image_spec.state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) { + get_snap_context(); + return; + } + + dout(10) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::trash_state_set(&op, m_image_id, + cls::rbd::TRASH_IMAGE_STATE_REMOVING, + cls::rbd::TRASH_IMAGE_STATE_NORMAL); + + auto aio_comp = create_rados_callback< + TrashRemoveRequest<I>, + &TrashRemoveRequest<I>::handle_set_trash_state>(this); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashRemoveRequest<I>::handle_set_trash_state(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -ENOENT) { + dout(10) << "image id " << m_image_id << " not in mirroring trash" << dendl; + finish(0); + return; + } else if (r < 0 && r != -EOPNOTSUPP) { + derr << "error setting trash image state for image id " << m_image_id + << ": " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + get_snap_context(); +} + +template <typename I> +void TrashRemoveRequest<I>::get_snap_context() { + dout(10) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::get_snapcontext_start(&op); + + std::string header_oid = librbd::util::header_name(m_image_id); + + auto aio_comp = create_rados_callback< + TrashRemoveRequest<I>, + &TrashRemoveRequest<I>::handle_get_snap_context>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(header_oid, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashRemoveRequest<I>::handle_get_snap_context(int r) { + dout(10) << "r=" << r << dendl; + + ::SnapContext snapc; + if (r == 0) { + auto bl_it = m_out_bl.cbegin(); + r = librbd::cls_client::get_snapcontext_finish(&bl_it, &snapc); + } + if 
(r < 0 && r != -ENOENT) { + derr << "error retrieving snapshot context for image " + << m_image_id << ": " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_has_snapshots = (!snapc.empty()); + purge_snapshots(); +} + +template <typename I> +void TrashRemoveRequest<I>::purge_snapshots() { + if (!m_has_snapshots) { + remove_image(); + return; + } + + dout(10) << dendl; + auto ctx = create_context_callback< + TrashRemoveRequest<I>, + &TrashRemoveRequest<I>::handle_purge_snapshots>(this); + auto req = SnapshotPurgeRequest<I>::create(m_io_ctx, m_image_id, ctx); + req->send(); +} + +template <typename I> +void TrashRemoveRequest<I>::handle_purge_snapshots(int r) { + dout(10) << "r=" << r << dendl; + + if (r == -EBUSY) { + dout(10) << "snapshots still in-use" << dendl; + *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY; + finish(r); + return; + } else if (r < 0) { + derr << "failed to purge image snapshots: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + remove_image(); +} + +template <typename I> +void TrashRemoveRequest<I>::remove_image() { + dout(10) << dendl; + + auto ctx = create_context_callback< + TrashRemoveRequest<I>, + &TrashRemoveRequest<I>::handle_remove_image>(this); + auto req = librbd::trash::RemoveRequest<I>::create( + m_io_ctx, m_image_id, m_op_work_queue, true, m_progress_ctx, + ctx); + req->send(); +} + +template <typename I> +void TrashRemoveRequest<I>::handle_remove_image(int r) { + dout(10) << "r=" << r << dendl; + if (r == -ENOTEMPTY) { + // image must have clone v2 snapshot still associated to child + dout(10) << "snapshots still in-use" << dendl; + *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY; + finish(-EBUSY); + return; + } + + if (r < 0 && r != -ENOENT) { + derr << "error removing image " << m_image_id << " " + << "(" << m_image_id << ") from local pool: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + notify_trash_removed(); +} + +template <typename I> +void 
TrashRemoveRequest<I>::notify_trash_removed() { + dout(10) << dendl; + + Context *ctx = create_context_callback< + TrashRemoveRequest<I>, + &TrashRemoveRequest<I>::handle_notify_trash_removed>(this); + librbd::TrashWatcher<I>::notify_image_removed(m_io_ctx, m_image_id, ctx); +} + +template <typename I> +void TrashRemoveRequest<I>::handle_notify_trash_removed(int r) { + dout(10) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to notify trash watchers: " << cpp_strerror(r) << dendl; + } + + finish(0); +} + +template <typename I> +void TrashRemoveRequest<I>::finish(int r) { + dout(10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_deleter::TrashRemoveRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h new file mode 100644 index 00000000..d2295e8e --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H +#define CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/buffer.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/internal.h" +#include "tools/rbd_mirror/image_deleter/Types.h" +#include <string> +#include <vector> + +class Context; +class ContextWQ; +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_deleter { + +template <typename ImageCtxT = librbd::ImageCtx> +class TrashRemoveRequest { +public: + static TrashRemoveRequest* create(librados::IoCtx &io_ctx, + const std::string &image_id, + ErrorResult *error_result, + ContextWQ *op_work_queue, + Context *on_finish) { + return new TrashRemoveRequest(io_ctx, image_id, 
error_result, op_work_queue, + on_finish); + } + + TrashRemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id, + ErrorResult *error_result, ContextWQ *op_work_queue, + Context *on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), m_error_result(error_result), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + } + + void send(); + +private: + /* + * @verbatim + * + * <start> + * | + * v + * GET_TRASH_IMAGE_SPEC + * | + * v + * SET_TRASH_STATE + * | + * v + * GET_SNAP_CONTEXT + * | + * v + * PURGE_SNAPSHOTS + * | + * v + * TRASH_REMOVE + * | + * v + * NOTIFY_TRASH_REMOVE + * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + std::string m_image_id; + ErrorResult *m_error_result; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + ceph::bufferlist m_out_bl; + cls::rbd::TrashImageSpec m_trash_image_spec; + bool m_has_snapshots = false; + librbd::NoOpProgressContext m_progress_ctx; + + void get_trash_image_spec(); + void handle_get_trash_image_spec(int r); + + void set_trash_state(); + void handle_set_trash_state(int r); + + void get_snap_context(); + void handle_get_snap_context(int r); + + void purge_snapshots(); + void handle_purge_snapshots(int r); + + void remove_image(); + void handle_remove_image(int r); + + void notify_trash_removed(); + void handle_notify_trash_removed(int r); + + void finish(int r); + +}; + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_deleter::TrashRemoveRequest<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc b/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc new file mode 100644 index 00000000..8735dfb7 --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc @@ -0,0 +1,384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"tools/rbd_mirror/image_deleter/TrashWatcher.h" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "tools/rbd_mirror/Threads.h" +#include "tools/rbd_mirror/image_deleter/Types.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashWatcher: " \ + << this << " " << __func__ << ": " + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +namespace rbd { +namespace mirror { +namespace image_deleter { + +namespace { + +const size_t MAX_RETURN = 1024; + +} // anonymous namespace + +template <typename I> +TrashWatcher<I>::TrashWatcher(librados::IoCtx &io_ctx, Threads<I> *threads, + TrashListener& trash_listener) + : librbd::TrashWatcher<I>(io_ctx, threads->work_queue), + m_io_ctx(io_ctx), m_threads(threads), m_trash_listener(trash_listener), + m_lock(librbd::util::unique_lock_name( + "rbd::mirror::image_deleter::TrashWatcher", this)) { +} + +template <typename I> +void TrashWatcher<I>::init(Context *on_finish) { + dout(5) << dendl; + + { + Mutex::Locker locker(m_lock); + m_on_init_finish = on_finish; + + ceph_assert(!m_trash_list_in_progress); + m_trash_list_in_progress = true; + } + + create_trash(); +} + +template <typename I> +void TrashWatcher<I>::shut_down(Context *on_finish) { + dout(5) << dendl; + + { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + + ceph_assert(!m_shutting_down); + m_shutting_down = true; + if (m_timer_ctx != nullptr) { + m_threads->timer->cancel_event(m_timer_ctx); + m_timer_ctx = nullptr; + } + } + + auto ctx = new FunctionContext([this, on_finish](int r) { + unregister_watcher(on_finish); + }); + m_async_op_tracker.wait_for_ops(ctx); +} + +template <typename I> +void 
TrashWatcher<I>::handle_image_added(const std::string &image_id, + const cls::rbd::TrashImageSpec& spec) { + dout(10) << "image_id=" << image_id << dendl; + + Mutex::Locker locker(m_lock); + add_image(image_id, spec); +} + +template <typename I> +void TrashWatcher<I>::handle_image_removed(const std::string &image_id) { + // ignore removals -- the image deleter will ignore -ENOENTs +} + +template <typename I> +void TrashWatcher<I>::handle_rewatch_complete(int r) { + dout(5) << "r=" << r << dendl; + + if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted" << dendl; + return; + } else if (r == -ENOENT) { + dout(5) << "trash directory deleted" << dendl; + } else if (r < 0) { + derr << "unexpected error re-registering trash directory watch: " + << cpp_strerror(r) << dendl; + } + schedule_trash_list(30); +} + +template <typename I> +void TrashWatcher<I>::create_trash() { + dout(20) << dendl; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_trash_list_in_progress); + } + + librados::ObjectWriteOperation op; + op.create(false); + + m_async_op_tracker.start_op(); + auto aio_comp = create_rados_callback< + TrashWatcher<I>, &TrashWatcher<I>::handle_create_trash>(this); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashWatcher<I>::handle_create_trash(int r) { + dout(20) << "r=" << r << dendl; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_trash_list_in_progress); + } + + Context* on_init_finish = nullptr; + if (r == -EBLACKLISTED || r == -ENOENT) { + if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted" << dendl; + } else { + dout(0) << "detected pool no longer exists" << dendl; + } + + Mutex::Locker locker(m_lock); + std::swap(on_init_finish, m_on_init_finish); + m_trash_list_in_progress = false; + } else if (r < 0 && r != -EEXIST) { + derr << "failed to create trash object: " << cpp_strerror(r) << dendl; + { + Mutex::Locker locker(m_lock); + 
m_trash_list_in_progress = false; + } + + schedule_trash_list(30); + } else { + register_watcher(); + } + + m_async_op_tracker.finish_op(); + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + } +} + +template <typename I> +void TrashWatcher<I>::register_watcher() { + { + Mutex::Locker locker(m_lock); + ceph_assert(m_trash_list_in_progress); + } + + // if the watch registration is in-flight, let the watcher + // handle the transition -- only (re-)register if it's not registered + if (!this->is_unregistered()) { + trash_list(true); + return; + } + + // first time registering or the watch failed + dout(5) << dendl; + m_async_op_tracker.start_op(); + + Context *ctx = create_context_callback< + TrashWatcher, &TrashWatcher<I>::handle_register_watcher>(this); + this->register_watch(ctx); +} + +template <typename I> +void TrashWatcher<I>::handle_register_watcher(int r) { + dout(5) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_trash_list_in_progress); + if (r < 0) { + m_trash_list_in_progress = false; + } + } + + Context *on_init_finish = nullptr; + if (r >= 0) { + trash_list(true); + } else if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted" << dendl; + + Mutex::Locker locker(m_lock); + std::swap(on_init_finish, m_on_init_finish); + } else { + derr << "unexpected error registering trash directory watch: " + << cpp_strerror(r) << dendl; + schedule_trash_list(10); + } + + m_async_op_tracker.finish_op(); + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + } +} + +template <typename I> +void TrashWatcher<I>::unregister_watcher(Context* on_finish) { + dout(5) << dendl; + + m_async_op_tracker.start_op(); + Context *ctx = new FunctionContext([this, on_finish](int r) { + handle_unregister_watcher(r, on_finish); + }); + this->unregister_watch(ctx); +} + +template <typename I> +void TrashWatcher<I>::handle_unregister_watcher(int r, Context* on_finish) { + dout(5) << "unregister_watcher: r=" << r << 
dendl; + if (r < 0) { + derr << "error unregistering watcher for trash directory: " + << cpp_strerror(r) << dendl; + } + m_async_op_tracker.finish_op(); + on_finish->complete(0); +} + +template <typename I> +void TrashWatcher<I>::trash_list(bool initial_request) { + if (initial_request) { + m_async_op_tracker.start_op(); + m_last_image_id = ""; + } + + dout(5) << "last_image_id=" << m_last_image_id << dendl; + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_trash_list_in_progress); + } + + librados::ObjectReadOperation op; + librbd::cls_client::trash_list_start(&op, m_last_image_id, MAX_RETURN); + + librados::AioCompletion *aio_comp = create_rados_callback< + TrashWatcher<I>, &TrashWatcher<I>::handle_trash_list>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void TrashWatcher<I>::handle_trash_list(int r) { + dout(5) << "r=" << r << dendl; + + std::map<std::string, cls::rbd::TrashImageSpec> images; + if (r >= 0) { + auto bl_it = m_out_bl.cbegin(); + r = librbd::cls_client::trash_list_finish(&bl_it, &images); + } + + Context *on_init_finish = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_trash_list_in_progress); + if (r >= 0) { + for (auto& image : images) { + add_image(image.first, image.second); + } + } else if (r == -ENOENT) { + r = 0; + } + + if (r == -EBLACKLISTED) { + dout(0) << "detected client is blacklisted during trash refresh" << dendl; + m_trash_list_in_progress = false; + std::swap(on_init_finish, m_on_init_finish); + } else if (r >= 0 && images.size() < MAX_RETURN) { + m_trash_list_in_progress = false; + std::swap(on_init_finish, m_on_init_finish); + } else if (r < 0) { + m_trash_list_in_progress = false; + } + } + + if (r >= 0 && images.size() == MAX_RETURN) { + m_last_image_id = images.rbegin()->first; + trash_list(false); + return; + } else if (r < 0 && r != -EBLACKLISTED) { + derr << "failed to retrieve trash 
directory: " << cpp_strerror(r) << dendl; + schedule_trash_list(10); + } + + m_async_op_tracker.finish_op(); + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + } +} + +template <typename I> +void TrashWatcher<I>::schedule_trash_list(double interval) { + Mutex::Locker timer_locker(m_threads->timer_lock); + Mutex::Locker locker(m_lock); + if (m_shutting_down || m_trash_list_in_progress || m_timer_ctx != nullptr) { + if (m_trash_list_in_progress && !m_deferred_trash_list) { + dout(5) << "deferring refresh until in-flight refresh completes" << dendl; + m_deferred_trash_list = true; + } + return; + } + + dout(5) << dendl; + m_timer_ctx = m_threads->timer->add_event_after( + interval, + new FunctionContext([this](int r) { + process_trash_list(); + })); +} + +template <typename I> +void TrashWatcher<I>::process_trash_list() { + dout(5) << dendl; + + ceph_assert(m_threads->timer_lock.is_locked()); + ceph_assert(m_timer_ctx != nullptr); + m_timer_ctx = nullptr; + + { + Mutex::Locker locker(m_lock); + ceph_assert(!m_trash_list_in_progress); + m_trash_list_in_progress = true; + } + + // execute outside of the timer's lock + m_async_op_tracker.start_op(); + Context *ctx = new FunctionContext([this](int r) { + create_trash(); + m_async_op_tracker.finish_op(); + }); + m_threads->work_queue->queue(ctx, 0); +} + +template <typename I> +void TrashWatcher<I>::add_image(const std::string& image_id, + const cls::rbd::TrashImageSpec& spec) { + if (spec.source != cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING) { + return; + } + + ceph_assert(m_lock.is_locked()); + auto& deferment_end_time = spec.deferment_end_time; + dout(10) << "image_id=" << image_id << ", " + << "deferment_end_time=" << deferment_end_time << dendl; + + m_async_op_tracker.start_op(); + auto ctx = new FunctionContext([this, image_id, deferment_end_time](int r) { + m_trash_listener.handle_trash_image(image_id, deferment_end_time); + m_async_op_tracker.finish_op(); + }); + m_threads->work_queue->queue(ctx, 0); 
+} + +} // namespace image_deleter; +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_deleter::TrashWatcher<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_deleter/TrashWatcher.h b/src/tools/rbd_mirror/image_deleter/TrashWatcher.h new file mode 100644 index 00000000..b6f69833 --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/TrashWatcher.h @@ -0,0 +1,139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_WATCHER_H +#define CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_WATCHER_H + +#include "include/rados/librados.hpp" +#include "common/AsyncOpTracker.h" +#include "common/Mutex.h" +#include "librbd/TrashWatcher.h" +#include <set> +#include <string> + +struct Context; +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +namespace image_deleter { + +struct TrashListener; + +template <typename ImageCtxT = librbd::ImageCtx> +class TrashWatcher : public librbd::TrashWatcher<ImageCtxT> { +public: + static TrashWatcher* create(librados::IoCtx &io_ctx, + Threads<ImageCtxT> *threads, + TrashListener& trash_listener) { + return new TrashWatcher(io_ctx, threads, trash_listener); + } + + TrashWatcher(librados::IoCtx &io_ctx, Threads<ImageCtxT> *threads, + TrashListener& trash_listener); + TrashWatcher(const TrashWatcher&) = delete; + TrashWatcher& operator=(const TrashWatcher&) = delete; + + void init(Context *on_finish); + void shut_down(Context *on_finish); + +protected: + void handle_image_added(const std::string &image_id, + const cls::rbd::TrashImageSpec& spec) override; + + void handle_image_removed(const std::string &image_id) override; + + void handle_rewatch_complete(int r) override; + +private: + /** + * @verbatim + * + * <start> + * | + * v + * INIT + * | + * v + * CREATE_TRASH + * | + * v + * REGISTER_WATCHER + * | + * |/--------------------------------\ + * | | + * 
|/---------\ | + * | | | + * v | (more images) | + * TRASH_LIST ---/ | + * | | + * |/----------------------------\ | + * | | | + * v | | + * <idle> --\ | | + * | | | | + * | |\---> IMAGE_ADDED -----/ | + * | | | + * | \----> WATCH_ERROR ---------/ + * v + * SHUT_DOWN + * | + * v + * UNREGISTER_WATCHER + * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx m_io_ctx; + Threads<ImageCtxT> *m_threads; + TrashListener& m_trash_listener; + + std::string m_last_image_id; + bufferlist m_out_bl; + + mutable Mutex m_lock; + + Context *m_on_init_finish = nullptr; + Context *m_timer_ctx = nullptr; + + AsyncOpTracker m_async_op_tracker; + bool m_trash_list_in_progress = false; + bool m_deferred_trash_list = false; + bool m_shutting_down = false; + + void register_watcher(); + void handle_register_watcher(int r); + + void create_trash(); + void handle_create_trash(int r); + + void unregister_watcher(Context* on_finish); + void handle_unregister_watcher(int r, Context* on_finish); + + void trash_list(bool initial_request); + void handle_trash_list(int r); + + void schedule_trash_list(double interval); + void process_trash_list(); + + void get_mirror_uuid(); + void handle_get_mirror_uuid(int r); + + void add_image(const std::string& image_id, + const cls::rbd::TrashImageSpec& spec); + +}; + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_deleter::TrashWatcher<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_WATCHER_H diff --git a/src/tools/rbd_mirror/image_deleter/Types.h b/src/tools/rbd_mirror/image_deleter/Types.h new file mode 100644 index 00000000..ac3bc64a --- /dev/null +++ b/src/tools/rbd_mirror/image_deleter/Types.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H +#define CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H + +#include "include/Context.h" +#include 
"librbd/journal/Policy.h" +#include <string> + +struct utime_t; + +namespace rbd { +namespace mirror { +namespace image_deleter { + +enum ErrorResult { + ERROR_RESULT_COMPLETE, + ERROR_RESULT_RETRY, + ERROR_RESULT_RETRY_IMMEDIATELY +}; + +struct TrashListener { + TrashListener() { + } + TrashListener(const TrashListener&) = delete; + TrashListener& operator=(const TrashListener&) = delete; + + virtual ~TrashListener() { + } + + virtual void handle_trash_image(const std::string& image_id, + const utime_t& deferment_end_time) = 0; + +}; + +struct JournalPolicy : public librbd::journal::Policy { + bool append_disabled() const override { + return true; + } + bool journal_disabled() const override { + return true; + } + + void allocate_tag_on_lock(Context *on_finish) override { + on_finish->complete(0); + } +}; + +} // namespace image_deleter +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H diff --git a/src/tools/rbd_mirror/image_map/LoadRequest.cc b/src/tools/rbd_mirror/image_map/LoadRequest.cc new file mode 100644 index 00000000..7387b476 --- /dev/null +++ b/src/tools/rbd_mirror/image_map/LoadRequest.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "librbd/Utils.h" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" + +#include "LoadRequest.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_map::LoadRequest: " \ + << this << " " << __func__ + +namespace rbd { +namespace mirror { +namespace image_map { + +static const uint32_t MAX_RETURN = 1024; + +using librbd::util::create_rados_callback; + +template<typename I> +LoadRequest<I>::LoadRequest(librados::IoCtx &ioctx, + std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping, + Context *on_finish) + : 
m_ioctx(ioctx), + m_image_mapping(image_mapping), + m_on_finish(on_finish) { +} + +template<typename I> +void LoadRequest<I>::send() { + dout(20) << dendl; + + image_map_list(); +} + +template<typename I> +void LoadRequest<I>::image_map_list() { + dout(20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_map_list_start(&op, m_start_after, MAX_RETURN); + + librados::AioCompletion *aio_comp = create_rados_callback< + LoadRequest, &LoadRequest::handle_image_map_list>(this); + + m_out_bl.clear(); + int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template<typename I> +void LoadRequest<I>::handle_image_map_list(int r) { + dout(20) << ": r=" << r << dendl; + + std::map<std::string, cls::rbd::MirrorImageMap> image_mapping; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_map_list_finish(&it, &image_mapping); + } + + if (r < 0) { + derr << ": failed to get image map: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_image_mapping->insert(image_mapping.begin(), image_mapping.end()); + + if (image_mapping.size() == MAX_RETURN) { + m_start_after = image_mapping.rbegin()->first; + image_map_list(); + return; + } + + finish(0); +} + +template<typename I> +void LoadRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_map::LoadRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_map/LoadRequest.h b/src/tools/rbd_mirror/image_map/LoadRequest.h new file mode 100644 index 00000000..7657e110 --- /dev/null +++ b/src/tools/rbd_mirror/image_map/LoadRequest.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H +#define 
CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/rados/librados.hpp" + +class Context; + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_map { + +template<typename ImageCtxT = librbd::ImageCtx> +class LoadRequest { +public: + static LoadRequest *create(librados::IoCtx &ioctx, + std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping, + Context *on_finish) { + return new LoadRequest(ioctx, image_mapping, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | . . . . . . . . + * v v . MAX_RETURN + * IMAGE_MAP_LIST. . . . . . . + * | + * v + * <finish> + * + * @endverbatim + */ + LoadRequest(librados::IoCtx &ioctx, + std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping, + Context *on_finish); + + librados::IoCtx &m_ioctx; + std::map<std::string, cls::rbd::MirrorImageMap> *m_image_mapping; + Context *m_on_finish; + + bufferlist m_out_bl; + std::string m_start_after; + + void image_map_list(); + void handle_image_map_list(int r); + + void finish(int r); +}; + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H diff --git a/src/tools/rbd_mirror/image_map/Policy.cc b/src/tools/rbd_mirror/image_map/Policy.cc new file mode 100644 index 00000000..6fababdd --- /dev/null +++ b/src/tools/rbd_mirror/image_map/Policy.cc @@ -0,0 +1,406 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "librbd/Utils.h" +#include "Policy.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_map::Policy: " << this \ + << " " << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_map { + +namespace { + +bool is_instance_action(ActionType action_type) 
{
  // only acquire/release are classified as instance actions
  switch (action_type) {
  case ACTION_TYPE_ACQUIRE:
  case ACTION_TYPE_RELEASE:
    return true;
  case ACTION_TYPE_NONE:
  case ACTION_TYPE_MAP_UPDATE:
  case ACTION_TYPE_MAP_REMOVE:
    break;
  }
  return false;
}

} // anonymous namespace

using ::operator<<;
using librbd::util::unique_lock_name;

Policy::Policy(librados::IoCtx &ioctx)
  : m_ioctx(ioctx),
    m_map_lock(unique_lock_name("rbd::mirror::image_map::Policy::m_map_lock",
       this)) {

  // map should at least have one instance
  std::string instance_id = stringify(ioctx.get_instance_id());
  m_map.emplace(instance_id, std::set<std::string>{});
}

// Seeds the policy from a previously stored image mapping. Every entry must
// name a non-empty instance id and must not already be present; each image
// is placed in STATE_INITIALIZING.
void Policy::init(
    const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping) {
  dout(20) << dendl;

  RWLock::WLocker map_lock(m_map_lock);
  for (auto& it : image_mapping) {
    ceph_assert(!it.second.instance_id.empty());
    auto map_result = m_map[it.second.instance_id].emplace(it.first);
    ceph_assert(map_result.second);

    auto image_state_result = m_image_states.emplace(
      it.first, ImageState{it.second.instance_id, it.second.mapped_time});
    ceph_assert(image_state_result.second);

    // ensure we (re)send image acquire actions to the instance
    auto& image_state = image_state_result.first->second;
    auto start_action = set_state(&image_state,
                                  StateTransition::STATE_INITIALIZING, false);
    ceph_assert(start_action);
  }
}

// Returns the instance id and mapped time recorded for global_image_id, or a
// default-constructed LookupInfo when the image is unknown.
LookupInfo Policy::lookup(const std::string &global_image_id) {
  dout(20) << "global_image_id=" << global_image_id << dendl;

  RWLock::RLocker map_lock(m_map_lock);
  LookupInfo info;

  auto it = m_image_states.find(global_image_id);
  if (it != m_image_states.end()) {
    info.instance_id = it->second.instance_id;
    info.mapped_time = it->second.mapped_time;
  }
  return info;
}

// Registers an image (creating its state if absent) and requests
// STATE_ASSOCIATING. Returns false without changing state when the image is
// still in STATE_INITIALIZING; otherwise returns set_state()'s result.
bool Policy::add_image(const std::string &global_image_id) {
  dout(5) << "global_image_id=" << global_image_id << dendl;

  RWLock::WLocker map_lock(m_map_lock);
  auto image_state_result = m_image_states.emplace(global_image_id,
                                                   ImageState{});
  auto& image_state = image_state_result.first->second;
  if (image_state.state == StateTransition::STATE_INITIALIZING) {
    // avoid duplicate acquire notifications upon leader startup
    return false;
  }

  return set_state(&image_state, StateTransition::STATE_ASSOCIATING, false);
}

// Requests STATE_DISSOCIATING for a known image. Returns false when the
// image is unknown; otherwise returns set_state()'s result.
bool Policy::remove_image(const std::string &global_image_id) {
  dout(5) << "global_image_id=" << global_image_id << dendl;

  RWLock::WLocker map_lock(m_map_lock);
  auto it = m_image_states.find(global_image_id);
  if (it == m_image_states.end()) {
    return false;
  }

  auto& image_state = it->second;
  return set_state(&image_state, StateTransition::STATE_DISSOCIATING, false);
}

// Adds the given instances to the map and shuffles images onto them. Images
// whose set_state() call returned true are added to *global_image_ids. On
// the first call only, instances present in the map but absent from
// instance_ids are treated as dead and removed.
void Policy::add_instances(const InstanceIds &instance_ids,
                           GlobalImageIds* global_image_ids) {
  dout(5) << "instance_ids=" << instance_ids << dendl;

  RWLock::WLocker map_lock(m_map_lock);
  for (auto& instance : instance_ids) {
    ceph_assert(!instance.empty());
    m_map.emplace(instance, std::set<std::string>{});
  }

  // post-failover, remove any dead instances and re-shuffle their images
  if (m_initial_update) {
    dout(5) << "initial instance update" << dendl;
    m_initial_update = false;

    std::set<std::string> alive_instances(instance_ids.begin(),
                                          instance_ids.end());
    InstanceIds dead_instances;
    for (auto& map_pair : m_map) {
      if (alive_instances.find(map_pair.first) == alive_instances.end()) {
        dead_instances.push_back(map_pair.first);
      }
    }

    if (!dead_instances.empty()) {
      remove_instances(m_map_lock, dead_instances, global_image_ids);
    }
  }

  GlobalImageIds shuffle_global_image_ids;
  do_shuffle_add_instances(m_map, m_image_states.size(), &shuffle_global_image_ids);
  dout(5) << "shuffling global_image_ids=[" << shuffle_global_image_ids
          << "]" << dendl;
  for (auto& global_image_id : shuffle_global_image_ids) {
    auto it = m_image_states.find(global_image_id);
    ceph_assert(it != m_image_states.end());

    auto& image_state = it->second;
    if (set_state(&image_state, StateTransition::STATE_SHUFFLING, false)) {
      global_image_ids->emplace(global_image_id);
    }
  }
}

// Public entry point: takes the map write lock and delegates to the
// lock-holding overload below.
void Policy::remove_instances(const InstanceIds &instance_ids,
                              GlobalImageIds* global_image_ids) {
  RWLock::WLocker map_lock(m_map_lock);
  remove_instances(m_map_lock, instance_ids, global_image_ids);
}

// Overload for callers that already hold m_map_lock for write (asserted).
// Instances with no images are erased from the map immediately; the others
// are recorded in m_dead_instances and their images are force-shuffled.
// NOTE(review): the `lock` parameter is not read in this body.
void Policy::remove_instances(const RWLock& lock,
                              const InstanceIds &instance_ids,
                              GlobalImageIds* global_image_ids) {
  ceph_assert(m_map_lock.is_wlocked());
  dout(5) << "instance_ids=" << instance_ids << dendl;

  for (auto& instance_id : instance_ids) {
    auto map_it = m_map.find(instance_id);
    if (map_it == m_map.end()) {
      continue;
    }

    auto& instance_global_image_ids = map_it->second;
    if (instance_global_image_ids.empty()) {
      m_map.erase(map_it);
      continue;
    }

    m_dead_instances.insert(instance_id);
    dout(5) << "force shuffling: instance_id=" << instance_id << ", "
            << "global_image_ids=[" << instance_global_image_ids << "]"<< dendl;
    for (auto& global_image_id : instance_global_image_ids) {
      auto it = m_image_states.find(global_image_id);
      ceph_assert(it != m_image_states.end());

      auto& image_state = it->second;
      if (is_state_scheduled(image_state,
                             StateTransition::STATE_DISSOCIATING)) {
        // don't shuffle images that no longer exist
        continue;
      }

      if (set_state(&image_state, StateTransition::STATE_SHUFFLING, true)) {
        global_image_ids->emplace(global_image_id);
      }
    }
  }
}

// Returns the action type of the image's current transition, first running
// (and then clearing) the transition's start policy action if one is set.
// The image must be known and must have a non-NONE action pending.
ActionType Policy::start_action(const std::string &global_image_id) {
  RWLock::WLocker map_lock(m_map_lock);

  auto it = m_image_states.find(global_image_id);
  ceph_assert(it != m_image_states.end());

  auto& image_state = it->second;
  auto& transition = image_state.transition;
  ceph_assert(transition.action_type != ACTION_TYPE_NONE);

  dout(5) << "global_image_id=" << global_image_id << ", "
          << "state=" << image_state.state << ", "
          << "action_type=" << transition.action_type << dendl;
  if (transition.start_policy_action) {
    execute_policy_action(global_image_id, &image_state,
                          *transition.start_policy_action);
    transition.start_policy_action = boost::none;
  }
  return transition.action_type;
}

// Records the result `r` of the image's current action and advances its
// state machine. Returns true when another action is pending for the image
// (including a retry of the failed one), false otherwise.
bool Policy::finish_action(const std::string &global_image_id, int r) {
  RWLock::WLocker map_lock(m_map_lock);

  auto it = m_image_states.find(global_image_id);
  ceph_assert(it != m_image_states.end());

  auto& image_state = it->second;
  auto& transition = image_state.transition;
  dout(5) << "global_image_id=" << global_image_id << ", "
          << "state=" << image_state.state << ", "
          << "action_type=" << transition.action_type << ", "
          << "r=" << r << dendl;

  // retry on failure unless it's an RPC message to an instance that is dead
  if (r < 0 &&
      (!is_instance_action(image_state.transition.action_type) ||
       image_state.instance_id == UNMAPPED_INSTANCE_ID ||
       m_dead_instances.find(image_state.instance_id) ==
         m_dead_instances.end())) {
    return true;
  }

  // copy the finish action before transit() rewrites the transition
  auto finish_policy_action = transition.finish_policy_action;
  StateTransition::transit(image_state.state, &image_state.transition);
  if (transition.finish_state) {
    // in-progress state machine complete
    ceph_assert(StateTransition::is_idle(*transition.finish_state));
    image_state.state = *transition.finish_state;
    image_state.transition = {};
  }

  if (StateTransition::is_idle(image_state.state) && image_state.next_state) {
    // advance to pending state machine
    bool start_action = set_state(&image_state, *image_state.next_state, false);
    ceph_assert(start_action);
  }

  // image state may get purged in execute_policy_action()
  bool pending_action = image_state.transition.action_type != ACTION_TYPE_NONE;
  if (finish_policy_action) {
    execute_policy_action(global_image_id, &image_state, *finish_policy_action);
  }

  return pending_action;
}

void Policy::execute_policy_action(
    const std::string& global_image_id, ImageState*
image_state, + StateTransition::PolicyAction policy_action) { + dout(5) << "global_image_id=" << global_image_id << ", " + << "policy_action=" << policy_action << dendl; + + switch (policy_action) { + case StateTransition::POLICY_ACTION_MAP: + map(global_image_id, image_state); + break; + case StateTransition::POLICY_ACTION_UNMAP: + unmap(global_image_id, image_state); + break; + case StateTransition::POLICY_ACTION_REMOVE: + if (image_state->state == StateTransition::STATE_UNASSOCIATED) { + ceph_assert(image_state->instance_id == UNMAPPED_INSTANCE_ID); + ceph_assert(!image_state->next_state); + m_image_states.erase(global_image_id); + } + break; + } +} + +void Policy::map(const std::string& global_image_id, ImageState* image_state) { + ceph_assert(m_map_lock.is_wlocked()); + + std::string instance_id = image_state->instance_id; + if (instance_id != UNMAPPED_INSTANCE_ID && !is_dead_instance(instance_id)) { + return; + } + if (is_dead_instance(instance_id)) { + unmap(global_image_id, image_state); + } + + instance_id = do_map(m_map, global_image_id); + ceph_assert(!instance_id.empty()); + dout(5) << "global_image_id=" << global_image_id << ", " + << "instance_id=" << instance_id << dendl; + + image_state->instance_id = instance_id; + image_state->mapped_time = ceph_clock_now(); + + auto ins = m_map[instance_id].emplace(global_image_id); + ceph_assert(ins.second); +} + +void Policy::unmap(const std::string &global_image_id, + ImageState* image_state) { + ceph_assert(m_map_lock.is_wlocked()); + + std::string instance_id = image_state->instance_id; + if (instance_id == UNMAPPED_INSTANCE_ID) { + return; + } + + dout(5) << "global_image_id=" << global_image_id << ", " + << "instance_id=" << instance_id << dendl; + + ceph_assert(!instance_id.empty()); + m_map[instance_id].erase(global_image_id); + image_state->instance_id = UNMAPPED_INSTANCE_ID; + image_state->mapped_time = {}; + + if (is_dead_instance(instance_id) && m_map[instance_id].empty()) { + dout(5) << "removing 
dead instance_id=" << instance_id << dendl; + m_map.erase(instance_id); + m_dead_instances.erase(instance_id); + } +} + +bool Policy::is_image_shuffling(const std::string &global_image_id) { + ceph_assert(m_map_lock.is_locked()); + + auto it = m_image_states.find(global_image_id); + ceph_assert(it != m_image_states.end()); + auto& image_state = it->second; + + // avoid attempting to re-shuffle a pending shuffle + auto result = is_state_scheduled(image_state, + StateTransition::STATE_SHUFFLING); + dout(20) << "global_image_id=" << global_image_id << ", " + << "result=" << result << dendl; + return result; +} + +bool Policy::can_shuffle_image(const std::string &global_image_id) { + ceph_assert(m_map_lock.is_locked()); + + CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct()); + int migration_throttle = cct->_conf.get_val<uint64_t>( + "rbd_mirror_image_policy_migration_throttle"); + + auto it = m_image_states.find(global_image_id); + ceph_assert(it != m_image_states.end()); + auto& image_state = it->second; + + utime_t last_shuffled_time = image_state.mapped_time; + + // idle images that haven't been recently remapped can shuffle + utime_t now = ceph_clock_now(); + auto result = (StateTransition::is_idle(image_state.state) && + ((migration_throttle <= 0) || + (now - last_shuffled_time >= migration_throttle))); + dout(10) << "global_image_id=" << global_image_id << ", " + << "migration_throttle=" << migration_throttle << ", " + << "last_shuffled_time=" << last_shuffled_time << ", " + << "result=" << result << dendl; + return result; +} + +bool Policy::set_state(ImageState* image_state, StateTransition::State state, + bool ignore_current_state) { + if (!ignore_current_state && image_state->state == state) { + return false; + } else if (StateTransition::is_idle(image_state->state)) { + image_state->state = state; + image_state->next_state = boost::none; + + StateTransition::transit(image_state->state, &image_state->transition); + 
ceph_assert(image_state->transition.action_type != ACTION_TYPE_NONE); + ceph_assert(!image_state->transition.finish_state); + return true; + } + + image_state->next_state = state; + return false; +} + +bool Policy::is_state_scheduled(const ImageState& image_state, + StateTransition::State state) const { + return (image_state.state == state || + (image_state.next_state && *image_state.next_state == state)); +} + +} // namespace image_map +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/image_map/Policy.h b/src/tools/rbd_mirror/image_map/Policy.h new file mode 100644 index 00000000..590fdbfe --- /dev/null +++ b/src/tools/rbd_mirror/image_map/Policy.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H +#define CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H + +#include <map> +#include <tuple> +#include <boost/optional.hpp> + +#include "common/RWLock.h" +#include "cls/rbd/cls_rbd_types.h" +#include "include/rados/librados.hpp" +#include "tools/rbd_mirror/image_map/StateTransition.h" +#include "tools/rbd_mirror/image_map/Types.h" + +class Context; + +namespace rbd { +namespace mirror { +namespace image_map { + +class Policy { +public: + Policy(librados::IoCtx &ioctx); + + virtual ~Policy() { + } + + // init -- called during initialization + void init( + const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping); + + // lookup an image from the map + LookupInfo lookup(const std::string &global_image_id); + + // add, remove + bool add_image(const std::string &global_image_id); + bool remove_image(const std::string &global_image_id); + + // shuffle images when instances are added/removed + void add_instances(const InstanceIds &instance_ids, + GlobalImageIds* global_image_ids); + void remove_instances(const InstanceIds &instance_ids, + GlobalImageIds* global_image_ids); + + ActionType start_action(const std::string &global_image_id); + 
bool finish_action(const std::string &global_image_id, int r); + +protected: + typedef std::map<std::string, std::set<std::string> > InstanceToImageMap; + + bool is_dead_instance(const std::string instance_id) { + ceph_assert(m_map_lock.is_locked()); + return m_dead_instances.find(instance_id) != m_dead_instances.end(); + } + + bool is_image_shuffling(const std::string &global_image_id); + bool can_shuffle_image(const std::string &global_image_id); + + // map an image (global image id) to an instance + virtual std::string do_map(const InstanceToImageMap& map, + const std::string &global_image_id) = 0; + + // shuffle images when instances are added/removed + virtual void do_shuffle_add_instances( + const InstanceToImageMap& map, size_t image_count, + std::set<std::string> *remap_global_image_ids) = 0; + +private: + struct ImageState { + std::string instance_id = UNMAPPED_INSTANCE_ID; + utime_t mapped_time; + + ImageState() {} + ImageState(const std::string& instance_id, const utime_t& mapped_time) + : instance_id(instance_id), mapped_time(mapped_time) { + } + + // active state and action + StateTransition::State state = StateTransition::STATE_UNASSOCIATED; + StateTransition::Transition transition; + + // next scheduled state + boost::optional<StateTransition::State> next_state = boost::none; + }; + + typedef std::map<std::string, ImageState> ImageStates; + + librados::IoCtx &m_ioctx; + + RWLock m_map_lock; // protects m_map + InstanceToImageMap m_map; // instance_id -> global_id map + + ImageStates m_image_states; + std::set<std::string> m_dead_instances; + + bool m_initial_update = true; + + void remove_instances(const RWLock& lock, const InstanceIds &instance_ids, + GlobalImageIds* global_image_ids); + + bool set_state(ImageState* image_state, StateTransition::State state, + bool ignore_current_state); + + void execute_policy_action(const std::string& global_image_id, + ImageState* image_state, + StateTransition::PolicyAction policy_action); + + void map(const 
std::string& global_image_id, ImageState* image_state); + void unmap(const std::string &global_image_id, ImageState* image_state); + + bool is_state_scheduled(const ImageState& image_state, + StateTransition::State state) const; + +}; + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H diff --git a/src/tools/rbd_mirror/image_map/SimplePolicy.cc b/src/tools/rbd_mirror/image_map/SimplePolicy.cc new file mode 100644 index 00000000..f2680581 --- /dev/null +++ b/src/tools/rbd_mirror/image_map/SimplePolicy.cc @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "SimplePolicy.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_map::SimplePolicy: " << this \ + << " " << __func__ << ": " +namespace rbd { +namespace mirror { +namespace image_map { + +SimplePolicy::SimplePolicy(librados::IoCtx &ioctx) + : Policy(ioctx) { +} + +size_t SimplePolicy::calc_images_per_instance(const InstanceToImageMap& map, + size_t image_count) { + size_t nr_instances = 0; + for (auto const &it : map) { + if (!Policy::is_dead_instance(it.first)) { + ++nr_instances; + } + } + ceph_assert(nr_instances > 0); + + size_t images_per_instance = image_count / nr_instances; + if (images_per_instance == 0) { + ++images_per_instance; + } + + return images_per_instance; +} + +void SimplePolicy::do_shuffle_add_instances( + const InstanceToImageMap& map, size_t image_count, + std::set<std::string> *remap_global_image_ids) { + uint64_t images_per_instance = calc_images_per_instance(map, image_count); + dout(5) << "images per instance=" << images_per_instance << dendl; + + for (auto const &instance : map) { + if (instance.second.size() <= images_per_instance) { + continue; + } + + auto it = 
instance.second.begin(); + uint64_t cut_off = instance.second.size() - images_per_instance; + + while (it != instance.second.end() && cut_off > 0) { + if (Policy::is_image_shuffling(*it)) { + --cut_off; + } else if (Policy::can_shuffle_image(*it)) { + --cut_off; + remap_global_image_ids->emplace(*it); + } + + ++it; + } + } +} + +std::string SimplePolicy::do_map(const InstanceToImageMap& map, + const std::string &global_image_id) { + auto min_it = map.end(); + for (auto it = map.begin(); it != map.end(); ++it) { + ceph_assert(it->second.find(global_image_id) == it->second.end()); + if (Policy::is_dead_instance(it->first)) { + continue; + } else if (min_it == map.end()) { + min_it = it; + } else if (it->second.size() < min_it->second.size()) { + min_it = it; + } + } + + ceph_assert(min_it != map.end()); + dout(20) << "global_image_id=" << global_image_id << " maps to instance_id=" + << min_it->first << dendl; + return min_it->first; +} + +} // namespace image_map +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/image_map/SimplePolicy.h b/src/tools/rbd_mirror/image_map/SimplePolicy.h new file mode 100644 index 00000000..ad2071b2 --- /dev/null +++ b/src/tools/rbd_mirror/image_map/SimplePolicy.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H +#define CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H + +#include "Policy.h" + +namespace rbd { +namespace mirror { +namespace image_map { + +class SimplePolicy : public Policy { +public: + static SimplePolicy *create(librados::IoCtx &ioctx) { + return new SimplePolicy(ioctx); + } + +protected: + SimplePolicy(librados::IoCtx &ioctx); + + std::string do_map(const InstanceToImageMap& map, + const std::string &global_image_id) override; + + void do_shuffle_add_instances( + const InstanceToImageMap& map, size_t image_count, + std::set<std::string> *remap_global_image_ids) override; + 
+private: + size_t calc_images_per_instance(const InstanceToImageMap& map, + size_t image_count); + +}; + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H diff --git a/src/tools/rbd_mirror/image_map/StateTransition.cc b/src/tools/rbd_mirror/image_map/StateTransition.cc new file mode 100644 index 00000000..ec5f07ff --- /dev/null +++ b/src/tools/rbd_mirror/image_map/StateTransition.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <ostream> +#include "include/ceph_assert.h" +#include "StateTransition.h" + +namespace rbd { +namespace mirror { +namespace image_map { + +std::ostream &operator<<(std::ostream &os, + const StateTransition::State &state) { + switch(state) { + case StateTransition::STATE_INITIALIZING: + os << "INITIALIZING"; + break; + case StateTransition::STATE_ASSOCIATING: + os << "ASSOCIATING"; + break; + case StateTransition::STATE_ASSOCIATED: + os << "ASSOCIATED"; + break; + case StateTransition::STATE_SHUFFLING: + os << "SHUFFLING"; + break; + case StateTransition::STATE_DISSOCIATING: + os << "DISSOCIATING"; + break; + case StateTransition::STATE_UNASSOCIATED: + os << "UNASSOCIATED"; + break; + } + return os; +} + +std::ostream &operator<<(std::ostream &os, + const StateTransition::PolicyAction &policy_action) { + switch(policy_action) { + case StateTransition::POLICY_ACTION_MAP: + os << "MAP"; + break; + case StateTransition::POLICY_ACTION_UNMAP: + os << "UNMAP"; + break; + case StateTransition::POLICY_ACTION_REMOVE: + os << "REMOVE"; + break; + } + return os; +} + +const StateTransition::TransitionTable StateTransition::s_transition_table { + // state current_action Transition + // --------------------------------------------------------------------------- + {{STATE_INITIALIZING, ACTION_TYPE_NONE}, {ACTION_TYPE_ACQUIRE, {}, {}, + {}}}, + {{STATE_INITIALIZING, ACTION_TYPE_ACQUIRE}, 
{ACTION_TYPE_NONE, {}, {}, + {STATE_ASSOCIATED}}}, + + {{STATE_ASSOCIATING, ACTION_TYPE_NONE}, {ACTION_TYPE_MAP_UPDATE, + {POLICY_ACTION_MAP}, {}, {}}}, + {{STATE_ASSOCIATING, ACTION_TYPE_MAP_UPDATE}, {ACTION_TYPE_ACQUIRE, {}, {}, + {}}}, + {{STATE_ASSOCIATING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {}, + {STATE_ASSOCIATED}}}, + + {{STATE_DISSOCIATING, ACTION_TYPE_NONE}, {ACTION_TYPE_RELEASE, {}, + {POLICY_ACTION_UNMAP}, {}}}, + {{STATE_DISSOCIATING, ACTION_TYPE_RELEASE}, {ACTION_TYPE_MAP_REMOVE, {}, + {POLICY_ACTION_REMOVE}, {}}}, + {{STATE_DISSOCIATING, ACTION_TYPE_MAP_REMOVE}, {ACTION_TYPE_NONE, {}, + {}, {STATE_UNASSOCIATED}}}, + + {{STATE_SHUFFLING, ACTION_TYPE_NONE}, {ACTION_TYPE_RELEASE, {}, + {POLICY_ACTION_UNMAP}, {}}}, + {{STATE_SHUFFLING, ACTION_TYPE_RELEASE}, {ACTION_TYPE_MAP_UPDATE, + {POLICY_ACTION_MAP}, {}, {}}}, + {{STATE_SHUFFLING, ACTION_TYPE_MAP_UPDATE}, {ACTION_TYPE_ACQUIRE, {}, {}, + {}}}, + {{STATE_SHUFFLING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {}, + {STATE_ASSOCIATED}}} +}; + +void StateTransition::transit(State state, Transition* transition) { + auto it = s_transition_table.find({state, transition->action_type}); + ceph_assert(it != s_transition_table.end()); + + *transition = it->second; +} + +} // namespace image_map +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/image_map/StateTransition.h b/src/tools/rbd_mirror/image_map/StateTransition.h new file mode 100644 index 00000000..02a5ce4e --- /dev/null +++ b/src/tools/rbd_mirror/image_map/StateTransition.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H +#define CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H + +#include "tools/rbd_mirror/image_map/Types.h" +#include <boost/optional.hpp> +#include <map> + +namespace rbd { +namespace mirror { +namespace image_map { + +class StateTransition { +public: + enum State { + 
STATE_UNASSOCIATED, + STATE_INITIALIZING, + STATE_ASSOCIATING, + STATE_ASSOCIATED, + STATE_SHUFFLING, + STATE_DISSOCIATING + }; + + enum PolicyAction { + POLICY_ACTION_MAP, + POLICY_ACTION_UNMAP, + POLICY_ACTION_REMOVE + }; + + struct Transition { + // image map action + ActionType action_type = ACTION_TYPE_NONE; + + // policy internal action + boost::optional<PolicyAction> start_policy_action; + boost::optional<PolicyAction> finish_policy_action; + + // state machine complete + boost::optional<State> finish_state; + + Transition() { + } + Transition(ActionType action_type, + const boost::optional<PolicyAction>& start_policy_action, + const boost::optional<PolicyAction>& finish_policy_action, + const boost::optional<State>& finish_state) + : action_type(action_type), start_policy_action(start_policy_action), + finish_policy_action(finish_policy_action), finish_state(finish_state) { + } + }; + + static bool is_idle(State state) { + return (state == STATE_UNASSOCIATED || state == STATE_ASSOCIATED); + } + + static void transit(State state, Transition* transition); + +private: + typedef std::pair<State, ActionType> TransitionKey; + typedef std::map<TransitionKey, Transition> TransitionTable; + + // image transition table + static const TransitionTable s_transition_table; +}; + +std::ostream &operator<<(std::ostream &os, const StateTransition::State &state); +std::ostream &operator<<(std::ostream &os, + const StateTransition::PolicyAction &policy_action); + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H diff --git a/src/tools/rbd_mirror/image_map/Types.cc b/src/tools/rbd_mirror/image_map/Types.cc new file mode 100644 index 00000000..47de9c3c --- /dev/null +++ b/src/tools/rbd_mirror/image_map/Types.cc @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Types.h" +#include "include/ceph_assert.h" +#include 
"include/stringify.h" +#include "common/Formatter.h" +#include <iostream> + +namespace rbd { +namespace mirror { +namespace image_map { + +const std::string UNMAPPED_INSTANCE_ID(""); + +namespace { + +template <typename E> +class GetTypeVisitor : public boost::static_visitor<E> { +public: + template <typename T> + inline E operator()(const T&) const { + return T::TYPE; + } +}; + +class EncodeVisitor : public boost::static_visitor<void> { +public: + explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) { + } + + template <typename T> + inline void operator()(const T& t) const { + using ceph::encode; + encode(static_cast<uint32_t>(T::TYPE), m_bl); + t.encode(m_bl); + } +private: + bufferlist &m_bl; +}; + +class DecodeVisitor : public boost::static_visitor<void> { +public: + DecodeVisitor(__u8 version, bufferlist::const_iterator &iter) + : m_version(version), m_iter(iter) { + } + + template <typename T> + inline void operator()(T& t) const { + t.decode(m_version, m_iter); + } +private: + __u8 m_version; + bufferlist::const_iterator &m_iter; +}; + +class DumpVisitor : public boost::static_visitor<void> { +public: + explicit DumpVisitor(Formatter *formatter, const std::string &key) + : m_formatter(formatter), m_key(key) {} + + template <typename T> + inline void operator()(const T& t) const { + auto type = T::TYPE; + m_formatter->dump_string(m_key.c_str(), stringify(type)); + t.dump(m_formatter); + } +private: + ceph::Formatter *m_formatter; + std::string m_key; +}; + +} // anonymous namespace + +PolicyMetaType PolicyData::get_policy_meta_type() const { + return boost::apply_visitor(GetTypeVisitor<PolicyMetaType>(), policy_meta); +} + +void PolicyData::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + boost::apply_visitor(EncodeVisitor(bl), policy_meta); + ENCODE_FINISH(bl); +} + +void PolicyData::decode(bufferlist::const_iterator& it) { + DECODE_START(1, it); + + uint32_t policy_meta_type; + decode(policy_meta_type, it); + + switch (policy_meta_type) { + case 
POLICY_META_TYPE_NONE: + policy_meta = PolicyMetaNone(); + break; + default: + policy_meta = PolicyMetaUnknown(); + break; + } + + boost::apply_visitor(DecodeVisitor(struct_v, it), policy_meta); + DECODE_FINISH(it); +} + +void PolicyData::dump(Formatter *f) const { + boost::apply_visitor(DumpVisitor(f, "policy_meta_type"), policy_meta); +} + +void PolicyData::generate_test_instances(std::list<PolicyData *> &o) { + o.push_back(new PolicyData(PolicyMetaNone())); +} + +std::ostream &operator<<(std::ostream &os, const ActionType& action_type) { + switch (action_type) { + case ACTION_TYPE_NONE: + os << "NONE"; + break; + case ACTION_TYPE_MAP_UPDATE: + os << "MAP_UPDATE"; + break; + case ACTION_TYPE_MAP_REMOVE: + os << "MAP_REMOVE"; + break; + case ACTION_TYPE_ACQUIRE: + os << "ACQUIRE"; + break; + case ACTION_TYPE_RELEASE: + os << "RELEASE"; + break; + default: + os << "UNKNOWN (" << static_cast<uint32_t>(action_type) << ")"; + break; + } + return os; +} + +} // namespace image_map +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/image_map/Types.h b/src/tools/rbd_mirror/image_map/Types.h new file mode 100644 index 00000000..5a97430f --- /dev/null +++ b/src/tools/rbd_mirror/image_map/Types.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H +#define CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H + +#include <iosfwd> +#include <map> +#include <set> +#include <string> +#include <boost/variant.hpp> + +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/utime.h" +#include "tools/rbd_mirror/Types.h" + +struct Context; + +namespace ceph { +class Formatter; +} + +namespace rbd { +namespace mirror { +namespace image_map { + +extern const std::string UNMAPPED_INSTANCE_ID; + +struct Listener { + virtual ~Listener() { + } + + virtual void acquire_image(const std::string &global_image_id, + const std::string &instance_id, + 
Context* on_finish) = 0; + virtual void release_image(const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) = 0; + virtual void remove_image(const std::string &mirror_uuid, + const std::string &global_image_id, + const std::string &instance_id, + Context* on_finish) = 0; +}; + +struct LookupInfo { + std::string instance_id = UNMAPPED_INSTANCE_ID; + utime_t mapped_time; +}; + +enum ActionType { + ACTION_TYPE_NONE, + ACTION_TYPE_MAP_UPDATE, + ACTION_TYPE_MAP_REMOVE, + ACTION_TYPE_ACQUIRE, + ACTION_TYPE_RELEASE +}; + +typedef std::vector<std::string> InstanceIds; +typedef std::set<std::string> GlobalImageIds; +typedef std::map<std::string, ActionType> ImageActionTypes; + +enum PolicyMetaType { + POLICY_META_TYPE_NONE = 0, +}; + +struct PolicyMetaNone { + static const PolicyMetaType TYPE = POLICY_META_TYPE_NONE; + + PolicyMetaNone() { + } + + void encode(bufferlist& bl) const { + } + + void decode(__u8 version, bufferlist::const_iterator& it) { + } + + void dump(Formatter *f) const { + } +}; + +struct PolicyMetaUnknown { + static const PolicyMetaType TYPE = static_cast<PolicyMetaType>(-1); + + PolicyMetaUnknown() { + } + + void encode(bufferlist& bl) const { + ceph_abort(); + } + + void decode(__u8 version, bufferlist::const_iterator& it) { + } + + void dump(Formatter *f) const { + } +}; + +typedef boost::variant<PolicyMetaNone, + PolicyMetaUnknown> PolicyMeta; + +struct PolicyData { + PolicyData() + : policy_meta(PolicyMetaUnknown()) { + } + PolicyData(const PolicyMeta &policy_meta) + : policy_meta(policy_meta) { + } + + PolicyMeta policy_meta; + + PolicyMetaType get_policy_meta_type() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<PolicyData *> &o); +}; + +WRITE_CLASS_ENCODER(PolicyData); + +std::ostream &operator<<(std::ostream &os, const ActionType &action_type); + +} // namespace image_map +} // 
namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H diff --git a/src/tools/rbd_mirror/image_map/UpdateRequest.cc b/src/tools/rbd_mirror/image_map/UpdateRequest.cc new file mode 100644 index 00000000..799c5670 --- /dev/null +++ b/src/tools/rbd_mirror/image_map/UpdateRequest.cc @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "librbd/Utils.h" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" + +#include "UpdateRequest.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_map::UpdateRequest: " \ + << this << " " << __func__ + +namespace rbd { +namespace mirror { +namespace image_map { + +using librbd::util::create_rados_callback; + +static const uint32_t MAX_UPDATE = 256; + +template <typename I> +UpdateRequest<I>::UpdateRequest(librados::IoCtx &ioctx, + std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping, + std::set<std::string> &&remove_global_image_ids, Context *on_finish) + : m_ioctx(ioctx), + m_update_mapping(update_mapping), + m_remove_global_image_ids(remove_global_image_ids), + m_on_finish(on_finish) { +} + +template <typename I> +void UpdateRequest<I>::send() { + dout(20) << dendl; + + update_image_map(); +} + +template <typename I> +void UpdateRequest<I>::update_image_map() { + dout(20) << dendl; + + if (m_update_mapping.empty() && m_remove_global_image_ids.empty()) { + finish(0); + return; + } + + uint32_t nr_updates = 0; + librados::ObjectWriteOperation op; + + auto it1 = m_update_mapping.begin(); + while (it1 != m_update_mapping.end() && nr_updates++ < MAX_UPDATE) { + librbd::cls_client::mirror_image_map_update(&op, it1->first, it1->second); + it1 = m_update_mapping.erase(it1); + } + + auto it2 = m_remove_global_image_ids.begin(); + while (it2 != 
m_remove_global_image_ids.end() && nr_updates++ < MAX_UPDATE) { + librbd::cls_client::mirror_image_map_remove(&op, *it2); + it2 = m_remove_global_image_ids.erase(it2); + } + + librados::AioCompletion *aio_comp = create_rados_callback< + UpdateRequest, &UpdateRequest::handle_update_image_map>(this); + int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void UpdateRequest<I>::handle_update_image_map(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to update image map: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + update_image_map(); +} + +template <typename I> +void UpdateRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_map::UpdateRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_map/UpdateRequest.h b/src/tools/rbd_mirror/image_map/UpdateRequest.h new file mode 100644 index 00000000..841cc6f9 --- /dev/null +++ b/src/tools/rbd_mirror/image_map/UpdateRequest.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H +#define CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/rados/librados.hpp" + +class Context; + +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_map { + +template<typename ImageCtxT = librbd::ImageCtx> +class UpdateRequest { +public: + // accepts an image map for updation and a collection of + // global image ids to purge. 
+ static UpdateRequest *create(librados::IoCtx &ioctx, + std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping, + std::set<std::string> &&remove_global_image_ids, Context *on_finish) { + return new UpdateRequest(ioctx, std::move(update_mapping), std::move(remove_global_image_ids), + on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | . . . . . . . . + * v v . MAX_UPDATE + * UPDATE_IMAGE_MAP. . . . . . . + * | + * v + * <finish> + * + * @endverbatim + */ + UpdateRequest(librados::IoCtx &ioctx, + std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping, + std::set<std::string> &&remove_global_image_ids, Context *on_finish); + + librados::IoCtx &m_ioctx; + std::map<std::string, cls::rbd::MirrorImageMap> m_update_mapping; + std::set<std::string> m_remove_global_image_ids; + Context *m_on_finish; + + void update_image_map(); + void handle_update_image_map(int r); + + void finish(int r); +}; + +} // namespace image_map +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc new file mode 100644 index 00000000..7ce21b4b --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc @@ -0,0 +1,785 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "BootstrapRequest.h" +#include "CloseImageRequest.h" +#include "CreateImageRequest.h" +#include "IsPrimaryRequest.h" +#include "OpenImageRequest.h" +#include "OpenLocalImageRequest.h" +#include "common/debug.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/rbd/cls_rbd_client.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" 
+#include "librbd/journal/Types.h" +#include "tools/rbd_mirror/ProgressContext.h" +#include "tools/rbd_mirror/ImageSync.h" +#include "tools/rbd_mirror/Threads.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::BootstrapRequest: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; +using librbd::util::unique_lock_name; + +template <typename I> +BootstrapRequest<I>::BootstrapRequest( + Threads<I>* threads, + librados::IoCtx &local_io_ctx, + librados::IoCtx &remote_io_ctx, + InstanceWatcher<I> *instance_watcher, + I **local_image_ctx, + const std::string &local_image_id, + const std::string &remote_image_id, + const std::string &global_image_id, + const std::string &local_mirror_uuid, + const std::string &remote_mirror_uuid, + Journaler *journaler, + cls::journal::ClientState *client_state, + MirrorPeerClientMeta *client_meta, + Context *on_finish, + bool *do_resync, + rbd::mirror::ProgressContext *progress_ctx) + : BaseRequest("rbd::mirror::image_replayer::BootstrapRequest", + reinterpret_cast<CephContext*>(local_io_ctx.cct()), on_finish), + m_threads(threads), m_local_io_ctx(local_io_ctx), + m_remote_io_ctx(remote_io_ctx), m_instance_watcher(instance_watcher), + m_local_image_ctx(local_image_ctx), m_local_image_id(local_image_id), + m_remote_image_id(remote_image_id), m_global_image_id(global_image_id), + m_local_mirror_uuid(local_mirror_uuid), + m_remote_mirror_uuid(remote_mirror_uuid), m_journaler(journaler), + m_client_state(client_state), m_client_meta(client_meta), + m_progress_ctx(progress_ctx), m_do_resync(do_resync), + m_lock(unique_lock_name("BootstrapRequest::m_lock", this)) { + dout(10) << dendl; +} + +template <typename I> +BootstrapRequest<I>::~BootstrapRequest() { + 
ceph_assert(m_remote_image_ctx == nullptr); +} + +template <typename I> +bool BootstrapRequest<I>::is_syncing() const { + Mutex::Locker locker(m_lock); + return (m_image_sync != nullptr); +} + +template <typename I> +void BootstrapRequest<I>::send() { + *m_do_resync = false; + + get_remote_tag_class(); +} + +template <typename I> +void BootstrapRequest<I>::cancel() { + dout(10) << dendl; + + Mutex::Locker locker(m_lock); + m_canceled = true; + + if (m_image_sync != nullptr) { + m_image_sync->cancel(); + } +} + +template <typename I> +void BootstrapRequest<I>::get_remote_tag_class() { + dout(15) << dendl; + + update_progress("GET_REMOTE_TAG_CLASS"); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tag_class>( + this); + m_journaler->get_client(librbd::Journal<>::IMAGE_CLIENT_ID, &m_client, ctx); +} + +template <typename I> +void BootstrapRequest<I>::handle_get_remote_tag_class(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to retrieve remote client: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + librbd::journal::ClientData client_data; + auto it = m_client.data.cbegin(); + try { + decode(client_data, it); + } catch (const buffer::error &err) { + derr << "failed to decode remote client meta data: " << err.what() + << dendl; + finish(-EBADMSG); + return; + } + + librbd::journal::ImageClientMeta *client_meta = + boost::get<librbd::journal::ImageClientMeta>(&client_data.client_meta); + if (client_meta == nullptr) { + derr << "unknown remote client registration" << dendl; + finish(-EINVAL); + return; + } + + m_remote_tag_class = client_meta->tag_class; + dout(10) << "remote tag class=" << m_remote_tag_class << dendl; + + open_remote_image(); +} + +template <typename I> +void BootstrapRequest<I>::open_remote_image() { + dout(15) << "remote_image_id=" << m_remote_image_id << dendl; + + update_progress("OPEN_REMOTE_IMAGE"); + + Context *ctx = create_context_callback< + 
BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_remote_image>( + this); + OpenImageRequest<I> *request = OpenImageRequest<I>::create( + m_remote_io_ctx, &m_remote_image_ctx, m_remote_image_id, false, + ctx); + request->send(); +} + +template <typename I> +void BootstrapRequest<I>::handle_open_remote_image(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to open remote image: " << cpp_strerror(r) << dendl; + ceph_assert(m_remote_image_ctx == nullptr); + finish(r); + return; + } + + is_primary(); +} + +template <typename I> +void BootstrapRequest<I>::is_primary() { + dout(15) << dendl; + + update_progress("OPEN_REMOTE_IMAGE"); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_is_primary>( + this); + IsPrimaryRequest<I> *request = IsPrimaryRequest<I>::create(m_remote_image_ctx, + &m_primary, ctx); + request->send(); +} + +template <typename I> +void BootstrapRequest<I>::handle_is_primary(int r) { + dout(15) << "r=" << r << dendl; + + if (r == -ENOENT) { + dout(5) << "remote image is not mirrored" << dendl; + m_ret_val = -EREMOTEIO; + close_remote_image(); + return; + } else if (r < 0) { + derr << "error querying remote image primary status: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + close_remote_image(); + return; + } + + if (!m_primary) { + if (m_local_image_id.empty()) { + // no local image and remote isn't primary -- don't sync it + dout(5) << "remote image is not primary -- not syncing" + << dendl; + m_ret_val = -EREMOTEIO; + close_remote_image(); + return; + } else if (m_client_meta->state != + librbd::journal::MIRROR_PEER_STATE_REPLAYING) { + // ensure we attempt to re-sync to remote if it's re-promoted + dout(5) << "remote image is not primary -- sync interrupted" + << dendl; + m_ret_val = -EREMOTEIO; + update_client_state(); + return; + } + } + + if (!m_client_meta->image_id.empty()) { + // have an image id -- use that to open the image since a deletion (resync) + // 
will leave the old image id registered in the peer + m_local_image_id = m_client_meta->image_id; + } + + if (m_local_image_id.empty()) { + // prepare to create local image + update_client_image(); + return; + } + + open_local_image(); +} + +template <typename I> +void BootstrapRequest<I>::update_client_state() { + dout(15) << dendl; + update_progress("UPDATE_CLIENT_STATE"); + + librbd::journal::MirrorPeerClientMeta client_meta(*m_client_meta); + client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + + librbd::journal::ClientData client_data(client_meta); + bufferlist data_bl; + encode(client_data, data_bl); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_update_client_state>( + this); + m_journaler->update_client(data_bl, ctx); +} + +template <typename I> +void BootstrapRequest<I>::handle_update_client_state(int r) { + dout(15) << "r=" << r << dendl; + if (r < 0) { + derr << "failed to update client: " << cpp_strerror(r) << dendl; + } else { + m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + } + + close_remote_image(); +} + +template <typename I> +void BootstrapRequest<I>::open_local_image() { + dout(15) << "local_image_id=" << m_local_image_id << dendl; + + update_progress("OPEN_LOCAL_IMAGE"); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_local_image>( + this); + OpenLocalImageRequest<I> *request = OpenLocalImageRequest<I>::create( + m_local_io_ctx, m_local_image_ctx, m_local_image_id, m_threads->work_queue, + ctx); + request->send(); +} + +template <typename I> +void BootstrapRequest<I>::handle_open_local_image(int r) { + dout(15) << "r=" << r << dendl; + + if (r == -ENOENT) { + ceph_assert(*m_local_image_ctx == nullptr); + dout(10) << "local image missing" << dendl; + unregister_client(); + return; + } else if (r == -EREMOTEIO) { + ceph_assert(*m_local_image_ctx == nullptr); + dout(10) << "local image is primary -- skipping 
image replay" << dendl; + m_ret_val = r; + close_remote_image(); + return; + } else if (r < 0) { + ceph_assert(*m_local_image_ctx == nullptr); + derr << "failed to open local image: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_remote_image(); + return; + } + + I *local_image_ctx = (*m_local_image_ctx); + { + local_image_ctx->snap_lock.get_read(); + if (local_image_ctx->journal == nullptr) { + local_image_ctx->snap_lock.put_read(); + + derr << "local image does not support journaling" << dendl; + m_ret_val = -EINVAL; + close_local_image(); + return; + } + + r = (*m_local_image_ctx)->journal->is_resync_requested(m_do_resync); + if (r < 0) { + local_image_ctx->snap_lock.put_read(); + + derr << "failed to check if a resync was requested" << dendl; + m_ret_val = r; + close_local_image(); + return; + } + + m_local_tag_tid = local_image_ctx->journal->get_tag_tid(); + m_local_tag_data = local_image_ctx->journal->get_tag_data(); + dout(10) << "local tag=" << m_local_tag_tid << ", " + << "local tag data=" << m_local_tag_data << dendl; + local_image_ctx->snap_lock.put_read(); + } + + if (m_local_tag_data.mirror_uuid != m_remote_mirror_uuid && !m_primary) { + // if the local mirror is not linked to the (now) non-primary image, + // stop the replay. Otherwise, we ignore that the remote is non-primary + // so that we can replay the demotion + dout(5) << "remote image is not primary -- skipping image replay" + << dendl; + m_ret_val = -EREMOTEIO; + close_local_image(); + return; + } + + if (*m_do_resync) { + close_remote_image(); + return; + } + + if (*m_client_state == cls::journal::CLIENT_STATE_DISCONNECTED) { + dout(10) << "client flagged disconnected -- skipping bootstrap" << dendl; + // The caller is expected to detect disconnect initializing remote journal. 
+ m_ret_val = 0; + close_remote_image(); + return; + } + + get_remote_tags(); +} + +template <typename I> +void BootstrapRequest<I>::unregister_client() { + dout(15) << dendl; + update_progress("UNREGISTER_CLIENT"); + + m_local_image_id = ""; + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_unregister_client>( + this); + m_journaler->unregister_client(ctx); +} + +template <typename I> +void BootstrapRequest<I>::handle_unregister_client(int r) { + dout(15) << "r=" << r << dendl; + if (r < 0) { + derr << "failed to unregister with remote journal: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + close_remote_image(); + return; + } + + *m_client_meta = librbd::journal::MirrorPeerClientMeta(""); + register_client(); +} + +template <typename I> +void BootstrapRequest<I>::register_client() { + dout(15) << dendl; + + update_progress("REGISTER_CLIENT"); + + ceph_assert(m_local_image_id.empty()); + librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta; + mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + + librbd::journal::ClientData client_data{mirror_peer_client_meta}; + bufferlist client_data_bl; + encode(client_data, client_data_bl); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_register_client>( + this); + m_journaler->register_client(client_data_bl, ctx); +} + +template <typename I> +void BootstrapRequest<I>::handle_register_client(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to register with remote journal: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + close_remote_image(); + return; + } + + *m_client_state = cls::journal::CLIENT_STATE_CONNECTED; + *m_client_meta = librbd::journal::MirrorPeerClientMeta(); + m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + + is_primary(); +} + +template <typename I> +void BootstrapRequest<I>::update_client_image() { + 
ceph_assert(m_local_image_id.empty()); + assert(m_local_image_id.empty()); + m_local_image_id = librbd::util::generate_image_id<I>(m_local_io_ctx); + + dout(15) << "local_image_id=" << m_local_image_id << dendl; + update_progress("UPDATE_CLIENT_IMAGE"); + + librbd::journal::MirrorPeerClientMeta client_meta{m_local_image_id}; + client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING; + + librbd::journal::ClientData client_data(client_meta); + bufferlist data_bl; + encode(client_data, data_bl); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_update_client_image>( + this); + m_journaler->update_client(data_bl, ctx); +} + +template <typename I> +void BootstrapRequest<I>::handle_update_client_image(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to update client: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_remote_image(); + return; + } + + if (m_canceled) { + dout(10) << "request canceled" << dendl; + m_ret_val = -ECANCELED; + close_remote_image(); + return; + } + + *m_client_meta = {m_local_image_id}; + m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_SYNCING; + create_local_image(); +} + +template <typename I> +void BootstrapRequest<I>::create_local_image() { + dout(15) << "local_image_id=" << m_local_image_id << dendl; + update_progress("CREATE_LOCAL_IMAGE"); + + m_remote_image_ctx->snap_lock.get_read(); + std::string image_name = m_remote_image_ctx->name; + m_remote_image_ctx->snap_lock.put_read(); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_create_local_image>( + this); + CreateImageRequest<I> *request = CreateImageRequest<I>::create( + m_threads, m_local_io_ctx, m_global_image_id, m_remote_mirror_uuid, + image_name, m_local_image_id, m_remote_image_ctx, ctx); + request->send(); +} + +template <typename I> +void BootstrapRequest<I>::handle_create_local_image(int r) { + dout(15) << "r=" << r << dendl; + + if (r 
== -EBADF) { + dout(5) << "image id " << m_local_image_id << " already in-use" << dendl; + m_local_image_id = ""; + update_client_image(); + return; + } else if (r < 0) { + if (r == -ENOENT) { + dout(10) << "parent image does not exist" << dendl; + } else { + derr << "failed to create local image: " << cpp_strerror(r) << dendl; + } + m_ret_val = r; + close_remote_image(); + return; + } + + open_local_image(); +} + +template <typename I> +void BootstrapRequest<I>::get_remote_tags() { + if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_SYNCING) { + // optimization -- no need to compare remote tags if we just created + // the image locally or sync was interrupted + image_sync(); + return; + } + + dout(15) << dendl; + update_progress("GET_REMOTE_TAGS"); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tags>(this); + m_journaler->get_tags(m_remote_tag_class, &m_remote_tags, ctx); +} + +template <typename I> +void BootstrapRequest<I>::handle_get_remote_tags(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to retrieve remote tags: " << cpp_strerror(r) << dendl; + m_ret_val = r; + close_local_image(); + return; + } + + if (m_canceled) { + dout(10) << "request canceled" << dendl; + m_ret_val = -ECANCELED; + close_local_image(); + return; + } + + // At this point, the local image was existing, non-primary, and replaying; + // and the remote image is primary. Attempt to link the local image's most + // recent tag to the remote image's tag chain. 
+ bool remote_tag_data_valid = false; + librbd::journal::TagData remote_tag_data; + boost::optional<uint64_t> remote_orphan_tag_tid = + boost::make_optional<uint64_t>(false, 0U); + bool reconnect_orphan = false; + + // decode the remote tags + for (auto &remote_tag : m_remote_tags) { + if (m_local_tag_data.predecessor.commit_valid && + m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid && + m_local_tag_data.predecessor.tag_tid > remote_tag.tid) { + dout(15) << "skipping processed predecessor remote tag " + << remote_tag.tid << dendl; + continue; + } + + try { + auto it = remote_tag.data.cbegin(); + decode(remote_tag_data, it); + remote_tag_data_valid = true; + } catch (const buffer::error &err) { + derr << "failed to decode remote tag " << remote_tag.tid << ": " + << err.what() << dendl; + m_ret_val = -EBADMSG; + close_local_image(); + return; + } + + dout(10) << "decoded remote tag " << remote_tag.tid << ": " + << remote_tag_data << dendl; + + if (!m_local_tag_data.predecessor.commit_valid) { + // newly synced local image (no predecessor) replays from the first tag + if (remote_tag_data.mirror_uuid != librbd::Journal<>::LOCAL_MIRROR_UUID) { + dout(15) << "skipping non-primary remote tag" << dendl; + continue; + } + + dout(10) << "using initial primary remote tag" << dendl; + break; + } + + if (m_local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) { + // demotion last available local epoch + + if (remote_tag_data.mirror_uuid == m_local_tag_data.mirror_uuid && + remote_tag_data.predecessor.commit_valid && + remote_tag_data.predecessor.tag_tid == + m_local_tag_data.predecessor.tag_tid) { + // demotion matches remote epoch + + if (remote_tag_data.predecessor.mirror_uuid == m_local_mirror_uuid && + m_local_tag_data.predecessor.mirror_uuid == + librbd::Journal<>::LOCAL_MIRROR_UUID) { + // local demoted and remote has matching event + dout(15) << "found matching local demotion tag" << dendl; + remote_orphan_tag_tid = remote_tag.tid; + 
continue; + } + + if (m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid && + remote_tag_data.predecessor.mirror_uuid == + librbd::Journal<>::LOCAL_MIRROR_UUID) { + // remote demoted and local has matching event + dout(15) << "found matching remote demotion tag" << dendl; + remote_orphan_tag_tid = remote_tag.tid; + continue; + } + } + + if (remote_tag_data.mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID && + remote_tag_data.predecessor.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID && + remote_tag_data.predecessor.commit_valid && remote_orphan_tag_tid && + remote_tag_data.predecessor.tag_tid == *remote_orphan_tag_tid) { + // remote promotion tag chained to remote/local demotion tag + dout(15) << "found chained remote promotion tag" << dendl; + reconnect_orphan = true; + break; + } + + // promotion must follow demotion + remote_orphan_tag_tid = boost::none; + } + } + + if (remote_tag_data_valid && + m_local_tag_data.mirror_uuid == m_remote_mirror_uuid) { + dout(10) << "local image is in clean replay state" << dendl; + } else if (reconnect_orphan) { + dout(10) << "remote image was demoted/promoted" << dendl; + } else { + derr << "split-brain detected -- skipping image replay" << dendl; + m_ret_val = -EEXIST; + close_local_image(); + return; + } + + image_sync(); +} + +template <typename I> +void BootstrapRequest<I>::image_sync() { + if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_REPLAYING) { + // clean replay state -- no image sync required + close_remote_image(); + return; + } + + { + Mutex::Locker locker(m_lock); + if (m_canceled) { + m_ret_val = -ECANCELED; + } else { + dout(15) << dendl; + ceph_assert(m_image_sync == nullptr); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_image_sync>(this); + m_image_sync = ImageSync<I>::create( + *m_local_image_ctx, m_remote_image_ctx, m_threads->timer, + &m_threads->timer_lock, m_local_mirror_uuid, m_journaler, + m_client_meta, 
m_threads->work_queue, m_instance_watcher, ctx, + m_progress_ctx); + + m_image_sync->get(); + + m_lock.Unlock(); + update_progress("IMAGE_SYNC"); + m_lock.Lock(); + + m_image_sync->send(); + return; + } + } + + dout(10) << "request canceled" << dendl; + close_remote_image(); +} + +template <typename I> +void BootstrapRequest<I>::handle_image_sync(int r) { + dout(15) << "r=" << r << dendl; + + { + Mutex::Locker locker(m_lock); + m_image_sync->put(); + m_image_sync = nullptr; + + if (m_canceled) { + dout(10) << "request canceled" << dendl; + m_ret_val = -ECANCELED; + } + + if (r < 0) { + derr << "failed to sync remote image: " << cpp_strerror(r) << dendl; + m_ret_val = r; + } + } + + close_remote_image(); +} + +template <typename I> +void BootstrapRequest<I>::close_local_image() { + dout(15) << dendl; + + update_progress("CLOSE_LOCAL_IMAGE"); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_close_local_image>( + this); + CloseImageRequest<I> *request = CloseImageRequest<I>::create( + m_local_image_ctx, ctx); + request->send(); +} + +template <typename I> +void BootstrapRequest<I>::handle_close_local_image(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "error encountered closing local image: " << cpp_strerror(r) + << dendl; + } + + close_remote_image(); +} + +template <typename I> +void BootstrapRequest<I>::close_remote_image() { + dout(15) << dendl; + + update_progress("CLOSE_REMOTE_IMAGE"); + + Context *ctx = create_context_callback< + BootstrapRequest<I>, &BootstrapRequest<I>::handle_close_remote_image>( + this); + CloseImageRequest<I> *request = CloseImageRequest<I>::create( + &m_remote_image_ctx, ctx); + request->send(); +} + +template <typename I> +void BootstrapRequest<I>::handle_close_remote_image(int r) { + dout(15) << "r=" << r << dendl; + + if (r < 0) { + derr << "error encountered closing remote image: " << cpp_strerror(r) + << dendl; + } + + finish(m_ret_val); +} + +template <typename 
I> +void BootstrapRequest<I>::update_progress(const std::string &description) { + dout(15) << description << dendl; + + if (m_progress_ctx) { + m_progress_ctx->update_progress(description); + } +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::BootstrapRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h new file mode 100644 index 00000000..ea9f8565 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/Mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include "tools/rbd_mirror/BaseRequest.h" +#include "tools/rbd_mirror/Types.h" +#include <list> +#include <string> + +class Context; +class ContextWQ; +class Mutex; +class SafeTimer; +namespace journal { class Journaler; } +namespace librbd { class ImageCtx; } +namespace librbd { namespace journal { struct MirrorPeerClientMeta; } } + +namespace rbd { +namespace mirror { + +class ProgressContext; + +template <typename> class ImageSync; +template <typename> class InstanceWatcher; +template <typename> struct Threads; + +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class BootstrapRequest : public BaseRequest { +public: + typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta; + typedef rbd::mirror::ProgressContext ProgressContext; + + static BootstrapRequest* create( + 
Threads<ImageCtxT>* threads, + librados::IoCtx &local_io_ctx, + librados::IoCtx &remote_io_ctx, + InstanceWatcher<ImageCtxT> *instance_watcher, + ImageCtxT **local_image_ctx, + const std::string &local_image_id, + const std::string &remote_image_id, + const std::string &global_image_id, + const std::string &local_mirror_uuid, + const std::string &remote_mirror_uuid, + Journaler *journaler, + cls::journal::ClientState *client_state, + MirrorPeerClientMeta *client_meta, + Context *on_finish, + bool *do_resync, + ProgressContext *progress_ctx = nullptr) { + return new BootstrapRequest(threads, local_io_ctx, remote_io_ctx, + instance_watcher, local_image_ctx, + local_image_id, remote_image_id, + global_image_id, local_mirror_uuid, + remote_mirror_uuid, journaler, client_state, + client_meta, on_finish, do_resync, + progress_ctx); + } + + BootstrapRequest(Threads<ImageCtxT>* threads, + librados::IoCtx &local_io_ctx, + librados::IoCtx &remote_io_ctx, + InstanceWatcher<ImageCtxT> *instance_watcher, + ImageCtxT **local_image_ctx, + const std::string &local_image_id, + const std::string &remote_image_id, + const std::string &global_image_id, + const std::string &local_mirror_uuid, + const std::string &remote_mirror_uuid, Journaler *journaler, + cls::journal::ClientState *client_state, + MirrorPeerClientMeta *client_meta, Context *on_finish, + bool *do_resync, ProgressContext *progress_ctx = nullptr); + ~BootstrapRequest() override; + + bool is_syncing() const; + + void send() override; + void cancel() override; + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_REMOTE_TAG_CLASS * * * * * * * * * * * * * * * * * * + * | * (error) + * v * + * OPEN_REMOTE_IMAGE * * * * * * * * * * * * * * * * * * * + * | * + * |/--------------------------------------------------*---\ + * v * | + * IS_PRIMARY * * * * * * * * * * * * * * * * * * * * * * | + * | * * | + * | (remote image primary, no local image id) * * | + * \----> UPDATE_CLIENT_IMAGE * * * * * * * * * * * * | + 
* | | ^ * * | + * | | * (duplicate image id) * * | + * | v * * * | + * \----> CREATE_LOCAL_IMAGE * * * * * * * * * * * * * | + * | | * * | + * | v * * | + * | (remote image primary) * * | + * \----> OPEN_LOCAL_IMAGE * * * * * * * * * * * * * * | + * | | . * * | + * | | . (image doesn't exist) * * | + * | | . . > UNREGISTER_CLIENT * * * * * * * | + * | | | * * | + * | | v * * | + * | | REGISTER_CLIENT * * * * * * * * | + * | | | * * | + * | | \-----------------------*---*---/ + * | | * * + * | v (skip if not needed) * * + * | GET_REMOTE_TAGS * * * * * * * * * + * | | * * * + * | v (skip if not needed) v * * + * | IMAGE_SYNC * * * > CLOSE_LOCAL_IMAGE * * + * | | | * * + * | \-----------------\ /-----/ * * + * | | * * + * | | * * + * | (skip if not needed) | * * + * \----> UPDATE_CLIENT_STATE *|* * * * * * * * * * * + * | | * * + * /-----------/----------------/ * * + * | * * + * v * * + * CLOSE_REMOTE_IMAGE < * * * * * * * * * * * * * * * * * + * | * + * v * + * <finish> < * * * * * * * * * * * * * * * * * * * * * * * + * + * @endverbatim + */ + typedef std::list<cls::journal::Tag> Tags; + + Threads<ImageCtxT>* m_threads; + librados::IoCtx &m_local_io_ctx; + librados::IoCtx &m_remote_io_ctx; + InstanceWatcher<ImageCtxT> *m_instance_watcher; + ImageCtxT **m_local_image_ctx; + std::string m_local_image_id; + std::string m_remote_image_id; + std::string m_global_image_id; + std::string m_local_mirror_uuid; + std::string m_remote_mirror_uuid; + Journaler *m_journaler; + cls::journal::ClientState *m_client_state; + MirrorPeerClientMeta *m_client_meta; + ProgressContext *m_progress_ctx; + bool *m_do_resync; + + mutable Mutex m_lock; + bool m_canceled = false; + + Tags m_remote_tags; + cls::journal::Client m_client; + uint64_t m_remote_tag_class = 0; + ImageCtxT *m_remote_image_ctx = nullptr; + bool m_primary = false; + int m_ret_val = 0; + ImageSync<ImageCtxT> *m_image_sync = nullptr; + + uint64_t m_local_tag_tid = 0; + librbd::journal::TagData m_local_tag_data; + + 
bufferlist m_out_bl; + + void get_remote_tag_class(); + void handle_get_remote_tag_class(int r); + + void open_remote_image(); + void handle_open_remote_image(int r); + + void is_primary(); + void handle_is_primary(int r); + + void update_client_state(); + void handle_update_client_state(int r); + + void open_local_image(); + void handle_open_local_image(int r); + + void unregister_client(); + void handle_unregister_client(int r); + + void register_client(); + void handle_register_client(int r); + + void create_local_image(); + void handle_create_local_image(int r); + + void update_client_image(); + void handle_update_client_image(int r); + + void get_remote_tags(); + void handle_get_remote_tags(int r); + + void image_sync(); + void handle_image_sync(int r); + + void close_local_image(); + void handle_close_local_image(int r); + + void close_remote_image(); + void handle_close_remote_image(int r); + + void update_progress(const std::string &description); +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::BootstrapRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc new file mode 100644 index 00000000..5b754823 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "CloseImageRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::CloseImageRequest: " \ + << this << " " << __func__ + 
+namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; + +template <typename I> +CloseImageRequest<I>::CloseImageRequest(I **image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { +} + +template <typename I> +void CloseImageRequest<I>::send() { + close_image(); +} + +template <typename I> +void CloseImageRequest<I>::close_image() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + CloseImageRequest<I>, &CloseImageRequest<I>::handle_close_image>(this); + (*m_image_ctx)->state->close(ctx); +} + +template <typename I> +void CloseImageRequest<I>::handle_close_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": error encountered while closing image: " << cpp_strerror(r) + << dendl; + } + + delete *m_image_ctx; + *m_image_ctx = nullptr; + + m_on_finish->complete(0); + delete this; +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::CloseImageRequest<librbd::ImageCtx>; + diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h new file mode 100644 index 00000000..02481369 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/ImageCtx.h" +#include <string> + +class Context; +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class CloseImageRequest { +public: + static CloseImageRequest* create(ImageCtxT **image_ctx, Context *on_finish) { + return new CloseImageRequest(image_ctx, on_finish); + } + 
+ CloseImageRequest(ImageCtxT **image_ctx, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * CLOSE_IMAGE + * | + * v + * <finish> + * + * @endverbatim + */ + ImageCtxT **m_image_ctx; + Context *m_on_finish; + + void close_image(); + void handle_close_image(int r); +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::CloseImageRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc new file mode 100644 index 00000000..8d8236b2 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc @@ -0,0 +1,506 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "CreateImageRequest.h" +#include "CloseImageRequest.h" +#include "OpenImageRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/rbd/cls_rbd_client.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Utils.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/journal/Types.h" +#include "tools/rbd_mirror/Threads.h" +#include "tools/rbd_mirror/image_replayer/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::CreateImageRequest: " \ + << this << " " << __func__ << ": " + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename I> 
+CreateImageRequest<I>::CreateImageRequest(Threads<I>* threads,
+                                          librados::IoCtx &local_io_ctx,
+                                          const std::string &global_image_id,
+                                          const std::string &remote_mirror_uuid,
+                                          const std::string &local_image_name,
+                                          const std::string &local_image_id,
+                                          I *remote_image_ctx,
+                                          Context *on_finish)
+  : m_threads(threads), m_local_io_ctx(local_io_ctx),
+    m_global_image_id(global_image_id),
+    m_remote_mirror_uuid(remote_mirror_uuid),
+    m_local_image_name(local_image_name), m_local_image_id(local_image_id),
+    m_remote_image_ctx(remote_image_ctx), m_on_finish(on_finish) {
+  // only captures the arguments; no work is started until send()
+}
+
+// Entry point of the request.
+//
+// Validates the remote image's parent linkage first.  A remote parent
+// spec with pool_id == -1 selects the plain create path; anything else
+// starts the clone path by looking up the local parent pool's mirror
+// uuid.  A validation failure is reported through error(r).
+template <typename I>
+void CreateImageRequest<I>::send() {
+  int r = validate_parent();
+  if (r < 0) {
+    error(r);
+    return;
+  }
+
+  if (m_remote_parent_spec.pool_id == -1) {
+    create_image();
+  } else {
+    get_local_parent_mirror_uuid();
+  }
+}
+
+// Non-clone path: issue a librbd CreateRequest for the local image using
+// the remote image's size and the options gathered by
+// populate_image_options().  Completion is delivered to
+// handle_create_image().
+template <typename I>
+void CreateImageRequest<I>::create_image() {
+  dout(10) << dendl;
+
+  using klass = CreateImageRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_create_image>(this);
+
+  // taken as a reader before the remote image's size is sampled below
+  RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
+
+  // configuration comes from the CephContext behind the local io ctx
+  auto& config{
+    reinterpret_cast<CephContext*>(m_local_io_ctx.cct())->_conf};
+
+  librbd::ImageOptions image_options;
+  populate_image_options(&image_options);
+
+  auto req = librbd::image::CreateRequest<I>::create(
+    config, m_local_io_ctx, m_local_image_name, m_local_image_id,
+    m_remote_image_ctx->size, image_options, m_global_image_id,
+    m_remote_mirror_uuid, false, m_remote_image_ctx->op_work_queue, ctx);
+  req->send();
+}
+
+// Completion of create_image().
+//
+// -EBADF is logged at a low level as "image id already in-use"; every
+// other negative result is logged as an error.  In all cases the result
+// is handed to finish(), which completes the request.
+template <typename I>
+void CreateImageRequest<I>::handle_create_image(int r) {
+  dout(10) << "r=" << r << dendl;
+  if (r == -EBADF) {
+    dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+template <typename I>
+void 
CreateImageRequest<I>::get_local_parent_mirror_uuid() {
+  dout(10) << dendl;
+
+  // clone path, step 1: asynchronously read the mirror uuid stored in the
+  // RBD_MIRRORING object of the local parent pool
+  librados::ObjectReadOperation op;
+  librbd::cls_client::mirror_uuid_get_start(&op);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    CreateImageRequest<I>,
+    &CreateImageRequest<I>::handle_get_local_parent_mirror_uuid>(this);
+  // m_out_bl is shared by every read step of this request
+  m_out_bl.clear();
+  int r = m_local_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+                                            &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of get_local_parent_mirror_uuid().
+//
+// Decodes the uuid from m_out_bl.  A successful read that yields an empty
+// uuid is converted to -ENOENT.  Any error finishes the request with that
+// error; otherwise the remote parent's journal client is looked up next.
+template <typename I>
+void CreateImageRequest<I>::handle_get_local_parent_mirror_uuid(int r) {
+  if (r >= 0) {
+    auto it = m_out_bl.cbegin();
+    r = librbd::cls_client::mirror_uuid_get_finish(
+      &it, &m_local_parent_mirror_uuid);
+    if (r >= 0 && m_local_parent_mirror_uuid.empty()) {
+      r = -ENOENT;
+    }
+  }
+
+  // logged after decoding so that r reflects the decode result as well
+  dout(10) << "r=" << r << dendl;
+  if (r < 0) {
+    if (r == -ENOENT) {
+      dout(5) << "local parent mirror uuid missing" << dendl;
+    } else {
+      derr << "failed to retrieve local parent mirror uuid: " << cpp_strerror(r)
+           << dendl;
+    }
+    finish(r);
+    return;
+  }
+
+  dout(15) << "local_parent_mirror_uuid=" << m_local_parent_mirror_uuid
+           << dendl;
+  get_remote_parent_client_state();
+}
+
+// Clone path, step 2: allocate a Journaler for the remote parent image and
+// ask it for the client record keyed by the local parent pool's mirror
+// uuid.  The record is stored in m_client and the journaler is released by
+// handle_get_remote_parent_client_state().
+template <typename I>
+void CreateImageRequest<I>::get_remote_parent_client_state() {
+  dout(10) << dendl;
+
+  m_remote_journaler = new Journaler(m_threads->work_queue, m_threads->timer,
+                                     &m_threads->timer_lock,
+                                     m_remote_parent_io_ctx,
+                                     m_remote_parent_spec.image_id,
+                                     m_local_parent_mirror_uuid, {});
+
+  // NOTE(review): the async wrapper presumably re-queues the completion on
+  // the work queue so the journaler is not deleted from within its own
+  // callback -- confirm against create_async_context_callback
+  Context *ctx = create_async_context_callback(
+    m_threads->work_queue, create_context_callback<
+      CreateImageRequest<I>,
+      &CreateImageRequest<I>::handle_get_remote_parent_client_state>(this));
+  m_remote_journaler->get_client(m_local_parent_mirror_uuid, &m_client, ctx);
+}
+
+// Completion of get_remote_parent_client_state(): the journaler is only
+// needed for the lookup, so it is destroyed before the result is examined.
+template <typename I>
+void CreateImageRequest<I>::handle_get_remote_parent_client_state(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  delete m_remote_journaler;
+  m_remote_journaler = nullptr;
+
+  
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta; + if (r == -ENOENT) { + dout(15) << "client not registered to parent image" << dendl; + finish(r); + return; + } else if (r < 0) { + derr << "failed to retrieve parent client: " << cpp_strerror(r) << dendl; + finish(r); + return; + } else if (!util::decode_client_meta(m_client, &mirror_peer_client_meta)) { + // require operator intervention since the data is corrupt + derr << "failed to decode parent client: " << cpp_strerror(r) << dendl; + finish(-EBADMSG); + return; + } else if (mirror_peer_client_meta.state != + librbd::journal::MIRROR_PEER_STATE_REPLAYING) { + // avoid possible race w/ incomplete parent image since the parent snapshot + // might be deleted if the sync restarts + dout(15) << "parent image still syncing" << dendl; + finish(-ENOENT); + return; + } + + get_parent_global_image_id(); +} + + +template <typename I> +void CreateImageRequest<I>::get_parent_global_image_id() { + dout(10) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_get_start(&op, + m_remote_parent_spec.image_id); + + librados::AioCompletion *aio_comp = create_rados_callback< + CreateImageRequest<I>, + &CreateImageRequest<I>::handle_get_parent_global_image_id>(this); + m_out_bl.clear(); + int r = m_remote_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void CreateImageRequest<I>::handle_get_parent_global_image_id(int r) { + dout(10) << "r=" << r << dendl; + if (r == 0) { + cls::rbd::MirrorImage mirror_image; + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image); + if (r == 0) { + m_parent_global_image_id = mirror_image.global_image_id; + dout(15) << "parent_global_image_id=" << m_parent_global_image_id + << dendl; + } + } + + if (r == -ENOENT) { + dout(10) << "parent image " << m_remote_parent_spec.image_id + << " not mirrored" << dendl; + 
finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "failed to retrieve global image id for parent image "
+         << m_remote_parent_spec.image_id << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  get_local_parent_image_id();
+}
+
+// Clone path, step 4: asynchronously translate the parent's global image
+// id into the image id used in the local parent pool (read from that
+// pool's RBD_MIRRORING object).
+template <typename I>
+void CreateImageRequest<I>::get_local_parent_image_id() {
+  dout(10) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::mirror_image_get_image_id_start(
+    &op, m_parent_global_image_id);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    CreateImageRequest<I>,
+    &CreateImageRequest<I>::handle_get_local_parent_image_id>(this);
+  m_out_bl.clear();
+  int r = m_local_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+                                            &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of get_local_parent_image_id().
+//
+// On success the local parent image id is decoded straight into
+// m_local_parent_spec.image_id.  -ENOENT (parent not registered locally)
+// and any other error finish the request; otherwise the remote parent
+// image is opened next.
+template <typename I>
+void CreateImageRequest<I>::handle_get_local_parent_image_id(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = librbd::cls_client::mirror_image_get_image_id_finish(
+      &iter, &m_local_parent_spec.image_id);
+  }
+
+  if (r == -ENOENT) {
+    dout(10) << "parent image " << m_parent_global_image_id << " not "
+             << "registered locally" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "failed to retrieve local image id for parent image "
+         << m_parent_global_image_id << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  open_remote_parent_image();
+}
+
+// Clone path, step 5: open the remote parent image; the resulting image
+// context is stored in m_remote_parent_image_ctx.
+template <typename I>
+void CreateImageRequest<I>::open_remote_parent_image() {
+  dout(10) << dendl;
+
+  Context *ctx = create_context_callback<
+    CreateImageRequest<I>,
+    &CreateImageRequest<I>::handle_open_remote_parent_image>(this);
+  // NOTE(review): the boolean argument is presumably "read only" -- confirm
+  // against OpenImageRequest::create
+  OpenImageRequest<I> *request = OpenImageRequest<I>::create(
+    m_remote_parent_io_ctx, &m_remote_parent_image_ctx,
+    m_remote_parent_spec.image_id, true, ctx);
+  request->send();
+}
+
+// Completion of open_remote_parent_image().
+template <typename I>
+void CreateImageRequest<I>::handle_open_remote_parent_image(int r) {
+  dout(10) << "r=" << r << dendl;
+  if (r 
< 0) {
+    derr << "failed to open remote parent image " << m_parent_pool_name << "/"
+         << m_remote_parent_spec.image_id << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  clone_image();
+}
+
+// Clone path, step 6: clone the local parent image into the new local
+// image.
+//
+// The parent snapshot is identified by name: the remote parent's snap id is
+// looked up in the (now open) remote parent image.  If that snap id is not
+// present, snap_name is left empty and passed on unchanged.  Completion is
+// delivered to handle_clone_image().
+template <typename I>
+void CreateImageRequest<I>::clone_image() {
+  dout(10) << dendl;
+
+  std::string snap_name;
+  // NOTE(review): snap_namespace is captured but not handed to the clone
+  // request below
+  cls::rbd::SnapshotNamespace snap_namespace;
+  {
+    RWLock::RLocker remote_snap_locker(m_remote_parent_image_ctx->snap_lock);
+    auto it = m_remote_parent_image_ctx->snap_info.find(
+      m_remote_parent_spec.snap_id);
+    if (it != m_remote_parent_image_ctx->snap_info.end()) {
+      snap_name = it->second.name;
+      snap_namespace = it->second.snap_namespace;
+    }
+  }
+
+  librbd::ImageOptions opts;
+  populate_image_options(&opts);
+
+  auto& config{
+    reinterpret_cast<CephContext*>(m_local_io_ctx.cct())->_conf};
+
+  using klass = CreateImageRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_clone_image>(this);
+
+  librbd::image::CloneRequest<I> *req = librbd::image::CloneRequest<I>::create(
+    config, m_local_parent_io_ctx, m_local_parent_spec.image_id, snap_name,
+    CEPH_NOSNAP, m_local_io_ctx, m_local_image_name, m_local_image_id, opts,
+    m_global_image_id, m_remote_mirror_uuid, m_remote_image_ctx->op_work_queue,
+    ctx);
+  req->send();
+}
+
+// Completion of clone_image().
+//
+// The remote parent image opened by open_remote_parent_image() is still
+// open here, so every outcome -- success, -EBADF (image id already in use)
+// or any other error -- must go through close_remote_parent_image().  The
+// result is parked in m_ret_val, which the close handler passes to
+// finish(), so the caller still receives the original clone result.
+template <typename I>
+void CreateImageRequest<I>::handle_clone_image(int r) {
+  dout(10) << "r=" << r << dendl;
+  if (r == -EBADF) {
+    dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+    // do not finish directly: that would leave the remote parent image open
+    m_ret_val = r;
+  } else if (r < 0) {
+    derr << "failed to clone image " << m_parent_pool_name << "/"
+         << m_remote_parent_spec.image_id << " to "
+         << m_local_image_name << ": " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+  }
+
+  close_remote_parent_image();
+}
+
+// Clone path, final step: close the remote parent image again.
+template <typename I>
+void CreateImageRequest<I>::close_remote_parent_image() {
+  dout(10) << dendl;
+  Context *ctx = create_context_callback<
+    CreateImageRequest<I>,
+    &CreateImageRequest<I>::handle_close_remote_parent_image>(this);
+  CloseImageRequest<I> *request = 
CloseImageRequest<I>::create(
+    &m_remote_parent_image_ctx, ctx);
+  request->send();
+}
+
+// Completion of close_remote_parent_image().
+//
+// A close failure is only logged; the request finishes with m_ret_val,
+// i.e. the result recorded by the clone step.
+template <typename I>
+void CreateImageRequest<I>::handle_close_remote_parent_image(int r) {
+  dout(10) << "r=" << r << dendl;
+  if (r < 0) {
+    derr << "error encountered closing remote parent image: "
+         << cpp_strerror(r) << dendl;
+  }
+
+  finish(m_ret_val);
+}
+
+// Reports a failure by queueing finish(r) on the work queue instead of
+// calling it inline.
+template <typename I>
+void CreateImageRequest<I>::error(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  m_threads->work_queue->queue(create_context_callback<
+    CreateImageRequest<I>, &CreateImageRequest<I>::finish>(this), r);
+}
+
+// Completes the caller's context with r and destroys this request.  No
+// member may be touched after this returns.
+template <typename I>
+void CreateImageRequest<I>::finish(int r) {
+  dout(10) << "r=" << r << dendl;
+  m_on_finish->complete(r);
+  delete this;
+}
+
+// Determines the remote image's parent spec and opens io contexts for the
+// remote and local parent pools.
+//
+// Returns 0 if the image has no parent (m_remote_parent_spec.pool_id stays
+// -1) or once both io contexts are open, -EINVAL if the image head and its
+// snapshots reference different parents, or the error from opening either
+// pool.
+template <typename I>
+int CreateImageRequest<I>::validate_parent() {
+  RWLock::RLocker owner_locker(m_remote_image_ctx->owner_lock);
+  RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
+
+  // start from the parent recorded for the image itself
+  m_remote_parent_spec = m_remote_image_ctx->parent_md.spec;
+
+  // scan all remote snapshots for a linked parent
+  for (auto &snap_info_pair : m_remote_image_ctx->snap_info) {
+    auto &parent_spec = snap_info_pair.second.parent.spec;
+    if (parent_spec.pool_id == -1) {
+      // this snapshot has no parent
+      continue;
+    } else if (m_remote_parent_spec.pool_id == -1) {
+      // first parent seen: adopt it
+      m_remote_parent_spec = parent_spec;
+      continue;
+    }
+
+    if (m_remote_parent_spec != parent_spec) {
+      derr << "remote image parent spec mismatch" << dendl;
+      return -EINVAL;
+    }
+  }
+
+  if (m_remote_parent_spec.pool_id == -1) {
+    return 0;
+  }
+
+  // map remote parent pool to local parent pool
+  librados::Rados remote_rados(m_remote_image_ctx->md_ctx);
+  int r = remote_rados.ioctx_create2(m_remote_parent_spec.pool_id,
+                                     m_remote_parent_io_ctx);
+  if (r < 0) {
+    derr << "failed to open remote parent pool " << m_remote_parent_spec.pool_id
+         << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // the local parent pool is located by name, not by id
+  m_parent_pool_name = m_remote_parent_io_ctx.get_pool_name();
+
+  librados::Rados local_rados(m_local_io_ctx);
+  r = 
local_rados.ioctx_create(m_parent_pool_name.c_str(),
+                               m_local_parent_io_ctx);
+  if (r < 0) {
+    derr << "failed to open local parent pool " << m_parent_pool_name << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Fills image_options for the local image from the remote image.
+//
+// Features, order, stripe unit and stripe count are copied from the remote
+// image context.  A data pool is set only if one can be determined (see
+// below).  The clone format is set only when the remote image has a parent
+// (m_remote_parent_spec.pool_id != -1): 2 if the remote image reports the
+// RBD_OPERATION_FEATURE_CLONE_CHILD op feature, otherwise 1.
+template <typename I>
+void CreateImageRequest<I>::populate_image_options(
+    librbd::ImageOptions* image_options) {
+  image_options->set(RBD_IMAGE_OPTION_FEATURES,
+                     m_remote_image_ctx->features);
+  image_options->set(RBD_IMAGE_OPTION_ORDER, m_remote_image_ctx->order);
+  image_options->set(RBD_IMAGE_OPTION_STRIPE_UNIT,
+                     m_remote_image_ctx->stripe_unit);
+  image_options->set(RBD_IMAGE_OPTION_STRIPE_COUNT,
+                     m_remote_image_ctx->stripe_count);
+
+  // Determine the data pool for the local image as follows:
+  // 1. If the local pool has a default data pool, use it.
+  // 2. If the remote image has a data pool different from its metadata pool and
+  //    a pool with the same name exists locally, use it.
+  // 3. Don't set the data pool explicitly.
+  std::string data_pool;
+  librados::Rados local_rados(m_local_io_ctx);
+  // NOTE(review): the default is read from g_ceph_context, whereas
+  // create_image()/clone_image() take their config from the CephContext of
+  // m_local_io_ctx -- confirm these are meant to be the same context
+  auto default_data_pool = g_ceph_context->_conf.get_val<std::string>("rbd_default_data_pool");
+  auto remote_md_pool = m_remote_image_ctx->md_ctx.get_pool_name();
+  auto remote_data_pool = m_remote_image_ctx->data_ctx.get_pool_name();
+
+  if (default_data_pool != "") {
+    data_pool = default_data_pool;
+  } else if (remote_data_pool != remote_md_pool) {
+    // a non-negative lookup result means a pool of that name exists locally
+    if (local_rados.pool_lookup(remote_data_pool.c_str()) >= 0) {
+      data_pool = remote_data_pool;
+    }
+  }
+
+  if (data_pool != "") {
+    image_options->set(RBD_IMAGE_OPTION_DATA_POOL, data_pool);
+  }
+
+  if (m_remote_parent_spec.pool_id != -1) {
+    uint64_t clone_format = 1;
+    if (m_remote_image_ctx->test_op_features(
+          RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+      clone_format = 2;
+    }
+    image_options->set(RBD_IMAGE_OPTION_CLONE_FORMAT, clone_format);
+  }
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class 
rbd::mirror::image_replayer::CreateImageRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h new file mode 100644 index 00000000..0b20da52 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "cls/journal/cls_journal_types.h" +#include "librbd/Types.h" +#include "librbd/journal/TypeTraits.h" +#include <string> + +class Context; +class ContextWQ; +namespace journal { class Journaler; } +namespace librbd { class ImageCtx; } +namespace librbd { class ImageOptions; } + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class CreateImageRequest { +public: + static CreateImageRequest *create(Threads<ImageCtxT> *threads, + librados::IoCtx &local_io_ctx, + const std::string &global_image_id, + const std::string &remote_mirror_uuid, + const std::string &local_image_name, + const std::string &local_image_id, + ImageCtxT *remote_image_ctx, + Context *on_finish) { + return new CreateImageRequest(threads, local_io_ctx, global_image_id, + remote_mirror_uuid, local_image_name, + local_image_id, remote_image_ctx, on_finish); + } + + CreateImageRequest(Threads<ImageCtxT> *threads, librados::IoCtx &local_io_ctx, + const std::string &global_image_id, + const std::string &remote_mirror_uuid, + const std::string &local_image_name, + const std::string &local_image_id, + ImageCtxT *remote_image_ctx, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> * * * * * * * * * * * * * * * * 
* * * * * * * * * * * * + * | * + * | (non-clone) * + * |\------------> CREATE_IMAGE ---------------------\ * (error) + * | | * + * | (clone) | * + * \-------------> GET_LOCAL_PARENT_MIRROR_UUID * * | * * * * + * | | * * + * v | * + * GET_REMOTE_PARENT_CLIENT_STATE * | * * * * + * | | * * + * v | * + * GET_PARENT_GLOBAL_IMAGE_ID * * * | * * * * + * | | * * + * v | * + * GET_LOCAL_PARENT_IMAGE_ID * * * * | * * * * + * | | * * + * v | * + * OPEN_REMOTE_PARENT * * * * * * * | * * * * + * | | * * + * v | * + * CLONE_IMAGE | * + * | | * + * v | * + * CLOSE_REMOTE_PARENT | * + * | v * + * \------------------------> <finish> < * * + * @endverbatim + */ + + typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + + Threads<ImageCtxT> *m_threads; + librados::IoCtx &m_local_io_ctx; + std::string m_global_image_id; + std::string m_remote_mirror_uuid; + std::string m_local_image_name; + std::string m_local_image_id; + ImageCtxT *m_remote_image_ctx; + Context *m_on_finish; + + librados::IoCtx m_remote_parent_io_ctx; + std::string m_local_parent_mirror_uuid; + Journaler *m_remote_journaler = nullptr; + ImageCtxT *m_remote_parent_image_ctx = nullptr; + cls::rbd::ParentImageSpec m_remote_parent_spec; + + librados::IoCtx m_local_parent_io_ctx; + cls::rbd::ParentImageSpec m_local_parent_spec; + + bufferlist m_out_bl; + std::string m_parent_global_image_id; + std::string m_parent_pool_name; + cls::journal::Client m_client; + int m_ret_val = 0; + + void create_image(); + void handle_create_image(int r); + + void get_local_parent_mirror_uuid(); + void handle_get_local_parent_mirror_uuid(int r); + + void get_remote_parent_client_state(); + void handle_get_remote_parent_client_state(int r); + + void get_parent_global_image_id(); + void handle_get_parent_global_image_id(int r); + + void get_local_parent_image_id(); + void handle_get_local_parent_image_id(int r); + + void open_remote_parent_image(); + void 
handle_open_remote_parent_image(int r); + + void clone_image(); + void handle_clone_image(int r); + + void close_remote_parent_image(); + void handle_close_remote_parent_image(int r); + + void error(int r); + void finish(int r); + + int validate_parent(); + + void populate_image_options(librbd::ImageOptions* image_options); + +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::CreateImageRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc new file mode 100644 index 00000000..6314eb7d --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc @@ -0,0 +1,204 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "EventPreprocessor.h" +#include "common/debug.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/journal/Types.h" +#include <boost/variant.hpp> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror + +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::EventPreprocessor: " \ + << this << " " << __func__ + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; + +template <typename I> +EventPreprocessor<I>::EventPreprocessor(I &local_image_ctx, + Journaler &remote_journaler, + const std::string &local_mirror_uuid, + MirrorPeerClientMeta *client_meta, + ContextWQ *work_queue) + : m_local_image_ctx(local_image_ctx), m_remote_journaler(remote_journaler), + m_local_mirror_uuid(local_mirror_uuid), m_client_meta(client_meta), + m_work_queue(work_queue) { +} + 
+// A preprocess() call must not be outstanding when the object is destroyed.
+template <typename I>
+EventPreprocessor<I>::~EventPreprocessor() {
+  ceph_assert(!m_in_progress);
+}
+
+// Returns true if event_entry has to be run through preprocess(): either
+// the client's snap sequence map holds entries that prune_snap_map() would
+// drop, or the event is a snapshot rename.  Works on a copy of the map, so
+// the client meta is not modified.
+template <typename I>
+bool EventPreprocessor<I>::is_required(const EventEntry &event_entry) {
+  SnapSeqs snap_seqs(m_client_meta->snap_seqs);
+  return (prune_snap_map(&snap_seqs) ||
+          event_entry.get_event_type() ==
+            librbd::journal::EVENT_TYPE_SNAP_RENAME);
+}
+
+// Starts preprocessing of event_entry; on_finish is completed by finish().
+// Only one preprocess() may be in flight at a time.
+template <typename I>
+void EventPreprocessor<I>::preprocess(EventEntry *event_entry,
+                                      Context *on_finish) {
+  ceph_assert(!m_in_progress);
+  m_in_progress = true;
+  m_event_entry = event_entry;
+  m_on_finish = on_finish;
+
+  refresh_image();
+}
+
+// Step 1: refresh the local image before its snapshot tables are consulted.
+template <typename I>
+void EventPreprocessor<I>::refresh_image() {
+  dout(20) << dendl;
+
+  Context *ctx = create_context_callback<
+    EventPreprocessor<I>, &EventPreprocessor<I>::handle_refresh_image>(this);
+  m_local_image_ctx.state->refresh(ctx);
+}
+
+// Completion of refresh_image(): a failure finishes the preprocess with
+// that error, otherwise the event itself is processed.
+template <typename I>
+void EventPreprocessor<I>::handle_refresh_image(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    // this file's dout_prefix ends with __func__, so the message needs its
+    // own ": " separator
+    derr << ": error encountered during image refresh: " << cpp_strerror(r)
+         << dendl;
+    finish(r);
+    return;
+  }
+
+  preprocess_event();
+}
+
+// Step 2: take a pruned working copy of the client's snap sequence map and
+// dispatch on the event type.  Only snapshot rename events are rewritten;
+// all other event types are accepted unchanged by the visitor.
+template <typename I>
+void EventPreprocessor<I>::preprocess_event() {
+  dout(20) << dendl;
+
+  m_snap_seqs = m_client_meta->snap_seqs;
+  m_snap_seqs_updated = prune_snap_map(&m_snap_seqs);
+
+  int r = boost::apply_visitor(PreprocessEventVisitor(this),
+                               m_event_entry->event);
+  if (r < 0) {
+    finish(r);
+    return;
+  }
+
+  update_client();
+}
+
+// Rewrites the snap id of a snapshot rename event from the remote id to the
+// local id.  A mapping already present in m_snap_seqs is used as-is;
+// otherwise the local snapshot is looked up by the event's source name.
+template <typename I>
+int EventPreprocessor<I>::preprocess_snap_rename(
+    librbd::journal::SnapRenameEvent &event) {
+  dout(20) << ": "
+           << "remote_snap_id=" << event.snap_id << ", "
+           << "src_snap_name=" << event.src_snap_name << ", "
+           << "dest_snap_name=" << event.dst_snap_name << dendl;
+
+  auto snap_seq_it = m_snap_seqs.find(event.snap_id);
+  if (snap_seq_it != m_snap_seqs.end()) {
+    dout(20) << ": remapping remote snap id " << snap_seq_it->first << " "
+             << "to local snap id " << 
snap_seq_it->second << dendl; + event.snap_id = snap_seq_it->second; + return 0; + } + + auto snap_id_it = m_local_image_ctx.snap_ids.find({cls::rbd::UserSnapshotNamespace(), + event.src_snap_name}); + if (snap_id_it == m_local_image_ctx.snap_ids.end()) { + dout(20) << ": cannot map remote snapshot '" << event.src_snap_name << "' " + << "to local snapshot" << dendl; + event.snap_id = CEPH_NOSNAP; + return -ENOENT; + } + + dout(20) << ": mapping remote snap id " << event.snap_id << " " + << "to local snap id " << snap_id_it->second << dendl; + m_snap_seqs_updated = true; + m_snap_seqs[event.snap_id] = snap_id_it->second; + event.snap_id = snap_id_it->second; + return 0; +} + +template <typename I> +void EventPreprocessor<I>::update_client() { + if (!m_snap_seqs_updated) { + finish(0); + return; + } + + dout(20) << dendl; + librbd::journal::MirrorPeerClientMeta client_meta(*m_client_meta); + client_meta.snap_seqs = m_snap_seqs; + + librbd::journal::ClientData client_data(client_meta); + bufferlist data_bl; + encode(client_data, data_bl); + + Context *ctx = create_context_callback< + EventPreprocessor<I>, &EventPreprocessor<I>::handle_update_client>( + this); + m_remote_journaler.update_client(data_bl, ctx); +} + +template <typename I> +void EventPreprocessor<I>::handle_update_client(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << "failed to update mirror peer journal client: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_client_meta->snap_seqs = m_snap_seqs; + finish(0); +} + +template <typename I> +bool EventPreprocessor<I>::prune_snap_map(SnapSeqs *snap_seqs) { + bool pruned = false; + + RWLock::RLocker snap_locker(m_local_image_ctx.snap_lock); + for (auto it = snap_seqs->begin(); it != snap_seqs->end(); ) { + auto current_it(it++); + if (m_local_image_ctx.snap_info.count(current_it->second) == 0) { + snap_seqs->erase(current_it); + pruned = true; + } + } + return pruned; +} + +template <typename I> +void 
EventPreprocessor<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_finish = m_on_finish; + m_on_finish = nullptr; + m_event_entry = nullptr; + m_in_progress = false; + m_snap_seqs_updated = false; + m_work_queue->queue(on_finish, r); +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::EventPreprocessor<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/EventPreprocessor.h b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.h new file mode 100644 index 00000000..67aeea0b --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H +#define RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H + +#include "include/int_types.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include <map> +#include <string> +#include <boost/variant/static_visitor.hpp> + +struct Context; +struct ContextWQ; +namespace journal { class Journaler; } +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class EventPreprocessor { +public: + using Journaler = typename librbd::journal::TypeTraits<ImageCtxT>::Journaler; + using EventEntry = librbd::journal::EventEntry; + using MirrorPeerClientMeta = librbd::journal::MirrorPeerClientMeta; + + static EventPreprocessor *create(ImageCtxT &local_image_ctx, + Journaler &remote_journaler, + const std::string &local_mirror_uuid, + MirrorPeerClientMeta *client_meta, + ContextWQ *work_queue) { + return new EventPreprocessor(local_image_ctx, remote_journaler, + local_mirror_uuid, client_meta, work_queue); + } + + static void destroy(EventPreprocessor* processor) { + delete processor; + } + + EventPreprocessor(ImageCtxT 
&local_image_ctx, Journaler &remote_journaler, + const std::string &local_mirror_uuid, + MirrorPeerClientMeta *client_meta, ContextWQ *work_queue); + ~EventPreprocessor(); + + bool is_required(const EventEntry &event_entry); + void preprocess(EventEntry *event_entry, Context *on_finish); + +private: + /** + * @verbatim + * + * <start> + * | + * v (skip if not required) + * REFRESH_IMAGE + * | + * v (skip if not required) + * PREPROCESS_EVENT + * | + * v (skip if not required) + * UPDATE_CLIENT + * + * @endverbatim + */ + + typedef std::map<uint64_t, uint64_t> SnapSeqs; + + class PreprocessEventVisitor : public boost::static_visitor<int> { + public: + EventPreprocessor *event_preprocessor; + + PreprocessEventVisitor(EventPreprocessor *event_preprocessor) + : event_preprocessor(event_preprocessor) { + } + + template <typename T> + inline int operator()(T&) const { + return 0; + } + inline int operator()(librbd::journal::SnapRenameEvent &event) const { + return event_preprocessor->preprocess_snap_rename(event); + } + }; + + ImageCtxT &m_local_image_ctx; + Journaler &m_remote_journaler; + std::string m_local_mirror_uuid; + MirrorPeerClientMeta *m_client_meta; + ContextWQ *m_work_queue; + + bool m_in_progress = false; + EventEntry *m_event_entry = nullptr; + Context *m_on_finish = nullptr; + + SnapSeqs m_snap_seqs; + bool m_snap_seqs_updated = false; + + bool prune_snap_map(SnapSeqs *snap_seqs); + + void refresh_image(); + void handle_refresh_image(int r); + + void preprocess_event(); + int preprocess_snap_rename(librbd::journal::SnapRenameEvent &event); + + void update_client(); + void handle_update_client(int r); + + void finish(int r); + +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::EventPreprocessor<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H diff --git a/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc 
b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc new file mode 100644 index 00000000..74e97537 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_client.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \ + << "GetMirrorImageIdRequest: " << this << " " \ + << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_rados_callback; + +template <typename I> +void GetMirrorImageIdRequest<I>::send() { + dout(20) << dendl; + get_image_id(); +} + +template <typename I> +void GetMirrorImageIdRequest<I>::get_image_id() { + dout(20) << dendl; + + // attempt to cross-reference a image id by the global image id + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id); + + librados::AioCompletion *aio_comp = create_rados_callback< + GetMirrorImageIdRequest<I>, + &GetMirrorImageIdRequest<I>::handle_get_image_id>( + this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void GetMirrorImageIdRequest<I>::handle_get_image_id(int r) { + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_get_image_id_finish( + &iter, m_image_id); + } + + dout(20) << "r=" << r << ", " + << "image_id=" << *m_image_id << dendl; + + if (r < 0) { + if (r == -ENOENT) { + dout(10) << "global image " << m_global_image_id << " not registered" 
+ << dendl; + } else { + derr << "failed to retrieve image id: " << cpp_strerror(r) << dendl; + } + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void GetMirrorImageIdRequest<I>::finish(int r) { + dout(20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::GetMirrorImageIdRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h new file mode 100644 index 00000000..b2664513 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H + +#include "include/buffer.h" +#include "include/rados/librados_fwd.hpp" +#include <string> + +namespace librbd { struct ImageCtx; } + +struct Context; + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class GetMirrorImageIdRequest { +public: + static GetMirrorImageIdRequest *create(librados::IoCtx &io_ctx, + const std::string &global_image_id, + std::string *image_id, + Context *on_finish) { + return new GetMirrorImageIdRequest(io_ctx, global_image_id, image_id, + on_finish); + } + + GetMirrorImageIdRequest(librados::IoCtx &io_ctx, + const std::string &global_image_id, + std::string *image_id, + Context *on_finish) + : m_io_ctx(io_ctx), m_global_image_id(global_image_id), + m_image_id(image_id), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_IMAGE_ID + * | + * v + * <finish> + + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + std::string m_global_image_id; + 
std::string *m_image_id; + Context *m_on_finish; + + bufferlist m_out_bl; + + void get_image_id(); + void handle_get_image_id(int r); + + void finish(int r); + +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::GetMirrorImageIdRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc new file mode 100644 index 00000000..54636fdb --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "IsPrimaryRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include <type_traits> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::IsPrimaryRequest: " \ + << this << " " << __func__ << " " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +IsPrimaryRequest<I>::IsPrimaryRequest(I *image_ctx, bool *primary, + Context *on_finish) + : m_image_ctx(image_ctx), m_primary(primary), m_on_finish(on_finish) { +} + +template <typename I> +void IsPrimaryRequest<I>::send() { + send_get_mirror_state(); +} + +template <typename I> +void IsPrimaryRequest<I>::send_get_mirror_state() { + dout(20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_get_start(&op, m_image_ctx->id); + + librados::AioCompletion *aio_comp = create_rados_callback< + IsPrimaryRequest<I>, 
&IsPrimaryRequest<I>::handle_get_mirror_state>(this); + int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void IsPrimaryRequest<I>::handle_get_mirror_state(int r) { + dout(20) << ": r=" << r << dendl; + + cls::rbd::MirrorImage mirror_image; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image); + if (r == 0) { + if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + send_is_tag_owner(); + return; + } else if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) { + dout(5) << ": image mirroring is being disabled" << dendl; + r = -ENOENT; + } else { + derr << ": image mirroring is disabled" << dendl; + r = -EINVAL; + } + } else { + derr << ": failed to decode image mirror state: " << cpp_strerror(r) + << dendl; + } + } else if (r == -ENOENT) { + dout(5) << ": image is not mirrored" << dendl; + } else { + derr << ": failed to retrieve image mirror state: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template <typename I> +void IsPrimaryRequest<I>::send_is_tag_owner() { + // deduce the class type for the journal to support unit tests + using Journal = typename std::decay< + typename std::remove_pointer<decltype(std::declval<I>().journal)> + ::type>::type; + + dout(20) << dendl; + + Context *ctx = create_context_callback< + IsPrimaryRequest<I>, &IsPrimaryRequest<I>::handle_is_tag_owner>(this); + + Journal::is_tag_owner(m_image_ctx, m_primary, ctx); +} + +template <typename I> +void IsPrimaryRequest<I>::handle_is_tag_owner(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to query remote image tag owner: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template <typename I> +void IsPrimaryRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace 
image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::IsPrimaryRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h new file mode 100644 index 00000000..ddb332cb --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_IS_PRIMARY_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_IS_PRIMARY_REQUEST_H + +#include "include/buffer.h" + +class Context; +class ContextWQ; +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class IsPrimaryRequest { +public: + static IsPrimaryRequest* create(ImageCtxT *image_ctx, bool *primary, + Context *on_finish) { + return new IsPrimaryRequest(image_ctx, primary, on_finish); + } + + IsPrimaryRequest(ImageCtxT *image_ctx, bool *primary, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_MIRROR_STATE * * * * * + * | * + * v * + * IS_TAG_OWNER * * * * * * * (error) + * | * + * v * + * <finish> < * * * * * * * * + * + * @endverbatim + */ + ImageCtxT *m_image_ctx; + bool *m_primary; + Context *m_on_finish; + + bufferlist m_out_bl; + + void send_get_mirror_state(); + void handle_get_mirror_state(int r); + + void send_is_tag_owner(); + void handle_is_tag_owner(int r); + + void finish(int r); +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::IsPrimaryRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_IS_PRIMARY_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc new file mode 100644 index 
00000000..7f55745e --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "OpenImageRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include <type_traits> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenImageRequest: " \ + << this << " " << __func__ << " " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; + +template <typename I> +OpenImageRequest<I>::OpenImageRequest(librados::IoCtx &io_ctx, I **image_ctx, + const std::string &image_id, + bool read_only, Context *on_finish) + : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(image_id), + m_read_only(read_only), m_on_finish(on_finish) { +} + +template <typename I> +void OpenImageRequest<I>::send() { + send_open_image(); +} + +template <typename I> +void OpenImageRequest<I>::send_open_image() { + dout(20) << dendl; + + *m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, m_read_only); + + Context *ctx = create_context_callback< + OpenImageRequest<I>, &OpenImageRequest<I>::handle_open_image>( + this); + (*m_image_ctx)->state->open(0, ctx); +} + +template <typename I> +void OpenImageRequest<I>::handle_open_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to open image '" << m_image_id << "': " + << cpp_strerror(r) << dendl; + (*m_image_ctx)->destroy(); + *m_image_ctx = nullptr; + } + + finish(r); +} + +template <typename I> +void OpenImageRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template 
class rbd::mirror::image_replayer::OpenImageRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h new file mode 100644 index 00000000..01ab3117 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/ImageCtx.h" +#include <string> + +class Context; +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class OpenImageRequest { +public: + static OpenImageRequest* create(librados::IoCtx &io_ctx, + ImageCtxT **image_ctx, + const std::string &image_id, + bool read_only, Context *on_finish) { + return new OpenImageRequest(io_ctx, image_ctx, image_id, read_only, + on_finish); + } + + OpenImageRequest(librados::IoCtx &io_ctx, ImageCtxT **image_ctx, + const std::string &image_id, bool read_only, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * OPEN_IMAGE + * | + * v + * <finish> + * + * @endverbatim + */ + librados::IoCtx &m_io_ctx; + ImageCtxT **m_image_ctx; + std::string m_image_id; + bool m_read_only; + Context *m_on_finish; + + void send_open_image(); + void handle_open_image(int r); + + void send_close_image(int r); + void handle_close_image(int r); + + void finish(int r); + +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::OpenImageRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc 
b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc new file mode 100644 index 00000000..87b141ca --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc @@ -0,0 +1,271 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "CloseImageRequest.h" +#include "IsPrimaryRequest.h" +#include "OpenLocalImageRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/journal/Policy.h" +#include <type_traits> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenLocalImageRequest: " \ + << this << " " << __func__ << " " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; + +namespace { + +template <typename I> +struct MirrorExclusiveLockPolicy : public librbd::exclusive_lock::Policy { + I *image_ctx; + + MirrorExclusiveLockPolicy(I *image_ctx) : image_ctx(image_ctx) { + } + + bool may_auto_request_lock() override { + return false; + } + + int lock_requested(bool force) override { + int r = -EROFS; + { + RWLock::RLocker owner_locker(image_ctx->owner_lock); + RWLock::RLocker snap_locker(image_ctx->snap_lock); + if (image_ctx->journal == nullptr || image_ctx->journal->is_tag_owner()) { + r = 0; + } + } + + if (r == 0) { + // if the local image journal has been closed or if it was (force) + // promoted allow the lock to be released to another client + image_ctx->exclusive_lock->release_lock(nullptr); + } + return r; + } + + bool accept_blocked_request( + librbd::exclusive_lock::OperationRequestType request_type) override { + if 
(request_type == + librbd::exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE) { + return true; + } + return false; + } +}; + +struct MirrorJournalPolicy : public librbd::journal::Policy { + ContextWQ *work_queue; + + MirrorJournalPolicy(ContextWQ *work_queue) : work_queue(work_queue) { + } + + bool append_disabled() const override { + // avoid recording any events to the local journal + return true; + } + bool journal_disabled() const override { + return false; + } + + void allocate_tag_on_lock(Context *on_finish) override { + // rbd-mirror will manually create tags by copying them from the peer + work_queue->queue(on_finish, 0); + } +}; + +} // anonymous namespace + +template <typename I> +OpenLocalImageRequest<I>::OpenLocalImageRequest(librados::IoCtx &local_io_ctx, + I **local_image_ctx, + const std::string &local_image_id, + ContextWQ *work_queue, + Context *on_finish) + : m_local_io_ctx(local_io_ctx), m_local_image_ctx(local_image_ctx), + m_local_image_id(local_image_id), m_work_queue(work_queue), + m_on_finish(on_finish) { +} + +template <typename I> +void OpenLocalImageRequest<I>::send() { + send_open_image(); +} + +template <typename I> +void OpenLocalImageRequest<I>::send_open_image() { + dout(20) << dendl; + + *m_local_image_ctx = I::create("", m_local_image_id, nullptr, + m_local_io_ctx, false); + { + RWLock::WLocker owner_locker((*m_local_image_ctx)->owner_lock); + RWLock::WLocker snap_locker((*m_local_image_ctx)->snap_lock); + (*m_local_image_ctx)->set_exclusive_lock_policy( + new MirrorExclusiveLockPolicy<I>(*m_local_image_ctx)); + (*m_local_image_ctx)->set_journal_policy( + new MirrorJournalPolicy(m_work_queue)); + } + + Context *ctx = create_context_callback< + OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_open_image>( + this); + (*m_local_image_ctx)->state->open(0, ctx); +} + +template <typename I> +void OpenLocalImageRequest<I>::handle_open_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + if (r == -ENOENT) 
{ + dout(10) << ": local image does not exist" << dendl; + } else { + derr << ": failed to open image '" << m_local_image_id << "': " + << cpp_strerror(r) << dendl; + } + (*m_local_image_ctx)->destroy(); + *m_local_image_ctx = nullptr; + finish(r); + return; + } + + send_is_primary(); +} + +template <typename I> +void OpenLocalImageRequest<I>::send_is_primary() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_is_primary>( + this); + IsPrimaryRequest<I> *request = IsPrimaryRequest<I>::create(*m_local_image_ctx, + &m_primary, ctx); + request->send(); +} + +template <typename I> +void OpenLocalImageRequest<I>::handle_is_primary(int r) { + dout(20) << ": r=" << r << dendl; + + if (r == -ENOENT) { + dout(5) << ": local image is not mirrored" << dendl; + send_close_image(r); + return; + } else if (r < 0) { + derr << ": error querying local image primary status: " << cpp_strerror(r) + << dendl; + send_close_image(r); + return; + } + + // if the local image owns the tag -- don't steal the lock since + // we aren't going to mirror peer data into this image anyway + if (m_primary) { + dout(10) << ": local image is primary -- skipping image replay" << dendl; + send_close_image(-EREMOTEIO); + return; + } + + send_lock_image(); +} + +template <typename I> +void OpenLocalImageRequest<I>::send_lock_image() { + dout(20) << dendl; + + RWLock::RLocker owner_locker((*m_local_image_ctx)->owner_lock); + if ((*m_local_image_ctx)->exclusive_lock == nullptr) { + derr << ": image does not support exclusive lock" << dendl; + send_close_image(-EINVAL); + return; + } + + // disallow any proxied maintenance operations before grabbing lock + (*m_local_image_ctx)->exclusive_lock->block_requests(-EROFS); + + Context *ctx = create_context_callback< + OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_lock_image>( + this); + + (*m_local_image_ctx)->exclusive_lock->acquire_lock(ctx); +} + +template <typename I> 
+void OpenLocalImageRequest<I>::handle_lock_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to lock image '" << m_local_image_id << "': " + << cpp_strerror(r) << dendl; + send_close_image(r); + return; + } + + { + RWLock::RLocker owner_locker((*m_local_image_ctx)->owner_lock); + if ((*m_local_image_ctx)->exclusive_lock == nullptr || + !(*m_local_image_ctx)->exclusive_lock->is_lock_owner()) { + derr << ": image is not locked" << dendl; + send_close_image(-EBUSY); + return; + } + } + + finish(0); +} + +template <typename I> +void OpenLocalImageRequest<I>::send_close_image(int r) { + dout(20) << dendl; + + if (m_ret_val == 0 && r < 0) { + m_ret_val = r; + } + + Context *ctx = create_context_callback< + OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_close_image>( + this); + CloseImageRequest<I> *request = CloseImageRequest<I>::create( + m_local_image_ctx, ctx); + request->send(); +} + +template <typename I> +void OpenLocalImageRequest<I>::handle_close_image(int r) { + dout(20) << dendl; + + ceph_assert(r == 0); + finish(m_ret_val); +} + +template <typename I> +void OpenLocalImageRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::OpenLocalImageRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h new file mode 100644 index 00000000..58de545f --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h @@ -0,0 +1,90 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/ImageCtx.h" +#include 
<string> + +class Context; +class ContextWQ; +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class OpenLocalImageRequest { +public: + static OpenLocalImageRequest* create(librados::IoCtx &local_io_ctx, + ImageCtxT **local_image_ctx, + const std::string &local_image_id, + ContextWQ *work_queue, + Context *on_finish) { + return new OpenLocalImageRequest(local_io_ctx, local_image_ctx, + local_image_id, work_queue, on_finish); + } + + OpenLocalImageRequest(librados::IoCtx &local_io_ctx, + ImageCtxT **local_image_ctx, + const std::string &local_image_id, + ContextWQ *m_work_queue, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * OPEN_IMAGE * * * * * * * * + * | * + * v * + * IS_PRIMARY * * * * * * * * + * | * + * v (skip if primary) v + * LOCK_IMAGE * * * > CLOSE_IMAGE + * | | + * v | + * <finish> <---------------/ + * + * @endverbatim + */ + librados::IoCtx &m_local_io_ctx; + ImageCtxT **m_local_image_ctx; + std::string m_local_image_id; + ContextWQ *m_work_queue; + Context *m_on_finish; + + bool m_primary = false; + int m_ret_val = 0; + + void send_open_image(); + void handle_open_image(int r); + + void send_is_primary(); + void handle_is_primary(int r); + + void send_lock_image(); + void handle_lock_image(int r); + + void send_close_image(int r); + void handle_close_image(int r); + + void finish(int r); + +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::OpenLocalImageRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc new file mode 100644 index 00000000..8e0ea837 --- /dev/null +++ 
b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_client.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "tools/rbd_mirror/Threads.h" +#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h" +#include <type_traits> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \ + << "PrepareLocalImageRequest: " << this << " " \ + << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +void PrepareLocalImageRequest<I>::send() { + dout(20) << dendl; + get_local_image_id(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::get_local_image_id() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + PrepareLocalImageRequest<I>, + &PrepareLocalImageRequest<I>::handle_get_local_image_id>(this); + auto req = GetMirrorImageIdRequest<I>::create(m_io_ctx, m_global_image_id, + m_local_image_id, ctx); + req->send(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::handle_get_local_image_id(int r) { + dout(20) << "r=" << r << ", " + << "local_image_id=" << *m_local_image_id << dendl; + + if (r < 0) { + finish(r); + return; + } + + get_local_image_name(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::get_local_image_name() { + dout(20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::dir_get_name_start(&op, *m_local_image_id); + + m_out_bl.clear(); + 
librados::AioCompletion *aio_comp = create_rados_callback< + PrepareLocalImageRequest<I>, + &PrepareLocalImageRequest<I>::handle_get_local_image_name>(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::handle_get_local_image_name(int r) { + dout(20) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::dir_get_name_finish(&it, m_local_image_name); + } + + if (r < 0) { + if (r != -ENOENT) { + derr << "failed to retrieve image name: " << cpp_strerror(r) << dendl; + } + finish(r); + return; + } + + get_mirror_state(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::get_mirror_state() { + dout(20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_get_start(&op, *m_local_image_id); + + m_out_bl.clear(); + librados::AioCompletion *aio_comp = create_rados_callback< + PrepareLocalImageRequest<I>, + &PrepareLocalImageRequest<I>::handle_get_mirror_state>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::handle_get_mirror_state(int r) { + dout(20) << ": r=" << r << dendl; + + cls::rbd::MirrorImage mirror_image; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image); + } + + if (r < 0) { + derr << "failed to retrieve image mirror state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + // TODO save current mirror state to determine if we should + // delete a partially formed image + // (e.g. 
MIRROR_IMAGE_STATE_CREATING/DELETING) + + get_tag_owner(); +} + +template <typename I> +void PrepareLocalImageRequest<I>::get_tag_owner() { + // deduce the class type for the journal to support unit tests + using Journal = typename std::decay< + typename std::remove_pointer<decltype(std::declval<I>().journal)> + ::type>::type; + + dout(20) << dendl; + + Context *ctx = create_context_callback< + PrepareLocalImageRequest<I>, + &PrepareLocalImageRequest<I>::handle_get_tag_owner>(this); + Journal::get_tag_owner(m_io_ctx, *m_local_image_id, m_tag_owner, + m_work_queue, ctx); +} + +template <typename I> +void PrepareLocalImageRequest<I>::handle_get_tag_owner(int r) { + dout(20) << "r=" << r << ", " + << "tag_owner=" << *m_tag_owner << dendl; + + if (r < 0) { + derr << "failed to retrieve journal tag owner: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void PrepareLocalImageRequest<I>::finish(int r) { + dout(20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h new file mode 100644 index 00000000..3417dd96 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H + +#include "include/buffer.h" +#include "include/rados/librados_fwd.hpp" +#include <string> + +namespace librbd { struct ImageCtx; } + +struct Context; +struct ContextWQ; + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename 
ImageCtxT = librbd::ImageCtx> +class PrepareLocalImageRequest { +public: + static PrepareLocalImageRequest *create(librados::IoCtx &io_ctx, + const std::string &global_image_id, + std::string *local_image_id, + std::string *local_image_name, + std::string *tag_owner, + ContextWQ *work_queue, + Context *on_finish) { + return new PrepareLocalImageRequest(io_ctx, global_image_id, local_image_id, + local_image_name, tag_owner, work_queue, + on_finish); + } + + PrepareLocalImageRequest(librados::IoCtx &io_ctx, + const std::string &global_image_id, + std::string *local_image_id, + std::string *local_image_name, + std::string *tag_owner, + ContextWQ *work_queue, + Context *on_finish) + : m_io_ctx(io_ctx), m_global_image_id(global_image_id), + m_local_image_id(local_image_id), m_local_image_name(local_image_name), + m_tag_owner(tag_owner), m_work_queue(work_queue), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_LOCAL_IMAGE_ID + * | + * v + * GET_LOCAL_IMAGE_NAME + * | + * v + * GET_MIRROR_STATE + * | + * v + * <finish> + + * @endverbatim + */ + + librados::IoCtx &m_io_ctx; + std::string m_global_image_id; + std::string *m_local_image_id; + std::string *m_local_image_name; + std::string *m_tag_owner; + ContextWQ *m_work_queue; + Context *m_on_finish; + + bufferlist m_out_bl; + + void get_local_image_id(); + void handle_get_local_image_id(int r); + + void get_local_image_name(); + void handle_get_local_image_name(int r); + + void get_mirror_state(); + void handle_get_mirror_state(int r); + + void get_tag_owner(); + void handle_get_tag_owner(int r); + + void finish(int r); + +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc 
b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc new file mode 100644 index 00000000..00c141e0 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_client.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/journal/Types.h" +#include "tools/rbd_mirror/Threads.h" +#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h" +#include "tools/rbd_mirror/image_replayer/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \ + << "PrepareRemoteImageRequest: " << this << " " \ + << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template <typename I> +void PrepareRemoteImageRequest<I>::send() { + get_remote_mirror_uuid(); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::get_remote_mirror_uuid() { + dout(20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_uuid_get_start(&op); + + librados::AioCompletion *aio_comp = create_rados_callback< + PrepareRemoteImageRequest<I>, + &PrepareRemoteImageRequest<I>::handle_get_remote_mirror_uuid>(this); + int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::handle_get_remote_mirror_uuid(int r) { + if (r >= 0) { + 
auto it = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_uuid_get_finish(&it, m_remote_mirror_uuid); + if (r >= 0 && m_remote_mirror_uuid->empty()) { + r = -ENOENT; + } + } + + dout(20) << "r=" << r << dendl; + if (r < 0) { + if (r == -ENOENT) { + dout(5) << "remote mirror uuid missing" << dendl; + } else { + derr << "failed to retrieve remote mirror uuid: " << cpp_strerror(r) + << dendl; + } + finish(r); + return; + } + + get_remote_image_id(); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::get_remote_image_id() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + PrepareRemoteImageRequest<I>, + &PrepareRemoteImageRequest<I>::handle_get_remote_image_id>(this); + auto req = GetMirrorImageIdRequest<I>::create(m_remote_io_ctx, + m_global_image_id, + m_remote_image_id, ctx); + req->send(); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::handle_get_remote_image_id(int r) { + dout(20) << "r=" << r << ", " + << "remote_image_id=" << *m_remote_image_id << dendl; + + if (r < 0) { + finish(r); + return; + } + + get_client(); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::get_client() { + dout(20) << dendl; + + ceph_assert(*m_remote_journaler == nullptr); + *m_remote_journaler = new Journaler(m_threads->work_queue, m_threads->timer, + &m_threads->timer_lock, m_remote_io_ctx, + *m_remote_image_id, m_local_mirror_uuid, + m_journal_settings); + + Context *ctx = create_async_context_callback( + m_threads->work_queue, create_context_callback< + PrepareRemoteImageRequest<I>, + &PrepareRemoteImageRequest<I>::handle_get_client>(this)); + (*m_remote_journaler)->get_client(m_local_mirror_uuid, &m_client, ctx); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::handle_get_client(int r) { + dout(20) << "r=" << r << dendl; + + if (r == -ENOENT) { + dout(10) << "client not registered" << dendl; + register_client(); + } else if (r < 0) { + derr << "failed to retrieve client: " << cpp_strerror(r) << dendl; + 
finish(r); + } else if (!util::decode_client_meta(m_client, m_client_meta)) { + // require operator intervention since the data is corrupt + finish(-EBADMSG); + } else { + // skip registration if it already exists + *m_client_state = m_client.state; + finish(0); + } +} + +template <typename I> +void PrepareRemoteImageRequest<I>::register_client() { + dout(20) << dendl; + + librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{ + m_local_image_id}; + mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + + librbd::journal::ClientData client_data{mirror_peer_client_meta}; + bufferlist client_data_bl; + encode(client_data, client_data_bl); + + Context *ctx = create_async_context_callback( + m_threads->work_queue, create_context_callback< + PrepareRemoteImageRequest<I>, + &PrepareRemoteImageRequest<I>::handle_register_client>(this)); + (*m_remote_journaler)->register_client(client_data_bl, ctx); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::handle_register_client(int r) { + dout(20) << "r=" << r << dendl; + + if (r < 0) { + derr << "failed to register with remote journal: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + *m_client_state = cls::journal::CLIENT_STATE_CONNECTED; + *m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id); + m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + + finish(0); +} + +template <typename I> +void PrepareRemoteImageRequest<I>::finish(int r) { + dout(20) << "r=" << r << dendl; + + if (r < 0) { + delete *m_remote_journaler; + *m_remote_journaler = nullptr; + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_replayer::PrepareRemoteImageRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h new file mode 100644 
index 00000000..100a066b --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H +#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H + +#include "include/buffer_fwd.h" +#include "include/rados/librados_fwd.hpp" +#include "cls/journal/cls_journal_types.h" +#include "journal/Settings.h" +#include "librbd/journal/TypeTraits.h" +#include <string> + +namespace journal { class Journaler; } +namespace journal { class Settings; } +namespace librbd { struct ImageCtx; } +namespace librbd { namespace journal { struct MirrorPeerClientMeta; } } + +struct Context; +struct ContextWQ; + +namespace rbd { +namespace mirror { + +template <typename> struct Threads; + +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class PrepareRemoteImageRequest { +public: + typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta; + + static PrepareRemoteImageRequest *create(Threads<ImageCtxT> *threads, + librados::IoCtx &remote_io_ctx, + const std::string &global_image_id, + const std::string &local_mirror_uuid, + const std::string &local_image_id, + const journal::Settings &settings, + std::string *remote_mirror_uuid, + std::string *remote_image_id, + Journaler **remote_journaler, + cls::journal::ClientState *client_state, + MirrorPeerClientMeta *client_meta, + Context *on_finish) { + return new PrepareRemoteImageRequest(threads, remote_io_ctx, + global_image_id, local_mirror_uuid, + local_image_id, settings, + remote_mirror_uuid, remote_image_id, + remote_journaler, client_state, + client_meta, on_finish); + } + + PrepareRemoteImageRequest(Threads<ImageCtxT> *threads, + librados::IoCtx &remote_io_ctx, + const std::string 
&global_image_id, + const std::string &local_mirror_uuid, + const std::string &local_image_id, + const journal::Settings &journal_settings, + std::string *remote_mirror_uuid, + std::string *remote_image_id, + Journaler **remote_journaler, + cls::journal::ClientState *client_state, + MirrorPeerClientMeta *client_meta, + Context *on_finish) + : m_threads(threads), m_remote_io_ctx(remote_io_ctx), + m_global_image_id(global_image_id), + m_local_mirror_uuid(local_mirror_uuid), m_local_image_id(local_image_id), + m_journal_settings(journal_settings), + m_remote_mirror_uuid(remote_mirror_uuid), + m_remote_image_id(remote_image_id), + m_remote_journaler(remote_journaler), m_client_state(client_state), + m_client_meta(client_meta), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_REMOTE_MIRROR_UUID + * | + * v + * GET_REMOTE_IMAGE_ID + * | + * v + * GET_CLIENT + * | + * v (skip if not needed) + * REGISTER_CLIENT + * | + * v + * <finish> + + * @endverbatim + */ + + Threads<ImageCtxT> *m_threads; + librados::IoCtx &m_remote_io_ctx; + std::string m_global_image_id; + std::string m_local_mirror_uuid; + std::string m_local_image_id; + journal::Settings m_journal_settings; + std::string *m_remote_mirror_uuid; + std::string *m_remote_image_id; + Journaler **m_remote_journaler; + cls::journal::ClientState *m_client_state; + MirrorPeerClientMeta *m_client_meta; + Context *m_on_finish; + + bufferlist m_out_bl; + cls::journal::Client m_client; + + void get_remote_mirror_uuid(); + void handle_get_remote_mirror_uuid(int r); + + void get_remote_image_id(); + void handle_get_remote_image_id(int r); + + void get_client(); + void handle_get_client(int r); + + void register_client(); + void handle_register_client(int r); + + void finish(int r); +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_replayer::PrepareRemoteImageRequest<librbd::ImageCtx>; + 
+#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc new file mode 100644 index 00000000..f514d749 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ReplayStatusFormatter.h" +#include "common/debug.h" +#include "common/dout.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::ReplayStatusFormatter: " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_replayer { + +using librbd::util::unique_lock_name; + +template <typename I> +ReplayStatusFormatter<I>::ReplayStatusFormatter(Journaler *journaler, + const std::string &mirror_uuid) + : m_journaler(journaler), + m_mirror_uuid(mirror_uuid), + m_lock(unique_lock_name("ReplayStatusFormatter::m_lock", this)) { +} + +template <typename I> +bool ReplayStatusFormatter<I>::get_or_send_update(std::string *description, + Context *on_finish) { + dout(20) << dendl; + + bool in_progress = false; + { + Mutex::Locker locker(m_lock); + if (m_on_finish) { + in_progress = true; + } else { + m_on_finish = on_finish; + } + } + + if (in_progress) { + dout(10) << "previous request is still in progress, ignoring" << dendl; + on_finish->complete(-EAGAIN); + return false; + } + + m_master_position = cls::journal::ObjectPosition(); + m_mirror_position = cls::journal::ObjectPosition(); + + cls::journal::Client master_client, mirror_client; + int r; + + r = m_journaler->get_cached_client(librbd::Journal<>::IMAGE_CLIENT_ID, + &master_client); 
+ if (r < 0) { + derr << "error retrieving registered master client: " + << cpp_strerror(r) << dendl; + } else { + r = m_journaler->get_cached_client(m_mirror_uuid, &mirror_client); + if (r < 0) { + derr << "error retrieving registered mirror client: " + << cpp_strerror(r) << dendl; + } + } + + if (!master_client.commit_position.object_positions.empty()) { + m_master_position = + *(master_client.commit_position.object_positions.begin()); + } + + if (!mirror_client.commit_position.object_positions.empty()) { + m_mirror_position = + *(mirror_client.commit_position.object_positions.begin()); + } + + if (!calculate_behind_master_or_send_update()) { + dout(20) << "need to update tag cache" << dendl; + return false; + } + + format(description); + + { + Mutex::Locker locker(m_lock); + ceph_assert(m_on_finish == on_finish); + m_on_finish = nullptr; + } + + on_finish->complete(-EEXIST); + return true; +} + +template <typename I> +bool ReplayStatusFormatter<I>::calculate_behind_master_or_send_update() { + dout(20) << "m_master_position=" << m_master_position + << ", m_mirror_position=" << m_mirror_position << dendl; + + m_entries_behind_master = 0; + + if (m_master_position == cls::journal::ObjectPosition() || + m_master_position.tag_tid < m_mirror_position.tag_tid) { + return true; + } + + cls::journal::ObjectPosition master = m_master_position; + uint64_t mirror_tag_tid = m_mirror_position.tag_tid; + + while (master.tag_tid > mirror_tag_tid) { + auto tag_it = m_tag_cache.find(master.tag_tid); + if (tag_it == m_tag_cache.end()) { + send_update_tag_cache(master.tag_tid, mirror_tag_tid); + return false; + } + librbd::journal::TagData &tag_data = tag_it->second; + m_entries_behind_master += master.entry_tid; + master = {0, tag_data.predecessor.tag_tid, tag_data.predecessor.entry_tid}; + } + if (master.tag_tid == mirror_tag_tid && + master.entry_tid > m_mirror_position.entry_tid) { + m_entries_behind_master += master.entry_tid - m_mirror_position.entry_tid; + } + + dout(20) << 
"clearing tags not needed any more (below mirror position)" + << dendl; + + uint64_t tag_tid = mirror_tag_tid; + size_t old_size = m_tag_cache.size(); + while (tag_tid != 0) { + auto tag_it = m_tag_cache.find(tag_tid); + if (tag_it == m_tag_cache.end()) { + break; + } + librbd::journal::TagData &tag_data = tag_it->second; + + dout(20) << "erasing tag " << tag_data << "for tag_tid " << tag_tid + << dendl; + + tag_tid = tag_data.predecessor.tag_tid; + m_tag_cache.erase(tag_it); + } + + dout(20) << old_size - m_tag_cache.size() << " entries cleared" << dendl; + + return true; +} + +template <typename I> +void ReplayStatusFormatter<I>::send_update_tag_cache(uint64_t master_tag_tid, + uint64_t mirror_tag_tid) { + if (master_tag_tid <= mirror_tag_tid || + m_tag_cache.find(master_tag_tid) != m_tag_cache.end()) { + Context *on_finish = nullptr; + { + Mutex::Locker locker(m_lock); + std::swap(m_on_finish, on_finish); + } + + ceph_assert(on_finish); + on_finish->complete(0); + return; + } + + dout(20) << "master_tag_tid=" << master_tag_tid << ", mirror_tag_tid=" + << mirror_tag_tid << dendl; + + FunctionContext *ctx = new FunctionContext( + [this, master_tag_tid, mirror_tag_tid](int r) { + handle_update_tag_cache(master_tag_tid, mirror_tag_tid, r); + }); + m_journaler->get_tag(master_tag_tid, &m_tag, ctx); +} + +template <typename I> +void ReplayStatusFormatter<I>::handle_update_tag_cache(uint64_t master_tag_tid, + uint64_t mirror_tag_tid, + int r) { + librbd::journal::TagData tag_data; + + if (r < 0) { + derr << "error retrieving tag " << master_tag_tid << ": " << cpp_strerror(r) + << dendl; + } else { + dout(20) << "retrieved tag " << master_tag_tid << ": " << m_tag << dendl; + + auto it = m_tag.data.cbegin(); + try { + decode(tag_data, it); + } catch (const buffer::error &err) { + derr << "error decoding tag " << master_tag_tid << ": " << err.what() + << dendl; + } + } + + if (tag_data.predecessor.mirror_uuid != + librbd::Journal<>::LOCAL_MIRROR_UUID && + 
tag_data.predecessor.mirror_uuid != + librbd::Journal<>::ORPHAN_MIRROR_UUID) { + dout(20) << "hit remote image non-primary epoch" << dendl; + tag_data.predecessor = {}; + } + + dout(20) << "decoded tag " << master_tag_tid << ": " << tag_data << dendl; + + m_tag_cache[master_tag_tid] = tag_data; + send_update_tag_cache(tag_data.predecessor.tag_tid, mirror_tag_tid); +} + +template <typename I> +void ReplayStatusFormatter<I>::format(std::string *description) { + + dout(20) << "m_master_position=" << m_master_position + << ", m_mirror_position=" << m_mirror_position + << ", m_entries_behind_master=" << m_entries_behind_master << dendl; + + std::stringstream ss; + ss << "master_position="; + if (m_master_position == cls::journal::ObjectPosition()) { + ss << "[]"; + } else { + ss << m_master_position; + } + ss << ", mirror_position="; + if (m_mirror_position == cls::journal::ObjectPosition()) { + ss << "[]"; + } else { + ss << m_mirror_position; + } + ss << ", entries_behind_master=" + << (m_entries_behind_master > 0 ? 
m_entries_behind_master : 0); + + *description = ss.str(); +} + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +template class +rbd::mirror::image_replayer::ReplayStatusFormatter<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h new file mode 100644 index 00000000..59940a65 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H +#define RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H + +#include "include/Context.h" +#include "common/Mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" + +namespace journal { class Journaler; } +namespace librbd { class ImageCtx; } + +namespace rbd { +namespace mirror { +namespace image_replayer { + +template <typename ImageCtxT = librbd::ImageCtx> +class ReplayStatusFormatter { +public: + typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler; + + static ReplayStatusFormatter* create(Journaler *journaler, + const std::string &mirror_uuid) { + return new ReplayStatusFormatter(journaler, mirror_uuid); + } + + static void destroy(ReplayStatusFormatter* formatter) { + delete formatter; + } + + ReplayStatusFormatter(Journaler *journaler, const std::string &mirror_uuid); + + bool get_or_send_update(std::string *description, Context *on_finish); + +private: + Journaler *m_journaler; + std::string m_mirror_uuid; + Mutex m_lock; + Context *m_on_finish = nullptr; + cls::journal::ObjectPosition m_master_position; + cls::journal::ObjectPosition m_mirror_position; + int m_entries_behind_master = 0; + cls::journal::Tag m_tag; + std::map<uint64_t, librbd::journal::TagData> m_tag_cache; + + bool 
calculate_behind_master_or_send_update(); + void send_update_tag_cache(uint64_t master_tag_tid, uint64_t mirror_tag_tid); + void handle_update_tag_cache(uint64_t master_tag_tid, uint64_t mirror_tag_tid, + int r); + void format(std::string *description); +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +#endif // RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H diff --git a/src/tools/rbd_mirror/image_replayer/Types.h b/src/tools/rbd_mirror/image_replayer/Types.h new file mode 100644 index 00000000..6ab988a7 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/Types.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H +#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H + +namespace rbd { +namespace mirror { +namespace image_replayer { + +enum HealthState { + HEALTH_STATE_OK, + HEALTH_STATE_WARNING, + HEALTH_STATE_ERROR +}; + +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H diff --git a/src/tools/rbd_mirror/image_replayer/Utils.cc b/src/tools/rbd_mirror/image_replayer/Utils.cc new file mode 100644 index 00000000..eda0179f --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/Utils.cc @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/image_replayer/Utils.h" +#include "common/debug.h" +#include "common/errno.h" +#include "cls/journal/cls_journal_types.h" +#include "librbd/journal/Types.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_replayer::util::" \ + << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace image_replayer { +namespace util { + +bool decode_client_meta(const cls::journal::Client& client, + 
librbd::journal::MirrorPeerClientMeta* client_meta) { + dout(15) << dendl; + + librbd::journal::ClientData client_data; + auto it = client.data.cbegin(); + try { + decode(client_data, it); + } catch (const buffer::error &err) { + derr << "failed to decode client meta data: " << err.what() << dendl; + return false; + } + + auto local_client_meta = boost::get<librbd::journal::MirrorPeerClientMeta>( + &client_data.client_meta); + if (local_client_meta == nullptr) { + derr << "unknown peer registration" << dendl; + return false; + } + + *client_meta = *local_client_meta; + dout(15) << "client found: client_meta=" << *client_meta << dendl; + return true; +} + +} // namespace util +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + diff --git a/src/tools/rbd_mirror/image_replayer/Utils.h b/src/tools/rbd_mirror/image_replayer/Utils.h new file mode 100644 index 00000000..d42146d1 --- /dev/null +++ b/src/tools/rbd_mirror/image_replayer/Utils.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_REPLAYER_UTILS_H +#define RBD_MIRROR_IMAGE_REPLAYER_UTILS_H + +namespace cls { namespace journal { struct Client; } } +namespace librbd { namespace journal { struct MirrorPeerClientMeta; } } + +namespace rbd { +namespace mirror { +namespace image_replayer { +namespace util { + +bool decode_client_meta(const cls::journal::Client& client, + librbd::journal::MirrorPeerClientMeta* client_meta); + +} // namespace util +} // namespace image_replayer +} // namespace mirror +} // namespace rbd + +#endif // RBD_MIRROR_IMAGE_REPLAYER_UTILS_H diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc new file mode 100644 index 00000000..ffe2eca9 --- /dev/null +++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- +// vim: ts=8 sw=2 smarttab + +#include "SyncPointCreateRequest.h" +#include "include/uuid.h" +#include "common/debug.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_sync::SyncPointCreateRequest: " \ + << this << " " << __func__ + +namespace rbd { +namespace mirror { +namespace image_sync { + +namespace { + +static const std::string SNAP_NAME_PREFIX(".rbd-mirror"); + +} // anonymous namespace + +using librbd::util::create_context_callback; + +template <typename I> +SyncPointCreateRequest<I>::SyncPointCreateRequest(I *remote_image_ctx, + const std::string &mirror_uuid, + Journaler *journaler, + MirrorPeerClientMeta *client_meta, + Context *on_finish) + : m_remote_image_ctx(remote_image_ctx), m_mirror_uuid(mirror_uuid), + m_journaler(journaler), m_client_meta(client_meta), m_on_finish(on_finish), + m_client_meta_copy(*client_meta) { + ceph_assert(m_client_meta->sync_points.size() < 2); + + // initialize the updated client meta with the new sync point + m_client_meta_copy.sync_points.emplace_back(); + if (m_client_meta_copy.sync_points.size() > 1) { + m_client_meta_copy.sync_points.back().from_snap_name = + m_client_meta_copy.sync_points.front().snap_name; + } +} + +template <typename I> +void SyncPointCreateRequest<I>::send() { + send_update_client(); +} + +template <typename I> +void SyncPointCreateRequest<I>::send_update_client() { + uuid_d uuid_gen; + uuid_gen.generate_random(); + + MirrorPeerSyncPoint &sync_point = m_client_meta_copy.sync_points.back(); + sync_point.snap_name = SNAP_NAME_PREFIX + "." + m_mirror_uuid + "." 
+ + uuid_gen.to_string(); + + dout(20) << ": sync_point=" << sync_point << dendl; + + bufferlist client_data_bl; + librbd::journal::ClientData client_data(m_client_meta_copy); + encode(client_data, client_data_bl); + + Context *ctx = create_context_callback< + SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_update_client>( + this); + m_journaler->update_client(client_data_bl, ctx); +} + +template <typename I> +void SyncPointCreateRequest<I>::handle_update_client(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to update client data: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + // update provided meta structure to reflect reality + *m_client_meta = m_client_meta_copy; + + send_refresh_image(); +} + +template <typename I> +void SyncPointCreateRequest<I>::send_refresh_image() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_refresh_image>( + this); + m_remote_image_ctx->state->refresh(ctx); +} + +template <typename I> +void SyncPointCreateRequest<I>::handle_refresh_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": remote image refresh failed: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_create_snap(); +} + +template <typename I> +void SyncPointCreateRequest<I>::send_create_snap() { + dout(20) << dendl; + + MirrorPeerSyncPoint &sync_point = m_client_meta_copy.sync_points.back(); + + Context *ctx = create_context_callback< + SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_create_snap>( + this); + m_remote_image_ctx->operations->snap_create( + cls::rbd::UserSnapshotNamespace(), sync_point.snap_name.c_str(), ctx); +} + +template <typename I> +void SyncPointCreateRequest<I>::handle_create_snap(int r) { + dout(20) << ": r=" << r << dendl; + + if (r == -EEXIST) { + send_update_client(); + return; + } else if (r < 0) { + derr << ": failed to create snapshot: " << 
cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_final_refresh_image(); +} + +template <typename I> +void SyncPointCreateRequest<I>::send_final_refresh_image() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + SyncPointCreateRequest<I>, + &SyncPointCreateRequest<I>::handle_final_refresh_image>(this); + m_remote_image_ctx->state->refresh(ctx); +} + +template <typename I> +void SyncPointCreateRequest<I>::handle_final_refresh_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to refresh image for snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void SyncPointCreateRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_sync +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::image_sync::SyncPointCreateRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h new file mode 100644 index 00000000..45275ec4 --- /dev/null +++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H +#define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H + +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include <string> + +class Context; +namespace journal { class Journaler; } +namespace librbd { class ImageCtx; } +namespace librbd { namespace journal { struct MirrorPeerClientMeta; } } + +namespace rbd { +namespace mirror { +namespace image_sync { + +template <typename ImageCtxT = librbd::ImageCtx> +class SyncPointCreateRequest { +public: + typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + 
typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta; + typedef librbd::journal::MirrorPeerSyncPoint MirrorPeerSyncPoint; + + static SyncPointCreateRequest* create(ImageCtxT *remote_image_ctx, + const std::string &mirror_uuid, + Journaler *journaler, + MirrorPeerClientMeta *client_meta, + Context *on_finish) { + return new SyncPointCreateRequest(remote_image_ctx, mirror_uuid, journaler, + client_meta, on_finish); + } + + SyncPointCreateRequest(ImageCtxT *remote_image_ctx, + const std::string &mirror_uuid, Journaler *journaler, + MirrorPeerClientMeta *client_meta, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * UPDATE_CLIENT < . . + * | . + * v . + * REFRESH_IMAGE . + * | . (repeat on EEXIST) + * v . + * CREATE_SNAP . . . . + * | + * v + * REFRESH_IMAGE + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT *m_remote_image_ctx; + std::string m_mirror_uuid; + Journaler *m_journaler; + MirrorPeerClientMeta *m_client_meta; + Context *m_on_finish; + + MirrorPeerClientMeta m_client_meta_copy; + + void send_update_client(); + void handle_update_client(int r); + + void send_refresh_image(); + void handle_refresh_image(int r); + + void send_create_snap(); + void handle_create_snap(int r); + + void send_final_refresh_image(); + void handle_final_refresh_image(int r); + + void finish(int r); +}; + +} // namespace image_sync +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_sync::SyncPointCreateRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc new file mode 100644 index 00000000..2cfed5e6 --- /dev/null +++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc @@ -0,0 +1,220 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"SyncPointPruneRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include <set> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::image_sync::SyncPointPruneRequest: " \ + << this << " " << __func__ +namespace rbd { +namespace mirror { +namespace image_sync { + +using librbd::util::create_context_callback; + +template <typename I> +SyncPointPruneRequest<I>::SyncPointPruneRequest(I *remote_image_ctx, + bool sync_complete, + Journaler *journaler, + MirrorPeerClientMeta *client_meta, + Context *on_finish) + : m_remote_image_ctx(remote_image_ctx), m_sync_complete(sync_complete), + m_journaler(journaler), m_client_meta(client_meta), m_on_finish(on_finish), + m_client_meta_copy(*client_meta) { +} + +template <typename I> +void SyncPointPruneRequest<I>::send() { + if (m_client_meta->sync_points.empty()) { + send_remove_snap(); + return; + } + + if (m_sync_complete) { + // if sync is complete, we can remove the master sync point + auto it = m_client_meta_copy.sync_points.begin(); + MirrorPeerSyncPoint &sync_point = *it; + + ++it; + if (it == m_client_meta_copy.sync_points.end() || + it->from_snap_name != sync_point.snap_name) { + m_snap_names.push_back(sync_point.snap_name); + } + + if (!sync_point.from_snap_name.empty()) { + m_snap_names.push_back(sync_point.from_snap_name); + } + } else { + // if we have more than one sync point or invalid sync points, + // trim them off + RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock); + std::set<std::string> snap_names; + for (auto it = m_client_meta_copy.sync_points.rbegin(); + it != m_client_meta_copy.sync_points.rend(); ++it) { + MirrorPeerSyncPoint &sync_point = *it; + if (&sync_point == &m_client_meta_copy.sync_points.front()) { + if 
(m_remote_image_ctx->get_snap_id( + cls::rbd::UserSnapshotNamespace(), sync_point.snap_name) == + CEPH_NOSNAP) { + derr << ": failed to locate sync point snapshot: " + << sync_point.snap_name << dendl; + } else if (!sync_point.from_snap_name.empty()) { + derr << ": unexpected from_snap_name in primary sync point: " + << sync_point.from_snap_name << dendl; + } else { + // first sync point is OK -- keep it + break; + } + m_invalid_master_sync_point = true; + } + + if (snap_names.count(sync_point.snap_name) == 0) { + snap_names.insert(sync_point.snap_name); + m_snap_names.push_back(sync_point.snap_name); + } + + MirrorPeerSyncPoint &front_sync_point = + m_client_meta_copy.sync_points.front(); + if (!sync_point.from_snap_name.empty() && + snap_names.count(sync_point.from_snap_name) == 0 && + sync_point.from_snap_name != front_sync_point.snap_name) { + snap_names.insert(sync_point.from_snap_name); + m_snap_names.push_back(sync_point.from_snap_name); + } + } + } + + send_remove_snap(); +} + +template <typename I> +void SyncPointPruneRequest<I>::send_remove_snap() { + if (m_snap_names.empty()) { + send_refresh_image(); + return; + } + + const std::string &snap_name = m_snap_names.front(); + + dout(20) << ": snap_name=" << snap_name << dendl; + + Context *ctx = create_context_callback< + SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_remove_snap>( + this); + m_remote_image_ctx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(), + snap_name.c_str(), + ctx); +} + +template <typename I> +void SyncPointPruneRequest<I>::handle_remove_snap(int r) { + dout(20) << ": r=" << r << dendl; + + ceph_assert(!m_snap_names.empty()); + std::string snap_name = m_snap_names.front(); + m_snap_names.pop_front(); + + if (r == -ENOENT) { + r = 0; + } + if (r < 0) { + derr << ": failed to remove snapshot '" << snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_remove_snap(); +} + +template <typename I> +void 
SyncPointPruneRequest<I>::send_refresh_image() { + dout(20) << dendl; + + Context *ctx = create_context_callback< + SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_refresh_image>( + this); + m_remote_image_ctx->state->refresh(ctx); +} + +template <typename I> +void SyncPointPruneRequest<I>::handle_refresh_image(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": remote image refresh failed: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_update_client(); +} + +template <typename I> +void SyncPointPruneRequest<I>::send_update_client() { + dout(20) << dendl; + + if (m_sync_complete) { + m_client_meta_copy.sync_points.pop_front(); + if (m_client_meta_copy.sync_points.empty()) { + m_client_meta_copy.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING; + } + } else { + while (m_client_meta_copy.sync_points.size() > 1) { + m_client_meta_copy.sync_points.pop_back(); + } + if (m_invalid_master_sync_point) { + // all subsequent sync points would have been pruned + m_client_meta_copy.sync_points.clear(); + } + } + + bufferlist client_data_bl; + librbd::journal::ClientData client_data(m_client_meta_copy); + encode(client_data, client_data_bl); + + Context *ctx = create_context_callback< + SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_update_client>( + this); + m_journaler->update_client(client_data_bl, ctx); +} + +template <typename I> +void SyncPointPruneRequest<I>::handle_update_client(int r) { + dout(20) << ": r=" << r << dendl; + + if (r < 0) { + derr << ": failed to update client data: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + // update provided meta structure to reflect reality + *m_client_meta = m_client_meta_copy; + finish(0); +} + +template <typename I> +void SyncPointPruneRequest<I>::finish(int r) { + dout(20) << ": r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image_sync +} // namespace mirror +} // namespace rbd + +template class 
rbd::mirror::image_sync::SyncPointPruneRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h new file mode 100644 index 00000000..65e13ef5 --- /dev/null +++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H +#define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H + +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include <list> +#include <string> + +class Context; +namespace journal { class Journaler; } +namespace librbd { class ImageCtx; } +namespace librbd { namespace journal { struct MirrorPeerClientMeta; } } + +namespace rbd { +namespace mirror { +namespace image_sync { + +template <typename ImageCtxT = librbd::ImageCtx> +class SyncPointPruneRequest { +public: + typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta; + typedef librbd::journal::MirrorPeerSyncPoint MirrorPeerSyncPoint; + + static SyncPointPruneRequest* create(ImageCtxT *remote_image_ctx, + bool sync_complete, + Journaler *journaler, + MirrorPeerClientMeta *client_meta, + Context *on_finish) { + return new SyncPointPruneRequest(remote_image_ctx, sync_complete, journaler, + client_meta, on_finish); + } + + SyncPointPruneRequest(ImageCtxT *remote_image_ctx, bool sync_complete, + Journaler *journaler, MirrorPeerClientMeta *client_meta, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * | . . . . . + * | . . + * v v . (repeat if from snap + * REMOVE_SNAP . . . 
unused by other sync) + * | + * v + * REFRESH_IMAGE + * | + * v + * UPDATE_CLIENT + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT *m_remote_image_ctx; + bool m_sync_complete; + Journaler *m_journaler; + MirrorPeerClientMeta *m_client_meta; + Context *m_on_finish; + + MirrorPeerClientMeta m_client_meta_copy; + std::list<std::string> m_snap_names; + + bool m_invalid_master_sync_point = false; + + void send_remove_snap(); + void handle_remove_snap(int r); + + void send_refresh_image(); + void handle_refresh_image(int r); + + void send_update_client(); + void handle_update_client(int r); + + void finish(int r); +}; + +} // namespace image_sync +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::image_sync::SyncPointPruneRequest<librbd::ImageCtx>; + +#endif // RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H diff --git a/src/tools/rbd_mirror/instance_watcher/Types.cc b/src/tools/rbd_mirror/instance_watcher/Types.cc new file mode 100644 index 00000000..0e992273 --- /dev/null +++ b/src/tools/rbd_mirror/instance_watcher/Types.cc @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Types.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "common/Formatter.h" + +namespace rbd { +namespace mirror { +namespace instance_watcher { + +namespace { + +class EncodePayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {} + + template <typename Payload> + inline void operator()(const Payload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl); + payload.encode(m_bl); + } + +private: + bufferlist &m_bl; +}; + +class DecodePayloadVisitor : public boost::static_visitor<void> { +public: + DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter) + : m_version(version), m_iter(iter) {} + + template <typename Payload> 
+ inline void operator()(Payload &payload) const { + payload.decode(m_version, m_iter); + } + +private: + __u8 m_version; + bufferlist::const_iterator &m_iter; +}; + +class DumpPayloadVisitor : public boost::static_visitor<void> { +public: + explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {} + + template <typename Payload> + inline void operator()(const Payload &payload) const { + NotifyOp notify_op = Payload::NOTIFY_OP; + m_formatter->dump_string("notify_op", stringify(notify_op)); + payload.dump(m_formatter); + } + +private: + ceph::Formatter *m_formatter; +}; + +} // anonymous namespace + +void PayloadBase::encode(bufferlist &bl) const { + using ceph::encode; + encode(request_id, bl); +} + +void PayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(request_id, iter); +} + +void PayloadBase::dump(Formatter *f) const { + f->dump_unsigned("request_id", request_id); +} + +void ImagePayloadBase::encode(bufferlist &bl) const { + using ceph::encode; + PayloadBase::encode(bl); + encode(global_image_id, bl); +} + +void ImagePayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + PayloadBase::decode(version, iter); + decode(global_image_id, iter); +} + +void ImagePayloadBase::dump(Formatter *f) const { + PayloadBase::dump(f); + f->dump_string("global_image_id", global_image_id); +} + +void PeerImageRemovedPayload::encode(bufferlist &bl) const { + using ceph::encode; + PayloadBase::encode(bl); + encode(global_image_id, bl); + encode(peer_mirror_uuid, bl); +} + +void PeerImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + PayloadBase::decode(version, iter); + decode(global_image_id, iter); + decode(peer_mirror_uuid, iter); +} + +void PeerImageRemovedPayload::dump(Formatter *f) const { + PayloadBase::dump(f); + f->dump_string("global_image_id", global_image_id); + f->dump_string("peer_mirror_uuid", 
peer_mirror_uuid); +} + +void SyncPayloadBase::encode(bufferlist &bl) const { + using ceph::encode; + PayloadBase::encode(bl); + encode(sync_id, bl); +} + +void SyncPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + PayloadBase::decode(version, iter); + decode(sync_id, iter); +} + +void SyncPayloadBase::dump(Formatter *f) const { + PayloadBase::dump(f); + f->dump_string("sync_id", sync_id); +} + +void UnknownPayload::encode(bufferlist &bl) const { + ceph_abort(); +} + +void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void UnknownPayload::dump(Formatter *f) const { +} + +void NotifyMessage::encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + boost::apply_visitor(EncodePayloadVisitor(bl), payload); + ENCODE_FINISH(bl); +} + +void NotifyMessage::decode(bufferlist::const_iterator& iter) { + DECODE_START(2, iter); + + uint32_t notify_op; + decode(notify_op, iter); + + // select the correct payload variant based upon the encoded op + switch (notify_op) { + case NOTIFY_OP_IMAGE_ACQUIRE: + payload = ImageAcquirePayload(); + break; + case NOTIFY_OP_IMAGE_RELEASE: + payload = ImageReleasePayload(); + break; + case NOTIFY_OP_PEER_IMAGE_REMOVED: + payload = PeerImageRemovedPayload(); + break; + case NOTIFY_OP_SYNC_REQUEST: + payload = SyncRequestPayload(); + break; + case NOTIFY_OP_SYNC_START: + payload = SyncStartPayload(); + break; + default: + payload = UnknownPayload(); + break; + } + + apply_visitor(DecodePayloadVisitor(struct_v, iter), payload); + DECODE_FINISH(iter); +} + +void NotifyMessage::dump(Formatter *f) const { + apply_visitor(DumpPayloadVisitor(f), payload); +} + +void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) { + o.push_back(new NotifyMessage(ImageAcquirePayload())); + o.push_back(new NotifyMessage(ImageAcquirePayload(1, "gid"))); + + o.push_back(new NotifyMessage(ImageReleasePayload())); + o.push_back(new NotifyMessage(ImageReleasePayload(1, 
"gid"))); + + o.push_back(new NotifyMessage(PeerImageRemovedPayload())); + o.push_back(new NotifyMessage(PeerImageRemovedPayload(1, "gid", "uuid"))); + + o.push_back(new NotifyMessage(SyncRequestPayload())); + o.push_back(new NotifyMessage(SyncRequestPayload(1, "sync_id"))); + + o.push_back(new NotifyMessage(SyncStartPayload())); + o.push_back(new NotifyMessage(SyncStartPayload(1, "sync_id"))); +} + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op) { + switch (op) { + case NOTIFY_OP_IMAGE_ACQUIRE: + out << "ImageAcquire"; + break; + case NOTIFY_OP_IMAGE_RELEASE: + out << "ImageRelease"; + break; + case NOTIFY_OP_PEER_IMAGE_REMOVED: + out << "PeerImageRemoved"; + break; + case NOTIFY_OP_SYNC_REQUEST: + out << "SyncRequest"; + break; + case NOTIFY_OP_SYNC_START: + out << "SyncStart"; + break; + default: + out << "Unknown (" << static_cast<uint32_t>(op) << ")"; + break; + } + return out; +} + +void NotifyAckPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(instance_id, bl); + encode(request_id, bl); + encode(ret_val, bl); +} + +void NotifyAckPayload::decode(bufferlist::const_iterator &iter) { + using ceph::decode; + decode(instance_id, iter); + decode(request_id, iter); + decode(ret_val, iter); +} + +/* Dump each ack field under its own key so ret_val is not confused with request_id. */ +void NotifyAckPayload::dump(Formatter *f) const { + f->dump_string("instance_id", instance_id); + f->dump_unsigned("request_id", request_id); + f->dump_int("ret_val", ret_val); +} + +} // namespace instance_watcher +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/instance_watcher/Types.h b/src/tools/rbd_mirror/instance_watcher/Types.h new file mode 100644 index 00000000..b0b7b779 --- /dev/null +++ b/src/tools/rbd_mirror/instance_watcher/Types.h @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_INSTANCE_WATCHER_TYPES_H +#define RBD_MIRROR_INSTANCE_WATCHER_TYPES_H + +#include <string> +#include <set> +#include 
<boost/variant.hpp> + +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "include/int_types.h" + +namespace ceph { class Formatter; } + +namespace rbd { +namespace mirror { +namespace instance_watcher { + +enum NotifyOp { + NOTIFY_OP_IMAGE_ACQUIRE = 0, + NOTIFY_OP_IMAGE_RELEASE = 1, + NOTIFY_OP_PEER_IMAGE_REMOVED = 2, + NOTIFY_OP_SYNC_REQUEST = 3, + NOTIFY_OP_SYNC_START = 4 +}; + +struct PayloadBase { + uint64_t request_id; + + PayloadBase() : request_id(0) { + } + + PayloadBase(uint64_t request_id) : request_id(request_id) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct ImagePayloadBase : public PayloadBase { + std::string global_image_id; + + ImagePayloadBase() : PayloadBase() { + } + + ImagePayloadBase(uint64_t request_id, const std::string &global_image_id) + : PayloadBase(request_id), global_image_id(global_image_id) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct ImageAcquirePayload : public ImagePayloadBase { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ACQUIRE; + + ImageAcquirePayload() { + } + ImageAcquirePayload(uint64_t request_id, const std::string &global_image_id) + : ImagePayloadBase(request_id, global_image_id) { + } +}; + +struct ImageReleasePayload : public ImagePayloadBase { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_RELEASE; + + ImageReleasePayload() { + } + ImageReleasePayload(uint64_t request_id, const std::string &global_image_id) + : ImagePayloadBase(request_id, global_image_id) { + } +}; + +struct PeerImageRemovedPayload : public PayloadBase { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_PEER_IMAGE_REMOVED; + + std::string global_image_id; + std::string peer_mirror_uuid; + + PeerImageRemovedPayload() { + } + PeerImageRemovedPayload(uint64_t request_id, + const std::string& global_image_id, + 
const std::string& peer_mirror_uuid) + : PayloadBase(request_id), + global_image_id(global_image_id), peer_mirror_uuid(peer_mirror_uuid) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct SyncPayloadBase : public PayloadBase { + std::string sync_id; + + SyncPayloadBase() : PayloadBase() { + } + + SyncPayloadBase(uint64_t request_id, const std::string &sync_id) + : PayloadBase(request_id), sync_id(sync_id) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct SyncRequestPayload : public SyncPayloadBase { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_SYNC_REQUEST; + + SyncRequestPayload() : SyncPayloadBase() { + } + + SyncRequestPayload(uint64_t request_id, const std::string &sync_id) + : SyncPayloadBase(request_id, sync_id) { + } +}; + +struct SyncStartPayload : public SyncPayloadBase { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_SYNC_START; + + SyncStartPayload() : SyncPayloadBase() { + } + + SyncStartPayload(uint64_t request_id, const std::string &sync_id) + : SyncPayloadBase(request_id, sync_id) { + } +}; + +struct UnknownPayload { + static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1); + + UnknownPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +typedef boost::variant<ImageAcquirePayload, + ImageReleasePayload, + PeerImageRemovedPayload, + SyncRequestPayload, + SyncStartPayload, + UnknownPayload> Payload; + +struct NotifyMessage { + NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) { + } + + Payload payload; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<NotifyMessage *> &o); +}; + 
+WRITE_CLASS_ENCODER(NotifyMessage); + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op); + +struct NotifyAckPayload { /* acknowledgement payload: instance id, request id and return value */ + std::string instance_id; + uint64_t request_id; + int ret_val; + + NotifyAckPayload() : request_id(0), ret_val(0) { + } + + NotifyAckPayload(const std::string &instance_id, uint64_t request_id, + int ret_val) + : instance_id(instance_id), request_id(request_id), ret_val(ret_val) { + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +WRITE_CLASS_ENCODER(NotifyAckPayload); + +} // namespace instance_watcher +} // namespace mirror +} // namespace rbd + +using rbd::mirror::instance_watcher::encode; +using rbd::mirror::instance_watcher::decode; + +#endif // RBD_MIRROR_INSTANCE_WATCHER_TYPES_H diff --git a/src/tools/rbd_mirror/instances/Types.h b/src/tools/rbd_mirror/instances/Types.h new file mode 100644 index 00000000..8b0a68fc --- /dev/null +++ b/src/tools/rbd_mirror/instances/Types.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_INSTANCES_TYPES_H +#define CEPH_RBD_MIRROR_INSTANCES_TYPES_H + +#include <string> +#include <vector> + +namespace rbd { +namespace mirror { +namespace instances { + +struct Listener { + typedef std::vector<std::string> InstanceIds; + + virtual ~Listener() { + } + + virtual void handle_added(const InstanceIds& instance_ids) = 0; + virtual void handle_removed(const InstanceIds& instance_ids) = 0; +}; + +} // namespace instances +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_INSTANCES_TYPES_H diff --git a/src/tools/rbd_mirror/leader_watcher/Types.cc b/src/tools/rbd_mirror/leader_watcher/Types.cc new file mode 100644 index 00000000..d2fb7908 --- /dev/null +++ b/src/tools/rbd_mirror/leader_watcher/Types.cc @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab + +#include "Types.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "common/Formatter.h" + +namespace rbd { +namespace mirror { +namespace leader_watcher { + +namespace { + +class EncodePayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {} + + template <typename Payload> + inline void operator()(const Payload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl); + payload.encode(m_bl); + } + +private: + bufferlist &m_bl; +}; + +class DecodePayloadVisitor : public boost::static_visitor<void> { +public: + DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter) + : m_version(version), m_iter(iter) {} + + template <typename Payload> + inline void operator()(Payload &payload) const { + payload.decode(m_version, m_iter); + } + +private: + __u8 m_version; + bufferlist::const_iterator &m_iter; +}; + +class DumpPayloadVisitor : public boost::static_visitor<void> { +public: + explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {} + + template <typename Payload> + inline void operator()(const Payload &payload) const { + NotifyOp notify_op = Payload::NOTIFY_OP; + m_formatter->dump_string("notify_op", stringify(notify_op)); + payload.dump(m_formatter); + } + +private: + ceph::Formatter *m_formatter; +}; + +} // anonymous namespace + +void HeartbeatPayload::encode(bufferlist &bl) const { +} + +void HeartbeatPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void HeartbeatPayload::dump(Formatter *f) const { +} + +void LockAcquiredPayload::encode(bufferlist &bl) const { +} + +void LockAcquiredPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void LockAcquiredPayload::dump(Formatter *f) const { +} + +void LockReleasedPayload::encode(bufferlist &bl) const { +} + +void LockReleasedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + 
+void LockReleasedPayload::dump(Formatter *f) const { +} + +void UnknownPayload::encode(bufferlist &bl) const { + ceph_abort(); /* encoding an unknown payload aborts */ +} + +void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void UnknownPayload::dump(Formatter *f) const { +} + +void NotifyMessage::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + boost::apply_visitor(EncodePayloadVisitor(bl), payload); + ENCODE_FINISH(bl); +} + +void NotifyMessage::decode(bufferlist::const_iterator& iter) { + DECODE_START(1, iter); + + uint32_t notify_op; + decode(notify_op, iter); + + // select the correct payload variant based upon the encoded op + switch (notify_op) { + case NOTIFY_OP_HEARTBEAT: + payload = HeartbeatPayload(); + break; + case NOTIFY_OP_LOCK_ACQUIRED: + payload = LockAcquiredPayload(); + break; + case NOTIFY_OP_LOCK_RELEASED: + payload = LockReleasedPayload(); + break; + default: + payload = UnknownPayload(); + break; + } + + apply_visitor(DecodePayloadVisitor(struct_v, iter), payload); + DECODE_FINISH(iter); +} + +void NotifyMessage::dump(Formatter *f) const { + apply_visitor(DumpPayloadVisitor(f), payload); +} + +void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) { + o.push_back(new NotifyMessage(HeartbeatPayload())); + o.push_back(new NotifyMessage(LockAcquiredPayload())); + o.push_back(new NotifyMessage(LockReleasedPayload())); +} + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op) { + switch (op) { + case NOTIFY_OP_HEARTBEAT: + out << "Heartbeat"; + break; + case NOTIFY_OP_LOCK_ACQUIRED: + out << "LockAcquired"; + break; + case NOTIFY_OP_LOCK_RELEASED: + out << "LockReleased"; + break; + default: + out << "Unknown (" << static_cast<uint32_t>(op) << ")"; + break; + } + return out; +} + +} // namespace leader_watcher +} // namespace mirror +} // namespace rbd diff --git a/src/tools/rbd_mirror/leader_watcher/Types.h b/src/tools/rbd_mirror/leader_watcher/Types.h new file mode 100644 index 00000000..1278e54b --- 
/dev/null +++ b/src/tools/rbd_mirror/leader_watcher/Types.h @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RBD_MIRROR_LEADER_WATCHER_TYPES_H +#define RBD_MIRROR_LEADER_WATCHER_TYPES_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include <string> +#include <vector> +#include <boost/variant.hpp> + +struct Context; + +namespace ceph { class Formatter; } + +namespace rbd { +namespace mirror { +namespace leader_watcher { + +struct Listener { + typedef std::vector<std::string> InstanceIds; + + virtual ~Listener() { + } + + virtual void post_acquire_handler(Context *on_finish) = 0; + virtual void pre_release_handler(Context *on_finish) = 0; + + virtual void update_leader_handler( + const std::string &leader_instance_id) = 0; + + virtual void handle_instances_added(const InstanceIds& instance_ids) = 0; + virtual void handle_instances_removed(const InstanceIds& instance_ids) = 0; +}; + +enum NotifyOp { + NOTIFY_OP_HEARTBEAT = 0, + NOTIFY_OP_LOCK_ACQUIRED = 1, + NOTIFY_OP_LOCK_RELEASED = 2, +}; + +struct HeartbeatPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_HEARTBEAT; + + HeartbeatPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct LockAcquiredPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_LOCK_ACQUIRED; + + LockAcquiredPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct LockReleasedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_LOCK_RELEASED; + + LockReleasedPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct UnknownPayload { + static const NotifyOp NOTIFY_OP = 
static_cast<NotifyOp>(-1); + + UnknownPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +typedef boost::variant<HeartbeatPayload, + LockAcquiredPayload, + LockReleasedPayload, + UnknownPayload> Payload; + +struct NotifyMessage { /* wraps one Payload variant; defaults to UnknownPayload */ + NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) { + } + + Payload payload; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list<NotifyMessage *> &o); +}; + +WRITE_CLASS_ENCODER(NotifyMessage); + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op); + +} // namespace leader_watcher +} // namespace mirror +} // namespace rbd + +using rbd::mirror::leader_watcher::encode; +using rbd::mirror::leader_watcher::decode; + +#endif // RBD_MIRROR_LEADER_WATCHER_TYPES_H diff --git a/src/tools/rbd_mirror/main.cc b/src/tools/rbd_mirror/main.cc new file mode 100644 index 00000000..ab350a01 --- /dev/null +++ b/src/tools/rbd_mirror/main.cc @@ -0,0 +1,104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "global/global_init.h" +#include "global/signal_handler.h" +#include "Mirror.h" +#include "Types.h" + +#include <vector> + +rbd::mirror::Mirror *mirror = nullptr; +PerfCounters *g_perf_counters = nullptr; + +void usage() { + std::cout << "usage: rbd-mirror [options...]" << std::endl; + std::cout << "options:\n"; + std::cout << " -m monaddress[:port] connect to specified monitor\n"; + std::cout << " --keyring=<path> path to keyring for local cluster\n"; + std::cout << " --log-file=<logfile> file to log debug output\n"; + std::cout << " --debug-rbd-mirror=<log-level>/<memory-level> set 
rbd-mirror debug level\n"; + generic_server_usage(); +} + +static void handle_signal(int signum) +{ + if (mirror) + mirror->handle_signal(signum); +} + +/* Entry point: parses arguments, initialises the global context, signal handlers and perf counters, runs the Mirror and then cleans up. Returns EXIT_FAILURE when Mirror::init() fails and EXIT_SUCCESS otherwise. */ +int main(int argc, const char **argv) +{ + std::vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + + if (g_conf()->daemonize) { + global_init_daemonize(g_ceph_context); + } + + common_init_finish(g_ceph_context); + + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, handle_signal); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + std::vector<const char*> cmd_args; + argv_to_vec(argc, argv, cmd_args); + + // disable unnecessary librbd cache + g_ceph_context->_conf.set_val_or_die("rbd_cache", "false"); + + auto prio = + g_ceph_context->_conf.get_val<int64_t>("rbd_mirror_perf_stats_prio"); + PerfCountersBuilder plb(g_ceph_context, "rbd_mirror", + rbd::mirror::l_rbd_mirror_first, + rbd::mirror::l_rbd_mirror_last); + plb.add_u64_counter(rbd::mirror::l_rbd_mirror_replay, "replay", "Replays", + "r", prio); + plb.add_u64_counter(rbd::mirror::l_rbd_mirror_replay_bytes, "replay_bytes", + "Replayed data", "rb", prio, unit_t(UNIT_BYTES)); + plb.add_time_avg(rbd::mirror::l_rbd_mirror_replay_latency, "replay_latency", + "Replay latency", "rl", prio); + g_perf_counters = plb.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(g_perf_counters); + + mirror = new rbd::mirror::Mirror(g_ceph_context, cmd_args); + int r = mirror->init(); + if (r < 0) { + std::cerr << "failed to initialize: " << cpp_strerror(r) << std::endl; + goto cleanup; + } + + mirror->run(); + + cleanup: + 
unregister_async_signal_handler(SIGHUP, handle_signal); + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); + + g_ceph_context->get_perfcounters_collection()->remove(g_perf_counters); + + delete mirror; + delete g_perf_counters; + + /* a negative r means Mirror::init() failed: report it as a non-zero exit status */ + return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc new file mode 100644 index 00000000..a1d9c1b5 --- /dev/null +++ b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/Utils.h" +#include <map> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd_mirror +#undef dout_prefix +#define dout_prefix *_dout << "rbd::mirror::pool_watcher::RefreshImagesRequest " \ + << this << " " << __func__ << ": " + +namespace rbd { +namespace mirror { +namespace pool_watcher { + +static const uint32_t MAX_RETURN = 1024; + +using librbd::util::create_rados_callback; + +template <typename I> +void RefreshImagesRequest<I>::send() { + m_image_ids->clear(); + mirror_image_list(); +} + +template <typename I> +void RefreshImagesRequest<I>::mirror_image_list() { + dout(10) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_image_list_start(&op, m_start_after, MAX_RETURN); + + m_out_bl.clear(); + librados::AioCompletion *aio_comp = create_rados_callback< + RefreshImagesRequest<I>, + &RefreshImagesRequest<I>::handle_mirror_image_list>(this); + int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void 
RefreshImagesRequest<I>::handle_mirror_image_list(int r) { + dout(10) << "r=" << r << dendl; + + std::map<std::string, std::string> ids; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_image_list_finish(&it, &ids); + } + + if (r < 0 && r != -ENOENT) { + derr << "failed to list mirrored images: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + // store as global -> local image ids + for (auto &id : ids) { + m_image_ids->emplace(id.second, id.first); + } + + if (ids.size() == MAX_RETURN) { + m_start_after = ids.rbegin()->first; + mirror_image_list(); + return; + } + + finish(0); +} + +template <typename I> +void RefreshImagesRequest<I>::finish(int r) { + dout(10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace pool_watcher +} // namespace mirror +} // namespace rbd + +template class rbd::mirror::pool_watcher::RefreshImagesRequest<librbd::ImageCtx>; diff --git a/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h new file mode 100644 index 00000000..8bfeabe2 --- /dev/null +++ b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H +#define CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H + +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "tools/rbd_mirror/Types.h" +#include <string> + +struct Context; + +namespace librbd { struct ImageCtx; } + +namespace rbd { +namespace mirror { +namespace pool_watcher { + +template <typename ImageCtxT = librbd::ImageCtx> +class RefreshImagesRequest { +public: + static RefreshImagesRequest *create(librados::IoCtx &remote_io_ctx, + ImageIds *image_ids, Context *on_finish) { + return new RefreshImagesRequest(remote_io_ctx, image_ids, on_finish); + } + + 
RefreshImagesRequest(librados::IoCtx &remote_io_ctx, ImageIds *image_ids, + Context *on_finish) + : m_remote_io_ctx(remote_io_ctx), m_image_ids(image_ids), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * | /-------------\ + * | | | + * v v | (more images) + * MIRROR_IMAGE_LIST ---/ + * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx &m_remote_io_ctx; + ImageIds *m_image_ids; + Context *m_on_finish; + + bufferlist m_out_bl; + std::string m_start_after; + + void mirror_image_list(); + void handle_mirror_image_list(int r); + + void finish(int r); + +}; + +} // namespace pool_watcher +} // namespace mirror +} // namespace rbd + +extern template class rbd::mirror::pool_watcher::RefreshImagesRequest<librbd::ImageCtx>; + +#endif // CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H diff --git a/src/tools/rbd_mirror/pool_watcher/Types.h b/src/tools/rbd_mirror/pool_watcher/Types.h new file mode 100644 index 00000000..52dfc342 --- /dev/null +++ b/src/tools/rbd_mirror/pool_watcher/Types.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H +#define CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H + +#include "tools/rbd_mirror/Types.h" +#include <string> + +namespace rbd { +namespace mirror { +namespace pool_watcher { + +struct Listener { + virtual ~Listener() { + } + + virtual void handle_update(const std::string &mirror_uuid, + ImageIds &&added_image_ids, + ImageIds &&removed_image_ids) = 0; +}; + +} // namespace pool_watcher +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H diff --git a/src/tools/rbd_mirror/service_daemon/Types.cc b/src/tools/rbd_mirror/service_daemon/Types.cc new file mode 100644 index 00000000..7dc6537c --- /dev/null +++ b/src/tools/rbd_mirror/service_daemon/Types.cc @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tools/rbd_mirror/service_daemon/Types.h" +#include <iostream> + +namespace rbd { +namespace mirror { +namespace service_daemon { + +std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level) { + switch (callout_level) { + case CALLOUT_LEVEL_INFO: + os << "info"; + break; + case CALLOUT_LEVEL_WARNING: + os << "warning"; + break; + case CALLOUT_LEVEL_ERROR: + os << "error"; + break; + } + return os; +} + +} // namespace service_daemon +} // namespace mirror +} // namespace rbd + diff --git a/src/tools/rbd_mirror/service_daemon/Types.h b/src/tools/rbd_mirror/service_daemon/Types.h new file mode 100644 index 00000000..3aab7201 --- /dev/null +++ b/src/tools/rbd_mirror/service_daemon/Types.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H +#define CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H + +#include "include/int_types.h" +#include <iosfwd> +#include <string> +#include <boost/variant.hpp> + +namespace rbd { +namespace mirror { +namespace service_daemon { + +typedef uint64_t CalloutId; +const uint64_t CALLOUT_ID_NONE {0}; + +enum CalloutLevel { + CALLOUT_LEVEL_INFO, + CALLOUT_LEVEL_WARNING, + CALLOUT_LEVEL_ERROR +}; + +std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level); + +typedef boost::variant<bool, uint64_t, std::string> AttributeValue; + +} // namespace service_daemon +} // namespace mirror +} // namespace rbd + +#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H diff --git a/src/tools/rbd_nbd/CMakeLists.txt b/src/tools/rbd_nbd/CMakeLists.txt new file mode 100644 index 00000000..5356fae4 --- /dev/null +++ b/src/tools/rbd_nbd/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(rbd-nbd rbd-nbd.cc) +target_include_directories(rbd-nbd PUBLIC ${GENL_INCLUDE_DIR}) +target_link_libraries(rbd-nbd librbd librados global ${GENL_LIBRARIES}) +install(TARGETS 
rbd-nbd DESTINATION bin) diff --git a/src/tools/rbd_nbd/nbd-netlink.h b/src/tools/rbd_nbd/nbd-netlink.h new file mode 100644 index 00000000..f932f96a --- /dev/null +++ b/src/tools/rbd_nbd/nbd-netlink.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2017 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef _UAPILINUX_NBD_NETLINK_H +#define _UAPILINUX_NBD_NETLINK_H + +#define NBD_GENL_FAMILY_NAME "nbd" +#define NBD_GENL_VERSION 0x1 + +/* Configuration policy attributes, used for CONNECT */ +enum { + NBD_ATTR_UNSPEC, + NBD_ATTR_INDEX, + NBD_ATTR_SIZE_BYTES, + NBD_ATTR_BLOCK_SIZE_BYTES, + NBD_ATTR_TIMEOUT, + NBD_ATTR_SERVER_FLAGS, + NBD_ATTR_CLIENT_FLAGS, + NBD_ATTR_SOCKETS, + __NBD_ATTR_MAX, +}; +#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1) + +/* + * This is the format for multiple sockets with NBD_ATTR_SOCKETS + * + * [NBD_ATTR_SOCKETS] + * [NBD_SOCK_ITEM] + * [NBD_SOCK_FD] + * [NBD_SOCK_ITEM] + * [NBD_SOCK_FD] + */ +enum { + NBD_SOCK_ITEM_UNSPEC, + NBD_SOCK_ITEM, + __NBD_SOCK_ITEM_MAX, +}; +#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1) + +enum { + NBD_SOCK_UNSPEC, + NBD_SOCK_FD, + __NBD_SOCK_MAX, +}; +#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1) + +enum { + NBD_CMD_UNSPEC, + NBD_CMD_CONNECT, + NBD_CMD_DISCONNECT, + NBD_CMD_RECONFIGURE, + __NBD_CMD_MAX, +}; +#define NBD_CMD_MAX (__NBD_CMD_MAX - 1) + +#endif /* 
_UAPILINUX_NBD_NETLINK_H */ diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc new file mode 100644 index 00000000..42dc92ad --- /dev/null +++ b/src/tools/rbd_nbd/rbd-nbd.cc @@ -0,0 +1,1615 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * rbd-nbd - RBD in userspace + * + * Copyright (C) 2015 - 2016 Kylin Corporation + * + * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com> + * Li Wang <li.wang@kylin-cloud.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * +*/ + +#include "include/int_types.h" + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include <linux/nbd.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> + +#include "nbd-netlink.h" +#include <libnl3/netlink/genl/genl.h> +#include <libnl3/netlink/genl/ctrl.h> +#include <libnl3/netlink/genl/mngt.h> + +#include <fstream> +#include <iostream> +#include <memory> +#include <regex> +#include <boost/algorithm/string/predicate.hpp> + +#include "common/Formatter.h" +#include "common/Preforker.h" +#include "common/TextTable.h" +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/module.h" +#include "common/safe_io.h" +#include "common/version.h" + +#include "global/global_init.h" +#include "global/signal_handler.h" + +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "include/stringify.h" +#include "include/xlist.h" + +#include "mon/MonClient.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd-nbd: " + +struct Config { + 
int nbds_max = 0; + int max_part = 255; + int timeout = -1; + + bool exclusive = false; + bool readonly = false; + bool set_max_part = false; + bool try_netlink = false; + + std::string poolname; + std::string nsname; + std::string imgname; + std::string snapname; + std::string devpath; + + std::string format; + bool pretty_format = false; +}; + +static void usage() +{ + std::cout << "Usage: rbd-nbd [options] map <image-or-snap-spec> Map an image to nbd device\n" + << " unmap <device|image-or-snap-spec> Unmap nbd device\n" + << " [options] list-mapped List mapped nbd devices\n" + << "Map options:\n" + << " --device <device path> Specify nbd device path (/dev/nbd{num})\n" + << " --read-only Map read-only\n" + << " --nbds_max <limit> Override for module param nbds_max\n" + << " --max_part <limit> Override for module param max_part\n" + << " --exclusive Forbid writes by other clients\n" + << " --timeout <seconds> Set nbd request timeout\n" + << " --try-netlink Use the nbd netlink interface\n" + << "\n" + << "List options:\n" + << " --format plain|json|xml Output format (default: plain)\n" + << " --pretty-format Pretty formatting (json and xml)\n" + << std::endl; + generic_server_usage(); +} + +static int nbd = -1; +static int nbd_index = -1; + +enum Command { + None, + Connect, + Disconnect, + List +}; + +static Command cmd = None; + +#define RBD_NBD_BLKSIZE 512UL + +#define HELP_INFO 1 +#define VERSION_INFO 2 + +#ifdef CEPH_BIG_ENDIAN +#define ntohll(a) (a) +#elif defined(CEPH_LITTLE_ENDIAN) +#define ntohll(a) swab(a) +#else +#error "Could not determine endianess" +#endif +#define htonll(a) ntohll(a) + +static int parse_args(vector<const char*>& args, std::ostream *err_msg, + Command *command, Config *cfg); +static int netlink_resize(int nbd_index, uint64_t size); + +class NBDServer +{ +private: + int fd; + librbd::Image ℑ + +public: + NBDServer(int _fd, librbd::Image& _image) + : fd(_fd) + , image(_image) + , disconnect_lock("NBDServer::DisconnectLocker") + , 
lock("NBDServer::Locker") + , reader_thread(*this, &NBDServer::reader_entry) + , writer_thread(*this, &NBDServer::writer_entry) + , started(false) + {} + +private: + Mutex disconnect_lock; + Cond disconnect_cond; + std::atomic<bool> terminated = { false }; + + void shutdown() + { + bool expected = false; + if (terminated.compare_exchange_strong(expected, true)) { + ::shutdown(fd, SHUT_RDWR); + + Mutex::Locker l(lock); + cond.Signal(); + } + } + + struct IOContext + { + xlist<IOContext*>::item item; + NBDServer *server = nullptr; + struct nbd_request request; + struct nbd_reply reply; + bufferlist data; + int command = 0; + + IOContext() + : item(this) + {} + }; + + friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx); + + Mutex lock; + Cond cond; + xlist<IOContext*> io_pending; + xlist<IOContext*> io_finished; + + void io_start(IOContext *ctx) + { + Mutex::Locker l(lock); + io_pending.push_back(&ctx->item); + } + + void io_finish(IOContext *ctx) + { + Mutex::Locker l(lock); + ceph_assert(ctx->item.is_on_list()); + ctx->item.remove_myself(); + io_finished.push_back(&ctx->item); + cond.Signal(); + } + + IOContext *wait_io_finish() + { + Mutex::Locker l(lock); + while(io_finished.empty() && !terminated) + cond.Wait(lock); + + if (io_finished.empty()) + return NULL; + + IOContext *ret = io_finished.front(); + io_finished.pop_front(); + + return ret; + } + + void wait_clean() + { + ceph_assert(!reader_thread.is_started()); + Mutex::Locker l(lock); + while(!io_pending.empty()) + cond.Wait(lock); + + while(!io_finished.empty()) { + std::unique_ptr<IOContext> free_ctx(io_finished.front()); + io_finished.pop_front(); + } + } + + static void aio_callback(librbd::completion_t cb, void *arg) + { + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast<librbd::RBD::AioCompletion*>(cb); + + IOContext *ctx = reinterpret_cast<IOContext *>(arg); + int ret = aio_completion->get_return_value(); + + dout(20) << __func__ << ": " << *ctx << dendl; + + if (ret 
== -EINVAL) { + // if shrinking an image, a pagecache writeback might reference + // extents outside of the range of the new image extents + dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl; + ctx->data.clear(); + ret = 0; + } + + if (ret < 0) { + ctx->reply.error = htonl(-ret); + } else if ((ctx->command == NBD_CMD_READ) && + ret < static_cast<int>(ctx->request.len)) { + int pad_byte_count = static_cast<int> (ctx->request.len) - ret; + ctx->data.append_zero(pad_byte_count); + dout(20) << __func__ << ": " << *ctx << ": Pad byte count: " + << pad_byte_count << dendl; + ctx->reply.error = htonl(0); + } else { + ctx->reply.error = htonl(0); + } + ctx->server->io_finish(ctx); + + aio_completion->release(); + } + + void reader_entry() + { + while (!terminated) { + std::unique_ptr<IOContext> ctx(new IOContext()); + ctx->server = this; + + dout(20) << __func__ << ": waiting for nbd request" << dendl; + + int r = safe_read_exact(fd, &ctx->request, sizeof(struct nbd_request)); + if (r < 0) { + derr << "failed to read nbd request header: " << cpp_strerror(r) + << dendl; + goto signal; + } + + if (ctx->request.magic != htonl(NBD_REQUEST_MAGIC)) { + derr << "invalid nbd request header" << dendl; + goto signal; + } + + ctx->request.from = ntohll(ctx->request.from); + ctx->request.type = ntohl(ctx->request.type); + ctx->request.len = ntohl(ctx->request.len); + + ctx->reply.magic = htonl(NBD_REPLY_MAGIC); + memcpy(ctx->reply.handle, ctx->request.handle, sizeof(ctx->reply.handle)); + + ctx->command = ctx->request.type & 0x0000ffff; + + dout(20) << *ctx << ": start" << dendl; + + switch (ctx->command) + { + case NBD_CMD_DISC: + // NBD_DO_IT will return when pipe is closed + dout(0) << "disconnect request received" << dendl; + goto signal; + case NBD_CMD_WRITE: + bufferptr ptr(ctx->request.len); + r = safe_read_exact(fd, ptr.c_str(), ctx->request.len); + if (r < 0) { + derr << *ctx << ": failed to read nbd request data: " + << cpp_strerror(r) << dendl; + goto 
signal; + } + ctx->data.push_back(ptr); + break; + } + + IOContext *pctx = ctx.release(); + io_start(pctx); + librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(pctx, aio_callback); + switch (pctx->command) + { + case NBD_CMD_WRITE: + image.aio_write(pctx->request.from, pctx->request.len, pctx->data, c); + break; + case NBD_CMD_READ: + image.aio_read(pctx->request.from, pctx->request.len, pctx->data, c); + break; + case NBD_CMD_FLUSH: + image.aio_flush(c); + break; + case NBD_CMD_TRIM: + image.aio_discard(pctx->request.from, pctx->request.len, c); + break; + default: + derr << *pctx << ": invalid request command" << dendl; + c->release(); + goto signal; + } + } + dout(20) << __func__ << ": terminated" << dendl; + +signal: + Mutex::Locker l(disconnect_lock); + disconnect_cond.Signal(); + } + + void writer_entry() + { + while (!terminated) { + dout(20) << __func__ << ": waiting for io request" << dendl; + std::unique_ptr<IOContext> ctx(wait_io_finish()); + if (!ctx) { + dout(20) << __func__ << ": no io requests, terminating" << dendl; + return; + } + + dout(20) << __func__ << ": got: " << *ctx << dendl; + + int r = safe_write(fd, &ctx->reply, sizeof(struct nbd_reply)); + if (r < 0) { + derr << *ctx << ": failed to write reply header: " << cpp_strerror(r) + << dendl; + return; + } + if (ctx->command == NBD_CMD_READ && ctx->reply.error == htonl(0)) { + r = ctx->data.write_fd(fd); + if (r < 0) { + derr << *ctx << ": failed to write replay data: " << cpp_strerror(r) + << dendl; + return; + } + } + dout(20) << *ctx << ": finish" << dendl; + } + dout(20) << __func__ << ": terminated" << dendl; + } + + class ThreadHelper : public Thread + { + public: + typedef void (NBDServer::*entry_func)(); + private: + NBDServer &server; + entry_func func; + public: + ThreadHelper(NBDServer &_server, entry_func _func) + :server(_server) + ,func(_func) + {} + protected: + void* entry() override + { + (server.*func)(); + server.shutdown(); + return NULL; + } + } reader_thread, 
writer_thread; + + bool started; +public: + void start() + { + if (!started) { + dout(10) << __func__ << ": starting" << dendl; + + started = true; + + reader_thread.create("rbd_reader"); + writer_thread.create("rbd_writer"); + } + } + + void wait_for_disconnect() + { + if (!started) + return; + + Mutex::Locker l(disconnect_lock); + disconnect_cond.Wait(disconnect_lock); + } + + ~NBDServer() + { + if (started) { + dout(10) << __func__ << ": terminating" << dendl; + + shutdown(); + + reader_thread.join(); + writer_thread.join(); + + wait_clean(); + + started = false; + } + } +}; + +std::ostream &operator<<(std::ostream &os, const NBDServer::IOContext &ctx) { + + os << "[" << std::hex << ntohll(*((uint64_t *)ctx.request.handle)); + + switch (ctx.command) + { + case NBD_CMD_WRITE: + os << " WRITE "; + break; + case NBD_CMD_READ: + os << " READ "; + break; + case NBD_CMD_FLUSH: + os << " FLUSH "; + break; + case NBD_CMD_TRIM: + os << " TRIM "; + break; + default: + os << " UNKNOWN(" << ctx.command << ") "; + break; + } + + os << ctx.request.from << "~" << ctx.request.len << " " + << std::dec << ntohl(ctx.reply.error) << "]"; + + return os; +} + +class NBDWatchCtx : public librbd::UpdateWatchCtx +{ +private: + int fd; + int nbd_index; + bool use_netlink; + librados::IoCtx &io_ctx; + librbd::Image ℑ + unsigned long size; +public: + NBDWatchCtx(int _fd, + int _nbd_index, + bool _use_netlink, + librados::IoCtx &_io_ctx, + librbd::Image &_image, + unsigned long _size) + : fd(_fd) + , nbd_index(_nbd_index) + , use_netlink(_use_netlink) + , io_ctx(_io_ctx) + , image(_image) + , size(_size) + { } + + ~NBDWatchCtx() override {} + + void handle_notify() override + { + librbd::image_info_t info; + if (image.stat(info, sizeof(info)) == 0) { + unsigned long new_size = info.size; + int ret; + + if (new_size != size) { + dout(5) << "resize detected" << dendl; + if (ioctl(fd, BLKFLSBUF, NULL) < 0) + derr << "invalidate page cache failed: " << cpp_strerror(errno) + << dendl; + if 
(use_netlink) { + ret = netlink_resize(nbd_index, new_size); + } else { + ret = ioctl(fd, NBD_SET_SIZE, new_size); + if (ret < 0) + derr << "resize failed: " << cpp_strerror(errno) << dendl; + } + + if (!ret) + size = new_size; + + if (ioctl(fd, BLKRRPART, NULL) < 0) { + derr << "rescan of partition table failed: " << cpp_strerror(errno) + << dendl; + } + if (image.invalidate_cache() < 0) + derr << "invalidate rbd cache failed" << dendl; + } + } + } +}; + +class NBDListIterator { +public: + bool get(int *pid, Config *cfg) { + while (true) { + std::string nbd_path = "/sys/block/nbd" + stringify(m_index); + if(access(nbd_path.c_str(), F_OK) != 0) { + return false; + } + + *cfg = Config(); + cfg->devpath = "/dev/nbd" + stringify(m_index++); + + std::ifstream ifs; + ifs.open(nbd_path + "/pid", std::ifstream::in); + if (!ifs.is_open()) { + continue; + } + ifs >> *pid; + + int r = get_mapped_info(*pid, cfg); + if (r < 0) { + continue; + } + + return true; + } + } + +private: + int m_index = 0; + + int get_mapped_info(int pid, Config *cfg) { + int r; + std::string path = "/proc/" + stringify(pid) + "/cmdline"; + std::ifstream ifs; + std::string cmdline; + std::vector<const char*> args; + + ifs.open(path.c_str(), std::ifstream::in); + if (!ifs.is_open()) + return -1; + ifs >> cmdline; + + for (unsigned i = 0; i < cmdline.size(); i++) { + const char *arg = &cmdline[i]; + if (i == 0) { + if (strcmp(basename(arg) , "rbd-nbd") != 0) { + return -EINVAL; + } + } else { + args.push_back(arg); + } + + while (cmdline[i] != '\0') { + i++; + } + } + + std::ostringstream err_msg; + Command command; + r = parse_args(args, &err_msg, &command, cfg); + if (r < 0) { + return r; + } + + if (command != Connect) { + return -ENOENT; + } + + return 0; + } +}; + +static int load_module(Config *cfg) +{ + ostringstream param; + int ret; + + if (cfg->nbds_max) + param << "nbds_max=" << cfg->nbds_max; + + if (cfg->max_part) + param << " max_part=" << cfg->max_part; + + if (!access("/sys/module/nbd", 
F_OK)) { + if (cfg->nbds_max || cfg->set_max_part) + cerr << "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded" + << std::endl; + return 0; + } + + ret = module_load("nbd", param.str().c_str()); + if (ret < 0) + cerr << "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-ret) + << std::endl; + + return ret; +} + +static int check_device_size(int nbd_index, unsigned long expected_size) +{ + // There are bugs with some older kernel versions that result in an + // overflow for large image sizes. This check is to ensure we are + // not affected. + + unsigned long size = 0; + std::string path = "/sys/block/nbd" + stringify(nbd_index) + "/size"; + std::ifstream ifs; + ifs.open(path.c_str(), std::ifstream::in); + if (!ifs.is_open()) { + cerr << "rbd-nbd: failed to open " << path << std::endl; + return -EINVAL; + } + ifs >> size; + size *= RBD_NBD_BLKSIZE; + + if (size == 0) { + // Newer kernel versions will report real size only after nbd + // connect. Assume this is the case and return success. + return 0; + } + + if (size != expected_size) { + cerr << "rbd-nbd: kernel reported invalid device size (" << size + << ", expected " << expected_size << ")" << std::endl; + return -EINVAL; + } + + return 0; +} + +static int parse_nbd_index(const std::string& devpath) +{ + int index, ret; + + ret = sscanf(devpath.c_str(), "/dev/nbd%d", &index); + if (ret <= 0) { + // mean an early matching failure. But some cases need a negative value. 
+ if (ret == 0) + ret = -EINVAL; + cerr << "rbd-nbd: invalid device path: " << devpath + << " (expected /dev/nbd{num})" << std::endl; + return ret; + } + + return index; +} + +static int try_ioctl_setup(Config *cfg, int fd, uint64_t size, uint64_t flags) +{ + int index = 0, r; + + if (cfg->devpath.empty()) { + char dev[64]; + const char *path = "/sys/module/nbd/parameters/nbds_max"; + int nbds_max = -1; + if (access(path, F_OK) == 0) { + std::ifstream ifs; + ifs.open(path, std::ifstream::in); + if (ifs.is_open()) { + ifs >> nbds_max; + ifs.close(); + } + } + + while (true) { + snprintf(dev, sizeof(dev), "/dev/nbd%d", index); + + nbd = open(dev, O_RDWR); + if (nbd < 0) { + if (nbd == -EPERM && nbds_max != -1 && index < (nbds_max-1)) { + ++index; + continue; + } + r = nbd; + cerr << "rbd-nbd: failed to find unused device" << std::endl; + goto done; + } + + r = ioctl(nbd, NBD_SET_SOCK, fd); + if (r < 0) { + close(nbd); + ++index; + continue; + } + + cfg->devpath = dev; + break; + } + } else { + r = parse_nbd_index(cfg->devpath); + if (r < 0) + goto done; + index = r; + + nbd = open(cfg->devpath.c_str(), O_RDWR); + if (nbd < 0) { + r = nbd; + cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl; + goto done; + } + + r = ioctl(nbd, NBD_SET_SOCK, fd); + if (r < 0) { + r = -errno; + cerr << "rbd-nbd: the device " << cfg->devpath << " is busy" << std::endl; + close(nbd); + goto done; + } + } + + r = ioctl(nbd, NBD_SET_BLKSIZE, RBD_NBD_BLKSIZE); + if (r < 0) { + r = -errno; + goto close_nbd; + } + + r = ioctl(nbd, NBD_SET_SIZE, size); + if (r < 0) { + r = -errno; + goto close_nbd; + } + + ioctl(nbd, NBD_SET_FLAGS, flags); + + if (cfg->timeout >= 0) { + r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)cfg->timeout); + if (r < 0) { + r = -errno; + cerr << "rbd-nbd: failed to set timeout: " << cpp_strerror(r) + << std::endl; + goto close_nbd; + } + } + + dout(10) << "ioctl setup complete for " << cfg->devpath << dendl; + nbd_index = index; + return 0; + 
+close_nbd: + if (r < 0) { + ioctl(nbd, NBD_CLEAR_SOCK); + cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) << std::endl; + } + close(nbd); +done: + return r; +} + +static void netlink_cleanup(struct nl_sock *sock) +{ + if (!sock) + return; + + nl_close(sock); + nl_socket_free(sock); +} + +static struct nl_sock *netlink_init(int *id) +{ + struct nl_sock *sock; + int ret; + + sock = nl_socket_alloc(); + if (!sock) { + cerr << "rbd-nbd: Could not allocate netlink socket." << std::endl; + return NULL; + } + + ret = genl_connect(sock); + if (ret < 0) { + cerr << "rbd-nbd: Could not connect netlink socket. Error " << ret + << std::endl; + goto free_sock; + } + + *id = genl_ctrl_resolve(sock, "nbd"); + if (*id < 0) + // nbd netlink interface not supported. + goto close_sock; + + return sock; + +close_sock: + nl_close(sock); +free_sock: + nl_socket_free(sock); + return NULL; +} + +static int netlink_disconnect(int index) +{ + struct nl_sock *sock; + struct nl_msg *msg; + int ret, nl_id; + + sock = netlink_init(&nl_id); + if (!sock) + // Try ioctl + return 1; + + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL); + + msg = nlmsg_alloc(); + if (!msg) { + cerr << "rbd-nbd: Could not allocate netlink message." << std::endl; + goto free_sock; + } + + if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, + NBD_CMD_DISCONNECT, 0)) { + cerr << "rbd-nbd: Could not setup message." 
<< std::endl; + goto nla_put_failure; + } + + NLA_PUT_U32(msg, NBD_ATTR_INDEX, index); + + ret = nl_send_sync(sock, msg); + netlink_cleanup(sock); + if (ret < 0) { + cerr << "rbd-nbd: netlink disconnect failed: " << nl_geterror(-ret) + << std::endl; + return -EIO; + } + + return 0; + +nla_put_failure: + nlmsg_free(msg); +free_sock: + netlink_cleanup(sock); + return -EIO; +} + +static int netlink_disconnect_by_path(const std::string& devpath) +{ + int index; + + index = parse_nbd_index(devpath); + if (index < 0) + return index; + + return netlink_disconnect(index); +} + +static int netlink_resize(int nbd_index, uint64_t size) +{ + struct nl_sock *sock; + struct nl_msg *msg; + int nl_id, ret; + + sock = netlink_init(&nl_id); + if (!sock) { + cerr << "rbd-nbd: Netlink interface not supported." << std::endl; + return 1; + } + + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL); + + msg = nlmsg_alloc(); + if (!msg) { + cerr << "rbd-nbd: Could not allocate netlink message." << std::endl; + goto free_sock; + } + + if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, + NBD_CMD_RECONFIGURE, 0)) { + cerr << "rbd-nbd: Could not setup message." 
<< std::endl; + goto free_msg; + } + + NLA_PUT_U32(msg, NBD_ATTR_INDEX, nbd_index); + NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size); + + ret = nl_send_sync(sock, msg); + if (ret < 0) { + cerr << "rbd-nbd: netlink resize failed: " << nl_geterror(ret) << std::endl; + goto free_sock; + } + + netlink_cleanup(sock); + dout(10) << "netlink resize complete for nbd" << nbd_index << dendl; + return 0; + +nla_put_failure: +free_msg: + nlmsg_free(msg); +free_sock: + netlink_cleanup(sock); + return -EIO; +} + +static int netlink_connect_cb(struct nl_msg *msg, void *arg) +{ + struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlmsg_hdr(msg)); + Config *cfg = (Config *)arg; + struct nlattr *msg_attr[NBD_ATTR_MAX + 1]; + uint32_t index; + int ret; + + ret = nla_parse(msg_attr, NBD_ATTR_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + if (ret) { + cerr << "rbd-nbd: Unsupported netlink reply" << std::endl; + return -NLE_MSGTYPE_NOSUPPORT; + } + + if (!msg_attr[NBD_ATTR_INDEX]) { + cerr << "rbd-nbd: netlink connect reply missing device index." << std::endl; + return -NLE_MSGTYPE_NOSUPPORT; + } + + index = nla_get_u32(msg_attr[NBD_ATTR_INDEX]); + cfg->devpath = "/dev/nbd" + stringify(index); + nbd_index = index; + + return NL_OK; +} + +static int netlink_connect(Config *cfg, struct nl_sock *sock, int nl_id, int fd, + uint64_t size, uint64_t flags) +{ + struct nlattr *sock_attr; + struct nlattr *sock_opt; + struct nl_msg *msg; + int ret; + + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, netlink_connect_cb, cfg); + + msg = nlmsg_alloc(); + if (!msg) { + cerr << "rbd-nbd: Could not allocate netlink message." << std::endl; + return -ENOMEM; + } + + if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, NBD_CMD_CONNECT, + 0)) { + cerr << "rbd-nbd: Could not setup message." 
<< std::endl; + goto free_msg; + } + + if (!cfg->devpath.empty()) { + ret = parse_nbd_index(cfg->devpath); + if (ret < 0) + goto free_msg; + + NLA_PUT_U32(msg, NBD_ATTR_INDEX, ret); + } + + if (cfg->timeout >= 0) + NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, cfg->timeout); + + NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size); + NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, RBD_NBD_BLKSIZE); + NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, flags); + + sock_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS); + if (!sock_attr) { + cerr << "rbd-nbd: Could not init sockets in netlink message." << std::endl; + goto free_msg; + } + + sock_opt = nla_nest_start(msg, NBD_SOCK_ITEM); + if (!sock_opt) { + cerr << "rbd-nbd: Could not init sock in netlink message." << std::endl; + goto free_msg; + } + + NLA_PUT_U32(msg, NBD_SOCK_FD, fd); + nla_nest_end(msg, sock_opt); + nla_nest_end(msg, sock_attr); + + ret = nl_send_sync(sock, msg); + if (ret < 0) { + cerr << "rbd-nbd: netlink connect failed: " << nl_geterror(ret) + << std::endl; + return -EIO; + } + + dout(10) << "netlink connect complete for " << cfg->devpath << dendl; + return 0; + +nla_put_failure: +free_msg: + nlmsg_free(msg); + return -EIO; +} + +static int try_netlink_setup(Config *cfg, int fd, uint64_t size, uint64_t flags) +{ + struct nl_sock *sock; + int nl_id, ret; + + sock = netlink_init(&nl_id); + if (!sock) { + cerr << "rbd-nbd: Netlink interface not supported. Using ioctl interface." + << std::endl; + return 1; + } + + dout(10) << "netlink interface supported." 
<< dendl; + + ret = netlink_connect(cfg, sock, nl_id, fd, size, flags); + netlink_cleanup(sock); + + if (ret != 0) + return ret; + + nbd = open(cfg->devpath.c_str(), O_RDWR); + if (nbd < 0) { + cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl; + return nbd; + } + + return 0; +} + +static void handle_signal(int signum) +{ + int ret; + + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + + if (nbd < 0 || nbd_index < 0) { + dout(20) << __func__ << ": " << "disconnect not needed." << dendl; + return; + } + + dout(20) << __func__ << ": " << "sending NBD_DISCONNECT" << dendl; + ret = netlink_disconnect(nbd_index); + if (ret == 1) + ret = ioctl(nbd, NBD_DISCONNECT); + + if (ret != 0) { + derr << "rbd-nbd: disconnect failed. Error: " << ret << dendl; + } else { + dout(20) << __func__ << ": " << "disconnected" << dendl; + } +} + +static NBDServer *start_server(int fd, librbd::Image& image) +{ + NBDServer *server; + + server = new NBDServer(fd, image); + server->start(); + + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + return server; +} + +static void run_server(Preforker& forker, NBDServer *server, bool netlink_used) +{ + if (g_conf()->daemonize) { + global_init_postfork_finish(g_ceph_context); + forker.daemonize(); + } + + if (netlink_used) + server->wait_for_disconnect(); + else + ioctl(nbd, NBD_DO_IT); + + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); +} + +static int do_map(int argc, const char *argv[], Config *cfg) +{ + int r; + + librados::Rados rados; + librbd::RBD rbd; + librados::IoCtx io_ctx; + librbd::Image image; + + int read_only = 0; + 
unsigned long flags; + unsigned long size; + bool use_netlink; + + int fd[2]; + + librbd::image_info_t info; + + Preforker forker; + NBDServer *server; + + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + g_ceph_context->_conf.set_val_or_die("pid_file", ""); + + if (global_init_prefork(g_ceph_context) >= 0) { + std::string err; + r = forker.prefork(err); + if (r < 0) { + cerr << err << std::endl; + return r; + } + if (forker.is_parent()) { + if (forker.parent_wait(err) != 0) { + return -ENXIO; + } + return 0; + } + global_init_postfork_start(g_ceph_context); + } + + common_init_finish(g_ceph_context); + global_init_chdir(g_ceph_context); + + if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1) { + r = -errno; + goto close_ret; + } + + r = rados.init_with_context(g_ceph_context); + if (r < 0) + goto close_fd; + + r = rados.connect(); + if (r < 0) + goto close_fd; + + r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx); + if (r < 0) + goto close_fd; + + io_ctx.set_namespace(cfg->nsname); + + r = rbd.open(io_ctx, image, cfg->imgname.c_str()); + if (r < 0) + goto close_fd; + + if (cfg->exclusive) { + r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE); + if (r < 0) { + cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r) + << std::endl; + goto close_fd; + } + } + + if (!cfg->snapname.empty()) { + r = image.snap_set(cfg->snapname.c_str()); + if (r < 0) + goto close_fd; + } + + r = image.stat(info, sizeof(info)); + if (r < 0) + goto close_fd; + + flags = NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_TRIM | NBD_FLAG_HAS_FLAGS; + if (!cfg->snapname.empty() || cfg->readonly) { + flags |= NBD_FLAG_READ_ONLY; + read_only = 1; + } + + if (info.size > ULONG_MAX) { + 
r = -EFBIG; + cerr << "rbd-nbd: image is too large (" << byte_u_t(info.size) + << ", max is " << byte_u_t(ULONG_MAX) << ")" << std::endl; + goto close_fd; + } + + size = info.size; + + r = load_module(cfg); + if (r < 0) + goto close_fd; + + server = start_server(fd[1], image); + + use_netlink = cfg->try_netlink; + if (use_netlink) { + r = try_netlink_setup(cfg, fd[0], size, flags); + if (r < 0) { + goto free_server; + } else if (r == 1) { + use_netlink = false; + } + } + + if (!use_netlink) { + r = try_ioctl_setup(cfg, fd[0], size, flags); + if (r < 0) + goto free_server; + } + + r = check_device_size(nbd_index, size); + if (r < 0) + goto close_nbd; + + r = ioctl(nbd, BLKROSET, (unsigned long) &read_only); + if (r < 0) { + r = -errno; + goto close_nbd; + } + + { + uint64_t handle; + + NBDWatchCtx watch_ctx(nbd, nbd_index, use_netlink, io_ctx, image, + info.size); + r = image.update_watch(&watch_ctx, &handle); + if (r < 0) + goto close_nbd; + + cout << cfg->devpath << std::endl; + + run_server(forker, server, use_netlink); + + r = image.update_unwatch(handle); + ceph_assert(r == 0); + } + +close_nbd: + if (r < 0) { + if (use_netlink) { + netlink_disconnect(nbd_index); + } else { + ioctl(nbd, NBD_CLEAR_SOCK); + cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) + << std::endl; + } + } + close(nbd); +free_server: + delete server; +close_fd: + close(fd[0]); + close(fd[1]); +close_ret: + image.close(); + io_ctx.close(); + rados.shutdown(); + + forker.exit(r < 0 ? EXIT_FAILURE : 0); + // Unreachable; + return r; +} + +static int do_unmap(Config *cfg) +{ + int r, nbd; + + /* + * The netlink disconnect call supports devices setup with netlink or ioctl, + * so we always try that first. 
+ */ + r = netlink_disconnect_by_path(cfg->devpath); + if (r != 1) + return r; + + nbd = open(cfg->devpath.c_str(), O_RDWR); + if (nbd < 0) { + cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl; + return nbd; + } + + r = ioctl(nbd, NBD_DISCONNECT); + if (r < 0) { + cerr << "rbd-nbd: the device is not used" << std::endl; + } + + close(nbd); + return r; +} + +static int parse_imgpath(const std::string &imgpath, Config *cfg, + std::ostream *err_msg) { + std::regex pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$"); + std::smatch match; + if (!std::regex_match(imgpath, match, pattern)) { + std::cerr << "rbd-nbd: invalid spec '" << imgpath << "'" << std::endl; + return -EINVAL; + } + + if (match[1].matched) { + cfg->poolname = match[1]; + } + + if (match[2].matched) { + cfg->nsname = match[2]; + } + + cfg->imgname = match[3]; + + if (match[4].matched) + cfg->snapname = match[4]; + + return 0; +} + +static int do_list_mapped_devices(const std::string &format, bool pretty_format) +{ + bool should_print = false; + std::unique_ptr<ceph::Formatter> f; + TextTable tbl; + + if (format == "json") { + f.reset(new JSONFormatter(pretty_format)); + } else if (format == "xml") { + f.reset(new XMLFormatter(pretty_format)); + } else if (!format.empty() && format != "plain") { + std::cerr << "rbd-nbd: invalid output format: " << format << std::endl; + return -EINVAL; + } + + if (f) { + f->open_array_section("devices"); + } else { + tbl.define_column("id", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("image", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("device", TextTable::LEFT, TextTable::LEFT); + } + + int pid; + Config cfg; + NBDListIterator it; + while (it.get(&pid, &cfg)) { + if (f) { + f->open_object_section("device"); + 
f->dump_int("id", pid); + f->dump_string("pool", cfg.poolname); + f->dump_string("namespace", cfg.nsname); + f->dump_string("image", cfg.imgname); + f->dump_string("snap", cfg.snapname); + f->dump_string("device", cfg.devpath); + f->close_section(); + } else { + should_print = true; + if (cfg.snapname.empty()) { + cfg.snapname = "-"; + } + tbl << pid << cfg.poolname << cfg.nsname << cfg.imgname << cfg.snapname + << cfg.devpath << TextTable::endrow; + } + } + + if (f) { + f->close_section(); // devices + f->flush(std::cout); + } + if (should_print) { + std::cout << tbl; + } + return 0; +} + +static bool find_mapped_dev_by_spec(Config *cfg) { + int pid; + Config c; + NBDListIterator it; + while (it.get(&pid, &c)) { + if (c.poolname == cfg->poolname && c.nsname == cfg->nsname && + c.imgname == cfg->imgname && c.snapname == cfg->snapname) { + *cfg = c; + return true; + } + } + return false; +} + + +static int parse_args(vector<const char*>& args, std::ostream *err_msg, + Command *command, Config *cfg) { + std::string conf_file_list; + std::string cluster; + CephInitParameters iparams = ceph_argparse_early_args( + args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list); + + ConfigProxy config{false}; + config->name = iparams.name; + config->cluster = cluster; + + if (!conf_file_list.empty()) { + config.parse_config_files(conf_file_list.c_str(), nullptr, 0); + } else { + config.parse_config_files(nullptr, nullptr, 0); + } + config.parse_env(CEPH_ENTITY_TYPE_CLIENT); + config.parse_argv(args); + cfg->poolname = config.get_val<std::string>("rbd_default_pool"); + + std::vector<const char*>::iterator i; + std::ostringstream err; + + for (i = args.begin(); i != args.end(); ) { + if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + return HELP_INFO; + } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) { + return VERSION_INFO; + } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) { + } else if 
(ceph_argparse_witharg(args, i, &cfg->nbds_max, err, "--nbds_max", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->nbds_max < 0) { + *err_msg << "rbd-nbd: Invalid argument for nbds_max!"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &cfg->max_part, err, "--max_part", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if ((cfg->max_part < 0) || (cfg->max_part > 255)) { + *err_msg << "rbd-nbd: Invalid argument for max_part(0~255)!"; + return -EINVAL; + } + cfg->set_max_part = true; + } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) { + cfg->readonly = true; + } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) { + cfg->exclusive = true; + } else if (ceph_argparse_witharg(args, i, &cfg->timeout, err, "--timeout", + (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->timeout < 0) { + *err_msg << "rbd-nbd: Invalid argument for timeout!"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &cfg->format, err, "--format", + (char *)NULL)) { + } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) { + cfg->pretty_format = true; + } else if (ceph_argparse_flag(args, i, "--try-netlink", (char *)NULL)) { + cfg->try_netlink = true; + } else { + ++i; + } + } + + Command cmd = None; + if (args.begin() != args.end()) { + if (strcmp(*args.begin(), "map") == 0) { + cmd = Connect; + } else if (strcmp(*args.begin(), "unmap") == 0) { + cmd = Disconnect; + } else if (strcmp(*args.begin(), "list-mapped") == 0) { + cmd = List; + } else { + *err_msg << "rbd-nbd: unknown command: " << *args.begin(); + return -EINVAL; + } + args.erase(args.begin()); + } + + if (cmd == None) { + *err_msg << "rbd-nbd: must specify command"; + return -EINVAL; + } + + switch (cmd) { + case Connect: + if (args.begin() == args.end()) { 
+ *err_msg << "rbd-nbd: must specify image-or-snap-spec"; + return -EINVAL; + } + if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) { + return -EINVAL; + } + args.erase(args.begin()); + break; + case Disconnect: + if (args.begin() == args.end()) { + *err_msg << "rbd-nbd: must specify nbd device or image-or-snap-spec"; + return -EINVAL; + } + if (boost::starts_with(*args.begin(), "/dev/")) { + cfg->devpath = *args.begin(); + } else { + if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) { + return -EINVAL; + } + if (!find_mapped_dev_by_spec(cfg)) { + *err_msg << "rbd-nbd: " << *args.begin() << " is not mapped"; + return -ENOENT; + } + } + args.erase(args.begin()); + break; + default: + //shut up gcc; + break; + } + + if (args.begin() != args.end()) { + *err_msg << "rbd-nbd: unknown args: " << *args.begin(); + return -EINVAL; + } + + *command = cmd; + return 0; +} + +static int rbd_nbd(int argc, const char *argv[]) +{ + int r; + Config cfg; + vector<const char*> args; + argv_to_vec(argc, argv, args); + + std::ostringstream err_msg; + r = parse_args(args, &err_msg, &cmd, &cfg); + if (r == HELP_INFO) { + usage(); + return 0; + } else if (r == VERSION_INFO) { + std::cout << pretty_version_to_str() << std::endl; + return 0; + } else if (r < 0) { + cerr << err_msg.str() << std::endl; + return r; + } + + switch (cmd) { + case Connect: + if (cfg.imgname.empty()) { + cerr << "rbd-nbd: image name was not specified" << std::endl; + return -EINVAL; + } + + r = do_map(argc, argv, &cfg); + if (r < 0) + return -EINVAL; + break; + case Disconnect: + r = do_unmap(&cfg); + if (r < 0) + return -EINVAL; + break; + case List: + r = do_list_mapped_devices(cfg.format, cfg.pretty_format); + if (r < 0) + return -EINVAL; + break; + default: + usage(); + break; + } + + return 0; +} + +int main(int argc, const char *argv[]) +{ + int r = rbd_nbd(argc, argv); + if (r < 0) { + return EXIT_FAILURE; + } + return 0; +} diff --git a/src/tools/rbd_recover_tool/FAQ b/src/tools/rbd_recover_tool/FAQ 
new file mode 100644 index 00000000..1655e853 --- /dev/null +++ b/src/tools/rbd_recover_tool/FAQ @@ -0,0 +1,16 @@ +# author: min chen(minchen@ubuntukylin.com) 2014 2015 + +1. error "get_image_metadata_v2: no meta_header_seq input" +cause: + the database is out of date; refresh the database +solution: + ./rbd-recover-tool database + +2. Error initializing leveldb: IO error: lock /var/lib/ceph/osd/ceph-0/current/omap/LOCK: Resource temporarily unavailable + ERROR: error flushing journal /var/lib/ceph/osd/ceph-0/journal for object store /var/lib/ceph/osd/ceph-0: (1) Operation not permitted +cause: + when ./rbd-recover-tool database is interrupted after its commands have already been sent to each osd node, a process on the node is still reading leveldb and keeps it LOCKED. + if ./rbd-recover-tool database is run again, all commands are sent to the osd nodes again while the previous process still holds the leveldb lock, so all the new commands + fail. +solution: + wait until all previous commands have finished. diff --git a/src/tools/rbd_recover_tool/README b/src/tools/rbd_recover_tool/README new file mode 100644 index 00000000..d289c11c --- /dev/null +++ b/src/tools/rbd_recover_tool/README @@ -0,0 +1,97 @@ +# author: Min chen(minchen@ubuntukylin.com) 2014 2015 + +------------- ceph rbd recover tool ------------- + + ceph rbd recover tool is used for recovering ceph rbd images when all ceph services are killed. +it is based on ceph-0.80.x (Firefly and newer) + currently, ceph services (ceph-mon, ceph-osd) may become unavailable because of bugs or something else +, especially on a large scale ceph cluster, so that the ceph cluster can not supply service +and rbd images can not be accessed. In this case, a tool to recover rbd images is necessary. + ceph rbd recover tool is meant for exactly this: it can collect all objects of an image from distributed +osd nodes with the latest pg epoch, and splice the objects by offset into a complete image. To make sure +object data is complete, this tool flushes the osd journal on each osd node before recovering. 
+ but, there are some limitations: +-need ssh service and unobstructed network +-osd data must be accessible on local disk +-clone image is not supported, while snapshot is supported +-only replicated pools are supported + +before you run this tool, you should make sure that: +1). all processes (ceph-osd, ceph-mon, ceph-mds) are shut down +2). ssh daemon is running & network is ok (ssh to each node without password) +3). ceph-kvstore-tool is installed(for ubuntu: apt-get install ceph-test) +4). osd disk is not crashed and data can be accessed on local filesystem + +-architecture: + + +---- osd.0 + | +admin_node -----------+---- osd.1 + | + +---- osd.2 + | + ...... + +-files: +admin_node: {rbd-recover-tool common_h epoch_h metadata_h database_h} +osd: {osd_job common_h epoch_h metadata_h} #/var/rbd_tool/osd_job +in this architecture, admin_node acts as client, osds act as server. +so, they run different files: +on admin_node run: rbd-recover-tool <action> [<parameters>] +on osd node run: ./osd_job <function> <parameters> +admin_node will copy files: osd_job, common_h, epoch_h, metadata_h to remote osd node + + +-config file +before you run this tool, make sure to write the config files first +osd_host_path: osd hostnames and osd data path #user input + osdhost0 /var/lib/ceph/osd/ceph-0 + osdhost1 /var/lib/ceph/osd/ceph-1 + ...... +mon_host: all mon node hostname #user input + monhost0 + monhost1 + ...... +mds_host: all mds node hostname #user input + mdshost0 + mdshost1 + ...... +then, init_env_admin function will create file: osd_host +osd_host: all osd node hostname #generated by admin_job, users can ignore it + osdhost0 + osdhost1 + ...... 
+ + +-usage: +rbd-recover-tool <operation> +<operation> : +database #generating offline database: hobject path, node hostname, pg_epoch and image metadata +list #list all images from offline database +lookup <pool_id>/<image_name>[@[<snap_name>]] #lookup image metadata in offline database +recover <pool_id>/<image_name>[@[<snap_name>]] [/path/to/store/image] #recover image data according to image metadata + +-steps: +1. stop all ceph services: ceph-mon, ceph-osd, ceph-mds +2. setup config files: osd_host_path, mon_host, mds_host +3. rbd-recover-tool database # wait a long time +4. rbd-recover-tool list +5. rbd-recover-tool recover <pool_id>/<image_name>[@[<snap_name>]] [/path/to/store/image] + + +-debug & error check +if an admin_node operation fails, you can check it on the osd node +cd /var/rbd_tool/osd_job +./osd_job <operation> +<operation> : +do_image_id <image_id_hobject> #get image id of image format v2 +do_image_id <image_header_hobject> #get image id of image format v1 +do_image_metadata_v1 <image_header_hobject> #get image metadata of image format v1, maybe pg epoch is not latest +do_image_metadata_v2 <image_header_hobject> #get image metadata of image format v2, maybe pg epoch is not latest +do_image_list #get all images on this osd(image head hobject) +do_pg_epoch #get all pg epoch and store it in /var/rbd_tool/single_node/node_pg_epoch +do_omap_list #list all omap headers and omap entries on this osd + + +-FAQ +file FAQ lists some common confusing cases while testing diff --git a/src/tools/rbd_recover_tool/TODO b/src/tools/rbd_recover_tool/TODO new file mode 100644 index 00000000..c36d4c94 --- /dev/null +++ b/src/tools/rbd_recover_tool/TODO @@ -0,0 +1,2 @@ + +1.support clone image diff --git a/src/tools/rbd_recover_tool/common_h b/src/tools/rbd_recover_tool/common_h new file mode 100644 index 00000000..f2df662a --- /dev/null +++ b/src/tools/rbd_recover_tool/common_h @@ -0,0 +1,412 @@ +#!/usr/bin/env bash +# file: common_h +# +# Copyright (C) 2015 Ubuntu 
Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +my_dir=$(dirname "$0") + +# admin node init path +rbd_image=/var/rbd_tool/rbd_image +database=$rbd_image/database +image_coll_v1=$rbd_image/image_coll_v1 +image_coll_v2=$rbd_image/image_coll_v2 +pg_coll=$rbd_image/pg_coll +images=$rbd_image/images +images_meta=$rbd_image/images_meta +default_backup_dir=/var/rbd_tool/default_backup_dir + +# admin node: image snap & nosnap +nosnap= #$rbd_image/<image_name>/nosnap +snap= #rbd_image/<image_name>/<snap_name> + +# osd node init path +job_path=/var/rbd_tool/osd_job +single_node=/var/rbd_tool/single_node + +# osd node vars +osd_env= #single_node/$cluster$id/osd_env +osd_data= #/var/lib/ceph/osd/$cluster-$id +omap_path= #$osd_data/current/omap +image_list_v1= #single_node/$cluster-$id/image_list_v1 +image_list_v2= #single_node/$cluster-$id/image_list_v2 +image_v1= #$single_node/$cluster-$id/image_v1 +image_v2= #$single_node/$cluster-$id/image_v2 +pgid_list= #$single_node/$cluster-$id/pgid_list +node_pg_epoch= #$single_node/$cluster-$id/node_pg_epoch +omap_list= #$single_node/$cluster-$id/omap_list + +# admin node config file +osd_host_path=$my_dir/config/osd_host_path +osd_host_mapping= #$pwd_path/config/osd_host_mapping # host --> host_remote: by init_env_admin() +osd_host=$my_dir/config/osd_host #generated by function init_env_admin() +mon_host=$my_dir/config/mon_host +mds_host=$my_dir/config/mds_host + +# ssh option +ssh_option="-o ConnectTimeout=1" + +# gen md5sum +function 
gen_md5() +{ + echo $1|md5sum|awk '{print $1}' +} + +# on each osd node +# check ceph environment: ssh, ceph-kvstore-tool, osd_data_path +function check_ceph_env() +{ + local func="check_ceph_env" + if [ $# -lt 2 ];then + echo "$func: parameters: <node> <data_path>" + exit + fi + local node=$1 + local data_path=$2 + local res= + local cmd= + + trap 'echo [$node]: ssh failed; exit' INT HUP + ssh -o ConnectTimeout=1 $node "echo -n" </dev/null + res=$? + if [ $res -ne 0 ];then + echo "[$node]: ssh failed" + exit + fi + + cmd=ceph-kvstore-tool + trap 'echo [$node]: $cmd failed; exit' INT HUP + ssh -o ConnectTimeout=1 $node "$cmd &>/dev/null;" </dev/null + res=$? + # ceph-kvstore-tool will return 1 with no parameters input + if [ $res -ne 1 ];then + echo "[$node]: $cmd not installed" + exit + fi + + trap 'echo [$node]: stat $data_path failed; exit' INT HUP + ssh -o ConnectTimeout=1 $node "stat $data_path &>/dev/null;" </dev/null + res=$? + if [ $res -ne 0 ];then + echo "[$node]: $data_path not exists" + exit + fi +} + +# osd node context : osd_data_path +function init_env_osd() +{ + local func="init_env_osd" + if [ "$1"x = ""x ];then + echo "$func: no osd_data_path input" + exit + fi + osd_data=$1 + omap_path=$osd_data/current/omap + + if [ ! -e $single_node ];then + mkdir -p $single_node + fi + + local osd_id=`gen_md5 $osd_data` + local osd_dir=$single_node/$osd_id + + if [ ! -e $osd_dir ];then + mkdir -p $osd_dir + fi + + image_list_v1=$osd_dir/image_list_v1 + image_list_v2=$osd_dir/image_list_v2 + image_v1=$osd_dir/image_v1 + image_v2=$osd_dir/image_v2 + pgid_list=$osd_dir/pgid_list + node_pg_epoch=$osd_dir/node_pg_epoch + omap_list=$osd_dir/omap_list +} + +# admin node process file: osd_host_path +function init_env_admin() +{ + local func="init_env_admin" + local pwd_path=`pwd` + osd_host_mapping=$pwd_path/config/osd_host_mapping + if [ ! -s $osd_host_path ];then + echo "$func: config/osd_host_path not exists or empty" + exit + fi + if [ ! 
-e $rbd_image ];then + mkdir -p $rbd_image + fi + if [ ! -e $images ];then + mkdir -p $images + fi + + if [ ! -s $mon_host ];then + echo "$func: config/mon_host not exists or empty" + exit + fi + if [ ! -e $mds_host ];then + echo "$func: config/mds_host not exists" + exit + fi + + # we just judge if osd_host is needed to be updated + if [ -s $osd_host ] && [ $osd_host -nt $osd_host_path ];then + return + fi + echo "$func: create osd_host ..." + # create file: osd_host and osd_host_mapping + >$osd_host + >$osd_host_mapping + local lines=0 + local lineno=0 + while read line + do + lineno=$(($lineno + 1)) + if [ "$line"x = ""x ];then + continue; + fi + local node=`echo $line|awk '{print $1}'` + if [ "$node"x = ""x ];then + echo "$func: osd_host_path : line $lineno: osd hostname not input" + rm -rf $osd_host $osd_host_mapping + exit + fi + local data_path=`echo $line|awk '{print $2}'` + if [ "$data_path"x = ""x ];then + echo "$func: osd_host_path : line $lineno: osd data_path not input" + rm -rf $osd_host $osd_host_mapping + exit + fi + lines=$(($lines + 1)) + # in case : there are servral hostnames on the same node + # just need output of `hostname` + local hostname_alias= + hostname_alias=`ssh $ssh_option $node "hostname" 2>/dev/null </dev/null` + if [ "$hostname_alias"x = ""x ];then + echo "$func: osd_host_path: line $lineno: $node: get remote hostname alias failed" + rm -rf $osd_host $osd_host_mapping + exit + fi + echo "$node $hostname_alias" >>$osd_host_mapping + echo $node >> $osd_host + # check ceph env on remote osd + check_ceph_env $node $data_path + done < $osd_host_path + + if [ $lines = 0 ];then + echo "$func: no osd host path valid" + exit + fi +} + +function admin_parse_osd() +{ + local func="admin_parse_osd" + if [ -s $osd_host ];then + return + fi + # create file: osd_host + >$osd_host + local lines=0 + local lineno=0 + while read line + do + lineno=$(($lineno + 1)) + if [ "$line"x = ""x ];then + continue; + fi + local node=`echo $line|awk '{print 
$1}'`
+    if [ "$node"x = ""x ];then
+      echo "$func: osd_host_path : line $lineno: osd_host not input"
+      exit
+    fi
+    local data_path=`echo $line|awk '{print $2}'`
+    if [ "$data_path"x = ""x ];then
+      echo "$func: osd_host_path : line $lineno: osd_data not input"
+      exit
+    fi
+    lines=$(($lines + 1))
+    echo $node >> $osd_host
+  done < $osd_host_path
+}
+
+# for osd node
+# write the key listing of the omap store at $omap_path into $omap_list
+function get_omap_list()
+{
+  ceph-kvstore-tool $omap_path list > $omap_list
+}
+
+# print $1 with every '_' replaced by '\u'; prints nothing for empty input
+function convert_underline()
+{
+  if [ "$1"x = ""x ];then
+    return
+  fi
+
+  echo $1|sed -e 's/_/\\u/gp'|head -n 1
+}
+
+# print the arguments with every backslash doubled
+function dump_backslash()
+{
+  echo $*|sed -e 's/\\/\\\\/gp'|head -n 1
+}
+
+# print the arguments with every backslash replaced by four backslashes
+function dump_dump_backslash()
+{
+  echo $*|sed -e 's/\\/\\\\\\\\/gp'|head -n 1
+}
+
+# print $1 with '%' -> '%p', '_' -> '\u' and '.' -> '%e'; prints nothing for
+# empty input.
+# '%' is escaped first so the '%' introduced by the '.' rule is not escaped a
+# second time.  The former "s/../gp ... | head -n 1" form printed the pattern
+# space after each matching expression and kept only the first of those
+# lines, so any substitution after the first matching one was lost.
+# NOTE(review): assumes the three escapes are meant to be independent
+# per-character replacements -- confirm against the callers.
+function char_convert()
+{
+  if [ "$1"x = ""x ];then
+    return
+  fi
+
+  echo $1|sed -e 's/%/%p/g' -e 's/_/\\u/g' -e 's/\./%e/g'
+}
+
+# list ceph-osd processes on host $1 over ssh and exit the script if any is
+# still running (or if no host was given)
+function check_osd_process()
+{
+  local func="check_osd_process"
+  local host=$1
+  if [ "$1"x = ""x ];then
+    exit
+  fi
+  local cmds="ps aux|grep ceph-osd|grep -v grep"
+  local ret=/tmp/ret.$$$$
+  ssh $ssh_option $host $cmds |tee $ret
+  if [ -s $ret ];then
+    echo "$func: [$host] ceph-osd process is not killed"
+    # remove the scratch file on this path too; it used to be left behind
+    rm -f $ret
+    exit
+  fi
+  rm -f $ret
+}
+
+# print the fixed prefix string "_HOBJTOSEQ_"
+function get_map_header_prefix()
+{
+  echo "_HOBJTOSEQ_"
+}
+
+# look keyword $1 up in $omap_list and print the second ':'-separated field of
+# the match; exits silently when the keyword is empty or not found
+function get_map_header_key()
+{
+  local func="get_map_header_key"
+  if [ "$1"x = ""x ];then
+    #echo $func': no keyword input'
+    exit
+  fi
+  local keyword=$1
+  local res=`cat $omap_list| grep $keyword`
+  if [ "$res"x = ""x ];then
+    #echo "$func: map_header_key = $keyword not exists"
+    exit
+  fi
+  echo $res|awk -F ":" '{print $2}'
+}
+
+function get_header_seq()
+{
+  local func="get_header_seq"
+  if [ "$1"x == ""x ];then
+    #echo "$func: no prefix input"
+    exit;
+  elif [ "$2"x == ""x ];then
+    #echo "$func: no key input"
+    exit;
+  fi
+  local prefix=$1;
+  local key=$2;
+  local res=/tmp/header_seq.$$$$
+
+  ceph-kvstore-tool $omap_path get $prefix $key 2>/dev/null 1>$res
+  if [ $? 
!= 0 ]; then + #echo "$func: <$prefix , $key> not exists" ; + exit; + fi + + # ceph-kvstore-tool get result like this: + # 02 01 7e 00 00 00 12 44 00 00 00 00 00 00 00 00 + # get header seq bytes: + # 12 44 00 00 00 00 00 00 + # -> 00 00 00 00 00 00 44 12 + # echo $((16#0000000000004412)) -> 17426 == header_seq + local seq=`cat $res |head -n 2|tail -n 1| \ + awk ' + BEGIN { + FS=":" + seq=""; + i=7; + } { + split($2, arr, " ") + # header_seq uint64 : 8 bytes + for (x=7; x>=0; --x) { + seq=seq""arr[i+x]; + } + } + END { + print seq + }'` + if [ "$seq"x = ""x ];then + #echo "$func: get <$prefix , $key> failed" + exit; + fi + rm -f $res + echo $((16#$seq)) +} + +# get header info key/value +function get_header_kv() +{ + local func="get_header_kv" + if [ "$1"x = ""x ];then + #echo "$func: no prefix input" + exit + elif [ "$2"x = ""x ];then + #echo "$func: no key input" + exit + elif [ "$3"x != "string"x ] && [ "$3"x != "int"x ];then + #echo "$func: no valid type input, use type (string|int)" + exit + fi + + local prefix=$1 + local key=$2 + local types=$3 + local res=/tmp/kv.$$$$ + + ceph-kvstore-tool $omap_path get $prefix $key 2>/dev/null 1>$res + if [ $? 
!= 0 ];then + #echo "$func: <$prefix , $key> not exists" + exit + fi + + if [ "$types"x = "string"x ];then + local value=`cat $res |tail -n +2|head -n -1|awk -F ": " '{printf $3}'|sed -n 's/^\.\{4\}//p'` + echo $value + elif [ "$types"x = "int"x ];then + local value=`cat $res |tail -n +2|head -n -1| \ + awk ' + BEGIN{ + FS=":" + } { + split($2, arr, " "); + len=length(arr) + for (i=len; i>0; --i) { + printf arr[i]; + } + }'` + echo $((16#$value)) + fi + rm -f $res +} diff --git a/src/tools/rbd_recover_tool/config/mds_host b/src/tools/rbd_recover_tool/config/mds_host new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/tools/rbd_recover_tool/config/mds_host diff --git a/src/tools/rbd_recover_tool/config/mon_host b/src/tools/rbd_recover_tool/config/mon_host new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/tools/rbd_recover_tool/config/mon_host diff --git a/src/tools/rbd_recover_tool/config/osd_host_path b/src/tools/rbd_recover_tool/config/osd_host_path new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/tools/rbd_recover_tool/config/osd_host_path diff --git a/src/tools/rbd_recover_tool/database_h b/src/tools/rbd_recover_tool/database_h new file mode 100644 index 00000000..4ff20425 --- /dev/null +++ b/src/tools/rbd_recover_tool/database_h @@ -0,0 +1,1134 @@ +#!/usr/bin/env bash +# file: database_h +# +# Copyright (C) 2015 Ubuntu Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +my_dir=$(dirname "$0") + +. 
$my_dir/common_h +. $my_dir/metadata_h +. $my_dir/epoch_h + +db_image_prefix= +db_image_size= +db_order= +db_snap_id= +db_snap_image_size= +found=0 + +#init osd_data and get all objects path +function gen_database() +{ + local func="gen_database" + rm -rf $database/* + rm -rf $images + rm -rf $raw + mkdir -p $database + local host= + local data_path= + + trap 'echo $func failed; exit;' INT HUP + while read line + do + { + host=`echo $line|awk '{print $1}'` + data_path=`echo $line|awk '{print $2}'` + if [ "$host"x = ""x ] || [ "$data_path"x = ""x ];then + continue + fi + local cmds="find $data_path/current -type f" + ssh $ssh_option $host $cmds > $database/$host + } & + done < $osd_host_path + wait + echo "$func: finish" +} + +# collect hobjects from database +# and choose the object whose epoch is latest +# then, sort the objects by their offsets in image +function gather_hobject_common() +{ + func="gather_hobject_common" + + trap 'echo $func failed; exit;' INT HUP + if [ $# -lt 2 ];then + echo "$func: parameters: <pool_id> <image_prefix> [<snap_id>]" + exit + fi + + local pool_id=$1 + local image_prefix=$2 + pool_id=$(($pool_id)) + local hex_pool_id=`printf "%x" $pool_id` + # NOSNAP = uint64(-2) + local snap_id=`printf "%u" -2` + local hex_snap_id="head" + local psuffix= + local fsuffix="_head" + if [ $# = 3 ];then + snap_id=$(($3)) + hex_snap_id=`printf "%x" $snap_id` + psuffix="_"$snap_id + fsuffix="_"$snap_id + fi + local underline_image_prefix=`convert_underline $image_prefix` + local dump_image_prefix=`dump_backslash $underline_image_prefix` + local ddump_image_prefix=`dump_dump_backslash $underline_image_prefix` + local images_raw_dir=$rbd_image/raw + local image_hobjects_dir=$images/pool_$pool_id/$image_prefix + # $images/raw/$image_prefix"_head" + local image_hobjects_raw=$images_raw_dir/$image_prefix"$fsuffix" + # $images/$image_prefix/$image_prefix"_head" + local image_hobjects_stable=$image_hobjects_dir/$image_prefix"$fsuffix" + + if [ ! 
-e $images_raw_dir ];then + mkdir -p $images_raw_dir + fi + if [ ! -e $image_hobjects_dir ];then + local image_metadata=$images_meta/$image_name_in + mkdir -p $image_hobjects_dir + fi + + pushd $database >/dev/null + local pattern="\.[0-9a-f]+__"$hex_snap_id"_[0-9A-F]{8}__"$hex_pool_id + >$image_hobjects_raw + grep -r -E $dump_image_prefix""$pattern * >$image_hobjects_raw + if [ ! -s $image_hobjects_raw ];then + echo "$func: image snap [ $image_prefix"$psuffix" ] is empty" + return 1 #no data available + fi + popd >/dev/null + + local offset_dir_temp=$images_raw_dir/$image_prefix"$fsuffix""_dir_temp" + rm -rf $offset_dir_temp + mkdir -p $offset_dir_temp + + echo "gather hobjects from database: snapid=$snap_id ..." + + # format: ceph2:/var/lib/ceph/osd/ceph-1/current/2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2 + local tmp_image=$offset_dir_temp/tmpimage.$$$$ + >$tmp_image + cat $image_hobjects_raw | + awk -F ':' ' + BEGIN { + pg_coll="'$pg_coll'" + tmp_image="'$tmp_image'" + osd_host_mapping="'$osd_host_mapping'" + snapid="'$snap_id'" + }{ + # $2 = /var/lib/ceph/osd/ceph-1/current/2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2 + + split($2, arr1, "/current/"); # {/var/lib/ceph/osd/ceph-1/, 2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2} + split(arr1[2], arr2, "/"); # {2.d3_head, rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2} + split(arr2[1], arr3, "_head"); # {2.d3,} + + hobject=$2; + data_path=arr1[1]; + gsub(/\\u/, "\\\\\\\\u", hobject); # dump backslash to delay escape (\ -> \\) + "awk \"\\$1 == \\\""$1"\\\" {print \\$2}\" "osd_host_mapping" | head -n 1" | getline node + pgid = arr3[1]; + + len=length(arr2); + offset_hobject=arr2[len] # rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2 + split(offset_hobject, offarr1, "."); # {rb, 0, 1293, 6b8b4567, 000000000002__head_FB425CD3__2} + len1=length(offarr1) + offset_p=offarr1[len1] # 000000000002__head_FB425CD3__2 + split(offset_p, offarr2, "__"); # {000000000002, 
head_FB425CD3, 2} + offset=offarr2[1]; # 000000000002 + + system("echo -n \""node" "pgid" "hobject" "offset" "snapid" \" >>"tmp_image); + #system("echo -n \""node" "pgid" "hobject" "offset" "snapid" \""); + #print node" "pgid" "hobject" "offset" "snapid + + # find pg_epoch from pg_coll database + system("awk \"\\$1 == \\\""node"\\\" && \\$2 == \\\""pgid"\\\" && \\$4 == \\\""data_path"\\\" {print \\$3}\" "pg_coll" >>"tmp_image); + #system("awk \"\\$1 == \\\""node"\\\" && \\$2 == \\\""pgid"\\\" && \\$4 == \\\""data_path"\\\" {print \\$3}\" "pg_coll); + }' + + local sort_image=$offset_dir_temp/sortimage.$$$$ + >$sort_image + sort -t ' ' -k 4.1,4 -k 6.1nr -k 1.1,1 $tmp_image >$sort_image + sort -t ' ' -k 4.1,4 -u $sort_image > $image_hobjects_stable + + #rm -rf $offset_dir_temp + return 0 +} + +function gather_hobject_nosnap() +{ + gather_hobject_common $1 $2 +} + +function gather_hobject_snap() +{ + gather_hobject_common $1 $2 $3 +} + +# select the max pg_epoch item of the same $field +# if no same $field, choose the first +# format : "node $field pg_epoch" +function choose_epoch() +{ + cat $1|sort -t ' ' -k 3.1,3nr -k 2.1,2n |head -n 1; +} + +# lookup image info , after scatter_node_jobs & gather_node_infos +function lookup_image() +{ + local func="lookup_image" + if [ $# -lt 2 ];then + echo "$func: parameters error <pool_id> <image_name> [<snap_name>]" + fi + local pool_id=$1 + local image_name=$2 + local snap_name=$3 + pool_id=$((pool_id)) + echo -e "$func: pool_id = $pool_id\timage_name = $image_name\tsnap_name = $snap_name" + if [ $pool_id -lt 0 ];then + echo "$func: pool_id must great than zero" + exit + fi + local hex_pool_id=`printf "%x" $pool_id` + input_image $image_name + local node= + local item=/tmp/item.$$$$ + local img_name=`dump_backslash $image_name` + + local image_format=0 + local image_id_hobject= + local image_header_hobject= + local result=/tmp/tmp_result.$$$$ + local res1=/tmp/tmp_res1.$$$$ + local res2=/tmp/tmp_res2.$$$$ + local data_path= + + 
# image format v1 + { + cat $image_coll_v1|grep -E "/$img_name\.rbd__head_[0-9A-F]{8}__$hex_pool_id" >$res1 + if [ -s $res1 ];then + echo -n "$func: rbd_header_hobject = " + choose_epoch $res1| tee $item + #choose_epoch $res1 > $item + + if [ -e $item ];then + node=`cat $item|awk '{print $1}'` + image_header_hobject=`cat $item|awk '{print $2}'` + if [ "$node"x = ""x ];then + echo "$func: v1 node is NULL" + exit + fi + if [ "$image_header_hobject"x = ""x ];then + echo "$func: v1 image_header_hobject is NULL" + exit + fi + rm -f $item + fi + + image_format=1 + echo -e "image_name:\t$image_name_in" + echo -e "image_format:\t$image_format" + data_path=`echo $image_header_hobject|awk -F "/current" '{print $1}'` + + >$result + cmds="bash $job_path/osd_job do_image_metadata_v1 $data_path `dump_backslash $image_header_hobject` $snap_name" + ssh $ssh_option $node $cmds | tee $result + fi + } + + # image format v2 + { + cat $image_coll_v2|grep -E "/rbd\\\\uid\."$img_name"__head_[0-9A-F]{8}__$hex_pool_id" >$res2 + if [ -s $res2 ];then + echo -n "$func: rbd_id_hobject = " + choose_epoch $res2 | tee $item + #choose_epoch $res2 > $item + + if [ -e $item ];then + node=`cat $item|awk '{print $1}'` + image_id_hobject=`cat $item|awk '{print $2}'` + if [ "$node"x = ""x ];then + echo "$func: v2 node is NULL(to get image_id_hobject)" + exit + fi + if [ "$image_id_hobject"x = ""x ];then + echo "$func: v2 image_id_hobject is NULL" + exit + fi + rm -f $item + fi + + check_osd_process $node + image_format=2 + + local tid=/tmp/image_id.$$$$ + data_path=`echo $image_id_hobject|awk -F "/current" '{print $1}'` + >$tid + cmds="bash $job_path/osd_job do_image_id $data_path `dump_backslash $image_id_hobject`" + ssh $ssh_option $node $cmds > $tid + + local image_id=`cat $tid` + rm -f $tid + + #get image_header_hobject + pushd $database >/dev/null + local pattern="header\."$image_id"__head_[0-9A-F]{8}__$hex_pool_id" + local tcoll=/tmp/tmp_image_head_coll.$$$$ + + # hostname(by command hostname) in 
$pg_coll maybe different from hostname in tcoll(input by user) + # t_host: hostname read from config file ($tcoll) + # t_host_remote: $(hostname) on osd node ($pg_coll) + grep -r -E $pattern * >$tcoll + popd >/dev/null + + local t_host=(`cat $tcoll|awk -F ":" '{print $1}'`) + local t_pgid=(`cat $tcoll|awk -F ":" '{print $2}'|sed -n 's/.*\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head\/.*/\1/p'`) + local t_hobject=(`cat $tcoll|awk -F ":" '{print $2}'`) + local t_data_path=(`cat $tcoll|awk -F ":" '{split($2, arr, "/current/"); print arr[1];}'`) + rm -f $tcoll + declare -a t_host_remote + + #if there is no failed pg migration, number of t_host is replica num + #replica num : 3, 4, 5 ... + local t_hostname=/tmp/t_hostname.$$$$ + for ((i=0; i<${#t_host[*]}; i++)) + do + ssh $ssh_option ${t_host[$i]} "hostname" >$t_hostname + if [ $? != 0 ];then + echo "$func: ${t_host[$i]} get host_remote failed" + exit + fi + t_host_remote[$i]=`cat $t_hostname` + done + rm -f $t_hostname + + local t_item=/tmp/tmp_item.$$$$ + local tmp_item=/tmp/tmp_tmp_item.$$$$ + + >$tmp_item + for ((i=0; i<${#t_host_remote[*]}; i++ )) + do + local node=${t_host_remote[$i]} + local pgid=${t_pgid[$i]} + awk '$1 == "'"$node"'" && $2 == "'"$pgid"'" {print}' $pg_coll >>$tmp_item + done + + # t_item: <remote_hostname> <pgid> <epoch> <data_path> + sort -u $tmp_item >$t_item + rm -f $tmp_item + + local entry=`choose_epoch $t_item` #t_host_remote + rm -f $t_item + + node=`echo $entry|awk '{print $1}'` + data_path=`echo $entry|awk '{print $4}'` + if [ "$node"x = ""x ];then + echo "$func: v2 node is NULL (to get image_header_hobject)" + exit + fi + + for ((i=0; i<${#t_host_remote[*]}; i++)) + do + if [ "${t_host_remote[$i]}"x = "$node"x ] && [ "${t_data_path[$i]}"x = "$data_path"x ];then + image_header_hobject=${t_hobject[$i]} + break + fi + done + + if [ "$image_header_hobject"x = ""x ];then # check the header hobject the loop above should have set, not the id hobject + echo "$func: v2 image_header_hobject is NULL" + exit + fi + + check_osd_process $node + + echo "$func: rbd_header_hobject = $node 
$image_header_hobject" + echo -e "image_name:\t$image_name_in" + echo -e "image_format:\t$image_format" + + #data_path=`echo $image_header_hobject|awk -F "/current" '{print $1}'` + >$result + cmds="bash $job_path/osd_job do_image_metadata_v2 $data_path $image_id `dump_backslash $image_header_hobject` $snap_name" + ssh $ssh_option $node $cmds | tee $result + fi + } + + if [ ! -s $result ];then + echo "$func: $image_name_in not exists" + exit + fi + + # to assign value to global variable + db_image_prefix=`cat $result|awk '/^(object_prefix|block_name):/{print $2}'` + if [ "$db_image_prefix"x = ""x ];then + echo "$func: image_prefix is NULL" + exit + fi + + db_image_size=`cat $result|awk '/^image_size:/{print $2}'` + db_order=`cat $result|awk '/^order:/{print $2}'` + if [ "$snap_name"x != ""x ];then + db_snap_id=`cat $result|awk '/^snapshot:/{print $2}'` + if [ "$db_snap_id"x = ""x ];then + echo "$func: $image_name_in@$snap_name NOT EXISTS" + exit + fi + db_snap_image_size=`cat $result|awk '/^snapshot:/{print $4}'` + else + #save snaplist + local image_snaplist=$images/pool_$pool_id/$image_name_in/@snaplist + local image_dir=$images/pool_$pool_id/$image_name_in + if [ ! 
-e $image_dir ];then + mkdir -p $image_dir + fi + cat $result|awk '/^snapshot:/{print $2" "$3" "$4}' >$image_snaplist + fi + found=1 + rm -f $result +} + +function list_images() +{ + echo "=============== format ==============" + echo "format: <pool_id>/<image_name>" + echo "================ v1: ================" + #sed -n 's/\(.*\)\/\(.*\)\.rbd__\(.*\)/\2/p' $image_coll_v1|sort -u|sed -e 's/\\u/_/g' + sed -n 's/.*\/\(.*\)\.rbd__head_[0-9A-F]\{8\}__\([0-9a-f]\+\).*/\2 \1/p' $image_coll_v1|sort -u|awk '{print strtonum("0x"$1)"/"$2;}'|sed -e 's/\\u/_/g' + echo "================ v2: ================" + #sed -n 's/\(.*\)\/rbd\\uid.\(.*\)__\(head.*\)/\2/p' $image_coll_v2|sort -u|sed 's/\\u/_/g' + sed -n 's/.*\/rbd\\uid.\(.*\)__head_[0-9A-F]\{8\}__\([0-9a-f]\+\).*/\2 \1/p' $image_coll_v2|sort -u|awk '{print strtonum("0x"$1)"/"$2}'|sed 's/\\u/_/g' +} + +# lookup image metadata +# and +# collect hobjects of image with the latest pg epoch +function discover_image_nosnap() +{ + local func="discover_image_nosnap" + echo "$func ..." + local pool_id=$1 + local image_name=$2 + pool_id=$(($pool_id)) + lookup_image $pool_id $image_name # assign $image_prefix + gather_hobject_nosnap $pool_id $db_image_prefix + if [ $? -ne 0 ];then + exit + fi + local image_hobjects_stable_nosnap=$images/pool_$pool_id/$db_image_prefix/$db_image_prefix"_head" + local image_hobjects_dir=$images/pool_$pool_id/$image_name_in + if [ ! 
-e $image_hobjects_dir ];then + mkdir -p $image_hobjects_dir + fi + # mv image_prefix to image_name + mv $image_hobjects_stable_nosnap $image_hobjects_dir/$image_name_in + rm -rf $images/pool_$pool_id/$db_image_prefix +} + +# get the offset snapid object +# if there is no object, choose the smallest snapid which is greater than current snapid +function get_object_clone() +{ + local func="get_object_clone" + if [ $# -lt 4 ];then + exit + fi + + local object_offset_string=$1 + local snapid=$2 + local snaplist_path=$3 + local snapset_output_dir=$4 + + # snapid in desc + local snap_coll_arr=(` + cat $snaplist_path|awk '{ if ($1 >= '"$snapid"') print "'"$snapset_output_dir"'/@"$1}'`) + + local hex_snapid=`printf "%x" $snapid` + pushd $snapset_output_dir >/dev/null + # get object with the smallest snapid greater than current snapid + awk '$4 == "'"$object_offset_string"'" && $5 >= '$snapid' {print}' `echo ${snap_coll_arr[@]}` |tail -n 1 + popd >/dev/null +} + +# gather hobject for each snapid +function gen_snapset_hobject() +{ + local func="gen_image_snapset" + echo "$func ..." + if [ $# -lt 4 ];then + echo "$func: parameters: <pool_id> <image_prefix> <snaplist_path> <snapset_output_dir>" + exit + fi + local pool_id=$1 + local image_prefix=$2 + local snaplist_path=$3 + local snapset_output_dir=$4 + pool_id=$(($pool_id)) + OIFS=$IFS + IFS=$'\n' + local snaparr=(`cat $snaplist_path`) + # gather hobject for each snapshot + trap 'echo $func failed; exit;' INT HUP + for line in ${snaparr[@]} + do + OOIFS=$IFS + IFS=$' ' + local field=(`echo $line`) + local snapid=${field[0]} + local image_hobjects_stable_snap=$images/pool_$pool_id/$image_prefix/$image_prefix"_"$snapid + local image_snap=$snapset_output_dir/@$snapid + gather_hobject_snap $pool_id $image_prefix $snapid + local res=$? 
+ if [ $res -ne 0 ];then + touch $image_snap + else + mv $image_hobjects_stable_snap $image_snap + fi + IFS=$OOIFS + done + IFS=$OIFS +} + +# lookup image metadata and get snapid hobjects +function discover_image_snap() +{ + local func="discover_image_snap" + echo "$func ..." + if [ $# -lt 3 ];then + echo "$func: parameters: <pool_id> <image_name> [<snap_name>]" + exit + fi + local pool_id=$1 + local image_name=$2 + local snap_name=$3 + pool_id=$(($pool_id)) + #mkdir -p $images/$image_prefix + lookup_image $pool_id $image_name $snap_name # input image_name and snap_name to lookup metadata and snap_id + if [ "$db_snap_id"x = ""x ];then + echo "$func: lookup image failed to gen snapid" + exit + fi + local image_hobjects_dir_prefix=$images/pool_$pool_id/$db_image_prefix + local image_nosnap=$images/pool_$pool_id/$image_name_in + #check if image nosnap recovered + if [ ! -s $image_nosnap ];then + echo "$func: please recover image nosnap before recover with snap" + rm -rf $image_hobjects_dir_prefix + exit + fi + local image_hobject_dir=$images/pool_$pool_id/$image_name_in + local image_snap_hobject=$image_hobject_dir/$image_name_in@$db_snap_id + local image_snap_hobject_head=$image_hobject_dir/$image_name_in@$db_snap_id@head + local image_snaplist=$image_hobject_dir/@snaplist + local image_snapset_dir=$image_hobject_dir/@snapset_dir + local image_head=$image_hobject_dir/$image_name_in + if [ ! -e $image_hobject_dir ];then + mkdir -p $image_hobject_dir + fi + # only gen snapset one time + if [ ! -e $image_snapset_dir ];then + mkdir -p $image_snapset_dir + gen_snapset_hobject $pool_id $db_image_prefix $image_snaplist $image_snapset_dir + + fi + + echo "$func: will get object clone ..." 
+ >$image_snap_hobject + >$image_snap_hobject_head + + trap 'echo $func failed; exit;' INT HUP + # get each offset 's snapid hobject + while read line + do + #echo $line + OOIFS=$IFS + IFS=$' ' + local field=(`echo $line`) + local offset_string=${field[3]} + IFS=$OOIFS + local entry=`get_object_clone $offset_string $db_snap_id $image_snaplist $image_snapset_dir` + if [ "$entry"x != ""x ];then + echo $entry >> $image_snap_hobject + echo `dump_backslash $line` >> $image_snap_hobject_head + fi + done < $image_head + rm -rf $image_hobjects_dir_prefix +} + +# after discover_image_nosnap +# collect objects from osds one by one in sequence +function copy_image_nosnap_single_thread() +{ + local func="copy_image_nosnap_single_thread" + echo "$func ..." + if [ $# -lt 3 ];then + echo "$func: parameters: <pool_id> <image_hobjects> <backup_dir>" + exit + fi + local pool_id=$1 + local image_hobjects=$2 + local backup_dir=$3 + pool_id=$(($pool_id)) + + # make sure lookup_image first + if [ $found = 0 ];then + echo "$func: image not found, maybe forget to discover_image" + exit + fi + if [ ! -e $backup_dir ];then + mkdir -p $backup_dir + fi + + local image_dir=$backup_dir/pool_$pool_id/$image_name_in + local image_file=$image_dir/$image_name_in + local CURRENT=$image_dir/@CURRENT + local LOCK=$image_dir/@LOCK + if [ ! 
-e $image_dir ];then + mkdir -p $image_dir + fi + if [ -e $LOCK ];then + echo "$func: $LOCK is locked by other process" + exit + else + touch $LOCK + fi + + >$image_file + truncate -s $db_image_size $image_file + echo "head">$CURRENT + + local count=$(($db_image_size >> $db_order)) + local start=`cat $image_hobjects|head -n 1|awk '{print $4}'` + local end=`cat $image_hobjects|tail -n 1|awk '{print $4}'` + local entry_count=`cat $image_hobjects|wc -l` + + local char_bits=$((`echo $start|wc -c` -1 )) + local format="%0"$char_bits"x" + + local expect_start=`printf $format 0` + local expect_end=`printf $format $(($count -1 ))` + + echo -e "object_count\t$entry_count" + echo -e "expect\t\t[$expect_start ~ $expect_end] count:$count" + echo -e "range\t\t[$start ~ $end] count:$entry_count" + + local icount=0 + local istart= + local iend= + local percent= + + trap 'echo $func failed; exit;' INT HUP + local unit=$((1<<$db_order)) + while read line + do + { + icount=$(($icount+1)) + node=`echo $line|awk '{print $1}'` + hobject=`echo $line|awk '{print $3}'` + offset=`echo $line|awk '{print $4}'` + off=$((16#$offset)) + if [ $icount = 1 ];then + istart=$offset + fi + hobject=`dump_backslash $hobject` + iend=$offset + sshcmd="cat $hobject" + ssh $ssh_option $node $sshcmd < /dev/null | dd of=$image_file bs=$unit seek=$off conv=notrunc 2>/dev/null + percent=`echo "scale=3; 100*$icount/$entry_count"|bc` + tput sc #record current cursor + echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%" + if [ $icount != $entry_count ];then + tput rc # backport most recent cursor + fi + } + done < $image_hobjects + + echo + echo -n "size: " + ls -lh $image_file|awk '{print $5"\t"$9}' + echo -n "du: " + du -h $image_file + #unlock + rm -f $LOCK +} + + +# ssh copy snap_object & head_object from osd to admin node +# copy all snapshot objects +# and +# all head objects which have the same offset as snapshot objects +function collect_image_snap_objects() +{ + local 
func="collect_image_snap_objects" + #$1=pool_id, $2=image_name, $3=snap_id, $4=snap_hobjects, $5=head_hobjects, $6=backup_dir + if [ $# -lt 6 ];then + echo "$func: parameters: <pool_id> <image_name> <snap_id> <snap_hobjects> <head_hobjects> <backup_dir>" + exit + fi + + local pool_id=$1 + local image_name=$2 + local snap_id=$3 + local snap_hobjects=$4 #snap hobjects info + local head_hobjects=$5 #head hobjects info + local backup_dir=$6 + pool_id=$(($pool_id)) + + local head_dir=$backup_dir/pool_$pool_id/$image_name/@head + local snap_dir=$backup_dir/pool_$pool_id/$image_name/@$snap_id + local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT + + if [ ! -e $head_dir ];then + mkdir -p $head_dir + fi + if [ ! -e $snap_dir ];then + mkdir -p $snap_dir + fi + + local snap_node= #osd node + local snap_hobject= #hobject path with snapid on osd + local snap_offset= + local snap_filename= + + local head_node= + local head_hobject= + local head_offset= + local head_filename= + + # ignore if there is no object in snapshot(empty ) + if [ ! -s $snap_hobjects ];then + echo "$func: $snap_hobjects is empty" + return 0 + fi + local start=`head -n 1 $snap_hobjects|awk '{print $4}'` + local end=`tail -n 1 $snap_hobjects|awk '{print $4}'` + local entry_count=`cat $snap_hobjects|wc -l` + # just assert if ignored empty snapshot (must come before the base-16 arithmetic below) + if [ "$start"x = ""x ] || [ "$end"x = ""x ];then + return 1 + fi + + if [ $((16#$start)) -gt $((16#$end)) ];then # first/last offsets are hex strings; the list must be ascending + echo "$func: $snap_hobjects not sorted" + return 1 + fi + + # speed up copy snapshot + # lookup the corresponding head hobject of snap hobject + # use command: grep <offset> <head hobjects> + # + # eg. 
+ # head hobjects: (32 objects, snapid = uint64(-2) = 18446744073709551614) + # ceph1 29.4d /var/lib/ceph/osd/ceph-0/current/29.4d_head/rb.0.1c414.6b8b4567.000000000000__head_EC2C1C4D__1d 000000000000 18446744073709551614 869 + # ceph1 29.8c /var/lib/ceph/osd/ceph-0/current/29.8c_head/rb.0.1c414.6b8b4567.000000000001__head_0F439A8C__1d 000000000001 18446744073709551614 867 + # ceph1 29.6a /var/lib/ceph/osd/ceph-0/current/29.6a_head/rb.0.1c414.6b8b4567.000000000002__head_FC55706A__1d 000000000002 18446744073709551614 869 + # ceph1 29.8b /var/lib/ceph/osd/ceph-0/current/29.8b_head/rb.0.1c414.6b8b4567.000000000003__head_20A6328B__1d 000000000003 18446744073709551614 869 + # ceph2 29.75 /var/lib/ceph/osd/ceph-1/current/29.75_head/rb.0.1c414.6b8b4567.000000000004__head_AC5ADB75__1d 000000000004 18446744073709551614 867 + # ceph2 29.23 /var/lib/ceph/osd/ceph-1/current/29.23_head/rb.0.1c414.6b8b4567.000000000005__head_1FDEA823__1d 000000000005 18446744073709551614 867 + # ...... + # ceph1 29.34 /var/lib/ceph/osd/ceph-0/current/29.34_head/rb.0.1c414.6b8b4567.00000000001f__head_52373734__1d 00000000001f 18446744073709551614 869 + # + # snap hobjects: (3 objects, snapid >= 29) + # ceph1 29.8c /var/lib/ceph/osd/ceph-0/current/29.8c_head/rb.0.1c414.6b8b4567.000000000001__1f_0F439A8C__1d 000000000001 31 867 + # ceph1 29.6a /var/lib/ceph/osd/ceph-0/current/29.6a_head/rb.0.1c414.6b8b4567.000000000002__1e_FC55706A__1d 000000000002 30 869 + # ceph1 29.8b /var/lib/ceph/osd/ceph-0/current/29.8b_head/rb.0.1c414.6b8b4567.000000000003__1d_20A6328B__1d 000000000003 29 869 + # + # so find out offset in head hobjects line number: + # snap hobjects: 000000000001 ---> head hobjects: 2 (n1) + # snap hobjects: 000000000003 ---> head hobjects: 4 (n2) + # + # finally , grep range from the whole file [1 ~ N] shranked to part of file [n1 ~ n2] + # the worst case : [n1 ~ n2] = [1 ~ N], means no shranking + + # get the line number of the start offset in head hobjects + local n1=`grep -n $start 
$head_hobjects|head -n 1|cut -d ":" -f 1` + # get the line number of the end offset in head hobjects + local n2=`grep -n $end $head_hobjects|head -n 1|cut -d ":" -f 1` + + local icount=0 + local istart= + local iend= + local percent= + + OIFS=$IFS + IFS=$'\n' + + #assume file:snap_hobjects is not very large, and can be loaded into memory + local snap_arr=(`cat $snap_hobjects`) + local snap_tmp=/tmp/snaptmp.$$$$ + + # snap_tmp: + # consists of snap hobject or head hobject + # select lineno range: [n1 ~ n2] + head -n $n2 $head_hobjects|tail -n $(($n2-$n1+1)) >$snap_tmp + + echo "copy image snap/head objects from osd ..." + echo -e "object_count\t$entry_count" + echo -e "range\t\t[$start ~ $end] count:$entry_count" + + trap 'echo $func failed; exit;' INT HUP + for line in ${snap_arr[*]} + do + icount=$(($icount+1)) + + OOIFS=$IFS + IFS=$' ' + + local arr=(`echo $line`) + snap_node=${arr[0]} + snap_hobject=${arr[2]} + snap_offset=${arr[3]} + snap_filename=$snap_dir/$snap_offset + + if [ $icount = 1 ];then + istart=$snap_offset + fi + iend=$snap_offset + + #lookup corresponding head hobject of snap hobject + local res=`grep $snap_offset $snap_tmp|head -n 1` + if [ "$res"x = ""x ];then + echo "$func: image object[ $snap_offset ] missing" + exit + fi + + local arr2=(`echo $res`) + head_node=${arr2[0]} + head_hobject=${arr2[2]} + head_offset=${arr2[3]} + head_filename=$head_dir/$head_offset + + # just copy object(snap/head) if it does not exist + if [ ! -e $snap_filename ];then + ssh $ssh_option $snap_node "cat $snap_hobject" > $snap_filename + fi + if [ ! 
-e $head_filename ];then + ssh $ssh_option $head_node "cat $head_hobject" > $head_filename + fi + IFS=$OOIFS + + percent=`echo "scale=3; 100*$icount/$entry_count"|bc` + tput sc #record current cursor + echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%" + if [ $icount != $entry_count ];then + tput rc # backport most recent cursor + fi + done + echo + IFS=$OIFS + rm -f $snap_tmp + return 0 +} + +# copy all snap objects and corresponding head objects from osds +# in single process +function copy_image_snap_single_thread() +{ + local func="copy_image_snap_single_thread" + if [ $# -lt 6 ];then + echo "$func: parameters: <pool_id> <image_name> <snap_id> <snap_hobjects> <head_hobjects> <backup_dir>" + exit + fi + local pool_id=$1 + local image_name=$2 + local snap_id=$3 + local snap_hobjects=$4 + local head_hobjects=$5 + local backup_dir=$6 + pool_id=$(($pool_id)) + + local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT + local LOCK=$backup_dir/pool_$pool_id/$image_name/@LOCK + #lock + if [ -e $LOCK ];then + echo "$func: $LOCK is locked by other process" + exit + else + touch $LOCK + fi + collect_image_snap_objects $pool_id $image_name $snap_id $snap_hobjects $head_hobjects $backup_dir + #unlock + rm -f $LOCK +} + +# after all snap objects and necessary head objects are copied, +# just pick appropriate head objects and snap objects and write them to image +# in order to rollback image to snapshot +# +# init: image is created by copy_image_nosnap_single_thread firstly +# +# all output include 3 parts: +# <image> <head objects> <snap objects> +# +# head objects1 --- snap1 objects +# head objects2 --- snap2 objects +# image head objects3 --- snap3 objects +# ...... 
+# head objectsN --- snapN objects +# +# how to rollback: +# firstly rollback to head, secondly write <snapX objects> +# head = <image> + <head objects> +# snap1 = <image> + <head objects> + <snap1 objects> +# snap2 = <image> + <head objects> + <snap2 objects> +# snap3 = <image> + <head objects> + <snap3 objects> +# ...... +# snapN = <image> + <head objects> + <snapN objects> +# +# improve rollback: +# there is intersection of head objects and snapX objects, if snapX objects are not empty +# and need to deduplicate the intersection. +# deduplicate steps: +# - get difference set of head objects and snapX objects +# - write the difference set objects to image +# - write the snapX objects to image +function rollback_image_snap() +{ + local func="rollback_image_snap" + + echo "$func ..." + + trap 'echo $func failed; exit;' INT HUP + if [ $# -lt 6 ];then + echo "$func: parameters <pool_id> <image_name> <snap_id> <snap_object_dir> <backup_dir> <image_unit>" + exit + fi + local pool_id=$1 + local image_name=$2 + local snap_id=$3 + local snap_object_dir=$4 + local backup_dir=$5 + local image_unit=$6 + + local need_diff_set=0 + + local image_path=$backup_dir/pool_$pool_id/$image_name/$image_name + local head_object_dir=$backup_dir/pool_$pool_id/$image_name/@head + local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT + local LOCK=$backup_dir/pool_$pool_id/$image_name/@LOCK + if [ -e $LOCK ];then + echo "$func: $LOCK is locked by other process" + exit + else + touch $LOCK + fi + if [ $snap_id -ne -2 ];then + echo $snap_id > $CURRENT + else + echo "head" > $CURRENT + fi + + if [ ! 
-e $snap_object_dir ];then + rm -f $LOCK; return 0 # nothing to replay: release the lock taken above before leaving + fi + + if [ "$snap_object_dir"x != "$head_object_dir"x ];then + echo "$func: need to compute diff_set of head" + need_diff_set=1 + else + echo "$func: NO diff_set" + need_diff_set=0 + fi + + local entry_count=0 + local start= + local end= + local offset= + local icount=0 + local istart= + local iend= + local percent= + + local snap_objects= + local head_objects= + local diff_set= + + snap_objects=(`ls $snap_object_dir`) + + # if need to compute difference set of head_objects and snap_objects + if [ $need_diff_set -ne 0 ];then + head_objects=(`ls $head_object_dir`) + + #get the difference set: ( head_objects - snap_objects ) + diff_set=(` + sort -m <(echo ${head_objects[@]}|xargs -n 1 echo) <(echo ${snap_objects[@]}|xargs -n 1 echo) \ + <(echo ${snap_objects[@]}|xargs -n 1 echo) |uniq -u`) + + # copy diff_set of head object to image + pushd $head_object_dir >/dev/null + + echo "$func: copy diff_set head objects ..." + entry_count=${#diff_set[@]} + start=${diff_set[0]} + end= + if [ $entry_count -gt 0 ];then + end=${diff_set[$(($entry_count - 1))]} + fi + offset= + icount=0 + istart= + iend= + percent= + + echo -e "object_count\t$entry_count" + echo -e "range\t\t[$start ~ $end] count:$entry_count" + + for object in ${diff_set[@]} + do + icount=$(($icount+1)) + if [ $icount = 1 ];then + istart=$object + fi + iend=$object + + local offset=$((16#$object)) + dd if=$object of=$image_path bs=$image_unit seek=$offset conv=notrunc 2>/dev/null + + percent=`echo "scale=3; 100*$icount/$entry_count"|bc` + tput sc #record current cursor + echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%" + if [ $icount != $entry_count ];then + tput rc # backport most recent cursor + fi + done + if [ $entry_count -gt 0 ];then + echo + fi + popd >/dev/null + + if [ $snap_id -ne -2 ];then + echo -e "$image_name already rollback diff_set: (head - snap)" + fi + fi + + # copy snap object to image + pushd $snap_object_dir >/dev/null + + 
if [ $need_diff_set -ne 0 ];then + echo "$func: copy snap objects ..." + else + echo "$func: copy head objects ..." + fi + entry_count=${#snap_objects[@]} + start=${snap_objects[0]} + end= + if [ $entry_count -gt 0 ];then + end=${snap_objects[$(($entry_count - 1))]} + fi + offset= + icount=0 + istart= + iend= + percent= + + echo -e "object_count\t$entry_count" + echo -e "range\t\t[$start ~ $end] count:$entry_count" + + for object in ${snap_objects[@]} + do + icount=$(($icount+1)) + if [ $icount = 1 ];then + istart=$object + fi + iend=$object + + local offset=$((16#$object)) + dd if=$object of=$image_path bs=$image_unit seek=$offset conv=notrunc 2>/dev/null + + percent=`echo "scale=3; 100*$icount/$entry_count"|bc` + tput sc #record current cursor + echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%" + if [ $icount != $entry_count ];then + tput rc # backport most recent cursor + fi + done + if [ $entry_count -gt 0 ];then + echo + fi + popd >/dev/null + + rm -f $LOCK + if [ $snap_id -ne -2 ];then + echo "$image_name rollback to snapid: $snap_id" + else + echo "$image_name rollback to head" + fi +} + +function recover_image() +{ + local func="recover_image" + echo "$func ..." 
+ + if [ $# -lt 3 ];then + echo "$func: parameters: <pool_id> <image_name> <snap_name> [<backup_dir>]" + exit + fi + + local pool_id=$1 + local img_name=$2 + local snap_name=$3 + local backup_dir=$4 + pool_id=$(($pool_id)) + if [ "$snap_name"x = "@"x ];then + snap_name= + fi + if [ "$backup_dir"x = ""x ];then + backup_dir=$default_backup_dir + fi + + #recover image with nosnap + if [ "$snap_name"x = ""x ];then + discover_image_nosnap $pool_id $img_name #input image_name + local image_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in + copy_image_nosnap_single_thread $pool_id $image_hobjects $backup_dir + + #recover image with snap + else + + # check if recovered head already + local img_hobjects_path=$images/pool_$pool_id/$img_name/$img_name + local img_file_path=$backup_dir/pool_$pool_id/$img_name/$img_name + if [ ! -e $img_hobjects_path ] || [ ! -e $img_file_path ];then + echo "$func: $img_name@$snap_name : can not rollback to snapshot, please recover image head first" + exit + fi + + # rollback to head + if [ "$snap_name"x = "@@"x ];then + local head_dir=$backup_dir/pool_$pool_id/$img_name/@head + if [ -e $head_dir ];then + local unit=`pushd $head_dir >/dev/null; ls|head -n 1|xargs -n 1 stat|awk '/Size:/{print $2}'` + # rollback to head: pass the pool-qualified @head dir checked above, which is the head dir rollback_image_snap itself uses + rollback_image_snap $pool_id $img_name -2 $head_dir $backup_dir $unit + echo "$img_name head : $img_file_path" + else + echo "$func: no need to rollback to head" + fi + return 0 + fi + + # rollback to snap + discover_image_snap $pool_id $img_name $snap_name # get image meta & get snapid object + local snap_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in@$db_snap_id + local head_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in@$db_snap_id@head + local snap_object_dir=$backup_dir/pool_$pool_id/$image_name_in/@$db_snap_id + local image_path=$backup_dir/pool_$pool_id/$image_name_in/$image_name_in + local image_unit=$((1<<$db_order)) + 
copy_image_snap_single_thread $pool_id $image_name_in $db_snap_id $snap_hobjects $head_hobjects $backup_dir + rollback_image_snap $pool_id $image_name_in $db_snap_id $snap_object_dir $backup_dir $image_unit + echo "$image_name_in@$snap_name : $image_path" + fi +} diff --git a/src/tools/rbd_recover_tool/epoch_h b/src/tools/rbd_recover_tool/epoch_h new file mode 100644 index 00000000..e268eafa --- /dev/null +++ b/src/tools/rbd_recover_tool/epoch_h @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# file: epoch_h +# +# Copyright (C) 2015 Ubuntu Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +my_dir=$(dirname "$0") +. $my_dir/common_h + +#pgid_list=$single_node/$cluster-$id/pgid_list +function get_pgid_list() +{ + find $osd_data/current/ -type d -name "*_head"|\ + sed -n 's/\(.*\)\/current\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head/\2 \1/p'|\ + sort -t ' ' -k 1.1,1h -k 2.1,2 > $pgid_list; +} + +function get_pgid() +{ + hobject_path=$1 + echo $hobject_path| sed -n 's/\(.*\)\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head\(.*\)/\2/p' +} + +infos_seq= +function get_infos_seq() +{ + local func="get_infos_seq" + + local keyword=":infos." 
+ local infos_key=`get_map_header_key $keyword` + + if [ "$infos_key"x = ""x ];then + echo "$func: keyword not input or infos_key not exists" + exit + fi + local prefix=`get_map_header_prefix` + local key=$infos_key + + infos_seq=`get_header_seq $prefix $key` + if [ "$infos_seq"x = ""x ];then + echo "$func: infos_seq not exists" + exit + fi +} + +pg_epoch= +function get_pg_epoch() +{ + local func="get_pg_epoch" + if [ "$1"x = ""x ];then + echo "$func: no pgid input" + exit + fi + + get_pg_epoch_firefly "$1" + if [ "$pg_epoch"x != ""x ]; then + # echo "Epoch for $1: $pg_epoch (firefly)" + return + fi + + get_pg_epoch_hammer "$1" + if [ "$pg_epoch"x != ""x ]; then + # echo "Epoch for $1: $pg_epoch (hammer)" + return + fi + + echo "$func: Couldn't find epoch for $1" + exit +} + +function get_pg_epoch_firefly() +{ + local func="get_pg_epoch_firefly" + if [ "$1"x = ""x ];then + echo "$func: no pgid input" + exit + fi + local pgid=$1 + local key=$pgid"_epoch" + + #get_infos_seq; + # infos_seq default to 1 + infos_seq=1 + local infos_seq=`printf "%016d" $infos_seq` + local prefix="_USER_"$infos_seq"_USER_" + + pg_epoch=`get_header_kv $prefix $key int` +} + +function get_pg_epoch_hammer() +{ + local func="get_pg_epoch_hammer" + if [ "$1"x = ""x ];then + echo "$func: no pgid input" + exit + fi + local pgid="$1" + local hkey_prefix="$(get_map_header_prefix)" + local hkey="$(printf '...head.%x.%08X' "$(echo "$pgid"|cut -d'.' -f1)" "$((0x$(echo "$pgid"|cut -d'.' 
-f2)))")" + + local infos_seq="$(get_header_seq "$hkey_prefix" "$hkey")" + local infos_seq=`printf "%016d" $infos_seq` + local prefix="_USER_"$infos_seq"_USER_" + local key="_epoch" + + pg_epoch=`get_header_kv $prefix $key int` +} diff --git a/src/tools/rbd_recover_tool/metadata_h b/src/tools/rbd_recover_tool/metadata_h new file mode 100644 index 00000000..4aa491b5 --- /dev/null +++ b/src/tools/rbd_recover_tool/metadata_h @@ -0,0 +1,368 @@ +#!/usr/bin/env bash +# file: metadata_h +# +# Copyright (C) 2015 Ubuntu Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +my_dir=$(dirname "$0") +. $my_dir/common_h +. $my_dir/epoch_h + +# put origin name in $image_name_in: for output +# put convert "_" name in $image_name: for grep image hobjects from database +image_name_in= +image_name= +function input_image() +{ + local func="input_image" + if [ "$1"x = ""x ];then + echo "$func: no image name input" + exit + fi + + image_name_in=$1 + # "_" -> "\u" + image_name=`convert_underline $image_name_in` +} + +#======================================== distinguish v1 or v2 =================================== +#image_list_v1=$single_node/$cluster-$id/image_list_v1 +#image_list_v2=$single_node/$cluster-$id/image_list_v2 +function get_image_list() +{ + find $osd_data/current/ -type f|grep ".rbd__" >$image_list_v1 + find $osd_data/current/ -type f|grep "rbd\\\\uid." 
>$image_list_v2 +} + +# print the image format of the hobject path passed as $1: +# 1 when the path is found in $image_list_v1, 2 when it is found in $image_list_v2, +# nothing when $1 is empty or the path is in neither list +function get_image_format_by_hobject() +{ + local func="get_image_format_by_hobject" + if [ "$1"x = ""x ];then + exit + fi + local res1=`cat $image_list_v1|grep $1` + if [ "$res1"x != ""x ];then + echo 1 + exit + fi + + # a non-empty grep result means the path is in the v2 list + local res2=`cat $image_list_v2|grep $1` + if [ "$res2"x != ""x ];then + echo 2 + exit + fi +} + +#======================================== image format v1 ======================================== +# <image_name>.rbd include 3 parts: +# header + snap_count*snapshot + snap_count*snap_name +# +# struct rbd_obj_header_ondisk { +# 40 char text[40]; +# 24 char block_name[RBD_MAX_BLOCK_NAME_SIZE]; +# 4 char signature[4]; +# 8 char version[8]; +# struct { +# 1 __u8 order; +# 1 __u8 crypt_type; +# 1 __u8 comp_type; +# 1 __u8 unused; +# } __attribute__((packed)) options; +# 8 __le64 image_size;//hexdump -C s=80 n=8 +# 8 __le64 snap_seq; //hexdump -C s=88 n=8 +# 4 __le32 snap_count;//hexdump -C s=96 n=4 +# 4 __le32 reserved; +# 8 __le64 snap_names_len;//hexdump -C s=104 n=8 +# struct rbd_obj_snap_ondisk snaps[0]; +# } __attribute__((packed)); +# +# sizeof(rbd_obj_header_ondisk): 112 +# +# struct rbd_obj_snap_ondisk { +# 8 __le64 id; //hexdump -C s=112+i*16 n=8 , i=[0, snap_count) +# 8 __le64 image_size;//hexdump -C s=112+i*16+8 n=8, i=[0, snap_count) +# } __attribute__((packed)); +# sizeof(rbd_obj_snap_ondisk): 16 +# +# get snap_names from <image_name>.rbd +# hexdump -e '10/1 "%_c"' -s $((112 + $snap_count*16)) -n $snap_names_len <image_name>.rbd +# then split snap_names into array + +function get_image_metadata_v1() +{ + local func="get_image_metadata_v1" + if [ "$1"x = ""x ];then + echo "$func: no image head object input" + exit + fi + local snap_name= + if [ "$2"x != ""x ];then + snap_name=$2 + fi + + if [ !
-e $1 ];then + echo "$func: $1 not exists" + exit + fi + local hobject_path=$1 + d_hobject_path=`dump_backslash $1` + local image_format=`get_image_format_by_hobject $d_hobject_path` + if [ $image_format != 1 ];then + echo "$func: image_format must be 1" + exit + fi + + if [ ! -e $hobject_path ];then + echo "$func: $hobject_path not exists" + exit + fi + + # decode rbd_obj_header_ondisk of <image_name>.rbd + local block_name=`hexdump -e '10/1 "%c"' -s 40 -n 24 $hobject_path` + local order=`hexdump -e '10/4 "%u"' -s 76 -n 1 $hobject_path` + local image_size=`hexdump -C -s 80 -n 8 $hobject_path|head -n 1|awk '{for (i=9; i>1; i--) {printf $i}}'` + image_size=$((16#$image_size)) + local snap_seq=`hexdump -C -s 88 -n 8 $hobject_path|head -n 1| + awk '{num=""; for(i=9; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'` + local snap_count=`hexdump -C -s 96 -n 4 $hobject_path|head -n 1| + awk '{num=""; for(i=5; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'` + local snap_names_len=`hexdump -C -s 104 -n 8 $hobject_path|head -n 1| + awk '{num=""; for(i=9; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'` + + echo -e "block_name:\t$block_name" + echo -e "order:\t\t$order" + echo -e "image_size:\t$image_size" + echo -e "snap_seq:\t$snap_seq" + + # decode N rbd_obj_snap_ondisk of <image_name>.rbd + declare -a snap_ids + declare -a snap_names + declare -a snap_image_sizes + local size_header=112 #sizeof(rbd_obj_header_ondisk) + local size_snap=16 #sizeof(rbd_obj_snap_ondisk) + local offset=0 + local id_off=0 + local size_off=0 + for ((i=0; i<$snap_count; i++)) + do + offset=$(($size_header + $i * $size_snap)) + id_off=$offset + size_off=$(($offset + 8)) + snap_ids[$i]=`hexdump -C -s $id_off -n 8 $hobject_path|head -n 1| + awk '{num=""; for(i=9; i>1; i--){num=num""$i;} print strtonum("0x"num);}'` + snap_image_sizes[$i]=`hexdump -C -s $size_off -n 8 $hobject_path|head -n 1| + awk '{num=""; for(i=9; i>1; i--){num=num""$i;} print strtonum("0x"num);}'` + done + 
offset=$(($size_header + $snap_count * $size_snap)) + snap_names=(`hexdump -e '10/1 "%_c"' -s $offset -n $snap_names_len $hobject_path| + awk -F "\\\\\\\\\\\\\\\\0" '{for(i=1; i<=NF; i++) {print $i" "} }'`); + + echo -e "\t\tID\tNAME\t\tSIZE" + for ((i=0; i<$snap_count; i++)) + do + if [ "$snap_name"x = ""x ];then + echo -n -e "snapshot:\t" + echo -e "${snap_ids[$i]}\t${snap_names[$i]}\t\t${snap_image_sizes[$i]}" + continue + fi + if [ "$snap_name"x = "${snap_names[$i]}"x ];then + echo -n -e "snapshot:\t" + echo -e "${snap_ids[$i]}\t${snap_names[$i]}\t\t${snap_image_sizes[$i]}" + return + fi + done +} + +#======================================== end image format v1 ======================================== + +#======================================== image format v2 ======================================== + +# map_header, header_seq, header, key/value +# eg. +# map_header _HOBJTOSEQ_:rbd%uheader%e139a6b8b4567...head.2.68E826B6 +# meta_header_seq 17426 +# header: _USER_0000000000017426_USER_:object_prefix +# _USER_0000000000017426_USER_:order +# _USER_0000000000017426_USER_:size +# _USER_0000000000017426_USER_:snap_seq +# key/value ceph-kvstore-tool /storepath get _USER_0000000000017426_USER_ (object_prefix|order|size|snap_seq) + +# decode image id from image_id_hobject +function get_image_id() +{ + local func="get_image_id" + if [ "$1"x = ""x ];then + exit; + fi + local image_id_hobject=$1 #from admin node's database + + if [ ! 
-e $image_id_hobject ];then + #echo "$func: $image_id_hobject not exists" + exit; + fi + + # get len of string + local n=`hexdump -e '10/4 "%u"' -s 0 -n 4 $image_id_hobject` + # get string + hexdump -e '10/1 "%c"' -s 4 -n $n $image_id_hobject +} + +#find image_id omap entry in omaplist +map_header_prefix= +map_header_key= +function get_map_header() +{ + local func="get_map_header" + local image_id=$1 + if [ "$image_id"x = ""x ];then + echo "$func: no image_id input" + exit; + fi + map_header_prefix=`get_map_header_prefix` + local keyword="header%e"$image_id + map_header_key=`get_map_header_key $keyword` + if [ "$map_header_key"x = ""x ];then + echo "$func: map_header_key is NULL(not in omaplist)" + exit + fi +} + +#get meta header seq from map_header +meta_header_seq= +function get_meta_header_seq() +{ + local func="get_meta_header_seq" + if [ "$1"x == ""x ];then + echo "$func: no prefix input" + exit; + elif [ "$2"x == ""x ];then + echo "$func: no key input" + exit; + fi + local prefix=$1; + local key=$2; + meta_header_seq=`get_header_seq $prefix $key` +} + +# get image metadata : object_prefix, order, image_size, snap_seq +object_prefix= +order= +image_size= +snap_seq= +function get_image_metadata_v2() +{ + local func="get_image_metadata_v2" + if [ "$1"x = ""x ];then + echo "$func: no meta_header_seq input" + exit; + fi + local meta_header_seq=`printf "%016d" $1` + #echo "$func: meta_header_seq = "$meta_header_seq + local ghobject_key="_USER_"$meta_header_seq"_USER_" + local prefix=$ghobject_key + + object_prefix=`get_header_kv $prefix object_prefix string` + #object_prefix="rbd_data.$image_id" + order=`get_header_kv $prefix order int` + image_size=`get_header_kv $prefix size int` + snap_seq=`get_header_kv $prefix snap_seq int` + + echo -e "object_prefix:\t$object_prefix" + echo -e "order:\t\t$order" + echo -e "image_size:\t$image_size" + echo -e "snap_seq:\t$snap_seq" + + # list snapshot + list_snaps_v2 $1 $2 +} + +# struct cls_rbd_snap { +# snapid_t id; +# 
string name; +# uint64_t image_size; +# uint64_t features; +# uint8_t protection_status; +# cls_rbd_parent parent; +# } +# decode cls_rbd_snap +# 1 u8 struct_v +# 1 u8 struct_compat +# 4 u32 struct_len +# 8 u64 snapid_t id //s=6 n=8 +# 4 u32 len of name //s=14 n=4 +# len char name //s=18 n=len +# 8 u64 image_size +# 8 u64 features +# ...... +# +function list_snaps_v2() +{ + local func="list_snaps_v2" + if [ "$1"x = ""x ];then + exit + fi + local sname= + if [ $# -eq 2 ];then + sname=$2 + fi + local meta_header_seq=`printf "%016d" $1` + local prefix="_USER_"$meta_header_seq"_USER_" + local keys=(`awk -F ":" '/snapshot_/ && $1 == "'"$prefix"'" {if ($2 == "") exit; split($2, arr, "_"); + print arr[2];}' $omap_list|sort -r`) + echo -e "\t\tID\tNAME\t\tSIZE" + for key in ${keys[@]} + do + key="snapshot_$key" + local arr=(`ceph-kvstore-tool $omap_path get $prefix $key|awk -F ":" '{print $2}'`); + # get snap_name + tmp= + for ((i=17; i>13; i--)) + do + tmp="$tmp${arr[$i]}" + done + local len=$((16#$tmp)) + local snap_name= + for ((i=18; i<$((18+$len)); i++)) + do + # convert ascii to char + local char=`echo -e "\x${arr[$i]}"` + snap_name="$snap_name$char" + done + # get snap_id (little endian) + local tmp= + for ((i=13; i>5; i--)) + do + tmp="$tmp${arr[$i]}" + done + local snap_id=$((16#$tmp)) + # get image_size of current snap (little endian) + tmp= + for ((i=$((25+$len)); i>$((17+$len)); i--)) + do + tmp="$tmp${arr[$i]}" + done + local image_size=$((16#$tmp)) + if [ "$sname"x = ""x ];then + echo -e "snapshot:\t$snap_id\t$snap_name\t\t$image_size" + continue + fi + if [ "$sname"x = "$snap_name"x ];then + echo -e "snapshot:\t$snap_id\t$snap_name\t\t$image_size" + return + fi + done +} + +#======================================== end image format v2 ======================================== diff --git a/src/tools/rbd_recover_tool/osd_job b/src/tools/rbd_recover_tool/osd_job new file mode 100755 index 00000000..b4b80be8 --- /dev/null +++ b/src/tools/rbd_recover_tool/osd_job 
@@ -0,0 +1,170 @@ +#!/usr/bin/env bash +# file: osd_job +# +# Copyright (C) 2015 Ubuntu Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +my_dir=$(dirname "$0") + +. $my_dir/common_h +. $my_dir/metadata_h +. $my_dir/epoch_h + +function check_ceph_osd() +{ + local func="check_ceph_osd" + local host=`hostname` + # if ceph-osd service is still running, except flush-journal + if [ "`ps aux|grep ceph-osd|grep -v flush-journal|grep -v grep`"x != ""x ];then + echo "[$host]: $func: ceph-osd is running..., stop it" + exit + fi +} + +function cat_pg_epoch() +{ + local func="cat_pg_epoch" + init_env_osd $1 + if [ -e $node_pg_epoch ];then + cat $node_pg_epoch + fi +} + +function cat_image_v1() +{ + local func="cat_image_v1" + init_env_osd $1 + if [ -e $image_v1 ];then + cat $image_v1 + fi +} + +function cat_image_v2() +{ + local func="cat_image_v2" + init_env_osd $1 + if [ -e $image_v2 ];then + cat $image_v2 + fi +} + +function flush_osd_journal() +{ + local func="flush_osd_journal" + init_env_osd $1 + local osd_data_path=$osd_data + local osd_journal_path=$osd_data/journal + local whoami_path=$osd_data/whoami + local host=`hostname` + if [ ! -e $whoami_path ];then + echo "[$host]: $func: $whoami_path not exists" + exit + fi + local whoami=`cat $whoami_path` + echo "[$host]: $func ..." + ceph-osd -i $whoami --osd-data $osd_data_path --osd-journal $osd_journal_path --flush-journal >/dev/null + if [ $? 
-ne 0 ];then + echo "[$host]: $func: flush osd journal failed" + exit + fi +} + +function do_omap_list() +{ + local func="do_omap_list" + init_env_osd $1 + local host=`hostname` + echo "[$host]: $func ..." + get_omap_list +} + +# get all pgs epoch +function do_pg_epoch() +{ + local func="do_pg_epoch" + init_env_osd $1 + local node=`hostname` + get_pgid_list + >$node_pg_epoch + local pgid= + local data_path= + local host=`hostname` + echo "[$host]: $func ..." + while read line + do + { + pgid=`echo $line|awk '{print $1}'` + data_path=`echo $line|awk '{print $2}'` + get_pg_epoch $pgid + echo -e "$node $pgid $pg_epoch $data_path" >>$node_pg_epoch + } + done < $pgid_list +} + +# get an list of image in this osd node, pg epoch maybe not the latest, the admin node will do distinguish +function do_image_list() +{ + local func="do_image_list" + init_env_osd $1 + get_image_list + local node=`hostname` + >$image_v1 + >$image_v2 + local host=`hostname` + echo "[$host]: $func ..." + for line in `cat $image_list_v1` + do + pgid=`get_pgid $line` + get_pg_epoch $pgid + echo "$node $line $pg_epoch" >> $image_v1 + done + for line in `cat $image_list_v2` + do + pgid=`get_pgid $line` + get_pg_epoch $pgid + echo "$node $line $pg_epoch" >> $image_v2 + done +} + +function do_image_id() +{ + local func="do_image_id" + init_env_osd $1 + get_image_id $2 +} + +function do_image_metadata_v1() +{ + local func="do_image_metadata_v1" + init_env_osd $1 + local image_header_hobject=$2 + local snap_name=$3 + get_image_metadata_v1 $image_header_hobject $snap_name +} + +function do_image_metadata_v2() +{ + local func="do_image_metadata_v2" + init_env_osd $1 + local image_id=$2 + local image_header_hobject=$3 + local snap_name=$4 + get_map_header $image_id + get_meta_header_seq $map_header_prefix $map_header_key + get_image_metadata_v2 $meta_header_seq $snap_name +} + +check_ceph_osd +$* diff --git a/src/tools/rbd_recover_tool/rbd-recover-tool b/src/tools/rbd_recover_tool/rbd-recover-tool new file 
mode 100755 index 00000000..b7a25865 --- /dev/null +++ b/src/tools/rbd_recover_tool/rbd-recover-tool @@ -0,0 +1,327 @@ +#!/usr/bin/env bash +# file: rbd-recover-tool +# +# Copyright (C) 2015 Ubuntu Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +# rbd-recover-tool is an offline recover tool for rbd image in replicated pool +# when ceph cluster is stopped. +# it is a simple disater recovery policy, just for urgent condition + +my_dir=$(dirname "$0") + +. $my_dir/common_h +. $my_dir/metadata_h +. $my_dir/epoch_h +. $my_dir/database_h + +#scp files from admin node to osd node +file1=common_h +file2=metadata_h +file3=epoch_h +file4=osd_job + +#------------ admin node's action ------------- + +function scp_file() +{ + local func="scp_file" + file=$1 + if [ "$1"x = ""x ];then + echo "$func: not file input" + exit + fi + for host in `cat $osd_host` + do + { + echo "$func: $host" + scp $ssh_option $file $host:$job_path 1>/dev/null + } & + done +} + +function scp_files() +{ + local func="scp_files" + for host in `cat $osd_host` + do + { + echo "$func: $host" + scp $ssh_option $file1 $host:$job_path + scp $ssh_option $file2 $host:$job_path + scp $ssh_option $file3 $host:$job_path + scp $ssh_option $file4 $host:$job_path + } & + done + wait + echo "$func: finish" +} + +function scatter_node_jobs() +{ + local func="scatter_node_jobs" + local host= + local data_path= + echo "$func: flush osd journal & generate infos: omap, pg, image metadata ..." 
+ + trap 'echo $func failed; exit' INT HUP + while read line + do + { + host=`echo $line|awk '{print $1}'` + data_path=`echo $line|awk '{print $2}'` + check_osd_process $host + + cmd="mkdir -p $job_path" + ssh $ssh_option $host $cmd + scp $ssh_option $file1 $host:$job_path >/dev/null + scp $ssh_option $file2 $host:$job_path >/dev/null + scp $ssh_option $file3 $host:$job_path >/dev/null + scp $ssh_option $file4 $host:$job_path >/dev/null + + # every osd_job step is run through bash so none of them depends on the copied file's executable bit + cmd="bash $job_path/osd_job flush_osd_journal $data_path;" + cmd="$cmd bash $job_path/osd_job do_omap_list $data_path;" + cmd="$cmd bash $job_path/osd_job do_pg_epoch $data_path;" + cmd="$cmd bash $job_path/osd_job do_image_list $data_path;" + + ssh $ssh_option $host $cmd </dev/null + } & + done < $osd_host_path + wait + echo "$func: finish" +} + +function gather_node_infos() +{ + local func="gather_node_infos" + echo "$func ..." + >$pg_coll + >$image_coll_v1 + >$image_coll_v2 + trap 'echo $func failed; exit' INT HUP + while read line + do + { + host=`echo $line|awk '{print $1}'` + data_path=`echo $line|awk '{print $2}'` + echo "$func: $host" + check_osd_process $host + + #pg epoch + cmd1="bash $job_path/osd_job cat_pg_epoch $data_path" + ssh $ssh_option $host $cmd1 >> $pg_coll + #image v1 + cmd2="bash $job_path/osd_job cat_image_v1 $data_path" + ssh $ssh_option $host $cmd2 >> $image_coll_v1 + #image v2 + cmd3="bash $job_path/osd_job cat_image_v2 $data_path" + ssh $ssh_option $host $cmd3 >> $image_coll_v2 + } & + done < $osd_host_path + wait + echo "$func: finish" +} + +function scatter_gather() +{ + local func="scatter_gather" + if [ ! -s $osd_host ];then + echo "$func: no osd_host input" + exit + fi + if [ !
-s $mon_host ];then + echo "$func: no mon_host input" + exit + fi + scatter_node_jobs + gather_node_infos +} + + +#------------- operations -------------- + +function database() +{ + scatter_gather + gen_database +} + +function list() +{ + list_images +} + +function lookup() +{ + lookup_image $1 $2 $3 +} + +function recover() +{ + recover_image $1 $2 $3 $4 +} + +#------------- helper ------------- + +function usage() +{ + local cmd_name="rbd-recover-tool" + echo + echo "$cmd_name is used to recover rbd image of replicated pool, + when all ceph services are stopped" + echo "Usage:" + echo "$cmd_name database + gather pg info, object info, image metadata, + and epoch info from all osd nodes, + this will cosume a long time, just be patient, + especially when scale up to 1000+ osds" + echo "$cmd_name list + list all rbd images of all replicated pools, + before to lookup & recover" + echo "$cmd_name lookup <pool_id>/<image_name>[@[<snap_name>]] + show image metadata: image format, rbd id, size, order, snapseq + In addition, for image with snapshots, + this will list all snapshot infomations" + echo "$cmd_name recover <pool_id>/<image_name>[@[<snap_name>]] [</path/to/store/image>] + all snapshots share one image head, to economize disk space + so there is only one snapshot at any time, + image is saved at </path/to/store/image>/pool_<pool_id>/image_name/image_name + cat <path/to/store/image>/pool_<pool_id>/image_name/@CURRENT, + will show snapid + recover to raw image/nosnap/head: <image_name> + rollback to image head: <image_name>@ + rollback to image snap: <image_name>@<snap_name> + recover steps: + 1. recover image nosnap (only one time) + 2. 
rollback to image snap" +} + +# print $1 rebuilt as <dirname>/<basename>; +# prints nothing when no argument is given or when $1 contains "//" +function get_path() +{ + local func="get_path" + if [ $# -lt 1 ];then + return + fi + if [[ $1 =~ // ]];then + return # "/path//to" is invalid + fi + local parent=`dirname $1` + local name=`basename $1` + if [ "$parent"x = "/"x ];then + echo "$parent$name" + else + echo -n "$parent/$name" + fi +} + +# dispatch the admin-node sub-command given as $1 and validate its arguments +function admin_cmd() +{ + local func="admin_cmd" + if [ $# -lt 1 ];then + usage + exit + fi + if [ "$1"x = "-h"x ] || [ "$1"x = "--help"x ];then + usage + exit + fi + + if [ "$1"x = "database"x ];then + if [ $# -gt 1 ];then + usage + exit + fi + # remove osd_host to refresh osd_host and osd_host_mapping + rm -f $osd_host + init_env_admin + database + elif [ "$1"x = "list"x ];then + if [ $# -gt 1 ];then + usage + exit + fi + init_env_admin + list + elif [ "$1"x = "lookup"x ];then + if [ $# -gt 2 ];then + usage + exit + fi + local pool_id=-1 + local image_name= + local snap_name= + if [[ $2 =~ ^([^@/]+)/([^@/]+)$ ]];then + pool_id="${BASH_REMATCH[1]}" + image_name="${BASH_REMATCH[2]}" + elif [[ $2 =~ ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then + pool_id="${BASH_REMATCH[1]}" + image_name="${BASH_REMATCH[2]}" + snap_name="${BASH_REMATCH[3]}" + else + echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]" + exit + fi + init_env_admin + lookup $pool_id $image_name $snap_name + elif [ "$1"x = "recover"x ];then + if [ $# -lt 2 ] || [ $# -gt 3 ];then + usage + exit + fi + local pool_id=-1 + local image_name= + local snap_name=@ + local image_dir= + if [[ $2 =~ ^([^@/]+)/([^@/]+)$ ]];then + pool_id="${BASH_REMATCH[1]}" + image_name="${BASH_REMATCH[2]}" + elif [[ $2 =~ ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then + pool_id="${BASH_REMATCH[1]}" + image_name="${BASH_REMATCH[2]}" + snap_name="${BASH_REMATCH[3]}" + if [ "$snap_name"x = ""x ];then + snap_name=@@ + fi + else + echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]" + exit + fi + if [ $# = 3 ];then + image_dir=`get_path $3` + # get_path prints nothing for a path it rejects, so an empty value means $3 is invalid + if [ "$image_dir"x = ""x ];then + echo "$3
invalid" + exit + fi + fi + init_env_admin + recover $pool_id $image_name $snap_name $image_dir + elif [ "$1"x = "scp_files"x ];then + if [ $# -gt 1 ];then + exit + fi + admin_parse_osd + scp_files + elif [ "$1"x = "scp_file"x ];then + if [ $# -gt 2 ];then + exit + fi + admin_parse_osd + scp_file $2 + else + echo "$func: $1: command not found" + fi +} + +admin_cmd $* diff --git a/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh b/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh new file mode 100755 index 00000000..876b47b9 --- /dev/null +++ b/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh @@ -0,0 +1,542 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2015 Ubuntu Kylin +# +# Author: Min Chen <minchen@ubuntukylin.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +# unit test case for rbd-recover-tool + +#prepare: +# - write config files: config/osd_host, config/mon_host, config/storage_path, config/mds_host if exist mds +#step 1. rbd export all images as you need +#step 2. stop all ceph services +#step 3. use ceph_rbd_recover_tool to recover all images +#step 4. 
compare md5sum of recover image with that of export image who has the same image name + +ssh_opt="-o ConnectTimeout=1" +my_dir=$(dirname "$0") +tool_dir=$my_dir + +#storage_path=$my_dir/config/storage_path +mon_host=$my_dir/config/mon_host +osd_host=$my_dir/config/osd_host +mds_host=$my_dir/config/mds_host + +test_dir= # `cat $storage_path` +export_dir= #$test_dir/export +recover_dir= #$test_dir/recover +image_names= #$test_dir/image_names +online_images= #$test_dir/online_images, all images on ceph rbd pool +gen_db= #$test_dir/gen_db, label database if exist +pool=rbd +pool_id=2 + +# store the 4th field of the first line of "ceph osd pool stats $pool" in the global pool_id; +# exits when the pipeline fails or produces no output +function get_pool_id() +{ + local func="get_pool_id" + local pool_id_file=/tmp/pool_id_file.$$$$ + ceph osd pool stats $pool|head -n 1|awk '{print $4}' >$pool_id_file + # $? only reflects awk, the last command of the pipeline, so an empty result file is treated as a failure too + if [ $? -ne 0 ] || [ ! -s $pool_id_file ];then + echo "$func: get pool id failed: pool = $pool" + rm -f $pool_id_file + exit + fi + pool_id=`cat $pool_id_file` + echo "$func: pool_id = $pool_id" + rm -f $pool_id_file +} + +function init() +{ + local func="init" + if [ $# -eq 0 ];then + echo "$func: must input <path> to storage images, enough disk space is good" + exit + fi + if [ ! -s $osd_host ];then + echo "$func: config/osd_host not exists or empty" + exit + fi + if [ ! -s $mon_host ];then + echo "$func: config/mon_host not exists or empty" + exit + fi + if [ !
-e $mds_host ];then + echo "$func: config/mds_host not exists" + exit + fi + test_dir=$1 + export_dir=$test_dir/export + recover_dir=$test_dir/recover + image_names=$test_dir/image_names + online_images=$test_dir/online_images + gen_db=$test_dir/gen_db + + trap 'echo "ceph cluster is stopped ..."; exit;' INT + ceph -s >/dev/null + get_pool_id + + mkdir -p $test_dir + mkdir -p $export_dir + mkdir -p $recover_dir + rm -rf $export_dir/* + rm -rf $recover_dir/* +} + +function do_gen_database() +{ + local func="do_gen_database" + if [ -s $gen_db ] && [ `cat $gen_db` = 1 ];then + echo "$func: database already existed" + exit + fi + bash $tool_dir/rbd-recover-tool database + echo 1 >$gen_db +} + +#check if all ceph processes are stopped +# returns 0 when no ceph-osd/ceph-mon/ceph-mds process is found on any host listed in +# $osd_host, $mon_host and $mds_host, 1 otherwise +function check_ceph_service() +{ + local func="check_ceph_service" + # '[:blank:]' is quoted so the shell cannot expand it as a filename pattern + local res=`cat $osd_host $mon_host $mds_host|sort -u|tr -d '[:blank:]'|xargs -n 1 -I @ ssh $ssh_opt @ "ps aux|grep -E \"(ceph-osd|ceph-mon|ceph-mds)\"|grep -v grep"` + if [ "$res"x != ""x ];then + echo "$func: NOT all ceph services are stopped" + return 1 + fi + echo "$func: all ceph services are stopped" + return 0 +} + +function stop_ceph() +{ + local func="stop_ceph" + #cat osd_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-osd" + while read osd + do + { + osd=`echo $osd|tr -d '[:blank:]'` + if [ "$osd"x = ""x ];then + continue + fi + #ssh $ssh_opt $osd "killall ceph-osd ceph-mon ceph-mds" </dev/null + ssh $ssh_opt $osd "killall ceph-osd" </dev/null + } & + done < $osd_host + wait + echo "waiting kill all osd ..."
+ sleep 1 + #cat $mon_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mon ceph-osd ceph-mds" + cat $mon_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mon" + #cat $mds_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mds ceph-mon ceph-osd" + cat $mds_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mds" +} + +function create_image() +{ + local func="create_image" + if [ ${#} -lt 3 ];then + echo "create_image: parameters: <image_name> <size> <image_format>" + exit + fi + local image_name=$1 + local size=$2 + local image_format=$3 + if [ $image_format -lt 1 ] || [ $image_format -gt 2 ];then + echo "$func: image_format must be 1 or 2" + exit + fi + local res=`rbd list|grep -E "^$1$"` + echo "$func $image_name ..." + if [ "$res"x = ""x ];then + rbd -p $pool create $image_name --size $size --image_format $image_format + else + if [ $image_format -eq 2 ];then + rbd snap ls $image_name|tail -n +2|awk '{print $2}'|xargs -n 1 -I % rbd snap unprotect $image_name@% + fi + rbd snap purge $image_name + #rbd rm $image_name + rbd -p $pool resize --allow-shrink --size $size $image_name + fi +} + +function export_image() +{ + local func="export_image" + + if [ $# -lt 2 ];then + echo "$func: parameters: <image_name> <image_format> [<image_size>]" + exit + fi + + local image_name=$1 + local format=$(($2)) + local size=$(($3)) #MB + + if [ $format -ne 1 ] && [ $format -ne 2 ];then + echo "$func: image format must be 1 or 2" + exit + fi + + if [ $size -eq 0 ];then + size=24 #MB + echo "$func: size = $size" + fi + local mnt=/rbdfuse + + mount |grep "rbd-fuse on /rbdfuse" &>/dev/null + if [ $? 
-ne 0 ];then + rbd-fuse $mnt + fi + + create_image $image_name $size $format + + dd conv=notrunc if=/dev/urandom of=$mnt/$image_name bs=4M count=$(($size/4)) + + local export_image_dir=$export_dir/pool_$pool_id/$image_name + mkdir -p $export_image_dir + local export_md5_nosnap=$export_image_dir/@md5_nosnap + >$export_md5_nosnap + + local export_image_path=$export_image_dir/$image_name + rm -f $export_image_path + + rbd export $pool/$image_name $export_image_path + md5sum $export_image_path |awk '{print $1}' >$export_md5_nosnap +} + +# recover the head of image $1 with rbd-recover-tool into $recover_dir and +# write its md5sum to $recover_dir/pool_$pool_id/$1/@md5_nosnap +function recover_image() +{ + local func="recover_image" + if [ $# -lt 1 ];then + echo "$func: parameters: <image_name>" + exit + fi + + local image_name=$1 + #pool_id=29 + + local recover_image_dir=$recover_dir/pool_$pool_id/$image_name + mkdir -p $recover_image_dir + local recover_md5_nosnap=$recover_image_dir/@md5_nosnap + >$recover_md5_nosnap + local snapshot= + + bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name $recover_dir + md5sum $recover_image_dir/$image_name|awk '{print $1}' >$recover_md5_nosnap +} + +function make_snapshot() +{ + local func="make_snapshot" + if [ $# -lt 5 ];then + echo "$func: parameters: <ofile> <seek> <count> <snap> <export_image_dir>" + exit + fi + local ofile=$1 + local seek=$(($2)) + local count=$(($3)) + local snap=$4 + local export_image_dir=$5 + + if [ $seek -lt 0 ];then + echo "$func: seek can not be minus" + exit + fi + + if [ $count -lt 1 ];then + echo "$func: count must great than zero" + exit + fi + + echo "[$snap] $func ..." + echo "$1 $2 $3 $4" + rbd snap ls $image_name|grep $snap; + + local res=$?
+ if [ $res -eq 0 ];then + return $res + fi + + dd conv=notrunc if=/dev/urandom of=$ofile bs=1M count=$count seek=$seek 2>/dev/null + snapshot=$image_name@$snap + rbd snap create $snapshot + rm -f $export_image_dir/$snapshot + rbd export $pool/$image_name $export_image_dir/$snapshot + pushd $export_image_dir >/dev/null + md5sum $snapshot >> @md5 + popd >/dev/null +} + +function recover_snapshots() +{ + local func="recover_snapshots" + if [ $# -lt 1 ];then + echo "$func: parameters: <image_name>" + exit + fi + + local image_name=$1 + #pool_id=29 + + local recover_image_dir=$recover_dir/pool_$pool_id/$image_name + mkdir -p $recover_image_dir + local recover_md5=$recover_image_dir/@md5 + >$recover_md5 + local snapshot= + + + # recover head + bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name $recover_dir + + # recover snapshots + for((i=1; i<10; i++)) + do + snapshot=snap$i + bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name@$snapshot $recover_dir + pushd $recover_image_dir >/dev/null + local chksum=`md5sum $image_name|awk '{print $1}'` + echo "$chksum $image_name@$snapshot" >>@md5 + popd >/dev/null + done +} + +function export_snapshots() +{ + local func="export_snapshots" + + if [ $# -lt 2 ];then + echo "$func: parameters: <image_name> <image_format> [<image_size>]" + exit + fi + + local image_name=$1 + local format=$(($2)) + local size=$(($3)) #MB + + if [ $format -ne 1 ] && [ $format -ne 2 ];then + echo "$func: image format must be 1 or 2" + exit + fi + + if [ $size -eq 0 ];then + size=24 #MB + echo "$func: size = $size" + fi + local mnt=/rbdfuse + + mount |grep "rbd-fuse on /rbdfuse" &>/dev/null + if [ $? 
-ne 0 ];then + rbd-fuse $mnt + fi + + create_image $image_name $size $format + + local export_image_dir=$export_dir/pool_$pool_id/$image_name + mkdir -p $export_image_dir + local export_md5=$export_image_dir/@md5 + >$export_md5 + + # create 9 snapshots + # image = {object0, object1, object2, object3, object4, object5, ...} + # + # snap1 : init/write all objects + # snap2 : write object0 + # snap3 : write object1 + # snap4 : write object2 + # snap5 : write object3 + # snap6 : write object4 + # snap7 : write object5 + # snap8 : write object0 + # snap9 : write object3 + + make_snapshot $mnt/$image_name 0 $size snap1 $export_image_dir + make_snapshot $mnt/$image_name 0 1 snap2 $export_image_dir + make_snapshot $mnt/$image_name 4 1 snap3 $export_image_dir + make_snapshot $mnt/$image_name 8 1 snap4 $export_image_dir + make_snapshot $mnt/$image_name 12 1 snap5 $export_image_dir + make_snapshot $mnt/$image_name 16 1 snap6 $export_image_dir + make_snapshot $mnt/$image_name 20 1 snap7 $export_image_dir + make_snapshot $mnt/$image_name 1 1 snap8 $export_image_dir + make_snapshot $mnt/$image_name 13 1 snap9 $export_image_dir +} + +# compare the checksum stored in file $1 (export) with the one in file $2 (recover) +# and print both with PASSED/FAILED for image $3 +function check_recover_nosnap() +{ + local func="check_recover_nosnap" + if [ $# -lt 3 ];then + echo "$func: parameters: <export_md5_file> <recover_md5_file> <image_name>" + # stop here: with missing arguments the cat calls below would have no file operand + exit + fi + local export_md5=$1 + local recover_md5=$2 + local image_name=$3 + + local ifpassed="FAILED" + + echo "================ < $image_name nosnap > ================" + + local export_md5sum=`cat $export_md5` + local recover_md5sum=`cat $recover_md5` + + if [ "$export_md5sum"x != ""x ] && [ "$export_md5sum"x = "$recover_md5sum"x ];then + ifpassed="PASSED" + fi + echo "export: $export_md5sum" + echo "recover: $recover_md5sum $ifpassed" +} + +function check_recover_snapshots() +{ + local func="check_recover_snapshots" + if [ $# -lt 3 ];then + echo "$func: parameters: <export_md5_file> <recover_md5_file> <image_name>" + # stop here: with missing arguments the cat calls below would have no file operand + exit + fi + local export_md5=$1 + local recover_md5=$2 + local
image_name=$3 + + local ifpassed="FAILED" + + echo "================ < $image_name snapshots > ================" + + OIFS=$IFS + IFS=$'\n' + local export_md5s=(`cat $export_md5`) + local recover_md5s=(`cat $recover_md5`) + for((i=0; i<9; i++)) + do + OOIFS=$IFS + IFS=$' ' + local x=$(($i+1)) + snapshot=snap$x + + local export_arr=(`echo ${export_md5s[$i]}`) + local recover_arr=(`echo ${recover_md5s[$i]}`) + echo "export: ${export_md5s[$i]}" + if [ "${export_arr[1]}"x != ""x ] && [ "${export_arr[1]}"x = "${recover_arr[1]}"x ];then + ifpassed="PASSED" + fi + echo "recover: ${recover_md5s[$i]} $ifpassed" + IFS=$OOIFS + done + IFS=$OIFS +} + +# step 1: export image, snapshot +function do_export_nosnap() +{ + export_image image_v1_nosnap 1 + export_image image_v2_nosnap 2 +} + +function do_export_snap() +{ + export_snapshots image_v1_snap 1 + export_snapshots image_v2_snap 2 +} + +# step 2: stop ceph cluster and gen database +function stop_cluster_gen_database() +{ + trap 'echo stop ceph cluster failed; exit;' INT HUP + stop_ceph + sleep 2 + check_ceph_service + local res=$? + while [ $res -ne 0 ] + do + stop_ceph + sleep 2 + check_ceph_service + res=$? 
+ done + + echo 0 >$gen_db + do_gen_database +} + +# step 3: recover image,snapshot +function do_recover_nosnap() +{ + recover_image image_v1_nosnap + recover_image image_v2_nosnap +} + +function do_recover_snap() +{ + recover_snapshots image_v1_snap + recover_snapshots image_v2_snap +} + +# step 4: check md5sum pair<export_md5sum, recover_md5sum> +function do_check_recover_nosnap() +{ + local image1=image_v1_nosnap + local image2=image_v2_nosnap + + local export_md5_1=$export_dir/pool_$pool_id/$image1/@md5_nosnap + local export_md5_2=$export_dir/pool_$pool_id/$image2/@md5_nosnap + local recover_md5_1=$recover_dir/pool_$pool_id/$image1/@md5_nosnap + local recover_md5_2=$recover_dir/pool_$pool_id/$image2/@md5_nosnap + + check_recover_nosnap $export_md5_1 $recover_md5_1 $image1 + check_recover_nosnap $export_md5_2 $recover_md5_2 $image2 +} + +function do_check_recover_snap() +{ + local image1=image_v1_snap + local image2=image_v2_snap + + local export_md5_1=$export_dir/pool_$pool_id/$image1/@md5 + local export_md5_2=$export_dir/pool_$pool_id/$image2/@md5 + local recover_md5_1=$recover_dir/pool_$pool_id/$image1/@md5 + local recover_md5_2=$recover_dir/pool_$pool_id/$image2/@md5 + + check_recover_snapshots $export_md5_1 $recover_md5_1 $image1 + check_recover_snapshots $export_md5_2 $recover_md5_2 $image2 +} + +function test_case_1() +{ + do_export_nosnap + stop_cluster_gen_database + do_recover_nosnap + do_check_recover_nosnap +} + +function test_case_2() +{ + do_export_snap + stop_cluster_gen_database + do_recover_snap + do_check_recover_snap +} + +function test_case_3() +{ + do_export_nosnap + do_export_snap + + stop_cluster_gen_database + + do_recover_nosnap + do_recover_snap + + do_check_recover_nosnap + do_check_recover_snap +} + + +init $* +test_case_3 diff --git a/src/tools/rebuild_mondb.cc b/src/tools/rebuild_mondb.cc new file mode 100644 index 00000000..8e3d5b45 --- /dev/null +++ b/src/tools/rebuild_mondb.cc @@ -0,0 +1,351 @@ +#include 
#include "auth/cephx/CephxKeyServer.h"
#include "common/errno.h"
#include "mon/AuthMonitor.h"
#include "mon/MonitorDBStore.h"
#include "os/ObjectStore.h"
#include "osd/OSD.h"

// Helpers defined further down; each one rebuilds one key prefix of the
// monitor store from what a single OSD knows.
static int update_auth(const string& keyring_path,
                       const OSDSuperblock& sb,
                       MonitorDBStore& ms);
static int update_monitor(const OSDSuperblock& sb, MonitorDBStore& ms);
static int update_osdmap(ObjectStore& fs,
                         OSDSuperblock& sb,
                         MonitorDBStore& ms);

// Entry point: open the monitor store at `store_path` via
// MonitorDBStore::create_and_open() and run update_auth(), update_osdmap()
// and update_monitor() in that order.
//
// @param fs          object store the osdmaps are read from
// @param sb          OSD superblock (whoami, cluster_fsid, oldest/newest map)
// @param keyring     path of the keyring file handed to update_auth()
// @param store_path  path of the monitor store
// @return the negative value of the first step that fails (the remaining
//         steps are skipped), otherwise the result of update_monitor().
//         The store is closed on every path once it has been opened.
int update_mon_db(ObjectStore& fs, OSDSuperblock& sb,
                  const string& keyring,
                  const string& store_path)
{
  MonitorDBStore ms(store_path);
  int r = ms.create_and_open(cerr);
  if (r < 0) {
    cerr << "unable to open mon store: " << store_path << std::endl;
    return r;
  }
  if ((r = update_auth(keyring, sb, ms)) < 0) {
    goto out;
  }
  if ((r = update_osdmap(fs, sb, ms)) < 0) {
    goto out;
  }
  if ((r = update_monitor(sb, ms)) < 0) {
    goto out;
  }
 out:
  ms.close();
  return r;
}

// Wrap `auth_inc` in an AuthMonitor::Incremental of type AUTH_DATA, encode it
// behind a one-byte version (1) and store it under the "auth" prefix as
// version last_committed + 1. "last_committed" is bumped in the same
// transaction; "first_committed" is written only when the store reports 0
// for it. The result of apply_transaction() is not checked.
static void add_auth(KeyServerData::Incremental& auth_inc,
                     MonitorDBStore& ms)
{
  AuthMonitor::Incremental inc;
  inc.inc_type = AuthMonitor::AUTH_DATA;
  encode(auth_inc, inc.auth_data);
  inc.auth_type = CEPH_AUTH_CEPHX;

  bufferlist bl;
  __u8 v = 1;
  encode(v, bl);
  inc.encode(bl, CEPH_FEATURES_ALL);

  const string prefix("auth");
  auto last_committed = ms.get(prefix, "last_committed") + 1;
  auto t = make_shared<MonitorDBStore::Transaction>();
  t->put(prefix, last_committed, bl);
  t->put(prefix, "last_committed", last_committed);
  auto first_committed = ms.get(prefix, "first_committed");
  if (!first_committed) {
    t->put(prefix, "first_committed", last_committed);
  }
  ms.apply_transaction(t);
}

// Fill `auth_inc` with an AUTH_INC_ADD entry for this OSD, taking key and
// caps from the keyring file at `keyring_path`.
//
// @return 0 on success, and also 0 when the keyring file does not exist or
//         is empty (a message is printed and auth_inc keeps only op and
//         name); the read error for any other read failure; -EINVAL when
//         the keyring cannot be decoded or has no entry for this OSD.
// NOTE(review): the "ignored keyring" cases return 0, so update_auth() still
// calls add_auth() with an entry that has no key -- confirm this is intended.
static int get_auth_inc(const string& keyring_path,
                        const OSDSuperblock& sb,
                        KeyServerData::Incremental* auth_inc)
{
  auth_inc->op = KeyServerData::AUTH_INC_ADD;

  // get the name
  EntityName entity;
  // assuming the entity name of OSD is "osd.<osd_id>"
  entity.set(CEPH_ENTITY_TYPE_OSD, std::to_string(sb.whoami));
  auth_inc->name = entity;

  // read keyring from disk
  KeyRing keyring;
  {
    bufferlist bl;
    string error;
    int r = bl.read_file(keyring_path.c_str(), &error);
    if (r < 0) {
      if (r == -ENOENT) {
        cout << "ignoring keyring (" << keyring_path << ")"
             << ": " << error << std::endl;
        return 0;
      } else {
        cerr << "unable to read keyring (" << keyring_path << ")"
             << ": " << error << std::endl;
        return r;
      }
    } else if (bl.length() == 0) {
      cout << "ignoring empty keyring: " << keyring_path << std::endl;
      return 0;
    }
    auto bp = bl.cbegin();
    try {
      decode(keyring, bp);
    } catch (const buffer::error& e) {
      cerr << "error decoding keyring: " << keyring_path << std::endl;
      return -EINVAL;
    }
  }

  // get the key
  EntityAuth new_inc;
  if (!keyring.get_auth(auth_inc->name, new_inc)) {
    cerr << "key for " << auth_inc->name << " not found in keyring: "
         << keyring_path << std::endl;
    return -EINVAL;
  }
  auth_inc->auth.key = new_inc.key;

  // get the caps
  map<string,bufferlist> caps;
  if (new_inc.caps.empty()) {
    // fallback to default caps for an OSD
    //   osd 'allow *' mon 'allow rwx'
    // as suggested by document.
    encode(string("allow *"), caps["osd"]);
    encode(string("allow rwx"), caps["mon"]);
  } else {
    caps = new_inc.caps;
  }
  auth_inc->auth.caps = caps;
  return 0;
}

// rebuild
//  - auth/${epoch}
//  - auth/first_committed
//  - auth/last_committed
//
// Builds the incremental with get_auth_inc() and, when that returns 0,
// stores it with add_auth(). Any non-zero result of get_auth_inc() is
// returned unchanged.
static int update_auth(const string& keyring_path,
                       const OSDSuperblock& sb,
                       MonitorDBStore& ms)
{
  // stolen from AuthMonitor::prepare_command(), where prefix is "auth add"
  KeyServerData::Incremental auth_inc;
  int r;
  if ((r = get_auth_inc(keyring_path, sb, &auth_inc))) {
    return r;
  }
  add_auth(auth_inc, ms);
  return 0;
}

// stolen from Monitor::check_fsid()
//
// Compare `fsid` with the value stored under monitor/cluster_uuid (the text
// up to the first '\n').
//
// @return -ENOENT when ms.get() reports -ENOENT, -EINVAL when the stored
//         text does not parse as a uuid, -EEXIST when it parses but differs
//         from `fsid`, 0 when both match.
// NOTE(review): only -ENOENT from ms.get() is checked; any other result
// continues into the parse step with whatever `bl` holds.
static int check_fsid(const uuid_d& fsid, MonitorDBStore& ms)
{
  bufferlist bl;
  int r = ms.get("monitor", "cluster_uuid", bl);
  if (r == -ENOENT)
    return r;
  string uuid(bl.c_str(), bl.length());
  auto end = uuid.find_first_of('\n');
  if (end != uuid.npos) {
    uuid.resize(end);
  }
  uuid_d existing;
  if (!existing.parse(uuid.c_str())) {
    cerr << "error: unable to parse uuid" << std::endl;
    return -EINVAL;
  }
  if (fsid != existing) {
    cerr << "error: cluster_uuid " << existing << " != " << fsid << std::endl;
    return -EEXIST;
  }
  return 0;
}

// rebuild
//  - monitor/cluster_uuid
//
// Writes "<cluster_fsid>\n" under monitor/cluster_uuid only when
// check_fsid() reports -ENOENT. A matching uuid returns 0 without writing;
// -EINVAL and -EEXIST are passed through; any other result aborts.
int update_monitor(const OSDSuperblock& sb, MonitorDBStore& ms)
{
  switch (check_fsid(sb.cluster_fsid, ms)) {
  case -ENOENT:
    break;
  case -EINVAL:
    return -EINVAL;
  case -EEXIST:
    return -EEXIST;
  case 0:
    return 0;
  default:
    ceph_abort();
  }
  string uuid = stringify(sb.cluster_fsid) + "\n";
  bufferlist bl;
  bl.append(uuid);
  auto t = make_shared<MonitorDBStore::Transaction>();
  t->put("monitor", "cluster_uuid", bl);
  ms.apply_transaction(t);
  return 0;
}

// rebuild
//  - osdmap/${epoch}
//  - osdmap/full_${epoch}
//  - osdmap/full_latest
//  - osdmap/first_committed
//  - osdmap/last_committed
//
// Copies the OSD's incremental and full osdmaps for the epochs
// max(last_committed + 1, sb.oldest_map) .. sb.newest_map from `fs` into the
// monitor store, after erasing stored epochs older than sb.oldest_map.
//
// @return 0 on success; -EINVAL when a full map is missing or when an
//         fsid/CRC consistency check fails. A missing incremental map is
//         tolerated.
int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms)
{
  const string prefix("osdmap");
  const string first_committed_name("first_committed");
  const string last_committed_name("last_committed");
  epoch_t first_committed = ms.get(prefix, first_committed_name);
  epoch_t last_committed = ms.get(prefix, last_committed_name);
  auto t = make_shared<MonitorDBStore::Transaction>();

  // trim stale maps
  unsigned ntrimmed = 0;
  // osdmap starts at 1. if we have a "0" first_committed, then there is nothing
  // to trim. and "1 osdmaps trimmed" in the output message is misleading. so
  // let's make it an exception.
  for (auto e = first_committed; first_committed && e < sb.oldest_map; e++) {
    t->erase(prefix, e);
    t->erase(prefix, ms.combine_strings("full", e));
    ntrimmed++;
  }
  // make sure we have a non-zero first_committed. OSDMonitor relies on this.
  // because PaxosService::put_last_committed() set it to last_committed, if it
  // is zero. which breaks OSDMonitor::update_from_paxos(), in which we believe
  // that latest_full should always be greater than last_committed.
  if (first_committed == 0 && sb.oldest_map < sb.newest_map) {
    first_committed = 1;
  } else if (ntrimmed) {
    first_committed += ntrimmed;
  }
  if (first_committed) {
    // the trim and the new first_committed go out in their own transaction
    t->put(prefix, first_committed_name, first_committed);
    ms.apply_transaction(t);
    t = make_shared<MonitorDBStore::Transaction>();
  }

  unsigned nadded = 0;

  auto ch = fs.open_collection(coll_t::meta());
  // `osdmap` persists across iterations: each incremental is applied on top
  // of the full map decoded in the previous iteration.
  OSDMap osdmap;
  for (auto e = std::max(last_committed+1, sb.oldest_map);
       e <= sb.newest_map; e++) {
    bool have_crc = false;
    uint32_t crc = -1;
    uint64_t features = 0;
    // add inc maps
    // The lambda runs immediately; it records the incremental in `t` and,
    // through the by-reference captures, sets features/have_crc/crc for the
    // full-map check below.
    auto add_inc_result = [&] {
      const auto oid = OSD::get_inc_osdmap_pobject_name(e);
      bufferlist bl;
      int nread = fs.read(ch, oid, 0, 0, bl);
      if (nread <= 0) {
        cout << "missing " << oid << std::endl;
        return -ENOENT;
      }
      t->put(prefix, e, bl);

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);
      features = inc.encode_features | CEPH_FEATURE_RESERVED;
      // only apply when a previous full map has been decoded into `osdmap`
      if (osdmap.get_epoch() && e > 1) {
        if (osdmap.apply_incremental(inc)) {
          cerr << "bad fsid: "
               << osdmap.get_fsid() << " != " << inc.fsid << std::endl;
          return -EINVAL;
        }
        have_crc = inc.have_crc;
        if (inc.have_crc) {
          crc = inc.full_crc;
          bufferlist fbl;
          // the CRC is read back right after re-encoding the map
          osdmap.encode(fbl, features);
          if (osdmap.get_crc() != inc.full_crc) {
            cerr << "mismatched inc crc: "
                 << osdmap.get_crc() << " != " << inc.full_crc << std::endl;
            return -EINVAL;
          }
          // inc.decode() verifies `inc_crc`, so it's been taken care of.
        }
      }
      return 0;
    }();
    switch (add_inc_result) {
    case -ENOENT:
      // no worries, we always have full map
      break;
    case -EINVAL:
      return -EINVAL;
    case 0:
      break;
    default:
      assert(0);
    }
    // add full maps
    {
      const auto oid = OSD::get_osdmap_pobject_name(e);
      bufferlist bl;
      int nread = fs.read(ch, oid, 0, 0, bl);
      if (nread <= 0) {
        cerr << "missing " << oid << std::endl;
        return -EINVAL;
      }
      t->put(prefix, ms.combine_strings("full", e), bl);

      auto p = bl.cbegin();
      osdmap.decode(p);
      if (osdmap.have_crc()) {
        // the full map must agree with the CRC announced by the incremental
        if (have_crc && osdmap.get_crc() != crc) {
          cerr << "mismatched full/inc crc: "
               << osdmap.get_crc() << " != " << crc << std::endl;
          return -EINVAL;
        }
        // ... and re-encoding it must give back the CRC it was decoded with
        uint32_t saved_crc = osdmap.get_crc();
        bufferlist fbl;
        osdmap.encode(fbl, features);
        if (osdmap.get_crc() != saved_crc) {
          cerr << "mismatched full crc: "
               << saved_crc << " != " << osdmap.get_crc() << std::endl;
          return -EINVAL;
        }
      }
    }
    nadded++;

    // last_committed
    t->put(prefix, last_committed_name, e);
    // full last
    t->put(prefix, ms.combine_strings("full", "latest"), e);

    // this number comes from the default value of osd_target_transaction_size,
    // so we won't OOM or stuff too many maps in a single transaction if OSD is
    // keeping a large series of osdmap
    static constexpr unsigned TRANSACTION_SIZE = 30;
    if (t->size() >= TRANSACTION_SIZE) {
      ms.apply_transaction(t);
      t = make_shared<MonitorDBStore::Transaction>();
    }
  }
  // flush whatever is left in the last, partially filled transaction
  if (!t->empty()) {
    ms.apply_transaction(t);
  }
  t.reset();

  string osd_name("osd.");
  osd_name += std::to_string(sb.whoami);
  cout << std::left << setw(8)
       << osd_name << ": "
       << ntrimmed << " osdmaps trimmed, "
       << nadded << " osdmaps added." << std::endl;
  return 0;
}
<< std::endl; + return 0; +} + diff --git a/src/tools/rebuild_mondb.h b/src/tools/rebuild_mondb.h new file mode 100644 index 00000000..8a2317d8 --- /dev/null +++ b/src/tools/rebuild_mondb.h @@ -0,0 +1,9 @@ +#pragma once +#include <string> + +class ObjectStore; +class OSDSuperblock; + +int update_mon_db(ObjectStore& fs, OSDSuperblock& sb, + const std::string& keyring_path, + const std::string& store_path); diff --git a/src/tools/rgw/parse-cr-dump.py b/src/tools/rgw/parse-cr-dump.py new file mode 100755 index 00000000..539929b1 --- /dev/null +++ b/src/tools/rgw/parse-cr-dump.py @@ -0,0 +1,168 @@ +#!/usr/bin/python +from __future__ import print_function +from collections import Counter +import argparse +import json +import re +import sys + +def gen_mgrs(args, cr_dump): + """ traverse and return one manager at a time """ + mgrs = cr_dump['coroutine_managers'] + if args.manager is not None: + yield mgrs[args.manager] + else: + for mgr in mgrs: + yield mgr + +def gen_stacks(args, cr_dump): + """ traverse and return one stack at a time """ + for mgr in gen_mgrs(args, cr_dump): + for ctx in mgr['run_contexts']: + for stack in ctx['entries']: + yield stack + +def gen_ops(args, cr_dump): + """ traverse and return one op at a time """ + for stack in gen_stacks(args, cr_dump): + for op in stack['ops']: + yield stack, op + +def op_status(op): + """ return op status or (none) """ + # "status": {"status": "...", "timestamp": "..."} + return op.get('status', {}).get('status', '(none)') + +def do_crs(args, cr_dump): + """ print a sorted list of coroutines """ + counter = Counter() + + if args.group == 'status': + print('Count:\tStatus:') + for _, op in gen_ops(args, cr_dump): + if args.filter and not re.search(args.filter, op['type']): + continue + counter[op_status(op)] += 1 + else: + print('Count:\tCoroutine:') + for _, op in gen_ops(args, cr_dump): + name = op['type'] + if args.filter and not re.search(args.filter, name): + continue + counter[name] += 1 + + crs = 
counter.most_common(); + + if args.order == 'asc': + crs.reverse() + if args.limit: + crs = crs[:args.limit] + + for op in crs: + print('%d\t%s' % (op[1], op[0])) + print('Total:', sum(counter.values())) + return 0 + +def match_ops(name, ops): + """ return true if any op matches the given filter """ + for op in ops: + if re.search(name, op): + return True + return False + +def do_stacks(args, cr_dump): + """ print a list of coroutine stacks """ + print('Stack:\t\tCoroutines:') + count = 0 + for stack in gen_stacks(args, cr_dump): + stack_id = stack['stack'] + ops = [op['type'] for op in stack['ops']] + if args.filter and not match_ops(args.filter, ops): + continue + if args.limit and count == args.limit: + print('...') + break + print('%s\t%s' % (stack_id, ', '.join(ops))) + count += 1 + print('Total:', count) + return 0 + +def traverse_spawned_stacks(args, stack, depth, stacks, callback): + """ recurse through spawned stacks, passing each op to the callback """ + for op in stack['ops']: + # only filter ops in base stack + if depth == 0 and args.filter and not re.search(args.filter, op['type']): + continue + if not callback(stack, op, depth): + return False + for spawned in op.get('spawned', []): + s = stacks.get(spawned) + if not s: + continue + if not traverse_spawned_stacks(args, s, depth + 1, stacks, callback): + return False + return True + +def do_stack(args, cr_dump): + """ inspect a given stack and its descendents """ + # build a lookup table of stacks by id + stacks = {s['stack']: s for s in gen_stacks(args, cr_dump)} + + stack = stacks.get(args.stack) + if not stack: + print('Stack %s not found' % args.stack, file=sys.stderr) + return 1 + + do_stack.count = 0 # for use in closure + def print_stack_op(stack, op, depth): + indent = ' ' * depth * 4 + if args.limit and do_stack.count == args.limit: + print('%s...' 
% indent) + return False # stop traversal + do_stack.count += 1 + print('%s[%s] %s: %s' % (indent, stack['stack'], op['type'], op_status(op))) + return True + + traverse_spawned_stacks(args, stack, 0, stacks, print_stack_op) + return 0 + +def do_spawned(args, cr_dump): + """ search all ops for the given spawned stack """ + for stack, op in gen_ops(args, cr_dump): + if args.stack in op.get('spawned', []): + print('Stack %s spawned by [%s] %s' % (args.stack, stack['stack'], op['type'])) + return 0 + print('Stack %s not spawned' % args.stack, file=sys.stderr) + return 1 + +def main(): + parser = argparse.ArgumentParser(description='Parse and inspect the output of the "cr dump" admin socket command.') + parser.add_argument('--filename', type=argparse.FileType(), default=sys.stdin, help='Input filename (or stdin if empty)') + parser.add_argument('--filter', type=str, help='Filter by coroutine type (regex syntax is supported)') + parser.add_argument('--limit', type=int) + parser.add_argument('--manager', type=int, help='Index into coroutine_managers[]') + + subparsers = parser.add_subparsers() + + crs_parser = subparsers.add_parser('crs', help='Produce a sorted list of coroutines') + crs_parser.add_argument('--group', type=str, choices=['type', 'status']) + crs_parser.add_argument('--order', type=str, choices=['desc', 'asc']) + crs_parser.set_defaults(func=do_crs) + + stacks_parser = subparsers.add_parser('stacks', help='Produce a list of coroutine stacks and their ops') + stacks_parser.set_defaults(func=do_stacks) + + stack_parser = subparsers.add_parser('stack', help='Inspect a given coroutine stack') + stack_parser.add_argument('stack', type=str) + stack_parser.set_defaults(func=do_stack) + + spawned_parser = subparsers.add_parser('spawned', help='Find the op that spawned the given stack') + spawned_parser.add_argument('stack', type=str) + spawned_parser.set_defaults(func=do_spawned) + + args = parser.parse_args() + return args.func(args, json.load(args.filename)) + 
+if __name__ == "__main__": + result = main() + sys.exit(result) diff --git a/src/tools/scratchtool.c b/src/tools/scratchtool.c new file mode 100644 index 00000000..899447ec --- /dev/null +++ b/src/tools/scratchtool.c @@ -0,0 +1,319 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/rados/librados.h" + +#include <assert.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +static int do_rados_setxattr(rados_ioctx_t io_ctx, const char *oid, + const char *key, const char *val) +{ + int ret = rados_setxattr(io_ctx, oid, key, val, strlen(val) + 1); + if (ret < 0) { + printf("rados_setxattr failed with error %d\n", ret); + return 1; + } + printf("rados_setxattr %s=%s\n", key, val); + return 0; +} + +static int do_rados_getxattr(rados_ioctx_t io_ctx, const char *oid, + const char *key, const char *expected) +{ + size_t blen = strlen(expected) + 1; + char buf[blen]; + memset(buf, 0, sizeof(buf)); + int r = rados_getxattr(io_ctx, oid, key, buf, blen); + if (r < 0) { + printf("rados_getxattr(%s) failed with error %d\n", key, r); + return 1; + } + if (strcmp(buf, expected) != 0) { + printf("rados_getxattr(%s) got wrong result! " + "expected: '%s'. 
got '%s'\n", key, expected, buf); + return 1; + } + printf("rados_getxattr %s=%s\n", key, buf); + return 0; +} + +static int do_rados_getxattrs(rados_ioctx_t io_ctx, const char *oid, + const char **exkeys, const char **exvals) +{ + rados_xattrs_iter_t iter; + int nval = 0, i, nfound = 0, r = 0, ret = 1; + + for (i = 0; exvals[i]; ++i) { + ++nval; + } + r = rados_getxattrs(io_ctx, oid, &iter); + if (r) { + printf("rados_getxattrs(%s) failed with error %d\n", oid, r); + return 1; + } + while (1) { + size_t len; + const char *key, *val; + r = rados_getxattrs_next(iter, &key, &val, &len); + if (r) { + printf("rados_getxattrs(%s): rados_getxattrs_next " + "returned error %d\n", oid, r); + goto out_err; + } + if (!key) + break; + for (i = 0; i < nval; ++i) { + if (strcmp(exkeys[i], key)) + continue; + if ((len == strlen(exvals[i]) + 1) && (val != NULL) && (!strcmp(exvals[i], val))) { + nfound++; + break; + } + printf("rados_getxattrs(%s): got key %s, but the " + "value was %s rather than %s.\n", + oid, key, val, exvals[i]); + goto out_err; + } + } + if (nfound != nval) { + printf("rados_getxattrs(%s): only found %d extended attributes. " + "Expected %d\n", oid, nfound, nval); + goto out_err; + } + ret = 0; + printf("rados_getxattrs(%s)\n", oid); + +out_err: + rados_getxattrs_end(iter); + return ret; +} + +static int testrados(void) +{ + char tmp[32]; + int i, r; + int ret = 1; //set 1 as error case + rados_t cl; + + if (rados_create(&cl, NULL) < 0) { + printf("error initializing\n"); + return 1; + } + + if (rados_conf_read_file(cl, NULL)) { + printf("error reading configuration file\n"); + goto out_err; + } + + // Try to set a configuration option that doesn't exist. + // This should fail. 
+ if (!rados_conf_set(cl, "config option that doesn't exist", + "some random value")) { + printf("error: succeeded in setting nonexistent config option\n"); + goto out_err; + } + + if (rados_conf_get(cl, "log to stderr", tmp, sizeof(tmp))) { + printf("error: failed to read log_to_stderr from config\n"); + goto out_err; + } + + // Can we change it? + if (rados_conf_set(cl, "log to stderr", "true")) { + printf("error: error setting log_to_stderr\n"); + goto out_err; + } + if (rados_conf_get(cl, "log to stderr", tmp, sizeof(tmp))) { + printf("error: failed to read log_to_stderr from config\n"); + goto out_err; + } + if (strcmp(tmp, "true")) { + printf("error: new setting for log_to_stderr failed to take effect.\n"); + goto out_err; + } + + if (rados_connect(cl)) { + printf("error connecting\n"); + goto out_err; + } + if (rados_connect(cl) == 0) { + printf("second connect attempt didn't return an error\n"); + goto out_err; + } + + /* create an io_ctx */ + r = rados_pool_create(cl, "foo"); + printf("rados_pool_create = %d\n", r); + + rados_ioctx_t io_ctx; + r = rados_ioctx_create(cl, "foo", &io_ctx); + if (r < 0) { + printf("error creating ioctx\n"); + goto out_err; + } + printf("rados_ioctx_create = %d, io_ctx = %p\n", r, io_ctx); + + /* list all pools */ + { + int buf_sz = rados_pool_list(cl, NULL, 0); + printf("need buffer size of %d\n", buf_sz); + char buf[buf_sz]; + int r = rados_pool_list(cl, buf, buf_sz); + if (r != buf_sz) { + printf("buffer size mismatch: got %d the first time, but %d " + "the second.\n", buf_sz, r); + goto out_err_cleanup; + } + const char *b = buf; + printf("begin pools.\n"); + while (1) { + if (b[0] == '\0') + break; + printf(" pool: '%s'\n", b); + b += strlen(b) + 1; + }; + printf("end pools.\n"); + } + + + /* stat */ + struct rados_pool_stat_t st; + r = rados_ioctx_pool_stat(io_ctx, &st); + printf("rados_ioctx_pool_stat = %d, %lld KB, %lld objects\n", r, (long long)st.num_kb, (long long)st.num_objects); + + /* snapshots */ + r = 
rados_ioctx_snap_create(io_ctx, "snap1"); + printf("rados_ioctx_snap_create snap1 = %d\n", r); + rados_snap_t snaps[10]; + r = rados_ioctx_snap_list(io_ctx, snaps, 10); + for (i=0; i<r; i++) { + char name[100]; + rados_ioctx_snap_get_name(io_ctx, snaps[i], name, sizeof(name)); + printf("rados_ioctx_snap_list got snap %lld %s\n", (long long)snaps[i], name); + } + rados_snap_t snapid; + r = rados_ioctx_snap_lookup(io_ctx, "snap1", &snapid); + printf("rados_ioctx_snap_lookup snap1 got %lld, result %d\n", (long long)snapid, r); + r = rados_ioctx_snap_remove(io_ctx, "snap1"); + printf("rados_ioctx_snap_remove snap1 = %d\n", r); + + /* sync io */ + time_t tm; + char buf[128], buf2[128]; + time(&tm); + snprintf(buf, 128, "%s", ctime(&tm)); + const char *oid = "foo_object"; + r = rados_write(io_ctx, oid, buf, strlen(buf) + 1, 0); + printf("rados_write = %d\n", r); + r = rados_read(io_ctx, oid, buf2, sizeof(buf2), 0); + printf("rados_read = %d\n", r); + if (memcmp(buf, buf2, r)) + printf("*** content mismatch ***\n"); + + /* attrs */ + if (do_rados_setxattr(io_ctx, oid, "b", "2")) + goto out_err_cleanup; + if (do_rados_setxattr(io_ctx, oid, "a", "1")) + goto out_err_cleanup; + if (do_rados_setxattr(io_ctx, oid, "c", "3")) + goto out_err_cleanup; + if (do_rados_getxattr(io_ctx, oid, "a", "1")) + goto out_err_cleanup; + if (do_rados_getxattr(io_ctx, oid, "b", "2")) + goto out_err_cleanup; + if (do_rados_getxattr(io_ctx, oid, "c", "3")) + goto out_err_cleanup; + const char *exkeys[] = { "a", "b", "c", NULL }; + const char *exvals[] = { "1", "2", "3", NULL }; + if (do_rados_getxattrs(io_ctx, oid, exkeys, exvals)) + goto out_err_cleanup; + + uint64_t size; + time_t mtime; + r = rados_stat(io_ctx, oid, &size, &mtime); + printf("rados_stat size = %lld mtime = %d = %d\n", (long long)size, (int)mtime, r); + r = rados_stat(io_ctx, "does_not_exist", NULL, NULL); + printf("rados_stat(does_not_exist) = %d\n", r); + + /* exec */ + rados_exec(io_ctx, oid, "crypto", "md5", buf, strlen(buf) 
+ 1, buf, 128); + printf("exec result=%s\n", buf); + r = rados_read(io_ctx, oid, buf2, 128, 0); + printf("read result=%s\n", buf2); + printf("size=%d\n", r); + + /* aio */ + rados_completion_t a, b; + rados_aio_create_completion(0, 0, 0, &a); + rados_aio_create_completion(0, 0, 0, &b); + rados_aio_write(io_ctx, "a", a, buf, 100, 0); + rados_aio_write(io_ctx, "../b/bb_bb_bb\\foo\\bar", b, buf, 100, 0); + rados_aio_wait_for_safe(a); + printf("a safe\n"); + rados_aio_wait_for_safe(b); + printf("b safe\n"); + rados_aio_release(a); + rados_aio_release(b); + + /* test flush */ + printf("testing aio flush\n"); + rados_completion_t c; + rados_aio_create_completion(0, 0, 0, &c); + rados_aio_write(io_ctx, "c", c, buf, 100, 0); + int safe = rados_aio_is_safe(c); + printf("a should not yet be safe and ... %s\n", safe ? "is":"is not"); + assert(!safe); + rados_aio_flush(io_ctx); + safe = rados_aio_is_safe(c); + printf("a should be safe and ... %s\n", safe ? "is":"is not"); + assert(safe); + rados_aio_release(c); + + rados_read(io_ctx, "../b/bb_bb_bb\\foo\\bar", buf2, 128, 0); + + /* list objects */ + rados_list_ctx_t h; + r = rados_nobjects_list_open(io_ctx, &h); + printf("rados_nobjects_list_open = %d, h = %p\n", r, h); + const char *poolname; + while (rados_nobjects_list_next2(h, &poolname, NULL, NULL, NULL, NULL, NULL) == 0) + printf("rados_nobjects_list_next2 got object '%s'\n", poolname); + rados_nobjects_list_close(h); + + /* stat */ + r = rados_ioctx_pool_stat(io_ctx, &st); + printf("rados_stat_pool = %d, %lld KB, %lld objects\n", r, (long long)st.num_kb, (long long)st.num_objects); + + ret = 0; + +out_err_cleanup: + /* delete a pool */ + rados_ioctx_destroy(io_ctx); + + r = rados_pool_delete(cl, "foo"); + printf("rados_delete_pool = %d\n", r); + +out_err: + rados_shutdown(cl); + return ret; +} + +int main(int argc, const char **argv) +{ + return testrados(); +} diff --git a/src/tools/scratchtoolpp.cc b/src/tools/scratchtoolpp.cc new file mode 100644 index 
00000000..26a35beb --- /dev/null +++ b/src/tools/scratchtoolpp.cc @@ -0,0 +1,293 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/types.h" +#include "include/rados/librados.hpp" + +using namespace librados; + +#include <iostream> + +#include <errno.h> +#include <stdlib.h> +#include <time.h> + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +void buf_to_hex(const unsigned char *buf, int len, char *str) +{ + str[0] = '\0'; + for (int i = 0; i < len; i++) { + sprintf(&str[i*2], "%02x", (int)buf[i]); + } +} + +class C_Watch : public WatchCtx { +public: + C_Watch() {} + void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) override { + cout << "C_Watch::notify() opcode=" << (int)opcode << " ver=" << ver << std::endl; + } +}; + +void testradospp_milestone(void) +{ + int c; + cout << "*** press enter to continue ***" << std::endl; + while ((c = getchar()) != EOF) { + if (c == '\n') + break; + } +} + +int main(int argc, const char **argv) +{ + Rados rados; + if (rados.init(NULL) < 0) { + cerr << "couldn't initialize rados!" << std::endl; + exit(1); + } + + if (rados.conf_read_file(NULL)) { + cerr << "couldn't read configuration file." 
<< std::endl; + exit(1); + } + rados.conf_parse_argv(argc, argv); + + if (!rados.conf_set("config option that doesn't exist", + "some random value")) { + printf("error: succeeded in setting nonexistent config option\n"); + exit(1); + } + if (rados.conf_set("log to stderr", "true")) { + printf("error: error setting log_to_stderr\n"); + exit(1); + } + std::string tmp; + if (rados.conf_get("log to stderr", tmp)) { + printf("error: failed to read log_to_stderr from config\n"); + exit(1); + } + if (tmp != "true") { + printf("error: new setting for log_to_stderr failed to take effect.\n"); + exit(1); + } + + if (rados.connect()) { + printf("error connecting\n"); + exit(1); + } + + cout << "rados_initialize completed" << std::endl; + testradospp_milestone(); + + time_t tm; + bufferlist bl, bl2, blf; + char buf[128]; + + time(&tm); + snprintf(buf, 128, "%s", ctime(&tm)); + bl.append(buf, strlen(buf)); + blf.append(buf, 16); + + const char *oid = "bar"; + + int r = rados.pool_create("foo"); + cout << "pool_create result = " << r << std::endl; + + IoCtx io_ctx; + r = rados.ioctx_create("foo", io_ctx); + cout << "ioctx_create result = " << r << std::endl; + + r = io_ctx.write(oid, bl, bl.length(), 0); + uint64_t objver = io_ctx.get_last_version(); + ceph_assert(objver > 0); + cout << "io_ctx.write returned " << r << " last_ver=" << objver << std::endl; + + uint64_t stat_size; + time_t stat_mtime; + r = io_ctx.stat(oid, &stat_size, &stat_mtime); + cout << "io_ctx.stat returned " << r << " size = " << stat_size << " mtime = " << stat_mtime << std::endl; + + r = io_ctx.stat(oid, NULL, NULL); + cout << "io_ctx.stat(does_not_exist) = " << r << std::endl; + + uint64_t handle; + C_Watch wc; + r = io_ctx.watch(oid, objver, &handle, &wc); + cout << "io_ctx.watch returned " << r << std::endl; + + testradospp_milestone(); + io_ctx.set_notify_timeout(7); + bufferlist notify_bl; + r = io_ctx.notify(oid, objver, notify_bl); + cout << "io_ctx.notify returned " << r << std::endl; + 
testradospp_milestone(); + + r = io_ctx.notify(oid, objver, notify_bl); + cout << "io_ctx.notify returned " << r << std::endl; + testradospp_milestone(); + + r = io_ctx.unwatch(oid, handle); + cout << "io_ctx.unwatch returned " << r << std::endl; + testradospp_milestone(); + + r = io_ctx.notify(oid, objver, notify_bl); + cout << "io_ctx.notify returned " << r << std::endl; + testradospp_milestone(); + io_ctx.set_assert_version(objver); + + r = io_ctx.write(oid, bl, bl.length() - 1, 0); + cout << "io_ctx.write returned " << r << std::endl; + + r = io_ctx.write(oid, bl, bl.length() - 2, 0); + cout << "io_ctx.write returned " << r << std::endl; + r = io_ctx.write(oid, bl, bl.length() - 3, 0); + cout << "rados.write returned " << r << std::endl; + r = io_ctx.append(oid, bl, bl.length()); + cout << "rados.write returned " << r << std::endl; + r = io_ctx.write_full(oid, blf); + cout << "rados.write_full returned " << r << std::endl; + r = io_ctx.read(oid, bl, bl.length(), 0); + cout << "rados.read returned " << r << std::endl; + r = io_ctx.trunc(oid, 8); + cout << "rados.trunc returned " << r << std::endl; + r = io_ctx.read(oid, bl, bl.length(), 0); + cout << "rados.read returned " << r << std::endl; + r = io_ctx.exec(oid, "crypto", "md5", bl, bl2); + cout << "exec returned " << r << " buf size=" << bl2.length() << std::endl; + const unsigned char *md5 = (const unsigned char *)bl2.c_str(); + char md5_str[bl2.length()*2 + 1]; + buf_to_hex(md5, bl2.length(), md5_str); + cout << "md5 result=" << md5_str << std::endl; + + // test assert_version + r = io_ctx.read(oid, bl, 0, 1); + ceph_assert(r >= 0); + uint64_t v = io_ctx.get_last_version(); + cout << oid << " version is " << v << std::endl; + ceph_assert(v > 0); + io_ctx.set_assert_version(v); + r = io_ctx.read(oid, bl, 0, 1); + ceph_assert(r >= 0); + io_ctx.set_assert_version(v - 1); + r = io_ctx.read(oid, bl, 0, 1); + ceph_assert(r == -ERANGE); + io_ctx.set_assert_version(v + 1); + r = io_ctx.read(oid, bl, 0, 1); + 
ceph_assert(r == -EOVERFLOW); + + r = io_ctx.exec(oid, "crypto", "sha1", bl, bl2); + cout << "exec returned " << r << std::endl; + const unsigned char *sha1 = (const unsigned char *)bl2.c_str(); + char sha1_str[bl2.length()*2 + 1]; + buf_to_hex(sha1, bl2.length(), sha1_str); + cout << "sha1 result=" << sha1_str << std::endl; + + r = io_ctx.exec(oid, "acl", "set", bl, bl2); + cout << "exec (set) returned " << r << std::endl; + r = io_ctx.exec(oid, "acl", "get", bl, bl2); + cout << "exec (get) returned " << r << std::endl; + if (bl2.length() > 0) { + cout << "attr=" << bl2.c_str() << std::endl; + } + + int size = io_ctx.read(oid, bl2, 128, 0); + if (size <= 0) { + cout << "failed to read oid " << oid << "." << std::endl; + exit(1); + } + if (size > 4096) { + cout << "read too many bytes from oid " << oid << "." << std::endl; + exit(1); + } + char rbuf[size + 1]; + memcpy(rbuf, bl2.c_str(), size); + rbuf[size] = '\0'; + cout << "read result='" << rbuf << "'" << std::endl; + cout << "size=" << size << std::endl; + + const char *oid2 = "jjj10.rbd"; + r = io_ctx.exec(oid2, "rbd", "snap_list", bl, bl2); + cout << "snap_list result=" << r << std::endl; + r = io_ctx.exec(oid2, "rbd", "snap_add", bl, bl2); + cout << "snap_add result=" << r << std::endl; + + if (r > 0) { + char *s = bl2.c_str(); + for (int i=0; i<r; i++, s += strlen(s) + 1) + cout << s << std::endl; + } + + cout << "compound operation..." 
<< std::endl; + ObjectWriteOperation o; + o.write(0, bl); + o.setxattr("foo", bl2); + r = io_ctx.operate(oid, &o); + cout << "operate result=" << r << std::endl; + + cout << "cmpxattr" << std::endl; + bufferlist val; + val.append("foo"); + r = io_ctx.setxattr(oid, "foo", val); + ceph_assert(r >= 0); + { + ObjectReadOperation o; + o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val); + r = io_ctx.operate(oid, &o, &bl2); + cout << " got " << r << " wanted >= 0" << std::endl; + ceph_assert(r >= 0); + } + val.append("..."); + { + ObjectReadOperation o; + o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val); + r = io_ctx.operate(oid, &o, &bl2); + cout << " got " << r << " wanted " << -ECANCELED << " (-ECANCELED)" << std::endl; + ceph_assert(r == -ECANCELED); + } + + io_ctx.locator_set_key(string()); + + cout << "iterating over objects..." << std::endl; + int num_objs = 0; + for (NObjectIterator iter = io_ctx.nobjects_begin(); + iter != io_ctx.nobjects_end(); ++iter) { + num_objs++; + cout << "'" << *iter << "'" << std::endl; + } + cout << "iterated over " << num_objs << " objects." 
<< std::endl; + map<string, bufferlist> attrset; + io_ctx.getxattrs(oid, attrset); + + map<string, bufferlist>::iterator it; + for (it = attrset.begin(); it != attrset.end(); ++it) { + cout << "xattr: " << it->first << std::endl; + } + + r = io_ctx.remove(oid); + cout << "remove result=" << r << std::endl; + + r = rados.pool_delete("foo"); + cout << "pool_delete result=" << r << std::endl; + + rados.shutdown(); + + return 0; +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" diff --git a/src/tools/setup-virtualenv.sh b/src/tools/setup-virtualenv.sh new file mode 100755 index 00000000..f0fa1e43 --- /dev/null +++ b/src/tools/setup-virtualenv.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2016 <contact@redhat.com> +# +# Author: Loic Dachary <loic@dachary.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +SCRIPTNAME="$(basename $0)" +if [ `uname` == FreeBSD ]; then + GETOPT="/usr/local/bin/getopt" +else + GETOPT=getopt +fi + +function usage { + echo + echo "$SCRIPTNAME - automate setup of Python virtual environment" + echo " (for use in building Ceph)" + echo + echo "Usage:" + echo " $SCRIPTNAME [--python=PYTHON_BINARY] TARGET_DIRECTORY" + echo + echo " TARGET_DIRECTORY will be created if it doesn't exist," + echo " and completely destroyed and re-created if it does!" + echo + exit 1 +} + +TEMP=$($GETOPT --options "h" --long "help,python:" --name "$SCRIPTNAME" -- "$@") +test $? 
!= 0 && usage
# ^ tail of the `test $? ...` check that begins on the preceding line: when
#   getopt rejected the command line, print the usage text and exit.
eval set -- "$TEMP"

# Print the pip option that disables pip's self-version check when the pip
# currently on PATH advertises it in `pip --help`; print nothing otherwise.
function pip_version_check_flag {
    if pip --help | grep -q disable-pip-version-check; then
        echo --disable-pip-version-check
    fi
}

# Walk the normalized option list produced by getopt; everything left after
# the `--` separator is positional.
PYTHON_OPTION=""
while true ; do
    case "$1" in
        -h|--help) usage ;;  # does not return
        --python) PYTHON_OPTION="--python=$2" ; shift ; shift ;;
        --) shift ; break ;;
        *) echo "Internal error" ; exit 1 ;;
    esac
done

# The single positional argument is the directory that will hold the
# virtual environment.
DIR=$1
if [ -z "$DIR" ] ; then
    echo "$SCRIPTNAME: need a directory path, but none was provided"
    usage
fi

# Every expansion of DIR is quoted: unquoted, a path containing whitespace or
# glob characters would be split/expanded and `rm -fr` could remove
# unintended paths.
rm -fr "$DIR"
mkdir -p "$DIR"

# ${VAR:+"$VAR"} yields the option as exactly one word when it is set and no
# word at all when it is empty.
# Abort if the environment could not be created; otherwise the pip commands
# below would run against whatever pip happens to be on PATH.
if ! virtualenv ${PYTHON_OPTION:+"$PYTHON_OPTION"} "$DIR" ; then
    echo "$SCRIPTNAME: virtualenv failed to create $DIR" >&2
    exit 1
fi
. "$DIR/bin/activate"

# DISABLE_PIP_VERSION_CHECK and NO_INDEX are deliberately expanded unquoted
# below so that they disappear from the command line when empty.
DISABLE_PIP_VERSION_CHECK=$(pip_version_check_flag)

# older versions of pip will not install wrap_console scripts
# when using wheel packages
pip $DISABLE_PIP_VERSION_CHECK --log "$DIR/log.txt" install --upgrade 'pip >= 6.1'

# Probe again: the upgrade above may have replaced pip with a version whose
# supported options differ.
DISABLE_PIP_VERSION_CHECK=$(pip_version_check_flag)

# When a local wheelhouse directory exists, install from it without
# consulting the package index.
if test -d wheelhouse ; then
    export NO_INDEX=--no-index
fi

pip $DISABLE_PIP_VERSION_CHECK --log "$DIR/log.txt" install $NO_INDEX "--find-links=file://$(pwd)/wheelhouse" 'tox >=2.9.1'
if test -f requirements.txt ; then
    # Re-enable the package index unless wheelhouse/md5 exists and its
    # checksums verify.
    if ! test -f wheelhouse/md5 || ! md5sum -c wheelhouse/md5 > /dev/null; then
        NO_INDEX=''
    fi
    pip $DISABLE_PIP_VERSION_CHECK --log "$DIR/log.txt" install $NO_INDEX "--find-links=file://$(pwd)/wheelhouse" -r requirements.txt
fi