diff options
Diffstat (limited to 'src/tools/rbd_nbd')
-rw-r--r-- | src/tools/rbd_nbd/CMakeLists.txt | 4 | ||||
-rw-r--r-- | src/tools/rbd_nbd/nbd-netlink.h | 100 | ||||
-rw-r--r-- | src/tools/rbd_nbd/rbd-nbd.cc | 2304 | ||||
-rwxr-xr-x | src/tools/rbd_nbd/rbd-nbd_quiesce | 31 |
4 files changed, 2439 insertions, 0 deletions
diff --git a/src/tools/rbd_nbd/CMakeLists.txt b/src/tools/rbd_nbd/CMakeLists.txt new file mode 100644 index 000000000..da758f514 --- /dev/null +++ b/src/tools/rbd_nbd/CMakeLists.txt @@ -0,0 +1,4 @@ +find_package(nl REQUIRED genl) +add_executable(rbd-nbd rbd-nbd.cc) +target_link_libraries(rbd-nbd librbd librados global nl::genl) +install(TARGETS rbd-nbd DESTINATION bin) diff --git a/src/tools/rbd_nbd/nbd-netlink.h b/src/tools/rbd_nbd/nbd-netlink.h new file mode 100644 index 000000000..2d0b90964 --- /dev/null +++ b/src/tools/rbd_nbd/nbd-netlink.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2017 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef _UAPILINUX_NBD_NETLINK_H +#define _UAPILINUX_NBD_NETLINK_H + +#define NBD_GENL_FAMILY_NAME "nbd" +#define NBD_GENL_VERSION 0x1 +#define NBD_GENL_MCAST_GROUP_NAME "nbd_mc_group" + +/* Configuration policy attributes, used for CONNECT */ +enum { + NBD_ATTR_UNSPEC, + NBD_ATTR_INDEX, + NBD_ATTR_SIZE_BYTES, + NBD_ATTR_BLOCK_SIZE_BYTES, + NBD_ATTR_TIMEOUT, + NBD_ATTR_SERVER_FLAGS, + NBD_ATTR_CLIENT_FLAGS, + NBD_ATTR_SOCKETS, + NBD_ATTR_DEAD_CONN_TIMEOUT, + NBD_ATTR_DEVICE_LIST, + NBD_ATTR_BACKEND_IDENTIFIER, + __NBD_ATTR_MAX, +}; +#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1) + +/* + * This is the format for multiple devices with NBD_ATTR_DEVICE_LIST + * + * [NBD_ATTR_DEVICE_LIST] + * [NBD_DEVICE_ITEM] + * [NBD_DEVICE_INDEX] + * [NBD_DEVICE_CONNECTED] + */ +enum { + NBD_DEVICE_ITEM_UNSPEC, + NBD_DEVICE_ITEM, + __NBD_DEVICE_ITEM_MAX, +}; +#define NBD_DEVICE_ITEM_MAX (__NBD_DEVICE_ITEM_MAX - 1) + +enum { + NBD_DEVICE_UNSPEC, + NBD_DEVICE_INDEX, + NBD_DEVICE_CONNECTED, + __NBD_DEVICE_MAX, +}; +#define NBD_DEVICE_ATTR_MAX (__NBD_DEVICE_MAX - 1) + +/* + * This is the format for multiple sockets with NBD_ATTR_SOCKETS + * + * [NBD_ATTR_SOCKETS] + * [NBD_SOCK_ITEM] + * [NBD_SOCK_FD] + * [NBD_SOCK_ITEM] + * [NBD_SOCK_FD] + */ +enum { + NBD_SOCK_ITEM_UNSPEC, + NBD_SOCK_ITEM, + __NBD_SOCK_ITEM_MAX, +}; +#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1) + +enum { + NBD_SOCK_UNSPEC, + NBD_SOCK_FD, + __NBD_SOCK_MAX, +}; +#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1) + +enum { + NBD_CMD_UNSPEC, + NBD_CMD_CONNECT, + NBD_CMD_DISCONNECT, + NBD_CMD_RECONFIGURE, + NBD_CMD_LINK_DEAD, + NBD_CMD_STATUS, + __NBD_CMD_MAX, +}; +#define NBD_CMD_MAX (__NBD_CMD_MAX - 1) + +#endif /* _UAPILINUX_NBD_NETLINK_H */ diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc new file mode 100644 index 000000000..eb9e858f4 --- /dev/null +++ b/src/tools/rbd_nbd/rbd-nbd.cc @@ -0,0 +1,2304 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * rbd-nbd - RBD in userspace + * + * Copyright (C) 2015 - 2016 Kylin Corporation + * + * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com> + * Li Wang <li.wang@kylin-cloud.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * +*/ + +#include "acconfig.h" +#include "include/int_types.h" +#include "include/scope_guard.h" + +#include <libgen.h> +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include <linux/nbd.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/syscall.h> + +#include "nbd-netlink.h" +#include <libnl3/netlink/genl/genl.h> +#include <libnl3/netlink/genl/ctrl.h> +#include <libnl3/netlink/genl/mngt.h> + +#if __has_include(<filesystem>) +#include <filesystem> +namespace fs = std::filesystem; +#else +#include <experimental/filesystem> +namespace fs = std::experimental::filesystem; +#endif +#include <fstream> +#include <iostream> +#include <memory> +#include <regex> +#include <boost/algorithm/string/predicate.hpp> +#include <boost/lexical_cast.hpp> + +#include "common/Formatter.h" +#include "common/Preforker.h" +#include "common/SubProcess.h" +#include "common/TextTable.h" +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/event_socket.h" +#include "common/module.h" +#include "common/safe_io.h" +#include "common/version.h" + +#include "global/global_init.h" +#include "global/signal_handler.h" + +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "include/stringify.h" +#include "include/xlist.h" + +#include "mon/MonClient.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd-nbd: " + +enum Command { + None, + Map, + Unmap, + Attach, + Detach, + List +}; + +struct Config { + int nbds_max = 0; + int max_part = 255; + int io_timeout = -1; + int reattach_timeout = 30; + + bool exclusive = false; + bool quiesce = false; + bool readonly = false; + bool set_max_part = false; + bool try_netlink = false; + bool show_cookie = false; + + std::string poolname; + std::string nsname; + std::string imgname; + std::string snapname; + std::string devpath; + std::string quiesce_hook = CMAKE_INSTALL_LIBEXECDIR "/rbd-nbd/rbd-nbd_quiesce"; + + std::string format; + bool pretty_format = false; + + std::optional<librbd::encryption_format_t> encryption_format; + std::optional<std::string> encryption_passphrase_file; + + Command command = None; + int pid = 0; + std::string cookie; + + std::string image_spec() const { + std::string spec = poolname + "/"; + + if (!nsname.empty()) { + spec += "/" + nsname; + } + spec += imgname; + + if (!snapname.empty()) { + spec += "@" + snapname; + } + + return spec; + } +}; + +static void usage() +{ + std::cout << "Usage: rbd-nbd [options] map <image-or-snap-spec> Map image to nbd device\n" + << " detach <device|image-or-snap-spec> Detach image from nbd device\n" + << " [options] attach <image-or-snap-spec> Attach image to nbd device\n" + << " unmap <device|image-or-snap-spec> Unmap nbd device\n" + << " [options] list-mapped List mapped nbd devices\n" + << "Map and attach options:\n" + << " --device <device path> Specify nbd device path (/dev/nbd{num})\n" + << " --encryption-format Image encryption format\n" + << " (possible values: luks1, luks2)\n" + << " --encryption-passphrase-file Path of file containing passphrase for unlocking image encryption\n" + << " --exclusive Forbid writes by other clients\n" + << " --io-timeout <sec> Set nbd IO timeout\n" + << " --max_part <limit> Override for module param max_part\n" + << " --nbds_max <limit> Override for module param nbds_max\n" + << " --quiesce Use quiesce callbacks\n" + << " --quiesce-hook <path> Specify quiesce hook path\n" + << " (default: " << Config().quiesce_hook << ")\n" + << " --read-only Map read-only\n" + << " --reattach-timeout <sec> Set nbd re-attach timeout\n" + << " (default: " << Config().reattach_timeout << ")\n" + << " --try-netlink Use the nbd netlink interface\n" + << " --show-cookie Show device cookie\n" + << " --cookie Specify device cookie\n" + << "\n" + << "List options:\n" + << " --format plain|json|xml Output format (default: plain)\n" + << " --pretty-format Pretty formatting (json and xml)\n" + << std::endl; + generic_server_usage(); +} + +static int nbd = -1; +static int nbd_index = -1; +static EventSocket terminate_event_sock; + +#define RBD_NBD_BLKSIZE 512UL + +#define HELP_INFO 1 +#define VERSION_INFO 2 + +#ifdef CEPH_BIG_ENDIAN +#define ntohll(a) (a) +#elif defined(CEPH_LITTLE_ENDIAN) +#define ntohll(a) swab(a) +#else +#error "Could not determine endianess" +#endif +#define htonll(a) ntohll(a) + +static int parse_args(vector<const char*>& args, std::ostream *err_msg, + Config *cfg); +static int netlink_disconnect(int index); +static int netlink_resize(int nbd_index, uint64_t size); + +static int run_quiesce_hook(const std::string &quiesce_hook, + const std::string &devpath, + const std::string &command); + +static std::string get_cookie(const std::string &devpath); + +class NBDServer +{ +public: + uint64_t quiesce_watch_handle = 0; + +private: + int fd; + librbd::Image ℑ + Config *cfg; + +public: + NBDServer(int fd, librbd::Image& image, Config *cfg) + : fd(fd) + , image(image) + , cfg(cfg) + , reader_thread(*this, &NBDServer::reader_entry) + , writer_thread(*this, &NBDServer::writer_entry) + , quiesce_thread(*this, &NBDServer::quiesce_entry) + { + std::vector<librbd::config_option_t> options; + image.config_list(&options); + for (auto &option : options) { + if ((option.name == std::string("rbd_cache") || + option.name == std::string("rbd_cache_writethrough_until_flush")) && + option.value == "false") { + allow_internal_flush = true; + break; + } + } + } + + Config *get_cfg() const { + return cfg; + } + +private: + int terminate_event_fd = -1; + ceph::mutex disconnect_lock = + ceph::make_mutex("NBDServer::DisconnectLocker"); + ceph::condition_variable disconnect_cond; + std::atomic<bool> terminated = { false }; + std::atomic<bool> allow_internal_flush = { false }; + + struct IOContext + { + xlist<IOContext*>::item item; + NBDServer *server = nullptr; + struct nbd_request request; + struct nbd_reply reply; + bufferlist data; + int command = 0; + + IOContext() + : item(this) + {} + }; + + friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx); + + ceph::mutex lock = ceph::make_mutex("NBDServer::Locker"); + ceph::condition_variable cond; + xlist<IOContext*> io_pending; + xlist<IOContext*> io_finished; + + void io_start(IOContext *ctx) + { + std::lock_guard l{lock}; + io_pending.push_back(&ctx->item); + } + + void io_finish(IOContext *ctx) + { + std::lock_guard l{lock}; + ceph_assert(ctx->item.is_on_list()); + ctx->item.remove_myself(); + io_finished.push_back(&ctx->item); + cond.notify_all(); + } + + IOContext *wait_io_finish() + { + std::unique_lock l{lock}; + cond.wait(l, [this] { + return !io_finished.empty() || + (io_pending.empty() && terminated); + }); + + if (io_finished.empty()) + return NULL; + + IOContext *ret = io_finished.front(); + io_finished.pop_front(); + + return ret; + } + + void wait_clean() + { + std::unique_lock l{lock}; + cond.wait(l, [this] { return io_pending.empty(); }); + + while(!io_finished.empty()) { + std::unique_ptr<IOContext> free_ctx(io_finished.front()); + io_finished.pop_front(); + } + } + + void assert_clean() + { + std::unique_lock l{lock}; + + ceph_assert(!reader_thread.is_started()); + ceph_assert(!writer_thread.is_started()); + ceph_assert(io_pending.empty()); + ceph_assert(io_finished.empty()); + } + + static void aio_callback(librbd::completion_t cb, void *arg) + { + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast<librbd::RBD::AioCompletion*>(cb); + + IOContext *ctx = reinterpret_cast<IOContext *>(arg); + int ret = aio_completion->get_return_value(); + + dout(20) << __func__ << ": " << *ctx << dendl; + + if (ret == -EINVAL) { + // if shrinking an image, a pagecache writeback might reference + // extents outside of the range of the new image extents + dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl; + ctx->data.clear(); + ret = 0; + } + + if (ret < 0) { + ctx->reply.error = htonl(-ret); + } else if ((ctx->command == NBD_CMD_READ) && + ret < static_cast<int>(ctx->request.len)) { + int pad_byte_count = static_cast<int> (ctx->request.len) - ret; + ctx->data.append_zero(pad_byte_count); + dout(20) << __func__ << ": " << *ctx << ": Pad byte count: " + << pad_byte_count << dendl; + ctx->reply.error = htonl(0); + } else { + ctx->reply.error = htonl(0); + } + ctx->server->io_finish(ctx); + + aio_completion->release(); + } + + void reader_entry() + { + struct pollfd poll_fds[2]; + memset(poll_fds, 0, sizeof(struct pollfd) * 2); + poll_fds[0].fd = fd; + poll_fds[0].events = POLLIN; + poll_fds[1].fd = terminate_event_fd; + poll_fds[1].events = POLLIN; + + while (true) { + std::unique_ptr<IOContext> ctx(new IOContext()); + ctx->server = this; + + dout(20) << __func__ << ": waiting for nbd request" << dendl; + + int r = poll(poll_fds, 2, -1); + if (r == -1) { + if (errno == EINTR) { + continue; + } + r = -errno; + derr << "failed to poll nbd: " << cpp_strerror(r) << dendl; + goto error; + } + + if ((poll_fds[1].revents & POLLIN) != 0) { + dout(0) << __func__ << ": terminate received" << dendl; + goto signal; + } + + if ((poll_fds[0].revents & POLLIN) == 0) { + dout(20) << __func__ << ": nothing to read" << dendl; + continue; + } + + r = safe_read_exact(fd, &ctx->request, sizeof(struct nbd_request)); + if (r < 0) { + derr << "failed to read nbd request header: " << cpp_strerror(r) + << dendl; + goto error; + } + + if (ctx->request.magic != htonl(NBD_REQUEST_MAGIC)) { + derr << "invalid nbd request header" << dendl; + goto signal; + } + + ctx->request.from = ntohll(ctx->request.from); + ctx->request.type = ntohl(ctx->request.type); + ctx->request.len = ntohl(ctx->request.len); + + ctx->reply.magic = htonl(NBD_REPLY_MAGIC); + memcpy(ctx->reply.handle, ctx->request.handle, sizeof(ctx->reply.handle)); + + ctx->command = ctx->request.type & 0x0000ffff; + + dout(20) << *ctx << ": start" << dendl; + + switch (ctx->command) + { + case NBD_CMD_DISC: + // NBD_DO_IT will return when pipe is closed + dout(0) << "disconnect request received" << dendl; + goto signal; + case NBD_CMD_WRITE: + bufferptr ptr(ctx->request.len); + r = safe_read_exact(fd, ptr.c_str(), ctx->request.len); + if (r < 0) { + derr << *ctx << ": failed to read nbd request data: " + << cpp_strerror(r) << dendl; + goto error; + } + ctx->data.push_back(ptr); + break; + } + + IOContext *pctx = ctx.release(); + io_start(pctx); + librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(pctx, aio_callback); + switch (pctx->command) + { + case NBD_CMD_WRITE: + image.aio_write(pctx->request.from, pctx->request.len, pctx->data, c); + break; + case NBD_CMD_READ: + image.aio_read(pctx->request.from, pctx->request.len, pctx->data, c); + break; + case NBD_CMD_FLUSH: + image.aio_flush(c); + allow_internal_flush = true; + break; + case NBD_CMD_TRIM: + image.aio_discard(pctx->request.from, pctx->request.len, c); + break; + default: + derr << *pctx << ": invalid request command" << dendl; + c->release(); + goto signal; + } + } +error: + { + int r = netlink_disconnect(nbd_index); + if (r == 1) { + ioctl(nbd, NBD_DISCONNECT); + } + } +signal: + std::lock_guard l{lock}; + terminated = true; + cond.notify_all(); + + std::lock_guard disconnect_l{disconnect_lock}; + disconnect_cond.notify_all(); + + dout(20) << __func__ << ": terminated" << dendl; + } + + void writer_entry() + { + while (true) { + dout(20) << __func__ << ": waiting for io request" << dendl; + std::unique_ptr<IOContext> ctx(wait_io_finish()); + if (!ctx) { + dout(20) << __func__ << ": no io requests, terminating" << dendl; + goto done; + } + + dout(20) << __func__ << ": got: " << *ctx << dendl; + + int r = safe_write(fd, &ctx->reply, sizeof(struct nbd_reply)); + if (r < 0) { + derr << *ctx << ": failed to write reply header: " << cpp_strerror(r) + << dendl; + goto error; + } + if (ctx->command == NBD_CMD_READ && ctx->reply.error == htonl(0)) { + r = ctx->data.write_fd(fd); + if (r < 0) { + derr << *ctx << ": failed to write replay data: " << cpp_strerror(r) + << dendl; + goto error; + } + } + dout(20) << *ctx << ": finish" << dendl; + } + error: + wait_clean(); + done: + ::shutdown(fd, SHUT_RDWR); + + dout(20) << __func__ << ": terminated" << dendl; + } + + bool wait_quiesce() { + dout(20) << __func__ << dendl; + + std::unique_lock locker{lock}; + cond.wait(locker, [this] { return quiesce || terminated; }); + + if (terminated) { + return false; + } + + dout(20) << __func__ << ": got quiesce request" << dendl; + return true; + } + + void wait_unquiesce(std::unique_lock<ceph::mutex> &locker) { + dout(20) << __func__ << dendl; + + cond.wait(locker, [this] { return !quiesce || terminated; }); + + dout(20) << __func__ << ": got unquiesce request" << dendl; + } + + void wait_inflight_io() { + if (!allow_internal_flush) { + return; + } + + uint64_t features = 0; + image.features(&features); + if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) { + bool is_owner = false; + image.is_exclusive_lock_owner(&is_owner); + if (!is_owner) { + return; + } + } + + dout(20) << __func__ << dendl; + + int r = image.flush(); + if (r < 0) { + derr << "flush failed: " << cpp_strerror(r) << dendl; + } + } + + void quiesce_entry() + { + ceph_assert(cfg->quiesce); + + while (wait_quiesce()) { + + int r = run_quiesce_hook(cfg->quiesce_hook, cfg->devpath, "quiesce"); + + wait_inflight_io(); + + { + std::unique_lock locker{lock}; + ceph_assert(quiesce == true); + + image.quiesce_complete(quiesce_watch_handle, r); + + if (r < 0) { + quiesce = false; + continue; + } + + wait_unquiesce(locker); + } + + run_quiesce_hook(cfg->quiesce_hook, cfg->devpath, "unquiesce"); + } + + dout(20) << __func__ << ": terminated" << dendl; + } + + class ThreadHelper : public Thread + { + public: + typedef void (NBDServer::*entry_func)(); + private: + NBDServer &server; + entry_func func; + public: + ThreadHelper(NBDServer &_server, entry_func _func) + :server(_server) + ,func(_func) + {} + protected: + void* entry() override + { + (server.*func)(); + return NULL; + } + } reader_thread, writer_thread, quiesce_thread; + + bool started = false; + bool quiesce = false; + +public: + void start() + { + if (!started) { + dout(10) << __func__ << ": starting" << dendl; + + started = true; + + terminate_event_fd = eventfd(0, EFD_NONBLOCK); + ceph_assert(terminate_event_fd > 0); + int r = terminate_event_sock.init(terminate_event_fd, + EVENT_SOCKET_TYPE_EVENTFD); + ceph_assert(r >= 0); + + reader_thread.create("rbd_reader"); + writer_thread.create("rbd_writer"); + if (cfg->quiesce) { + quiesce_thread.create("rbd_quiesce"); + } + } + } + + void wait_for_disconnect() + { + if (!started) + return; + + std::unique_lock l{disconnect_lock}; + disconnect_cond.wait(l); + } + + void notify_quiesce() { + dout(10) << __func__ << dendl; + + ceph_assert(cfg->quiesce); + + std::unique_lock locker{lock}; + ceph_assert(quiesce == false); + quiesce = true; + cond.notify_all(); + } + + void notify_unquiesce() { + dout(10) << __func__ << dendl; + + ceph_assert(cfg->quiesce); + + std::unique_lock locker{lock}; + ceph_assert(quiesce == true); + quiesce = false; + cond.notify_all(); + } + + ~NBDServer() + { + if (started) { + dout(10) << __func__ << ": terminating" << dendl; + + terminate_event_sock.notify(); + + reader_thread.join(); + writer_thread.join(); + if (cfg->quiesce) { + quiesce_thread.join(); + } + + assert_clean(); + + close(terminate_event_fd); + started = false; + } + } +}; + +std::ostream &operator<<(std::ostream &os, const NBDServer::IOContext &ctx) { + + os << "[" << std::hex << ntohll(*((uint64_t *)ctx.request.handle)); + + switch (ctx.command) + { + case NBD_CMD_WRITE: + os << " WRITE "; + break; + case NBD_CMD_READ: + os << " READ "; + break; + case NBD_CMD_FLUSH: + os << " FLUSH "; + break; + case NBD_CMD_TRIM: + os << " TRIM "; + break; + case NBD_CMD_DISC: + os << " DISC "; + break; + default: + os << " UNKNOWN(" << ctx.command << ") "; + break; + } + + os << ctx.request.from << "~" << ctx.request.len << " " + << std::dec << ntohl(ctx.reply.error) << "]"; + + return os; +} + +class NBDQuiesceWatchCtx : public librbd::QuiesceWatchCtx +{ +public: + NBDQuiesceWatchCtx(NBDServer *server) : server(server) { + } + + void handle_quiesce() override { + server->notify_quiesce(); + } + + void handle_unquiesce() override { + server->notify_unquiesce(); + } + +private: + NBDServer *server; +}; + +class NBDWatchCtx : public librbd::UpdateWatchCtx +{ +private: + int fd; + int nbd_index; + bool use_netlink; + librados::IoCtx &io_ctx; + librbd::Image ℑ + unsigned long size; +public: + NBDWatchCtx(int _fd, + int _nbd_index, + bool _use_netlink, + librados::IoCtx &_io_ctx, + librbd::Image &_image, + unsigned long _size) + : fd(_fd) + , nbd_index(_nbd_index) + , use_netlink(_use_netlink) + , io_ctx(_io_ctx) + , image(_image) + , size(_size) + { } + + ~NBDWatchCtx() override {} + + void handle_notify() override + { + librbd::image_info_t info; + if (image.stat(info, sizeof(info)) == 0) { + unsigned long new_size = info.size; + int ret; + + if (new_size != size) { + dout(5) << "resize detected" << dendl; + if (ioctl(fd, BLKFLSBUF, NULL) < 0) + derr << "invalidate page cache failed: " << cpp_strerror(errno) + << dendl; + if (use_netlink) { + ret = netlink_resize(nbd_index, new_size); + } else { + ret = ioctl(fd, NBD_SET_SIZE, new_size); + if (ret < 0) + derr << "resize failed: " << cpp_strerror(errno) << dendl; + } + + if (!ret) + size = new_size; + + if (ioctl(fd, BLKRRPART, NULL) < 0) { + derr << "rescan of partition table failed: " << cpp_strerror(errno) + << dendl; + } + if (image.invalidate_cache() < 0) + derr << "invalidate rbd cache failed" << dendl; + } + } + } +}; + +class NBDListIterator { +public: + bool get(Config *cfg) { + while (true) { + std::string nbd_path = "/sys/block/nbd" + stringify(m_index); + if(access(nbd_path.c_str(), F_OK) != 0) { + return false; + } + + *cfg = Config(); + cfg->devpath = "/dev/nbd" + stringify(m_index++); + + int pid; + std::ifstream ifs; + ifs.open(nbd_path + "/pid", std::ifstream::in); + if (!ifs.is_open()) { + continue; + } + ifs >> pid; + ifs.close(); + + // If the rbd-nbd is re-attached the pid may store garbage + // here. We are sure this is the case when it is negative or + // zero. Then we just try to find the attached process scanning + // /proc fs. If it is positive we check the process with this + // pid first and if it is not rbd-nbd fallback to searching the + // attached process. + do { + if (pid <= 0) { + pid = find_attached(cfg->devpath); + if (pid <= 0) { + break; + } + } + + if (get_mapped_info(pid, cfg) >= 0) { + return true; + } + pid = -1; + } while (true); + } + } + +private: + int m_index = 0; + std::map<int, Config> m_mapped_info_cache; + + int get_mapped_info(int pid, Config *cfg) { + ceph_assert(!cfg->devpath.empty()); + + auto it = m_mapped_info_cache.find(pid); + if (it != m_mapped_info_cache.end()) { + if (it->second.devpath != cfg->devpath) { + return -EINVAL; + } + *cfg = it->second; + return 0; + } + + m_mapped_info_cache[pid] = {}; + + int r; + std::string path = "/proc/" + stringify(pid) + "/comm"; + std::ifstream ifs; + std::string comm; + ifs.open(path.c_str(), std::ifstream::in); + if (!ifs.is_open()) + return -1; + ifs >> comm; + if (comm != "rbd-nbd") { + return -EINVAL; + } + ifs.close(); + + path = "/proc/" + stringify(pid) + "/cmdline"; + std::string cmdline; + std::vector<const char*> args; + + ifs.open(path.c_str(), std::ifstream::in); + if (!ifs.is_open()) + return -1; + ifs >> cmdline; + + if (cmdline.empty()) { + return -EINVAL; + } + + for (unsigned i = 0; i < cmdline.size(); i++) { + char *arg = &cmdline[i]; + if (i == 0) { + if (strcmp(basename(arg) , "rbd-nbd") != 0) { + return -EINVAL; + } + } else { + args.push_back(arg); + } + + while (cmdline[i] != '\0') { + i++; + } + } + + std::ostringstream err_msg; + Config c; + r = parse_args(args, &err_msg, &c); + if (r < 0) { + return r; + } + + if (c.command != Map && c.command != Attach) { + return -ENOENT; + } + + c.pid = pid; + m_mapped_info_cache.erase(pid); + if (!c.devpath.empty()) { + m_mapped_info_cache[pid] = c; + if (c.devpath != cfg->devpath) { + return -ENOENT; + } + } else { + c.devpath = cfg->devpath; + } + + c.cookie = get_cookie(cfg->devpath); + *cfg = c; + return 0; + } + + int find_attached(const std::string &devpath) { + for (auto &entry : fs::directory_iterator("/proc")) { + if (!fs::is_directory(entry.status())) { + continue; + } + + int pid; + try { + pid = boost::lexical_cast<uint64_t>(entry.path().filename().c_str()); + } catch (boost::bad_lexical_cast&) { + continue; + } + + Config cfg; + cfg.devpath = devpath; + if (get_mapped_info(pid, &cfg) >=0 && cfg.command == Attach) { + return cfg.pid; + } + } + + return -1; + } +}; + +static std::string get_cookie(const std::string &devpath) +{ + std::string cookie; + std::ifstream ifs; + std::string path = "/sys/block/" + devpath.substr(sizeof("/dev/") - 1) + "/backend"; + + ifs.open(path, std::ifstream::in); + if (ifs.is_open()) { + std::getline(ifs, cookie); + ifs.close(); + } + return cookie; +} + +static int load_module(Config *cfg) +{ + ostringstream param; + int ret; + + if (cfg->nbds_max) + param << "nbds_max=" << cfg->nbds_max; + + if (cfg->max_part) + param << " max_part=" << cfg->max_part; + + if (!access("/sys/module/nbd", F_OK)) { + if (cfg->nbds_max || cfg->set_max_part) + cerr << "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded" + << std::endl; + return 0; + } + + ret = module_load("nbd", param.str().c_str()); + if (ret < 0) + cerr << "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-ret) + << std::endl; + + return ret; +} + +static int check_device_size(int nbd_index, unsigned long expected_size) +{ + // There are bugs with some older kernel versions that result in an + // overflow for large image sizes. This check is to ensure we are + // not affected. + + unsigned long size = 0; + std::string path = "/sys/block/nbd" + stringify(nbd_index) + "/size"; + std::ifstream ifs; + ifs.open(path.c_str(), std::ifstream::in); + if (!ifs.is_open()) { + cerr << "rbd-nbd: failed to open " << path << std::endl; + return -EINVAL; + } + ifs >> size; + size *= RBD_NBD_BLKSIZE; + + if (size == 0) { + // Newer kernel versions will report real size only after nbd + // connect. Assume this is the case and return success. + return 0; + } + + if (size != expected_size) { + cerr << "rbd-nbd: kernel reported invalid device size (" << size + << ", expected " << expected_size << ")" << std::endl; + return -EINVAL; + } + + return 0; +} + +static int parse_nbd_index(const std::string& devpath) +{ + int index, ret; + + ret = sscanf(devpath.c_str(), "/dev/nbd%d", &index); + if (ret <= 0) { + // mean an early matching failure. But some cases need a negative value. + if (ret == 0) + ret = -EINVAL; + cerr << "rbd-nbd: invalid device path: " << devpath + << " (expected /dev/nbd{num})" << std::endl; + return ret; + } + + return index; +} + +static int try_ioctl_setup(Config *cfg, int fd, uint64_t size, + uint64_t blksize, uint64_t flags) +{ + int index = 0, r; + + if (cfg->devpath.empty()) { + char dev[64]; + const char *path = "/sys/module/nbd/parameters/nbds_max"; + int nbds_max = -1; + if (access(path, F_OK) == 0) { + std::ifstream ifs; + ifs.open(path, std::ifstream::in); + if (ifs.is_open()) { + ifs >> nbds_max; + ifs.close(); + } + } + + while (true) { + snprintf(dev, sizeof(dev), "/dev/nbd%d", index); + + nbd = open(dev, O_RDWR); + if (nbd < 0) { + if (nbd == -EPERM && nbds_max != -1 && index < (nbds_max-1)) { + ++index; + continue; + } + r = nbd; + cerr << "rbd-nbd: failed to find unused device" << std::endl; + goto done; + } + + r = ioctl(nbd, NBD_SET_SOCK, fd); + if (r < 0) { + close(nbd); + ++index; + continue; + } + + cfg->devpath = dev; + break; + } + } else { + r = parse_nbd_index(cfg->devpath); + if (r < 0) + goto done; + index = r; + + nbd = open(cfg->devpath.c_str(), O_RDWR); + if (nbd < 0) { + r = nbd; + cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl; + goto done; + } + + r = ioctl(nbd, NBD_SET_SOCK, fd); + if (r < 0) { + r = -errno; + cerr << "rbd-nbd: the device " << cfg->devpath << " is busy" << std::endl; + close(nbd); + goto done; + } + } + + r = ioctl(nbd, NBD_SET_BLKSIZE, blksize); + if (r < 0) { + r = -errno; + cerr << "rbd-nbd: NBD_SET_BLKSIZE failed" << std::endl; + goto close_nbd; + } + + r = ioctl(nbd, NBD_SET_SIZE, size); + if (r < 0) { + cerr << "rbd-nbd: NBD_SET_SIZE failed" << std::endl; + r = -errno; + goto close_nbd; + } + + ioctl(nbd, NBD_SET_FLAGS, flags); + + if (cfg->io_timeout >= 0) { + r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)cfg->io_timeout); + if (r < 0) { + r = -errno; + cerr << "rbd-nbd: failed to set IO timeout: " << cpp_strerror(r) + << std::endl; + goto close_nbd; + } + } + + dout(10) << "ioctl setup complete for " << cfg->devpath << dendl; + nbd_index = index; + return 0; + +close_nbd: + if (r < 0) { + ioctl(nbd, NBD_CLEAR_SOCK); + cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) << std::endl; + } + close(nbd); +done: + return r; +} + +static void netlink_cleanup(struct nl_sock *sock) +{ + if (!sock) + return; + + nl_close(sock); + nl_socket_free(sock); +} + +static struct nl_sock *netlink_init(int *id) +{ + struct nl_sock *sock; + int ret; + + sock = nl_socket_alloc(); + if (!sock) { + cerr << "rbd-nbd: Could not allocate netlink socket." << std::endl; + return NULL; + } + + ret = genl_connect(sock); + if (ret < 0) { + cerr << "rbd-nbd: Could not connect netlink socket. Error " << ret + << std::endl; + goto free_sock; + } + + *id = genl_ctrl_resolve(sock, "nbd"); + if (*id < 0) + // nbd netlink interface not supported. + goto close_sock; + + return sock; + +close_sock: + nl_close(sock); +free_sock: + nl_socket_free(sock); + return NULL; +} + +static int netlink_disconnect(int index) +{ + struct nl_sock *sock; + struct nl_msg *msg; + int ret, nl_id; + + sock = netlink_init(&nl_id); + if (!sock) + // Try ioctl + return 1; + + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL); + + msg = nlmsg_alloc(); + if (!msg) { + cerr << "rbd-nbd: Could not allocate netlink message." << std::endl; + goto free_sock; + } + + if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, + NBD_CMD_DISCONNECT, 0)) { + cerr << "rbd-nbd: Could not setup message." << std::endl; + goto nla_put_failure; + } + + NLA_PUT_U32(msg, NBD_ATTR_INDEX, index); + + ret = nl_send_sync(sock, msg); + netlink_cleanup(sock); + if (ret < 0) { + cerr << "rbd-nbd: netlink disconnect failed: " << nl_geterror(-ret) + << std::endl; + return -EIO; + } + + return 0; + +nla_put_failure: + nlmsg_free(msg); +free_sock: + netlink_cleanup(sock); + return -EIO; +} + +static int netlink_disconnect_by_path(const std::string& devpath) +{ + int index; + + index = parse_nbd_index(devpath); + if (index < 0) + return index; + + return netlink_disconnect(index); +} + +static int netlink_resize(int nbd_index, uint64_t size) +{ + struct nl_sock *sock; + struct nl_msg *msg; + int nl_id, ret; + + sock = netlink_init(&nl_id); + if (!sock) { + cerr << "rbd-nbd: Netlink interface not supported." << std::endl; + return 1; + } + + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL); + + msg = nlmsg_alloc(); + if (!msg) { + cerr << "rbd-nbd: Could not allocate netlink message." << std::endl; + goto free_sock; + } + + if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, + NBD_CMD_RECONFIGURE, 0)) { + cerr << "rbd-nbd: Could not setup message." << std::endl; + goto free_msg; + } + + NLA_PUT_U32(msg, NBD_ATTR_INDEX, nbd_index); + NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size); + + ret = nl_send_sync(sock, msg); + if (ret < 0) { + cerr << "rbd-nbd: netlink resize failed: " << nl_geterror(ret) << std::endl; + goto free_sock; + } + + netlink_cleanup(sock); + dout(10) << "netlink resize complete for nbd" << nbd_index << dendl; + return 0; + +nla_put_failure: +free_msg: + nlmsg_free(msg); +free_sock: + netlink_cleanup(sock); + return -EIO; +} + +static int netlink_connect_cb(struct nl_msg *msg, void *arg) +{ + struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlmsg_hdr(msg)); + Config *cfg = (Config *)arg; + struct nlattr *msg_attr[NBD_ATTR_MAX + 1]; + uint32_t index; + int ret; + + ret = nla_parse(msg_attr, NBD_ATTR_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + if (ret) { + cerr << "rbd-nbd: Unsupported netlink reply" << std::endl; + return -NLE_MSGTYPE_NOSUPPORT; + } + + if (!msg_attr[NBD_ATTR_INDEX]) { + cerr << "rbd-nbd: netlink connect reply missing device index." << std::endl; + return -NLE_MSGTYPE_NOSUPPORT; + } + + index = nla_get_u32(msg_attr[NBD_ATTR_INDEX]); + cfg->devpath = "/dev/nbd" + stringify(index); + nbd_index = index; + + return NL_OK; +} + +static int netlink_connect(Config *cfg, struct nl_sock *sock, int nl_id, int fd, + uint64_t size, uint64_t flags, bool reconnect) +{ + struct nlattr *sock_attr; + struct nlattr *sock_opt; + struct nl_msg *msg; + int ret; + + if (reconnect) { + dout(10) << "netlink try reconnect for " << cfg->devpath << dendl; + + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL); + } else { + nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, netlink_connect_cb, + cfg); + } + + msg = nlmsg_alloc(); + if (!msg) { + cerr << "rbd-nbd: Could not allocate netlink message." << std::endl; + return -ENOMEM; + } + + if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, + reconnect ? NBD_CMD_RECONFIGURE : NBD_CMD_CONNECT, 0)) { + cerr << "rbd-nbd: Could not setup message." << std::endl; + goto free_msg; + } + + if (!cfg->devpath.empty()) { + ret = parse_nbd_index(cfg->devpath); + if (ret < 0) + goto free_msg; + + NLA_PUT_U32(msg, NBD_ATTR_INDEX, ret); + if (reconnect) { + nbd_index = ret; + } + } + + if (cfg->io_timeout >= 0) + NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, cfg->io_timeout); + + NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size); + NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, RBD_NBD_BLKSIZE); + NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, flags); + NLA_PUT_U64(msg, NBD_ATTR_DEAD_CONN_TIMEOUT, cfg->reattach_timeout); + if (!cfg->cookie.empty()) + NLA_PUT_STRING(msg, NBD_ATTR_BACKEND_IDENTIFIER, cfg->cookie.c_str()); + + sock_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS); + if (!sock_attr) { + cerr << "rbd-nbd: Could not init sockets in netlink message." << std::endl; + goto free_msg; + } + + sock_opt = nla_nest_start(msg, NBD_SOCK_ITEM); + if (!sock_opt) { + cerr << "rbd-nbd: Could not init sock in netlink message." << std::endl; + goto free_msg; + } + + NLA_PUT_U32(msg, NBD_SOCK_FD, fd); + nla_nest_end(msg, sock_opt); + nla_nest_end(msg, sock_attr); + + ret = nl_send_sync(sock, msg); + if (ret < 0) { + cerr << "rbd-nbd: netlink connect failed: " << nl_geterror(ret) + << std::endl; + return -EIO; + } + + dout(10) << "netlink connect complete for " << cfg->devpath << dendl; + return 0; + +nla_put_failure: +free_msg: + nlmsg_free(msg); + return -EIO; +} + +static int try_netlink_setup(Config *cfg, int fd, uint64_t size, uint64_t flags, + bool reconnect) +{ + struct nl_sock *sock; + int nl_id, ret; + + sock = netlink_init(&nl_id); + if (!sock) { + cerr << "rbd-nbd: Netlink interface not supported. Using ioctl interface." + << std::endl; + return 1; + } + + dout(10) << "netlink interface supported." << dendl; + + ret = netlink_connect(cfg, sock, nl_id, fd, size, flags, reconnect); + netlink_cleanup(sock); + + if (ret != 0) + return ret; + + nbd = open(cfg->devpath.c_str(), O_RDWR); + if (nbd < 0) { + cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl; + return nbd; + } + + return 0; +} + +static int run_quiesce_hook(const std::string &quiesce_hook, + const std::string &devpath, + const std::string &command) { + dout(10) << __func__ << ": " << quiesce_hook << " " << devpath << " " + << command << dendl; + + SubProcess hook(quiesce_hook.c_str(), SubProcess::CLOSE, SubProcess::PIPE, + SubProcess::PIPE); + hook.add_cmd_args(devpath.c_str(), command.c_str(), NULL); + bufferlist err; + int r = hook.spawn(); + if (r < 0) { + err.append("subprocess spawn failed"); + } else { + err.read_fd(hook.get_stderr(), 16384); + r = hook.join(); + if (r > 0) { + r = -r; + } + } + if (r < 0) { + derr << __func__ << ": " << quiesce_hook << " " << devpath << " " + << command << " failed: " << err.to_str() << dendl; + } else { + dout(10) << " succeeded: " << err.to_str() << dendl; + } + + return r; +} + +static void handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + + dout(20) << __func__ << ": " << "notifying terminate" << dendl; + + ceph_assert(terminate_event_sock.is_valid()); + terminate_event_sock.notify(); +} + +static NBDServer *start_server(int fd, librbd::Image& image, Config *cfg) +{ + NBDServer *server; + + server = new NBDServer(fd, image, cfg); + server->start(); + + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + return server; +} + +static void run_server(Preforker& forker, NBDServer *server, bool netlink_used) +{ + if (g_conf()->daemonize) { + global_init_postfork_finish(g_ceph_context); + forker.daemonize(); + } + + if (netlink_used) + server->wait_for_disconnect(); + else + ioctl(nbd, NBD_DO_IT); + + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); +} + +// Eventually it should be removed when pidfd_open is widely supported. + +static int wait_for_terminate_legacy(int pid, int timeout) +{ + for (int i = 0; ; i++) { + if (kill(pid, 0) == -1) { + if (errno == ESRCH) { + return 0; + } + int r = -errno; + cerr << "rbd-nbd: kill(" << pid << ", 0) failed: " + << cpp_strerror(r) << std::endl; + return r; + } + if (i >= timeout * 2) { + break; + } + usleep(500000); + } + + cerr << "rbd-nbd: waiting for process exit timed out" << std::endl; + return -ETIMEDOUT; +} + +// Eventually it should be replaced with glibc' pidfd_open +// when it is widely available. + +#ifdef __NR_pidfd_open +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} +#else +static int pidfd_open(pid_t pid, unsigned int flags) +{ + errno = ENOSYS; + return -1; +} +#endif + +static int wait_for_terminate(int pid, int timeout) +{ + int fd = pidfd_open(pid, 0); + if (fd == -1) { + if (errno == ENOSYS) { + return wait_for_terminate_legacy(pid, timeout); + } + if (errno == ESRCH) { + return 0; + } + int r = -errno; + cerr << "rbd-nbd: pidfd_open(" << pid << ") failed: " + << cpp_strerror(r) << std::endl; + return r; + } + + struct pollfd poll_fds[1]; + memset(poll_fds, 0, sizeof(struct pollfd)); + poll_fds[0].fd = fd; + poll_fds[0].events = POLLIN; + + int r = poll(poll_fds, 1, timeout * 1000); + if (r == -1) { + r = -errno; + cerr << "rbd-nbd: failed to poll rbd-nbd process: " << cpp_strerror(r) + << std::endl; + goto done; + } else { + r = 0; + } + + if ((poll_fds[0].revents & POLLIN) == 0) { + cerr << "rbd-nbd: waiting for process exit timed out" << std::endl; + r = -ETIMEDOUT; + } + +done: + close(fd); + + return r; +} + +static int do_map(int argc, const char *argv[], Config *cfg, bool reconnect) +{ + int r; + + librados::Rados rados; + librbd::RBD rbd; + librados::IoCtx io_ctx; + librbd::Image image; + + int read_only = 0; + unsigned long flags; + unsigned long size; + unsigned long blksize = RBD_NBD_BLKSIZE; + bool use_netlink; + + int fd[2]; + + librbd::image_info_t info; + + Preforker forker; + NBDServer *server; + + vector<const char*> args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + g_ceph_context->_conf.set_val_or_die("pid_file", ""); + + if (global_init_prefork(g_ceph_context) >= 0) { + std::string err; + r = forker.prefork(err); + if (r < 0) { + cerr << err << std::endl; + return r; + } + if (forker.is_parent()) { + if (forker.parent_wait(err) != 0) { + return -ENXIO; + } + return 0; + } + global_init_postfork_start(g_ceph_context); + } + + common_init_finish(g_ceph_context); + global_init_chdir(g_ceph_context); + + if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1) { + r = -errno; + goto close_ret; + } + + r = rados.init_with_context(g_ceph_context); + if (r < 0) + goto close_fd; + + r = rados.connect(); + if (r < 0) + goto close_fd; + + r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx); + if (r < 0) + goto close_fd; + + io_ctx.set_namespace(cfg->nsname); + + r = rbd.open(io_ctx, image, cfg->imgname.c_str()); + if (r < 0) + goto close_fd; + + if (cfg->exclusive) { + r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE); + if (r < 0) { + cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r) + << std::endl; + goto close_fd; + } + } + + if (!cfg->snapname.empty()) { + r = image.snap_set(cfg->snapname.c_str()); + if (r < 0) + goto close_fd; + } + + if (cfg->encryption_format.has_value()) { + if (!cfg->encryption_passphrase_file.has_value()) { + r = -EINVAL; + cerr << "rbd-nbd: missing encryption-passphrase-file" << std::endl; + goto close_fd; + } + std::ifstream file(cfg->encryption_passphrase_file.value().c_str()); + if (file.fail()) { + r = -errno; + std::cerr << "rbd-nbd: unable to open passphrase file:" + << cpp_strerror(errno) << std::endl; + goto close_fd; + } + std::string passphrase((std::istreambuf_iterator<char>(file)), + (std::istreambuf_iterator<char>())); + auto sg = make_scope_guard([&] { + ceph_memzero_s(&passphrase[0], passphrase.size(), passphrase.size()); }); + file.close(); + if (!passphrase.empty() && passphrase[passphrase.length() - 1] == '\n') { + passphrase.erase(passphrase.length() - 1); + } + + switch (cfg->encryption_format.value()) { + case RBD_ENCRYPTION_FORMAT_LUKS1: { + librbd::encryption_luks1_format_options_t opts = {}; + opts.passphrase = passphrase; + r = image.encryption_load( + RBD_ENCRYPTION_FORMAT_LUKS1, &opts, sizeof(opts)); + break; + } + case RBD_ENCRYPTION_FORMAT_LUKS2: { + librbd::encryption_luks2_format_options_t opts = {}; + opts.passphrase = passphrase; + r = image.encryption_load( + RBD_ENCRYPTION_FORMAT_LUKS2, &opts, sizeof(opts)); + blksize = 4096; + break; + } + default: + r = -ENOTSUP; + cerr << "rbd-nbd: unsupported encryption format" << std::endl; + goto close_fd; + } + + if (r != 0) { + cerr << "rbd-nbd: failed to load encryption: " << cpp_strerror(r) + << std::endl; + goto close_fd; + } + } + + r = image.stat(info, sizeof(info)); + if (r < 0) + goto close_fd; + + flags = NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_TRIM | NBD_FLAG_HAS_FLAGS; + if (!cfg->snapname.empty() || cfg->readonly) { + flags |= NBD_FLAG_READ_ONLY; + read_only = 1; + } + + if (info.size > ULONG_MAX) { + r = -EFBIG; + cerr << "rbd-nbd: image is too large (" << byte_u_t(info.size) + << ", max is " << byte_u_t(ULONG_MAX) << ")" << std::endl; + goto close_fd; + } + + size = info.size; + + r = load_module(cfg); + if (r < 0) + goto close_fd; + + server = start_server(fd[1], image, cfg); + + use_netlink = cfg->try_netlink || reconnect; + if (use_netlink) { + // generate when the cookie is not supplied at CLI + if (!reconnect && cfg->cookie.empty()) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + cfg->cookie = uuid_gen.to_string(); + } + r = try_netlink_setup(cfg, fd[0], size, flags, reconnect); + if (r < 0) { + goto free_server; + } else if (r == 1) { + use_netlink = false; + } + } + + if (!use_netlink) { + r = try_ioctl_setup(cfg, fd[0], size, blksize, flags); + if (r < 0) + goto free_server; + } + + r = check_device_size(nbd_index, size); + if (r < 0) + goto close_nbd; + + r = ioctl(nbd, BLKROSET, (unsigned long) &read_only); + if (r < 0) { + r = -errno; + goto close_nbd; + } + + { + NBDQuiesceWatchCtx quiesce_watch_ctx(server); + if (cfg->quiesce) { + r = image.quiesce_watch(&quiesce_watch_ctx, + &server->quiesce_watch_handle); + if (r < 0) { + goto close_nbd; + } + } + + uint64_t handle; + + NBDWatchCtx watch_ctx(nbd, nbd_index, use_netlink, io_ctx, image, + info.size); + r = image.update_watch(&watch_ctx, &handle); + if (r < 0) + goto close_nbd; + + std::string cookie; + if (use_netlink) { + cookie = get_cookie(cfg->devpath); + ceph_assert(cookie == cfg->cookie || cookie.empty()); + } + if (cfg->show_cookie && !cookie.empty()) { + cout << cfg->devpath << " " << cookie << std::endl; + } else { + cout << cfg->devpath << std::endl; + } + + run_server(forker, server, use_netlink); + + if (cfg->quiesce) { + r = image.quiesce_unwatch(server->quiesce_watch_handle); + ceph_assert(r == 0); + } + + r = image.update_unwatch(handle); + ceph_assert(r == 0); + } + +close_nbd: + if (r < 0) { + if (use_netlink) { + netlink_disconnect(nbd_index); + } else { + ioctl(nbd, NBD_CLEAR_SOCK); + cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) + << std::endl; + } + } + close(nbd); +free_server: + delete server; +close_fd: + close(fd[0]); + close(fd[1]); +close_ret: + image.close(); + io_ctx.close(); + rados.shutdown(); + + forker.exit(r < 0 ? EXIT_FAILURE : 0); + // Unreachable; + return r; +} + +static int do_detach(Config *cfg) +{ + int r = kill(cfg->pid, SIGTERM); + if (r == -1) { + r = -errno; + cerr << "rbd-nbd: failed to terminate " << cfg->pid << ": " + << cpp_strerror(r) << std::endl; + return r; + } + + return wait_for_terminate(cfg->pid, cfg->reattach_timeout); +} + +static int do_unmap(Config *cfg) +{ + /* + * The netlink disconnect call supports devices setup with netlink or ioctl, + * so we always try that first. + */ + int r = netlink_disconnect_by_path(cfg->devpath); + if (r < 0) { + return r; + } + + if (r == 1) { + int nbd = open(cfg->devpath.c_str(), O_RDWR); + if (nbd < 0) { + cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl; + return nbd; + } + + r = ioctl(nbd, NBD_DISCONNECT); + if (r < 0) { + cerr << "rbd-nbd: the device is not used" << std::endl; + } + + close(nbd); + + if (r < 0) { + return r; + } + } + + if (cfg->pid > 0) { + r = wait_for_terminate(cfg->pid, cfg->reattach_timeout); + } + + return 0; +} + +static int parse_imgpath(const std::string &imgpath, Config *cfg, + std::ostream *err_msg) { + std::regex pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$"); + std::smatch match; + if (!std::regex_match(imgpath, match, pattern)) { + std::cerr << "rbd-nbd: invalid spec '" << imgpath << "'" << std::endl; + return -EINVAL; + } + + if (match[1].matched) { + cfg->poolname = match[1]; + } + + if (match[2].matched) { + cfg->nsname = match[2]; + } + + cfg->imgname = match[3]; + + if (match[4].matched) + cfg->snapname = match[4]; + + return 0; +} + +static int do_list_mapped_devices(const std::string &format, bool pretty_format) +{ + bool should_print = false; + std::unique_ptr<ceph::Formatter> f; + TextTable tbl; + + if (format == "json") { + f.reset(new JSONFormatter(pretty_format)); + } else if (format == "xml") { + f.reset(new XMLFormatter(pretty_format)); + } else if (!format.empty() && format != "plain") { + std::cerr << "rbd-nbd: invalid output format: " << format << std::endl; + return -EINVAL; + } + + if (f) { + f->open_array_section("devices"); + } else { + tbl.define_column("id", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("image", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("device", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("cookie", TextTable::LEFT, TextTable::LEFT); + } + + Config cfg; + NBDListIterator it; + while (it.get(&cfg)) { + if (f) { + f->open_object_section("device"); + f->dump_int("id", cfg.pid); + f->dump_string("pool", cfg.poolname); + f->dump_string("namespace", cfg.nsname); + f->dump_string("image", cfg.imgname); + f->dump_string("snap", cfg.snapname); + f->dump_string("device", cfg.devpath); + f->dump_string("cookie", cfg.cookie); + f->close_section(); + } else { + should_print = true; + if (cfg.snapname.empty()) { + cfg.snapname = "-"; + } + tbl << cfg.pid << cfg.poolname << cfg.nsname << cfg.imgname + << cfg.snapname << cfg.devpath << cfg.cookie << TextTable::endrow; + } + } + + if (f) { + f->close_section(); // devices + f->flush(std::cout); + } + if (should_print) { + std::cout << tbl; + } + return 0; +} + +static bool find_mapped_dev_by_spec(Config *cfg, int skip_pid=-1) { + Config c; + NBDListIterator it; + while (it.get(&c)) { + if (c.pid != skip_pid && + c.poolname == cfg->poolname && c.nsname == cfg->nsname && + c.imgname == cfg->imgname && c.snapname == cfg->snapname && + (cfg->devpath.empty() || c.devpath == cfg->devpath)) { + *cfg = c; + return true; + } + } + return false; +} + +static int find_proc_by_dev(Config *cfg) { + Config c; + NBDListIterator it; + while (it.get(&c)) { + if (c.devpath == cfg->devpath) { + *cfg = c; + return true; + } + } + return false; +} + +static int parse_args(vector<const char*>& args, std::ostream *err_msg, + Config *cfg) { + std::string conf_file_list; + std::string cluster; + CephInitParameters iparams = ceph_argparse_early_args( + args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list); + + ConfigProxy config{false}; + config->name = iparams.name; + config->cluster = cluster; + + if (!conf_file_list.empty()) { + config.parse_config_files(conf_file_list.c_str(), nullptr, 0); + } else { + config.parse_config_files(nullptr, nullptr, 0); + } + config.parse_env(CEPH_ENTITY_TYPE_CLIENT); + config.parse_argv(args); + cfg->poolname = config.get_val<std::string>("rbd_default_pool"); + + std::vector<const char*>::iterator i; + std::ostringstream err; + std::string arg_value; + + for (i = args.begin(); i != args.end(); ) { + if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + return HELP_INFO; + } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) { + return VERSION_INFO; + } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) { + } else if (ceph_argparse_witharg(args, i, &cfg->io_timeout, err, + "--io-timeout", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->io_timeout < 0) { + *err_msg << "rbd-nbd: Invalid argument for io-timeout!"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &cfg->nbds_max, err, "--nbds_max", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->nbds_max < 0) { + *err_msg << "rbd-nbd: Invalid argument for nbds_max!"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &cfg->max_part, err, "--max_part", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if ((cfg->max_part < 0) || (cfg->max_part > 255)) { + *err_msg << "rbd-nbd: Invalid argument for max_part(0~255)!"; + return -EINVAL; + } + cfg->set_max_part = true; + } else if (ceph_argparse_flag(args, i, "--quiesce", (char *)NULL)) { + cfg->quiesce = true; + } else if (ceph_argparse_witharg(args, i, &cfg->quiesce_hook, + "--quiesce-hook", (char *)NULL)) { + } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) { + cfg->readonly = true; + } else if (ceph_argparse_witharg(args, i, &cfg->reattach_timeout, err, + "--reattach-timeout", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->reattach_timeout < 0) { + *err_msg << "rbd-nbd: Invalid argument for reattach-timeout!"; + return -EINVAL; + } + } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) { + cfg->exclusive = true; + } else if (ceph_argparse_witharg(args, i, &cfg->io_timeout, err, + "--timeout", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->io_timeout < 0) { + *err_msg << "rbd-nbd: Invalid argument for timeout!"; + return -EINVAL; + } + *err_msg << "rbd-nbd: --timeout is deprecated (use --io-timeout)"; + } else if (ceph_argparse_witharg(args, i, &cfg->format, err, "--format", + (char *)NULL)) { + } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) { + cfg->pretty_format = true; + } else if (ceph_argparse_flag(args, i, "--try-netlink", (char *)NULL)) { + cfg->try_netlink = true; + } else if (ceph_argparse_flag(args, i, "--show-cookie", (char *)NULL)) { + cfg->show_cookie = true; + } else if (ceph_argparse_witharg(args, i, &cfg->cookie, "--cookie", (char *)NULL)) { + } else if (ceph_argparse_witharg(args, i, &arg_value, + "--encryption-format", (char *)NULL)) { + if (arg_value == "luks1") { + cfg->encryption_format = + std::make_optional(RBD_ENCRYPTION_FORMAT_LUKS1); + } else if (arg_value == "luks2") { + cfg->encryption_format = + std::make_optional(RBD_ENCRYPTION_FORMAT_LUKS2); + } else { + *err_msg << "rbd-nbd: Invalid encryption format"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &arg_value, + "--encryption-passphrase-file", + (char *)NULL)) { + cfg->encryption_passphrase_file = std::make_optional(arg_value); + } else { + ++i; + } + } + + Command cmd = None; + if (args.begin() != args.end()) { + if (strcmp(*args.begin(), "map") == 0) { + cmd = Map; + } else if (strcmp(*args.begin(), "unmap") == 0) { + cmd = Unmap; + } else if (strcmp(*args.begin(), "attach") == 0) { + cmd = Attach; + } else if (strcmp(*args.begin(), "detach") == 0) { + cmd = Detach; + } else if (strcmp(*args.begin(), "list-mapped") == 0) { + cmd = List; + } else { + *err_msg << "rbd-nbd: unknown command: " << *args.begin(); + return -EINVAL; + } + args.erase(args.begin()); + } + + if (cmd == None) { + *err_msg << "rbd-nbd: must specify command"; + return -EINVAL; + } + + std::string cookie; + switch (cmd) { + case Attach: + if (cfg->devpath.empty()) { + *err_msg << "rbd-nbd: must specify device to attach"; + return -EINVAL; + } + // Allowing attach without --cookie option for kernel without + // NBD_ATTR_BACKEND_IDENTIFIER support for compatibility + cookie = get_cookie(cfg->devpath); + if (!cookie.empty()) { + if (cfg->cookie.empty()) { + *err_msg << "rbd-nbd: must specify cookie to attach"; + return -EINVAL; + } else if (cookie != cfg->cookie) { + *err_msg << "rbd-nbd: cookie mismatch"; + return -EINVAL; + } + } else if (!cfg->cookie.empty()) { + *err_msg << "rbd-nbd: kernel does not have cookie support"; + return -EINVAL; + } + [[fallthrough]]; + case Map: + if (args.begin() == args.end()) { + *err_msg << "rbd-nbd: must specify image-or-snap-spec"; + return -EINVAL; + } + if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) { + return -EINVAL; + } + args.erase(args.begin()); + break; + case Detach: + case Unmap: + if (args.begin() == args.end()) { + *err_msg << "rbd-nbd: must specify nbd device or image-or-snap-spec"; + return -EINVAL; + } + if (boost::starts_with(*args.begin(), "/dev/")) { + cfg->devpath = *args.begin(); + } else { + if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) { + return -EINVAL; + } + } + args.erase(args.begin()); + break; + default: + //shut up gcc; + break; + } + + if (args.begin() != args.end()) { + *err_msg << "rbd-nbd: unknown args: " << *args.begin(); + return -EINVAL; + } + + cfg->command = cmd; + return 0; +} + +static int rbd_nbd(int argc, const char *argv[]) +{ + int r; + Config cfg; + vector<const char*> args; + argv_to_vec(argc, argv, args); + + std::ostringstream err_msg; + r = parse_args(args, &err_msg, &cfg); + if (r == HELP_INFO) { + usage(); + return 0; + } else if (r == VERSION_INFO) { + std::cout << pretty_version_to_str() << std::endl; + return 0; + } else if (r < 0) { + cerr << err_msg.str() << std::endl; + return r; + } + + if (!err_msg.str().empty()) { + cerr << err_msg.str() << std::endl; + } + + switch (cfg.command) { + case Attach: + ceph_assert(!cfg.devpath.empty()); + if (find_mapped_dev_by_spec(&cfg, getpid())) { + cerr << "rbd-nbd: " << cfg.devpath << " has process " << cfg.pid + << " connected" << std::endl; + return -EBUSY; + } + [[fallthrough]]; + case Map: + if (cfg.imgname.empty()) { + cerr << "rbd-nbd: image name was not specified" << std::endl; + return -EINVAL; + } + + r = do_map(argc, argv, &cfg, cfg.command == Attach); + if (r < 0) + return -EINVAL; + break; + case Detach: + if (cfg.devpath.empty()) { + if (!find_mapped_dev_by_spec(&cfg)) { + cerr << "rbd-nbd: " << cfg.image_spec() << " is not mapped" + << std::endl; + return -ENOENT; + } + } else if (!find_proc_by_dev(&cfg)) { + cerr << "rbd-nbd: no process attached to " << cfg.devpath << " found" + << std::endl; + return -ENOENT; + } + r = do_detach(&cfg); + if (r < 0) + return -EINVAL; + break; + case Unmap: + if (cfg.devpath.empty()) { + if (!find_mapped_dev_by_spec(&cfg)) { + cerr << "rbd-nbd: " << cfg.image_spec() << " is not mapped" + << std::endl; + return -ENOENT; + } + } else if (!find_proc_by_dev(&cfg)) { + // still try to send disconnect to the device + } + r = do_unmap(&cfg); + if (r < 0) + return -EINVAL; + break; + case List: + r = do_list_mapped_devices(cfg.format, cfg.pretty_format); + if (r < 0) + return -EINVAL; + break; + default: + usage(); + break; + } + + return 0; +} + +int main(int argc, const char *argv[]) +{ + int r = rbd_nbd(argc, argv); + if (r < 0) { + return EXIT_FAILURE; + } + return 0; +} diff --git a/src/tools/rbd_nbd/rbd-nbd_quiesce b/src/tools/rbd_nbd/rbd-nbd_quiesce new file mode 100755 index 000000000..a62a12b15 --- /dev/null +++ b/src/tools/rbd_nbd/rbd-nbd_quiesce @@ -0,0 +1,31 @@ +#!/bin/sh + +echo "$0 $@" >&2 + +if [ $# -lt 2 ]; then + echo "usage: $0 <dev> <cmd>" >&2 + exit 1 +fi + +dev=$1 +cmd=$2 + +export PATH=/usr/sbin:/usr/bin:/sbin:/bin + +findmnt -S "${dev}" -fno TARGET | +while read mnt; do + case "${cmd}" in + quiesce) + echo "freezing ${mnt}" >&2 + fsfreeze -f "${mnt}" + ;; + unquiesce) + echo "unfreezing ${mnt}" >&2 + fsfreeze -u "${mnt}" + ;; + *) + echo "unknown command ${cmd}" >&2 + exit 1 + ;; + esac +done |