diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/client | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/client')
36 files changed, 27960 insertions, 0 deletions
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt new file mode 100644 index 000000000..8897ada7b --- /dev/null +++ b/src/client/CMakeLists.txt @@ -0,0 +1,13 @@ +set(libclient_srcs + Client.cc + Dentry.cc + Fh.cc + Inode.cc + MetaRequest.cc + ClientSnapRealm.cc + MetaSession.cc + Trace.cc + posix_acl.cc + Delegation.cc) +add_library(client STATIC ${libclient_srcs}) +target_link_libraries(client osdc) diff --git a/src/client/Client.cc b/src/client/Client.cc new file mode 100644 index 000000000..2b7db5a89 --- /dev/null +++ b/src/client/Client.cc @@ -0,0 +1,16517 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +// unix-ey fs stuff +#include <unistd.h> +#include <sys/types.h> +#include <time.h> +#include <utime.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/param.h> +#include <fcntl.h> +#include <sys/file.h> +#ifndef _WIN32 +#include <sys/utsname.h> +#endif +#include <sys/uio.h> + +#include <boost/lexical_cast.hpp> +#include <boost/fusion/include/std_pair.hpp> + +#include "common/async/waiter.h" + +#if defined(__FreeBSD__) +#define XATTR_CREATE 0x1 +#define XATTR_REPLACE 0x2 +#elif !defined(_WIN32) +#include <sys/xattr.h> +#endif + +#if defined(__linux__) +#include <linux/falloc.h> +#endif + +#include <sys/statvfs.h> + +#include "common/config.h" +#include "common/version.h" +#include "common/async/blocked_completion.h" + +#include "mon/MonClient.h" + +#include "messages/MClientCaps.h" +#include "messages/MClientLease.h" +#include "messages/MClientQuota.h" +#include "messages/MClientReclaim.h" +#include "messages/MClientReclaimReply.h" +#include "messages/MClientReconnect.h" +#include "messages/MClientReply.h" +#include "messages/MClientRequest.h" +#include "messages/MClientRequestForward.h" +#include "messages/MClientSession.h" +#include "messages/MClientSnap.h" +#include "messages/MClientMetrics.h" +#include "messages/MCommandReply.h" +#include "messages/MFSMap.h" +#include "messages/MFSMapUser.h" +#include "messages/MMDSMap.h" +#include "messages/MOSDMap.h" + +#include "mds/flock.h" +#include "mds/cephfs_features.h" +#include "mds/snap.h" +#include "osd/OSDMap.h" +#include "osdc/Filer.h" + +#include "common/Cond.h" +#include "common/perf_counters.h" +#include "common/admin_socket.h" +#include "common/errno.h" +#include "include/str_list.h" + +#define dout_subsys ceph_subsys_client + +#include "include/lru.h" +#include "include/compat.h" +#include "include/stringify.h" +#include "include/random.h" + +#include "Client.h" +#include "Inode.h" +#include "Dentry.h" +#include "Delegation.h" +#include "Dir.h" +#include "ClientSnapRealm.h" +#include "Fh.h" +#include "MetaSession.h" +#include "MetaRequest.h" +#include "ObjecterWriteback.h" +#include "posix_acl.h" + +#include "include/ceph_assert.h" +#include "include/stat.h" + +#include "include/cephfs/ceph_ll_client.h" + +#if HAVE_GETGROUPLIST +#include <grp.h> +#include <pwd.h> +#include <unistd.h> +#endif + +#undef dout_prefix +#define dout_prefix *_dout << "client." << whoami << " " + +#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout + +// FreeBSD fails to define this +#ifndef O_DSYNC +#define O_DSYNC 0x0 +#endif +// Darwin fails to define this +#ifndef O_RSYNC +#define O_RSYNC 0x0 +#endif + +#ifndef O_DIRECT +#define O_DIRECT 0x0 +#endif + +// Windows doesn't define those values. While the Posix compatibilty layer +// doesn't support those values, the Windows native functions do provide +// similar flags. Special care should be taken if we're going to use those +// flags in ceph-dokan. The current values are no-ops, while propagating +// them to the rest of the code might cause the Windows functions to reject +// them as invalid. +#ifndef O_NOFOLLOW +#define O_NOFOLLOW 0x0 +#endif + +#ifndef O_SYNC +#define O_SYNC 0x0 +#endif + +#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED) + +#ifndef S_IXUGO +#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH) +#endif + +using std::dec; +using std::hex; +using std::list; +using std::oct; +using std::pair; +using std::string; +using std::vector; + +using namespace TOPNSPC::common; + +namespace bs = boost::system; +namespace ca = ceph::async; + +void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset) +{ + Client *client = static_cast<Client*>(p); + client->flush_set_callback(oset); +} + +bool Client::is_reserved_vino(vinodeno_t &vino) { + if (MDS_IS_PRIVATE_INO(vino.ino)) { + ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl; + return true; + } + return false; +} + +// running average and standard deviation -- presented in +// Donald Knuth's TAoCP, Volume II. +double calc_average(double old_avg, double value, uint64_t count) { + double new_avg; + if (count == 1) { + new_avg = value; + } else { + new_avg = old_avg + ((value - old_avg) / count); + } + + return new_avg; +} + +double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean, + double value, uint64_t count) { + double new_sq_sum; + if (count == 1) { + new_sq_sum = 0.0; + } else { + new_sq_sum = old_sq_sum + (value - old_mean)*(value - new_mean); + } + + return new_sq_sum; +} + +// ------------- + +Client::CommandHook::CommandHook(Client *client) : + m_client(client) +{ +} + +int Client::CommandHook::call( + std::string_view command, + const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& errss, + bufferlist& out) +{ + f->open_object_section("result"); + { + std::scoped_lock l{m_client->client_lock}; + if (command == "mds_requests") + m_client->dump_mds_requests(f); + else if (command == "mds_sessions") { + bool cap_dump = false; + cmd_getval(cmdmap, "cap_dump", cap_dump); + m_client->dump_mds_sessions(f, cap_dump); + } else if (command == "dump_cache") + m_client->dump_cache(f); + else if (command == "kick_stale_sessions") + m_client->_kick_stale_sessions(); + else if (command == "status") + m_client->dump_status(f); + else + ceph_abort_msg("bad command registered"); + } + f->close_section(); + return 0; +} + + +// ------------- + +int Client::get_fd_inode(int fd, InodeRef *in) { + int r = 0; + if (fd == CEPHFS_AT_FDCWD) { + *in = cwd; + } else { + Fh *f = get_filehandle(fd); + if (!f) { + r = -CEPHFS_EBADF; + } else { + *in = f->inode; + } + } + return r; +} + +dir_result_t::dir_result_t(Inode *in, const UserPerm& perms) + : inode(in), offset(0), next_offset(2), + release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0), + perms(perms) + { } + +void Client::_reset_faked_inos() +{ + ino_t start = 1024; + free_faked_inos.clear(); + free_faked_inos.insert(start, (uint32_t)-1 - start + 1); + last_used_faked_ino = 0; + last_used_faked_root = 0; + #ifdef _WIN32 + // On Windows, sizeof(ino_t) is just 2. Despite that, most "native" + // Windows structures, including Dokan ones, are using 64B identifiers. + _use_faked_inos = false; + #else + _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos; + #endif +} + +void Client::_assign_faked_ino(Inode *in) +{ + if (0 == last_used_faked_ino) + last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root + interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1); + if (it == free_faked_inos.end() && last_used_faked_ino > 0) { + last_used_faked_ino = 2048; + it = free_faked_inos.lower_bound(last_used_faked_ino + 1); + } + ceph_assert(it != free_faked_inos.end()); + if (last_used_faked_ino < it.get_start()) { + ceph_assert(it.get_len() > 0); + last_used_faked_ino = it.get_start(); + } else { + ++last_used_faked_ino; + ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino); + } + in->faked_ino = last_used_faked_ino; + free_faked_inos.erase(in->faked_ino); + faked_ino_map[in->faked_ino] = in->vino(); +} + +/* + * In the faked mode, if you export multiple subdirectories, + * you will see that the inode numbers of the exported subdirectories + * are the same. so we distinguish the mount point by reserving + * the "fake ids" between "1024~2048" and combining the last + * 10bits(0x3ff) of the "root inodes". +*/ +void Client::_assign_faked_root(Inode *in) +{ + interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1); + if (it == free_faked_inos.end() && last_used_faked_root > 0) { + last_used_faked_root = 0; + it = free_faked_inos.lower_bound(last_used_faked_root + 1); + } + ceph_assert(it != free_faked_inos.end()); + vinodeno_t inode_info = in->vino(); + uint64_t inode_num = (uint64_t)inode_info.ino; + ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl; + last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048 + ceph_assert(it.get_start() + it.get_len() > last_used_faked_root); + + in->faked_ino = last_used_faked_root; + free_faked_inos.erase(in->faked_ino); + faked_ino_map[in->faked_ino] = in->vino(); +} + +void Client::_release_faked_ino(Inode *in) +{ + free_faked_inos.insert(in->faked_ino); + faked_ino_map.erase(in->faked_ino); +} + +vinodeno_t Client::_map_faked_ino(ino_t ino) +{ + vinodeno_t vino; + if (ino == 1) + vino = root->vino(); + else if (faked_ino_map.count(ino)) + vino = faked_ino_map[ino]; + else + vino = vinodeno_t(0, CEPH_NOSNAP); + ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl; + return vino; +} + +vinodeno_t Client::map_faked_ino(ino_t ino) +{ + std::scoped_lock lock(client_lock); + return _map_faked_ino(ino); +} + +// cons/des + +Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_) + : Dispatcher(m->cct->get()), + timer(m->cct, timer_lock, false), + messenger(m), + monclient(mc), + objecter(objecter_), + whoami(mc->get_global_id()), + mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"), + initialize_state(CLIENT_NEW, "Client::initstate_lock"), + cct_deleter{m->cct, [](CephContext *p) {p->put();}}, + async_ino_invalidator(m->cct), + async_dentry_invalidator(m->cct), + interrupt_finisher(m->cct), + remount_finisher(m->cct), + async_ino_releasor(m->cct), + objecter_finisher(m->cct), + m_command_hook(this), + fscid(0) +{ + _reset_faked_inos(); + + user_id = cct->_conf->client_mount_uid; + group_id = cct->_conf->client_mount_gid; + fuse_default_permissions = cct->_conf.get_val<bool>( + "fuse_default_permissions"); + + _collect_and_send_global_metrics = cct->_conf.get_val<bool>( + "client_collect_and_send_global_metrics"); + + mount_timeout = cct->_conf.get_val<std::chrono::seconds>( + "client_mount_timeout"); + + caps_release_delay = cct->_conf.get_val<std::chrono::seconds>( + "client_caps_release_delay"); + + if (cct->_conf->client_acl_type == "posix_acl") + acl_type = POSIX_ACL; + + lru.lru_set_midpoint(cct->_conf->client_cache_mid); + + // file handles + free_fd_set.insert(10, 1<<30); + + mdsmap.reset(new MDSMap); + + // osd interfaces + writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher, + &client_lock)); + objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock, + client_flush_set_callback, // all commit callback + (void*)this, + cct->_conf->client_oc_size, + cct->_conf->client_oc_max_objects, + cct->_conf->client_oc_max_dirty, + cct->_conf->client_oc_target_dirty, + cct->_conf->client_oc_max_dirty_age, + true)); +} + + +Client::~Client() +{ + ceph_assert(ceph_mutex_is_not_locked(client_lock)); + + // If the task is crashed or aborted and doesn't + // get any chance to run the umount and shutdow. + { + std::scoped_lock l{client_lock}; + tick_thread_stopped = true; + upkeep_cond.notify_one(); + } + + if (upkeeper.joinable()) + upkeeper.join(); + + // It is necessary to hold client_lock, because any inode destruction + // may call into ObjectCacher, which asserts that it's lock (which is + // client_lock) is held. + std::scoped_lock l{client_lock}; + tear_down_cache(); +} + +void Client::tear_down_cache() +{ + // fd's + for (auto &[fd, fh] : fd_map) { + ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl; + _release_fh(fh); + } + fd_map.clear(); + + while (!opened_dirs.empty()) { + dir_result_t *dirp = *opened_dirs.begin(); + ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl; + _closedir(dirp); + } + + // caps! + // *** FIXME *** + + // empty lru + trim_cache(); + ceph_assert(lru.lru_get_size() == 0); + + // close root ino + ceph_assert(inode_map.size() <= 1 + root_parents.size()); + if (root && inode_map.size() == 1 + root_parents.size()) { + root.reset(); + } + + ceph_assert(inode_map.empty()); +} + +inodeno_t Client::get_root_ino() +{ + std::scoped_lock l(client_lock); + if (use_faked_inos()) + return root->faked_ino; + else + return root->ino; +} + +Inode *Client::get_root() +{ + std::scoped_lock l(client_lock); + root->ll_get(); + return root.get(); +} + + +// debug crapola + +void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected) +{ + filepath path; + in->make_long_path(path); + ldout(cct, 1) << "dump_inode: " + << (disconnected ? "DISCONNECTED ":"") + << "inode " << in->ino + << " " << path + << " ref " << in->get_nref() + << " " << *in << dendl; + + if (f) { + f->open_object_section("inode"); + f->dump_stream("path") << path; + if (disconnected) + f->dump_int("disconnected", 1); + in->dump(f); + f->close_section(); + } + + did.insert(in); + if (in->dir) { + ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl; + for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin(); + it != in->dir->dentries.end(); + ++it) { + ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl; + if (f) { + f->open_object_section("dentry"); + it->second->dump(f); + f->close_section(); + } + if (it->second->inode) + dump_inode(f, it->second->inode.get(), did, false); + } + } +} + +void Client::dump_cache(Formatter *f) +{ + set<Inode*> did; + + ldout(cct, 1) << __func__ << dendl; + + if (f) + f->open_array_section("cache"); + + if (root) + dump_inode(f, root.get(), did, true); + + // make a second pass to catch anything disconnected + for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin(); + it != inode_map.end(); + ++it) { + if (did.count(it->second)) + continue; + dump_inode(f, it->second, did, true); + } + + if (f) + f->close_section(); +} + +void Client::dump_status(Formatter *f) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + ldout(cct, 1) << __func__ << dendl; + + const epoch_t osd_epoch + = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch)); + + if (f) { + f->open_object_section("metadata"); + for (const auto& kv : metadata) + f->dump_string(kv.first.c_str(), kv.second); + f->close_section(); + + f->dump_int("dentry_count", lru.lru_get_size()); + f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned()); + f->dump_int("id", get_nodeid().v); + entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy()); + f->dump_object("inst", inst); + f->dump_object("addr", inst.addr); + f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str(); + f->dump_string("addr_str", inst.addr.get_legacy_str()); + f->dump_int("inode_count", inode_map.size()); + f->dump_int("mds_epoch", mdsmap->get_epoch()); + f->dump_int("osd_epoch", osd_epoch); + f->dump_int("osd_epoch_barrier", cap_epoch_barrier); + f->dump_bool("blocklisted", blocklisted); + f->dump_string("fs_name", mdsmap->get_fs_name()); + } +} + +void Client::_pre_init() +{ + timer.init(); + + objecter_finisher.start(); + filer.reset(new Filer(objecter, &objecter_finisher)); + + objectcacher->start(); +} + +int Client::init() +{ + RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false); + ceph_assert(iref_writer.is_first_writer()); + + _pre_init(); + { + std::scoped_lock l{client_lock}; + messenger->add_dispatcher_tail(this); + } + _finish_init(); + iref_writer.update_state(CLIENT_INITIALIZED); + return 0; +} + +void Client::_finish_init() +{ + { + std::scoped_lock l{client_lock}; + // logger + PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last); + plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request"); + plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request"); + plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation"); + plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation"); + plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation"); + // average, standard deviation mds/r/w/ latencies + plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests"); + plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests"); + plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations"); + plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests"); + plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests"); + plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations"); + plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests"); + plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests"); + plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations"); + logger.reset(plb.create_perf_counters()); + cct->get_perfcounters_collection()->add(logger.get()); + } + + cct->_conf.add_observer(this); + + AdminSocket* admin_socket = cct->get_admin_socket(); + int ret = admin_socket->register_command("mds_requests", + &m_command_hook, + "show in-progress mds requests"); + if (ret < 0) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(-ret) << dendl; + } + ret = admin_socket->register_command("mds_sessions " + "name=cap_dump,type=CephBool,req=false", + &m_command_hook, + "show mds session state"); + if (ret < 0) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(-ret) << dendl; + } + ret = admin_socket->register_command("dump_cache", + &m_command_hook, + "show in-memory metadata cache contents"); + if (ret < 0) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(-ret) << dendl; + } + ret = admin_socket->register_command("kick_stale_sessions", + &m_command_hook, + "kick sessions that were remote reset"); + if (ret < 0) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(-ret) << dendl; + } + ret = admin_socket->register_command("status", + &m_command_hook, + "show overall client status"); + if (ret < 0) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(-ret) << dendl; + } +} + +void Client::shutdown() +{ + ldout(cct, 1) << __func__ << dendl; + + // If we were not mounted, but were being used for sending + // MDS commands, we may have sessions that need closing. + { + std::scoped_lock l{client_lock}; + + // To make sure the tick thread will be stoppped before + // destructing the Client, just in case like the _mount() + // failed but didn't not get a chance to stop the tick + // thread + tick_thread_stopped = true; + upkeep_cond.notify_one(); + + _close_sessions(); + } + cct->_conf.remove_observer(this); + + cct->get_admin_socket()->unregister_commands(&m_command_hook); + + if (ino_invalidate_cb) { + ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl; + async_ino_invalidator.wait_for_empty(); + async_ino_invalidator.stop(); + } + + if (dentry_invalidate_cb) { + ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl; + async_dentry_invalidator.wait_for_empty(); + async_dentry_invalidator.stop(); + } + + if (switch_interrupt_cb) { + ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl; + interrupt_finisher.wait_for_empty(); + interrupt_finisher.stop(); + } + + if (remount_cb) { + ldout(cct, 10) << "shutdown stopping remount finisher" << dendl; + remount_finisher.wait_for_empty(); + remount_finisher.stop(); + } + + if (ino_release_cb) { + ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl; + async_ino_releasor.wait_for_empty(); + async_ino_releasor.stop(); + } + + objectcacher->stop(); // outside of client_lock! this does a join. + + /* + * We are shuting down the client. + * + * Just declare the state to CLIENT_NEW to block and fail any + * new comming "reader" and then try to wait all the in-flight + * "readers" to finish. + */ + RWRef_t iref_writer(initialize_state, CLIENT_NEW, false); + if (!iref_writer.is_first_writer()) + return; + iref_writer.wait_readers_done(); + + { + std::scoped_lock l(timer_lock); + timer.shutdown(); + } + + objecter_finisher.wait_for_empty(); + objecter_finisher.stop(); + + if (logger) { + cct->get_perfcounters_collection()->remove(logger.get()); + logger.reset(); + } +} + +void Client::update_io_stat_metadata(utime_t latency) { + auto lat_nsec = latency.to_nsec(); + // old values are used to compute new ones + auto o_avg = logger->tget(l_c_md_avg).to_nsec(); + auto o_sqsum = logger->get(l_c_md_sqsum); + + auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request); + auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec, + nr_metadata_request); + + logger->tinc(l_c_lat, latency); + logger->tinc(l_c_reply, latency); + + utime_t avg; + avg.set_from_double(n_avg / 1000000000); + logger->tset(l_c_md_avg, avg); + logger->set(l_c_md_sqsum, n_sqsum); + logger->set(l_c_md_ops, nr_metadata_request); +} + +void Client::update_io_stat_read(utime_t latency) { + auto lat_nsec = latency.to_nsec(); + // old values are used to compute new ones + auto o_avg = logger->tget(l_c_rd_avg).to_nsec(); + auto o_sqsum = logger->get(l_c_rd_sqsum); + + auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request); + auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec, + nr_read_request); + + logger->tinc(l_c_read, latency); + + utime_t avg; + avg.set_from_double(n_avg / 1000000000); + logger->tset(l_c_rd_avg, avg); + logger->set(l_c_rd_sqsum, n_sqsum); + logger->set(l_c_rd_ops, nr_read_request); +} + +void Client::update_io_stat_write(utime_t latency) { + auto lat_nsec = latency.to_nsec(); + // old values are used to compute new ones + auto o_avg = logger->tget(l_c_wr_avg).to_nsec(); + auto o_sqsum = logger->get(l_c_wr_sqsum); + + auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request); + auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec, + nr_write_request); + + logger->tinc(l_c_wrlat, latency); + + utime_t avg; + avg.set_from_double(n_avg / 1000000000); + logger->tset(l_c_wr_avg, avg); + logger->set(l_c_wr_sqsum, n_sqsum); + logger->set(l_c_wr_ops, nr_write_request); +} + +// =================== +// metadata cache stuff + +void Client::trim_cache(bool trim_kernel_dcache) +{ + uint64_t max = cct->_conf->client_cache_size; + ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl; + unsigned last = 0; + while (lru.lru_get_size() != last) { + last = lru.lru_get_size(); + + if (!is_unmounting() && lru.lru_get_size() <= max) break; + + // trim! + Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire()); + if (!dn) + break; // done + + trim_dentry(dn); + } + + if (trim_kernel_dcache && lru.lru_get_size() > max) + _invalidate_kernel_dcache(); + + // hose root? + if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) { + ldout(cct, 15) << "trim_cache trimmed root " << root << dendl; + root.reset(); + } +} + +void Client::trim_cache_for_reconnect(MetaSession *s) +{ + mds_rank_t mds = s->mds_num; + ldout(cct, 20) << __func__ << " mds." << mds << dendl; + + int trimmed = 0; + list<Dentry*> skipped; + while (lru.lru_get_size() > 0) { + Dentry *dn = static_cast<Dentry*>(lru.lru_expire()); + if (!dn) + break; + + if ((dn->inode && dn->inode->caps.count(mds)) || + dn->dir->parent_inode->caps.count(mds)) { + trim_dentry(dn); + trimmed++; + } else + skipped.push_back(dn); + } + + for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p) + lru.lru_insert_mid(*p); + + ldout(cct, 20) << __func__ << " mds." << mds + << " trimmed " << trimmed << " dentries" << dendl; + + if (s->caps.size() > 0) + _invalidate_kernel_dcache(); +} + +void Client::trim_dentry(Dentry *dn) +{ + ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name + << " in dir " + << std::hex << dn->dir->parent_inode->ino << std::dec + << dendl; + if (dn->inode) { + Inode *diri = dn->dir->parent_inode; + clear_dir_complete_and_ordered(diri, true); + } + unlink(dn, false, false); // drop dir, drop dentry +} + + +void Client::update_inode_file_size(Inode *in, int issued, uint64_t size, + uint64_t truncate_seq, uint64_t truncate_size) +{ + uint64_t prior_size = in->size; + + if (truncate_seq > in->truncate_seq || + (truncate_seq == in->truncate_seq && size > in->size)) { + ldout(cct, 10) << "size " << in->size << " -> " << size << dendl; + in->size = size; + in->reported_size = size; + if (truncate_seq != in->truncate_seq) { + ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> " + << truncate_seq << dendl; + in->truncate_seq = truncate_seq; + in->oset.truncate_seq = truncate_seq; + + // truncate cached file data + if (prior_size > size) { + _invalidate_inode_cache(in, size, prior_size - size); + } + } + + // truncate inline data + if (in->inline_version < CEPH_INLINE_NONE) { + uint32_t len = in->inline_data.length(); + if (size < len) + in->inline_data.splice(size, len - size); + } + } + if (truncate_seq >= in->truncate_seq && + in->truncate_size != truncate_size) { + if (in->is_file()) { + ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> " + << truncate_size << dendl; + in->truncate_size = truncate_size; + in->oset.truncate_size = truncate_size; + } else { + ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl; + } + } +} + +void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq, + utime_t ctime, utime_t mtime, utime_t atime) +{ + ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued) + << " ctime " << ctime << " mtime " << mtime << dendl; + + if (time_warp_seq > in->time_warp_seq) + ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq + << " is higher than local time_warp_seq " + << in->time_warp_seq << dendl; + + int warn = false; + // be careful with size, mtime, atime + if (issued & (CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_WR| + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { + ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl; + if (ctime > in->ctime) + in->ctime = ctime; + if (time_warp_seq > in->time_warp_seq) { + //the mds updated times, so take those! + in->mtime = mtime; + in->atime = atime; + in->time_warp_seq = time_warp_seq; + } else if (time_warp_seq == in->time_warp_seq) { + //take max times + if (mtime > in->mtime) + in->mtime = mtime; + if (atime > in->atime) + in->atime = atime; + } else if (issued & CEPH_CAP_FILE_EXCL) { + //ignore mds values as we have a higher seq + } else warn = true; + } else { + ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl; + if (time_warp_seq >= in->time_warp_seq) { + in->ctime = ctime; + in->mtime = mtime; + in->atime = atime; + in->time_warp_seq = time_warp_seq; + } else warn = true; + } + if (warn) { + ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq " + << time_warp_seq << " is lower than local time_warp_seq " + << in->time_warp_seq + << dendl; + } +} + +void Client::_fragmap_remove_non_leaves(Inode *in) +{ + for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); ) + if (!in->dirfragtree.is_leaf(p->first)) + in->fragmap.erase(p++); + else + ++p; +} + +void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds) +{ + for (auto p = in->fragmap.begin(); p != in->fragmap.end(); ) + if (p->second == mds) + in->fragmap.erase(p++); + else + ++p; +} + +Inode * Client::add_update_inode(InodeStat *st, utime_t from, + MetaSession *session, + const UserPerm& request_perms) +{ + Inode *in; + bool was_new = false; + if (inode_map.count(st->vino)) { + in = inode_map[st->vino]; + ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl; + } else { + in = new Inode(this, st->vino, &st->layout); + inode_map[st->vino] = in; + + if (use_faked_inos()) + _assign_faked_ino(in); + + if (!root) { + root = in; + if (use_faked_inos()) + _assign_faked_root(root.get()); + root_ancestor = in; + cwd = root; + } else if (is_mounting()) { + root_parents[root_ancestor] = in; + root_ancestor = in; + } + + // immutable bits + in->ino = st->vino.ino; + in->snapid = st->vino.snapid; + in->mode = st->mode & S_IFMT; + was_new = true; + } + + in->rdev = st->rdev; + if (in->is_symlink()) + in->symlink = st->symlink; + + // only update inode if mds info is strictly newer, or it is the same and projected (odd). + bool new_version = false; + if (in->version == 0 || + ((st->cap.flags & CEPH_CAP_FLAG_AUTH) && + (in->version & ~1) < st->version)) + new_version = true; + + int issued; + in->caps_issued(&issued); + issued |= in->caps_dirty(); + int new_issued = ~issued & (int)st->cap.caps; + + bool need_snapdir_attr_refresh = false; + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + !(issued & CEPH_CAP_AUTH_EXCL)) { + in->mode = st->mode; + in->uid = st->uid; + in->gid = st->gid; + in->btime = st->btime; + in->snap_btime = st->snap_btime; + in->snap_metadata = st->snap_metadata; + in->fscrypt_auth = st->fscrypt_auth; + need_snapdir_attr_refresh = true; + } + + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + !(issued & CEPH_CAP_LINK_EXCL)) { + in->nlink = st->nlink; + } + + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + need_snapdir_attr_refresh = true; + update_inode_file_time(in, issued, st->time_warp_seq, + st->ctime, st->mtime, st->atime); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + in->layout = st->layout; + in->fscrypt_file = st->fscrypt_file; + update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size); + } + + if (in->is_dir()) { + if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) { + in->dirstat = st->dirstat; + } + // dir_layout/rstat/quota are not tracked by capability, update them only if + // the inode stat is from auth mds + if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) { + in->dir_layout = st->dir_layout; + ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl; + in->rstat = st->rstat; + in->quota = st->quota; + in->dir_pin = st->dir_pin; + } + // move me if/when version reflects fragtree changes. + if (in->dirfragtree != st->dirfragtree) { + in->dirfragtree = st->dirfragtree; + _fragmap_remove_non_leaves(in); + } + } + + if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && + st->xattrbl.length() && + st->xattr_version > in->xattr_version) { + auto p = st->xattrbl.cbegin(); + decode(in->xattrs, p); + in->xattr_version = st->xattr_version; + need_snapdir_attr_refresh = true; + } + + if (st->inline_version > in->inline_version) { + in->inline_data = st->inline_data; + in->inline_version = st->inline_version; + } + + /* always take a newer change attr */ + ldout(cct, 12) << __func__ << " client inode change_attr: " << in->change_attr << " , mds inodestat change_attr: " << st->change_attr << dendl; + if (st->change_attr > in->change_attr) + in->change_attr = st->change_attr; + + if (st->version > in->version) + in->version = st->version; + + if (was_new) + ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl; + + if (!st->cap.caps) + return in; // as with readdir returning indoes in different snaprealms (no caps!) + + if (in->snapid == CEPH_NOSNAP) { + add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted, + st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm), + st->cap.flags, request_perms); + if (in->auth_cap && in->auth_cap->session == session) { + in->max_size = st->max_size; + in->rstat = st->rstat; + } + + // setting I_COMPLETE needs to happen after adding the cap + if (in->is_dir() && + (st->cap.caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + in->dirstat.nfiles == 0 && + in->dirstat.nsubdirs == 0) { + ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl; + in->flags |= I_COMPLETE | I_DIR_ORDERED; + if (in->dir) { + ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with " + << in->dir->dentries.size() << " entries, marking all dentries null" << dendl; + in->dir->readdir_cache.clear(); + for (const auto& p : in->dir->dentries) { + unlink(p.second, true, true); // keep dir, keep dentry + } + if (in->dir->dentries.empty()) + close_dir(in->dir); + } + } + } else { + in->snap_caps |= st->cap.caps; + } + + if (need_snapdir_attr_refresh && in->is_dir() && in->snapid == CEPH_NOSNAP) { + vinodeno_t vino(in->ino, CEPH_SNAPDIR); + if (inode_map.count(vino)) { + refresh_snapdir_attrs(inode_map[vino], in); + } + } + + return in; +} + + +/* + * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache. + */ +Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease, + Inode *in, utime_t from, MetaSession *session, + Dentry *old_dentry) +{ + Dentry *dn = NULL; + if (dir->dentries.count(dname)) + dn = dir->dentries[dname]; + + ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino() + << " in dir " << dir->parent_inode->vino() << " dn " << dn + << dendl; + + if (dn && dn->inode) { + if (dn->inode->vino() == in->vino()) { + touch_dn(dn); + ldout(cct, 12) << " had dentry " << dname + << " with correct vino " << dn->inode->vino() + << dendl; + } else { + ldout(cct, 12) << " had dentry " << dname + << " with WRONG vino " << dn->inode->vino() + << dendl; + unlink(dn, true, true); // keep dir, keep dentry + } + } + + if (!dn || !dn->inode) { + InodeRef tmp_ref(in); + if (old_dentry) { + if (old_dentry->dir != dir) { + Inode *old_diri = old_dentry->dir->parent_inode; + clear_dir_complete_and_ordered(old_diri, false); + } + unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir + } + Inode *diri = dir->parent_inode; + clear_dir_complete_and_ordered(diri, false); + dn = link(dir, dname, in, dn); + + if (old_dentry) { + dn->is_renaming = false; + signal_cond_list(waiting_for_rename); + } + } + + update_dentry_lease(dn, dlease, from, session); + return dn; +} + +void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session) +{ + utime_t dttl = from; + dttl += (float)dlease->duration_ms / 1000.0; + + ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl; + + ceph_assert(dn); + + if (dlease->mask & CEPH_LEASE_VALID) { + if (dttl > dn->lease_ttl) { + ldout(cct, 10) << "got dentry lease on " << dn->name + << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl; + dn->lease_ttl = dttl; + dn->lease_mds = session->mds_num; + dn->lease_seq = dlease->seq; + dn->lease_gen = session->cap_gen; + } + } + dn->cap_shared_gen = dn->dir->parent_inode->shared_gen; + if (dlease->mask & CEPH_LEASE_PRIMARY_LINK) + dn->mark_primary(); + dn->alternate_name = std::move(dlease->alternate_name); +} + + +/* + * update MDS location cache for a single inode + */ +void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from) +{ + // auth + ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl; + if (dst->auth >= 0) { + in->fragmap[dst->frag] = dst->auth; + } else { + in->fragmap.erase(dst->frag); + } + if (!in->dirfragtree.is_leaf(dst->frag)) { + in->dirfragtree.force_to_leaf(cct, dst->frag); + _fragmap_remove_non_leaves(in); + } + + // replicated, only update from auth mds reply + if (from == dst->auth) { + in->dir_replicated = !dst->dist.empty(); + if (!dst->dist.empty()) + in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ; + else + in->frag_repmap.erase(dst->frag); + } +} + +void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete) +{ + if (complete) + diri->dir_release_count++; + else + diri->dir_ordered_count++; + if (diri->flags & I_COMPLETE) { + if (complete) { + ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl; + diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED); + } else { + if (diri->flags & I_DIR_ORDERED) { + ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl; + diri->flags &= ~I_DIR_ORDERED; + } + } + if (diri->dir) + diri->dir->readdir_cache.clear(); + } +} + +/* + * insert results from readdir or lssnap into the metadata cache. + */ +void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, + Inode *diri, Inode *diri_other) { + + auto& reply = request->reply; + ConnectionRef con = request->reply->get_connection(); + uint64_t features; + if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) { + features = (uint64_t)-1; + } + else { + features = con->get_features(); + } + + dir_result_t *dirp = request->dirp; + ceph_assert(dirp); + + // the extra buffer list is only set for readdir, lssnap and + // readdir_snapdiff replies + auto p = reply->get_extra_bl().cbegin(); + if (!p.end()) { + // snapdir? + if (request->head.op == CEPH_MDS_OP_LSSNAP) { + ceph_assert(diri); + diri = open_snapdir(diri); + } + bool snapdiff_req = request->head.op == CEPH_MDS_OP_READDIR_SNAPDIFF; + frag_t fg; + unsigned offset_hash; + if (snapdiff_req) { + fg = (unsigned)request->head.args.snapdiff.frag; + offset_hash = (unsigned)request->head.args.snapdiff.offset_hash; + } else { + fg = (unsigned)request->head.args.readdir.frag; + offset_hash = (unsigned)request->head.args.readdir.offset_hash; + } + + // only open dir if we're actually adding stuff to it! + Dir *dir = diri->open_dir(); + ceph_assert(dir); + //open opponent dir for snapdiff if any + Dir *dir_other = nullptr; + if (snapdiff_req) { + ceph_assert(diri_other); + dir_other = diri_other->open_dir(); + ceph_assert(dir_other); + } + + // dirstat + DirStat dst(p, features); + __u32 numdn; + __u16 flags; + decode(numdn, p); + decode(flags, p); + + bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END); + bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER); + + unsigned readdir_offset = dirp->next_offset; + string readdir_start = dirp->last_name; + ceph_assert(!readdir_start.empty() || readdir_offset == 2); + + unsigned last_hash = 0; + if (hash_order) { + if (!readdir_start.empty()) { + last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start)); + } else if (flags & CEPH_READDIR_OFFSET_HASH) { + /* mds understands offset_hash */ + last_hash = offset_hash; + } + } + + if (fg != dst.frag) { + ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl; + fg = dst.frag; + if (!hash_order) { + readdir_offset = 2; + readdir_start.clear(); + dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false); + } + } + + ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end + << ", hash_order=" << hash_order + << ", readdir_start " << readdir_start + << ", last_hash " << last_hash + << ", next_offset " << readdir_offset << dendl; + + if (diri->snapid != CEPH_SNAPDIR && + fg.is_leftmost() && readdir_offset == 2 && + !(hash_order && last_hash)) { + dirp->release_count = diri->dir_release_count; + dirp->ordered_count = diri->dir_ordered_count; + dirp->start_shared_gen = diri->shared_gen; + dirp->cache_index = 0; + } + + dirp->buffer_frag = fg; + + _readdir_drop_dirp_buffer(dirp); + dirp->buffer.reserve(numdn); + + string dname; + LeaseStat dlease; + for (unsigned i=0; i<numdn; i++) { + decode(dname, p); + dlease.decode(p, features); + InodeStat ist(p, features); + + ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl; + + Inode *in = add_update_inode(&ist, request->sent_stamp, session, + request->perms); + auto *effective_dir = dir; + auto *effective_diri = diri; + + if (snapdiff_req && in->snapid != diri->snapid) { + ceph_assert(diri_other); + ceph_assert(dir_other); + effective_diri = diri_other; + effective_dir = dir_other; + } + Dentry *dn; + if (effective_dir->dentries.count(dname)) { + Dentry *olddn = effective_dir->dentries[dname]; + if (olddn->inode != in) { + // replace incorrect dentry + unlink(olddn, true, true); // keep dir, dentry + dn = link(effective_dir, dname, in, olddn); + ceph_assert(dn == olddn); + } else { + // keep existing dn + dn = olddn; + touch_dn(dn); + } + } else { + // new dn + dn = link(effective_dir, dname, in, NULL); + } + dn->alternate_name = std::move(dlease.alternate_name); + + update_dentry_lease(dn, &dlease, request->sent_stamp, session); + if (hash_order) { + unsigned hash = ceph_frag_value(effective_diri->hash_dentry_name(dname)); + if (hash != last_hash) + readdir_offset = 2; + last_hash = hash; + dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true); + } else { + dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false); + } + // add to readdir cache + if (!snapdiff_req && + dirp->release_count == effective_diri->dir_release_count && + dirp->ordered_count == effective_diri->dir_ordered_count && + dirp->start_shared_gen == effective_diri->shared_gen) { + if (dirp->cache_index == effective_dir->readdir_cache.size()) { + if (i == 0) { + ceph_assert(!dirp->inode->is_complete_and_ordered()); + dir->readdir_cache.reserve(dirp->cache_index + numdn); + } + effective_dir->readdir_cache.push_back(dn); + } else if (dirp->cache_index < effective_dir->readdir_cache.size()) { + if (dirp->inode->is_complete_and_ordered()) + ceph_assert(effective_dir->readdir_cache[dirp->cache_index] == dn); + else + effective_dir->readdir_cache[dirp->cache_index] = dn; + } else { + ceph_abort_msg("unexpected readdir buffer idx"); + } + dirp->cache_index++; + } + // add to cached result list + dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in)); + ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl; + } + + if (numdn > 0) + dirp->last_name = dname; + if (end) + dirp->next_offset = 2; + else + dirp->next_offset = readdir_offset; + + if (dir->is_empty()) + close_dir(dir); + if (dir_other && dir_other->is_empty()) + close_dir(dir_other); + } +} + +/** insert_trace + * + * insert a trace from a MDS reply into the cache. + */ +Inode* Client::insert_trace(MetaRequest *request, MetaSession *session) +{ + auto& reply = request->reply; + int op = request->get_op(); + + ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num + << " is_target=" << (int)reply->head.is_target + << " is_dentry=" << (int)reply->head.is_dentry + << dendl; + + auto p = reply->get_trace_bl().cbegin(); + if (request->got_unsafe) { + ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl; + ceph_assert(p.end()); + return NULL; + } + + if (p.end()) { + ldout(cct, 10) << "insert_trace -- no trace" << dendl; + + Dentry *d = request->dentry(); + if (d) { + Inode *diri = d->dir->parent_inode; + clear_dir_complete_and_ordered(diri, true); + } + + if (d && reply->get_result() == 0) { + if (op == CEPH_MDS_OP_RENAME) { + // rename + Dentry *od = request->old_dentry(); + ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl; + ceph_assert(od); + unlink(od, true, true); // keep dir, dentry + } else if (op == CEPH_MDS_OP_RMDIR || + op == CEPH_MDS_OP_UNLINK) { + // unlink, rmdir + ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl; + unlink(d, true, true); // keep dir, dentry + } + } + return NULL; + } + + ConnectionRef con = request->reply->get_connection(); + uint64_t features; + if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) { + features = (uint64_t)-1; + } + else { + features = con->get_features(); + } + ldout(cct, 10) << " features 0x" << hex << features << dec << dendl; + + // snap trace + SnapRealm *realm = NULL; + if (reply->snapbl.length()) + update_snap_trace(session, reply->snapbl, &realm); + + ldout(cct, 10) << " hrm " + << " is_target=" << (int)reply->head.is_target + << " is_dentry=" << (int)reply->head.is_dentry + << dendl; + + InodeStat dirst; + DirStat dst; + string dname; + LeaseStat dlease; + InodeStat ist; + + if (reply->head.is_dentry) { + dirst.decode(p, features); + dst.decode(p, features); + decode(dname, p); + dlease.decode(p, features); + } + + Inode *in = 0; + if (reply->head.is_target) { + ist.decode(p, features); + if (cct->_conf->client_debug_getattr_caps) { + unsigned wanted = 0; + if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP) + wanted = request->head.args.getattr.mask; + else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE) + wanted = request->head.args.open.mask; + + if ((wanted & CEPH_CAP_XATTR_SHARED) && + !(ist.xattr_version > 0 && ist.xattrbl.length() > 0)) + ceph_abort_msg("MDS reply does not contain xattrs"); + } + + in = add_update_inode(&ist, request->sent_stamp, session, + request->perms); + } + + Inode *diri = NULL; + if (reply->head.is_dentry) { + diri = add_update_inode(&dirst, request->sent_stamp, session, + request->perms); + mds_rank_t from_mds = mds_rank_t(reply->get_source().num()); + update_dir_dist(diri, &dst, from_mds); // dir stat info is attached to .. + + if (in) { + Dir *dir = diri->open_dir(); + insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session, + (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL); + } else { + Dentry *dn = NULL; + if (diri->dir && diri->dir->dentries.count(dname)) { + dn = diri->dir->dentries[dname]; + if (dn->inode) { + clear_dir_complete_and_ordered(diri, false); + unlink(dn, true, true); // keep dir, dentry + } + } + if (dlease.duration_ms > 0) { + if (!dn) { + Dir *dir = diri->open_dir(); + dn = link(dir, dname, NULL, NULL); + } + update_dentry_lease(dn, &dlease, request->sent_stamp, session); + } + } + } else if (op == CEPH_MDS_OP_LOOKUPSNAP || + op == CEPH_MDS_OP_MKSNAP) { + ldout(cct, 10) << " faking snap lookup weirdness" << dendl; + // fake it for snap lookup + vinodeno_t vino = ist.vino; + vino.snapid = CEPH_SNAPDIR; + ceph_assert(inode_map.count(vino)); + diri = inode_map[vino]; + + string dname = request->path.last_dentry(); + + LeaseStat dlease; + dlease.duration_ms = 0; + + if (in) { + Dir *dir = diri->open_dir(); + insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session); + } else { + if (diri->dir && diri->dir->dentries.count(dname)) { + Dentry *dn = diri->dir->dentries[dname]; + if (dn->inode) + unlink(dn, true, true); // keep dir, dentry + } + } + } + + if (in) { + if (op == CEPH_MDS_OP_READDIR || + op == CEPH_MDS_OP_LSSNAP) { + insert_readdir_results(request, + session, + in, + nullptr); + } else if (op == CEPH_MDS_OP_LOOKUPNAME) { + // hack: return parent inode instead + in = diri; + } else if (op == CEPH_MDS_OP_READDIR_SNAPDIFF) { + // provide both request's inode (aka snapA) and traced one (snapB) + // to properly match snapdiff results + insert_readdir_results(request, + session, + request->inode(), + in); + } + + if (request->dentry() == NULL && in != request->inode()) { + // pin the target inode if its parent dentry is not pinned + request->set_other_inode(in); + } + } + + if (realm) + put_snap_realm(realm); + + request->target = in; + return in; +} + +// ------- + +mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri) +{ + mds_rank_t mds = MDS_RANK_NONE; + __u32 hash = 0; + bool is_hash = false; + int issued = 0; + + Inode *in = NULL; + Dentry *de = NULL; + + if (req->resend_mds >= 0) { + mds = req->resend_mds; + req->resend_mds = -1; + ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl; + goto out; + } + + if (cct->_conf->client_use_random_mds) + goto random_mds; + + in = req->inode(); + de = req->dentry(); + if (in) { + ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl; + if (req->path.depth()) { + hash = in->hash_dentry_name(req->path[0]); + ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash + << " on " << req->path[0] + << " => " << hash << dendl; + is_hash = true; + } + } else if (de) { + if (de->inode) { + in = de->inode.get(); + ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl; + } else { + in = de->dir->parent_inode; + hash = in->hash_dentry_name(de->name); + ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash + << " on " << de->name + << " => " << hash << dendl; + is_hash = true; + } + } + if (in) { + if (in->snapid != CEPH_NOSNAP) { + ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl; + while (in->snapid != CEPH_NOSNAP) { + if (in->snapid == CEPH_SNAPDIR) + in = in->snapdir_parent.get(); + else if (!in->dentries.empty()) + /* In most cases there will only be one dentry, so getting it + * will be the correct action. If there are multiple hard links, + * I think the MDS should be able to redirect as needed*/ + in = in->get_first_parent()->dir->parent_inode; + else { + ldout(cct, 10) << __func__ << "got unlinked inode, can't look at parent" << dendl; + break; + } + } + is_hash = false; + } + + ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash + << " hash=" << hash << dendl; + + if (req->get_op() == CEPH_MDS_OP_GETATTR) + issued = req->inode()->caps_issued(); + + if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) { + frag_t fg = in->dirfragtree[hash]; + if (!req->auth_is_best(issued)) { + auto repmapit = in->frag_repmap.find(fg); + if (repmapit != in->frag_repmap.end()) { + auto& repmap = repmapit->second; + auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1); + mds = repmap.at(r); + } + } else if (in->fragmap.count(fg)) { + mds = in->fragmap[fg]; + if (phash_diri) + *phash_diri = in; + } else if (in->auth_cap) { + req->send_to_auth = true; + mds = in->auth_cap->session->mds_num; + } + if (mds >= 0) { + ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl; + goto out; + } + } + + if (in->auth_cap && req->auth_is_best(issued)) { + mds = in->auth_cap->session->mds_num; + } else if (!in->caps.empty()) { + mds = in->caps.begin()->second.session->mds_num; + } else { + goto random_mds; + } + ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl; + + goto out; + } + +random_mds: + if (mds < 0) { + mds = _get_random_up_mds(); + ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl; + } + +out: + ldout(cct, 20) << "mds is " << mds << dendl; + return mds; +} + +void Client::connect_mds_targets(mds_rank_t mds) +{ + ldout(cct, 10) << __func__ << " for mds." << mds << dendl; + ceph_assert(mds_sessions.count(mds)); + const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds); + for (const auto &rank : info.export_targets) { + if (mds_sessions.count(rank) == 0 && + mdsmap->is_clientreplay_or_active_or_stopping(rank)) { + ldout(cct, 10) << "check_mds_sessions opening mds." << mds + << " export target mds." << rank << dendl; + + auto session = _get_or_open_mds_session(rank); + if (session->state == MetaSession::STATE_OPENING || + session->state == MetaSession::STATE_OPEN) + continue; + + _open_mds_session(rank); + } + } +} + +void Client::dump_mds_sessions(Formatter *f, bool cap_dump) +{ + f->dump_int("id", get_nodeid().v); + entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy()); + f->dump_object("inst", inst); + f->dump_stream("inst_str") << inst; + f->dump_stream("addr_str") << inst.addr; + f->open_array_section("sessions"); + for (const auto &p : mds_sessions) { + f->open_object_section("session"); + p.second->dump(f, cap_dump); + f->close_section(); + } + f->close_section(); + f->dump_int("mdsmap_epoch", mdsmap->get_epoch()); +} + +void Client::dump_mds_requests(Formatter *f) +{ + for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + f->open_object_section("request"); + p->second->dump(f); + f->close_section(); + } +} + +int Client::verify_reply_trace(int r, MetaSession *session, + MetaRequest *request, const MConstRef<MClientReply>& reply, + InodeRef *ptarget, bool *pcreated, + const UserPerm& perms) +{ + // check whether this request actually did the create, and set created flag + bufferlist extra_bl; + inodeno_t created_ino; + bool got_created_ino = false; + ceph::unordered_map<vinodeno_t, Inode*>::iterator p; + + extra_bl = reply->get_extra_bl(); + if (extra_bl.length() >= 8) { + if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) { + struct openc_response_t ocres; + + decode(ocres, extra_bl); + created_ino = ocres.created_ino; + /* + * The userland cephfs client doesn't have a way to do an async create + * (yet), so just discard delegated_inos for now. Eventually we should + * store them and use them in create calls, even if they are synchronous, + * if only for testing purposes. + */ + ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl; + } else { + // u64 containing number of created ino + decode(created_ino, extra_bl); + } + ldout(cct, 10) << "make_request created ino " << created_ino << dendl; + got_created_ino = true; + } + + if (pcreated) + *pcreated = got_created_ino; + + if (request->target) { + *ptarget = request->target; + ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl; + } else { + if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) { + (*ptarget) = p->second; + ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl; + } else { + // we got a traceless reply, and need to look up what we just + // created. for now, do this by name. someday, do this by the + // ino... which we know! FIXME. + InodeRef target; + Dentry *d = request->dentry(); + if (d) { + if (d->dir) { + ldout(cct, 10) << "make_request got traceless reply, looking up #" + << d->dir->parent_inode->ino << "/" << d->name + << " got_ino " << got_created_ino + << " ino " << created_ino + << dendl; + r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask, + &target, perms); + } else { + // if the dentry is not linked, just do our best. see #5021. + ceph_abort_msg("how did this happen? i want logs!"); + } + } else { + Inode *in = request->inode(); + ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #" + << in->ino << dendl; + r = _getattr(in, request->regetattr_mask, perms, true); + target = in; + } + if (r >= 0) { + // verify ino returned in reply and trace_dist are the same + if (got_created_ino && + created_ino.val != target->ino.val) { + ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl; + r = -CEPHFS_EINTR; + } + if (ptarget) + ptarget->swap(target); + } + } + } + + return r; +} + + +/** + * make a request + * + * Blocking helper to make an MDS request. + * + * If the ptarget flag is set, behavior changes slightly: the caller + * expects to get a pointer to the inode we are creating or operating + * on. As a result, we will follow up any traceless mutation reply + * with a getattr or lookup to transparently handle a traceless reply + * from the MDS (as when the MDS restarts and the client has to replay + * a request). + * + * @param request the MetaRequest to execute + * @param perms The user uid/gid to execute as (eventually, full group lists?) + * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on + * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file + * @param use_mds [optional] prefer a specific mds (-1 for default) + * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller + */ +int Client::make_request(MetaRequest *request, + const UserPerm& perms, + InodeRef *ptarget, bool *pcreated, + mds_rank_t use_mds, + bufferlist *pdirbl, + size_t feature_needed) +{ + int r = 0; + + // assign a unique tid + ceph_tid_t tid = ++last_tid; + request->set_tid(tid); + + // and timestamp + request->op_stamp = ceph_clock_now(); + request->created = ceph::coarse_mono_clock::now(); + + // make note + mds_requests[tid] = request->get(); + if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK) + oldest_tid = tid; + + request->set_caller_perms(perms); + + if (cct->_conf->client_inject_fixed_oldest_tid) { + ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl; + request->set_oldest_client_tid(1); + } else { + request->set_oldest_client_tid(oldest_tid); + } + + // hack target mds? + if (use_mds >= 0) + request->resend_mds = use_mds; + + MetaSessionRef session = NULL; + while (1) { + if (request->aborted()) + break; + + if (blocklisted) { + request->abort(-CEPHFS_EBLOCKLISTED); + break; + } + + // set up wait cond + ceph::condition_variable caller_cond; + request->caller_cond = &caller_cond; + + // choose mds + Inode *hash_diri = NULL; + mds_rank_t mds = choose_target_mds(request, &hash_diri); + int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds); + if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) { + if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) { + if (hash_diri) { + ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl; + _fragmap_remove_stopped_mds(hash_diri, mds); + } else { + ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl; + request->resend_mds = _get_random_up_mds(); + } + } else { + ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl; + wait_on_list(waiting_for_mdsmap); + } + continue; + } + + // open a session? + if (!have_open_session(mds)) { + session = _get_or_open_mds_session(mds); + if (session->state == MetaSession::STATE_REJECTED) { + request->abort(-CEPHFS_EPERM); + break; + } + // wait + if (session->state == MetaSession::STATE_OPENING) { + ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl; + wait_on_context_list(session->waiting_for_open); + continue; + } + + if (!have_open_session(mds)) + continue; + } else { + session = mds_sessions.at(mds); + } + + if (feature_needed != ULONG_MAX && !session->mds_features.test(feature_needed)) { + request->abort(-CEPHFS_EOPNOTSUPP); + break; + } + + // send request. + send_request(request, session.get()); + + // wait for signal + ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl; + request->kick = false; + std::unique_lock l{client_lock, std::adopt_lock}; + caller_cond.wait(l, [request] { + return (request->reply || // reply + request->resend_mds >= 0 || // forward + request->kick); + }); + l.release(); + request->caller_cond = nullptr; + + // did we get a reply? + if (request->reply) + break; + } + + if (!request->reply) { + ceph_assert(request->aborted()); + ceph_assert(!request->got_unsafe); + r = request->get_abort_code(); + request->item.remove_myself(); + unregister_request(request); + put_request(request); + return r; + } + + // got it! + auto reply = std::move(request->reply); + r = reply->get_result(); + if (r >= 0) + request->success = true; + + // kick dispatcher (we've got it!) + ceph_assert(request->dispatch_cond); + request->dispatch_cond->notify_all(); + ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl; + request->dispatch_cond = 0; + + if (r >= 0 && ptarget) + r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms); + + if (pdirbl) + *pdirbl = reply->get_extra_bl(); + + // -- log times -- + utime_t lat = ceph_clock_now(); + lat -= request->sent_stamp; + ldout(cct, 20) << "lat " << lat << dendl; + + ++nr_metadata_request; + update_io_stat_metadata(lat); + + put_request(request); + return r; +} + +void Client::unregister_request(MetaRequest *req) +{ + mds_requests.erase(req->tid); + if (req->tid == oldest_tid) { + map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid); + while (true) { + if (p == mds_requests.end()) { + oldest_tid = 0; + break; + } + if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) { + oldest_tid = p->first; + break; + } + ++p; + } + } + put_request(req); +} + +void Client::put_request(MetaRequest *request) +{ + if (request->_put()) { + int op = -1; + if (request->success) + op = request->get_op(); + InodeRef other_in; + request->take_other_inode(&other_in); + delete request; + + if (other_in && + (op == CEPH_MDS_OP_RMDIR || + op == CEPH_MDS_OP_RENAME || + op == CEPH_MDS_OP_RMSNAP)) { + _try_to_trim_inode(other_in.get(), false); + } + } +} + +int Client::encode_inode_release(Inode *in, MetaRequest *req, + mds_rank_t mds, int drop, + int unless, int force) +{ + ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req + << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless) + << ", force:" << force << ")" << dendl; + int released = 0; + auto it = in->caps.find(mds); + if (it != in->caps.end()) { + Cap &cap = it->second; + drop &= ~(in->dirty_caps | get_caps_used(in)); + if ((drop & cap.issued) && + !(unless & cap.issued)) { + ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl; + cap.issued &= ~drop; + cap.implemented &= ~drop; + released = 1; + } else { + released = force; + } + if (released) { + cap.wanted = in->caps_wanted(); + if (&cap == in->auth_cap && + !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) { + in->requested_max_size = 0; + ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl; + } + ceph_mds_request_release rel; + rel.ino = in->ino; + rel.cap_id = cap.cap_id; + rel.seq = cap.seq; + rel.issue_seq = cap.issue_seq; + rel.mseq = cap.mseq; + rel.caps = cap.implemented; + rel.wanted = cap.wanted; + rel.dname_len = 0; + rel.dname_seq = 0; + req->cap_releases.push_back(MClientRequest::Release(rel,"")); + } + } + ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:" + << released << dendl; + return released; +} + +void Client::encode_dentry_release(Dentry *dn, MetaRequest *req, + mds_rank_t mds, int drop, int unless) +{ + ldout(cct, 20) << __func__ << " enter(dn:" + << dn << ")" << dendl; + int released = 0; + if (dn->dir) + released = encode_inode_release(dn->dir->parent_inode, req, + mds, drop, unless, 1); + if (released && dn->lease_mds == mds) { + ldout(cct, 25) << "preemptively releasing dn to mds" << dendl; + auto& rel = req->cap_releases.back(); + rel.item.dname_len = dn->name.length(); + rel.item.dname_seq = dn->lease_seq; + rel.dname = dn->name; + dn->lease_mds = -1; + } + ldout(cct, 25) << __func__ << " exit(dn:" + << dn << ")" << dendl; +} + + +/* + * This requires the MClientRequest *request member to be set. + * It will error out horribly without one. + * Additionally, if you set any *drop member, you'd better have + * set the corresponding dentry! + */ +void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds) +{ + ldout(cct, 20) << __func__ << " enter (req: " + << req << ", mds: " << mds << ")" << dendl; + if (req->inode_drop && req->inode()) + encode_inode_release(req->inode(), req, + mds, req->inode_drop, + req->inode_unless); + + if (req->old_inode_drop && req->old_inode()) + encode_inode_release(req->old_inode(), req, + mds, req->old_inode_drop, + req->old_inode_unless); + if (req->other_inode_drop && req->other_inode()) + encode_inode_release(req->other_inode(), req, + mds, req->other_inode_drop, + req->other_inode_unless); + + if (req->dentry_drop && req->dentry()) + encode_dentry_release(req->dentry(), req, + mds, req->dentry_drop, + req->dentry_unless); + + if (req->old_dentry_drop && req->old_dentry()) + encode_dentry_release(req->old_dentry(), req, + mds, req->old_dentry_drop, + req->old_dentry_unless); + ldout(cct, 25) << __func__ << " exit (req: " + << req << ", mds " << mds <<dendl; +} + +bool Client::have_open_session(mds_rank_t mds) +{ + const auto &it = mds_sessions.find(mds); + return it != mds_sessions.end() && + (it->second->state == MetaSession::STATE_OPEN || + it->second->state == MetaSession::STATE_STALE); +} + +MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con) +{ + const auto &it = mds_sessions.find(mds); + if (it == mds_sessions.end() || it->second->con != con) { + return NULL; + } else { + return it->second; + } +} + +MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds) +{ + auto it = mds_sessions.find(mds); + return it == mds_sessions.end() ? _open_mds_session(mds) : it->second; +} + +/** + * Populate a map of strings with client-identifying metadata, + * such as the hostname. Call this once at initialization. + */ +void Client::populate_metadata(const std::string &mount_root) +{ + // Hostname +#ifdef _WIN32 + // TODO: move this to compat.h + char hostname[64]; + DWORD hostname_sz = 64; + GetComputerNameA(hostname, &hostname_sz); + metadata["hostname"] = hostname; +#else + struct utsname u; + int r = uname(&u); + if (r >= 0) { + metadata["hostname"] = u.nodename; + ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl; + } else { + ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl; + } +#endif + + metadata["pid"] = stringify(getpid()); + + // Ceph entity id (the '0' in "client.0") + metadata["entity_id"] = cct->_conf->name.get_id(); + + // Our mount position + if (!mount_root.empty()) { + metadata["root"] = mount_root; + } + + // Ceph version + metadata["ceph_version"] = pretty_version_to_str(); + metadata["ceph_sha1"] = git_version_to_str(); + + // Apply any metadata from the user's configured overrides + std::vector<std::string> tokens; + get_str_vec(cct->_conf->client_metadata, ",", tokens); + for (const auto &i : tokens) { + auto eqpos = i.find("="); + // Throw out anything that isn't of the form "<str>=<str>" + if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) { + lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl; + continue; + } + metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1); + } +} + +/** + * Optionally add or override client metadata fields. + */ +void Client::update_metadata(std::string const &k, std::string const &v) +{ + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + ceph_assert(iref_reader.is_state_satisfied()); + + std::scoped_lock l(client_lock); + + auto it = metadata.find(k); + if (it != metadata.end()) { + ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k + << "' from '" << it->second << "' to '" << v << "'" << dendl; + } + + metadata[k] = v; +} + +MetaSessionRef Client::_open_mds_session(mds_rank_t mds) +{ + ldout(cct, 10) << __func__ << " mds." << mds << dendl; + auto addrs = mdsmap->get_addrs(mds); + auto em = mds_sessions.emplace(std::piecewise_construct, + std::forward_as_tuple(mds), + std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs))); + ceph_assert(em.second); /* not already present */ + auto session = em.first->second; + + auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN); + m->metadata = metadata; + m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED); + m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL); + session->con->send_message2(std::move(m)); + return session; +} + +void Client::_close_mds_session(MetaSession *s) +{ + ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl; + s->state = MetaSession::STATE_CLOSING; + s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq)); +} + +void Client::_closed_mds_session(MetaSession *s, int err, bool rejected) +{ + ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl; + if (rejected && s->state != MetaSession::STATE_CLOSING) + s->state = MetaSession::STATE_REJECTED; + else + s->state = MetaSession::STATE_CLOSED; + s->con->mark_down(); + signal_context_list(s->waiting_for_open); + mount_cond.notify_all(); + remove_session_caps(s, err); + kick_requests_closed(s); + mds_ranks_closing.erase(s->mds_num); + if (s->state == MetaSession::STATE_CLOSED) + mds_sessions.erase(s->mds_num); +} + +static void reinit_mds_features(MetaSession *session, + const MConstRef<MClientSession>& m) { + session->mds_features = std::move(m->supported_features); + session->mds_metric_flags = std::move(m->metric_spec.metric_flags); +} + +void Client::handle_client_session(const MConstRef<MClientSession>& m) +{ + mds_rank_t from = mds_rank_t(m->get_source().num()); + ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl; + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(from, m->get_connection().get()); + if (!session) { + ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl; + return; + } + + switch (m->get_op()) { + case CEPH_SESSION_OPEN: + { + if (session->state == MetaSession::STATE_OPEN) { + ldout(cct, 10) << "mds." << from << " already opened, ignore it" + << dendl; + // The MDS could send a client_session(open) message even when + // the session state is STATE_OPEN. Normally, its fine to + // ignore this message, but, if the MDS sent this message just + // after it got upgraded, the MDS feature bits could differ + // than the one before the upgrade - so, refresh the feature + // bits the client holds. + reinit_mds_features(session.get(), m); + return; + } + /* + * The connection maybe broken and the session in client side + * has been reinitialized, need to update the seq anyway. + */ + if (!session->seq && m->get_seq()) + session->seq = m->get_seq(); + + reinit_mds_features(session.get(), m); + + renew_caps(session.get()); + session->state = MetaSession::STATE_OPEN; + if (is_unmounting()) + mount_cond.notify_all(); + else + connect_mds_targets(from); + signal_context_list(session->waiting_for_open); + break; + } + + case CEPH_SESSION_CLOSE: + _closed_mds_session(session.get()); + break; + + case CEPH_SESSION_RENEWCAPS: + if (session->cap_renew_seq == m->get_seq()) { + bool was_stale = ceph_clock_now() >= session->cap_ttl; + session->cap_ttl = + session->last_cap_renew_request + mdsmap->get_session_timeout(); + if (was_stale) + wake_up_session_caps(session.get(), false); + } + break; + + case CEPH_SESSION_STALE: + // invalidate session caps/leases + session->cap_gen++; + session->cap_ttl = ceph_clock_now(); + session->cap_ttl -= 1; + renew_caps(session.get()); + break; + + case CEPH_SESSION_RECALL_STATE: + /* + * Call the renew caps and flush cap releases just before + * triming the caps in case the tick() won't get a chance + * to run them, which could cause the client to be blocklisted + * and MDS daemons trying to recall the caps again and + * again. + * + * In most cases it will do nothing, and the new cap releases + * added by trim_caps() followed will be deferred flushing + * by tick(). + */ + renew_and_flush_cap_releases(); + trim_caps(session.get(), m->get_max_caps()); + break; + + case CEPH_SESSION_FLUSHMSG: + /* flush cap release */ + if (auto& m = session->release; m) { + session->con->send_message2(std::move(m)); + } + session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq())); + break; + + case CEPH_SESSION_FORCE_RO: + force_session_readonly(session.get()); + break; + + case CEPH_SESSION_REJECT: + { + std::string_view error_str; + auto it = m->metadata.find("error_string"); + if (it != m->metadata.end()) + error_str = it->second; + else + error_str = "unknown error"; + lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl; + + _closed_mds_session(session.get(), -CEPHFS_EPERM, true); + } + break; + + default: + ceph_abort(); + } +} + +bool Client::_any_stale_sessions() const +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + for (const auto &p : mds_sessions) { + if (p.second->state == MetaSession::STATE_STALE) { + return true; + } + } + + return false; +} + +void Client::_kick_stale_sessions() +{ + ldout(cct, 1) << __func__ << dendl; + + for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) { + auto s = it->second; + if (s->state == MetaSession::STATE_REJECTED) { + mds_sessions.erase(it->first); + continue; + } + if (s->state == MetaSession::STATE_STALE) + _closed_mds_session(s.get()); + } +} + +void Client::send_request(MetaRequest *request, MetaSession *session, + bool drop_cap_releases) +{ + // make the request + mds_rank_t mds = session->mds_num; + ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid() + << " for mds." << mds << dendl; + auto r = build_client_request(request, mds); + if (!r) + return; + + if (request->dentry()) { + r->set_dentry_wanted(); + } + if (request->got_unsafe) { + r->set_replayed_op(); + if (request->target) + r->head.ino = request->target->ino; + } else { + encode_cap_releases(request, mds); + if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases + request->cap_releases.clear(); + else + r->releases.swap(request->cap_releases); + } + r->set_mdsmap_epoch(mdsmap->get_epoch()); + if (r->head.op == CEPH_MDS_OP_SETXATTR) { + objecter->with_osdmap([r](const OSDMap& o) { + r->set_osdmap_epoch(o.get_epoch()); + }); + } + + if (request->mds == -1) { + request->sent_stamp = ceph_clock_now(); + ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl; + } + request->mds = mds; + + Inode *in = request->inode(); + if (in) { + auto it = in->caps.find(mds); + if (it != in->caps.end()) { + request->sent_on_mseq = it->second.mseq; + } + } + + session->requests.push_back(&request->item); + + ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl; + session->con->send_message2(std::move(r)); +} + +ref_t<MClientRequest> Client::build_client_request(MetaRequest *request, mds_rank_t mds) +{ + auto session = mds_sessions.at(mds); + bool old_version = !session->mds_features.test(CEPHFS_FEATURE_32BITS_RETRY_FWD); + + /* + * Avoid inifinite retrying after overflow. + * + * The client will increase the retry count and if the MDS is + * old version, so we limit to retry at most 256 times. + */ + if (request->retry_attempt) { + int old_max_retry = sizeof(((struct ceph_mds_request_head*)0)->num_retry); + old_max_retry = 1 << (old_max_retry * CHAR_BIT); + if ((old_version && request->retry_attempt >= old_max_retry) || + (uint32_t)request->retry_attempt >= UINT32_MAX) { + request->abort(-CEPHFS_EMULTIHOP); + request->caller_cond->notify_all(); + ldout(cct, 1) << __func__ << " request tid " << request->tid + << " retry seq overflow" << ", abort it" << dendl; + return nullptr; + } + } + + auto req = make_message<MClientRequest>(request->get_op(), session->mds_features); + req->set_tid(request->tid); + req->set_stamp(request->op_stamp); + memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head)); + + // if the filepath's haven't been set, set them! + if (request->path.empty()) { + Inode *in = request->inode(); + Dentry *de = request->dentry(); + if (in) + in->make_nosnap_relative_path(request->path); + else if (de) { + if (de->inode) + de->inode->make_nosnap_relative_path(request->path); + else if (de->dir) { + de->dir->parent_inode->make_nosnap_relative_path(request->path); + request->path.push_dentry(de->name); + } + else ldout(cct, 1) << "Warning -- unable to construct a filepath!" + << " No path, inode, or appropriately-endowed dentry given!" + << dendl; + } else ldout(cct, 1) << "Warning -- unable to construct a filepath!" + << " No path, inode, or dentry given!" + << dendl; + } + req->set_filepath(request->get_filepath()); + req->set_filepath2(request->get_filepath2()); + req->set_alternate_name(request->alternate_name); + req->set_data(request->data); + req->fscrypt_auth = request->fscrypt_auth; + req->fscrypt_file = request->fscrypt_file; + req->set_retry_attempt(request->retry_attempt++); + req->head.ext_num_fwd = request->num_fwd; + const gid_t *_gids; + int gid_count = request->perms.get_gids(&_gids); + req->set_gid_list(gid_count, _gids); + return req; +} + + + +void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd) +{ + mds_rank_t mds = mds_rank_t(fwd->get_source().num()); + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(mds, fwd->get_connection().get()); + if (!session) { + return; + } + ceph_tid_t tid = fwd->get_tid(); + + if (mds_requests.count(tid) == 0) { + ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl; + return; + } + + MetaRequest *request = mds_requests[tid]; + ceph_assert(request); + + /* + * Avoid inifinite retrying after overflow. + * + * The MDS will increase the fwd count and in client side + * if the num_fwd is less than the one saved in request + * that means the MDS is an old version and overflowed of + * 8 bits. + */ + auto num_fwd = fwd->get_num_fwd(); + if (num_fwd <= request->num_fwd || (uint32_t)num_fwd >= UINT32_MAX) { + request->abort(-CEPHFS_EMULTIHOP); + request->caller_cond->notify_all(); + ldout(cct, 0) << __func__ << " request tid " << tid << " new num_fwd " + << num_fwd << " old num_fwd " << request->num_fwd << ", fwd seq overflow" + << ", abort it" << dendl; + return; + } + + // reset retry counter + request->retry_attempt = 0; + + // request not forwarded, or dest mds has no session. + // resend. + ldout(cct, 10) << __func__ << " tid " << tid + << " fwd " << fwd->get_num_fwd() + << " to mds." << fwd->get_dest_mds() + << ", resending to " << fwd->get_dest_mds() + << dendl; + + request->mds = -1; + request->item.remove_myself(); + request->num_fwd = num_fwd; + request->resend_mds = fwd->get_dest_mds(); + request->caller_cond->notify_all(); +} + +bool Client::is_dir_operation(MetaRequest *req) +{ + int op = req->get_op(); + if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK || + op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME || + op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR || + op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE) + return true; + return false; +} + +void Client::handle_client_reply(const MConstRef<MClientReply>& reply) +{ + mds_rank_t mds_num = mds_rank_t(reply->get_source().num()); + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(mds_num, reply->get_connection().get()); + if (!session) { + return; + } + + ceph_tid_t tid = reply->get_tid(); + bool is_safe = reply->is_safe(); + + if (mds_requests.count(tid) == 0) { + lderr(cct) << __func__ << " no pending request on tid " << tid + << " safe is:" << is_safe << dendl; + return; + } + MetaRequest *request = mds_requests.at(tid); + + ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe + << " tid " << tid << dendl; + + // correct sessions ? + if (request->mds != mds_num) { + ldout(cct, 0) << "got a stale reply from mds." << mds_num + << " instead of mds." << request->mds << dendl; + return; + } + + if (request->got_unsafe && !is_safe) { + //duplicate response + ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds " + << mds_num << " safe:" << is_safe << dendl; + return; + } + + ceph_assert(!request->reply); + request->reply = reply; + insert_trace(request, session.get()); + + // Handle unsafe reply + if (!is_safe) { + request->got_unsafe = true; + session->unsafe_requests.push_back(&request->unsafe_item); + if (is_dir_operation(request)) { + Inode *dir = request->inode(); + ceph_assert(dir); + dir->unsafe_ops.push_back(&request->unsafe_dir_item); + } + if (request->target) { + InodeRef &in = request->target; + in->unsafe_ops.push_back(&request->unsafe_target_item); + } + } + + // Only signal the caller once (on the first reply): + // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent. + if (!is_safe || !request->got_unsafe) { + ceph::condition_variable cond; + request->dispatch_cond = &cond; + + // wake up waiter + ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl; + request->caller_cond->notify_all(); + + // wake for kick back + std::unique_lock l{client_lock, std::adopt_lock}; + cond.wait(l, [tid, request, &cond, this] { + if (request->dispatch_cond) { + ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " + << tid << " " << &cond << dendl; + } + return !request->dispatch_cond; + }); + l.release(); + } + + if (is_safe) { + // the filesystem change is committed to disk + // we're done, clean up + if (request->got_unsafe) { + request->unsafe_item.remove_myself(); + request->unsafe_dir_item.remove_myself(); + request->unsafe_target_item.remove_myself(); + signal_cond_list(request->waitfor_safe); + } + request->item.remove_myself(); + unregister_request(request); + } + if (is_unmounting()) + mount_cond.notify_all(); +} + +void Client::_handle_full_flag(int64_t pool) +{ + ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations " + << "on " << pool << dendl; + // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary + // to do this rather than blocking, because otherwise when we fill up we + // potentially lock caps forever on files with dirty pages, and we need + // to be able to release those caps to the MDS so that it can delete files + // and free up space. + epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool); + + // For all inodes with layouts in this pool and a pending flush write op + // (i.e. one of the ones we will cancel), we've got to purge_set their data + // from ObjectCacher so that it doesn't re-issue the write in response to + // the ENOSPC error. + // Fortunately since we're cancelling everything in a given pool, we don't + // need to know which ops belong to which ObjectSet, we can just blow all + // the un-flushed cached data away and mark any dirty inodes' async_err + // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were + // affecting this pool, and all the objectsets we're purging were also + // in this pool. + for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin(); + i != inode_map.end(); ++i) + { + Inode *inode = i->second; + if (inode->oset.dirty_or_tx + && (pool == -1 || inode->layout.pool_id == pool)) { + ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec + << " has dirty objects, purging and setting ENOSPC" << dendl; + objectcacher->purge_set(&inode->oset); + inode->set_async_err(-CEPHFS_ENOSPC); + } + } + + if (cancelled_epoch != (epoch_t)-1) { + set_cap_epoch_barrier(cancelled_epoch); + } +} + +void Client::handle_osd_map(const MConstRef<MOSDMap>& m) +{ + std::scoped_lock cl(client_lock); + + const auto myaddrs = messenger->get_myaddrs(); + bool new_blocklist = objecter->with_osdmap( + [&](const OSDMap& o) { + return o.is_blocklisted(myaddrs); + }); + + if (new_blocklist && !blocklisted) { + auto epoch = objecter->with_osdmap([](const OSDMap &o){ + return o.get_epoch(); + }); + lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl; + blocklisted = true; + + _abort_mds_sessions(-CEPHFS_EBLOCKLISTED); + + // Since we know all our OSD ops will fail, cancel them all preemtively, + // so that on an unhealthy cluster we can umount promptly even if e.g. + // some PGs were inaccessible. + objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED); + + } + + if (blocklisted) { + // Handle case where we were blocklisted but no longer are + blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){ + return o.is_blocklisted(myaddrs);}); + } + + // Always subscribe to next osdmap for blocklisted client + // until this client is not blocklisted. + if (blocklisted) { + objecter->maybe_request_map(); + } + + if (objecter->osdmap_full_flag()) { + _handle_full_flag(-1); + } else { + // Accumulate local list of full pools so that I can drop + // the objecter lock before re-entering objecter in + // cancel_writes + std::vector<int64_t> full_pools; + + objecter->with_osdmap([&full_pools](const OSDMap &o) { + for (const auto& kv : o.get_pools()) { + if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) { + full_pools.push_back(kv.first); + } + } + }); + + for (auto p : full_pools) + _handle_full_flag(p); + + // Subscribe to subsequent maps to watch for the full flag going + // away. For the global full flag objecter does this for us, but + // it pays no attention to the per-pool full flag so in this branch + // we do it ourselves. + if (!full_pools.empty()) { + objecter->maybe_request_map(); + } + } +} + + +// ------------------------ +// incoming messages + + +bool Client::ms_dispatch2(const MessageRef &m) +{ + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + if (!iref_reader.is_state_satisfied()) { + ldout(cct, 10) << "inactive, discarding " << *m << dendl; + return true; + } + + switch (m->get_type()) { + // mounting and mds sessions + case CEPH_MSG_MDS_MAP: + handle_mds_map(ref_cast<MMDSMap>(m)); + break; + case CEPH_MSG_FS_MAP: + handle_fs_map(ref_cast<MFSMap>(m)); + break; + case CEPH_MSG_FS_MAP_USER: + handle_fs_map_user(ref_cast<MFSMapUser>(m)); + break; + case CEPH_MSG_CLIENT_SESSION: + handle_client_session(ref_cast<MClientSession>(m)); + break; + + case CEPH_MSG_OSD_MAP: + handle_osd_map(ref_cast<MOSDMap>(m)); + break; + + // requests + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + handle_client_request_forward(ref_cast<MClientRequestForward>(m)); + break; + case CEPH_MSG_CLIENT_REPLY: + handle_client_reply(ref_cast<MClientReply>(m)); + break; + + // reclaim reply + case CEPH_MSG_CLIENT_RECLAIM_REPLY: + handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m)); + break; + + case CEPH_MSG_CLIENT_SNAP: + handle_snap(ref_cast<MClientSnap>(m)); + break; + case CEPH_MSG_CLIENT_CAPS: + handle_caps(ref_cast<MClientCaps>(m)); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_lease(ref_cast<MClientLease>(m)); + break; + case MSG_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) { + handle_command_reply(ref_cast<MCommandReply>(m)); + } else { + return false; + } + break; + case CEPH_MSG_CLIENT_QUOTA: + handle_quota(ref_cast<MClientQuota>(m)); + break; + + default: + return false; + } + + // unmounting? + std::scoped_lock cl(client_lock); + if (is_unmounting()) { + ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size() + << "+" << inode_map.size() << dendl; + uint64_t size = lru.lru_get_size() + inode_map.size(); + trim_cache(); + if (size > lru.lru_get_size() + inode_map.size()) { + ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl; + mount_cond.notify_all(); + } else { + ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size() + << "+" << inode_map.size() << dendl; + } + } + + return true; +} + +void Client::handle_fs_map(const MConstRef<MFSMap>& m) +{ + std::scoped_lock cl(client_lock); + fsmap.reset(new FSMap(m->get_fsmap())); + + signal_cond_list(waiting_for_fsmap); + + monclient->sub_got("fsmap", fsmap->get_epoch()); +} + +void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m) +{ + std::scoped_lock cl(client_lock); + fsmap_user.reset(new FSMapUser); + *fsmap_user = m->get_fsmap(); + + monclient->sub_got("fsmap.user", fsmap_user->get_epoch()); + signal_cond_list(waiting_for_fsmap); +} + +// Cancel all the commands for missing or laggy GIDs +void Client::cancel_commands(const MDSMap& newmap) +{ + std::vector<ceph_tid_t> cancel_ops; + + std::scoped_lock cmd_lock(command_lock); + auto &commands = command_table.get_commands(); + for (const auto &[tid, op] : commands) { + const mds_gid_t op_mds_gid = op.mds_gid; + if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) { + ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl; + cancel_ops.push_back(tid); + if (op.outs) { + std::ostringstream ss; + ss << "MDS " << op_mds_gid << " went away"; + *(op.outs) = ss.str(); + } + /* + * No need to make the con->mark_down under + * client_lock here, because the con will + * has its own lock. + */ + op.con->mark_down(); + if (op.on_finish) + op.on_finish->complete(-CEPHFS_ETIMEDOUT); + } + } + + for (const auto &tid : cancel_ops) + command_table.erase(tid); +} + +void Client::handle_mds_map(const MConstRef<MMDSMap>& m) +{ + std::unique_lock cl(client_lock); + if (m->get_epoch() <= mdsmap->get_epoch()) { + ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() + << " is identical to or older than our " + << mdsmap->get_epoch() << dendl; + return; + } + + cl.unlock(); + ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl; + std::unique_ptr<MDSMap> _mdsmap(new MDSMap); + _mdsmap->decode(m->get_encoded()); + cancel_commands(*_mdsmap.get()); + cl.lock(); + + _mdsmap.swap(mdsmap); + + // reset session + for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) { + mds_rank_t mds = p->first; + MetaSessionRef session = p->second; + ++p; + + int oldstate = _mdsmap->get_state(mds); + int newstate = mdsmap->get_state(mds); + if (!mdsmap->is_up(mds)) { + session->con->mark_down(); + } else if (mdsmap->get_addrs(mds) != session->addrs) { + auto old_inc = _mdsmap->get_incarnation(mds); + auto new_inc = mdsmap->get_incarnation(mds); + if (old_inc != new_inc) { + ldout(cct, 1) << "mds incarnation changed from " + << old_inc << " to " << new_inc << dendl; + oldstate = MDSMap::STATE_NULL; + } + session->con->mark_down(); + session->addrs = mdsmap->get_addrs(mds); + // When new MDS starts to take over, notify kernel to trim unused entries + // in its dcache/icache. Hopefully, the kernel will release some unused + // inodes before the new MDS enters reconnect state. + trim_cache_for_reconnect(session.get()); + } else if (oldstate == newstate) + continue; // no change + + session->mds_state = newstate; + if (newstate == MDSMap::STATE_RECONNECT) { + session->con = messenger->connect_to_mds(session->addrs); + send_reconnect(session.get()); + } else if (newstate > MDSMap::STATE_RECONNECT) { + if (oldstate < MDSMap::STATE_RECONNECT) { + ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl; + _closed_mds_session(session.get()); + continue; + } + if (newstate >= MDSMap::STATE_ACTIVE) { + if (oldstate < MDSMap::STATE_ACTIVE) { + // kick new requests + kick_requests(session.get()); + kick_flushing_caps(session.get()); + signal_context_list(session->waiting_for_open); + wake_up_session_caps(session.get(), true); + } + connect_mds_targets(mds); + } + } else if (newstate == MDSMap::STATE_NULL && + mds >= mdsmap->get_max_mds()) { + _closed_mds_session(session.get()); + } + } + + // kick any waiting threads + signal_cond_list(waiting_for_mdsmap); + + monclient->sub_got("mdsmap", mdsmap->get_epoch()); +} + +void Client::send_reconnect(MetaSession *session) +{ + mds_rank_t mds = session->mds_num; + ldout(cct, 10) << __func__ << " to mds." << mds << dendl; + + // trim unused caps to reduce MDS's cache rejoin time + trim_cache_for_reconnect(session); + + session->readonly = false; + + session->release.reset(); + + // reset my cap seq number + session->seq = 0; + //connect to the mds' offload targets + connect_mds_targets(mds); + //make sure unsafe requests get saved + resend_unsafe_requests(session); + + early_kick_flushing_caps(session); + + auto m = make_message<MClientReconnect>(); + bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT); + + // i have an open session. + ceph::unordered_set<inodeno_t> did_snaprealm; + for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin(); + p != inode_map.end(); + ++p) { + Inode *in = p->second; + auto it = in->caps.find(mds); + if (it != in->caps.end()) { + if (allow_multi && + m->get_approx_size() >= + static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) { + m->mark_more(); + session->con->send_message2(std::move(m)); + + m = make_message<MClientReconnect>(); + } + + Cap &cap = it->second; + ldout(cct, 10) << " caps on " << p->first + << " " << ccap_string(cap.issued) + << " wants " << ccap_string(in->caps_wanted()) + << dendl; + filepath path; + in->make_short_path(path); + ldout(cct, 10) << " path " << path << dendl; + + bufferlist flockbl; + _encode_filelocks(in, flockbl); + + cap.seq = 0; // reset seq. + cap.issue_seq = 0; // reset seq. + cap.mseq = 0; // reset seq. + // cap gen should catch up with session cap_gen + if (cap.gen < session->cap_gen) { + cap.gen = session->cap_gen; + cap.issued = cap.implemented = CEPH_CAP_PIN; + } else { + cap.issued = cap.implemented; + } + snapid_t snap_follows = 0; + if (!in->cap_snaps.empty()) + snap_follows = in->cap_snaps.begin()->first; + + m->add_cap(p->first.ino, + cap.cap_id, + path.get_ino(), path.get_path(), // ino + in->caps_wanted(), // wanted + cap.issued, // issued + in->snaprealm->ino, + snap_follows, + flockbl); + + if (did_snaprealm.count(in->snaprealm->ino) == 0) { + ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl; + m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent); + did_snaprealm.insert(in->snaprealm->ino); + } + } + } + + if (!allow_multi) + m->set_encoding_version(0); // use connection features to choose encoding + session->con->send_message2(std::move(m)); + + mount_cond.notify_all(); + + if (session->reclaim_state == MetaSession::RECLAIMING) + signal_cond_list(waiting_for_reclaim); +} + + +void Client::kick_requests(MetaSession *session) +{ + ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl; + for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + MetaRequest *req = p->second; + if (req->got_unsafe) + continue; + if (req->aborted()) { + if (req->caller_cond) { + req->kick = true; + req->caller_cond->notify_all(); + } + continue; + } + if (req->retry_attempt > 0) + continue; // new requests only + if (req->mds == session->mds_num) { + send_request(p->second, session); + } + } +} + +void Client::resend_unsafe_requests(MetaSession *session) +{ + for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin(); + !iter.end(); + ++iter) + send_request(*iter, session); + + // also re-send old requests when MDS enters reconnect stage. So that MDS can + // process completed requests in clientreplay stage. + for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + MetaRequest *req = p->second; + if (req->got_unsafe) + continue; + if (req->aborted()) + continue; + if (req->retry_attempt == 0) + continue; // old requests only + if (req->mds == session->mds_num) + send_request(req, session, true); + } +} + +void Client::wait_unsafe_requests() +{ + list<MetaRequest*> last_unsafe_reqs; + for (const auto &p : mds_sessions) { + const auto s = p.second; + if (!s->unsafe_requests.empty()) { + MetaRequest *req = s->unsafe_requests.back(); + req->get(); + last_unsafe_reqs.push_back(req); + } + } + + for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin(); + p != last_unsafe_reqs.end(); + ++p) { + MetaRequest *req = *p; + if (req->unsafe_item.is_on_list()) + wait_on_list(req->waitfor_safe); + put_request(req); + } +} + +void Client::kick_requests_closed(MetaSession *session) +{ + ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl; + for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin(); + p != mds_requests.end(); ) { + MetaRequest *req = p->second; + ++p; + if (req->mds == session->mds_num) { + if (req->caller_cond) { + req->kick = true; + req->caller_cond->notify_all(); + } + req->item.remove_myself(); + if (req->got_unsafe) { + lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl; + req->unsafe_item.remove_myself(); + if (is_dir_operation(req)) { + Inode *dir = req->inode(); + ceph_assert(dir); + dir->set_async_err(-CEPHFS_EIO); + lderr(cct) << "kick_requests_closed drop req of inode(dir) : " + << dir->ino << " " << req->get_tid() << dendl; + req->unsafe_dir_item.remove_myself(); + } + if (req->target) { + InodeRef &in = req->target; + in->set_async_err(-CEPHFS_EIO); + lderr(cct) << "kick_requests_closed drop req of inode : " + << in->ino << " " << req->get_tid() << dendl; + req->unsafe_target_item.remove_myself(); + } + signal_cond_list(req->waitfor_safe); + unregister_request(req); + } + } + } + ceph_assert(session->requests.empty()); + ceph_assert(session->unsafe_requests.empty()); +} + + + + +/************ + * leases + */ + +void Client::got_mds_push(MetaSession *s) +{ + s->seq++; + ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl; + if (s->state == MetaSession::STATE_CLOSING) { + s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq)); + } +} + +void Client::handle_lease(const MConstRef<MClientLease>& m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + + ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE); + mds_rank_t mds = mds_rank_t(m->get_source().num()); + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + return; + } + + got_mds_push(session.get()); + + ceph_seq_t seq = m->get_seq(); + + Inode *in; + vinodeno_t vino(m->get_ino(), CEPH_NOSNAP); + if (inode_map.count(vino) == 0) { + ldout(cct, 10) << " don't have vino " << vino << dendl; + goto revoke; + } + in = inode_map[vino]; + + if (m->get_mask() & CEPH_LEASE_VALID) { + if (!in->dir || in->dir->dentries.count(m->dname) == 0) { + ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl; + goto revoke; + } + Dentry *dn = in->dir->dentries[m->dname]; + ldout(cct, 10) << " revoked DN lease on " << dn << dendl; + dn->lease_mds = -1; + } + + revoke: + { + auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq, + m->get_mask(), m->get_ino(), + m->get_first(), m->get_last(), m->dname); + m->get_connection()->send_message2(std::move(reply)); + } +} + +void Client::_put_inode(Inode *in, int n) +{ + ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl; + + int left = in->get_nref(); + ceph_assert(left >= n + 1); + in->iput(n); + left -= n; + if (left == 1) { // the last one will be held by the inode_map + // release any caps + remove_all_caps(in); + + ldout(cct, 10) << __func__ << " deleting " << *in << dendl; + bool unclean = objectcacher->release_set(&in->oset); + ceph_assert(!unclean); + inode_map.erase(in->vino()); + if (use_faked_inos()) + _release_faked_ino(in); + + if (root == nullptr) { + root_ancestor = 0; + while (!root_parents.empty()) + root_parents.erase(root_parents.begin()); + } + + in->iput(); + } +} + +void Client::delay_put_inodes(bool wakeup) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + std::map<Inode*,int> release; + { + std::scoped_lock dl(delay_i_lock); + release.swap(delay_i_release); + } + + if (release.empty()) + return; + + for (auto &[in, cnt] : release) + _put_inode(in, cnt); + + if (wakeup) + mount_cond.notify_all(); +} + +void Client::put_inode(Inode *in, int n) +{ + ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl; + + std::scoped_lock dl(delay_i_lock); + delay_i_release[in] += n; +} + +void Client::close_dir(Dir *dir) +{ + Inode *in = dir->parent_inode; + ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl; + ceph_assert(dir->is_empty()); + ceph_assert(in->dir == dir); + ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked + if (!in->dentries.empty()) + in->get_first_parent()->put(); // unpin dentry + + delete in->dir; + in->dir = 0; + put_inode(in); // unpin inode +} + + /** + * Don't call this with in==NULL, use get_or_create for that + * leave dn set to default NULL unless you're trying to add + * a new inode to a pre-created Dentry + */ +Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) +{ + if (!dn) { + // create a new Dentry + dn = new Dentry(dir, name); + + lru.lru_insert_mid(dn); // mid or top? + + if(in) { + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << *in + << " dn " << *dn << " (new dn)" << dendl; + } else { + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' " + << " dn " << *dn << " (new dn)" << dendl; + } + } else { + ceph_assert(!dn->inode); + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << in + << " dn " << *dn << " (old dn)" << dendl; + } + + if (in) { // link to inode + InodeRef tmp_ref; + // only one parent for directories! + if (in->is_dir() && !in->dentries.empty()) { + tmp_ref = in; // prevent unlink below from freeing the inode. + Dentry *olddn = in->get_first_parent(); + ceph_assert(olddn->dir != dir || olddn->name != name); + Inode *old_diri = olddn->dir->parent_inode; + clear_dir_complete_and_ordered(old_diri, true); + unlink(olddn, true, true); // keep dir, dentry + } + + dn->link(in); + inc_dentry_nr(); + ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl; + } + + return dn; +} + +void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) +{ + InodeRef in(dn->inode); + ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn + << " inode " << dn->inode << dendl; + + // unlink from inode + if (dn->inode) { + dn->unlink(); + dec_dentry_nr(); + ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl; + } + + if (keepdentry) { + dn->lease_mds = -1; + } else { + ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl; + + // unlink from dir + Dir *dir = dn->dir; + dn->detach(); + + // delete den + lru.lru_remove(dn); + dn->put(); + + if (dir->is_empty() && !keepdir) + close_dir(dir); + } +} + +/** + * For asynchronous flushes, check for errors from the IO and + * update the inode if necessary + */ +class C_Client_FlushComplete : public Context { +private: + Client *client; + InodeRef inode; +public: + C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { } + void finish(int r) override { + ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock)); + if (r != 0) { + client_t const whoami = client->whoami; // For the benefit of ldout prefix + ldout(client->cct, 1) << "I/O error from flush on inode " << inode + << " 0x" << std::hex << inode->ino << std::dec + << ": " << r << "(" << cpp_strerror(r) << ")" << dendl; + inode->set_async_err(r); + } + } +}; + + +/**** + * caps + */ + +void Client::get_cap_ref(Inode *in, int cap) +{ + if ((cap & CEPH_CAP_FILE_BUFFER) && + in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) { + ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl; + in->iget(); + } + if ((cap & CEPH_CAP_FILE_CACHE) && + in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) { + ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl; + in->iget(); + } + in->get_cap_ref(cap); +} + +void Client::put_cap_ref(Inode *in, int cap) +{ + int last = in->put_cap_ref(cap); + if (last) { + int put_nref = 0; + int drop = last & ~in->caps_issued(); + if (in->snapid == CEPH_NOSNAP) { + if ((last & CEPH_CAP_FILE_WR) && + !in->cap_snaps.empty() && + in->cap_snaps.rbegin()->second.writing) { + ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl; + in->cap_snaps.rbegin()->second.writing = 0; + finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in)); + signal_cond_list(in->waitfor_caps); // wake up blocked sync writers + } + if (last & CEPH_CAP_FILE_BUFFER) { + for (auto &p : in->cap_snaps) + p.second.dirty_data = 0; + signal_cond_list(in->waitfor_commit); + ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl; + ++put_nref; + + if (!in->cap_snaps.empty()) { + flush_snaps(in); + } + } + } + if (last & CEPH_CAP_FILE_CACHE) { + ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl; + ++put_nref; + } + if (drop) + check_caps(in, 0); + if (put_nref) + put_inode(in, put_nref); + } +} + +// get caps for a given file handle -- the inode should have @need caps +// issued by the mds and @want caps not revoked (or not under revocation). +// this routine blocks till the cap requirement is satisfied. also account +// (track) for capability hit when required (when cap requirement succeedes). +int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff) +{ + Inode *in = fh->inode.get(); + + int r = check_pool_perm(in, need); + if (r < 0) + return r; + + while (1) { + int file_wanted = in->caps_file_wanted(); + if ((file_wanted & need) != need) { + ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need) + << " file_wanted " << ccap_string(file_wanted) << ", EBADF " + << dendl; + return -CEPHFS_EBADF; + } + + if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen) + return -CEPHFS_EBADF; + + if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks()) + return -CEPHFS_EIO; + + int implemented; + int have = in->caps_issued(&implemented); + + bool waitfor_caps = false; + bool waitfor_commit = false; + + if (have & need & CEPH_CAP_FILE_WR) { + if (endoff > 0) { + if ((endoff >= (loff_t)in->max_size || + endoff > (loff_t)(in->size << 1)) && + endoff > (loff_t)in->wanted_max_size) { + ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl; + in->wanted_max_size = endoff; + } + if (in->wanted_max_size > in->max_size && + in->wanted_max_size > in->requested_max_size) + check_caps(in, 0); + } + + if (endoff >= 0 && endoff > (loff_t)in->max_size) { + ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl; + waitfor_caps = true; + } + if (!in->cap_snaps.empty()) { + if (in->cap_snaps.rbegin()->second.writing) { + ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl; + waitfor_caps = true; + } + for (auto &p : in->cap_snaps) { + if (p.second.dirty_data) { + waitfor_commit = true; + break; + } + } + if (waitfor_commit) { + _flush(in, new C_Client_FlushComplete(this, in)); + ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl; + } + } + } + + if (!waitfor_caps && !waitfor_commit) { + if ((have & need) == need) { + int revoking = implemented & ~have; + ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have) + << " need " << ccap_string(need) << " want " << ccap_string(want) + << " revoking " << ccap_string(revoking) + << dendl; + if ((revoking & want) == 0) { + *phave = need | (have & want); + in->get_cap_ref(need); + cap_hit(); + return 0; + } + } + ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl; + waitfor_caps = true; + } + + if ((need & CEPH_CAP_FILE_WR) && + ((in->auth_cap && in->auth_cap->session->readonly) || + // userland clients are only allowed to read if fscrypt enabled + in->is_fscrypt_enabled())) + return -CEPHFS_EROFS; + + if (in->flags & I_CAP_DROPPED) { + int mds_wanted = in->caps_mds_wanted(); + if ((mds_wanted & need) != need) { + int ret = _renew_caps(in); + if (ret < 0) + return ret; + continue; + } + if (!(file_wanted & ~mds_wanted)) + in->flags &= ~I_CAP_DROPPED; + } + + if (waitfor_caps) + wait_on_list(in->waitfor_caps); + else if (waitfor_commit) + wait_on_list(in->waitfor_commit); + } +} + +int Client::get_caps_used(Inode *in) +{ + unsigned used = in->caps_used(); + if (!(used & CEPH_CAP_FILE_CACHE) && + !objectcacher->set_is_empty(&in->oset)) + used |= CEPH_CAP_FILE_CACHE; + return used; +} + +void Client::cap_delay_requeue(Inode *in) +{ + ldout(cct, 10) << __func__ << " on " << *in << dendl; + + in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay; + delayed_list.push_back(&in->delay_cap_item); +} + +void Client::send_cap(Inode *in, MetaSession *session, Cap *cap, + int flags, int used, int want, int retain, + int flush, ceph_tid_t flush_tid) +{ + int held = cap->issued | cap->implemented; + int revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + int dropping = cap->issued & ~retain; + int op = CEPH_CAP_OP_UPDATE; + + ldout(cct, 10) << __func__ << " " << *in + << " mds." << session->mds_num << " seq " << cap->seq + << " used " << ccap_string(used) + << " want " << ccap_string(want) + << " flush " << ccap_string(flush) + << " retain " << ccap_string(retain) + << " held "<< ccap_string(held) + << " revoking " << ccap_string(revoking) + << " dropping " << ccap_string(dropping) + << dendl; + + if (cct->_conf->client_inject_release_failure && revoking) { + const int would_have_issued = cap->issued & retain; + const int would_have_implemented = cap->implemented & (cap->issued | used); + // Simulated bug: + // - tell the server we think issued is whatever they issued plus whatever we implemented + // - leave what we have implemented in place + ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl; + cap->issued = cap->issued | cap->implemented; + + // Make an exception for revoking xattr caps: we are injecting + // failure to release other caps, but allow xattr because client + // will block on xattr ops if it can't release these to MDS (#9800) + const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + cap->issued ^= xattr_mask & revoking; + cap->implemented ^= xattr_mask & revoking; + + ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl; + ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl; + } else { + // Normal behaviour + cap->issued &= retain; + cap->implemented &= cap->issued | used; + } + + snapid_t follows = 0; + + if (flush) + follows = in->snaprealm->get_snap_context().seq; + + auto m = make_message<MClientCaps>(op, + in->ino, + 0, + cap->cap_id, cap->seq, + cap->implemented, + want, + flush, + cap->mseq, + cap_epoch_barrier); + m->caller_uid = in->cap_dirtier_uid; + m->caller_gid = in->cap_dirtier_gid; + + m->head.issue_seq = cap->issue_seq; + m->set_tid(flush_tid); + + m->head.uid = in->uid; + m->head.gid = in->gid; + m->head.mode = in->mode; + + m->head.nlink = in->nlink; + + if (flush & CEPH_CAP_XATTR_EXCL) { + encode(in->xattrs, m->xattrbl); + m->head.xattr_version = in->xattr_version; + } + + m->size = in->size; + m->max_size = in->max_size; + m->truncate_seq = in->truncate_seq; + m->truncate_size = in->truncate_size; + m->mtime = in->mtime; + m->atime = in->atime; + m->ctime = in->ctime; + m->btime = in->btime; + m->time_warp_seq = in->time_warp_seq; + m->change_attr = in->change_attr; + m->fscrypt_auth = in->fscrypt_auth; + m->fscrypt_file = in->fscrypt_file; + + if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) && + !in->cap_snaps.empty() && + in->cap_snaps.rbegin()->second.flush_tid == 0) + flags |= MClientCaps::FLAG_PENDING_CAPSNAP; + m->flags = flags; + + if (flush & CEPH_CAP_FILE_WR) { + m->inline_version = in->inline_version; + m->inline_data = in->inline_data; + } + + in->reported_size = in->size; + m->set_snap_follows(follows); + cap->wanted = want; + if (cap == in->auth_cap) { + if (want & CEPH_CAP_ANY_FILE_WR) { + m->set_max_size(in->wanted_max_size); + in->requested_max_size = in->wanted_max_size; + ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl; + } else { + in->requested_max_size = 0; + ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl; + } + } + + if (!session->flushing_caps_tids.empty()) + m->set_oldest_flush_tid(*session->flushing_caps_tids.begin()); + + session->con->send_message2(std::move(m)); +} + +static bool is_max_size_approaching(Inode *in) +{ + /* mds will adjust max size according to the reported size */ + if (in->flushing_caps & CEPH_CAP_FILE_WR) + return false; + if (in->size >= in->max_size) + return true; + /* half of previous max_size increment has been used */ + if (in->max_size > in->reported_size && + (in->size << 1) >= in->max_size + in->reported_size) + return true; + return false; +} + +static int adjust_caps_used_for_lazyio(int used, int issued, int implemented) +{ + if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER))) + return used; + if (!(implemented & CEPH_CAP_FILE_LAZYIO)) + return used; + + if (issued & CEPH_CAP_FILE_LAZYIO) { + if (!(issued & CEPH_CAP_FILE_CACHE)) { + used &= ~CEPH_CAP_FILE_CACHE; + used |= CEPH_CAP_FILE_LAZYIO; + } + if (!(issued & CEPH_CAP_FILE_BUFFER)) { + used &= ~CEPH_CAP_FILE_BUFFER; + used |= CEPH_CAP_FILE_LAZYIO; + } + } else { + if (!(implemented & CEPH_CAP_FILE_CACHE)) { + used &= ~CEPH_CAP_FILE_CACHE; + used |= CEPH_CAP_FILE_LAZYIO; + } + if (!(implemented & CEPH_CAP_FILE_BUFFER)) { + used &= ~CEPH_CAP_FILE_BUFFER; + used |= CEPH_CAP_FILE_LAZYIO; + } + } + return used; +} + +/** + * check_caps + * + * Examine currently used and wanted versus held caps. Release, flush or ack + * revoked caps to the MDS as appropriate. + * + * @param in the inode to check + * @param flags flags to apply to cap check + */ +void Client::check_caps(Inode *in, unsigned flags) +{ + unsigned wanted = in->caps_wanted(); + unsigned used = get_caps_used(in); + unsigned cap_used; + + int implemented; + int issued = in->caps_issued(&implemented); + int revoking = implemented & ~issued; + + int orig_used = used; + used = adjust_caps_used_for_lazyio(used, issued, implemented); + + int retain = wanted | used | CEPH_CAP_PIN; + if (!is_unmounting() && in->nlink > 0) { + if (wanted) { + retain |= CEPH_CAP_ANY; + } else if (in->is_dir() && + (issued & CEPH_CAP_FILE_SHARED) && + (in->flags & I_COMPLETE)) { + // we do this here because we don't want to drop to Fs (and then + // drop the Fs if we do a create!) if that alone makes us send lookups + // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere + wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= wanted; + } else { + retain |= CEPH_CAP_ANY_SHARED; + // keep RD only if we didn't have the file open RW, + // because then the mds would revoke it anyway to + // journal max_size=0. + if (in->max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } + } + + ldout(cct, 10) << __func__ << " on " << *in + << " wanted " << ccap_string(wanted) + << " used " << ccap_string(used) + << " issued " << ccap_string(issued) + << " revoking " << ccap_string(revoking) + << " flags=" << flags + << dendl; + + if (in->snapid != CEPH_NOSNAP) + return; //snap caps last forever, can't write + + if (in->caps.empty()) + return; // guard if at end of func + + if (!(orig_used & CEPH_CAP_FILE_BUFFER) && + (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) { + if (_release(in)) + used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO); + } + + for (auto &[mds, cap] : in->caps) { + auto session = mds_sessions.at(mds); + + cap_used = used; + if (in->auth_cap && &cap != in->auth_cap) + cap_used &= ~in->auth_cap->issued; + + revoking = cap.implemented & ~cap.issued; + + ldout(cct, 10) << " cap mds." << mds + << " issued " << ccap_string(cap.issued) + << " implemented " << ccap_string(cap.implemented) + << " revoking " << ccap_string(revoking) << dendl; + + if (in->wanted_max_size > in->max_size && + in->wanted_max_size > in->requested_max_size && + &cap == in->auth_cap) + goto ack; + + /* approaching file_max? */ + if ((cap.issued & CEPH_CAP_FILE_WR) && + &cap == in->auth_cap && + is_max_size_approaching(in)) { + ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size + << ", reported " << in->reported_size << dendl; + goto ack; + } + + /* completed revocation? */ + if (revoking && (revoking & cap_used) == 0) { + ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl; + goto ack; + } + + /* want more caps from mds? */ + if (wanted & ~(cap.wanted | cap.issued)) + goto ack; + + if (!revoking && is_unmounting() && (cap_used == 0)) + goto ack; + + if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like + !in->dirty_caps) // and we have no dirty caps + continue; + + if (!(flags & CHECK_CAPS_NODELAY)) { + ldout(cct, 10) << "delaying cap release" << dendl; + cap_delay_requeue(in); + continue; + } + + ack: + if (&cap == in->auth_cap) { + if (in->flags & I_KICK_FLUSH) { + ldout(cct, 20) << " reflushing caps (check_caps) on " << *in + << " to mds." << mds << dendl; + kick_flushing_caps(in, session.get()); + } + if (!in->cap_snaps.empty() && + in->cap_snaps.rbegin()->second.flush_tid == 0) + flush_snaps(in); + } + + int flushing; + int msg_flags = 0; + ceph_tid_t flush_tid; + if (in->auth_cap == &cap && in->dirty_caps) { + flushing = mark_caps_flushing(in, &flush_tid); + if (flags & CHECK_CAPS_SYNCHRONOUS) + msg_flags |= MClientCaps::FLAG_SYNC; + } else { + flushing = 0; + flush_tid = 0; + } + + in->delay_cap_item.remove_myself(); + send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain, + flushing, flush_tid); + } +} + + +void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc) +{ + int used = get_caps_used(in); + int dirty = in->caps_dirty(); + ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl; + + if (in->cap_snaps.size() && + in->cap_snaps.rbegin()->second.writing) { + ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl; + return; + } else if (dirty || (used & CEPH_CAP_FILE_WR)) { + const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in)); + ceph_assert(capsnapem.second); /* element inserted */ + CapSnap &capsnap = capsnapem.first->second; + capsnap.context = old_snapc; + capsnap.issued = in->caps_issued(); + capsnap.dirty = dirty; + + capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER); + + capsnap.uid = in->uid; + capsnap.gid = in->gid; + capsnap.mode = in->mode; + capsnap.btime = in->btime; + capsnap.xattrs = in->xattrs; + capsnap.xattr_version = in->xattr_version; + capsnap.cap_dirtier_uid = in->cap_dirtier_uid; + capsnap.cap_dirtier_gid = in->cap_dirtier_gid; + + if (used & CEPH_CAP_FILE_WR) { + ldout(cct, 10) << __func__ << " WR used on " << *in << dendl; + capsnap.writing = 1; + } else { + finish_cap_snap(in, capsnap, used); + } + } else { + ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl; + } +} + +void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used) +{ + ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl; + capsnap.size = in->size; + capsnap.mtime = in->mtime; + capsnap.atime = in->atime; + capsnap.ctime = in->ctime; + capsnap.time_warp_seq = in->time_warp_seq; + capsnap.change_attr = in->change_attr; + capsnap.dirty |= in->caps_dirty(); + + /* Only reset it if it wasn't set before */ + if (capsnap.cap_dirtier_uid == -1) { + capsnap.cap_dirtier_uid = in->cap_dirtier_uid; + capsnap.cap_dirtier_gid = in->cap_dirtier_gid; + } + + if (capsnap.dirty & CEPH_CAP_FILE_WR) { + capsnap.inline_data = in->inline_data; + capsnap.inline_version = in->inline_version; + } + + if (used & CEPH_CAP_FILE_BUFFER) { + ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used + << " WRBUFFER, trigger to flush dirty buffer" << dendl; + + /* trigger to flush the buffer */ + _flush(in, new C_Client_FlushComplete(this, in)); + } else { + capsnap.dirty_data = 0; + flush_snaps(in); + } +} + +void Client::send_flush_snap(Inode *in, MetaSession *session, + snapid_t follows, CapSnap& capsnap) +{ + auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP, + in->ino, in->snaprealm->ino, 0, + in->auth_cap->mseq, cap_epoch_barrier); + m->caller_uid = capsnap.cap_dirtier_uid; + m->caller_gid = capsnap.cap_dirtier_gid; + + m->set_client_tid(capsnap.flush_tid); + m->head.snap_follows = follows; + + m->head.caps = capsnap.issued; + m->head.dirty = capsnap.dirty; + + m->head.uid = capsnap.uid; + m->head.gid = capsnap.gid; + m->head.mode = capsnap.mode; + m->btime = capsnap.btime; + + m->size = capsnap.size; + + m->head.xattr_version = capsnap.xattr_version; + encode(capsnap.xattrs, m->xattrbl); + + m->ctime = capsnap.ctime; + m->btime = capsnap.btime; + m->mtime = capsnap.mtime; + m->atime = capsnap.atime; + m->time_warp_seq = capsnap.time_warp_seq; + m->change_attr = capsnap.change_attr; + + if (capsnap.dirty & CEPH_CAP_FILE_WR) { + m->inline_version = in->inline_version; + m->inline_data = in->inline_data; + } + + ceph_assert(!session->flushing_caps_tids.empty()); + m->set_oldest_flush_tid(*session->flushing_caps_tids.begin()); + + session->con->send_message2(std::move(m)); +} + +void Client::flush_snaps(Inode *in) +{ + ldout(cct, 10) << "flush_snaps on " << *in << dendl; + ceph_assert(in->cap_snaps.size()); + + // pick auth mds + ceph_assert(in->auth_cap); + MetaSession *session = in->auth_cap->session; + + for (auto &p : in->cap_snaps) { + CapSnap &capsnap = p.second; + // only do new flush + if (capsnap.flush_tid > 0) + continue; + + ldout(cct, 10) << "flush_snaps mds." << session->mds_num + << " follows " << p.first + << " size " << capsnap.size + << " mtime " << capsnap.mtime + << " dirty_data=" << capsnap.dirty_data + << " writing=" << capsnap.writing + << " on " << *in << dendl; + if (capsnap.dirty_data || capsnap.writing) + break; + + capsnap.flush_tid = ++last_flush_tid; + session->flushing_caps_tids.insert(capsnap.flush_tid); + in->flushing_cap_tids[capsnap.flush_tid] = 0; + if (!in->flushing_cap_item.is_on_list()) + session->flushing_caps.push_back(&in->flushing_cap_item); + + send_flush_snap(in, session, p.first, capsnap); + } +} + +void Client::wait_on_list(list<ceph::condition_variable*>& ls) +{ + ceph::condition_variable cond; + ls.push_back(&cond); + std::unique_lock l{client_lock, std::adopt_lock}; + cond.wait(l); + l.release(); + ls.remove(&cond); +} + +void Client::signal_cond_list(list<ceph::condition_variable*>& ls) +{ + for (auto cond : ls) { + cond->notify_all(); + } +} + +void Client::wait_on_context_list(list<Context*>& ls) +{ + ceph::condition_variable cond; + bool done = false; + int r; + ls.push_back(new C_Cond(cond, &done, &r)); + std::unique_lock l{client_lock, std::adopt_lock}; + cond.wait(l, [&done] { return done;}); + l.release(); +} + +void Client::signal_context_list(list<Context*>& ls) +{ + while (!ls.empty()) { + ls.front()->complete(0); + ls.pop_front(); + } +} + +void Client::wake_up_session_caps(MetaSession *s, bool reconnect) +{ + for (const auto &cap : s->caps) { + auto &in = cap->inode; + if (reconnect) { + in.requested_max_size = 0; + in.wanted_max_size = 0; + } else { + if (cap->gen < s->cap_gen) { + // mds did not re-issue stale cap. + cap->issued = cap->implemented = CEPH_CAP_PIN; + // make sure mds knows what we want. + if (in.caps_file_wanted() & ~cap->wanted) + in.flags |= I_CAP_DROPPED; + } + } + signal_cond_list(in.waitfor_caps); + } +} + + +// flush dirty data (from objectcache) + +class C_Client_CacheInvalidate : public Context { +private: + Client *client; + vinodeno_t ino; + int64_t offset, length; +public: + C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) : + client(c), offset(off), length(len) { + if (client->use_faked_inos()) + ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP); + else + ino = in->vino(); + } + void finish(int r) override { + // _async_invalidate takes the lock when it needs to, call this back from outside of lock. + ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock)); + client->_async_invalidate(ino, offset, length); + } +}; + +void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl; + ino_invalidate_cb(callback_handle, ino, off, len); +} + +void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) { + + if (ino_invalidate_cb) + // we queue the invalidate, which calls the callback and decrements the ref + async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len)); +} + +void Client::_invalidate_inode_cache(Inode *in) +{ + ldout(cct, 10) << __func__ << " " << *in << dendl; + + // invalidate our userspace inode cache + if (cct->_conf->client_oc) { + objectcacher->release_set(&in->oset); + if (!objectcacher->set_is_empty(&in->oset)) + lderr(cct) << "failed to invalidate cache for " << *in << dendl; + } + + _schedule_invalidate_callback(in, 0, 0); +} + +void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len) +{ + ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl; + + // invalidate our userspace inode cache + if (cct->_conf->client_oc) { + vector<ObjectExtent> ls; + Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls); + objectcacher->discard_writeback(&in->oset, ls, nullptr); + } + + _schedule_invalidate_callback(in, off, len); +} + +bool Client::_release(Inode *in) +{ + ldout(cct, 20) << "_release " << *in << dendl; + if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) { + _invalidate_inode_cache(in); + return true; + } + return false; +} + +bool Client::_flush(Inode *in, Context *onfinish) +{ + ldout(cct, 10) << "_flush " << *in << dendl; + + if (!in->oset.dirty_or_tx) { + ldout(cct, 10) << " nothing to flush" << dendl; + onfinish->complete(0); + return true; + } + + if (objecter->osdmap_pool_full(in->layout.pool_id)) { + ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl; + objectcacher->purge_set(&in->oset); + if (onfinish) { + onfinish->complete(-CEPHFS_ENOSPC); + } + return true; + } + + return objectcacher->flush_set(&in->oset, onfinish); +} + +void Client::_flush_range(Inode *in, int64_t offset, uint64_t size) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + if (!in->oset.dirty_or_tx) { + ldout(cct, 10) << " nothing to flush" << dendl; + return; + } + + C_SaferCond onflush("Client::_flush_range flock"); + bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(), + offset, size, &onflush); + if (!ret) { + // wait for flush + client_lock.unlock(); + onflush.wait(); + client_lock.lock(); + } +} + +void Client::flush_set_callback(ObjectCacher::ObjectSet *oset) +{ + // std::scoped_lock l(client_lock); + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ... + Inode *in = static_cast<Inode *>(oset->parent); + ceph_assert(in); + _flushed(in); +} + +void Client::_flushed(Inode *in) +{ + ldout(cct, 10) << "_flushed " << *in << dendl; + + put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER); +} + + + +// checks common to add_update_cap, handle_cap_grant +void Client::check_cap_issue(Inode *in, unsigned issued) +{ + unsigned had = in->caps_issued(); + + if ((issued & CEPH_CAP_FILE_CACHE) && + !(had & CEPH_CAP_FILE_CACHE)) + in->cache_gen++; + + if ((issued & CEPH_CAP_FILE_SHARED) != + (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + in->shared_gen++; + if (in->is_dir()) + clear_dir_complete_and_ordered(in, true); + } +} + +void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id, + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, + inodeno_t realm, int flags, const UserPerm& cap_perms) +{ + if (!in->is_any_caps()) { + ceph_assert(in->snaprealm == 0); + in->snaprealm = get_snap_realm(realm); + in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item); + ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl; + } else { + ceph_assert(in->snaprealm); + if ((flags & CEPH_CAP_FLAG_AUTH) && + realm != inodeno_t(-1) && in->snaprealm->ino != realm) { + in->snaprealm_item.remove_myself(); + auto oldrealm = in->snaprealm; + in->snaprealm = get_snap_realm(realm); + in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item); + put_snap_realm(oldrealm); + } + } + + mds_rank_t mds = mds_session->mds_num; + const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session)); + Cap &cap = capem.first->second; + if (!capem.second) { + if (cap.gen < mds_session->cap_gen) + cap.issued = cap.implemented = CEPH_CAP_PIN; + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap.seq) <= 0) { + if (&cap != in->auth_cap) + ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl; + + ceph_assert(cap.cap_id == cap_id); + seq = cap.seq; + mseq = cap.mseq; + issued |= cap.issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } else { + inc_pinned_icaps(); + } + + check_cap_issue(in, issued); + + if (flags & CEPH_CAP_FLAG_AUTH) { + if (in->auth_cap != &cap && + (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) { + if (in->auth_cap) { + if (in->flushing_cap_item.is_on_list()) { + ldout(cct, 10) << __func__ << " changing auth cap: " + << "add myself to new auth MDS' flushing caps list" << dendl; + adjust_session_flushing_caps(in, in->auth_cap->session, mds_session); + } + if (in->dirty_cap_item.is_on_list()) { + ldout(cct, 10) << __func__ << " changing auth cap: " + << "add myself to new auth MDS' dirty caps list" << dendl; + mds_session->get_dirty_list().push_back(&in->dirty_cap_item); + } + } + + in->auth_cap = ∩ + } + } + + unsigned old_caps = cap.issued; + cap.cap_id = cap_id; + cap.issued = issued; + cap.implemented |= issued; + if (ceph_seq_cmp(mseq, cap.mseq) > 0) + cap.wanted = wanted; + else + cap.wanted |= wanted; + cap.seq = seq; + cap.issue_seq = seq; + cap.mseq = mseq; + cap.gen = mds_session->cap_gen; + cap.latest_perms = cap_perms; + ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued) + << " from mds." << mds + << " on " << *in + << dendl; + + if ((issued & ~old_caps) && in->auth_cap == &cap) { + // non-auth MDS is revoking the newly grant caps ? + for (auto &p : in->caps) { + if (&p.second == &cap) + continue; + if (p.second.implemented & ~p.second.issued & issued) { + check_caps(in, CHECK_CAPS_NODELAY); + break; + } + } + } + + if (issued & ~old_caps) + signal_cond_list(in->waitfor_caps); +} + +void Client::remove_cap(Cap *cap, bool queue_release) +{ + auto &in = cap->inode; + MetaSession *session = cap->session; + mds_rank_t mds = cap->session->mds_num; + + ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl; + + if (queue_release) { + session->enqueue_cap_release( + in.ino, + cap->cap_id, + cap->issue_seq, + cap->mseq, + cap_epoch_barrier); + } else { + dec_pinned_icaps(); + } + + + if (in.auth_cap == cap) { + if (in.flushing_cap_item.is_on_list()) { + ldout(cct, 10) << " removing myself from flushing_cap list" << dendl; + in.flushing_cap_item.remove_myself(); + } + in.auth_cap = NULL; + } + size_t n = in.caps.erase(mds); + ceph_assert(n == 1); + cap = nullptr; + + if (!in.is_any_caps()) { + ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl; + in.snaprealm_item.remove_myself(); + put_snap_realm(in.snaprealm); + in.snaprealm = 0; + } +} + +void Client::remove_all_caps(Inode *in) +{ + while (!in->caps.empty()) + remove_cap(&in->caps.begin()->second, true); +} + +void Client::remove_session_caps(MetaSession *s, int err) +{ + ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl; + + while (s->caps.size()) { + Cap *cap = *s->caps.begin(); + InodeRef in(&cap->inode); + bool dirty_caps = false; + if (in->auth_cap == cap) { + dirty_caps = in->dirty_caps | in->flushing_caps; + in->wanted_max_size = 0; + in->requested_max_size = 0; + if (in->has_any_filelocks()) + in->flags |= I_ERROR_FILELOCK; + } + auto caps = cap->implemented; + if (cap->wanted | cap->issued) + in->flags |= I_CAP_DROPPED; + remove_cap(cap, false); + in->cap_snaps.clear(); + if (dirty_caps) { + lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl; + if (in->flushing_caps) { + num_flushing_caps--; + in->flushing_cap_tids.clear(); + } + in->flushing_caps = 0; + in->mark_caps_clean(); + put_inode(in.get()); + } + caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER; + if (caps && !in->caps_issued_mask(caps, true)) { + if (err == -CEPHFS_EBLOCKLISTED) { + if (in->oset.dirty_or_tx) { + lderr(cct) << __func__ << " still has dirty data on " << *in << dendl; + in->set_async_err(err); + } + objectcacher->purge_set(&in->oset); + } else { + objectcacher->release_set(&in->oset); + } + _schedule_invalidate_callback(in.get(), 0, 0); + } + + signal_cond_list(in->waitfor_caps); + } + s->flushing_caps_tids.clear(); + sync_cond.notify_all(); +} + +std::pair<int, bool> Client::_do_remount(bool retry_on_error) +{ + uint64_t max_retries = cct->_conf.get_val<uint64_t>("client_max_retries_on_remount_failure"); + bool abort_on_failure = false; + + errno = 0; + int r = remount_cb(callback_handle); + if (r == 0) { + retries_on_invalidate = 0; + } else { + int e = errno; + client_t whoami = get_nodeid(); + if (r == -1) { + lderr(cct) << + "failed to remount (to trim kernel dentries): " + "errno = " << e << " (" << strerror(e) << ")" << dendl; + } else { + lderr(cct) << + "failed to remount (to trim kernel dentries): " + "return code = " << r << dendl; + } + bool should_abort = + (cct->_conf.get_val<bool>("client_die_on_failed_remount") || + cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) && + !(retry_on_error && (++retries_on_invalidate < max_retries)); + if (should_abort && !is_unmounting()) { + lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl; + abort_on_failure = true; + } + } + return std::make_pair(r, abort_on_failure); +} + +class C_Client_Remount : public Context { +private: + Client *client; +public: + explicit C_Client_Remount(Client *c) : client(c) {} + void finish(int r) override { + ceph_assert(r == 0); + auto result = client->_do_remount(true); + if (result.second) { + ceph_abort(); + } + } +}; + +void Client::_invalidate_kernel_dcache() +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + if (can_invalidate_dentries) { + if (dentry_invalidate_cb && root->dir) { + for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin(); + p != root->dir->dentries.end(); + ++p) { + if (p->second->inode) + _schedule_invalidate_dentry_callback(p->second, false); + } + } + } else if (remount_cb) { + // Hacky: + // when remounting a file system, linux kernel trims all unused dentries in the fs + remount_finisher.queue(new C_Client_Remount(this)); + } +} + +void Client::_trim_negative_child_dentries(InodeRef& in) +{ + if (!in->is_dir()) + return; + + Dir* dir = in->dir; + if (dir && dir->dentries.size() == dir->num_null_dentries) { + for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) { + Dentry *dn = p->second; + ++p; + ceph_assert(!dn->inode); + if (dn->lru_is_expireable()) + unlink(dn, true, false); // keep dir, drop dentry + } + if (dir->dentries.empty()) { + close_dir(dir); + } + } + + if (in->flags & I_SNAPDIR_OPEN) { + InodeRef snapdir = open_snapdir(in.get()); + _trim_negative_child_dentries(snapdir); + } +} + +class C_Client_CacheRelease : public Context { +private: + Client *client; + vinodeno_t ino; +public: + C_Client_CacheRelease(Client *c, Inode *in) : + client(c) { + if (client->use_faked_inos()) + ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP); + else + ino = in->vino(); + } + void finish(int r) override { + ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock)); + client->_async_inode_release(ino); + } +}; + +void Client::_async_inode_release(vinodeno_t ino) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + ldout(cct, 10) << __func__ << " " << ino << dendl; + ino_release_cb(callback_handle, ino); +} + +void Client::_schedule_ino_release_callback(Inode *in) { + + if (ino_release_cb) + // we queue the invalidate, which calls the callback and decrements the ref + async_ino_releasor.queue(new C_Client_CacheRelease(this, in)); +} + +void Client::trim_caps(MetaSession *s, uint64_t max) +{ + mds_rank_t mds = s->mds_num; + size_t caps_size = s->caps.size(); + ldout(cct, 10) << __func__ << " mds." << mds << " max " << max + << " caps " << caps_size << dendl; + + uint64_t trimmed = 0; + auto p = s->caps.begin(); + std::set<Dentry *> to_trim; /* this avoids caps other than the one we're + * looking at from getting deleted during traversal. */ + while ((caps_size - trimmed) > max && !p.end()) { + Cap *cap = *p; + InodeRef in(&cap->inode); + + // Increment p early because it will be invalidated if cap + // is deleted inside remove_cap + ++p; + + if (in->caps.size() > 1 && cap != in->auth_cap) { + int mine = cap->issued | cap->implemented; + int oissued = in->auth_cap ? in->auth_cap->issued : 0; + // disposable non-auth cap + if (!(get_caps_used(in.get()) & ~oissued & mine)) { + ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl; + cap = (remove_cap(cap, true), nullptr); + trimmed++; + } + } else { + ldout(cct, 20) << " trying to trim dentries for " << *in << dendl; + _trim_negative_child_dentries(in); + bool all = true; + auto q = in->dentries.begin(); + while (q != in->dentries.end()) { + Dentry *dn = *q; + ++q; + if (dn->lru_is_expireable()) { + if (can_invalidate_dentries && + dn->dir->parent_inode->ino == CEPH_INO_ROOT) { + // Only issue one of these per DN for inodes in root: handle + // others more efficiently by calling for root-child DNs at + // the end of this function. + _schedule_invalidate_dentry_callback(dn, true); + } + ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl; + to_trim.insert(dn); + } else { + ldout(cct, 20) << " not expirable: " << dn->name << dendl; + all = false; + } + } + if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) { + _schedule_ino_release_callback(in.get()); + } + if (all && in->ino != CEPH_INO_ROOT) { + ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl; + trimmed++; + } + } + } + ldout(cct, 20) << " trimming queued dentries: " << dendl; + for (const auto &dn : to_trim) { + trim_dentry(dn); + } + to_trim.clear(); + + caps_size = s->caps.size(); + if (caps_size > (size_t)max) + _invalidate_kernel_dcache(); +} + +void Client::force_session_readonly(MetaSession *s) +{ + s->readonly = true; + for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) { + auto &in = (*p)->inode; + if (in.caps_wanted() & CEPH_CAP_FILE_WR) + signal_cond_list(in.waitfor_caps); + } +} + +int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid) +{ + MetaSession *session = in->auth_cap->session; + + int flushing = in->dirty_caps; + ceph_assert(flushing); + + ceph_tid_t flush_tid = ++last_flush_tid; + in->flushing_cap_tids[flush_tid] = flushing; + + if (!in->flushing_caps) { + ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl; + num_flushing_caps++; + } else { + ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl; + } + + in->flushing_caps |= flushing; + in->mark_caps_clean(); + + if (!in->flushing_cap_item.is_on_list()) + session->flushing_caps.push_back(&in->flushing_cap_item); + session->flushing_caps_tids.insert(flush_tid); + + *ptid = flush_tid; + return flushing; +} + +void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s) +{ + for (auto &p : in->cap_snaps) { + CapSnap &capsnap = p.second; + if (capsnap.flush_tid > 0) { + old_s->flushing_caps_tids.erase(capsnap.flush_tid); + new_s->flushing_caps_tids.insert(capsnap.flush_tid); + } + } + for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin(); + it != in->flushing_cap_tids.end(); + ++it) { + old_s->flushing_caps_tids.erase(it->first); + new_s->flushing_caps_tids.insert(it->first); + } + new_s->flushing_caps.push_back(&in->flushing_cap_item); +} + +/* + * Flush all the dirty caps back to the MDS. Because the callers + * generally wait on the result of this function (syncfs and umount + * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call. + */ +void Client::flush_caps_sync() +{ + ldout(cct, 10) << __func__ << dendl; + for (auto &q : mds_sessions) { + auto s = q.second; + xlist<Inode*>::iterator p = s->dirty_list.begin(); + while (!p.end()) { + unsigned flags = CHECK_CAPS_NODELAY; + Inode *in = *p; + + ++p; + if (p.end()) + flags |= CHECK_CAPS_SYNCHRONOUS; + check_caps(in, flags); + } + } +} + +void Client::wait_sync_caps(Inode *in, ceph_tid_t want) +{ + while (in->flushing_caps) { + map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin(); + ceph_assert(it != in->flushing_cap_tids.end()); + if (it->first > want) + break; + ldout(cct, 10) << __func__ << " on " << *in << " flushing " + << ccap_string(it->second) << " want " << want + << " last " << it->first << dendl; + wait_on_list(in->waitfor_caps); + } +} + +void Client::wait_sync_caps(ceph_tid_t want) +{ + retry: + ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", " + << num_flushing_caps << " total flushing)" << dendl; + for (auto &p : mds_sessions) { + auto s = p.second; + if (s->flushing_caps_tids.empty()) + continue; + ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin(); + if (oldest_tid <= want) { + ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid + << " (want " << want << ")" << dendl; + std::unique_lock l{client_lock, std::adopt_lock}; + sync_cond.wait(l); + l.release(); + goto retry; + } + } +} + +void Client::kick_flushing_caps(Inode *in, MetaSession *session) +{ + in->flags &= ~I_KICK_FLUSH; + + Cap *cap = in->auth_cap; + ceph_assert(cap->session == session); + + ceph_tid_t last_snap_flush = 0; + for (auto p = in->flushing_cap_tids.rbegin(); + p != in->flushing_cap_tids.rend(); + ++p) { + if (!p->second) { + last_snap_flush = p->first; + break; + } + } + + int wanted = in->caps_wanted(); + int used = get_caps_used(in) | in->caps_dirty(); + auto it = in->cap_snaps.begin(); + for (auto& p : in->flushing_cap_tids) { + if (p.second) { + int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0; + send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented), + p.second, p.first); + } else { + ceph_assert(it != in->cap_snaps.end()); + ceph_assert(it->second.flush_tid == p.first); + send_flush_snap(in, session, it->first, it->second); + ++it; + } + } +} + +void Client::kick_flushing_caps(MetaSession *session) +{ + mds_rank_t mds = session->mds_num; + ldout(cct, 10) << __func__ << " mds." << mds << dendl; + + for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) { + Inode *in = *p; + if (in->flags & I_KICK_FLUSH) { + ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl; + kick_flushing_caps(in, session); + } + } +} + +void Client::early_kick_flushing_caps(MetaSession *session) +{ + for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) { + Inode *in = *p; + Cap *cap = in->auth_cap; + ceph_assert(cap); + + // if flushing caps were revoked, we re-send the cap flush in client reconnect + // stage. This guarantees that MDS processes the cap flush message before issuing + // the flushing caps to other client. + if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) { + in->flags |= I_KICK_FLUSH; + continue; + } + + ldout(cct, 20) << " reflushing caps (early_kick) on " << *in + << " to mds." << session->mds_num << dendl; + // send_reconnect() also will reset these sequence numbers. make sure + // sequence numbers in cap flush message match later reconnect message. + cap->seq = 0; + cap->issue_seq = 0; + cap->mseq = 0; + cap->issued = cap->implemented; + + kick_flushing_caps(in, session); + } +} + +void Client::invalidate_snaprealm_and_children(SnapRealm *realm) +{ + list<SnapRealm*> q; + q.push_back(realm); + + while (!q.empty()) { + realm = q.front(); + q.pop_front(); + + ldout(cct, 10) << __func__ << " " << *realm << dendl; + realm->invalidate_cache(); + + for (set<SnapRealm*>::iterator p = realm->pchildren.begin(); + p != realm->pchildren.end(); + ++p) + q.push_back(*p); + } +} + +SnapRealm *Client::get_snap_realm(inodeno_t r) +{ + SnapRealm *realm = snap_realms[r]; + + ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was " + << (realm ? realm->nref : 0) << dendl; + if (!realm) { + snap_realms[r] = realm = new SnapRealm(r); + + // Do not release the global snaprealm until unmounting. + if (r == CEPH_INO_GLOBAL_SNAPREALM) + realm->nref++; + } + + realm->nref++; + ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is " + << realm->nref << dendl; + return realm; +} + +SnapRealm *Client::get_snap_realm_maybe(inodeno_t r) +{ + if (snap_realms.count(r) == 0) { + ldout(cct, 20) << __func__ << " " << r << " fail" << dendl; + return NULL; + } + SnapRealm *realm = snap_realms[r]; + ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl; + realm->nref++; + return realm; +} + +void Client::put_snap_realm(SnapRealm *realm) +{ + ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm + << " " << realm->nref << " -> " << (realm->nref - 1) << dendl; + if (--realm->nref == 0) { + snap_realms.erase(realm->ino); + if (realm->pparent) { + realm->pparent->pchildren.erase(realm); + put_snap_realm(realm->pparent); + } + delete realm; + } +} + +bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent) +{ + if (realm->parent != parent) { + ldout(cct, 10) << __func__ << " " << *realm + << " " << realm->parent << " -> " << parent << dendl; + realm->parent = parent; + if (realm->pparent) { + realm->pparent->pchildren.erase(realm); + put_snap_realm(realm->pparent); + } + realm->pparent = get_snap_realm(parent); + realm->pparent->pchildren.insert(realm); + return true; + } + return false; +} + +static bool has_new_snaps(const SnapContext& old_snapc, + const SnapContext& new_snapc) +{ + return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq; +} + +struct SnapRealmInfoMeta { + SnapRealmInfoMeta(utime_t last_modified, uint64_t change_attr) + : last_modified(last_modified), + change_attr(change_attr) { + } + + utime_t last_modified; + uint64_t change_attr; +}; + +static std::pair<SnapRealmInfo, std::optional<SnapRealmInfoMeta>> get_snap_realm_info( + MetaSession *session, bufferlist::const_iterator &p) { + if (session->mds_features.test(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) { + SnapRealmInfoNew ninfo; + decode(ninfo, p); + return std::make_pair(ninfo.info, SnapRealmInfoMeta(ninfo.last_modified, ninfo.change_attr)); + } else { + SnapRealmInfo info; + decode(info, p); + return std::make_pair(info, std::nullopt); + } +} + + +void Client::update_snap_trace(MetaSession *session, const bufferlist& bl, SnapRealm **realm_ret, bool flush) +{ + SnapRealm *first_realm = NULL; + ldout(cct, 10) << __func__ << " len " << bl.length() << dendl; + + map<SnapRealm*, SnapContext> dirty_realms; + + auto p = bl.cbegin(); + while (!p.end()) { + auto [info, realm_info_meta] = get_snap_realm_info(session, p); + SnapRealm *realm = get_snap_realm(info.ino()); + + bool invalidate = false; + + if (info.seq() > realm->seq || + (realm_info_meta && (*realm_info_meta).change_attr > realm->change_attr)) { + ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq + << dendl; + + if (flush) { + // writeback any dirty caps _before_ updating snap list (i.e. with old snap info) + // flush me + children + list<SnapRealm*> q; + q.push_back(realm); + while (!q.empty()) { + SnapRealm *realm = q.front(); + q.pop_front(); + + for (set<SnapRealm*>::iterator p = realm->pchildren.begin(); + p != realm->pchildren.end(); + ++p) + q.push_back(*p); + + if (dirty_realms.count(realm) == 0) { + realm->nref++; + dirty_realms[realm] = realm->get_snap_context(); + } + } + } + + // update + realm->seq = info.seq(); + realm->created = info.created(); + realm->parent_since = info.parent_since(); + realm->prior_parent_snaps = info.prior_parent_snaps; + if (realm_info_meta) { + realm->last_modified = (*realm_info_meta).last_modified; + realm->change_attr = (*realm_info_meta).change_attr; + } + realm->my_snaps = info.my_snaps; + invalidate = true; + } + + // _always_ verify parent + if (adjust_realm_parent(realm, info.parent())) + invalidate = true; + + if (invalidate) { + invalidate_snaprealm_and_children(realm); + ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl; + ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl; + } else { + ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() + << " <= " << realm->seq << " and same parent, SKIPPING" << dendl; + } + + if (!first_realm) + first_realm = realm; + else + put_snap_realm(realm); + } + + for (auto &[realm, snapc] : dirty_realms) { + // if there are new snaps ? + if (has_new_snaps(snapc, realm->get_snap_context())) { + ldout(cct, 10) << " flushing caps on " << *realm << dendl; + for (auto&& in : realm->inodes_with_caps) { + queue_cap_snap(in, snapc); + } + } else { + ldout(cct, 10) << " no new snap on " << *realm << dendl; + } + put_snap_realm(realm); + } + + if (realm_ret) + *realm_ret = first_realm; + else + put_snap_realm(first_realm); +} + +void Client::handle_snap(const MConstRef<MClientSnap>& m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + mds_rank_t mds = mds_rank_t(m->get_source().num()); + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + return; + } + + got_mds_push(session.get()); + + map<Inode*, SnapContext> to_move; + SnapRealm *realm = 0; + + if (m->head.op == CEPH_SNAP_OP_SPLIT) { + ceph_assert(m->head.split); + auto p = m->bl.cbegin(); + auto [info, _] = get_snap_realm_info(session.get(), p); + ceph_assert(info.ino() == m->head.split); + + // flush, then move, ino's. + realm = get_snap_realm(info.ino()); + ldout(cct, 10) << " splitting off " << *realm << dendl; + for (auto& ino : m->split_inos) { + vinodeno_t vino(ino, CEPH_NOSNAP); + if (inode_map.count(vino)) { + Inode *in = inode_map[vino]; + if (!in->snaprealm || in->snaprealm == realm) + continue; + if (in->snaprealm->created > info.created()) { + ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm " + << *in->snaprealm << dendl; + continue; + } + ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl; + + + in->snaprealm_item.remove_myself(); + to_move[in] = in->snaprealm->get_snap_context(); + put_snap_realm(in->snaprealm); + } + } + + // move child snaprealms, too + for (auto& child_realm : m->split_realms) { + ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl; + SnapRealm *child = get_snap_realm_maybe(child_realm); + if (!child) + continue; + adjust_realm_parent(child, realm->ino); + put_snap_realm(child); + } + } + + update_snap_trace(session.get(), m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY); + + if (realm) { + for (auto p = to_move.begin(); p != to_move.end(); ++p) { + Inode *in = p->first; + in->snaprealm = realm; + realm->inodes_with_caps.push_back(&in->snaprealm_item); + realm->nref++; + // queue for snap writeback + if (has_new_snaps(p->second, realm->get_snap_context())) + queue_cap_snap(in, p->second); + } + put_snap_realm(realm); + } +} + +void Client::handle_quota(const MConstRef<MClientQuota>& m) +{ + mds_rank_t mds = mds_rank_t(m->get_source().num()); + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + return; + } + + got_mds_push(session.get()); + + ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl; + + vinodeno_t vino(m->ino, CEPH_NOSNAP); + if (inode_map.count(vino)) { + Inode *in = NULL; + in = inode_map[vino]; + + if (in) { + in->quota = m->quota; + in->rstat = m->rstat; + } + } +} + +void Client::handle_caps(const MConstRef<MClientCaps>& m) +{ + mds_rank_t mds = mds_rank_t(m->get_source().num()); + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + return; + } + + if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) { + // Pause RADOS operations until we see the required epoch + objecter->set_epoch_barrier(m->osd_epoch_barrier); + } + + if (m->osd_epoch_barrier > cap_epoch_barrier) { + // Record the barrier so that we will transmit it to MDS when releasing + set_cap_epoch_barrier(m->osd_epoch_barrier); + } + + got_mds_push(session.get()); + + bool do_cap_release = false; + Inode *in; + vinodeno_t vino(m->get_ino(), CEPH_NOSNAP); + if (auto it = inode_map.find(vino); it != inode_map.end()) { + in = it->second; + + /* MDS maybe waiting for cap release with increased seq */ + switch (m->get_op()) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + if (!in->caps.count(mds)) { + do_cap_release = true; + ldout(cct, 5) << __func__ << " vino " << vino << " don't have cap " + << m->get_cap_id() << " op " << m->get_op() + << ", immediately releasing" << dendl; + } + } + } else { + /* MDS maybe waiting for cap release with increased seq */ + switch (m->get_op()) { + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + ldout(cct, 5) << __func__ << " don't have vino " << vino << " op " + << m->get_op() << ", immediately releasing" << dendl; + break; + default: + ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl; + return; + } + } + + // In case the mds is waiting on e.g. a revocation + if (do_cap_release) { + session->enqueue_cap_release( + m->get_ino(), + m->get_cap_id(), + m->get_seq(), + m->get_mseq(), + cap_epoch_barrier); + + flush_cap_releases(); + return; + } + + switch (m->get_op()) { + case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m); + case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m); + case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m); + } + + if (auto it = in->caps.find(mds); it != in->caps.end()) { + Cap &cap = in->caps.at(mds); + + switch (m->get_op()) { + case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m); + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m); + case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m); + } + } else { + ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl; + return; + } +} + +void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m) +{ + mds_rank_t mds = session->mds_num; + + ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq() + << " IMPORT from mds." << mds << dendl; + + const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); + Cap *cap = NULL; + UserPerm cap_perms; + if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) { + cap = &it->second; + cap_perms = cap->latest_perms; + } + + // add/update it + SnapRealm *realm = NULL; + update_snap_trace(session, m->snapbl, &realm); + + int issued = m->get_caps(); + int wanted = m->get_wanted(); + add_update_cap(in, session, m->get_cap_id(), + issued, wanted, m->get_seq(), m->get_mseq(), + m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms); + + if (cap && cap->cap_id == m->peer.cap_id) { + remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE)); + } + + if (realm) + put_snap_realm(realm); + + if (in->auth_cap && in->auth_cap->session == session) { + if (!(wanted & CEPH_CAP_ANY_FILE_WR) || + in->requested_max_size > m->get_max_size()) { + in->requested_max_size = 0; + ldout(cct, 15) << "reset requested_max_size after cap import" << dendl; + } + // reflush any/all caps (if we are now the auth_cap) + kick_flushing_caps(in, session); + } +} + +void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m) +{ + mds_rank_t mds = session->mds_num; + + ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq() + << " EXPORT from mds." << mds << dendl; + + auto it = in->caps.find(mds); + if (it != in->caps.end()) { + Cap &cap = it->second; + if (cap.cap_id == m->get_cap_id()) { + if (m->peer.cap_id) { + const auto peer_mds = mds_rank_t(m->peer.mds); + auto tsession = _get_or_open_mds_session(peer_mds); + auto it = in->caps.find(peer_mds); + if (it != in->caps.end()) { + Cap &tcap = it->second; + if (tcap.cap_id == m->peer.cap_id && + ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) { + tcap.cap_id = m->peer.cap_id; + tcap.seq = m->peer.seq - 1; + tcap.issue_seq = tcap.seq; + tcap.issued |= cap.issued; + tcap.implemented |= cap.issued; + if (&cap == in->auth_cap) + in->auth_cap = &tcap; + if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list()) + adjust_session_flushing_caps(in, session, tsession.get()); + } + } else { + add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0, + m->peer.seq - 1, m->peer.mseq, (uint64_t)-1, + &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0, + cap.latest_perms); + } + } else { + if (cap.wanted | cap.issued) + in->flags |= I_CAP_DROPPED; + } + + remove_cap(&cap, false); + } + } +} + +void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m) +{ + mds_rank_t mds = session->mds_num; + ceph_assert(in->caps.count(mds)); + + uint64_t size = m->get_size(); + if (in->is_fscrypt_enabled()) { + size = std::stoll(std::string(std::rbegin(m->fscrypt_file), + std::rend(m->fscrypt_file))); + } + ldout(cct, 10) << __func__ << " on ino " << *in + << " size " << in->size << " -> " << m->get_size() + << dendl; + + int issued; + in->caps_issued(&issued); + issued |= in->caps_dirty(); + update_inode_file_size(in, issued, size, m->get_truncate_seq(), + m->get_truncate_size()); +} + +void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m) +{ + ceph_tid_t flush_ack_tid = m->get_client_tid(); + int dirty = m->get_dirty(); + int cleaned = 0; + int flushed = 0; + + auto it = in->flushing_cap_tids.begin(); + if (it->first < flush_ack_tid) { + ldout(cct, 0) << __func__ << " mds." << session->mds_num + << " got unexpected flush ack tid " << flush_ack_tid + << " expected is " << it->first << dendl; + } + for (; it != in->flushing_cap_tids.end(); ) { + if (!it->second) { + // cap snap + ++it; + continue; + } + if (it->first == flush_ack_tid) + cleaned = it->second; + if (it->first <= flush_ack_tid) { + session->flushing_caps_tids.erase(it->first); + in->flushing_cap_tids.erase(it++); + ++flushed; + continue; + } + cleaned &= ~it->second; + if (!cleaned) + break; + ++it; + } + + ldout(cct, 5) << __func__ << " mds." << session->mds_num + << " cleaned " << ccap_string(cleaned) << " on " << *in + << " with " << ccap_string(dirty) << dendl; + + if (flushed) { + signal_cond_list(in->waitfor_caps); + if (session->flushing_caps_tids.empty() || + *session->flushing_caps_tids.begin() > flush_ack_tid) + sync_cond.notify_all(); + } + + if (!dirty) { + in->cap_dirtier_uid = -1; + in->cap_dirtier_gid = -1; + } + + if (!cleaned) { + ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl; + } else { + if (in->flushing_caps) { + ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps) + << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl; + in->flushing_caps &= ~cleaned; + if (in->flushing_caps == 0) { + ldout(cct, 10) << " " << *in << " !flushing" << dendl; + num_flushing_caps--; + if (in->flushing_cap_tids.empty()) + in->flushing_cap_item.remove_myself(); + } + if (!in->caps_dirty()) + put_inode(in); + } + } +} + + +void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m) +{ + ceph_tid_t flush_ack_tid = m->get_client_tid(); + mds_rank_t mds = session->mds_num; + ceph_assert(in->caps.count(mds)); + snapid_t follows = m->get_snap_follows(); + + if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) { + auto& capsnap = it->second; + if (flush_ack_tid != capsnap.flush_tid) { + ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl; + } else { + InodeRef tmp_ref(in); + ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows + << " on " << *in << dendl; + session->flushing_caps_tids.erase(capsnap.flush_tid); + in->flushing_cap_tids.erase(capsnap.flush_tid); + if (in->flushing_caps == 0 && in->flushing_cap_tids.empty()) + in->flushing_cap_item.remove_myself(); + in->cap_snaps.erase(it); + + signal_cond_list(in->waitfor_caps); + if (session->flushing_caps_tids.empty() || + *session->flushing_caps_tids.begin() > flush_ack_tid) + sync_cond.notify_all(); + } + } else { + ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows + << " on " << *in << dendl; + // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back) + } +} + +class C_Client_DentryInvalidate : public Context { +private: + Client *client; + vinodeno_t dirino; + vinodeno_t ino; + string name; +public: + C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) : + client(c), name(dn->name) { + if (client->use_faked_inos()) { + dirino.ino = dn->dir->parent_inode->faked_ino; + if (del) + ino.ino = dn->inode->faked_ino; + } else { + dirino = dn->dir->parent_inode->vino(); + if (del) + ino = dn->inode->vino(); + } + if (!del) + ino.ino = inodeno_t(); + } + void finish(int r) override { + // _async_dentry_invalidate is responsible for its own locking + ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock)); + client->_async_dentry_invalidate(dirino, ino, name); + } +}; + +void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino + << " in dir " << dirino << dendl; + dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length()); +} + +void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del) +{ + if (dentry_invalidate_cb && dn->inode->ll_ref > 0) + async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del)); +} + +void Client::_try_to_trim_inode(Inode *in, bool sched_inval) +{ + int ref = in->get_nref(); + ldout(cct, 5) << __func__ << " in " << *in <<dendl; + + if (in->dir && !in->dir->dentries.empty()) { + for (auto p = in->dir->dentries.begin(); + p != in->dir->dentries.end(); ) { + Dentry *dn = p->second; + ++p; + /* rmsnap removes whole subtree, need trim inodes recursively. + * we don't need to invalidate dentries recursively. because + * invalidating a directory dentry effectively invalidate + * whole subtree */ + if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir()) + _try_to_trim_inode(dn->inode.get(), false); + + if (dn->lru_is_expireable()) + unlink(dn, true, false); // keep dir, drop dentry + } + if (in->dir->dentries.empty()) { + close_dir(in->dir); + --ref; + } + } + + if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) { + InodeRef snapdir = open_snapdir(in); + _try_to_trim_inode(snapdir.get(), false); + --ref; + } + + if (ref > 1) { + auto q = in->dentries.begin(); + while (q != in->dentries.end()) { + Dentry *dn = *q; + ++q; + if( in->ll_ref > 0 && sched_inval) { + // FIXME: we play lots of unlink/link tricks when handling MDS replies, + // so in->dentries doesn't always reflect the state of kernel's dcache. + _schedule_invalidate_dentry_callback(dn, true); + } + unlink(dn, true, true); + } + } +} + +void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m) +{ + mds_rank_t mds = session->mds_num; + int used = get_caps_used(in); + int wanted = in->caps_wanted(); + int flags = 0; + + const unsigned new_caps = m->get_caps(); + const bool was_stale = session->cap_gen > cap->gen; + ldout(cct, 5) << __func__ << " on in " << m->get_ino() + << " mds." << mds << " seq " << m->get_seq() + << " caps now " << ccap_string(new_caps) + << " was " << ccap_string(cap->issued) + << (was_stale ? " (stale)" : "") << dendl; + + if (was_stale) + cap->issued = cap->implemented = CEPH_CAP_PIN; + cap->seq = m->get_seq(); + cap->gen = session->cap_gen; + + check_cap_issue(in, new_caps); + + // update inode + int issued; + in->caps_issued(&issued); + issued |= in->caps_dirty(); + + if ((new_caps & CEPH_CAP_AUTH_SHARED) && + !(issued & CEPH_CAP_AUTH_EXCL)) { + in->mode = m->head.mode; + in->uid = m->head.uid; + in->gid = m->head.gid; + in->btime = m->btime; + } + bool deleted_inode = false; + if ((new_caps & CEPH_CAP_LINK_SHARED) && + !(issued & CEPH_CAP_LINK_EXCL)) { + in->nlink = m->head.nlink; + if (in->nlink == 0) + deleted_inode = true; + } + if (!(issued & CEPH_CAP_XATTR_EXCL) && + m->xattrbl.length() && + m->head.xattr_version > in->xattr_version) { + auto p = m->xattrbl.cbegin(); + decode(in->xattrs, p); + in->xattr_version = m->head.xattr_version; + } + + if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) { + in->dirstat.nfiles = m->get_nfiles(); + in->dirstat.nsubdirs = m->get_nsubdirs(); + } + + if (new_caps & CEPH_CAP_ANY_RD) { + update_inode_file_time(in, issued, m->get_time_warp_seq(), + m->get_ctime(), m->get_mtime(), m->get_atime()); + } + + if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + in->layout = m->get_layout(); + update_inode_file_size(in, issued, m->get_size(), + m->get_truncate_seq(), m->get_truncate_size()); + } + + if (m->inline_version > in->inline_version) { + in->inline_data = m->inline_data; + in->inline_version = m->inline_version; + } + + /* always take a newer change attr */ + if (m->get_change_attr() > in->change_attr) + in->change_attr = m->get_change_attr(); + + // max_size + if (cap == in->auth_cap && + (new_caps & CEPH_CAP_ANY_FILE_WR) && + (m->get_max_size() != in->max_size)) { + ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl; + in->max_size = m->get_max_size(); + if (in->max_size > in->wanted_max_size) { + in->wanted_max_size = 0; + in->requested_max_size = 0; + } + } + + bool check = false; + if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) && + (wanted & ~(cap->wanted | new_caps))) { + // If mds is importing cap, prior cap messages that update 'wanted' + // may get dropped by mds (migrate seq mismatch). + // + // We don't send cap message to update 'wanted' if what we want are + // already issued. If mds revokes caps, cap message that releases caps + // also tells mds what we want. But if caps got revoked by mds forcedly + // (session stale). We may haven't told mds what we want. + check = true; + } + + + // update caps + auto revoked = cap->issued & ~new_caps; + if (revoked) { + ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl; + cap->issued = new_caps; + cap->implemented |= new_caps; + + // recall delegations if we're losing caps necessary for them + if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD)) + in->recall_deleg(false); + else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR)) + in->recall_deleg(true); + + used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented); + if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) && + !_flush(in, new C_Client_FlushComplete(this, in))) { + // waitin' for flush + } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) { + if (_release(in)) { + check = true; + flags = CHECK_CAPS_NODELAY; + } + } else { + cap->wanted = 0; // don't let check_caps skip sending a response to MDS + check = true; + flags = CHECK_CAPS_NODELAY; + } + } else if (cap->issued == new_caps) { + ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl; + } else { + ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl; + cap->issued = new_caps; + cap->implemented |= new_caps; + + if (cap == in->auth_cap) { + // non-auth MDS is revoking the newly grant caps ? + for (const auto &p : in->caps) { + if (&p.second == cap) + continue; + if (p.second.implemented & ~p.second.issued & new_caps) { + check = true; + break; + } + } + } + } + + // just in case the caps was released just before we get the revoke msg + if (!check && m->get_op() == CEPH_CAP_OP_REVOKE) { + cap->wanted = 0; // don't let check_caps skip sending a response to MDS + check = true; + flags = CHECK_CAPS_NODELAY; + } + + if (check) + check_caps(in, flags); + + // wake up waiters + if (new_caps) + signal_cond_list(in->waitfor_caps); + + // may drop inode's last ref + if (deleted_inode) + _try_to_trim_inode(in, true); +} + +int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want) +{ + if (perms.uid() == 0) { + // For directories, DACs are overridable. + // For files, Read/write DACs are always overridable but executable DACs are + // overridable when there is at least one exec bit set + if(!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO)) + return -CEPHFS_EACCES; + return 0; + } + + if (perms.uid() != in->uid && (in->mode & S_IRWXG)) { + int ret = _posix_acl_permission(in, perms, want); + if (ret != -CEPHFS_EAGAIN) + return ret; + } + + // check permissions before doing anything else + if (!in->check_mode(perms, want)) + return -CEPHFS_EACCES; + return 0; +} + +int Client::xattr_permission(Inode *in, const char *name, unsigned want, + const UserPerm& perms) +{ + int r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + r = 0; + if (strncmp(name, "system.", 7) == 0) { + if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid)) + r = -CEPHFS_EPERM; + } else { + r = inode_permission(in, perms, want); + } +out: + ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +std::ostream& operator<<(std::ostream &out, const UserPerm& perm) { + out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")"; + return out; +} + +int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " stx_mode: " + << hex << stx->stx_mode << " mask:" << mask << dec << dendl; + int r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + if (mask & CEPH_SETATTR_SIZE) { + r = inode_permission(in, perms, MAY_WRITE); + if (r < 0) + goto out; + } + + r = -CEPHFS_EPERM; + if (mask & CEPH_SETATTR_UID) { + if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid)) + goto out; + } + if (mask & CEPH_SETATTR_GID) { + if (perms.uid() != 0 && (perms.uid() != in->uid || + (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid))) + goto out; + } + + if (mask & CEPH_SETATTR_MODE) { + uint32_t m = ~stx->stx_mode & in->mode; // mode bits removed + ldout(cct, 20) << __func__ << " " << *in << " = " << hex << m << dec << dendl; + if (perms.uid() != 0 && perms.uid() != in->uid && + /* + * Currently the kernel fuse and libfuse code is buggy and + * won't pass the ATTR_KILL_SUID/ATTR_KILL_SGID to ceph-fuse. + * But will just set the ATTR_MODE and at the same time by + * clearing the suid/sgid bits. + * + * Only allow unprivileged users to clear S_ISUID and S_ISUID. + */ + (m & ~(S_ISUID | S_ISGID))) + goto out; + + gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid; + if (perms.uid() != 0 && !perms.gid_in_groups(i_gid)) + stx->stx_mode &= ~S_ISGID; + } + + if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME | + CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) { + if (perms.uid() != 0 && perms.uid() != in->uid) { + int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME; + if (!(mask & CEPH_SETATTR_MTIME_NOW)) + check_mask |= CEPH_SETATTR_MTIME; + if (!(mask & CEPH_SETATTR_ATIME_NOW)) + check_mask |= CEPH_SETATTR_ATIME; + if (check_mask & mask) { + goto out; + } else { + r = inode_permission(in, perms, MAY_WRITE); + if (r < 0) + goto out; + } + } + } + r = 0; +out: + ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +int Client::may_open(Inode *in, int flags, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl; + unsigned want = 0; + + if ((flags & O_ACCMODE) == O_WRONLY) + want = MAY_WRITE; + else if ((flags & O_ACCMODE) == O_RDWR) + want = MAY_READ | MAY_WRITE; + else if ((flags & O_ACCMODE) == O_RDONLY) + want = MAY_READ; + if (flags & O_TRUNC) + want |= MAY_WRITE; + + int r = 0; + switch (in->mode & S_IFMT) { + case S_IFLNK: + r = -CEPHFS_ELOOP; + goto out; + case S_IFDIR: + if (want & MAY_WRITE) { + r = -CEPHFS_EISDIR; + goto out; + } + break; + } + + r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + r = inode_permission(in, perms, want); +out: + ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +int Client::may_lookup(Inode *dir, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl; + int r = _getattr_for_perm(dir, perms); + if (r < 0) + goto out; + + r = inode_permission(dir, perms, MAY_EXEC); +out: + ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl; + return r; +} + +int Client::may_create(Inode *dir, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl; + int r = _getattr_for_perm(dir, perms); + if (r < 0) + goto out; + + r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE); +out: + ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl; + return r; +} + +int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl; + int r = _getattr_for_perm(dir, perms); + if (r < 0) + goto out; + + r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE); + if (r < 0) + goto out; + + /* 'name == NULL' means rmsnap w/o permission checks */ + if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) { + InodeRef otherin; + r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms); + if (r < 0) + goto out; + if (dir->uid != perms.uid() && otherin->uid != perms.uid()) + r = -CEPHFS_EPERM; + } +out: + ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl; + return r; +} + +int Client::may_delete(const char *relpath, const UserPerm& perms) { + ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &dir, perms); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + int r = may_delete(dir.get(), name.c_str(), perms); + if (r < 0) + return r; + } + + return 0; +} + +int Client::may_hardlink(Inode *in, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl; + int r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + if (perms.uid() == 0 || perms.uid() == in->uid) { + r = 0; + goto out; + } + + r = -CEPHFS_EPERM; + if (!S_ISREG(in->mode)) + goto out; + + if (in->mode & S_ISUID) + goto out; + + if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + goto out; + + r = inode_permission(in, perms, MAY_READ | MAY_WRITE); +out: + ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +int Client::_getattr_for_perm(Inode *in, const UserPerm& perms) +{ + int mask = CEPH_STAT_CAP_MODE; + bool force = false; + if (acl_type != NO_ACL) { + mask |= CEPH_STAT_CAP_XATTR; + force = in->xattr_version == 0; + } + return _getattr(in, mask, perms, force); +} + +vinodeno_t Client::_get_vino(Inode *in) +{ + /* The caller must hold the client lock */ + return vinodeno_t(in->ino, in->snapid); +} + +/** + * Resolve an MDS spec to a list of MDS daemon GIDs. + * + * The spec is a string representing a GID, rank, filesystem:rank, or name/id. + * It may be '*' in which case it matches all GIDs. + * + * If no error is returned, the `targets` vector will be populated with at least + * one MDS. + */ +int Client::resolve_mds( + const std::string &mds_spec, + std::vector<mds_gid_t> *targets) +{ + ceph_assert(fsmap); + ceph_assert(targets != nullptr); + + mds_role_t role; + CachedStackStringStream css; + int role_r = fsmap->parse_role(mds_spec, &role, *css); + if (role_r == 0) { + // We got a role, resolve it to a GID + auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank); + ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '" + << role << "' aka " << info.human_name() << dendl; + targets->push_back(info.global_id); + return 0; + } + + std::string strtol_err; + long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err); + if (strtol_err.empty()) { + // It is a possible GID + const mds_gid_t mds_gid = mds_gid_t(rank_or_gid); + if (fsmap->gid_exists(mds_gid)) { + auto& info = fsmap->get_info_gid(mds_gid); + ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka " + << info.human_name() << dendl; + targets->push_back(mds_gid); + return 0; + } else { + lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map" + << dendl; + lderr(cct) << "FSMap: " << *fsmap << dendl; + return -CEPHFS_ENOENT; + } + } else if (mds_spec == "*") { + // It is a wildcard: use all MDSs + const auto& mds_info = fsmap->get_mds_info(); + + ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl; + if (mds_info.empty()) { + lderr(cct) << __func__ << ": no MDS daemons found" << dendl; + lderr(cct) << "FSMap: " << *fsmap << dendl; + return -CEPHFS_ENOENT; + } + + for (const auto& [gid, info] : mds_info) { + ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl; + targets->push_back(gid); + } + return 0; + } else { + // It did not parse as an integer, it is not a wildcard, it must be a name + const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec); + if (mds_gid == mds_gid_t{0}) { + lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl; + lderr(cct) << "FSMap: " << *fsmap << dendl; + return -CEPHFS_ENOENT; + } else { + auto& info = fsmap->get_info_gid(mds_gid); + ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec + << "' to " << info.human_name() << dendl; + targets->push_back(mds_gid); + } + return 0; + } +} + + +/** + * Authenticate with mon and establish global ID + */ +int Client::authenticate() +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + if (monclient->is_authenticated()) { + return 0; + } + + client_lock.unlock(); + int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count()); + client_lock.lock(); + if (r < 0) { + return r; + } + + whoami = monclient->get_global_id(); + messenger->set_myname(entity_name_t::CLIENT(whoami.v)); + + return 0; +} + +int Client::fetch_fsmap(bool user) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + // Retrieve FSMap to enable looking up daemon addresses. We need FSMap + // rather than MDSMap because no one MDSMap contains all the daemons, and + // a `tell` can address any daemon. + version_t fsmap_latest; + bs::error_code ec; + do { + client_lock.unlock(); + std::tie(fsmap_latest, std::ignore) = + monclient->get_version("fsmap", ca::use_blocked[ec]); + client_lock.lock(); + } while (ec == bs::errc::resource_unavailable_try_again); + + if (ec) { + lderr(cct) << "Failed to learn FSMap version: " << ec << dendl; + return ceph::from_error_code(ec); + } + + ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl; + + if (user) { + if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) { + monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME); + monclient->renew_subs(); + wait_on_list(waiting_for_fsmap); + } + ceph_assert(fsmap_user); + ceph_assert(fsmap_user->get_epoch() >= fsmap_latest); + } else { + if (!fsmap || fsmap->get_epoch() < fsmap_latest) { + monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME); + monclient->renew_subs(); + wait_on_list(waiting_for_fsmap); + } + ceph_assert(fsmap); + ceph_assert(fsmap->get_epoch() >= fsmap_latest); + } + ldout(cct, 10) << __func__ << " finished waiting for FSMap version " + << fsmap_latest << dendl; + return 0; +} + +/** + * + * @mds_spec one of ID, rank, GID, "*" + * + */ +int Client::mds_command( + const std::string &mds_spec, + const vector<string>& cmd, + const bufferlist& inbl, + bufferlist *outbl, + string *outs, + Context *onfinish) +{ + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + if (!iref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::unique_lock cl(client_lock); + + int r; + r = authenticate(); + if (r < 0) { + return r; + } + + r = fetch_fsmap(false); + if (r < 0) { + return r; + } + + // Look up MDS target(s) of the command + std::vector<mds_gid_t> targets; + r = resolve_mds(mds_spec, &targets); + if (r < 0) { + return r; + } + + // If daemons are laggy, we won't send them commands. If all + // are laggy then we fail. + std::vector<mds_gid_t> non_laggy; + for (const auto& gid : targets) { + const auto info = fsmap->get_info_gid(gid); + if (!info.laggy()) { + non_laggy.push_back(gid); + } + } + if (non_laggy.size() == 0) { + *outs = "All targeted MDS daemons are laggy"; + return -CEPHFS_ENOENT; + } + + if (metadata.empty()) { + // We are called on an unmounted client, so metadata + // won't be initialized yet. + populate_metadata(""); + } + + // Send commands to targets + C_GatherBuilder gather(cct, onfinish); + for (const auto& target_gid : non_laggy) { + const auto info = fsmap->get_info_gid(target_gid); + + // Open a connection to the target MDS + ConnectionRef conn = messenger->connect_to_mds(info.get_addrs()); + + cl.unlock(); + { + std::scoped_lock cmd_lock(command_lock); + // Generate MDSCommandOp state + auto &op = command_table.start_command(); + + op.on_finish = gather.new_sub(); + op.cmd = cmd; + op.outbl = outbl; + op.outs = outs; + op.inbl = inbl; + op.mds_gid = target_gid; + op.con = conn; + + ldout(cct, 4) << __func__ << ": new command op to " << target_gid + << " tid=" << op.tid << cmd << dendl; + + // Construct and send MCommand + MessageRef m = op.get_message(monclient->get_fsid()); + conn->send_message2(std::move(m)); + } + cl.lock(); + } + gather.activate(); + + return 0; +} + +void Client::handle_command_reply(const MConstRef<MCommandReply>& m) +{ + ceph_tid_t const tid = m->get_tid(); + + ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl; + + std::scoped_lock cmd_lock(command_lock); + if (!command_table.exists(tid)) { + ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl; + return; + } + + auto &op = command_table.get_command(tid); + if (op.outbl) { + *op.outbl = m->get_data(); + } + if (op.outs) { + *op.outs = m->rs; + } + + if (op.on_finish) { + op.on_finish->complete(m->r); + } + + command_table.erase(tid); +} + +// ------------------- +// MOUNT + +int Client::subscribe_mdsmap(const std::string &fs_name) +{ + int r = authenticate(); + if (r < 0) { + lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl; + return r; + } + + std::string resolved_fs_name; + if (fs_name.empty()) { + resolved_fs_name = cct->_conf.get_val<std::string>("client_fs"); + if (resolved_fs_name.empty()) + // Try the backwards compatibility fs name option + resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace"); + } else { + resolved_fs_name = fs_name; + } + + std::string want = "mdsmap"; + if (!resolved_fs_name.empty()) { + r = fetch_fsmap(true); + if (r < 0) + return r; + fscid = fsmap_user->get_fs_cid(resolved_fs_name); + if (fscid == FS_CLUSTER_ID_NONE) { + return -CEPHFS_ENOENT; + } + + std::ostringstream oss; + oss << want << "." << fscid; + want = oss.str(); + } + ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl; + + monclient->sub_want(want, 0, 0); + monclient->renew_subs(); + + return 0; +} + +int Client::mount(const std::string &mount_root, const UserPerm& perms, + bool require_mds, const std::string &fs_name) +{ + ceph_assert(is_initialized()); + + /* + * To make sure that the _unmount() must wait until the mount() + * is done. + */ + RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false); + if (!mref_writer.is_first_writer()) // already mounting or mounted + return 0; + + std::unique_lock cl(client_lock); + + int r = subscribe_mdsmap(fs_name); + if (r < 0) { + lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl; + return r; + } + + start_tick_thread(); // start tick thread + + if (require_mds) { + while (1) { + auto availability = mdsmap->is_cluster_available(); + if (availability == MDSMap::STUCK_UNAVAILABLE) { + // Error out + ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl; + return CEPH_FUSE_NO_MDS_UP; + } else if (availability == MDSMap::AVAILABLE) { + // Continue to mount + break; + } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) { + // Else, wait. MDSMonitor will update the map to bring + // us to a conclusion eventually. + wait_on_list(waiting_for_mdsmap); + } else { + // Unexpected value! + ceph_abort(); + } + } + } + + if(mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) { + lderr(cct) << "connections cannot be made while" + " the flag refuse_client_session is set" << dendl; + return -CEPHFS_EACCES; + } + + populate_metadata(mount_root.empty() ? "/" : mount_root); + + filepath fp(CEPH_INO_ROOT); + if (!mount_root.empty()) { + fp = filepath(mount_root.c_str()); + } + while (true) { + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR); + req->set_filepath(fp); + req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL; + int res = make_request(req, perms); + if (res < 0) { + if (res == -CEPHFS_EACCES && root) { + ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl; + break; + } + return res; + } + + if (fp.depth()) + fp.pop_dentry(); + else + break; + } + + ceph_assert(root); + _ll_get(root.get()); + + // trace? + if (!cct->_conf->client_trace.empty()) { + traceout.open(cct->_conf->client_trace.c_str()); + if (traceout.is_open()) { + ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl; + } else { + ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl; + } + } + + /* + ldout(cct, 3) << "op: // client trace data structs" << dendl; + ldout(cct, 3) << "op: struct stat st;" << dendl; + ldout(cct, 3) << "op: struct utimbuf utim;" << dendl; + ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl; + ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl; + ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl; + ldout(cct, 3) << "op: map<int, int> open_files;" << dendl; + ldout(cct, 3) << "op: int fd;" << dendl; + */ + + mref_writer.update_state(CLIENT_MOUNTED); + return 0; +} + +// UNMOUNT + +void Client::_close_sessions() +{ + for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) { + if (it->second->state == MetaSession::STATE_REJECTED) + mds_sessions.erase(it++); + else + ++it; + } + + while (!mds_sessions.empty()) { + // send session closes! + for (auto &p : mds_sessions) { + if (p.second->state != MetaSession::STATE_CLOSING) { + _close_mds_session(p.second.get()); + mds_ranks_closing.insert(p.first); + } + } + + // wait for sessions to close + double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count(); + ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: " + << timo << "s)" << dendl; + std::unique_lock l{client_lock, std::adopt_lock}; + if (!timo) { + mount_cond.wait(l); + } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) { + ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl; + while (!mds_ranks_closing.empty()) { + auto session = mds_sessions.at(*mds_ranks_closing.begin()); + // this prunes entry from mds_sessions and mds_ranks_closing + _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT); + } + } + + mds_ranks_closing.clear(); + l.release(); + } +} + +void Client::flush_mdlog_sync(Inode *in) +{ + if (in->unsafe_ops.empty()) { + return; + } + + std::set<mds_rank_t> anchor; + for (auto &&p : in->unsafe_ops) { + anchor.emplace(p->mds); + } + if (in->auth_cap) { + anchor.emplace(in->auth_cap->session->mds_num); + } + + for (auto &rank : anchor) { + auto session = &mds_sessions.at(rank); + flush_mdlog(session->get()); + } +} + +void Client::flush_mdlog_sync() +{ + if (mds_requests.empty()) + return; + for (auto &p : mds_sessions) { + flush_mdlog(p.second.get()); + } +} + +void Client::flush_mdlog(MetaSession *session) +{ + // Only send this to Luminous or newer MDS daemons, older daemons + // will crash if they see an unknown CEPH_SESSION_* value in this msg. + const uint64_t features = session->con->get_features(); + if (HAVE_FEATURE(features, SERVER_LUMINOUS)) { + auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG); + session->con->send_message2(std::move(m)); + } +} + + +void Client::_abort_mds_sessions(int err) +{ + for (auto p = mds_requests.begin(); p != mds_requests.end(); ) { + auto req = p->second; + ++p; + // unsafe requests will be removed during close session below. + if (req->got_unsafe) + continue; + + req->abort(err); + if (req->caller_cond) { + req->kick = true; + req->caller_cond->notify_all(); + } + } + + // Process aborts on any requests that were on this waitlist. + // Any requests that were on a waiting_for_open session waitlist + // will get kicked during close session below. + signal_cond_list(waiting_for_mdsmap); + + // Force-close all sessions + while(!mds_sessions.empty()) { + auto session = mds_sessions.begin()->second; + _closed_mds_session(session.get(), err); + } +} + +void Client::_unmount(bool abort) +{ + /* + * We are unmounting the client. + * + * Just declare the state to STATE_UNMOUNTING to block and fail + * any new comming "reader" and then try to wait all the in-flight + * "readers" to finish. + */ + RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false); + if (!mref_writer.is_first_writer()) + return; + mref_writer.wait_readers_done(); + + std::unique_lock lock{client_lock}; + + if (abort || blocklisted) { + ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl; + } else { + ldout(cct, 2) << "unmounting" << dendl; + } + + deleg_timeout = 0; + + if (abort) { + mount_aborted = true; + // Abort all mds sessions + _abort_mds_sessions(-CEPHFS_ENOTCONN); + + objecter->op_cancel_writes(-CEPHFS_ENOTCONN); + } else { + // flush the mdlog for pending requests, if any + flush_mdlog_sync(); + } + + mount_cond.wait(lock, [this] { + // Only wait for write OPs + for (auto& [tid, req] : mds_requests) { + if (req->is_write()) { + ldout(cct, 10) << "waiting for write request '" << tid + << "' to complete, currently there are " + << mds_requests.size() + << " outstanding read/write requests" + << dendl; + return false; + } + } + return true; + }); + + cwd.reset(); + root.reset(); + + // clean up any unclosed files + while (!fd_map.empty()) { + Fh *fh = fd_map.begin()->second; + fd_map.erase(fd_map.begin()); + ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl; + _release_fh(fh); + } + + while (!ll_unclosed_fh_set.empty()) { + set<Fh*>::iterator it = ll_unclosed_fh_set.begin(); + Fh *fh = *it; + ll_unclosed_fh_set.erase(fh); + ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl; + _release_fh(fh); + } + + while (!opened_dirs.empty()) { + dir_result_t *dirp = *opened_dirs.begin(); + ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl; + _closedir(dirp); + } + + _ll_drop_pins(); + + if (cct->_conf->client_oc) { + // flush/release all buffered data + std::list<InodeRef> anchor; + for (auto& p : inode_map) { + Inode *in = p.second; + if (!in) { + ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl; + ceph_assert(in); + } + + // prevent inode from getting freed + anchor.emplace_back(in); + + if (abort || blocklisted) { + objectcacher->purge_set(&in->oset); + } else if (!in->caps.empty()) { + _release(in); + _flush(in, new C_Client_FlushComplete(this, in)); + } + } + } + + if (abort || blocklisted) { + for (auto &q : mds_sessions) { + auto s = q.second; + for (auto p = s->dirty_list.begin(); !p.end(); ) { + Inode *in = *p; + ++p; + if (in->dirty_caps) { + ldout(cct, 0) << " drop dirty caps on " << *in << dendl; + in->mark_caps_clean(); + put_inode(in); + } + } + } + } else { + flush_caps_sync(); + wait_sync_caps(last_flush_tid); + } + + // empty lru cache + trim_cache(); + + delay_put_inodes(); + + while (lru.lru_get_size() > 0 || + !inode_map.empty()) { + ldout(cct, 2) << "cache still has " << lru.lru_get_size() + << "+" << inode_map.size() << " items" + << ", waiting (for caps to release?)" + << dendl; + + if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5)); + r == std::cv_status::timeout) { + dump_cache(NULL); + } + } + ceph_assert(lru.lru_get_size() == 0); + ceph_assert(inode_map.empty()); + + // stop tracing + if (!cct->_conf->client_trace.empty()) { + ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl; + traceout.close(); + } + + // stop the tick thread + tick_thread_stopped = true; + upkeep_cond.notify_one(); + + _close_sessions(); + + // release the global snapshot realm + SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM]; + if (global_realm) { + ceph_assert(global_realm->nref == 1); + put_snap_realm(global_realm); + } + + mref_writer.update_state(CLIENT_UNMOUNTED); + + /* + * Stop the remount_queue before clearing the mountpoint memory + * to avoid possible use-after-free bug. + */ + if (remount_cb) { + ldout(cct, 10) << "unmount stopping remount finisher" << dendl; + remount_finisher.wait_for_empty(); + remount_finisher.stop(); + remount_cb = nullptr; + } + + ldout(cct, 2) << "unmounted." << dendl; +} + +void Client::unmount() +{ + _unmount(false); +} + +void Client::abort_conn() +{ + _unmount(true); +} + +void Client::flush_cap_releases() +{ + uint64_t nr_caps = 0; + + // send any cap releases + for (auto &p : mds_sessions) { + auto session = p.second; + if (session->release && mdsmap->is_clientreplay_or_active_or_stopping( + p.first)) { + nr_caps += session->release->caps.size(); + if (cct->_conf->client_inject_release_failure) { + ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl; + } else { + session->con->send_message2(std::move(session->release)); + } + session->release.reset(); + } + } + + if (nr_caps > 0) { + dec_pinned_icaps(nr_caps); + } +} + +void Client::renew_and_flush_cap_releases() +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + if (!mount_aborted && mdsmap->get_epoch()) { + // renew caps? + auto el = ceph::coarse_mono_clock::now() - last_cap_renew; + if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0)) + renew_caps(); + + flush_cap_releases(); + } +} + +void Client::tick() +{ + ldout(cct, 20) << "tick" << dendl; + + auto now = ceph::coarse_mono_clock::now(); + + /* + * If the mount() is not finished + */ + if (is_mounting() && !mds_requests.empty()) { + MetaRequest *req = mds_requests.begin()->second; + + if (req->created + mount_timeout < now) { + req->abort(-CEPHFS_ETIMEDOUT); + if (req->caller_cond) { + req->kick = true; + req->caller_cond->notify_all(); + } + signal_cond_list(waiting_for_mdsmap); + for (auto &p : mds_sessions) { + signal_context_list(p.second->waiting_for_open); + } + } + } + + renew_and_flush_cap_releases(); + + // delayed caps + xlist<Inode*>::iterator p = delayed_list.begin(); + while (!p.end()) { + Inode *in = *p; + ++p; + if (!mount_aborted && in->hold_caps_until > now) + break; + delayed_list.pop_front(); + if (!mount_aborted) + check_caps(in, CHECK_CAPS_NODELAY); + } + + if (!mount_aborted) + collect_and_send_metrics(); + + delay_put_inodes(is_unmounting()); + trim_cache(true); + + if (blocklisted && (is_mounted() || is_unmounting()) && + last_auto_reconnect + std::chrono::seconds(30 * 60) < now && + cct->_conf.get_val<bool>("client_reconnect_stale")) { + messenger->client_reset(); + fd_gen++; // invalidate open files + blocklisted = false; + _kick_stale_sessions(); + last_auto_reconnect = now; + } +} + +void Client::start_tick_thread() +{ + upkeeper = std::thread([this]() { + using time = ceph::coarse_mono_time; + using sec = std::chrono::seconds; + + auto last_tick = time::min(); + + std::unique_lock cl(client_lock); + while (!tick_thread_stopped) { + auto now = clock::now(); + auto since = now - last_tick; + + auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval")); + auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay")); + + auto interval = std::max(t_interval, d_interval); + if (likely(since >= interval*.90)) { + tick(); + last_tick = clock::now(); + } else { + interval -= since; + } + + ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl; + if (!tick_thread_stopped) + upkeep_cond.wait_for(cl, interval); + } + }); +} + +void Client::collect_and_send_metrics() { + ldout(cct, 20) << __func__ << dendl; + + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + // right now, we only track and send global metrics. its sufficient + // to send these metrics to MDS rank0. + collect_and_send_global_metrics(); +} + +void Client::collect_and_send_global_metrics() { + ldout(cct, 20) << __func__ << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + /* Do not send the metrics until the MDS rank is ready */ + if (!mdsmap->is_active((mds_rank_t)0)) { + ldout(cct, 5) << __func__ << " MDS rank 0 is not ready yet -- not sending metric" + << dendl; + return; + } + + if (!have_open_session((mds_rank_t)0)) { + ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric" + << dendl; + return; + } + auto session = _get_or_open_mds_session((mds_rank_t)0); + if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) { + ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl; + return; + } + + ClientMetricMessage metric; + std::vector<ClientMetricMessage> message; + + // read latency + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) { + metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read), + logger->tget(l_c_rd_avg), + logger->get(l_c_rd_sqsum), + nr_read_request)); + message.push_back(metric); + } + + // write latency + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) { + metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat), + logger->tget(l_c_wr_avg), + logger->get(l_c_wr_sqsum), + nr_write_request)); + message.push_back(metric); + } + + // metadata latency + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) { + metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat), + logger->tget(l_c_md_avg), + logger->get(l_c_md_sqsum), + nr_metadata_request)); + message.push_back(metric); + } + + // cap hit ratio -- nr_caps is unused right now + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) { + auto [cap_hits, cap_misses] = get_cap_hit_rates(); + metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0)); + message.push_back(metric); + } + + // dentry lease hit ratio + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) { + auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates(); + metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr)); + message.push_back(metric); + } + + // opened files + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) { + auto [opened_files, total_inodes] = get_opened_files_rates(); + metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes)); + message.push_back(metric); + } + + // pinned i_caps + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) { + auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates(); + metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes)); + message.push_back(metric); + } + + // opened inodes + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) { + auto [opened_inodes, total_inodes] = get_opened_inodes_rates(); + metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes)); + message.push_back(metric); + } + + // read io sizes + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) { + metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops, + total_read_size)); + message.push_back(metric); + } + + // write io sizes + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) { + metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops, + total_write_size)); + message.push_back(metric); + } + + session->con->send_message2(make_message<MClientMetrics>(std::move(message))); +} + +void Client::renew_caps() +{ + ldout(cct, 10) << "renew_caps()" << dendl; + last_cap_renew = ceph::coarse_mono_clock::now(); + + for (auto &p : mds_sessions) { + ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl; + if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN) + renew_caps(p.second.get()); + } +} + +void Client::renew_caps(MetaSession *session) +{ + ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl; + session->last_cap_renew_request = ceph_clock_now(); + uint64_t seq = ++session->cap_renew_seq; + session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq)); +} + + +// =============================================================== +// high level (POSIXy) interface + +int Client::_do_lookup(Inode *dir, const string& name, int mask, + InodeRef *target, const UserPerm& perms) +{ + int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + MetaRequest *req = new MetaRequest(op); + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP) + mask |= DEBUG_GETATTR_CAPS; + req->head.args.getattr.mask = mask; + + ldout(cct, 10) << __func__ << " on " << path << dendl; + + int r = make_request(req, perms, target); + ldout(cct, 10) << __func__ << " res is " << r << dendl; + return r; +} + +bool Client::_dentry_valid(const Dentry *dn) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + // is dn lease valid? + utime_t now = ceph_clock_now(); + if (dn->lease_mds >= 0 && dn->lease_ttl > now && + mds_sessions.count(dn->lease_mds)) { + auto s = mds_sessions.at(dn->lease_mds); + if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) { + dlease_hit(); + return true; + } + + ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen + << " vs lease_gen " << dn->lease_gen << dendl; + } + + dlease_miss(); + return false; +} + +int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target, + const UserPerm& perms, std::string* alternate_name, + bool is_rename) +{ + int r = 0; + Dentry *dn = NULL; + bool did_lookup_request = false; + // can only request shared caps + mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT; + + if (dname == "..") { + if (dir->dentries.empty()) { + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT); + filepath path(dir->ino); + req->set_filepath(path); + + InodeRef tmptarget; + int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds()); + + if (r == 0) { + *target = std::move(tmptarget); + ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl; + } else { + *target = dir; + } + } + else + *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked + goto done; + } + + if (dname == ".") { + *target = dir; + goto done; + } + + if (!dir->is_dir()) { + r = -CEPHFS_ENOTDIR; + goto done; + } + + if (dname.length() > NAME_MAX) { + r = -CEPHFS_ENAMETOOLONG; + goto done; + } + + if (dname == cct->_conf->client_snapdir && + dir->snapid == CEPH_NOSNAP) { + *target = open_snapdir(dir); + goto done; + } + +relookup: + if (dir->dir && + dir->dir->dentries.count(dname)) { + dn = dir->dir->dentries[dname]; + + ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds + << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl; + + if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) { + if (_dentry_valid(dn)) { + // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to + // make trim_caps() behave. + dir->try_touch_cap(dn->lease_mds); + goto hit_dn; + } + // dir shared caps? + if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) { + if (dn->cap_shared_gen == dir->shared_gen && + (!dn->inode || dn->inode->caps_issued_mask(mask, true))) + goto hit_dn; + if (!dn->inode && (dir->flags & I_COMPLETE)) { + ldout(cct, 10) << __func__ << " concluded ENOENT locally for " + << *dir << " dn '" << dname << "'" << dendl; + return -CEPHFS_ENOENT; + } + } + } else { + ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl; + } + + // In rare case during the rename if another thread tries to + // lookup the dst dentry, it may get an inconsistent result + // that both src dentry and dst dentry will link to the same + // inode at the same time. + // Will wait the rename to finish and try it again. + if (!is_rename && dn->is_renaming) { + ldout(cct, 1) << __func__ << " dir " << *dir + << " rename is on the way, will wait for dn '" + << dname << "'" << dendl; + wait_on_list(waiting_for_rename); + goto relookup; + } + } else { + // can we conclude ENOENT locally? + if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) && + (dir->flags & I_COMPLETE)) { + ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl; + return -CEPHFS_ENOENT; + } + } + + if (did_lookup_request) { + r = 0; + goto done; + } + r = _do_lookup(dir, dname, mask, target, perms); + did_lookup_request = true; + if (r == 0) { + /* complete lookup to get dentry for alternate_name */ + goto relookup; + } else { + goto done; + } + + hit_dn: + if (dn->inode) { + *target = dn->inode; + if (alternate_name) + *alternate_name = dn->alternate_name; + } else { + r = -CEPHFS_ENOENT; + } + touch_dn(dn); + goto done; + + done: + if (r < 0) + ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl; + else + ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl; + return r; +} + +Dentry *Client::get_or_create(Inode *dir, const char* name) +{ + // lookup + ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl; + dir->open_dir(); + if (dir->dir->dentries.count(name)) + return dir->dir->dentries[name]; + else // otherwise link up a new one + return link(dir->dir, name, NULL, NULL); +} + +int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 10) << __func__ << ": " << path << dendl; + + std::scoped_lock lock(client_lock); + + return path_walk(path, wdr, perms, followsym); +} + +int Client::path_walk(const filepath& origpath, InodeRef *end, + const UserPerm& perms, bool followsym, int mask, InodeRef dirinode) +{ + walk_dentry_result wdr; + int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode); + *end = std::move(wdr.in); + return rc; +} + +int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms, + bool followsym, int mask, InodeRef dirinode) +{ + filepath path = origpath; + InodeRef cur; + std::string alternate_name; + if (origpath.absolute()) + cur = root; + else if (!dirinode) + cur = cwd; + else { + cur = dirinode; + } + ceph_assert(cur); + + ldout(cct, 20) << __func__ << " cur=" << *cur << dendl; + ldout(cct, 10) << __func__ << " " << path << dendl; + + int symlinks = 0; + + unsigned i=0; + while (i < path.depth() && cur) { + int caps = 0; + const string &dname = path[i]; + ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl; + ldout(cct, 20) << " (path is " << path << ")" << dendl; + InodeRef next; + if (cct->_conf->client_permissions) { + int r = may_lookup(cur.get(), perms); + if (r < 0) + return r; + caps = CEPH_CAP_AUTH_SHARED; + } + + /* Get extra requested caps on the last component */ + if (i == (path.depth() - 1)) + caps |= mask; + int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name); + if (r < 0) + return r; + // only follow trailing symlink if followsym. always follow + // 'directory' symlinks. + if (next && next->is_symlink()) { + symlinks++; + ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl; + if (symlinks > MAXSYMLINKS) { + return -CEPHFS_ELOOP; + } + + if (i < path.depth() - 1) { + // dir symlink + // replace consumed components of path with symlink dir target + filepath resolved(next->symlink.c_str()); + resolved.append(path.postfixpath(i + 1)); + path = resolved; + i = 0; + if (next->symlink[0] == '/') { + cur = root; + } + continue; + } else if (followsym) { + if (next->symlink[0] == '/') { + path = next->symlink.c_str(); + i = 0; + // reset position + cur = root; + } else { + filepath more(next->symlink.c_str()); + // we need to remove the symlink component from off of the path + // before adding the target that the symlink points to. remain + // at the same position in the path. + path.pop_dentry(); + path.append(more); + } + continue; + } + } + cur.swap(next); + i++; + } + if (!cur) + return -CEPHFS_ENOENT; + if (result) { + result->in = std::move(cur); + result->alternate_name = std::move(alternate_name); + } + return 0; +} + + +// namespace ops + +int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "link" << std::endl; + tout(cct) << relexisting << std::endl; + tout(cct) << relpath << std::endl; + + filepath existing(relexisting); + + InodeRef in, dir; + + std::scoped_lock lock(client_lock); + int r = path_walk(existing, &in, perm, true); + if (r < 0) + return r; + if (std::string(relpath) == "/") { + r = -CEPHFS_EEXIST; + return r; + } + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + + r = path_walk(path, &dir, perm, true); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + if (S_ISDIR(in->mode)) { + r = -CEPHFS_EPERM; + return r; + } + r = may_hardlink(in.get(), perm); + if (r < 0) + return r; + r = may_create(dir.get(), perm); + if (r < 0) + return r; + } + r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name)); + return r; +} + +int Client::unlink(const char *relpath, const UserPerm& perm) +{ + return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm); +} + +int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << flags << std::endl; + + if (std::string(relpath) == "/") { + return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR; + } + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + + std::scoped_lock lock(client_lock); + + InodeRef dirinode; + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + r = path_walk(path, &dir, perm, true, 0, dirinode); + if (r < 0) { + return r; + } + if (cct->_conf->client_permissions) { + r = may_delete(dir.get(), name.c_str(), perm); + if (r < 0) { + return r; + } + } + if (flags & AT_REMOVEDIR) { + r = _rmdir(dir.get(), name.c_str(), perm); + } else { + r = _unlink(dir.get(), name.c_str(), perm); + } + return r; +} + +int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relfrom << std::endl; + tout(cct) << relto << std::endl; + + if (std::string(relfrom) == "/" || std::string(relto) == "/") + return -CEPHFS_EBUSY; + + filepath from(relfrom); + filepath to(relto); + string fromname = from.last_dentry(); + from.pop_dentry(); + string toname = to.last_dentry(); + to.pop_dentry(); + + InodeRef fromdir, todir; + + std::scoped_lock lock(client_lock); + int r = path_walk(from, &fromdir, perm); + if (r < 0) + goto out; + r = path_walk(to, &todir, perm); + if (r < 0) + goto out; + + if (cct->_conf->client_permissions) { + int r = may_delete(fromdir.get(), fromname.c_str(), perm); + if (r < 0) + return r; + r = may_delete(todir.get(), toname.c_str(), perm); + if (r < 0 && r != -CEPHFS_ENOENT) + return r; + } + r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name)); +out: + return r; +} + +// dirs + +int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name) +{ + return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name); +} + +int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm, + std::string alternate_name) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + ldout(cct, 10) << __func__ << ": " << relpath << dendl; + + if (std::string(relpath) == "/") { + return -CEPHFS_EEXIST; + } + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + + std::scoped_lock lock(client_lock); + + InodeRef dirinode; + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + r = path_walk(path, &dir, perm, true, 0, dirinode); + if (r < 0) { + return r; + } + if (cct->_conf->client_permissions) { + r = may_create(dir.get(), perm); + if (r < 0) { + return r; + } + } + return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name)); +} + +int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 10) << "Client::mkdirs " << relpath << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + + //get through existing parts of path + filepath path(relpath); + unsigned int i; + int r = 0, caps = 0; + InodeRef cur, next; + + std::scoped_lock lock(client_lock); + cur = cwd; + for (i=0; i<path.depth(); ++i) { + if (cct->_conf->client_permissions) { + r = may_lookup(cur.get(), perms); + if (r < 0) + break; + caps = CEPH_CAP_AUTH_SHARED; + } + r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms); + if (r < 0) + break; + cur.swap(next); + } + if (r!=-CEPHFS_ENOENT) return r; + ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl; + //make new directory at each level + for (; i<path.depth(); ++i) { + if (cct->_conf->client_permissions) { + r = may_create(cur.get(), perms); + if (r < 0) + return r; + } + //make new dir + r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next); + + //check proper creation/existence + if(-CEPHFS_EEXIST == r && i < path.depth() - 1) { + r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms); + } + if (r < 0) + return r; + //move to new dir and continue + cur.swap(next); + ldout(cct, 20) << __func__ << ": successfully created directory " + << filepath(cur->ino).get_path() << dendl; + } + return 0; +} + +int Client::rmdir(const char *relpath, const UserPerm& perms) +{ + return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms); +} + +int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << rdev << std::endl; + + if (std::string(relpath) == "/") + return -CEPHFS_EEXIST; + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &dir, perms); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + int r = may_create(dir.get(), perms); + if (r < 0) + return r; + } + return _mknod(dir.get(), name.c_str(), mode, rdev, perms); +} + +// symlinks + +int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name) +{ + return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name); +} + +int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms, + std::string alternate_name) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << target << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + + if (std::string(relpath) == "/") { + return -CEPHFS_EEXIST; + } + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + + std::scoped_lock lock(client_lock); + + InodeRef dirinode; + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + r = path_walk(path, &dir, perms, true, 0, dirinode); + if (r < 0) { + return r; + } + if (cct->_conf->client_permissions) { + int r = may_create(dir.get(), perms); + if (r < 0) { + return r; + } + } + return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name)); +} + +int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms) +{ + return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms); +} + +int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + + InodeRef dirinode; + std::scoped_lock lock(client_lock); + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + InodeRef in; + filepath path(relpath); + r = path_walk(path, &in, perms, false, 0, dirinode); + if (r < 0) { + return r; + } + + return _readlink(in.get(), buf, size); +} + +int Client::_readlink(Inode *in, char *buf, size_t size) +{ + if (!in->is_symlink()) + return -CEPHFS_EINVAL; + + // copy into buf (at most size bytes) + int r = in->symlink.length(); + if (r > (int)size) + r = size; + memcpy(buf, in->symlink.c_str(), r); + return r; +} + + +// inode stuff + +int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force) +{ + bool yes = in->caps_issued_mask(mask, true); + + ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl; + if (yes && !force) + return 0; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + req->head.args.getattr.mask = mask; + + int res = make_request(req, perms); + ldout(cct, 10) << __func__ << " result=" << res << dendl; + return res; +} + +int Client::_getvxattr( + Inode *in, + const UserPerm& perms, + const char *xattr_name, + ssize_t size, + void *value, + mds_rank_t rank) +{ + if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) { + return -CEPHFS_ENODATA; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + req->set_string2(xattr_name); + + bufferlist bl; + int res = make_request(req, perms, nullptr, nullptr, rank, &bl, + CEPHFS_FEATURE_OP_GETVXATTR); + ldout(cct, 10) << __func__ << " result=" << res << dendl; + + if (res < 0) { + if (res == -CEPHFS_EOPNOTSUPP) { + return -CEPHFS_ENODATA; + } + return res; + } + + std::string buf; + auto p = bl.cbegin(); + + DECODE_START(1, p); + decode(buf, p); + DECODE_FINISH(p); + + ssize_t len = buf.length(); + + res = len; // refer to man getxattr(2) for output buffer size == 0 + + if (size > 0) { + if (len > size) { + res = -CEPHFS_ERANGE; // insufficient output buffer space + } else { + memcpy(value, buf.c_str(), len); + } + } + return res; +} + +int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp, + std::vector<uint8_t>* aux) +{ + int issued = in->caps_issued(); + union ceph_mds_request_args args; + bool kill_sguid = false; + int inode_drop = 0; + size_t auxsize = 0; + + if (aux) + auxsize = aux->size(); + + ldout(cct, 10) << __func__ << " mask " << mask << " issued " << + ccap_string(issued) << " aux size " << auxsize << dendl; + + if (in->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + if ((mask & CEPH_SETATTR_SIZE) && + (uint64_t)stx->stx_size > in->size && + is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size, + perms)) { + return -CEPHFS_EDQUOT; + } + + // Can't set fscrypt_auth and file at the same time! + if ((mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)) == + (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)) + return -CEPHFS_EINVAL; + + if (!aux && (mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE))) + return -CEPHFS_EINVAL; + + memset(&args, 0, sizeof(args)); + + // make the change locally? + if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) || + (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) { + ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid() + << " != cap dirtier " << in->cap_dirtier_uid << ":" + << in->cap_dirtier_gid << ", forcing sync setattr" + << dendl; + /* + * This works because we implicitly flush the caps as part of the + * request, so the cap update check will happen with the writeback + * cap context, and then the setattr check will happen with the + * caller's context. + * + * In reality this pattern is likely pretty rare (different users + * setattr'ing the same file). If that turns out not to be the + * case later, we can build a more complex pipelined cap writeback + * infrastructure... + */ + mask |= CEPH_SETATTR_CTIME; + } + + if (!mask) { + // caller just needs us to bump the ctime + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + if (issued & CEPH_CAP_AUTH_EXCL) + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + else if (issued & CEPH_CAP_FILE_EXCL) + in->mark_caps_dirty(CEPH_CAP_FILE_EXCL); + else if (issued & CEPH_CAP_XATTR_EXCL) + in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL); + else + mask |= CEPH_SETATTR_CTIME; + } + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + kill_sguid = !!(mask & CEPH_SETATTR_KILL_SGUID); + } + + if (mask & CEPH_SETATTR_UID) { + ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl; + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->uid = stx->stx_uid; + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_UID; + kill_sguid = true; + } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) || + in->uid != stx->stx_uid) { + args.setattr.uid = stx->stx_uid; + inode_drop |= CEPH_CAP_AUTH_SHARED; + } else { + mask &= ~CEPH_SETATTR_UID; + } + } + + if (mask & CEPH_SETATTR_GID) { + ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl; + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->gid = stx->stx_gid; + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_GID; + kill_sguid = true; + } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) || + in->gid != stx->stx_gid) { + args.setattr.gid = stx->stx_gid; + inode_drop |= CEPH_CAP_AUTH_SHARED; + } else { + mask &= ~CEPH_SETATTR_GID; + } + } + + if (mask & CEPH_SETATTR_MODE) { + ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl; + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777); + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_MODE; + } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) || + in->mode != stx->stx_mode) { + args.setattr.mode = stx->stx_mode; + inode_drop |= CEPH_CAP_AUTH_SHARED; + } else { + mask &= ~CEPH_SETATTR_MODE; + } + } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) && S_ISREG(in->mode)) { + if (kill_sguid && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) { + in->mode &= ~(S_ISUID|S_ISGID); + } else { + if (mask & CEPH_SETATTR_KILL_SUID) { + in->mode &= ~S_ISUID; + } + if (mask & CEPH_SETATTR_KILL_SGID) { + in->mode &= ~S_ISGID; + } + } + mask &= ~(CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID); + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + } + + if (mask & CEPH_SETATTR_BTIME) { + ldout(cct,10) << "changing btime to " << in->btime << dendl; + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->btime = utime_t(stx->stx_btime); + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_BTIME; + } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) || + in->btime != utime_t(stx->stx_btime)) { + args.setattr.btime = utime_t(stx->stx_btime); + inode_drop |= CEPH_CAP_AUTH_SHARED; + } else { + mask &= ~CEPH_SETATTR_BTIME; + } + } + + if (mask & CEPH_SETATTR_FSCRYPT_AUTH) { + ldout(cct,10) << "resetting cached fscrypt_auth field. size now " + << in->fscrypt_auth.size() << dendl; + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->fscrypt_auth = *aux; + in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_FSCRYPT_AUTH; + } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) || + in->fscrypt_auth != *aux) { + inode_drop |= CEPH_CAP_AUTH_SHARED; + } else { + mask &= ~CEPH_SETATTR_FSCRYPT_AUTH; + } + } + + if (mask & CEPH_SETATTR_SIZE) { + if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) { + //too big! + ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl; + return -CEPHFS_EFBIG; + } + + ldout(cct,10) << "changing size to " << stx->stx_size << dendl; + if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) && + !(mask & CEPH_SETATTR_KILL_SGUID) && + stx->stx_size >= in->size) { + if (stx->stx_size > in->size) { + in->size = in->reported_size = stx->stx_size; + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->mark_caps_dirty(CEPH_CAP_FILE_EXCL); + mask &= ~(CEPH_SETATTR_SIZE); + mask |= CEPH_SETATTR_MTIME; + } else { + // ignore it when size doesn't change + mask &= ~(CEPH_SETATTR_SIZE); + } + } else { + args.setattr.size = stx->stx_size; + inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + + if (mask & CEPH_SETATTR_FSCRYPT_FILE) { + ldout(cct,10) << "resetting cached fscrypt_file field. size now " + << in->fscrypt_file.size() << dendl; + + if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->fscrypt_file = *aux; + in->mark_caps_dirty(CEPH_CAP_FILE_EXCL); + mask &= ~CEPH_SETATTR_FSCRYPT_FILE; + } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) || + in->fscrypt_file != *aux) { + inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } else { + mask &= ~CEPH_SETATTR_FSCRYPT_FILE; + } + } + + if (mask & CEPH_SETATTR_MTIME) { + if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) { + in->mtime = utime_t(stx->stx_mtime); + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->time_warp_seq++; + in->mark_caps_dirty(CEPH_CAP_FILE_EXCL); + mask &= ~CEPH_SETATTR_MTIME; + } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) && + utime_t(stx->stx_mtime) > in->mtime) { + in->mtime = utime_t(stx->stx_mtime); + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + mask &= ~CEPH_SETATTR_MTIME; + } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) || + in->mtime != utime_t(stx->stx_mtime)) { + args.setattr.mtime = utime_t(stx->stx_mtime); + inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } else { + mask &= ~CEPH_SETATTR_MTIME; + } + } + + if (mask & CEPH_SETATTR_ATIME) { + if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) { + in->atime = utime_t(stx->stx_atime); + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->time_warp_seq++; + in->mark_caps_dirty(CEPH_CAP_FILE_EXCL); + mask &= ~CEPH_SETATTR_ATIME; + } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) && + utime_t(stx->stx_atime) > in->atime) { + in->atime = utime_t(stx->stx_atime); + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + mask &= ~CEPH_SETATTR_ATIME; + } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) || + in->atime != utime_t(stx->stx_atime)) { + args.setattr.atime = utime_t(stx->stx_atime); + inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } else { + mask &= ~CEPH_SETATTR_ATIME; + } + } + + if (!mask) { + in->change_attr++; + if (in->is_dir() && in->snapid == CEPH_NOSNAP) { + vinodeno_t vino(in->ino, CEPH_SNAPDIR); + if (inode_map.count(vino)) { + refresh_snapdir_attrs(inode_map[vino], in); + } + } + return 0; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR); + + filepath path; + + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + + req->head.args = args; + req->inode_drop = inode_drop; + if (mask & CEPH_SETATTR_FSCRYPT_AUTH) { + req->fscrypt_auth = *aux; + } else if (mask & CEPH_SETATTR_FSCRYPT_FILE) { + req->fscrypt_file = *aux; + } + req->head.args.setattr.mask = mask; + req->regetattr_mask = mask; + + int res = make_request(req, perms, inp); + ldout(cct, 10) << "_setattr result=" << res << dendl; + return res; +} + +/* Note that we only care about attrs that setattr cares about */ +void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx) +{ + stx->stx_size = st->st_size; + stx->stx_mode = st->st_mode; + stx->stx_uid = st->st_uid; + stx->stx_gid = st->st_gid; +#ifdef __APPLE__ + stx->stx_mtime = st->st_mtimespec; + stx->stx_atime = st->st_atimespec; +#elif __WIN32 + stx->stx_mtime.tv_sec = st->st_mtime; + stx->stx_mtime.tv_nsec = 0; + stx->stx_atime.tv_sec = st->st_atime; + stx->stx_atime.tv_nsec = 0; +#else + stx->stx_mtime = st->st_mtim; + stx->stx_atime = st->st_atim; +#endif +} + +int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp) +{ + if (mask & CEPH_SETATTR_SIZE) { + mask |= clear_suid_sgid(in, perms, true); + } + + int ret = _do_setattr(in, stx, mask, perms, inp); + if (ret < 0) + return ret; + if (mask & CEPH_SETATTR_MODE) + ret = _posix_acl_chmod(in, stx->stx_mode, perms); + return ret; +} + +int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask, + const UserPerm& perms) +{ + mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID | + CEPH_SETATTR_GID | CEPH_SETATTR_MTIME | + CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE | + CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME); + if (cct->_conf->client_permissions) { + int r = may_setattr(in.get(), stx, mask, perms); + if (r < 0) + return r; + } + return __setattrx(in.get(), stx, mask, perms); +} + +int Client::_setattr(InodeRef &in, struct stat *attr, int mask, + const UserPerm& perms) +{ + struct ceph_statx stx; + + stat_to_statx(attr, &stx); + mask &= ~CEPH_SETATTR_BTIME; + + if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) { + mask &= ~CEPH_SETATTR_UID; + } + if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) { + mask &= ~CEPH_SETATTR_GID; + } + + return _setattrx(in, &stx, mask, perms); +} + +int Client::setattr(const char *relpath, struct stat *attr, int mask, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mask << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + return _setattr(in, attr, mask, perms); +} + +int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask, + const UserPerm& perms, int flags) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mask << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW)); + if (r < 0) + return r; + return _setattrx(in, stx, mask, perms); +} + +int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << mask << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + return _setattr(f->inode, attr, mask, perms); +} + +int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << mask << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + return _setattrx(f->inode, stx, mask, perms); +} + +int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms, + frag_info_t *dirstat, int mask) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl; + tout(cct) << "stat" << std::endl; + tout(cct) << relpath << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms, true, mask); + if (r < 0) + return r; + r = _getattr(in, mask, perms); + if (r < 0) { + ldout(cct, 3) << __func__ << " exit on error!" << dendl; + return r; + } + fill_stat(in, stbuf, dirstat); + ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl; + return r; +} + +unsigned Client::statx_to_mask(unsigned int flags, unsigned int want) +{ + unsigned mask = 0; + + /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */ + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC) + goto out; + + /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */ + mask |= CEPH_CAP_PIN; + if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION)) + mask |= CEPH_CAP_AUTH_SHARED; + if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION)) + mask |= CEPH_CAP_LINK_SHARED; + if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION)) + mask |= CEPH_CAP_FILE_SHARED; + if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME)) + mask |= CEPH_CAP_XATTR_SHARED; +out: + return mask; +} + +int Client::statx(const char *relpath, struct ceph_statx *stx, + const UserPerm& perms, + unsigned int want, unsigned int flags) +{ + return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags); +} + +int Client::lstat(const char *relpath, struct stat *stbuf, + const UserPerm& perms, frag_info_t *dirstat, int mask) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + // don't follow symlinks + int r = path_walk(path, &in, perms, false, mask); + if (r < 0) + return r; + r = _getattr(in, mask, perms); + if (r < 0) { + ldout(cct, 3) << __func__ << " exit on error!" << dendl; + return r; + } + fill_stat(in, stbuf, dirstat); + ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl; + return r; +} + +int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat) +{ + ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid + << " mode 0" << oct << in->mode << dec + << " mtime " << in->mtime << " ctime " << in->ctime << dendl; + memset(st, 0, sizeof(struct stat)); + if (use_faked_inos()) + st->st_ino = in->faked_ino; + else + st->st_ino = in->ino; + st->st_dev = in->snapid; + st->st_mode = in->mode; + st->st_rdev = in->rdev; + if (in->is_dir()) { + switch (in->nlink) { + case 0: + st->st_nlink = 0; /* dir is unlinked */ + break; + case 1: + st->st_nlink = 1 /* parent dentry */ + + 1 /* <dir>/. */ + + in->dirstat.nsubdirs; /* include <dir>/. self-reference */ + break; + default: + ceph_abort(); + } + } else { + st->st_nlink = in->nlink; + } + st->st_uid = in->uid; + st->st_gid = in->gid; + if (in->ctime > in->mtime) { + stat_set_ctime_sec(st, in->ctime.sec()); + stat_set_ctime_nsec(st, in->ctime.nsec()); + } else { + stat_set_ctime_sec(st, in->mtime.sec()); + stat_set_ctime_nsec(st, in->mtime.nsec()); + } + stat_set_atime_sec(st, in->atime.sec()); + stat_set_atime_nsec(st, in->atime.nsec()); + stat_set_mtime_sec(st, in->mtime.sec()); + stat_set_mtime_nsec(st, in->mtime.nsec()); + if (in->is_dir()) { + if (cct->_conf->client_dirsize_rbytes) { + st->st_size = in->rstat.rbytes; + } else if (in->snapid == CEPH_SNAPDIR) { + SnapRealm *realm = get_snap_realm_maybe(in->vino().ino); + if (realm) { + st->st_size = realm->my_snaps.size(); + put_snap_realm(realm); + } + } else { + st->st_size = in->dirstat.size(); + } +// The Windows "stat" structure provides just a subset of the fields that are +// available on Linux. +#ifndef _WIN32 + st->st_blocks = 1; +#endif + } else { + st->st_size = in->size; +#ifndef _WIN32 + st->st_blocks = (in->size + 511) >> 9; +#endif + } +#ifndef _WIN32 + st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096); +#endif + + if (dirstat) + *dirstat = in->dirstat; + if (rstat) + *rstat = in->rstat; + + return in->caps_issued(); +} + +void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx) +{ + ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid + << " mode 0" << oct << in->mode << dec + << " mtime " << in->mtime << " ctime " << in->ctime << " change_attr " << in->change_attr << dendl; + memset(stx, 0, sizeof(struct ceph_statx)); + + /* + * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask + * so that all bits are set. + */ + if (!mask) + mask = ~0; + + /* These are always considered to be available */ + stx->stx_dev = in->snapid; + stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096); + + /* Type bits are always set, even when CEPH_STATX_MODE is not */ + stx->stx_mode = S_IFMT & in->mode; + stx->stx_ino = use_faked_inos() ? in->faked_ino : (uint64_t)in->ino; + stx->stx_rdev = in->rdev; + stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV); + + if (mask & CEPH_CAP_AUTH_SHARED) { + stx->stx_uid = in->uid; + stx->stx_gid = in->gid; + stx->stx_mode = in->mode; + in->btime.to_timespec(&stx->stx_btime); + stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME); + } + + if (mask & CEPH_CAP_LINK_SHARED) { + if (in->is_dir()) { + switch (in->nlink) { + case 0: + stx->stx_nlink = 0; /* dir is unlinked */ + break; + case 1: + stx->stx_nlink = 1 /* parent dentry */ + + 1 /* <dir>/. */ + + in->dirstat.nsubdirs; /* include <dir>/. self-reference */ + break; + default: + ceph_abort(); + } + } else { + stx->stx_nlink = in->nlink; + } + stx->stx_mask |= CEPH_STATX_NLINK; + } + + if (mask & CEPH_CAP_FILE_SHARED) { + + in->atime.to_timespec(&stx->stx_atime); + in->mtime.to_timespec(&stx->stx_mtime); + + if (in->is_dir()) { + if (cct->_conf->client_dirsize_rbytes) { + stx->stx_size = in->rstat.rbytes; + } else if (in->snapid == CEPH_SNAPDIR) { + SnapRealm *realm = get_snap_realm_maybe(in->vino().ino); + if (realm) { + stx->stx_size = realm->my_snaps.size(); + put_snap_realm(realm); + } + } else { + stx->stx_size = in->dirstat.size(); + } + stx->stx_blocks = 1; + } else { + stx->stx_size = in->size; + stx->stx_blocks = (in->size + 511) >> 9; + } + stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME| + CEPH_STATX_SIZE|CEPH_STATX_BLOCKS); + } + + /* Change time and change_attr both require all shared caps to view */ + if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) { + stx->stx_version = in->change_attr; + if (in->ctime > in->mtime) + in->ctime.to_timespec(&stx->stx_ctime); + else + in->mtime.to_timespec(&stx->stx_ctime); + stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION); + } + +} + +void Client::touch_dn(Dentry *dn) +{ + lru.lru_touch(dn); +} + +int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms) +{ + return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms); +} + +int Client::fchmod(int fd, mode_t mode, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << mode << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + struct stat attr; + attr.st_mode = mode; + return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms); +} + +int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags, + const UserPerm& perms) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << flags << std::endl; + + filepath path(relpath); + InodeRef in; + InodeRef dirinode; + + std::scoped_lock lock(client_lock); + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode); + if (r < 0) { + return r; + } + struct stat attr; + attr.st_mode = mode; + return _setattr(in, &attr, CEPH_SETATTR_MODE, perms); +} + +int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms) +{ + return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms); +} + +int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid, + const UserPerm& perms) +{ + return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms); +} + +int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << new_uid << std::endl; + tout(cct) << new_gid << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + struct stat attr; + attr.st_uid = new_uid; + attr.st_gid = new_gid; + int mask = 0; + if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID; + if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID; + return _setattr(f->inode, &attr, mask, perms); +} + +int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid, + const UserPerm& perms) +{ + return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms); +} + +int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid, + int flags, const UserPerm& perms) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << new_uid << std::endl; + tout(cct) << new_gid << std::endl; + tout(cct) << flags << std::endl; + + filepath path(relpath); + InodeRef in; + InodeRef dirinode; + + std::scoped_lock lock(client_lock); + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode); + if (r < 0) { + return r; + } + struct stat attr; + attr.st_uid = new_uid; + attr.st_gid = new_gid; + return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms); +} + +static void attr_set_atime_and_mtime(struct stat *attr, + const utime_t &atime, + const utime_t &mtime) +{ + stat_set_atime_sec(attr, atime.tv.tv_sec); + stat_set_atime_nsec(attr, atime.tv.tv_nsec); + stat_set_mtime_sec(attr, mtime.tv.tv_sec); + stat_set_mtime_nsec(attr, mtime.tv.tv_nsec); +} + +// for [l]utime() invoke the timeval variant as the timespec +// variant are not yet implemented. for futime[s](), invoke +// the timespec variant. +int Client::utime(const char *relpath, struct utimbuf *buf, + const UserPerm& perms) +{ + struct timeval tv[2]; + tv[0].tv_sec = buf->actime; + tv[0].tv_usec = 0; + tv[1].tv_sec = buf->modtime; + tv[1].tv_usec = 0; + + return utimes(relpath, tv, perms); +} + +int Client::lutime(const char *relpath, struct utimbuf *buf, + const UserPerm& perms) +{ + struct timeval tv[2]; + tv[0].tv_sec = buf->actime; + tv[0].tv_usec = 0; + tv[1].tv_sec = buf->modtime; + tv[1].tv_usec = 0; + + return lutimes(relpath, tv, perms); +} + +int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms) +{ + struct timespec ts[2]; + ts[0].tv_sec = buf->actime; + ts[0].tv_nsec = 0; + ts[1].tv_sec = buf->modtime; + ts[1].tv_nsec = 0; + + return futimens(fd, ts, perms); +} + +int Client::utimes(const char *relpath, struct timeval times[2], + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec + << std::endl; + tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec + << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + struct ceph_statx attr; + utime_t(times[0]).to_timespec(&attr.stx_atime); + utime_t(times[1]).to_timespec(&attr.stx_mtime); + + return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms); +} + +int Client::lutimes(const char *relpath, struct timeval times[2], + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec + << std::endl; + tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec + << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms, false); + if (r < 0) + return r; + struct ceph_statx attr; + utime_t(times[0]).to_timespec(&attr.stx_atime); + utime_t(times[1]).to_timespec(&attr.stx_mtime); + + return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms); +} + +int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms) +{ + struct timespec ts[2]; + ts[0].tv_sec = times[0].tv_sec; + ts[0].tv_nsec = times[0].tv_usec * 1000; + ts[1].tv_sec = times[1].tv_sec; + ts[1].tv_nsec = times[1].tv_usec * 1000; + + return futimens(fd, ts, perms); +} + +int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec + << std::endl; + tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec + << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + struct ceph_statx attr; + utime_t(times[0]).to_timespec(&attr.stx_atime); + utime_t(times[1]).to_timespec(&attr.stx_mtime); + + return _setattrx(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms); +} + +int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags, + const UserPerm& perms) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec + << std::endl; + tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec + << std::endl; + tout(cct) << flags << std::endl; + + filepath path(relpath); + InodeRef in; + InodeRef dirinode; + + std::scoped_lock lock(client_lock); + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + +#if defined(__linux__) && defined(O_PATH) + if (flags & O_PATH) { + return -CEPHFS_EBADF; + } +#endif + + r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode); + if (r < 0) { + return r; + } + struct ceph_statx attr; + utime_t(times[0]).to_timespec(&attr.stx_atime); + utime_t(times[1]).to_timespec(&attr.stx_mtime); + + return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms); +} + +int Client::flock(int fd, int operation, uint64_t owner) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << operation << std::endl; + tout(cct) << owner << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + + return _flock(f, operation, owner); +} + +int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << relpath << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms, true); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + int r = may_open(in.get(), O_RDONLY, perms); + if (r < 0) + return r; + } + r = _opendir(in.get(), dirpp, perms); + /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */ + if (r != -CEPHFS_ENOTDIR) + tout(cct) << (uintptr_t)*dirpp << std::endl; + return r; +} + +int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << std::endl; + tout(cct) << dirfd << std::endl; + + InodeRef dirinode; + std::scoped_lock locker(client_lock); + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + if (cct->_conf->client_permissions) { + r = may_open(dirinode.get(), O_RDONLY, perms); + if (r < 0) { + return r; + } + } + r = _opendir(dirinode.get(), dirpp, perms); + /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */ + if (r != -CEPHFS_ENOTDIR) { + tout(cct) << (uintptr_t)*dirpp << std::endl; + } + return r; +} + +int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms) +{ + if (!in->is_dir()) + return -CEPHFS_ENOTDIR; + *dirpp = new dir_result_t(in, perms); + opened_dirs.insert(*dirpp); + ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl; + return 0; +} + + +int Client::closedir(dir_result_t *dir) +{ + tout(cct) << __func__ << std::endl; + tout(cct) << (uintptr_t)dir << std::endl; + + ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl; + std::scoped_lock lock(client_lock); + _closedir(dir); + return 0; +} + +void Client::_closedir(dir_result_t *dirp) +{ + ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl; + + if (dirp->inode) { + ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl; + dirp->inode.reset(); + } + _readdir_drop_dirp_buffer(dirp); + opened_dirs.erase(dirp); + delete dirp; +} + +void Client::rewinddir(dir_result_t *dirp) +{ + ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + std::scoped_lock lock(client_lock); + dir_result_t *d = static_cast<dir_result_t*>(dirp); + _readdir_drop_dirp_buffer(d); + d->reset(); +} + +loff_t Client::telldir(dir_result_t *dirp) +{ + dir_result_t *d = static_cast<dir_result_t*>(dirp); + ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl; + return d->offset; +} + +void Client::seekdir(dir_result_t *dirp, loff_t offset) +{ + ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + std::scoped_lock lock(client_lock); + + if (offset == dirp->offset) + return; + + if (offset > dirp->offset) + dirp->release_count = 0; // bump if we do a forward seek + else + dirp->ordered_count = 0; // disable filling readdir cache + + if (dirp->hash_order()) { + if (dirp->offset > offset) { + _readdir_drop_dirp_buffer(dirp); + dirp->reset(); + } + } else { + if (offset == 0 || + dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) || + dirp->offset_low() > dir_result_t::fpos_low(offset)) { + _readdir_drop_dirp_buffer(dirp); + dirp->reset(); + } + } + + dirp->offset = offset; +} + + +//struct dirent { +// ino_t d_ino; /* inode number */ +// off_t d_off; /* offset to the next dirent */ +// unsigned short d_reclen; /* length of this record */ +// unsigned char d_type; /* type of file */ +// char d_name[256]; /* filename */ +//}; +void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off) +{ + strncpy(de->d_name, name, 255); + de->d_name[255] = '\0'; +#if !defined(__CYGWIN__) && !(defined(_WIN32)) + de->d_ino = ino; +#if !defined(__APPLE__) && !defined(__FreeBSD__) + de->d_off = next_off; +#endif + de->d_reclen = 1; + de->d_type = IFTODT(type); + ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino) + << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl; +#endif +} + +void Client::_readdir_next_frag(dir_result_t *dirp) +{ + frag_t fg = dirp->buffer_frag; + + if (fg.is_rightmost()) { + ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl; + dirp->set_end(); + return; + } + + // advance + fg = fg.next(); + ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl; + + if (dirp->hash_order()) { + // keep last_name + int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true); + if (dirp->offset < new_offset) // don't decrease offset + dirp->offset = new_offset; + } else { + dirp->last_name.clear(); + dirp->offset = dir_result_t::make_fpos(fg, 2, false); + _readdir_rechoose_frag(dirp); + } +} + +void Client::_readdir_rechoose_frag(dir_result_t *dirp) +{ + ceph_assert(dirp->inode); + + if (dirp->hash_order()) + return; + + frag_t cur = frag_t(dirp->offset_high()); + frag_t fg = dirp->inode->dirfragtree[cur.value()]; + if (fg != cur) { + ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl; + dirp->offset = dir_result_t::make_fpos(fg, 2, false); + dirp->last_name.clear(); + dirp->next_offset = 2; + } +} + +void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp) +{ + ldout(cct, 10) << __func__ << " " << dirp << dendl; + dirp->buffer.clear(); +} + +int Client::_readdir_get_frag(int op, dir_result_t* dirp, + fill_readdir_args_cb_t fill_req_cb) +{ + ceph_assert(dirp); + ceph_assert(dirp->inode); + + // get the current frag. + frag_t fg; + if (dirp->hash_order()) + fg = dirp->inode->dirfragtree[dirp->offset_high()]; + else + fg = frag_t(dirp->offset_high()); + + ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg + << " offset " << hex << dirp->offset << dec << dendl; + + InodeRef& diri = dirp->inode; + + MetaRequest *req = new MetaRequest(op); + fill_req_cb(dirp, req, diri, fg); + + bufferlist dirbl; + int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl); + + if (res == -CEPHFS_EAGAIN) { + ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl; + _readdir_rechoose_frag(dirp); + return _readdir_get_frag(op, dirp, fill_req_cb); + } + + if (res == 0) { + ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag + << " size " << dirp->buffer.size() << dendl; + } else { + ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl; + dirp->set_end(); + } + + return res; +} + +struct dentry_off_lt { + bool operator()(const Dentry* dn, int64_t off) const { + return dir_result_t::fpos_cmp(dn->offset, off) < 0; + } +}; + +int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, + int caps, bool getref) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino + << " last_name " << dirp->last_name + << " offset " << hex << dirp->offset << dec + << dendl; + Dir *dir = dirp->inode->dir; + + if (!dir) { + ldout(cct, 10) << " dir is empty" << dendl; + dirp->set_end(); + return 0; + } + + vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(), + dir->readdir_cache.end(), + dirp->offset, dentry_off_lt()); + + string dn_name; + while (true) { + int mask = caps; + if (!dirp->inode->is_complete_and_ordered()) + return -CEPHFS_EAGAIN; + if (pd == dir->readdir_cache.end()) + break; + Dentry *dn = *pd; + if (dn->inode == NULL) { + ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl; + ++pd; + continue; + } + if (dn->cap_shared_gen != dir->parent_inode->shared_gen) { + ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl; + ++pd; + continue; + } + + int idx = pd - dir->readdir_cache.begin(); + if (dn->inode->is_dir()) { + mask |= CEPH_STAT_RSTAT; + } + int r = _getattr(dn->inode, mask, dirp->perms); + if (r < 0) + return r; + + // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator + pd = dir->readdir_cache.begin() + idx; + if (pd >= dir->readdir_cache.end() || *pd != dn) + return -CEPHFS_EAGAIN; + + struct ceph_statx stx; + struct dirent de; + fill_statx(dn->inode, caps, &stx); + + uint64_t next_off = dn->offset + 1; + fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off); + ++pd; + if (pd == dir->readdir_cache.end()) + next_off = dir_result_t::END; + + Inode *in = NULL; + if (getref) { + in = dn->inode.get(); + _ll_get(in); + } + + dn_name = dn->name; // fill in name while we have lock + + client_lock.unlock(); + r = cb(p, &de, &stx, next_off, in); // _next_ offset + client_lock.lock(); + ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec + << " = " << r << dendl; + if (r < 0) { + return r; + } + + dirp->offset = next_off; + if (dirp->at_end()) + dirp->next_offset = 2; + else + dirp->next_offset = dirp->offset_low(); + dirp->last_name = dn_name; // we successfully returned this one; update! + dirp->release_count = 0; // last_name no longer match cache index + if (r > 0) + return r; + } + + ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl; + dirp->set_end(); + return 0; +} + +int Client::readdir_r_cb(dir_result_t* d, + add_dirent_cb_t cb, + void* p, + unsigned want, + unsigned flags, + bool getref) +{ + auto fill_readdir_cb = [](dir_result_t* dirp, + MetaRequest* req, + InodeRef& diri, + frag_t fg) { + filepath path; + diri->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(diri.get()); + req->head.args.readdir.frag = fg; + req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS; + if (dirp->last_name.length()) { + req->path2.set_path(dirp->last_name); + } else if (dirp->hash_order()) { + req->head.args.readdir.offset_hash = dirp->offset_high(); + } + req->dirp = dirp; + }; + int op = CEPH_MDS_OP_READDIR; + if (d->inode && d->inode->snapid == CEPH_SNAPDIR) + op = CEPH_MDS_OP_LSSNAP; + return _readdir_r_cb(op, + d, + cb, + fill_readdir_cb, + p, + want, + flags, + getref, + false); +} + +// +// NB: this is used for both readdir and readdir_snapdiff results processing +// hence it should be request type agnostic +// +int Client::_readdir_r_cb(int op, + dir_result_t *d, + add_dirent_cb_t cb, + fill_readdir_args_cb_t fill_cb, + void *p, + unsigned want, + unsigned flags, + bool getref, + bool bypass_cache) +{ + int caps = statx_to_mask(flags, want); + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::unique_lock cl(client_lock); + + dir_result_t *dirp = static_cast<dir_result_t*>(d); + + ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset + << dec << " at_end=" << dirp->at_end() + << " hash_order=" << dirp->hash_order() << dendl; + + struct dirent de; + struct ceph_statx stx; + memset(&de, 0, sizeof(de)); + memset(&stx, 0, sizeof(stx)); + + InodeRef& diri = dirp->inode; + + if (dirp->at_end()) + return 0; + + if (dirp->offset == 0) { + ldout(cct, 15) << " including ." << dendl; + ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir + uint64_t next_off = 1; + + int r; + r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms); + if (r < 0) + return r; + + fill_statx(diri, caps, &stx); + fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off); + + Inode *inode = NULL; + if (getref) { + inode = diri.get(); + _ll_get(inode); + } + + cl.unlock(); + r = cb(p, &de, &stx, next_off, inode); + cl.lock(); + if (r < 0) + return r; + + dirp->offset = next_off; + if (r > 0) + return r; + } + if (dirp->offset == 1) { + ldout(cct, 15) << " including .." << dendl; + uint64_t next_off = 2; + InodeRef in; + if (diri->dentries.empty()) + in = diri; + else + in = diri->get_first_parent()->dir->parent_inode; + + int r; + r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms); + if (r < 0) + return r; + + fill_statx(in, caps, &stx); + fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off); + + Inode *inode = NULL; + if (getref) { + inode = in.get(); + _ll_get(inode); + } + + cl.unlock(); + r = cb(p, &de, &stx, next_off, inode); + cl.lock(); + if (r < 0) + return r; + + dirp->offset = next_off; + if (r > 0) + return r; + } + + // can we read from our cache? + ldout(cct, 10) << __func__ + << " offset " << hex << dirp->offset << dec + << " snapid " << dirp->inode->snapid << " (complete && ordered) " + << dirp->inode->is_complete_and_ordered() + << " issued " << ccap_string(dirp->inode->caps_issued()) + << dendl; + if (!bypass_cache && + dirp->inode->snapid != CEPH_SNAPDIR && + dirp->inode->is_complete_and_ordered() && + dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) { + int err = _readdir_cache_cb(dirp, cb, p, caps, getref); + if (err != -CEPHFS_EAGAIN) + return err; + } + + while (1) { + if (dirp->at_end()) + return 0; + + bool check_caps = true; + if (!dirp->is_cached()) { + int r = _readdir_get_frag(op, dirp, fill_cb); + if (r) + return r; + // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is + // different than the requested one. (our dirfragtree was outdated) + check_caps = false; + } + frag_t fg = dirp->buffer_frag; + + ldout(cct, 10) << __func__ + << " frag " << fg << " buffer size " << dirp->buffer.size() + << " offset " << hex << dirp->offset << dendl; + + for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(), + dirp->offset, dir_result_t::dentry_off_lt()); + it != dirp->buffer.end(); + ++it) { + dir_result_t::dentry &entry = *it; + + uint64_t next_off = entry.offset + 1; + + int r; + if (check_caps) { + int mask = caps; + if(entry.inode->is_dir()){ + mask |= CEPH_STAT_RSTAT; + } + r = _getattr(entry.inode, mask, dirp->perms); + if (r < 0) + return r; + } + + fill_statx(entry.inode, caps, &stx); + fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off); + + Inode *inode = NULL; + if (getref) { + inode = entry.inode.get(); + _ll_get(inode); + } + + cl.unlock(); + r = cb(p, &de, &stx, next_off, inode); // _next_ offset + cl.lock(); + + ldout(cct, 15) << __func__ + << " de " << de.d_name << " off " << hex << next_off - 1 << dec + << " snap " << entry.inode->snapid + << " = " << r << dendl; + if (r < 0) + return r; + + dirp->offset = next_off; + if (r > 0) + return r; + } + + if (dirp->next_offset > 2) { + ldout(cct, 10) << " fetching next chunk of this frag" << dendl; + _readdir_drop_dirp_buffer(dirp); + continue; // more! + } + + if (!fg.is_rightmost()) { + // next frag! + _readdir_next_frag(dirp); + continue; + } + + if (!bypass_cache && + diri->shared_gen == dirp->start_shared_gen && + diri->dir_release_count == dirp->release_count) { + if (diri->dir_ordered_count == dirp->ordered_count) { + ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl; + if (diri->dir) { + ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index); + diri->dir->readdir_cache.resize(dirp->cache_index); + } + diri->flags |= I_COMPLETE | I_DIR_ORDERED; + } else { + ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl; + diri->flags |= I_COMPLETE; + } + } + + dirp->set_end(); + return 0; + } + ceph_abort(); + return 0; +} + + +int Client::readdir_r(dir_result_t *d, struct dirent *de) +{ + return readdirplus_r(d, de, 0, 0, 0, NULL); +} + +/* + * readdirplus_r + * + * returns + * 1 if we got a dirent + * 0 for end of directory + * <0 on error + */ + +struct single_readdir { + struct dirent *de; + struct ceph_statx *stx; + Inode *inode; + bool full; +}; + +static int _readdir_single_dirent_cb(void *p, struct dirent *de, + struct ceph_statx *stx, off_t off, + Inode *in) +{ + single_readdir *c = static_cast<single_readdir *>(p); + + if (c->full) + return -1; // already filled this dirent + + *c->de = *de; + if (c->stx) + *c->stx = *stx; + c->inode = in; + c->full = true; + return 1; +} + +struct dirent *Client::readdir(dir_result_t *d) +{ + int ret; + auto& de = d->de; + single_readdir sr; + sr.de = &de; + sr.stx = NULL; + sr.inode = NULL; + sr.full = false; + + // our callback fills the dirent and sets sr.full=true on first + // call, and returns -1 the second time around. + ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr); + if (ret < -1) { + errno = -ret; // this sucks. + return (dirent *) NULL; + } + if (sr.full) { + return &de; + } + return (dirent *) NULL; +} + +int Client::readdirplus_r(dir_result_t *d, struct dirent *de, + struct ceph_statx *stx, unsigned want, + unsigned flags, Inode **out) +{ + single_readdir sr; + sr.de = de; + sr.stx = stx; + sr.inode = NULL; + sr.full = false; + + // our callback fills the dirent and sets sr.full=true on first + // call, and returns -1 the second time around. + int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out); + if (r < -1) + return r; + if (out) + *out = sr.inode; + if (sr.full) + return 1; + return 0; +} + +int Client::readdir_snapdiff(dir_result_t* d1, snapid_t snap2, + struct dirent* out_de, + snapid_t* out_snap) +{ + if (!d1 || !d1->inode || d1->inode->snapid == snap2) { + lderr(cct) << __func__ << " invalid parameters: " + << " d1:" << d1 + << " d1->inode:" << (d1 ? d1->inode : nullptr) + << " snap2 id :" << snap2 + << dendl; + errno = EINVAL; + return -errno; + } + + auto& de = d1->de; + ceph_statx stx; + single_readdir sr; + sr.de = &de; + sr.stx = &stx; + sr.inode = NULL; + sr.full = false; + + auto fill_snapdiff_cb = [&](dir_result_t* dirp, + MetaRequest* req, + InodeRef& diri, + frag_t fg) { + filepath path; + diri->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(diri.get()); + req->head.args.snapdiff.snap_other = snap2; + req->head.args.snapdiff.frag = fg; + req->head.args.snapdiff.flags = CEPH_READDIR_REPLY_BITFLAGS; + if (dirp->last_name.length()) { + req->path2.set_path(dirp->last_name); + } else if (dirp->hash_order()) { + req->head.args.snapdiff.offset_hash = dirp->offset_high(); + } + req->dirp = dirp; + }; + + // our callback fills the dirent and sets sr.full=true on first + // call, and returns -1 the second time around. + int ret = _readdir_r_cb(CEPH_MDS_OP_READDIR_SNAPDIFF, + d1, + _readdir_single_dirent_cb, + fill_snapdiff_cb, + (void*)&sr, + 0, + AT_STATX_DONT_SYNC, + false, + true); + if (ret < -1) { + lderr(cct) << __func__ << " error: " + << cpp_strerror(ret) + << dendl; + errno = -ret; // this sucks. + return ret; + } + + ldout(cct, 15) << __func__ << " " << ret + << " " << sr.de->d_name + << " " << stx.stx_dev + << dendl; + if (sr.full) { + if (out_de) { + *out_de = de; + } + if (out_snap) { + *out_snap = stx.stx_dev; + } + return 1; + } + return 0; +} + +/* getdents */ +struct getdents_result { + char *buf; + int buflen; + int pos; + bool fullent; +}; + +static int _readdir_getdent_cb(void *p, struct dirent *de, + struct ceph_statx *stx, off_t off, Inode *in) +{ + struct getdents_result *c = static_cast<getdents_result *>(p); + + int dlen; + if (c->fullent) + dlen = sizeof(*de); + else + dlen = strlen(de->d_name) + 1; + + if (c->pos + dlen > c->buflen) + return -1; // doesn't fit + + if (c->fullent) { + memcpy(c->buf + c->pos, de, sizeof(*de)); + } else { + memcpy(c->buf + c->pos, de->d_name, dlen); + } + c->pos += dlen; + return 0; +} + +int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent) +{ + getdents_result gr; + gr.buf = buf; + gr.buflen = buflen; + gr.fullent = fullent; + gr.pos = 0; + + int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr); + + if (r < 0) { // some error + if (r == -1) { // buffer ran out of space + if (gr.pos) { // but we got some entries already! + return gr.pos; + } // or we need a larger buffer + return -CEPHFS_ERANGE; + } else { // actual error, return it + return r; + } + } + return gr.pos; +} + + +/* getdir */ +struct getdir_result { + list<string> *contents; + int num; +}; + +static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in) +{ + getdir_result *r = static_cast<getdir_result *>(p); + + r->contents->push_back(de->d_name); + r->num++; + return 0; +} + +int Client::getdir(const char *relpath, list<string>& contents, + const UserPerm& perms) +{ + ldout(cct, 3) << "getdir(" << relpath << ")" << dendl; + tout(cct) << "getdir" << std::endl; + tout(cct) << relpath << std::endl; + + dir_result_t *d; + int r = opendir(relpath, &d, perms); + if (r < 0) + return r; + + getdir_result gr; + gr.contents = &contents; + gr.num = 0; + r = readdir_r_cb(d, _getdir_cb, (void *)&gr); + + closedir(d); + + if (r < 0) + return r; + return gr.num; +} + + +/****** file i/o **********/ + +// common parts for open and openat. call with client_lock locked. +int Client::create_and_open(int dirfd, const char *relpath, int flags, + const UserPerm& perms, mode_t mode, int stripe_unit, + int stripe_count, int object_size, const char *data_pool, + std::string alternate_name) { + ceph_assert(ceph_mutex_is_locked(client_lock)); + int cflags = ceph_flags_sys2wire(flags); + tout(cct) << cflags << std::endl; + + Fh *fh = NULL; + +#if defined(__linux__) && defined(O_PATH) + /* When the O_PATH is being specified, others flags than O_DIRECTORY + * and O_NOFOLLOW are ignored. Please refer do_entry_open() function + * in kernel (fs/open.c). */ + if (flags & O_PATH) + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; +#endif + + filepath path(relpath); + InodeRef in; + bool created = false; + /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */ + bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL))); + int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags)); + + InodeRef dirinode = nullptr; + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + r = path_walk(path, &in, perms, followsym, mask, dirinode); + if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL)) + return -CEPHFS_EEXIST; + +#if defined(__linux__) && defined(O_PATH) + if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH)) +#else + if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW)) +#endif + return -CEPHFS_ELOOP; + + if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) { + filepath dirpath = path; + string dname = dirpath.last_dentry(); + dirpath.pop_dentry(); + InodeRef dir; + r = path_walk(dirpath, &dir, perms, true, + cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode); + if (r < 0) { + goto out; + } + if (cct->_conf->client_permissions) { + r = may_create(dir.get(), perms); + if (r < 0) + goto out; + } + r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit, + stripe_count, object_size, data_pool, &created, perms, + std::move(alternate_name)); + } + if (r < 0) + goto out; + + if (!created) { + // posix says we can only check permissions of existing files + if (cct->_conf->client_permissions) { + r = may_open(in.get(), flags, perms); + if (r < 0) + goto out; + } + } + + if (!fh) + r = _open(in.get(), flags, mode, &fh, perms); + if (r >= 0) { + // allocate a integer file descriptor + ceph_assert(fh); + r = get_fd(); + ceph_assert(fd_map.count(r) == 0); + fd_map[r] = fh; + } + + out: + return r; +} + +int Client::open(const char *relpath, int flags, const UserPerm& perms, + mode_t mode, int stripe_unit, int stripe_count, + int object_size, const char *data_pool, std::string alternate_name) +{ + return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit, + stripe_count, object_size, data_pool, alternate_name); +} + +int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms, + mode_t mode, int stripe_unit, int stripe_count, int object_size, + const char *data_pool, std::string alternate_name) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << flags << std::endl; + tout(cct) << mode << std::endl; + + std::scoped_lock locker(client_lock); + int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count, + object_size, data_pool, alternate_name); + + tout(cct) << r << std::endl; + ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl; + return r; +} + +int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name, + const UserPerm& perms) +{ + ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH); + filepath path(ino); + req->set_filepath(path); + + uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name)); + char f[30]; + sprintf(f, "%u", h); + filepath path2(dirino); + path2.push_dentry(string(f)); + req->set_filepath2(path2); + + int r = make_request(req, perms, NULL, NULL, + rand() % mdsmap->get_num_in_mds()); + ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl; + return r; +} + + +/** + * Load inode into local cache. + * + * If inode pointer is non-NULL, and take a reference on + * the resulting Inode object in one operation, so that caller + * can safely assume inode will still be there after return. + */ +int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode) +{ + ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + if (is_reserved_vino(vino)) + return -CEPHFS_ESTALE; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO); + filepath path(vino.ino); + req->set_filepath(path); + + /* + * The MDS expects either a "real" snapid here or 0. The special value + * carveouts for the snapid are all at the end of the range so we can + * just look for any snapid below this value. + */ + if (vino.snapid < CEPH_NOSNAP) + req->head.args.lookupino.snapid = vino.snapid; + + int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds()); + if (r == 0 && inode != NULL) { + unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino); + ceph_assert(p != inode_map.end()); + *inode = p->second; + _ll_get(*inode); + } + ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl; + return r; +} + +int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode) +{ + vinodeno_t vino(ino, CEPH_NOSNAP); + std::scoped_lock lock(client_lock); + return _lookup_vino(vino, perms, inode); +} + +/** + * Find the parent inode of `ino` and insert it into + * our cache. Conditionally also set `parent` to a referenced + * Inode* if caller provides non-NULL value. + */ +int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent) +{ + ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT); + filepath path(ino->ino); + req->set_filepath(path); + + InodeRef target; + int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds()); + // Give caller a reference to the parent ino if they provided a pointer. + if (parent != NULL) { + if (r == 0) { + *parent = target.get(); + _ll_get(*parent); + ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl; + } else { + *parent = NULL; + } + } + ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl; + return r; +} + +/** + * Populate the parent dentry for `ino`, provided it is + * a child of `parent`. + */ +int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms) +{ + ceph_assert(parent->is_dir()); + ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME); + req->set_filepath2(filepath(parent->ino)); + req->set_filepath(filepath(ino->ino)); + req->set_inode(ino); + + int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds()); + ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl; + return r; +} + +int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms) +{ + std::scoped_lock lock(client_lock); + return _lookup_name(ino, parent, perms); +} + +Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms) +{ + ceph_assert(in); + Fh *f = new Fh(in, flags, cmode, fd_gen, perms); + + ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl; + + if (in->snapid != CEPH_NOSNAP) { + in->snap_cap_refs++; + ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps " + << ccap_string(in->caps_issued()) << dendl; + } + + const auto& conf = cct->_conf; + f->readahead.set_trigger_requests(1); + f->readahead.set_min_readahead_size(conf->client_readahead_min); + uint64_t max_readahead = Readahead::NO_LIMIT; + if (conf->client_readahead_max_bytes) { + max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes); + } + if (conf->client_readahead_max_periods) { + max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods); + } + f->readahead.set_max_readahead_size(max_readahead); + vector<uint64_t> alignments; + alignments.push_back(in->layout.get_period()); + alignments.push_back(in->layout.stripe_unit); + f->readahead.set_alignments(alignments); + + return f; +} + +int Client::_release_fh(Fh *f) +{ + //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; + //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl; + Inode *in = f->inode.get(); + ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl; + + in->unset_deleg(f); + + if (in->snapid == CEPH_NOSNAP) { + if (in->put_open_ref(f->mode)) { + _flush(in, new C_Client_FlushComplete(this, in)); + check_caps(in, 0); + } + } else { + ceph_assert(in->snap_cap_refs > 0); + in->snap_cap_refs--; + } + + _release_filelocks(f); + + // Finally, read any async err (i.e. from flushes) + int err = f->take_async_err(); + if (err != 0) { + ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = " + << cpp_strerror(err) << dendl; + } else { + ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl; + } + + _put_fh(f); + + return err; +} + +void Client::_put_fh(Fh *f) +{ + int left = f->put(); + if (!left) { + delete f; + } +} + +int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, + const UserPerm& perms) +{ + if (in->snapid != CEPH_NOSNAP && + (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) { + return -CEPHFS_EROFS; + } + + // use normalized flags to generate cmode + int cflags = ceph_flags_sys2wire(flags); + if (cct->_conf.get_val<bool>("client_force_lazyio")) + cflags |= CEPH_O_LAZY; + + int cmode = ceph_flags_to_mode(cflags); + int want = ceph_caps_for_mode(cmode); + int result = 0; + + in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps. + + if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) { + // update wanted? + check_caps(in, CHECK_CAPS_NODELAY); + } else { + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->head.args.open.flags = cflags & ~CEPH_O_CREAT; + req->head.args.open.mode = mode; + req->head.args.open.pool = -1; + if (cct->_conf->client_debug_getattr_caps) + req->head.args.open.mask = DEBUG_GETATTR_CAPS; + else + req->head.args.open.mask = 0; + req->head.args.open.old_size = in->size; // for O_TRUNC + req->set_inode(in); + result = make_request(req, perms); + + /* + * NFS expects that delegations will be broken on a conflicting open, + * not just when there is actual conflicting access to the file. SMB leases + * and oplocks also have similar semantics. + * + * Ensure that clients that have delegations enabled will wait on minimal + * caps during open, just to ensure that other clients holding delegations + * return theirs first. + */ + if (deleg_timeout && result == 0) { + int need = 0, have; + + if (cmode & CEPH_FILE_MODE_WR) + need |= CEPH_CAP_FILE_WR; + if (cmode & CEPH_FILE_MODE_RD) + need |= CEPH_CAP_FILE_RD; + + Fh fh(in, flags, cmode, fd_gen, perms); + result = get_caps(&fh, need, want, &have, -1); + if (result < 0) { + ldout(cct, 8) << "Unable to get caps after open of inode " << *in << + " . Denying open: " << + cpp_strerror(result) << dendl; + } else { + put_cap_ref(in, need); + } + } + } + + // success? + if (result >= 0) { + if (fhp) + *fhp = _create_fh(in, flags, cmode, perms); + } else { + in->put_open_ref(cmode); + } + + trim_cache(); + + return result; +} + +int Client::_renew_caps(Inode *in) +{ + int wanted = in->caps_file_wanted(); + if (in->is_any_caps() && + ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) { + check_caps(in, CHECK_CAPS_NODELAY); + return 0; + } + + int flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->head.args.open.flags = flags; + req->head.args.open.pool = -1; + if (cct->_conf->client_debug_getattr_caps) + req->head.args.open.mask = DEBUG_GETATTR_CAPS; + else + req->head.args.open.mask = 0; + req->set_inode(in); + + // duplicate in case Cap goes away; not sure if that race is a concern? + const UserPerm *pperm = in->get_best_perms(); + UserPerm perms; + if (pperm != NULL) + perms = *pperm; + int ret = make_request(req, perms); + return ret; +} + +int Client::_close(int fd) +{ + ldout(cct, 3) << "close enter(" << fd << ")" << dendl; + tout(cct) << "close" << std::endl; + tout(cct) << fd << std::endl; + + Fh *fh = get_filehandle(fd); + if (!fh) + return -CEPHFS_EBADF; + int err = _release_fh(fh); + fd_map.erase(fd); + put_fd(fd); + ldout(cct, 3) << "close exit(" << fd << ")" << dendl; + return err; +} + +int Client::close(int fd) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + return _close(fd); +} + +// ------------ +// read, write + +loff_t Client::lseek(int fd, loff_t offset, int whence) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "lseek" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << offset << std::endl; + tout(cct) << whence << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + return _lseek(f, offset, whence); +} + +loff_t Client::_lseek(Fh *f, loff_t offset, int whence) +{ + Inode *in = f->inode.get(); + bool whence_check = false; + loff_t pos = -1; + + switch (whence) { + case SEEK_END: + whence_check = true; + break; + +#ifdef SEEK_DATA + case SEEK_DATA: + whence_check = true; + break; +#endif + +#ifdef SEEK_HOLE + case SEEK_HOLE: + whence_check = true; + break; +#endif + } + + if (whence_check) { + int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); + if (r < 0) + return r; + } + + switch (whence) { + case SEEK_SET: + pos = offset; + break; + + case SEEK_CUR: + pos = f->pos + offset; + break; + + case SEEK_END: + pos = in->size + offset; + break; + +#ifdef SEEK_DATA + case SEEK_DATA: + if (offset < 0 || static_cast<uint64_t>(offset) >= in->size) + return -CEPHFS_ENXIO; + pos = offset; + break; +#endif + +#ifdef SEEK_HOLE + case SEEK_HOLE: + if (offset < 0 || static_cast<uint64_t>(offset) >= in->size) + return -CEPHFS_ENXIO; + pos = in->size; + break; +#endif + + default: + ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl; + return -CEPHFS_EINVAL; + } + + if (pos < 0) { + return -CEPHFS_EINVAL; + } else { + f->pos = pos; + } + + ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl; + return f->pos; +} + + +void Client::lock_fh_pos(Fh *f) +{ + ldout(cct, 10) << __func__ << " " << f << dendl; + + if (f->pos_locked || !f->pos_waiters.empty()) { + ceph::condition_variable cond; + f->pos_waiters.push_back(&cond); + ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl; + std::unique_lock l{client_lock, std::adopt_lock}; + cond.wait(l, [f, me=&cond] { + return !f->pos_locked && f->pos_waiters.front() == me; + }); + l.release(); + ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl; + ceph_assert(f->pos_waiters.front() == &cond); + f->pos_waiters.pop_front(); + } + + f->pos_locked = true; +} + +void Client::unlock_fh_pos(Fh *f) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + ldout(cct, 10) << __func__ << " " << f << dendl; + f->pos_locked = false; + if (!f->pos_waiters.empty()) { + // only wake up the oldest waiter + auto cond = f->pos_waiters.front(); + cond->notify_one(); + } +} + +int Client::uninline_data(Inode *in, Context *onfinish) +{ + if (!in->inline_data.length()) { + onfinish->complete(0); + return 0; + } + + char oid_buf[32]; + snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino); + object_t oid = oid_buf; + + ObjectOperation create_ops; + create_ops.create(false); + + objecter->mutate(oid, + OSDMap::file_to_object_locator(in->layout), + create_ops, + in->snaprealm->get_snap_context(), + ceph::real_clock::now(), + 0, + NULL); + + bufferlist inline_version_bl; + encode(in->inline_version, inline_version_bl); + + ObjectOperation uninline_ops; + uninline_ops.cmpxattr("inline_version", + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64, + inline_version_bl); + bufferlist inline_data = in->inline_data; + uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq); + uninline_ops.setxattr("inline_version", stringify(in->inline_version)); + + objecter->mutate(oid, + OSDMap::file_to_object_locator(in->layout), + uninline_ops, + in->snaprealm->get_snap_context(), + ceph::real_clock::now(), + 0, + onfinish); + + return 0; +} + +// + +// blocking osd interface + +int Client::read(int fd, char *buf, loff_t size, loff_t offset) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "read" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << size << std::endl; + tout(cct) << offset << std::endl; + + std::unique_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + bufferlist bl; + /* We can't return bytes written larger than INT_MAX, clamp size to that */ + size = std::min(size, (loff_t)INT_MAX); + int r = _read(f, offset, size, &bl); + ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl; + if (r >= 0) { + lock.unlock(); + bl.begin().copy(bl.length(), buf); + r = bl.length(); + } + return r; +} + +int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset) +{ + if (iovcnt < 0) + return -CEPHFS_EINVAL; + return _preadv_pwritev(fd, iov, iovcnt, offset, false); +} + +int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + int want, have = 0; + bool movepos = false; + int64_t rc = 0; + const auto& conf = cct->_conf; + Inode *in = f->inode.get(); + utime_t lat; + utime_t start = ceph_clock_now(); + + if ((f->mode & CEPH_FILE_MODE_RD) == 0) + return -CEPHFS_EBADF; + //bool lazy = f->mode == CEPH_FILE_MODE_LAZY; + + if (offset < 0) { + lock_fh_pos(f); + offset = f->pos; + movepos = true; + } + loff_t start_pos = offset; + + if (in->inline_version == 0) { + auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true); + if (r < 0) { + rc = r; + goto done; + } + ceph_assert(in->inline_version > 0); + } + +retry: + if (f->mode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + { + auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1); + if (r < 0) { + rc = r; + goto done; + } + } + if (f->flags & O_DIRECT) + have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO); + + if (in->inline_version < CEPH_INLINE_NONE) { + uint32_t len = in->inline_data.length(); + uint64_t endoff = offset + size; + if (endoff > in->size) + endoff = in->size; + + if (offset < len) { + if (endoff <= len) { + bl->substr_of(in->inline_data, offset, endoff - offset); + } else { + bl->substr_of(in->inline_data, offset, len - offset); + bl->append_zero(endoff - len); + } + rc = endoff - offset; + } else if ((uint64_t)offset < endoff) { + bl->append_zero(endoff - offset); + rc = endoff - offset; + } else { + rc = 0; + } + goto success; + } + + if (!conf->client_debug_force_sync_read && + conf->client_oc && + (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) { + + if (f->flags & O_RSYNC) { + _flush_range(in, offset, size); + } + rc = _read_async(f, offset, size, bl); + if (rc < 0) + goto done; + } else { + if (f->flags & O_DIRECT) + _flush_range(in, offset, size); + + bool checkeof = false; + rc = _read_sync(f, offset, size, bl, &checkeof); + if (rc < 0) + goto done; + if (checkeof) { + offset += rc; + size -= rc; + + put_cap_ref(in, CEPH_CAP_FILE_RD); + have = 0; + // reverify size + { + auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); + if (r < 0) { + rc = r; + goto done; + } + } + + // eof? short read. + if ((uint64_t)offset < in->size) + goto retry; + } + } + +success: + ceph_assert(rc >= 0); + update_read_io_size(bl->length()); + if (movepos) { + // adjust fd pos + f->pos = start_pos + rc; + } + + lat = ceph_clock_now(); + lat -= start; + + ++nr_read_request; + update_io_stat_read(lat); + +done: + // done! + if (have) { + put_cap_ref(in, CEPH_CAP_FILE_RD); + } + if (movepos) { + unlock_fh_pos(f); + } + return rc; +} + +Client::C_Readahead::C_Readahead(Client *c, Fh *f) : + client(c), f(f) { + f->get(); + f->readahead.inc_pending(); +} + +Client::C_Readahead::~C_Readahead() { + f->readahead.dec_pending(); + client->_put_fh(f); +} + +void Client::C_Readahead::finish(int r) { + lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl; + client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); + if (r > 0) { + client->update_read_io_size(r); + } +} + +int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + const auto& conf = cct->_conf; + Inode *in = f->inode.get(); + + ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl; + + // trim read based on file size? + if (off >= in->size) + return 0; + if (len == 0) + return 0; + if (off + len > in->size) { + len = in->size - off; + } + + ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size() + << " max_bytes=" << f->readahead.get_max_readahead_size() + << " max_periods=" << conf->client_readahead_max_periods << dendl; + + // read (and possibly block) + int r = 0; + C_SaferCond onfinish("Client::_read_async flock"); + r = objectcacher->file_read(&in->oset, &in->layout, in->snapid, + off, len, bl, 0, &onfinish); + if (r == 0) { + get_cap_ref(in, CEPH_CAP_FILE_CACHE); + client_lock.unlock(); + r = onfinish.wait(); + client_lock.lock(); + put_cap_ref(in, CEPH_CAP_FILE_CACHE); + update_read_io_size(bl->length()); + } + + if(f->readahead.get_min_readahead_size() > 0) { + pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size); + if (readahead_extent.second > 0) { + ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second + << " (caller wants " << off << "~" << len << ")" << dendl; + Context *onfinish2 = new C_Readahead(this, f); + int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid, + readahead_extent.first, readahead_extent.second, + NULL, 0, onfinish2); + if (r2 == 0) { + ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl; + get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); + } else { + ldout(cct, 20) << "readahead was no-op, already cached" << dendl; + delete onfinish2; + } + } + } + + return r; +} + +int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, + bool *checkeof) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + Inode *in = f->inode.get(); + uint64_t pos = off; + int left = len; + int read = 0; + + ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl; + + // 0 success, 1 continue and < 0 error happen. + auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) { + int r = onfinish.wait(); + + // if we get ENOENT from OSD, assume 0 bytes returned + if (r == -CEPHFS_ENOENT) + r = 0; + if (r < 0) + return r; + + if (tbl.length()) { + r = tbl.length(); + + read += r; + pos += r; + left -= r; + bl->claim_append(tbl); + } + // short read? + if (r >= 0 && r < wanted) { + if (pos < in->size) { + // zero up to known EOF + int64_t some = in->size - pos; + if (some > left) + some = left; + auto z = buffer::ptr_node::create(some); + z->zero(); + bl->push_back(std::move(z)); + read += some; + pos += some; + left -= some; + if (left == 0) + return 0; + } + + *checkeof = true; + return 0; + } + return 1; + }; + + while (left > 0) { + C_SaferCond onfinish("Client::_read_sync flock"); + bufferlist tbl; + + int wanted = left; + filer->read_trunc(in->ino, &in->layout, in->snapid, + pos, left, &tbl, 0, + in->truncate_size, in->truncate_seq, + &onfinish); + client_lock.unlock(); + int r = wait_and_copy(onfinish, tbl, wanted); + client_lock.lock(); + if (!r) + return read; + if (r < 0) + return r; + } + return read; +} + +int Client::write(int fd, const char *buf, loff_t size, loff_t offset) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "write" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << size << std::endl; + tout(cct) << offset << std::endl; + + std::scoped_lock lock(client_lock); + Fh *fh = get_filehandle(fd); + if (!fh) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + /* We can't return bytes written larger than INT_MAX, clamp size to that */ + size = std::min(size, (loff_t)INT_MAX); + int r = _write(fh, offset, size, buf, NULL, false); + ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; + return r; +} + +int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset) +{ + if (iovcnt < 0) + return -CEPHFS_EINVAL; + return _preadv_pwritev(fd, iov, iovcnt, offset, true); +} + +int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov, + unsigned iovcnt, int64_t offset, + bool write, bool clamp_to_int) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + loff_t totallen = 0; + for (unsigned i = 0; i < iovcnt; i++) { + totallen += iov[i].iov_len; + } + + /* + * Some of the API functions take 64-bit size values, but only return + * 32-bit signed integers. Clamp the I/O sizes in those functions so that + * we don't do I/Os larger than the values we can return. + */ + if (clamp_to_int) { + totallen = std::min(totallen, (loff_t)INT_MAX); + } + if (write) { + int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt); + ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl; + return w; + } else { + bufferlist bl; + int64_t r = _read(fh, offset, totallen, &bl); + ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl; + if (r <= 0) + return r; + + client_lock.unlock(); + auto iter = bl.cbegin(); + for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) { + /* + * This piece of code aims to handle the case that bufferlist + * does not have enough data to fill in the iov + */ + const auto round_size = std::min<unsigned>(resid, iov[j].iov_len); + iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base)); + resid -= round_size; + /* iter is self-updating */ + } + client_lock.lock(); + return r; + } +} + +int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << fd << std::endl; + tout(cct) << offset << std::endl; + + std::scoped_lock cl(client_lock); + Fh *fh = get_filehandle(fd); + if (!fh) + return -CEPHFS_EBADF; + return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true); +} + +int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, + const struct iovec *iov, int iovcnt) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + uint64_t fpos = 0; + Inode *in = f->inode.get(); + + if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config + (uint64_t)(offset+size) > in->size ) { //exceeds filesize + return -CEPHFS_EFBIG; + } + //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; + + if (objecter->osdmap_pool_full(in->layout.pool_id)) { + return -CEPHFS_ENOSPC; + } + + ceph_assert(in->snapid == CEPH_NOSNAP); + + // was Fh opened as writeable? + if ((f->mode & CEPH_FILE_MODE_WR) == 0) + return -CEPHFS_EBADF; + + // use/adjust fd pos? + if (offset < 0) { + lock_fh_pos(f); + /* + * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may + * change out from under us. + */ + if (f->flags & O_APPEND) { + auto r = _lseek(f, 0, SEEK_END); + if (r < 0) { + unlock_fh_pos(f); + return r; + } + } + offset = f->pos; + fpos = offset+size; + unlock_fh_pos(f); + } + + // check quota + uint64_t endoff = offset + size; + if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size, + f->actor_perms)) { + return -CEPHFS_EDQUOT; + } + + //bool lazy = f->mode == CEPH_FILE_MODE_LAZY; + + ldout(cct, 10) << "cur file size is " << in->size << dendl; + + // time it. + utime_t start = ceph_clock_now(); + + if (in->inline_version == 0) { + int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true); + if (r < 0) + return r; + ceph_assert(in->inline_version > 0); + } + + // copy into fresh buffer (since our write may be resub, async) + bufferlist bl; + if (buf) { + if (size > 0) + bl.append(buf, size); + } else if (iov){ + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > 0) { + bl.append((const char *)iov[i].iov_base, iov[i].iov_len); + } + } + } + + utime_t lat; + uint64_t totalwritten; + int want, have; + if (f->mode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff); + if (r < 0) + return r; + + put_cap_ref(in, CEPH_CAP_AUTH_SHARED); + if (size > 0) { + r = clear_suid_sgid(in, f->actor_perms); + if (r < 0) { + put_cap_ref(in, CEPH_CAP_FILE_WR); + return r; + } + } + + if (f->flags & O_DIRECT) + have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO); + + ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl; + + std::unique_ptr<C_SaferCond> onuninline = nullptr; + + if (in->inline_version < CEPH_INLINE_NONE) { + if (endoff > cct->_conf->client_max_inline_size || + endoff > CEPH_INLINE_MAX_SIZE || + !(have & CEPH_CAP_FILE_BUFFER)) { + onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock")); + uninline_data(in, onuninline.get()); + } else { + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + uint32_t len = in->inline_data.length(); + + if (endoff < len) + in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX + + if (offset < len) + in->inline_data.splice(offset, len - offset); + else if (offset > len) + in->inline_data.append_zero(offset - len); + + in->inline_data.append(bl); + in->inline_version++; + + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + goto success; + } + } + + if (cct->_conf->client_oc && + (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) { + // do buffered write + if (!in->oset.dirty_or_tx) + get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER); + + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + // async, caching, non-blocking. + r = objectcacher->file_write(&in->oset, &in->layout, + in->snaprealm->get_snap_context(), + offset, size, bl, ceph::real_clock::now(), + 0); + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + if (r < 0) + goto done; + + // flush cached write if O_SYNC is set on file fh + // O_DSYNC == O_SYNC on linux < 2.6.33 + // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33 + if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) { + _flush_range(in, offset, size); + } + } else { + if (f->flags & O_DIRECT) + _flush_range(in, offset, size); + + // simple, non-atomic sync write + C_SaferCond onfinish("Client::_write flock"); + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(), + offset, size, bl, ceph::real_clock::now(), 0, + in->truncate_size, in->truncate_seq, + &onfinish); + client_lock.unlock(); + r = onfinish.wait(); + client_lock.lock(); + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + if (r < 0) + goto done; + } + + // if we get here, write was successful, update client metadata +success: + update_write_io_size(size); + // time + lat = ceph_clock_now(); + lat -= start; + + ++nr_write_request; + update_io_stat_write(lat); + + if (fpos) { + lock_fh_pos(f); + f->pos = fpos; + unlock_fh_pos(f); + } + totalwritten = size; + r = (int64_t)totalwritten; + + // extend file? + if (totalwritten + offset > in->size) { + in->size = totalwritten + offset; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + + if (is_quota_bytes_approaching(in, f->actor_perms)) { + check_caps(in, CHECK_CAPS_NODELAY); + } else if (is_max_size_approaching(in)) { + check_caps(in, 0); + } + + ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl; + } else { + ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl; + } + + // mtime + in->mtime = in->ctime = ceph_clock_now(); + in->change_attr++; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + +done: + + if (nullptr != onuninline) { + client_lock.unlock(); + int uninline_ret = onuninline->wait(); + client_lock.lock(); + + if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) { + in->inline_data.clear(); + in->inline_version = CEPH_INLINE_NONE; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + check_caps(in, 0); + } else + r = uninline_ret; + } + + put_cap_ref(in, CEPH_CAP_FILE_WR); + return r; +} + +int Client::_flush(Fh *f) +{ + Inode *in = f->inode.get(); + int err = f->take_async_err(); + if (err != 0) { + ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = " + << cpp_strerror(err) << dendl; + } else { + ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl; + } + + return err; +} + +int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms) +{ + struct ceph_statx stx; + stx.stx_size = length; + return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms); +} + +int Client::ftruncate(int fd, loff_t length, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << length << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + if ((f->mode & CEPH_FILE_MODE_WR) == 0) + return -CEPHFS_EBADF; + struct stat attr; + attr.st_size = length; + return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms); +} + +int Client::fsync(int fd, bool syncdataonly) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "fsync" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << syncdataonly << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + int r = _fsync(f, syncdataonly); + if (r == 0) { + // The IOs in this fsync were okay, but maybe something happened + // in the background that we shoudl be reporting? + r = f->take_async_err(); + ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly + << ") = 0, async_err = " << r << dendl; + } else { + // Assume that an error we encountered during fsync, even reported + // synchronously, would also have applied the error to the Fh, and we + // should clear it here to avoid returning the same error again on next + // call. + ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = " + << r << dendl; + f->take_async_err(); + } + return r; +} + +int Client::_fsync(Inode *in, bool syncdataonly) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + int r = 0; + std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr; + ceph_tid_t flush_tid = 0; + InodeRef tmp_ref; + utime_t lat; + utime_t start = ceph_clock_now(); + + ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl; + + if (cct->_conf->client_oc) { + object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock")); + tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either + _flush(in, object_cacher_completion.get()); + ldout(cct, 15) << "using return-valued form of _fsync" << dendl; + } + + if (!syncdataonly && in->dirty_caps) { + check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS); + if (in->flushing_caps) + flush_tid = last_flush_tid; + } else ldout(cct, 10) << "no metadata needs to commit" << dendl; + + if (!syncdataonly && !in->unsafe_ops.empty()) { + flush_mdlog_sync(in); + + MetaRequest *req = in->unsafe_ops.back(); + ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl; + + req->get(); + wait_on_list(req->waitfor_safe); + put_request(req); + } + + if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing + client_lock.unlock(); + ldout(cct, 15) << "waiting on data to flush" << dendl; + r = object_cacher_completion->wait(); + client_lock.lock(); + ldout(cct, 15) << "got " << r << " from flush writeback" << dendl; + } else { + // FIXME: this can starve + while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) { + ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER] + << " uncommitted, waiting" << dendl; + wait_on_list(in->waitfor_commit); + } + } + + if (!r) { + if (flush_tid > 0) + wait_sync_caps(in, flush_tid); + + ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl; + } else { + ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! " + << cpp_strerror(-r) << dendl; + } + + lat = ceph_clock_now(); + lat -= start; + logger->tinc(l_c_fsync, lat); + + return r; +} + +int Client::_fsync(Fh *f, bool syncdataonly) +{ + ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl; + return _fsync(f->inode.get(), syncdataonly); +} + +int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "fstat mask " << hex << mask << dec << std::endl; + tout(cct) << fd << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + int r = _getattr(f->inode, mask, perms); + if (r < 0) + return r; + fill_stat(f->inode, stbuf, NULL); + ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl; + return r; +} + +int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms, + unsigned int want, unsigned int flags) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl; + tout(cct) << fd << std::endl; + + std::scoped_lock lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + + unsigned mask = statx_to_mask(flags, want); + + int r = 0; + if (mask) { + r = _getattr(f->inode, mask, perms); + if (r < 0) { + ldout(cct, 3) << "fstatx exit on error!" << dendl; + return r; + } + } + + fill_statx(f->inode, mask, stx); + ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl; + return r; +} + +int Client::statxat(int dirfd, const char *relpath, + struct ceph_statx *stx, const UserPerm& perms, + unsigned int want, unsigned int flags) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl; + tout(cct) << dirfd << std::endl; + tout(cct) << relpath << std::endl; + + unsigned mask = statx_to_mask(flags, want); + + InodeRef dirinode; + std::scoped_lock lock(client_lock); + int r = get_fd_inode(dirfd, &dirinode); + if (r < 0) { + return r; + } + + InodeRef in; + filepath path(relpath); + r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode); + if (r < 0) { + return r; + } + r = _getattr(in, mask, perms); + if (r < 0) { + ldout(cct, 3) << __func__ << " exit on error!" << dendl; + return r; + } + + fill_statx(in, mask, stx); + ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl; + return r; +} + +// not written yet, but i want to link! + +int Client::chdir(const char *relpath, std::string &new_cwd, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "chdir" << std::endl; + tout(cct) << relpath << std::endl; + + filepath path(relpath); + InodeRef in; + + std::scoped_lock lock(client_lock); + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + + if (!(in.get()->is_dir())) + return -CEPHFS_ENOTDIR; + + if (cwd != in) + cwd.swap(in); + ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl; + + _getcwd(new_cwd, perms); + return 0; +} + +void Client::_getcwd(string& dir, const UserPerm& perms) +{ + filepath path; + ldout(cct, 10) << __func__ << " " << *cwd << dendl; + + Inode *in = cwd.get(); + while (in != root.get()) { + ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked + + // A cwd or ancester is unlinked + if (in->dentries.empty()) { + return; + } + + Dentry *dn = in->get_first_parent(); + + + if (!dn) { + // look it up + ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl; + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME); + filepath path(in->ino); + req->set_filepath(path); + req->set_inode(in); + int res = make_request(req, perms); + if (res < 0) + break; + + // start over + path = filepath(); + in = cwd.get(); + continue; + } + path.push_front_dentry(dn->name); + in = dn->dir->parent_inode; + } + dir = "/"; + dir += path.get_path(); +} + +void Client::getcwd(string& dir, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return; + + std::scoped_lock l(client_lock); + + _getcwd(dir, perms); +} + +int Client::statfs(const char *path, struct statvfs *stbuf, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << std::endl; + unsigned long int total_files_on_fs; + + ceph_statfs stats; + C_SaferCond cond; + + std::unique_lock lock(client_lock); + const vector<int64_t> &data_pools = mdsmap->get_data_pools(); + if (data_pools.size() == 1) { + objecter->get_fs_stats(stats, data_pools[0], &cond); + } else { + objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond); + } + + lock.unlock(); + int rval = cond.wait(); + lock.lock(); + + ceph_assert(root); + total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs; + + if (rval < 0) { + ldout(cct, 1) << "underlying call to statfs returned error: " + << cpp_strerror(rval) + << dendl; + return rval; + } + + memset(stbuf, 0, sizeof(*stbuf)); + + /* + * we're going to set a block size of 4MB so we can represent larger + * FSes without overflowing. Additionally convert the space + * measurements from KB to bytes while making them in terms of + * blocks. We use 4MB only because it is big enough, and because it + * actually *is* the (ceph) default block size. + */ + const int CEPH_BLOCK_SHIFT = 22; + stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT; + stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + stbuf->f_files = total_files_on_fs; + stbuf->f_ffree = -1; + stbuf->f_favail = -1; + stbuf->f_fsid = -1; // ?? + stbuf->f_flag = 0; // ?? + stbuf->f_namemax = NAME_MAX; + + // Usually quota_root will == root_ancestor, but if the mount root has no + // quota but we can see a parent of it that does have a quota, we'll + // respect that one instead. + ceph_assert(root != nullptr); + InodeRef quota_root = root->quota.is_enabled(QUOTA_MAX_BYTES) ? root : get_quota_root(root.get(), perms, QUOTA_MAX_BYTES); + + // get_quota_root should always give us something if client quotas are + // enabled + ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr); + + /* If bytes quota is set on a directory and conf option "client quota df" + * is also set, available space = quota limit - used space. Else, + * available space = total space - used space. */ + if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) { + + // Skip the getattr if any sessions are stale, as we don't want to + // block `df` if this client has e.g. been evicted, or if the MDS cluster + // is unhealthy. + if (!_any_stale_sessions()) { + int r = _getattr(quota_root, 0, perms, true); + if (r != 0) { + // Ignore return value: error getting latest inode metadata is not a good + // reason to break "df". + lderr(cct) << "Error in getattr on quota root 0x" + << std::hex << quota_root->ino << std::dec + << " statfs result may be outdated" << dendl; + } + } + + // Special case: if there is a size quota set on the Inode acting + // as the root for this client mount, then report the quota status + // as the filesystem statistics. + const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT; + const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT; + // It is possible for a quota to be exceeded: arithmetic here must + // handle case where used > total. + const fsblkcnt_t free = total > used ? total - used : 0; + + stbuf->f_blocks = total; + stbuf->f_bfree = free; + stbuf->f_bavail = free; + } else { + // General case: report the cluster statistics returned from RADOS. Because + // multiple pools may be used without one filesystem namespace via + // layouts, this is the most correct thing we can do. + stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10); + stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + } + + return rval; +} + +int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep, + struct flock *fl, uint64_t owner, bool removing) +{ + ldout(cct, 10) << __func__ << " ino " << in->ino + << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock") + << " type " << fl->l_type << " owner " << owner + << " " << fl->l_start << "~" << fl->l_len << dendl; + + if (in->flags & I_ERROR_FILELOCK) + return -CEPHFS_EIO; + + int lock_cmd; + if (F_RDLCK == fl->l_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->l_type) + lock_cmd = CEPH_LOCK_EXCL; + else if (F_UNLCK == fl->l_type) + lock_cmd = CEPH_LOCK_UNLOCK; + else + return -CEPHFS_EIO; + + if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK) + sleep = 0; + + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + owner |= (1ULL << 63); + + MetaRequest *req = new MetaRequest(op); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + + req->head.args.filelock_change.rule = lock_type; + req->head.args.filelock_change.type = lock_cmd; + req->head.args.filelock_change.owner = owner; + req->head.args.filelock_change.pid = fl->l_pid; + req->head.args.filelock_change.start = fl->l_start; + req->head.args.filelock_change.length = fl->l_len; + req->head.args.filelock_change.wait = sleep; + + int ret; + bufferlist bl; + + if (sleep && switch_interrupt_cb) { + // enable interrupt + switch_interrupt_cb(callback_handle, req->get()); + ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl); + // disable interrupt + switch_interrupt_cb(callback_handle, NULL); + if (ret == 0 && req->aborted()) { + // effect of this lock request has been revoked by the 'lock intr' request + ret = req->get_abort_code(); + } + put_request(req); + } else { + ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl); + } + + if (ret == 0) { + if (op == CEPH_MDS_OP_GETFILELOCK) { + ceph_filelock filelock; + auto p = bl.cbegin(); + decode(filelock, p); + + if (CEPH_LOCK_SHARED == filelock.type) + fl->l_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == filelock.type) + fl->l_type = F_WRLCK; + else + fl->l_type = F_UNLCK; + + fl->l_whence = SEEK_SET; + fl->l_start = filelock.start; + fl->l_len = filelock.length; + fl->l_pid = filelock.pid; + } else if (op == CEPH_MDS_OP_SETFILELOCK) { + ceph_lock_state_t *lock_state; + if (lock_type == CEPH_LOCK_FCNTL) { + if (!in->fcntl_locks) + in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL)); + lock_state = in->fcntl_locks.get(); + } else if (lock_type == CEPH_LOCK_FLOCK) { + if (!in->flock_locks) + in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK)); + lock_state = in->flock_locks.get(); + } else { + ceph_abort(); + return -CEPHFS_EINVAL; + } + _update_lock_state(fl, owner, lock_state); + + if (!removing) { + if (lock_type == CEPH_LOCK_FCNTL) { + if (!fh->fcntl_locks) + fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL)); + lock_state = fh->fcntl_locks.get(); + } else { + if (!fh->flock_locks) + fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK)); + lock_state = fh->flock_locks.get(); + } + _update_lock_state(fl, owner, lock_state); + } + } else + ceph_abort(); + } + return ret; +} + +int Client::_interrupt_filelock(MetaRequest *req) +{ + // Set abort code, but do not kick. The abort code prevents the request + // from being re-sent. + req->abort(-CEPHFS_EINTR); + if (req->mds < 0) + return 0; // haven't sent the request + + Inode *in = req->inode(); + + int lock_type; + if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else { + ceph_abort(); + return -CEPHFS_EINVAL; + } + + MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK); + filepath path; + in->make_nosnap_relative_path(path); + intr_req->set_filepath(path); + intr_req->set_inode(in); + intr_req->head.args.filelock_change = req->head.args.filelock_change; + intr_req->head.args.filelock_change.rule = lock_type; + intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK; + + UserPerm perms(req->get_uid(), req->get_gid()); + return make_request(intr_req, perms, NULL, NULL, -1); +} + +void Client::_encode_filelocks(Inode *in, bufferlist& bl) +{ + if (!in->fcntl_locks && !in->flock_locks) + return; + + unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0; + encode(nr_fcntl_locks, bl); + if (nr_fcntl_locks) { + auto &lock_state = in->fcntl_locks; + for(auto p = lock_state->held_locks.begin(); + p != lock_state->held_locks.end(); + ++p) + encode(p->second, bl); + } + + unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0; + encode(nr_flock_locks, bl); + if (nr_flock_locks) { + auto &lock_state = in->flock_locks; + for(auto p = lock_state->held_locks.begin(); + p != lock_state->held_locks.end(); + ++p) + encode(p->second, bl); + } + + ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks + << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl; +} + +void Client::_release_filelocks(Fh *fh) +{ + if (!fh->fcntl_locks && !fh->flock_locks) + return; + + Inode *in = fh->inode.get(); + ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl; + + list<ceph_filelock> activated_locks; + + list<pair<int, ceph_filelock> > to_release; + + if (fh->fcntl_locks) { + auto &lock_state = fh->fcntl_locks; + for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) { + auto q = p++; + if (in->flags & I_ERROR_FILELOCK) { + lock_state->remove_lock(q->second, activated_locks); + } else { + to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second)); + } + } + lock_state.reset(); + } + if (fh->flock_locks) { + auto &lock_state = fh->flock_locks; + for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) { + auto q = p++; + if (in->flags & I_ERROR_FILELOCK) { + lock_state->remove_lock(q->second, activated_locks); + } else { + to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second)); + } + } + lock_state.reset(); + } + + if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks()) + in->flags &= ~I_ERROR_FILELOCK; + + if (to_release.empty()) + return; + + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_type = F_UNLCK; + + for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin(); + p != to_release.end(); + ++p) { + fl.l_start = p->second.start; + fl.l_len = p->second.length; + fl.l_pid = p->second.pid; + _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl, + p->second.owner, true); + } +} + +void Client::_update_lock_state(struct flock *fl, uint64_t owner, + ceph_lock_state_t *lock_state) +{ + int lock_cmd; + if (F_RDLCK == fl->l_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->l_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK;; + + ceph_filelock filelock; + filelock.start = fl->l_start; + filelock.length = fl->l_len; + filelock.client = 0; + // see comment in _do_filelock() + filelock.owner = owner | (1ULL << 63); + filelock.pid = fl->l_pid; + filelock.type = lock_cmd; + + if (filelock.type == CEPH_LOCK_UNLOCK) { + list<ceph_filelock> activated_locks; + lock_state->remove_lock(filelock, activated_locks); + } else { + bool r = lock_state->add_lock(filelock, false, false, NULL); + ceph_assert(r); + } +} + +int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner) +{ + Inode *in = fh->inode.get(); + ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl; + int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner); + return ret; +} + +int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep) +{ + Inode *in = fh->inode.get(); + ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl; + int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner); + ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl; + return ret; +} + +int Client::_flock(Fh *fh, int cmd, uint64_t owner) +{ + Inode *in = fh->inode.get(); + ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl; + + int sleep = !(cmd & LOCK_NB); + cmd &= ~LOCK_NB; + + int type; + switch (cmd) { + case LOCK_SH: + type = F_RDLCK; + break; + case LOCK_EX: + type = F_WRLCK; + break; + case LOCK_UN: + type = F_UNLCK; + break; + default: + return -CEPHFS_EINVAL; + } + + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_type = type; + fl.l_whence = SEEK_SET; + + int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner); + ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl; + return ret; +} + +int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) { + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) { + return -CEPHFS_ENOTCONN; + } + + std::scoped_lock lock(client_lock); + InodeRef in; + int r = Client::path_walk(path, &in, perms, true); + if (r < 0) { + return r; + } + + if (in->snapid == CEPH_NOSNAP) { + return -CEPHFS_EINVAL; + } + + snap_info->id = in->snapid; + snap_info->metadata = in->snap_metadata; + return 0; +} + +int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms) +{ + /* Since the only thing this does is wrap a call to statfs, and + statfs takes a lock, it doesn't seem we have a need to split it + out. */ + return statfs(0, stbuf, perms); +} + +void Client::_ll_register_callbacks(struct ceph_client_callback_args *args) +{ + if (!args) + return; + + ldout(cct, 10) << __func__ << " cb " << args->handle + << " invalidate_ino_cb " << args->ino_cb + << " invalidate_dentry_cb " << args->dentry_cb + << " switch_interrupt_cb " << args->switch_intr_cb + << " remount_cb " << args->remount_cb + << dendl; + callback_handle = args->handle; + if (args->ino_cb) { + ino_invalidate_cb = args->ino_cb; + async_ino_invalidator.start(); + } + if (args->dentry_cb) { + dentry_invalidate_cb = args->dentry_cb; + async_dentry_invalidator.start(); + } + if (args->switch_intr_cb) { + switch_interrupt_cb = args->switch_intr_cb; + interrupt_finisher.start(); + } + if (args->remount_cb) { + remount_cb = args->remount_cb; + remount_finisher.start(); + } + if (args->ino_release_cb) { + ino_release_cb = args->ino_release_cb; + async_ino_releasor.start(); + } + if (args->umask_cb) + umask_cb = args->umask_cb; +} + +// This is deprecated, use ll_register_callbacks2() instead. +void Client::ll_register_callbacks(struct ceph_client_callback_args *args) +{ + ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting()); + + _ll_register_callbacks(args); +} + +int Client::ll_register_callbacks2(struct ceph_client_callback_args *args) +{ + if (is_mounting() || is_mounted() || is_unmounting()) + return -CEPHFS_EBUSY; + + _ll_register_callbacks(args); + return 0; +} + +std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate) +{ + std::pair <int, bool> r(0, false); + + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + if (!iref_reader.is_state_satisfied()) + return std::make_pair(-CEPHFS_ENOTCONN, false); + + can_invalidate_dentries = can_invalidate; + + /* + * Force to use the old and slow method to invalidate the dcache + * if the euid is non-root, or the remount may fail with return + * code 1 or 32. + */ + uid_t euid = geteuid(); + ldout(cct, 10) << "euid: " << euid << dendl; + if (euid != 0) { + can_invalidate_dentries = true; + } + + if (can_invalidate_dentries) { + ceph_assert(dentry_invalidate_cb); + ldout(cct, 1) << "using dentry_invalidate_cb" << dendl; + } else { + ceph_assert(remount_cb); + ldout(cct, 1) << "using remount_cb" << dendl; + r = _do_remount(false); + } + + return r; +} + +int Client::_sync_fs() +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + ldout(cct, 10) << __func__ << dendl; + + // flush file data + std::unique_ptr<C_SaferCond> cond = nullptr; + if (cct->_conf->client_oc) { + cond.reset(new C_SaferCond("Client::_sync_fs:lock")); + objectcacher->flush_all(cond.get()); + } + + // flush caps + flush_caps_sync(); + ceph_tid_t flush_tid = last_flush_tid; + + // flush the mdlog before waiting for unsafe requests. + flush_mdlog_sync(); + + // wait for unsafe mds requests + wait_unsafe_requests(); + + wait_sync_caps(flush_tid); + + if (nullptr != cond) { + client_lock.unlock(); + ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl; + cond->wait(); + ldout(cct, 15) << __func__ << " flush finished" << dendl; + client_lock.lock(); + } + + return 0; +} + +int Client::sync_fs() +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock l(client_lock); + + return _sync_fs(); +} + +int64_t Client::drop_caches() +{ + std::scoped_lock l(client_lock); + return objectcacher->release_all(); +} + +int Client::_lazyio(Fh *fh, int enable) +{ + Inode *in = fh->inode.get(); + ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl; + + if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable) + return 0; + + int orig_mode = fh->mode; + if (enable) { + fh->mode |= CEPH_FILE_MODE_LAZY; + in->get_open_ref(fh->mode); + in->put_open_ref(orig_mode); + check_caps(in, CHECK_CAPS_NODELAY); + } else { + fh->mode &= ~CEPH_FILE_MODE_LAZY; + in->get_open_ref(fh->mode); + in->put_open_ref(orig_mode); + check_caps(in, 0); + } + + return 0; +} + +int Client::lazyio(int fd, int enable) +{ + std::scoped_lock l(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + + return _lazyio(f, enable); +} + +int Client::ll_lazyio(Fh *fh, int enable) +{ + ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl; + tout(cct) << __func__ << std::endl; + + std::scoped_lock lock(client_lock); + return _lazyio(fh, enable); +} + +int Client::lazyio_propagate(int fd, loff_t offset, size_t count) +{ + std::scoped_lock l(client_lock); + ldout(cct, 3) << "op: client->lazyio_propagate(" << fd + << ", " << offset << ", " << count << ")" << dendl; + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + + // for now + _fsync(f, true); + + return 0; +} + +int Client::lazyio_synchronize(int fd, loff_t offset, size_t count) +{ + std::scoped_lock l(client_lock); + ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd + << ", " << offset << ", " << count << ")" << dendl; + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + Inode *in = f->inode.get(); + + _fsync(f, true); + if (_release(in)) { + int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); + if (r < 0) + return r; + } + return 0; +} + + +// ============================= +// snaps + +int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm, + mode_t mode, const std::map<std::string, std::string> &metadata) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock l(client_lock); + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perm); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + r = may_create(in.get(), perm); + if (r < 0) + return r; + } + Inode *snapdir = open_snapdir(in.get()); + return _mkdir(snapdir, name, mode, perm, nullptr, metadata); +} + +int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock l(client_lock); + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + Inode *snapdir = open_snapdir(in.get()); + if (cct->_conf->client_permissions) { + r = may_delete(snapdir, check_perms ? name : NULL, perms); + if (r < 0) + return r; + } + return _rmdir(snapdir, name, perms); +} + +// ============================= +// expose caps + +int Client::get_caps_issued(int fd) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + + return f->inode->caps_issued(); +} + +int Client::get_caps_issued(const char *path, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + filepath p(path); + InodeRef in; + int r = path_walk(p, &in, perms, true); + if (r < 0) + return r; + return in->caps_issued(); +} + +// ========================================= +// low level + +void Client::refresh_snapdir_attrs(Inode *in, Inode *diri) { + ldout(cct, 10) << __func__ << ": snapdir inode=" << *in + << ", inode=" << *diri << dendl; + in->ino = diri->ino; + in->snapid = CEPH_SNAPDIR; + in->mode = diri->mode; + in->uid = diri->uid; + in->gid = diri->gid; + in->nlink = 1; + in->mtime = diri->snaprealm->last_modified; + in->ctime = in->mtime; + in->change_attr = diri->snaprealm->change_attr; + in->btime = diri->btime; + in->atime = diri->atime; + in->size = diri->size; + + in->dirfragtree.clear(); + in->snapdir_parent = diri; + // copy posix acls to snapshotted inode + in->xattrs.clear(); + for (auto &[xattr_key, xattr_value] : diri->xattrs) { + if (xattr_key.rfind("system.", 0) == 0) { + in->xattrs[xattr_key] = xattr_value; + } + } +} + +Inode *Client::open_snapdir(Inode *diri) +{ + Inode *in; + vinodeno_t vino(diri->ino, CEPH_SNAPDIR); + if (!inode_map.count(vino)) { + in = new Inode(this, vino, &diri->layout); + refresh_snapdir_attrs(in, diri); + diri->flags |= I_SNAPDIR_OPEN; + inode_map[vino] = in; + if (use_faked_inos()) + _assign_faked_ino(in); + ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl; + } else { + in = inode_map[vino]; + ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl; + } + return in; +} + +int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr, + Inode **out, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + + int r = 0; + if (!fuse_default_permissions) { + if (strcmp(name, ".") && strcmp(name, "..")) { + r = may_lookup(parent, perms); + if (r < 0) + return r; + } + } + + string dname(name); + InodeRef in; + + r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms); + if (r < 0) { + attr->st_ino = 0; + goto out; + } + + ceph_assert(in); + fill_stat(in, attr); + _ll_get(in.get()); + + out: + ldout(cct, 3) << __func__ << " " << vparent << " " << name + << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + tout(cct) << attr->st_ino << std::endl; + *out = in.get(); + return r; +} + +int Client::ll_lookup_vino( + vinodeno_t vino, + const UserPerm& perms, + Inode **inode) +{ + ceph_assert(inode != NULL); + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + if (is_reserved_vino(vino)) + return -CEPHFS_ESTALE; + + std::scoped_lock lock(client_lock); + ldout(cct, 3) << __func__ << " " << vino << dendl; + + // Check the cache first + unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino); + if (p != inode_map.end()) { + *inode = p->second; + _ll_get(*inode); + return 0; + } + + uint64_t snapid = vino.snapid; + + // for snapdir, find the non-snapped dir inode + if (snapid == CEPH_SNAPDIR) + vino.snapid = CEPH_NOSNAP; + + int r = _lookup_vino(vino, perms, inode); + if (r) + return r; + ceph_assert(*inode != NULL); + + if (snapid == CEPH_SNAPDIR) { + Inode *tmp = *inode; + + // open the snapdir and put the inode ref + *inode = open_snapdir(tmp); + _ll_forget(tmp, 1); + _ll_get(*inode); + } + return 0; +} + +int Client::ll_lookup_inode( + struct inodeno_t ino, + const UserPerm& perms, + Inode **inode) +{ + vinodeno_t vino(ino, CEPH_NOSNAP); + return ll_lookup_vino(vino, perms, inode); +} + +int Client::ll_lookupx(Inode *parent, const char *name, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl; + tout(cct) << "ll_lookupx" << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + + int r = 0; + if (!fuse_default_permissions) { + r = may_lookup(parent, perms); + if (r < 0) + return r; + } + + string dname(name); + InodeRef in; + + unsigned mask = statx_to_mask(flags, want); + r = _lookup(parent, dname, mask, &in, perms); + if (r < 0) { + stx->stx_ino = 0; + stx->stx_mask = 0; + } else { + ceph_assert(in); + fill_statx(in, mask, stx); + _ll_get(in.get()); + } + + ldout(cct, 3) << __func__ << " " << vparent << " " << name + << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + tout(cct) << stx->stx_ino << std::endl; + *out = in.get(); + return r; +} + +int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx, + unsigned int want, unsigned int flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + filepath fp(name, 0); + InodeRef in; + int rc; + unsigned mask = statx_to_mask(flags, want); + + ldout(cct, 3) << __func__ << " " << name << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask); + if (rc < 0) { + /* zero out mask, just in case... */ + stx->stx_mask = 0; + stx->stx_ino = 0; + *out = NULL; + return rc; + } else { + ceph_assert(in); + fill_statx(in, mask, stx); + _ll_get(in.get()); + *out = in.get(); + return 0; + } +} + +void Client::_ll_get(Inode *in) +{ + if (in->ll_ref == 0) { + in->iget(); + if (in->is_dir() && !in->dentries.empty()) { + ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked + in->get_first_parent()->get(); // pin dentry + } + if (in->snapid != CEPH_NOSNAP) + ll_snap_ref[in->snapid]++; + } + in->ll_get(); + ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl; +} + +int Client::_ll_put(Inode *in, uint64_t num) +{ + in->ll_put(num); + ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl; + if (in->ll_ref == 0) { + if (in->is_dir() && !in->dentries.empty()) { + ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked + in->get_first_parent()->put(); // unpin dentry + } + if (in->snapid != CEPH_NOSNAP) { + auto p = ll_snap_ref.find(in->snapid); + ceph_assert(p != ll_snap_ref.end()); + ceph_assert(p->second > 0); + if (--p->second == 0) + ll_snap_ref.erase(p); + } + put_inode(in); + return 0; + } else { + return in->ll_ref; + } +} + +void Client::_ll_drop_pins() +{ + ldout(cct, 10) << __func__ << dendl; + std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit + ceph::unordered_map<vinodeno_t, Inode*>::iterator next; + for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin(); + it != inode_map.end(); + it = next) { + Inode *in = it->second; + next = it; + ++next; + if (in->ll_ref){ + to_be_put.insert(in); + _ll_put(in, in->ll_ref); + } + } +} + +bool Client::_ll_forget(Inode *in, uint64_t count) +{ + inodeno_t ino = in->ino; + + ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << ino.val << std::endl; + tout(cct) << count << std::endl; + + // Ignore forget if we're no longer mounted + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return true; + + if (ino == 1) return true; // ignore forget on root. + + bool last = false; + if (in->ll_ref < count) { + ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count + << ", which only has ll_ref=" << in->ll_ref << dendl; + _ll_put(in, in->ll_ref); + last = true; + } else { + if (_ll_put(in, count) == 0) + last = true; + } + + return last; +} + +bool Client::ll_forget(Inode *in, uint64_t count) +{ + std::scoped_lock lock(client_lock); + return _ll_forget(in, count); +} + +bool Client::ll_put(Inode *in) +{ + /* ll_forget already takes the lock */ + return ll_forget(in, 1); +} + +int Client::ll_get_snap_ref(snapid_t snap) +{ + std::scoped_lock lock(client_lock); + auto p = ll_snap_ref.find(snap); + if (p != ll_snap_ref.end()) + return p->second; + return 0; +} + +snapid_t Client::ll_get_snapid(Inode *in) +{ + std::scoped_lock lock(client_lock); + return in->snapid; +} + +Inode *Client::ll_get_inode(ino_t ino) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return NULL; + + std::scoped_lock lock(client_lock); + + vinodeno_t vino = _map_faked_ino(ino); + unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino); + if (p == inode_map.end()) + return NULL; + Inode *in = p->second; + _ll_get(in); + return in; +} + +Inode *Client::ll_get_inode(vinodeno_t vino) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return NULL; + + if (is_reserved_vino(vino)) + return NULL; + + std::scoped_lock lock(client_lock); + + unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino); + if (p == inode_map.end()) + return NULL; + Inode *in = p->second; + _ll_get(in); + return in; +} + +int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms) +{ + vinodeno_t vino = _get_vino(in); + + ldout(cct, 8) << __func__ << " " << vino << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << vino.ino.val << std::endl; + + if (vino.snapid < CEPH_NOSNAP) + return 0; + else + return _getattr(in, caps, perms); +} + +int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms); + + if (res == 0) + fill_stat(in, attr); + ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl; + return res; +} + +int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want, + unsigned int flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + int res = 0; + unsigned mask = statx_to_mask(flags, want); + + if (mask && !in->caps_issued_mask(mask, true)) + res = _ll_getattr(in, mask, perms); + + if (res == 0) + fill_statx(in, mask, stx); + ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl; + return res; +} + +int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp) +{ + vinodeno_t vino = _get_vino(in); + + ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec + << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << stx->stx_mode << std::endl; + tout(cct) << stx->stx_uid << std::endl; + tout(cct) << stx->stx_gid << std::endl; + tout(cct) << stx->stx_size << std::endl; + tout(cct) << stx->stx_mtime << std::endl; + tout(cct) << stx->stx_atime << std::endl; + tout(cct) << stx->stx_btime << std::endl; + tout(cct) << mask << std::endl; + + if (!fuse_default_permissions) { + int res = may_setattr(in, stx, mask, perms); + if (res < 0) + return res; + } + + mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW); + + return __setattrx(in, stx, mask, perms, inp); +} + +int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef target(in); + int res = _ll_setattrx(in, stx, mask, perms, &target); + if (res == 0) { + ceph_assert(in == target.get()); + fill_statx(in, in->caps_issued(), stx); + } + + ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl; + return res; +} + +int Client::ll_setattr(Inode *in, struct stat *attr, int mask, + const UserPerm& perms) +{ + struct ceph_statx stx; + stat_to_statx(attr, &stx); + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef target(in); + int res = _ll_setattrx(in, &stx, mask, perms, &target); + if (res == 0) { + ceph_assert(in == target.get()); + fill_stat(in, attr); + } + + ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl; + return res; +} + + +// ---------- +// xattrs + +int Client::getxattr(const char *path, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return _getxattr(in, name, value, size, perms); +} + +int Client::lgetxattr(const char *path, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return _getxattr(in, name, value, size, perms); +} + +int Client::fgetxattr(int fd, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + return _getxattr(f->inode, name, value, size, perms); +} + +int Client::listxattr(const char *path, char *list, size_t size, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return Client::_listxattr(in.get(), list, size, perms); +} + +int Client::llistxattr(const char *path, char *list, size_t size, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return Client::_listxattr(in.get(), list, size, perms); +} + +int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + return Client::_listxattr(f->inode.get(), list, size, perms); +} + +int Client::removexattr(const char *path, const char *name, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true); + if (r < 0) + return r; + return _removexattr(in, name, perms); +} + +int Client::lremovexattr(const char *path, const char *name, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false); + if (r < 0) + return r; + return _removexattr(in, name, perms); +} + +int Client::fremovexattr(int fd, const char *name, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + return _removexattr(f->inode, name, perms); +} + +int Client::setxattr(const char *path, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + _setxattr_maybe_wait_for_osdmap(name, value, size); + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true); + if (r < 0) + return r; + return _setxattr(in, name, value, size, flags, perms); +} + +int Client::lsetxattr(const char *path, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + _setxattr_maybe_wait_for_osdmap(name, value, size); + + std::scoped_lock lock(client_lock); + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false); + if (r < 0) + return r; + return _setxattr(in, name, value, size, flags, perms); +} + +int Client::fsetxattr(int fd, const char *name, const void *value, size_t size, + int flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + _setxattr_maybe_wait_for_osdmap(name, value, size); + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + return _setxattr(f->inode, name, value, size, flags, perms); +} + +int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + int r; + const VXattr *vxattr = nullptr; + + vxattr = _match_vxattr(in, name); + if (vxattr) { + r = -CEPHFS_ENODATA; + + // Do a force getattr to get the latest quota before returning + // a value to userspace. + int flags = 0; + if (vxattr->flags & VXATTR_RSTAT) { + flags |= CEPH_STAT_RSTAT; + } + if (vxattr->flags & VXATTR_DIRSTAT) { + flags |= CEPH_CAP_FILE_SHARED; + } + r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true); + if (r != 0) { + // Error from getattr! + return r; + } + + // call pointer-to-member function + char buf[256]; + if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) { + r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf)); + } else { + r = -CEPHFS_ENODATA; + } + + if (size != 0) { + if (r > (int)size) { + r = -CEPHFS_ERANGE; + } else if (r > 0) { + memcpy(value, buf, r); + } + } + goto out; + } + + if (!strncmp(name, "ceph.", 5)) { + r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE); + goto out; + } + + if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) { + r = -CEPHFS_EOPNOTSUPP; + goto out; + } + + r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0); + if (r == 0) { + string n(name); + r = -CEPHFS_ENODATA; + if (in->xattrs.count(n)) { + r = in->xattrs[n].length(); + if (r > 0 && size != 0) { + if (size >= (unsigned)r) + memcpy(value, in->xattrs[n].c_str(), r); + else + r = -CEPHFS_ERANGE; + } + } + } + out: + ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl; + return r; +} + +int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + if (cct->_conf->client_permissions) { + int r = xattr_permission(in.get(), name, MAY_READ, perms); + if (r < 0) + return r; + } + return _getxattr(in.get(), name, value, size, perms); +} + +int Client::ll_getxattr(Inode *in, const char *name, void *value, + size_t size, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + if (!fuse_default_permissions) { + int r = xattr_permission(in, name, MAY_READ, perms); + if (r < 0) + return r; + } + + return _getxattr(in, name, value, size, perms); +} + +int Client::_listxattr(Inode *in, char *name, size_t size, + const UserPerm& perms) +{ + bool len_only = (size == 0); + int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0); + if (r != 0) { + goto out; + } + + r = 0; + for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) { + if (xattr_name.rfind("ceph.", 0) == 0) { + continue; + } + + size_t this_len = xattr_name.length() + 1; + r += this_len; + if (len_only) + continue; + + if (this_len > size) { + r = -CEPHFS_ERANGE; + goto out; + } + + memcpy(name, xattr_name.c_str(), this_len); + name += this_len; + size -= this_len; + } +out: + ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl; + return r; +} + +int Client::ll_listxattr(Inode *in, char *names, size_t size, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << size << std::endl; + + std::scoped_lock lock(client_lock); + return _listxattr(in, names, size, perms); +} + +int Client::_do_setxattr(Inode *in, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + + int xattr_flags = 0; + if (!value) + xattr_flags |= CEPH_XATTR_REMOVE; + if (flags & XATTR_CREATE) + xattr_flags |= CEPH_XATTR_CREATE; + if (flags & XATTR_REPLACE) + xattr_flags |= CEPH_XATTR_REPLACE; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_string2(name); + req->set_inode(in); + req->head.args.setxattr.flags = xattr_flags; + + bufferlist bl; + ceph_assert(value || size == 0); + bl.append((const char*)value, size); + req->set_data(bl); + + int res = make_request(req, perms); + + trim_cache(); + ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " << + res << dendl; + return res; +} + +int Client::_setxattr(Inode *in, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + if (in->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + + if (size == 0) { + value = ""; + } else if (value == NULL) { + return -CEPHFS_EINVAL; + } + + bool posix_acl_xattr = false; + if (acl_type == POSIX_ACL) + posix_acl_xattr = !strncmp(name, "system.", 7); + + if (strncmp(name, "user.", 5) && + strncmp(name, "security.", 9) && + strncmp(name, "trusted.", 8) && + strncmp(name, "ceph.", 5) && + !posix_acl_xattr) + return -CEPHFS_EOPNOTSUPP; + + bool check_realm = false; + + if (posix_acl_xattr) { + if (!strcmp(name, ACL_EA_ACCESS)) { + mode_t new_mode = in->mode; + if (value) { + int ret = posix_acl_equiv_mode(value, size, &new_mode); + if (ret < 0) + return ret; + if (ret == 0) { + value = NULL; + size = 0; + } + if (new_mode != in->mode) { + struct ceph_statx stx; + stx.stx_mode = new_mode; + ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, nullptr); + if (ret < 0) + return ret; + } + } + } else if (!strcmp(name, ACL_EA_DEFAULT)) { + if (value) { + if (!S_ISDIR(in->mode)) + return -CEPHFS_EACCES; + int ret = posix_acl_check(value, size); + if (ret < 0) + return -CEPHFS_EINVAL; + if (ret == 0) { + value = NULL; + size = 0; + } + } + } else { + return -CEPHFS_EOPNOTSUPP; + } + } else { + const VXattr *vxattr = _match_vxattr(in, name); + if (vxattr) { + if (vxattr->readonly) + return -CEPHFS_EOPNOTSUPP; + if (vxattr->setxattr_cb) + return (this->*(vxattr->setxattr_cb))(in, value, size, perms); + if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value) + check_realm = true; + } + } + + int ret = _do_setxattr(in, name, value, size, flags, perms); + if (ret >= 0 && check_realm) { + // check if snaprealm was created for quota inode + if (in->quota.is_enabled() && + !(in->snaprealm && in->snaprealm->ino == in->ino)) + ret = -CEPHFS_EOPNOTSUPP; + } + + return ret; +} + +int Client::_setxattr(InodeRef &in, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + if (cct->_conf->client_permissions) { + int r = xattr_permission(in.get(), name, MAY_WRITE, perms); + if (r < 0) + return r; + } + return _setxattr(in.get(), name, value, size, flags, perms); +} + +int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap) +{ + string tmp; + if (name == "layout") { + string::iterator begin = value.begin(); + string::iterator end = value.end(); + keys_and_values<string::iterator> p; // create instance of parser + std::map<string, string> m; // map to receive results + if (!qi::parse(begin, end, p, m)) { // returns true if successful + return -CEPHFS_EINVAL; + } + if (begin != end) + return -CEPHFS_EINVAL; + for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) { + if (q->first == "pool") { + tmp = q->second; + break; + } + } + } else if (name == "layout.pool") { + tmp = value; + } + + if (tmp.length()) { + int64_t pool; + try { + pool = boost::lexical_cast<unsigned>(tmp); + if (!osdmap->have_pg_pool(pool)) + return -CEPHFS_ENOENT; + } catch (boost::bad_lexical_cast const&) { + pool = osdmap->lookup_pg_pool_name(tmp); + if (pool < 0) { + return -CEPHFS_ENOENT; + } + } + } + + return 0; +} + +void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size) +{ + // For setting pool of layout, MetaRequest need osdmap epoch. + // There is a race which create a new data pool but client and mds both don't have. + // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap. + ldout(cct, 15) << __func__ << ": name = " << name << dendl; + if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 || + strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) { + string rest(strstr(name, "layout")); + string v((const char*)value, size); + int r = objecter->with_osdmap([&](const OSDMap& o) { + return _setxattr_check_data_pool(rest, v, &o); + }); + + if (r == -CEPHFS_ENOENT) { + bs::error_code ec; + ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl; + objecter->wait_for_latest_osdmap(ca::use_blocked[ec]); + ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl; + } + } +} + +int Client::ll_setxattr(Inode *in, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + _setxattr_maybe_wait_for_osdmap(name, value, size); + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl; + tout(cct) << __func__ << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + if (!fuse_default_permissions) { + int r = xattr_permission(in, name, MAY_WRITE, perms); + if (r < 0) + return r; + } + return _setxattr(in, name, value, size, flags, perms); +} + +int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms) +{ + if (in->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + + // same xattrs supported by kernel client + if (strncmp(name, "user.", 5) && + strncmp(name, "system.", 7) && + strncmp(name, "security.", 9) && + strncmp(name, "trusted.", 8) && + strncmp(name, "ceph.", 5)) + return -CEPHFS_EOPNOTSUPP; + + const VXattr *vxattr = _match_vxattr(in, name); + if (vxattr && vxattr->readonly) + return -CEPHFS_EOPNOTSUPP; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_filepath2(name); + req->set_inode(in); + + int res = make_request(req, perms); + + trim_cache(); + ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl; + return res; +} + +int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms) +{ + if (cct->_conf->client_permissions) { + int r = xattr_permission(in.get(), name, MAY_WRITE, perms); + if (r < 0) + return r; + } + return _removexattr(in.get(), name, perms); +} + +int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl; + tout(cct) << "ll_removexattr" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + if (!fuse_default_permissions) { + int r = xattr_permission(in, name, MAY_WRITE, perms); + if (r < 0) + return r; + } + + return _removexattr(in, name, perms); +} + +bool Client::_vxattrcb_fscrypt_auth_exists(Inode *in) +{ + bool exists = !in->fscrypt_auth.empty(); + + ldout(cct, 10) << "fscrypt_auth exists " << exists << dendl; + return exists; +} + +size_t Client::_vxattrcb_fscrypt_auth(Inode *in, char *val, size_t size) +{ + size_t count = in->fscrypt_auth.size(); + + if (count <= size) + memcpy(val, in->fscrypt_auth.data(), count); + return count; +} + +int Client::_vxattrcb_fscrypt_auth_set(Inode *in, const void *val, size_t size, + const UserPerm& perms) +{ + struct ceph_statx stx = { 0 }; + std::vector<uint8_t> aux; + + aux.resize(size); + memcpy(aux.data(), val, size); + + return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_AUTH, perms, nullptr, &aux); +} + +bool Client::_vxattrcb_fscrypt_file_exists(Inode *in) +{ + return !in->fscrypt_file.empty(); +} + +size_t Client::_vxattrcb_fscrypt_file(Inode *in, char *val, size_t size) +{ + size_t count = in->fscrypt_file.size(); + + if (count <= size) + memcpy(val, in->fscrypt_file.data(), count); + return count; +} + +int Client::_vxattrcb_fscrypt_file_set(Inode *in, const void *val, size_t size, + const UserPerm& perms) +{ + struct ceph_statx stx = { 0 }; + std::vector<uint8_t> aux; + + aux.resize(size); + memcpy(aux.data(), val, size); + + return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_FILE, perms, nullptr, &aux); +} + +bool Client::_vxattrcb_quota_exists(Inode *in) +{ + return in->quota.is_enabled() && + (in->snapid != CEPH_NOSNAP || + (in->snaprealm && in->snaprealm->ino == in->ino)); +} +size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, + "max_bytes=%lld max_files=%lld", + (long long int)in->quota.max_bytes, + (long long int)in->quota.max_files); +} +size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes); +} +size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%lld", (long long int)in->quota.max_files); +} + +bool Client::_vxattrcb_layout_exists(Inode *in) +{ + return in->layout != file_layout_t(); +} +size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size) +{ + int r = snprintf(val, size, + "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=", + (unsigned long long)in->layout.stripe_unit, + (unsigned long long)in->layout.stripe_count, + (unsigned long long)in->layout.object_size); + objecter->with_osdmap([&](const OSDMap& o) { + if (o.have_pg_pool(in->layout.pool_id)) + r += snprintf(val + r, size - r, "%s", + o.get_pool_name(in->layout.pool_id).c_str()); + else + r += snprintf(val + r, size - r, "%" PRIu64, + (uint64_t)in->layout.pool_id); + }); + if (in->layout.pool_ns.length()) + r += snprintf(val + r, size - r, " pool_namespace=%s", + in->layout.pool_ns.c_str()); + return r; +} +size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit); +} +size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count); +} +size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size); +} +size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size) +{ + size_t r; + objecter->with_osdmap([&](const OSDMap& o) { + if (o.have_pg_pool(in->layout.pool_id)) + r = snprintf(val, size, "%s", o.get_pool_name( + in->layout.pool_id).c_str()); + else + r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id); + }); + return r; +} +size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%s", in->layout.pool_ns.c_str()); +} +size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs)); +} +size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles); +} +size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs); +} +size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs)); +} +size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles); +} +size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs); +} +size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps); +} +size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes); +} +size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(), + (long)in->rstat.rctime.nsec()); +} +bool Client::_vxattrcb_dir_pin_exists(Inode *in) +{ + return in->dir_pin != -CEPHFS_ENODATA; +} +size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%ld", (long)in->dir_pin); +} + +bool Client::_vxattrcb_snap_btime_exists(Inode *in) +{ + return !in->snap_btime.is_zero(); +} + +size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%llu.%09lu", + (long long unsigned)in->snap_btime.sec(), + (long unsigned)in->snap_btime.nsec()); +} + +size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size) +{ + int issued; + + in->caps_issued(&issued); + return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued); +} + +bool Client::_vxattrcb_mirror_info_exists(Inode *in) +{ + // checking one of the xattrs would suffice + return in->xattrs.count("ceph.mirror.info.cluster_id") != 0; +} + +size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s", + in->xattrs["ceph.mirror.info.cluster_id"].length(), + in->xattrs["ceph.mirror.info.cluster_id"].c_str(), + in->xattrs["ceph.mirror.info.fs_id"].length(), + in->xattrs["ceph.mirror.info.fs_id"].c_str()); +} + +size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str()); +} + +size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size) +{ + auto name = messenger->get_myname(); + return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num()); +} + +#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2 + +#define XATTR_NAME_CEPH(_type, _name, _flags) \ +{ \ + name: CEPH_XATTR_NAME(_type, _name), \ + getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \ + readonly: true, \ + exists_cb: NULL, \ + flags: _flags, \ +} +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ +{ \ + name: CEPH_XATTR_NAME2(_type, _name, _field), \ + getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \ + readonly: false, \ + exists_cb: &Client::_vxattrcb_layout_exists, \ + flags: 0, \ +} +#define XATTR_QUOTA_FIELD(_type, _name) \ +{ \ + name: CEPH_XATTR_NAME(_type, _name), \ + getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \ + readonly: false, \ + exists_cb: &Client::_vxattrcb_quota_exists, \ + flags: 0, \ +} + +const Client::VXattr Client::_dir_vxattrs[] = { + { + name: "ceph.dir.layout", + getxattr_cb: &Client::_vxattrcb_layout, + readonly: false, + exists_cb: &Client::_vxattrcb_layout_exists, + flags: 0, + }, + // FIXME + // Delete the following dir layout field definitions for release "S" + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), + XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT), + XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT), + XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT), + XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT), + XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT), + XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT), + XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT), + XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT), + XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT), + { + name: "ceph.quota", + getxattr_cb: &Client::_vxattrcb_quota, + readonly: false, + exists_cb: &Client::_vxattrcb_quota_exists, + flags: 0, + }, + XATTR_QUOTA_FIELD(quota, max_bytes), + XATTR_QUOTA_FIELD(quota, max_files), + // FIXME + // Delete the following dir pin field definitions for release "S" + { + name: "ceph.dir.pin", + getxattr_cb: &Client::_vxattrcb_dir_pin, + readonly: false, + exists_cb: &Client::_vxattrcb_dir_pin_exists, + flags: 0, + }, + { + name: "ceph.snap.btime", + getxattr_cb: &Client::_vxattrcb_snap_btime, + readonly: true, + exists_cb: &Client::_vxattrcb_snap_btime_exists, + flags: 0, + }, + { + name: "ceph.mirror.info", + getxattr_cb: &Client::_vxattrcb_mirror_info, + readonly: false, + exists_cb: &Client::_vxattrcb_mirror_info_exists, + flags: 0, + }, + { + name: "ceph.caps", + getxattr_cb: &Client::_vxattrcb_caps, + readonly: true, + exists_cb: NULL, + flags: 0, + }, + { name: "" } /* Required table terminator */ +}; + +const Client::VXattr Client::_file_vxattrs[] = { + { + name: "ceph.file.layout", + getxattr_cb: &Client::_vxattrcb_layout, + readonly: false, + exists_cb: &Client::_vxattrcb_layout_exists, + flags: 0, + }, + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + XATTR_LAYOUT_FIELD(file, layout, pool_namespace), + { + name: "ceph.snap.btime", + getxattr_cb: &Client::_vxattrcb_snap_btime, + readonly: true, + exists_cb: &Client::_vxattrcb_snap_btime_exists, + flags: 0, + }, + { + name: "ceph.caps", + getxattr_cb: &Client::_vxattrcb_caps, + readonly: true, + exists_cb: NULL, + flags: 0, + }, + { name: "" } /* Required table terminator */ +}; + +const Client::VXattr Client::_common_vxattrs[] = { + { + name: "ceph.cluster_fsid", + getxattr_cb: &Client::_vxattrcb_cluster_fsid, + readonly: true, + exists_cb: nullptr, + flags: 0, + }, + { + name: "ceph.client_id", + getxattr_cb: &Client::_vxattrcb_client_id, + readonly: true, + exists_cb: nullptr, + flags: 0, + }, + { + name: "ceph.fscrypt.auth", + getxattr_cb: &Client::_vxattrcb_fscrypt_auth, + setxattr_cb: &Client::_vxattrcb_fscrypt_auth_set, + readonly: false, + exists_cb: &Client::_vxattrcb_fscrypt_auth_exists, + flags: 0, + }, + { + name: "ceph.fscrypt.file", + getxattr_cb: &Client::_vxattrcb_fscrypt_file, + setxattr_cb: &Client::_vxattrcb_fscrypt_file_set, + readonly: false, + exists_cb: &Client::_vxattrcb_fscrypt_file_exists, + flags: 0, + }, + { name: "" } /* Required table terminator */ +}; + +const Client::VXattr *Client::_get_vxattrs(Inode *in) +{ + if (in->is_dir()) + return _dir_vxattrs; + else if (in->is_file()) + return _file_vxattrs; + return NULL; +} + +const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name) +{ + if (strncmp(name, "ceph.", 5) == 0) { + const VXattr *vxattr = _get_vxattrs(in); + if (vxattr) { + while (!vxattr->name.empty()) { + if (vxattr->name == name) + return vxattr; + vxattr++; + } + } + + // for common vxattrs + vxattr = _common_vxattrs; + while (!vxattr->name.empty()) { + if (vxattr->name == name) + return vxattr; + vxattr++; + } + } + + return NULL; +} + +int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_readlink " << vino << dendl; + tout(cct) << "ll_readlink" << std::endl; + tout(cct) << vino.ino.val << std::endl; + + std::scoped_lock lock(client_lock); + for (auto dn : in->dentries) { + touch_dn(dn); + } + + int r = _readlink(in, buf, buflen); // FIXME: no permission checking! + ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl; + return r; +} + +int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, + const UserPerm& perms, InodeRef *inp) +{ + ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct + << mode << dec << ", " << rdev << ", uid " << perms.uid() + << ", gid " << perms.gid() << ")" << dendl; + + if (strlen(name) > NAME_MAX) + return -CEPHFS_ENAMETOOLONG; + + if (dir->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + if (is_quota_files_exceeded(dir, perms)) { + return -CEPHFS_EDQUOT; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD); + + req->set_inode_owner_uid_gid(perms.uid(), perms.gid()); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + req->head.args.mknod.rdev = rdev; + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + bufferlist xattrs_bl; + int res = _posix_acl_create(dir, &mode, xattrs_bl, perms); + if (res < 0) { + put_request(req); + return res; + } + req->head.args.mknod.mode = mode; + if (xattrs_bl.length() > 0) + req->set_data(xattrs_bl); + + Dentry *de = get_or_create(dir, name); + req->set_dentry(de); + + res = make_request(req, perms, inp); + + trim_cache(); + + ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl; + return res; +} + +int Client::ll_mknod(Inode *parent, const char *name, mode_t mode, + dev_t rdev, struct stat *attr, Inode **out, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl; + tout(cct) << "ll_mknod" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << rdev << std::endl; + + std::scoped_lock lock(client_lock); + if (!fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mknod(parent, name, mode, rdev, perms, &in); + if (r == 0) { + fill_stat(in, attr); + _ll_get(in.get()); + } + tout(cct) << attr->st_ino << std::endl; + ldout(cct, 3) << "ll_mknod " << vparent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode, + dev_t rdev, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + unsigned caps = statx_to_mask(flags, want); + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl; + tout(cct) << "ll_mknodx" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << rdev << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mknod(parent, name, mode, rdev, perms, &in); + if (r == 0) { + fill_statx(in, caps, stx); + _ll_get(in.get()); + } + tout(cct) << stx->stx_ino << std::endl; + ldout(cct, 3) << "ll_mknodx " << vparent << " " << name + << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, + InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count, + int object_size, const char *data_pool, bool *created, + const UserPerm& perms, std::string alternate_name) +{ + ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct << + mode << dec << ")" << dendl; + + if (strlen(name) > NAME_MAX) + return -CEPHFS_ENAMETOOLONG; + if (dir->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + if (is_quota_files_exceeded(dir, perms)) { + return -CEPHFS_EDQUOT; + } + + // use normalized flags to generate cmode + int cflags = ceph_flags_sys2wire(flags); + if (cct->_conf.get_val<bool>("client_force_lazyio")) + cflags |= CEPH_O_LAZY; + + int cmode = ceph_flags_to_mode(cflags); + + int64_t pool_id = -1; + if (data_pool && *data_pool) { + pool_id = objecter->with_osdmap( + std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool); + if (pool_id < 0) + return -CEPHFS_EINVAL; + if (pool_id > 0xffffffffll) + return -CEPHFS_ERANGE; // bummer! + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE); + + req->set_inode_owner_uid_gid(perms.uid(), perms.gid()); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_alternate_name(std::move(alternate_name)); + req->set_inode(dir); + req->head.args.open.flags = cflags | CEPH_O_CREAT; + + req->head.args.open.stripe_unit = stripe_unit; + req->head.args.open.stripe_count = stripe_count; + req->head.args.open.object_size = object_size; + if (cct->_conf->client_debug_getattr_caps) + req->head.args.open.mask = DEBUG_GETATTR_CAPS; + else + req->head.args.open.mask = 0; + req->head.args.open.pool = pool_id; + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + mode |= S_IFREG; + bufferlist xattrs_bl; + int res = _posix_acl_create(dir, &mode, xattrs_bl, perms); + if (res < 0) { + put_request(req); + return res; + } + req->head.args.open.mode = mode; + if (xattrs_bl.length() > 0) + req->set_data(xattrs_bl); + + Dentry *de = get_or_create(dir, name); + req->set_dentry(de); + + res = make_request(req, perms, inp, created); + if (res < 0) { + goto reply_error; + } + + /* If the caller passed a value in fhp, do the open */ + if(fhp) { + (*inp)->get_open_ref(cmode); + *fhp = _create_fh(inp->get(), flags, cmode, perms); + } + + reply_error: + trim_cache(); + + ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec + << " layout " << stripe_unit + << ' ' << stripe_count + << ' ' << object_size + <<") = " << res << dendl; + return res; +} + +int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm, + InodeRef *inp, const std::map<std::string, std::string> &metadata, + std::string alternate_name) +{ + ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct + << mode << dec << ", uid " << perm.uid() + << ", gid " << perm.gid() << ")" << dendl; + + if (strlen(name) > NAME_MAX) + return -CEPHFS_ENAMETOOLONG; + + if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) { + return -CEPHFS_EROFS; + } + if (is_quota_files_exceeded(dir, perm)) { + return -CEPHFS_EDQUOT; + } + + bool is_snap_op = dir->snapid == CEPH_SNAPDIR; + MetaRequest *req = new MetaRequest(is_snap_op ? + CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR); + + if (!is_snap_op) + req->set_inode_owner_uid_gid(perm.uid(), perm.gid()); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + req->set_alternate_name(std::move(alternate_name)); + + mode |= S_IFDIR; + bufferlist bl; + int res = _posix_acl_create(dir, &mode, bl, perm); + if (res < 0) { + put_request(req); + return res; + } + req->head.args.mkdir.mode = mode; + if (is_snap_op) { + SnapPayload payload; + // clear the bufferlist that may have been populated by the call + // to _posix_acl_create(). MDS mksnap does not make use of it. + // So, reuse it to pass metadata payload. + bl.clear(); + payload.metadata = metadata; + encode(payload, bl); + } + if (bl.length() > 0) { + req->set_data(bl); + } + + Dentry *de = get_or_create(dir, name); + req->set_dentry(de); + + ldout(cct, 10) << "_mkdir: making request" << dendl; + res = make_request(req, perm, inp); + ldout(cct, 10) << "_mkdir result is " << res << dendl; + + trim_cache(); + + ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl; + return res; +} + +int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode, + struct stat *attr, Inode **out, const UserPerm& perm) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl; + tout(cct) << "ll_mkdir" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_create(parent, perm); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mkdir(parent, name, mode, perm, &in); + if (r == 0) { + fill_stat(in, attr); + _ll_get(in.get()); + } + tout(cct) << attr->st_ino << std::endl; + ldout(cct, 3) << "ll_mkdir " << vparent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl; + tout(cct) << "ll_mkdirx" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mkdir(parent, name, mode, perms, &in); + if (r == 0) { + fill_statx(in, statx_to_mask(flags, want), stx); + _ll_get(in.get()); + } else { + stx->stx_ino = 0; + stx->stx_mask = 0; + } + tout(cct) << stx->stx_ino << std::endl; + ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name + << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::_symlink(Inode *dir, const char *name, const char *target, + const UserPerm& perms, std::string alternate_name, InodeRef *inp) +{ + ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target + << ", uid " << perms.uid() << ", gid " << perms.gid() << ")" + << dendl; + + if (strlen(name) > NAME_MAX) + return -CEPHFS_ENAMETOOLONG; + + if (dir->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + if (is_quota_files_exceeded(dir, perms)) { + return -CEPHFS_EDQUOT; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK); + + req->set_inode_owner_uid_gid(perms.uid(), perms.gid()); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_alternate_name(std::move(alternate_name)); + req->set_inode(dir); + req->set_string2(target); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + Dentry *de = get_or_create(dir, name); + req->set_dentry(de); + + int res = make_request(req, perms, inp); + + trim_cache(); + ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " << + res << dendl; + return res; +} + +int Client::ll_symlink(Inode *parent, const char *name, const char *value, + struct stat *attr, Inode **out, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value + << dendl; + tout(cct) << "ll_symlink" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << value << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _symlink(parent, name, value, perms, "", &in); + if (r == 0) { + fill_stat(in, attr); + _ll_get(in.get()); + } + tout(cct) << attr->st_ino << std::endl; + ldout(cct, 3) << "ll_symlink " << vparent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::ll_symlinkx(Inode *parent, const char *name, const char *value, + Inode **out, struct ceph_statx *stx, unsigned want, + unsigned flags, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value + << dendl; + tout(cct) << "ll_symlinkx" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << value << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _symlink(parent, name, value, perms, "", &in); + if (r == 0) { + fill_statx(in, statx_to_mask(flags, want), stx); + _ll_get(in.get()); + } + tout(cct) << stx->stx_ino << std::endl; + ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name + << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm) +{ + ldout(cct, 8) << "_unlink(" << dir->ino << " " << name + << " uid " << perm.uid() << " gid " << perm.gid() + << ")" << dendl; + + if (dir->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + + InodeRef otherin; + Inode *in; + Dentry *de = get_or_create(dir, name); + req->set_dentry(de); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + int res = _lookup(dir, name, 0, &otherin, perm); + if (res < 0) { + put_request(req); + return res; + } + + in = otherin.get(); + req->set_other_inode(in); + in->break_all_delegs(); + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + req->set_inode(dir); + + res = make_request(req, perm); + + trim_cache(); + ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl; + return res; +} + +int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl; + tout(cct) << "ll_unlink" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_delete(in, name, perm); + if (r < 0) + return r; + } + return _unlink(in, name, perm); +} + +int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms) +{ + ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid " + << perms.uid() << " gid " << perms.gid() << ")" << dendl; + + if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) { + return -CEPHFS_EROFS; + } + + int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR; + MetaRequest *req = new MetaRequest(op); + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + InodeRef in; + + Dentry *de = get_or_create(dir, name); + if (op == CEPH_MDS_OP_RMDIR) + req->set_dentry(de); + else + de->get(); + + int res = _lookup(dir, name, 0, &in, perms); + if (res < 0) { + put_request(req); + return res; + } + + if (op == CEPH_MDS_OP_RMSNAP) { + unlink(de, true, true); + de->put(); + } + req->set_other_inode(in.get()); + + res = make_request(req, perms); + + trim_cache(); + ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl; + return res; +} + +int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl; + tout(cct) << "ll_rmdir" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_delete(in, name, perms); + if (r < 0) + return r; + } + + return _rmdir(in, name, perms); +} + +int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name) +{ + ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to " + << todir->ino << " " << toname + << " uid " << perm.uid() << " gid " << perm.gid() << ")" + << dendl; + + if (fromdir->snapid != todir->snapid) + return -CEPHFS_EXDEV; + + int op = CEPH_MDS_OP_RENAME; + if (fromdir->snapid != CEPH_NOSNAP) { + if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -CEPHFS_EROFS; + } + + // don't allow cross-quota renames + if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) { + Inode *fromdir_root = + fromdir->quota.is_enabled() ? fromdir : get_quota_root(fromdir, perm); + Inode *todir_root = + todir->quota.is_enabled() ? todir : get_quota_root(todir, perm); + if (fromdir_root != todir_root) { + return -CEPHFS_EXDEV; + } + } + + InodeRef target; + MetaRequest *req = new MetaRequest(op); + + filepath from; + fromdir->make_nosnap_relative_path(from); + from.push_dentry(fromname); + filepath to; + todir->make_nosnap_relative_path(to); + to.push_dentry(toname); + req->set_filepath(to); + req->set_filepath2(from); + req->set_alternate_name(std::move(alternate_name)); + + Dentry *oldde = get_or_create(fromdir, fromname); + Dentry *de = get_or_create(todir, toname); + + int res; + if (op == CEPH_MDS_OP_RENAME) { + req->set_old_dentry(oldde); + req->old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->old_dentry_unless = CEPH_CAP_FILE_EXCL; + + de->is_renaming = true; + req->set_dentry(de); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + InodeRef oldin, otherin; + res = _lookup(fromdir, fromname, 0, &oldin, perm, nullptr, true); + if (res < 0) + goto fail; + + Inode *oldinode = oldin.get(); + oldinode->break_all_delegs(); + req->set_old_inode(oldinode); + req->old_inode_drop = CEPH_CAP_LINK_SHARED; + + res = _lookup(todir, toname, 0, &otherin, perm, nullptr, true); + switch (res) { + case 0: + { + Inode *in = otherin.get(); + req->set_other_inode(in); + in->break_all_delegs(); + } + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + break; + case -CEPHFS_ENOENT: + break; + default: + goto fail; + } + + req->set_inode(todir); + } else { + // renamesnap reply contains no tracedn, so we need to invalidate + // dentry manually + unlink(oldde, true, true); + unlink(de, true, true); + + req->set_inode(todir); + } + + res = make_request(req, perm, &target); + ldout(cct, 10) << "rename result is " << res << dendl; + + // if rename fails it will miss waking up the waiters + if (op == CEPH_MDS_OP_RENAME && de->is_renaming) { + de->is_renaming = false; + signal_cond_list(waiting_for_rename); + } + + // renamed item from our cache + + trim_cache(); + ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_rename(Inode *parent, const char *name, Inode *newparent, + const char *newname, const UserPerm& perm) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + vinodeno_t vnewparent = _get_vino(newparent); + + ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to " + << vnewparent << " " << newname << dendl; + tout(cct) << "ll_rename" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << vnewparent.ino.val << std::endl; + tout(cct) << newname << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_delete(parent, name, perm); + if (r < 0) + return r; + r = may_delete(newparent, newname, perm); + if (r < 0 && r != -CEPHFS_ENOENT) + return r; + } + + return _rename(parent, name, newparent, newname, perm, ""); +} + +int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp) +{ + ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname + << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl; + + if (strlen(newname) > NAME_MAX) + return -CEPHFS_ENAMETOOLONG; + + if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) { + return -CEPHFS_EROFS; + } + if (is_quota_files_exceeded(dir, perm)) { + return -CEPHFS_EDQUOT; + } + + in->break_all_delegs(); + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK); + + filepath path(newname, dir->ino); + req->set_filepath(path); + req->set_alternate_name(std::move(alternate_name)); + filepath existing(in->ino); + req->set_filepath2(existing); + + req->set_inode(dir); + req->inode_drop = CEPH_CAP_FILE_SHARED; + req->inode_unless = CEPH_CAP_FILE_EXCL; + + Dentry *de = get_or_create(dir, newname); + req->set_dentry(de); + + int res = make_request(req, perm, inp); + ldout(cct, 10) << "link result is " << res << dendl; + + trim_cache(); + ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl; + return res; +} + +int Client::ll_link(Inode *in, Inode *newparent, const char *newname, + const UserPerm& perm) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + vinodeno_t vnewparent = _get_vino(newparent); + + ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " << + newname << dendl; + tout(cct) << "ll_link" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << vnewparent << std::endl; + tout(cct) << newname << std::endl; + + InodeRef target; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + if (S_ISDIR(in->mode)) + return -CEPHFS_EPERM; + + int r = may_hardlink(in, perm); + if (r < 0) + return r; + + r = may_create(newparent, perm); + if (r < 0) + return r; + } + + return _link(in, newparent, newname, perm, "", &target); +} + +int Client::ll_num_osds(void) +{ + std::scoped_lock lock(client_lock); + return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds)); +} + +int Client::ll_osdaddr(int osd, uint32_t *addr) +{ + std::scoped_lock lock(client_lock); + + entity_addr_t g; + bool exists = objecter->with_osdmap([&](const OSDMap& o) { + if (!o.exists(osd)) + return false; + g = o.get_addrs(osd).front(); + return true; + }); + if (!exists) + return -1; + uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr; + *addr = ntohl(nb_addr); + return 0; +} + +uint32_t Client::ll_stripe_unit(Inode *in) +{ + std::scoped_lock lock(client_lock); + return in->layout.stripe_unit; +} + +uint64_t Client::ll_snap_seq(Inode *in) +{ + std::scoped_lock lock(client_lock); + return in->snaprealm->seq; +} + +int Client::ll_file_layout(Inode *in, file_layout_t *layout) +{ + std::scoped_lock lock(client_lock); + *layout = in->layout; + return 0; +} + +int Client::ll_file_layout(Fh *fh, file_layout_t *layout) +{ + return ll_file_layout(fh->inode.get(), layout); +} + +/* Currently we cannot take advantage of redundancy in reads, since we + would have to go through all possible placement groups (a + potentially quite large number determined by a hash), and use CRUSH + to calculate the appropriate set of OSDs for each placement group, + then index into that. An array with one entry per OSD is much more + tractable and works for demonstration purposes. */ + +int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno, + file_layout_t* layout) +{ + std::scoped_lock lock(client_lock); + + inodeno_t ino = in->ino; + uint32_t object_size = layout->object_size; + uint32_t su = layout->stripe_unit; + uint32_t stripe_count = layout->stripe_count; + uint64_t stripes_per_object = object_size / su; + uint64_t stripeno = 0, stripepos = 0; + + if(stripe_count) { + stripeno = blockno / stripe_count; // which horizontal stripe (Y) + stripepos = blockno % stripe_count; // which object in the object set (X) + } + uint64_t objectsetno = stripeno / stripes_per_object; // which object set + uint64_t objectno = objectsetno * stripe_count + stripepos; // object id + + object_t oid = file_object_t(ino, objectno); + return objecter->with_osdmap([&](const OSDMap& o) { + ceph_object_layout olayout = + o.file_to_object_layout(oid, *layout); + pg_t pg = (pg_t)olayout.ol_pgid; + vector<int> osds; + int primary; + o.pg_to_acting_osds(pg, &osds, &primary); + return primary; + }); +} + +/* Return the offset of the block, internal to the object */ + +uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno) +{ + std::scoped_lock lock(client_lock); + file_layout_t *layout=&(in->layout); + uint32_t object_size = layout->object_size; + uint32_t su = layout->stripe_unit; + uint64_t stripes_per_object = object_size / su; + + return (blockno % stripes_per_object) * su; +} + +int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_opendir " << vino << dendl; + tout(cct) << "ll_opendir" << std::endl; + tout(cct) << vino.ino.val << std::endl; + + std::scoped_lock lock(client_lock); + + if (!fuse_default_permissions) { + int r = may_open(in, flags, perms); + if (r < 0) + return r; + } + + int r = _opendir(in, dirpp, perms); + tout(cct) << (uintptr_t)*dirpp << std::endl; + + ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")" + << dendl; + return r; +} + +int Client::ll_releasedir(dir_result_t *dirp) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_releasedir " << dirp << dendl; + tout(cct) << "ll_releasedir" << std::endl; + tout(cct) << (uintptr_t)dirp << std::endl; + + std::scoped_lock lock(client_lock); + + _closedir(dirp); + return 0; +} + +int Client::ll_fsyncdir(dir_result_t *dirp) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl; + tout(cct) << "ll_fsyncdir" << std::endl; + tout(cct) << (uintptr_t)dirp << std::endl; + + std::scoped_lock lock(client_lock); + return _fsync(dirp->inode.get(), false); +} + +int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms) +{ + ceph_assert(!(flags & O_CREAT)); + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl; + tout(cct) << "ll_open" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << ceph_flags_sys2wire(flags) << std::endl; + + std::scoped_lock lock(client_lock); + + int r; + if (!fuse_default_permissions) { + r = may_open(in, flags, perms); + if (r < 0) + goto out; + } + + r = _open(in, flags, 0, fhp /* may be NULL */, perms); + + out: + Fh *fhptr = fhp ? *fhp : NULL; + if (fhptr) { + ll_unclosed_fh_set.insert(fhptr); + } + tout(cct) << (uintptr_t)fhptr << std::endl; + ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << + " = " << r << " (" << fhptr << ")" << dendl; + return r; +} + +int Client::_ll_create(Inode *parent, const char *name, mode_t mode, + int flags, InodeRef *in, int caps, Fh **fhp, + const UserPerm& perms) +{ + *fhp = NULL; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct << + mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid() + << ", gid " << perms.gid() << dendl; + tout(cct) << "ll_create" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << ceph_flags_sys2wire(flags) << std::endl; + + bool created = false; + int r = _lookup(parent, name, caps, in, perms); + + if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL)) + return -CEPHFS_EEXIST; + + if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) { + if (!fuse_default_permissions) { + r = may_create(parent, perms); + if (r < 0) + goto out; + } + r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created, + perms, ""); + if (r < 0) + goto out; + } + + if (r < 0) + goto out; + + ceph_assert(*in); + + ldout(cct, 20) << "_ll_create created = " << created << dendl; + if (!created) { + if (!fuse_default_permissions) { + r = may_open(in->get(), flags, perms); + if (r < 0) { + if (*fhp) { + int release_r = _release_fh(*fhp); + ceph_assert(release_r == 0); // during create, no async data ops should have happened + } + goto out; + } + } + if (*fhp == NULL) { + r = _open(in->get(), flags, mode, fhp, perms); + if (r < 0) + goto out; + } + } + +out: + if (*fhp) { + ll_unclosed_fh_set.insert(*fhp); + } + + #ifdef _WIN32 + uint64_t ino = 0; + #else + ino_t ino = 0; + #endif + if (r >= 0) { + Inode *inode = in->get(); + if (use_faked_inos()) + ino = inode->faked_ino; + else + ino = inode->ino; + } + + tout(cct) << (uintptr_t)*fhp << std::endl; + tout(cct) << ino << std::endl; + ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct << + mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" << + *fhp << " " << hex << ino << dec << ")" << dendl; + + return r; +} + +int Client::ll_create(Inode *parent, const char *name, mode_t mode, + int flags, struct stat *attr, Inode **outp, Fh **fhp, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + InodeRef in; + + int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL, + fhp, perms); + if (r >= 0) { + ceph_assert(in); + + // passing an Inode in outp requires an additional ref + if (outp) { + _ll_get(in.get()); + *outp = in.get(); + } + fill_stat(in, attr); + } else { + attr->st_ino = 0; + } + + return r; +} + +int Client::ll_createx(Inode *parent, const char *name, mode_t mode, + int oflags, Inode **outp, Fh **fhp, + struct ceph_statx *stx, unsigned want, unsigned lflags, + const UserPerm& perms) +{ + unsigned caps = statx_to_mask(lflags, want); + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + InodeRef in; + + int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms); + if (r >= 0) { + ceph_assert(in); + + // passing an Inode in outp requires an additional ref + if (outp) { + _ll_get(in.get()); + *outp = in.get(); + } + fill_statx(in, caps, stx); + } else { + stx->stx_ino = 0; + stx->stx_mask = 0; + } + + return r; +} + +loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << "ll_lseek" << std::endl; + tout(cct) << offset << std::endl; + tout(cct) << whence << std::endl; + + std::scoped_lock lock(client_lock); + return _lseek(fh, offset, whence); +} + +int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl; + tout(cct) << "ll_read" << std::endl; + tout(cct) << (uintptr_t)fh << std::endl; + tout(cct) << off << std::endl; + tout(cct) << len << std::endl; + + /* We can't return bytes written larger than INT_MAX, clamp len to that */ + len = std::min(len, (loff_t)INT_MAX); + std::scoped_lock lock(client_lock); + + int r = _read(fh, off, len, bl); + ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r + << dendl; + return r; +} + +int Client::ll_read_block(Inode *in, uint64_t blockid, + char *buf, + uint64_t offset, + uint64_t length, + file_layout_t* layout) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + vinodeno_t vino = _get_vino(in); + object_t oid = file_object_t(vino.ino, blockid); + C_SaferCond onfinish; + bufferlist bl; + + objecter->read(oid, + object_locator_t(layout->pool_id), + offset, + length, + vino.snapid, + &bl, + CEPH_OSD_FLAG_READ, + &onfinish); + + int r = onfinish.wait(); + if (r >= 0) { + bl.begin().copy(bl.length(), buf); + r = bl.length(); + } + + return r; +} + +/* It appears that the OSD doesn't return success unless the entire + buffer was written, return the write length on success. */ + +int Client::ll_write_block(Inode *in, uint64_t blockid, + char* buf, uint64_t offset, + uint64_t length, file_layout_t* layout, + uint64_t snapseq, uint32_t sync) +{ + vinodeno_t vino = ll_get_vino(in); + int r = 0; + std::unique_ptr<C_SaferCond> onsafe = nullptr; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + if (length == 0) { + return -CEPHFS_EINVAL; + } + if (true || sync) { + /* if write is stable, the epilogue is waiting on + * flock */ + onsafe.reset(new C_SaferCond("Client::ll_write_block flock")); + } + object_t oid = file_object_t(vino.ino, blockid); + SnapContext fakesnap; + ceph::bufferlist bl; + if (length > 0) { + bl.push_back(buffer::copy(buf, length)); + } + + ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid + << dendl; + + fakesnap.seq = snapseq; + + /* lock just in time */ + objecter->write(oid, + object_locator_t(layout->pool_id), + offset, + length, + fakesnap, + bl, + ceph::real_clock::now(), + 0, + onsafe.get()); + + if (nullptr != onsafe) { + r = onsafe->wait(); + } + + if (r < 0) { + return r; + } else { + return length; + } +} + +int Client::ll_commit_blocks(Inode *in, + uint64_t offset, + uint64_t length) +{ + /* + BarrierContext *bctx; + vinodeno_t vino = _get_vino(in); + uint64_t ino = vino.ino; + + ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from " + << offset << " to " << length << dendl; + + if (length == 0) { + return -CEPHFS_EINVAL; + } + + std::scoped_lock lock(client_lock); + map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino); + if (p != barriers.end()) { + barrier_interval civ(offset, offset + length); + p->second->commit_barrier(civ); + } + */ + return 0; +} + +int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data) +{ + ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off << + "~" << len << dendl; + tout(cct) << "ll_write" << std::endl; + tout(cct) << (uintptr_t)fh << std::endl; + tout(cct) << off << std::endl; + tout(cct) << len << std::endl; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + /* We can't return bytes written larger than INT_MAX, clamp len to that */ + len = std::min(len, (loff_t)INT_MAX); + std::scoped_lock lock(client_lock); + + int r = _write(fh, off, len, data, NULL, 0); + ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r + << dendl; + return r; +} + +int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock cl(client_lock); + return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false); +} + +int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock cl(client_lock); + return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false); +} + +int Client::ll_flush(Fh *fh) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl; + tout(cct) << "ll_flush" << std::endl; + tout(cct) << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + return _flush(fh); +} + +int Client::ll_fsync(Fh *fh, bool syncdataonly) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl; + tout(cct) << "ll_fsync" << std::endl; + tout(cct) << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + int r = _fsync(fh, syncdataonly); + if (r) { + // If we're returning an error, clear it from the FH + fh->take_async_err(); + } + return r; +} + +int Client::ll_sync_inode(Inode *in, bool syncdataonly) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl; + tout(cct) << "ll_sync_inode" << std::endl; + tout(cct) << (uintptr_t)in << std::endl; + + std::scoped_lock lock(client_lock); + return _fsync(in, syncdataonly); +} + +int Client::clear_suid_sgid(Inode *in, const UserPerm& perms, bool defer) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " defer " + << defer << dendl; + + if (!in->is_file()) { + return 0; + } + + if (likely(!(in->mode & (S_ISUID|S_ISGID)))) { + return 0; + } + + if (perms.uid() == 0 || perms.uid() == in->uid) { + return 0; + } + + int mask = 0; + + // always drop the suid + if (unlikely(in->mode & S_ISUID)) { + mask = CEPH_SETATTR_KILL_SUID; + } + + // remove the sgid if S_IXUGO is set or the inode is + // is not in the caller's group list. + if ((in->mode & S_ISGID) && + ((in->mode & S_IXUGO) || !perms.gid_in_groups(in->gid))) { + mask |= CEPH_SETATTR_KILL_SGID; + } + + ldout(cct, 20) << __func__ << " mask " << mask << dendl; + if (defer) { + return mask; + } + + struct ceph_statx stx = { 0 }; + return __setattrx(in, &stx, mask, perms); +} + +int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + if (offset < 0 || length <= 0) + return -CEPHFS_EINVAL; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -CEPHFS_EOPNOTSUPP; + + if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) + return -CEPHFS_EOPNOTSUPP; + + Inode *in = fh->inode.get(); + + if (objecter->osdmap_pool_full(in->layout.pool_id) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { + return -CEPHFS_ENOSPC; + } + + if (in->snapid != CEPH_NOSNAP) + return -CEPHFS_EROFS; + + if ((fh->mode & CEPH_FILE_MODE_WR) == 0) + return -CEPHFS_EBADF; + + uint64_t size = offset + length; + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && + size > in->size && + is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) { + return -CEPHFS_EDQUOT; + } + + int have; + int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1); + if (r < 0) + return r; + + r = clear_suid_sgid(in, fh->actor_perms); + if (r < 0) { + put_cap_ref(in, CEPH_CAP_FILE_WR); + return r; + } + + std::unique_ptr<C_SaferCond> onuninline = nullptr; + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (in->inline_version < CEPH_INLINE_NONE && + (have & CEPH_CAP_FILE_BUFFER)) { + bufferlist bl; + auto inline_iter = in->inline_data.cbegin(); + int len = in->inline_data.length(); + if (offset < len) { + if (offset > 0) + inline_iter.copy(offset, bl); + int size = length; + if (offset + size > len) + size = len - offset; + if (size > 0) + bl.append_zero(size); + if (offset + size < len) { + inline_iter += size; + inline_iter.copy(len - offset - size, bl); + } + in->inline_data = bl; + in->inline_version++; + } + in->mtime = in->ctime = ceph_clock_now(); + in->change_attr++; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + } else { + if (in->inline_version < CEPH_INLINE_NONE) { + onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock")); + uninline_data(in, onuninline.get()); + } + + C_SaferCond onfinish("Client::_punch_hole flock"); + + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + _invalidate_inode_cache(in, offset, length); + filer->zero(in->ino, &in->layout, + in->snaprealm->get_snap_context(), + offset, length, + ceph::real_clock::now(), + 0, true, &onfinish); + in->mtime = in->ctime = ceph_clock_now(); + in->change_attr++; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + + client_lock.unlock(); + onfinish.wait(); + client_lock.lock(); + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + } + } else if (!(mode & FALLOC_FL_KEEP_SIZE)) { + uint64_t size = offset + length; + if (size > in->size) { + in->size = size; + in->mtime = in->ctime = ceph_clock_now(); + in->change_attr++; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + + if (is_quota_bytes_approaching(in, fh->actor_perms)) { + check_caps(in, CHECK_CAPS_NODELAY); + } else if (is_max_size_approaching(in)) { + check_caps(in, 0); + } + } + } + + if (nullptr != onuninline) { + client_lock.unlock(); + int ret = onuninline->wait(); + client_lock.lock(); + + if (ret >= 0 || ret == -CEPHFS_ECANCELED) { + in->inline_data.clear(); + in->inline_version = CEPH_INLINE_NONE; + in->mark_caps_dirty(CEPH_CAP_FILE_WR); + check_caps(in, 0); + } else + r = ret; + } + + put_cap_ref(in, CEPH_CAP_FILE_WR); + return r; +} + +int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl; + tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl; + tout(cct) << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + return _fallocate(fh, mode, offset, length); +} + +int Client::fallocate(int fd, int mode, loff_t offset, loff_t length) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + tout(cct) << __func__ << " " << fd << mode << " " << offset << " " << length << std::endl; + + std::scoped_lock lock(client_lock); + Fh *fh = get_filehandle(fd); + if (!fh) + return -CEPHFS_EBADF; +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -CEPHFS_EBADF; +#endif + return _fallocate(fh, mode, offset, length); +} + +int Client::ll_release(Fh *fh) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " << + dendl; + tout(cct) << __func__ << " (fh)" << std::endl; + tout(cct) << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + + if (ll_unclosed_fh_set.count(fh)) + ll_unclosed_fh_set.erase(fh); + return _release_fh(fh); +} + +int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl; + tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + return _getlk(fh, fl, owner); +} + +int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl; + tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + return _setlk(fh, fl, owner, sleep); +} + +int Client::ll_flock(Fh *fh, int cmd, uint64_t owner) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl; + tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl; + + std::scoped_lock lock(client_lock); + return _flock(fh, cmd, owner); +} + +int Client::set_deleg_timeout(uint32_t timeout) +{ + std::scoped_lock lock(client_lock); + + /* + * The whole point is to prevent blocklisting so we must time out the + * delegation before the session autoclose timeout kicks in. + */ + if (timeout >= mdsmap->get_session_autoclose()) + return -CEPHFS_EINVAL; + + deleg_timeout = timeout; + return 0; +} + +int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv) +{ + int ret = -CEPHFS_EINVAL; + + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Inode *inode = fh->inode.get(); + + switch(cmd) { + case CEPH_DELEGATION_NONE: + inode->unset_deleg(fh); + ret = 0; + break; + default: + try { + ret = inode->set_deleg(fh, cmd, cb, priv); + } catch (std::bad_alloc&) { + ret = -CEPHFS_ENOMEM; + } + break; + } + return ret; +} + +class C_Client_RequestInterrupt : public Context { +private: + Client *client; + MetaRequest *req; +public: + C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) { + req->get(); + } + void finish(int r) override { + std::scoped_lock l(client->client_lock); + ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK); + client->_interrupt_filelock(req); + client->put_request(req); + } +}; + +void Client::ll_interrupt(void *d) +{ + MetaRequest *req = static_cast<MetaRequest*>(d); + ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl; + tout(cct) << __func__ << " tid " << req->get_tid() << std::endl; + interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req)); +} + +// ========================================= +// layout + +// expose file layouts + +int Client::describe_layout(const char *relpath, file_layout_t *lp, + const UserPerm& perms) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + + *lp = in->layout; + + ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl; + return 0; +} + +int Client::fdescribe_layout(int fd, file_layout_t *lp) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + Inode *in = f->inode.get(); + + *lp = in->layout; + + ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl; + return 0; +} + +int64_t Client::get_default_pool_id() +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + /* first data pool is the default */ + return mdsmap->get_first_data_pool(); +} + +// expose osdmap + +int64_t Client::get_pool_id(const char *pool_name) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name), + pool_name); +} + +string Client::get_pool_name(int64_t pool) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return string(); + + std::scoped_lock lock(client_lock); + + return objecter->with_osdmap([pool](const OSDMap& o) { + return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string(); + }); +} + +int Client::get_pool_replication(int64_t pool) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + return objecter->with_osdmap([pool](const OSDMap& o) { + return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT; + }); +} + +int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + Inode *in = f->inode.get(); + + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents); + ceph_assert(extents.size() == 1); + + objecter->with_osdmap([&](const OSDMap& o) { + pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc); + o.pg_to_acting_osds(pg, osds); + }); + + if (osds.empty()) + return -CEPHFS_EINVAL; + + /* + * Return the remainder of the extent (stripe unit) + * + * If length = 1 is passed to Striper::file_to_extents we get a single + * extent back, but its length is one so we still need to compute the length + * to the end of the stripe unit. + * + * If length = su then we may get 1 or 2 objects back in the extents vector + * which would have to be examined. Even then, the offsets are local to the + * object, so matching up to the file offset is extra work. + * + * It seems simpler to stick with length = 1 and manually compute the + * remainder. + */ + if (len) { + uint64_t su = in->layout.stripe_unit; + *len = su - (off % su); + } + + return 0; +} + +int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + if (id < 0) + return -CEPHFS_EINVAL; + return objecter->with_osdmap([&](const OSDMap& o) { + return o.crush->get_full_location_ordered(id, path); + }); +} + +int Client::get_file_stripe_address(int fd, loff_t offset, + vector<entity_addr_t>& address) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + Inode *in = f->inode.get(); + + // which object? + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1, + in->truncate_size, extents); + ceph_assert(extents.size() == 1); + + // now we have the object and its 'layout' + return objecter->with_osdmap([&](const OSDMap& o) { + pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc); + vector<int> osds; + o.pg_to_acting_osds(pg, osds); + if (osds.empty()) + return -CEPHFS_EINVAL; + for (unsigned i = 0; i < osds.size(); i++) { + entity_addr_t addr = o.get_addrs(osds[i]).front(); + address.push_back(addr); + } + return 0; + }); +} + +int Client::get_osd_addr(int osd, entity_addr_t& addr) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + return objecter->with_osdmap([&](const OSDMap& o) { + if (!o.exists(osd)) + return -CEPHFS_ENOENT; + + addr = o.get_addrs(osd).front(); + return 0; + }); +} + +int Client::enumerate_layout(int fd, vector<ObjectExtent>& result, + loff_t length, loff_t offset) +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + Fh *f = get_filehandle(fd); + if (!f) + return -CEPHFS_EBADF; + Inode *in = f->inode.get(); + + // map to a list of extents + Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result); + + ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl; + return 0; +} + + +/* find an osd with the same ip. -CEPHFS_ENXIO if none. */ +int Client::get_local_osd() +{ + RWRef_t mref_reader(mount_state, CLIENT_MOUNTING); + if (!mref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + std::scoped_lock lock(client_lock); + + objecter->with_osdmap([this](const OSDMap& o) { + if (o.get_epoch() != local_osd_epoch) { + local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front()); + local_osd_epoch = o.get_epoch(); + } + }); + return local_osd; +} + + + + + + +// =============================== + +void Client::ms_handle_connect(Connection *con) +{ + ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl; +} + +bool Client::ms_handle_reset(Connection *con) +{ + ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl; + return false; +} + +void Client::ms_handle_remote_reset(Connection *con) +{ + std::scoped_lock lock(client_lock); + ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl; + switch (con->get_peer_type()) { + case CEPH_ENTITY_TYPE_MDS: + { + // kludge to figure out which mds this is; fixme with a Connection* state + mds_rank_t mds = MDS_RANK_NONE; + MetaSessionRef s = NULL; + for (auto &p : mds_sessions) { + if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) { + mds = p.first; + s = p.second; + } + } + if (mds >= 0) { + ceph_assert(s != NULL); + switch (s->state) { + case MetaSession::STATE_CLOSING: + ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl; + _closed_mds_session(s.get()); + break; + + case MetaSession::STATE_OPENING: + { + ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl; + list<Context*> waiters; + waiters.swap(s->waiting_for_open); + _closed_mds_session(s.get()); + auto news = _get_or_open_mds_session(mds); + news->waiting_for_open.swap(waiters); + } + break; + + case MetaSession::STATE_OPEN: + { + objecter->maybe_request_map(); /* to check if we are blocklisted */ + if (cct->_conf.get_val<bool>("client_reconnect_stale")) { + ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl; + _closed_mds_session(s.get()); + } else { + ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl; + s->state = MetaSession::STATE_STALE; + } + } + break; + + case MetaSession::STATE_NEW: + case MetaSession::STATE_CLOSED: + default: + break; + } + } + } + break; + } +} + +bool Client::ms_handle_refused(Connection *con) +{ + ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl; + return false; +} + +Inode *Client::get_quota_root(Inode *in, const UserPerm& perms, quota_max_t type) +{ + Inode *quota_in = root_ancestor; + SnapRealm *realm = in->snaprealm; + + if (!cct->_conf.get_val<bool>("client_quota")) + return NULL; + + while (realm) { + ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl; + if (realm->ino != in->ino) { + auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP)); + if (p == inode_map.end()) + break; + + if (p->second->quota.is_enabled(type)) { + quota_in = p->second; + break; + } + } + realm = realm->pparent; + } + ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl; + return quota_in; +} + +/** + * Traverse quota ancestors of the Inode, return true + * if any of them passes the passed function + */ +bool Client::check_quota_condition(Inode *in, const UserPerm& perms, + std::function<bool (const Inode &in)> test) +{ + if (!cct->_conf.get_val<bool>("client_quota")) + return false; + + while (true) { + ceph_assert(in != NULL); + if (test(*in)) { + return true; + } + + if (in == root_ancestor) { + // We're done traversing, drop out + return false; + } else { + // Continue up the tree + in = get_quota_root(in, perms); + } + } + + return false; +} + +bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms) +{ + return check_quota_condition(in, perms, + [](const Inode &in) { + return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files; + }); +} + +bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes, + const UserPerm& perms) +{ + return check_quota_condition(in, perms, + [&new_bytes](const Inode &in) { + return in.quota.max_bytes && (in.rstat.rbytes + new_bytes) + > in.quota.max_bytes; + }); +} + +bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms) +{ + ceph_assert(in->size >= in->reported_size); + const uint64_t size = in->size - in->reported_size; + return check_quota_condition(in, perms, + [&size](const Inode &in) { + if (in.quota.max_bytes) { + if (in.rstat.rbytes >= in.quota.max_bytes) { + return true; + } + + const uint64_t space = in.quota.max_bytes - in.rstat.rbytes; + return (space >> 4) < size; + } else { + return false; + } + }); +} + +enum { + POOL_CHECKED = 1, + POOL_CHECKING = 2, + POOL_READ = 4, + POOL_WRITE = 8, +}; + +int Client::check_pool_perm(Inode *in, int need) +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + if (!cct->_conf->client_check_pool_perm) + return 0; + + /* Only need to do this for regular files */ + if (!in->is_file()) + return 0; + + int64_t pool_id = in->layout.pool_id; + std::string pool_ns = in->layout.pool_ns; + std::pair<int64_t, std::string> perm_key(pool_id, pool_ns); + int have = 0; + while (true) { + auto it = pool_perms.find(perm_key); + if (it == pool_perms.end()) + break; + if (it->second == POOL_CHECKING) { + // avoid concurrent checkings + wait_on_list(waiting_for_pool_perm); + } else { + have = it->second; + ceph_assert(have & POOL_CHECKED); + break; + } + } + + if (!have) { + if (in->snapid != CEPH_NOSNAP) { + // pool permission check needs to write to the first object. But for snapshot, + // head of the first object may have already been deleted. To avoid creating + // orphan object, skip the check for now. + return 0; + } + + pool_perms[perm_key] = POOL_CHECKING; + + char oid_buf[32]; + snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino); + object_t oid = oid_buf; + + SnapContext nullsnapc; + + C_SaferCond rd_cond; + ObjectOperation rd_op; + rd_op.stat(nullptr, nullptr, nullptr); + + objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op, + nullsnapc, ceph::real_clock::now(), 0, &rd_cond); + + C_SaferCond wr_cond; + ObjectOperation wr_op; + wr_op.create(true); + + objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op, + nullsnapc, ceph::real_clock::now(), 0, &wr_cond); + + client_lock.unlock(); + int rd_ret = rd_cond.wait(); + int wr_ret = wr_cond.wait(); + client_lock.lock(); + + bool errored = false; + + if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT) + have |= POOL_READ; + else if (rd_ret != -CEPHFS_EPERM) { + ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns + << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl; + errored = true; + } + + if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST) + have |= POOL_WRITE; + else if (wr_ret != -CEPHFS_EPERM) { + ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns + << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl; + errored = true; + } + + if (errored) { + // Indeterminate: erase CHECKING state so that subsequent calls re-check. + // Raise EIO because actual error code might be misleading for + // userspace filesystem user. + pool_perms.erase(perm_key); + signal_cond_list(waiting_for_pool_perm); + return -CEPHFS_EIO; + } + + pool_perms[perm_key] = have | POOL_CHECKED; + signal_cond_list(waiting_for_pool_perm); + } + + if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) { + ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns + << " need " << ccap_string(need) << ", but no read perm" << dendl; + return -CEPHFS_EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) { + ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns + << " need " << ccap_string(need) << ", but no write perm" << dendl; + return -CEPHFS_EPERM; + } + + return 0; +} + +int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want) +{ + if (acl_type == POSIX_ACL) { + if (in->xattrs.count(ACL_EA_ACCESS)) { + const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS]; + + return posix_acl_permits(access_acl, in->uid, in->gid, perms, want); + } + } + return -CEPHFS_EAGAIN; +} + +int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms) +{ + if (acl_type == NO_ACL) + return 0; + + int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0); + if (r < 0) + goto out; + + if (acl_type == POSIX_ACL) { + if (in->xattrs.count(ACL_EA_ACCESS)) { + const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS]; + bufferptr acl(access_acl.c_str(), access_acl.length()); + r = posix_acl_access_chmod(acl, mode); + if (r < 0) + goto out; + r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms); + } else { + r = 0; + } + } +out: + ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl; + return r; +} + +int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl, + const UserPerm& perms) +{ + if (acl_type == NO_ACL) + return 0; + + if (S_ISLNK(*mode)) + return 0; + + int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0); + if (r < 0) + goto out; + + if (acl_type == POSIX_ACL) { + if (dir->xattrs.count(ACL_EA_DEFAULT)) { + map<string, bufferptr> xattrs; + + const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT]; + bufferptr acl(default_acl.c_str(), default_acl.length()); + r = posix_acl_inherit_mode(acl, mode); + if (r < 0) + goto out; + + if (r > 0) { + r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode); + if (r < 0) + goto out; + if (r > 0) + xattrs[ACL_EA_ACCESS] = acl; + } + + if (S_ISDIR(*mode)) + xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT]; + + r = xattrs.size(); + if (r > 0) + encode(xattrs, xattrs_bl); + } else { + if (umask_cb) + *mode &= ~umask_cb(callback_handle); + r = 0; + } + } +out: + ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl; + return r; +} + +void Client::set_filer_flags(int flags) +{ + std::scoped_lock l(client_lock); + ceph_assert(flags == 0 || + flags == CEPH_OSD_FLAG_LOCALIZE_READS); + objecter->add_global_op_flags(flags); +} + +void Client::clear_filer_flags(int flags) +{ + std::scoped_lock l(client_lock); + ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS); + objecter->clear_global_op_flag(flags); +} + +// called before mount +void Client::set_uuid(const std::string& uuid) +{ + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + ceph_assert(iref_reader.is_state_satisfied()); + + std::scoped_lock l(client_lock); + ceph_assert(!uuid.empty()); + + metadata["uuid"] = uuid; + _close_sessions(); +} + +// called before mount. 0 means infinite +void Client::set_session_timeout(unsigned timeout) +{ + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + ceph_assert(iref_reader.is_state_satisfied()); + + std::scoped_lock l(client_lock); + + metadata["timeout"] = stringify(timeout); +} + +// called before mount +int Client::start_reclaim(const std::string& uuid, unsigned flags, + const std::string& fs_name) +{ + RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED); + if (!iref_reader.is_state_satisfied()) + return -CEPHFS_ENOTCONN; + + if (uuid.empty()) + return -CEPHFS_EINVAL; + + std::unique_lock l(client_lock); + { + auto it = metadata.find("uuid"); + if (it != metadata.end() && it->second == uuid) + return -CEPHFS_EINVAL; + } + + int r = subscribe_mdsmap(fs_name); + if (r < 0) { + lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl; + return r; + } + + if (metadata.empty()) + populate_metadata(""); + + while (mdsmap->get_epoch() == 0) + wait_on_list(waiting_for_mdsmap); + + reclaim_errno = 0; + for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) { + if (!mdsmap->is_up(mds)) { + ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl; + wait_on_list(waiting_for_mdsmap); + continue; + } + + MetaSessionRef session; + if (!have_open_session(mds)) { + session = _get_or_open_mds_session(mds); + if (session->state == MetaSession::STATE_REJECTED) + return -CEPHFS_EPERM; + if (session->state != MetaSession::STATE_OPENING) { + // umounting? + return -CEPHFS_EINVAL; + } + ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl; + wait_on_context_list(session->waiting_for_open); + continue; + } + + session = mds_sessions.at(mds); + if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT)) + return -CEPHFS_EOPNOTSUPP; + + if (session->reclaim_state == MetaSession::RECLAIM_NULL || + session->reclaim_state == MetaSession::RECLAIMING) { + session->reclaim_state = MetaSession::RECLAIMING; + auto m = make_message<MClientReclaim>(uuid, flags); + session->con->send_message2(std::move(m)); + wait_on_list(waiting_for_reclaim); + } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) { + return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE; + } else { + mds++; + } + } + + // didn't find target session in any mds + if (reclaim_target_addrs.empty()) { + if (flags & CEPH_RECLAIM_RESET) + return -CEPHFS_ENOENT; + return -CEPHFS_ENOTRECOVERABLE; + } + + if (flags & CEPH_RECLAIM_RESET) + return 0; + + // use blocklist to check if target session was killed + // (config option mds_session_blocklist_on_evict needs to be true) + ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl; + bs::error_code ec; + l.unlock(); + objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]); + l.lock(); + + if (ec) + return ceph::from_error_code(ec); + + bool blocklisted = objecter->with_osdmap( + [this](const OSDMap &osd_map) -> bool { + return osd_map.is_blocklisted(reclaim_target_addrs); + }); + if (blocklisted) + return -CEPHFS_ENOTRECOVERABLE; + + metadata["reclaiming_uuid"] = uuid; + return 0; +} + +void Client::finish_reclaim() +{ + auto it = metadata.find("reclaiming_uuid"); + if (it == metadata.end()) { + for (auto &p : mds_sessions) + p.second->reclaim_state = MetaSession::RECLAIM_NULL; + return; + } + + for (auto &p : mds_sessions) { + p.second->reclaim_state = MetaSession::RECLAIM_NULL; + auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH); + p.second->con->send_message2(std::move(m)); + } + + metadata["uuid"] = it->second; + metadata.erase(it); +} + +void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply) +{ + mds_rank_t from = mds_rank_t(reply->get_source().num()); + ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl; + + std::scoped_lock cl(client_lock); + auto session = _get_mds_session(from, reply->get_connection().get()); + if (!session) { + ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl; + return; + } + + if (reply->get_result() >= 0) { + session->reclaim_state = MetaSession::RECLAIM_OK; + if (reply->get_epoch() > reclaim_osd_epoch) + reclaim_osd_epoch = reply->get_epoch(); + if (!reply->get_addrs().empty()) + reclaim_target_addrs = reply->get_addrs(); + } else { + session->reclaim_state = MetaSession::RECLAIM_FAIL; + reclaim_errno = reply->get_result(); + } + + signal_cond_list(waiting_for_reclaim); +} + +/** + * This is included in cap release messages, to cause + * the MDS to wait until this OSD map epoch. It is necessary + * in corner cases where we cancel RADOS ops, so that + * nobody else tries to do IO to the same objects in + * the same epoch as the cancelled ops. + */ +void Client::set_cap_epoch_barrier(epoch_t e) +{ + ldout(cct, 5) << __func__ << " epoch = " << e << dendl; + cap_epoch_barrier = e; +} + +const char** Client::get_tracked_conf_keys() const +{ + static const char* keys[] = { + "client_cache_size", + "client_cache_mid", + "client_acl_type", + "client_deleg_timeout", + "client_deleg_break_on_open", + "client_oc_size", + "client_oc_max_objects", + "client_oc_max_dirty", + "client_oc_target_dirty", + "client_oc_max_dirty_age", + "client_caps_release_delay", + "client_mount_timeout", + NULL + }; + return keys; +} + +void Client::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + std::scoped_lock lock(client_lock); + + if (changed.count("client_cache_mid")) { + lru.lru_set_midpoint(cct->_conf->client_cache_mid); + } + if (changed.count("client_acl_type")) { + acl_type = NO_ACL; + if (cct->_conf->client_acl_type == "posix_acl") + acl_type = POSIX_ACL; + } + if (changed.count("client_oc_size")) { + objectcacher->set_max_size(cct->_conf->client_oc_size); + } + if (changed.count("client_oc_max_objects")) { + objectcacher->set_max_objects(cct->_conf->client_oc_max_objects); + } + if (changed.count("client_oc_max_dirty")) { + objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty); + } + if (changed.count("client_oc_target_dirty")) { + objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty); + } + if (changed.count("client_oc_max_dirty_age")) { + objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age); + } + if (changed.count("client_collect_and_send_global_metrics")) { + _collect_and_send_global_metrics = cct->_conf.get_val<bool>( + "client_collect_and_send_global_metrics"); + } + if (changed.count("client_caps_release_delay")) { + caps_release_delay = cct->_conf.get_val<std::chrono::seconds>( + "client_caps_release_delay"); + } + if (changed.count("client_mount_timeout")) { + mount_timeout = cct->_conf.get_val<std::chrono::seconds>( + "client_mount_timeout"); + } +} + +void intrusive_ptr_add_ref(Inode *in) +{ + in->iget(); +} + +void intrusive_ptr_release(Inode *in) +{ + in->client->put_inode(in); +} + +mds_rank_t Client::_get_random_up_mds() const +{ + ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + + std::set<mds_rank_t> up; + mdsmap->get_up_mds_set(up); + + if (up.empty()) + return MDS_RANK_NONE; + std::set<mds_rank_t>::const_iterator p = up.begin(); + for (int n = rand() % up.size(); n; n--) + ++p; + return *p; +} + + +StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc, + boost::asio::io_context& ictx) + : Client(m, mc, new Objecter(m->cct, m, mc, ictx)) +{ + monclient->set_messenger(m); + objecter->set_client_incarnation(0); +} + +StandaloneClient::~StandaloneClient() +{ + delete objecter; + objecter = nullptr; +} + +int StandaloneClient::init() +{ + RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false); + ceph_assert(iref_writer.is_first_writer()); + + _pre_init(); + objecter->init(); + + client_lock.lock(); + + messenger->add_dispatcher_tail(objecter); + messenger->add_dispatcher_tail(this); + + monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD); + int r = monclient->init(); + if (r < 0) { + // need to do cleanup because we're in an intermediate init state + { + std::scoped_lock l(timer_lock); + timer.shutdown(); + } + + client_lock.unlock(); + objecter->shutdown(); + objectcacher->stop(); + monclient->shutdown(); + return r; + } + objecter->start(); + + client_lock.unlock(); + _finish_init(); + iref_writer.update_state(CLIENT_INITIALIZED); + + return 0; +} + +void StandaloneClient::shutdown() +{ + Client::shutdown(); + objecter->shutdown(); + monclient->shutdown(); +} diff --git a/src/client/Client.h b/src/client/Client.h new file mode 100644 index 000000000..911a8b460 --- /dev/null +++ b/src/client/Client.h @@ -0,0 +1,1648 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_CLIENT_H +#define CEPH_CLIENT_H + +#include "common/CommandTable.h" +#include "common/Finisher.h" +#include "common/Timer.h" +#include "common/ceph_mutex.h" +#include "common/cmdparse.h" +#include "common/compiler_extensions.h" +#include "include/common_fwd.h" +#include "include/cephfs/ceph_ll_client.h" +#include "include/filepath.h" +#include "include/interval_set.h" +#include "include/lru.h" +#include "include/types.h" +#include "include/unordered_map.h" +#include "include/unordered_set.h" +#include "include/cephfs/metrics/Types.h" +#include "mds/mdstypes.h" +#include "include/cephfs/types.h" +#include "msg/Dispatcher.h" +#include "msg/MessageRef.h" +#include "msg/Messenger.h" +#include "osdc/ObjectCacher.h" + +#include "RWRef.h" +#include "InodeRef.h" +#include "MetaSession.h" +#include "UserPerm.h" + +#include <fstream> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <thread> + +using std::set; +using std::map; +using std::fstream; + +class FSMap; +class FSMapUser; +class MonClient; + + +struct DirStat; +struct LeaseStat; +struct InodeStat; + +class Filer; +class Objecter; +class WritebackHandler; + +class MDSMap; +class Message; +class destructive_lock_ref_t; + +enum { + l_c_first = 20000, + l_c_reply, + l_c_lat, + l_c_wrlat, + l_c_read, + l_c_fsync, + l_c_md_avg, + l_c_md_sqsum, + l_c_md_ops, + l_c_rd_avg, + l_c_rd_sqsum, + l_c_rd_ops, + l_c_wr_avg, + l_c_wr_sqsum, + l_c_wr_ops, + l_c_last, +}; + + +class MDSCommandOp : public CommandOp +{ + public: + mds_gid_t mds_gid; + + explicit MDSCommandOp(ceph_tid_t t) : CommandOp(t) {} +}; + +/* error code for ceph_fuse */ +#define CEPH_FUSE_NO_MDS_UP -((1<<16)+0) /* no mds up deteced in ceph_fuse */ +#define CEPH_FUSE_LAST -((1<<16)+1) /* (unused) */ + +// ============================================ +// types for my local metadata cache +/* basic structure: + + - Dentries live in an LRU loop. they get expired based on last access. + see include/lru.h. items can be bumped to "mid" or "top" of list, etc. + - Inode has ref count for each Fh, Dir, or Dentry that points to it. + - when Inode ref goes to 0, it's expired. + - when Dir is empty, it's removed (and it's Inode ref--) + +*/ + +/* getdir result */ +struct DirEntry { + explicit DirEntry(const std::string &s) : d_name(s), stmask(0) {} + DirEntry(const std::string &n, struct stat& s, int stm) + : d_name(n), st(s), stmask(stm) {} + + std::string d_name; + struct stat st; + int stmask; +}; + +struct Cap; +class Dir; +class Dentry; +struct SnapRealm; +struct Fh; +struct CapSnap; + +struct MetaRequest; +class ceph_lock_state_t; + +// ======================================================== +// client interface + +struct dir_result_t { + static const int SHIFT = 28; + static const int64_t MASK = (1 << SHIFT) - 1; + static const int64_t HASH = 0xFFULL << (SHIFT + 24); // impossible frag bits + static const loff_t END = 1ULL << (SHIFT + 32); + + struct dentry { + int64_t offset; + std::string name; + std::string alternate_name; + InodeRef inode; + explicit dentry(int64_t o) : offset(o) {} + dentry(int64_t o, std::string n, std::string an, InodeRef in) : + offset(o), name(std::move(n)), alternate_name(std::move(an)), inode(std::move(in)) {} + }; + struct dentry_off_lt { + bool operator()(const dentry& d, int64_t off) const { + return dir_result_t::fpos_cmp(d.offset, off) < 0; + } + }; + + + explicit dir_result_t(Inode *in, const UserPerm& perms); + + + static uint64_t make_fpos(unsigned h, unsigned l, bool hash) { + uint64_t v = ((uint64_t)h<< SHIFT) | (uint64_t)l; + if (hash) + v |= HASH; + else + ceph_assert((v & HASH) != HASH); + return v; + } + static unsigned fpos_high(uint64_t p) { + unsigned v = (p & (END-1)) >> SHIFT; + if ((p & HASH) == HASH) + return ceph_frag_value(v); + return v; + } + static unsigned fpos_low(uint64_t p) { + return p & MASK; + } + static int fpos_cmp(uint64_t l, uint64_t r) { + int c = ceph_frag_compare(fpos_high(l), fpos_high(r)); + if (c) + return c; + if (fpos_low(l) == fpos_low(r)) + return 0; + return fpos_low(l) < fpos_low(r) ? -1 : 1; + } + + unsigned offset_high() { return fpos_high(offset); } + unsigned offset_low() { return fpos_low(offset); } + + void set_end() { offset |= END; } + bool at_end() { return (offset & END); } + + void set_hash_order() { offset |= HASH; } + bool hash_order() { return (offset & HASH) == HASH; } + + bool is_cached() { + if (buffer.empty()) + return false; + if (hash_order()) { + return buffer_frag.contains(offset_high()); + } else { + return buffer_frag == frag_t(offset_high()); + } + } + + void reset() { + last_name.clear(); + next_offset = 2; + offset = 0; + ordered_count = 0; + cache_index = 0; + buffer.clear(); + } + + InodeRef inode; + int64_t offset; // hash order: + // (0xff << 52) | ((24 bits hash) << 28) | + // (the nth entry has hash collision); + // frag+name order; + // ((frag value) << 28) | (the nth entry in frag); + + unsigned next_offset; // offset of next chunk (last_name's + 1) + std::string last_name; // last entry in previous chunk + + uint64_t release_count; + uint64_t ordered_count; + unsigned cache_index; + int start_shared_gen; // dir shared_gen at start of readdir + UserPerm perms; + + frag_t buffer_frag; + + std::vector<dentry> buffer; + struct dirent de; +}; + +class Client : public Dispatcher, public md_config_obs_t { +public: + friend class C_Block_Sync; // Calls block map and protected helpers + friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb + friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb + friend class C_Client_FlushComplete; // calls put_inode() + friend class C_Client_Remount; + friend class C_Client_RequestInterrupt; + friend class C_Deleg_Timeout; // Asserts on client_lock, called when a delegation is unreturned + friend class C_Client_CacheRelease; // Asserts on client_lock + friend class SyntheticClient; + friend void intrusive_ptr_release(Inode *in); + template <typename T> friend struct RWRefState; + template <typename T> friend class RWRef; + + using Dispatcher::cct; + using clock = ceph::coarse_mono_clock; + + typedef int (*add_dirent_cb_t)(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in); + + struct walk_dentry_result { + InodeRef in; + std::string alternate_name; + }; + + class CommandHook : public AdminSocketHook { + public: + explicit CommandHook(Client *client); + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& errss, + bufferlist& out) override; + private: + Client *m_client; + }; + + // snapshot info returned via get_snap_info(). nothing to do + // with SnapInfo on the MDS. + struct SnapInfo { + snapid_t id; + std::map<std::string, std::string> metadata; + }; + + Client(Messenger *m, MonClient *mc, Objecter *objecter_); + Client(const Client&) = delete; + Client(const Client&&) = delete; + virtual ~Client() override; + + static UserPerm pick_my_perms(CephContext *c) { + uid_t uid = c->_conf->client_mount_uid >= 0 ? c->_conf->client_mount_uid : -1; + gid_t gid = c->_conf->client_mount_gid >= 0 ? c->_conf->client_mount_gid : -1; + return UserPerm(uid, gid); + } + UserPerm pick_my_perms() { + uid_t uid = user_id >= 0 ? user_id : -1; + gid_t gid = group_id >= 0 ? group_id : -1; + return UserPerm(uid, gid); + } + + int mount(const std::string &mount_root, const UserPerm& perms, + bool require_mds=false, const std::string &fs_name=""); + void unmount(); + bool is_unmounting() const { + return mount_state.check_current_state(CLIENT_UNMOUNTING); + } + bool is_mounted() const { + return mount_state.check_current_state(CLIENT_MOUNTED); + } + bool is_mounting() const { + return mount_state.check_current_state(CLIENT_MOUNTING); + } + bool is_initialized() const { + return initialize_state.check_current_state(CLIENT_INITIALIZED); + } + void abort_conn(); + + void set_uuid(const std::string& uuid); + void set_session_timeout(unsigned timeout); + int start_reclaim(const std::string& uuid, unsigned flags, + const std::string& fs_name); + void finish_reclaim(); + + fs_cluster_id_t get_fs_cid() { + return fscid; + } + + int mds_command( + const std::string &mds_spec, + const std::vector<std::string>& cmd, + const bufferlist& inbl, + bufferlist *poutbl, std::string *prs, Context *onfinish); + + // these should (more or less) mirror the actual system calls. + int statfs(const char *path, struct statvfs *stbuf, const UserPerm& perms); + + // crap + int chdir(const char *s, std::string &new_cwd, const UserPerm& perms); + void _getcwd(std::string& cwd, const UserPerm& perms); + void getcwd(std::string& cwd, const UserPerm& perms); + + // namespace ops + int opendir(const char *name, dir_result_t **dirpp, const UserPerm& perms); + int fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm& perms); + int closedir(dir_result_t *dirp); + + /** + * Fill a directory listing from dirp, invoking cb for each entry + * with the given pointer, the dirent, the struct stat, the stmask, + * and the offset. + * + * Returns 0 if it reached the end of the directory. + * If @a cb returns a negative error code, stop and return that. + */ + int readdir_r_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, + unsigned want=0, unsigned flags=AT_STATX_DONT_SYNC, + bool getref=false); + + struct dirent * readdir(dir_result_t *d); + int readdir_r(dir_result_t *dirp, struct dirent *de); + int readdirplus_r(dir_result_t *dirp, struct dirent *de, struct ceph_statx *stx, unsigned want, unsigned flags, Inode **out); + + /* + * Get the next snapshot delta entry. + * + */ + int readdir_snapdiff(dir_result_t* dir1, snapid_t snap2, + struct dirent* out_de, snapid_t* out_snap); + + int getdir(const char *relpath, std::list<std::string>& names, + const UserPerm& perms); // get the whole dir at once. + + /** + * Returns the length of the buffer that got filled in, or -errno. + * If it returns -CEPHFS_ERANGE you just need to increase the size of the + * buffer and try again. + */ + int _getdents(dir_result_t *dirp, char *buf, int buflen, bool ful); // get a bunch of dentries at once + int getdents(dir_result_t *dirp, char *buf, int buflen) { + return _getdents(dirp, buf, buflen, true); + } + int getdnames(dir_result_t *dirp, char *buf, int buflen) { + return _getdents(dirp, buf, buflen, false); + } + + void rewinddir(dir_result_t *dirp); + loff_t telldir(dir_result_t *dirp); + void seekdir(dir_result_t *dirp, loff_t offset); + + int may_delete(const char *relpath, const UserPerm& perms); + int link(const char *existing, const char *newname, const UserPerm& perm, std::string alternate_name=""); + int unlink(const char *path, const UserPerm& perm); + int unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm); + int rename(const char *from, const char *to, const UserPerm& perm, std::string alternate_name=""); + + // dirs + int mkdir(const char *path, mode_t mode, const UserPerm& perm, std::string alternate_name=""); + int mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm, + std::string alternate_name=""); + int mkdirs(const char *path, mode_t mode, const UserPerm& perms); + int rmdir(const char *path, const UserPerm& perms); + + // symlinks + int readlink(const char *path, char *buf, loff_t size, const UserPerm& perms); + int readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms); + + int symlink(const char *existing, const char *newname, const UserPerm& perms, std::string alternate_name=""); + int symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms, + std::string alternate_name=""); + + // path traversal for high-level interface + int walk(std::string_view path, struct walk_dentry_result* result, const UserPerm& perms, bool followsym=true); + + // inode stuff + unsigned statx_to_mask(unsigned int flags, unsigned int want); + int stat(const char *path, struct stat *stbuf, const UserPerm& perms, + frag_info_t *dirstat=0, int mask=CEPH_STAT_CAP_INODE_ALL); + int statx(const char *path, struct ceph_statx *stx, + const UserPerm& perms, + unsigned int want, unsigned int flags); + int lstat(const char *path, struct stat *stbuf, const UserPerm& perms, + frag_info_t *dirstat=0, int mask=CEPH_STAT_CAP_INODE_ALL); + + int setattr(const char *relpath, struct stat *attr, int mask, + const UserPerm& perms); + int setattrx(const char *relpath, struct ceph_statx *stx, int mask, + const UserPerm& perms, int flags=0); + int fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms); + int fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms); + int chmod(const char *path, mode_t mode, const UserPerm& perms); + int fchmod(int fd, mode_t mode, const UserPerm& perms); + int chmodat(int dirfd, const char *relpath, mode_t mode, int flags, const UserPerm& perms); + int lchmod(const char *path, mode_t mode, const UserPerm& perms); + int chown(const char *path, uid_t new_uid, gid_t new_gid, + const UserPerm& perms); + int fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms); + int lchown(const char *path, uid_t new_uid, gid_t new_gid, + const UserPerm& perms); + int chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid, + int flags, const UserPerm& perms); + int utime(const char *path, struct utimbuf *buf, const UserPerm& perms); + int lutime(const char *path, struct utimbuf *buf, const UserPerm& perms); + int futime(int fd, struct utimbuf *buf, const UserPerm& perms); + int utimes(const char *relpath, struct timeval times[2], const UserPerm& perms); + int lutimes(const char *relpath, struct timeval times[2], const UserPerm& perms); + int futimes(int fd, struct timeval times[2], const UserPerm& perms); + int futimens(int fd, struct timespec times[2], const UserPerm& perms); + int utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags, + const UserPerm& perms); + int flock(int fd, int operation, uint64_t owner); + int truncate(const char *path, loff_t size, const UserPerm& perms); + + // file ops + int mknod(const char *path, mode_t mode, const UserPerm& perms, dev_t rdev=0); + + int create_and_open(int dirfd, const char *relpath, int flags, const UserPerm& perms, + mode_t mode, int stripe_unit, int stripe_count, int object_size, + const char *data_pool, std::string alternate_name); + int open(const char *path, int flags, const UserPerm& perms, mode_t mode=0, std::string alternate_name="") { + return open(path, flags, perms, mode, 0, 0, 0, NULL, alternate_name); + } + int open(const char *path, int flags, const UserPerm& perms, + mode_t mode, int stripe_unit, int stripe_count, int object_size, + const char *data_pool, std::string alternate_name=""); + int openat(int dirfd, const char *relpath, int flags, const UserPerm& perms, + mode_t mode, int stripe_unit, int stripe_count, + int object_size, const char *data_pool, std::string alternate_name); + int openat(int dirfd, const char *path, int flags, const UserPerm& perms, mode_t mode=0, + std::string alternate_name="") { + return openat(dirfd, path, flags, perms, mode, 0, 0, 0, NULL, alternate_name); + } + + int lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name, + const UserPerm& perms); + int lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode=NULL); + int lookup_name(Inode *in, Inode *parent, const UserPerm& perms); + int _close(int fd); + int close(int fd); + loff_t lseek(int fd, loff_t offset, int whence); + int read(int fd, char *buf, loff_t size, loff_t offset=-1); + int preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset=-1); + int write(int fd, const char *buf, loff_t size, loff_t offset=-1); + int pwritev(int fd, const struct iovec *iov, int iovcnt, loff_t offset=-1); + int fake_write_size(int fd, loff_t size); + int ftruncate(int fd, loff_t size, const UserPerm& perms); + int fsync(int fd, bool syncdataonly); + int fstat(int fd, struct stat *stbuf, const UserPerm& perms, + int mask=CEPH_STAT_CAP_INODE_ALL); + int fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms, + unsigned int want, unsigned int flags); + int statxat(int dirfd, const char *relpath, + struct ceph_statx *stx, const UserPerm& perms, + unsigned int want, unsigned int flags); + int fallocate(int fd, int mode, loff_t offset, loff_t length); + + // full path xattr ops + int getxattr(const char *path, const char *name, void *value, size_t size, + const UserPerm& perms); + int lgetxattr(const char *path, const char *name, void *value, size_t size, + const UserPerm& perms); + int fgetxattr(int fd, const char *name, void *value, size_t size, + const UserPerm& perms); + int listxattr(const char *path, char *list, size_t size, const UserPerm& perms); + int llistxattr(const char *path, char *list, size_t size, const UserPerm& perms); + int flistxattr(int fd, char *list, size_t size, const UserPerm& perms); + int removexattr(const char *path, const char *name, const UserPerm& perms); + int lremovexattr(const char *path, const char *name, const UserPerm& perms); + int fremovexattr(int fd, const char *name, const UserPerm& perms); + int setxattr(const char *path, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms); + int lsetxattr(const char *path, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms); + int fsetxattr(int fd, const char *name, const void *value, size_t size, + int flags, const UserPerm& perms); + + int sync_fs(); + int64_t drop_caches(); + + int get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info); + + // hpc lazyio + int lazyio(int fd, int enable); + int lazyio_propagate(int fd, loff_t offset, size_t count); + int lazyio_synchronize(int fd, loff_t offset, size_t count); + + // expose file layout + int describe_layout(const char *path, file_layout_t* layout, + const UserPerm& perms); + int fdescribe_layout(int fd, file_layout_t* layout); + int get_file_stripe_address(int fd, loff_t offset, std::vector<entity_addr_t>& address); + int get_file_extent_osds(int fd, loff_t off, loff_t *len, std::vector<int>& osds); + int get_osd_addr(int osd, entity_addr_t& addr); + + // expose mdsmap + int64_t get_default_pool_id(); + + // expose osdmap + int get_local_osd(); + int get_pool_replication(int64_t pool); + int64_t get_pool_id(const char *pool_name); + std::string get_pool_name(int64_t pool); + int get_osd_crush_location(int id, std::vector<std::pair<std::string, std::string> >& path); + + int enumerate_layout(int fd, std::vector<ObjectExtent>& result, + loff_t length, loff_t offset); + + int mksnap(const char *path, const char *name, const UserPerm& perm, + mode_t mode=0, const std::map<std::string, std::string> &metadata={}); + int rmsnap(const char *path, const char *name, const UserPerm& perm, bool check_perms=false); + + // Inode permission checking + int inode_permission(Inode *in, const UserPerm& perms, unsigned want); + + // expose caps + int get_caps_issued(int fd); + int get_caps_issued(const char *path, const UserPerm& perms); + + snapid_t ll_get_snapid(Inode *in); + vinodeno_t ll_get_vino(Inode *in) { + std::lock_guard lock(client_lock); + return _get_vino(in); + } + // get inode from faked ino + Inode *ll_get_inode(ino_t ino); + Inode *ll_get_inode(vinodeno_t vino); + int ll_lookup(Inode *parent, const char *name, struct stat *attr, + Inode **out, const UserPerm& perms); + int ll_lookup_inode(struct inodeno_t ino, const UserPerm& perms, Inode **inode); + int ll_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode); + int ll_lookupx(Inode *parent, const char *name, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms); + bool ll_forget(Inode *in, uint64_t count); + bool ll_put(Inode *in); + int ll_get_snap_ref(snapid_t snap); + + int ll_getattr(Inode *in, struct stat *st, const UserPerm& perms); + int ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want, + unsigned int flags, const UserPerm& perms); + int ll_setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms); + int ll_setattr(Inode *in, struct stat *st, int mask, + const UserPerm& perms); + int ll_getxattr(Inode *in, const char *name, void *value, size_t size, + const UserPerm& perms); + int ll_setxattr(Inode *in, const char *name, const void *value, size_t size, + int flags, const UserPerm& perms); + int ll_removexattr(Inode *in, const char *name, const UserPerm& perms); + int ll_listxattr(Inode *in, char *list, size_t size, const UserPerm& perms); + int ll_opendir(Inode *in, int flags, dir_result_t **dirpp, + const UserPerm& perms); + int ll_releasedir(dir_result_t* dirp); + int ll_fsyncdir(dir_result_t* dirp); + int ll_readlink(Inode *in, char *buf, size_t bufsize, const UserPerm& perms); + int ll_mknod(Inode *in, const char *name, mode_t mode, dev_t rdev, + struct stat *attr, Inode **out, const UserPerm& perms); + int ll_mknodx(Inode *parent, const char *name, mode_t mode, dev_t rdev, + Inode **out, struct ceph_statx *stx, unsigned want, + unsigned flags, const UserPerm& perms); + int ll_mkdir(Inode *in, const char *name, mode_t mode, struct stat *attr, + Inode **out, const UserPerm& perm); + int ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms); + int ll_symlink(Inode *in, const char *name, const char *value, + struct stat *attr, Inode **out, const UserPerm& perms); + int ll_symlinkx(Inode *parent, const char *name, const char *value, + Inode **out, struct ceph_statx *stx, unsigned want, + unsigned flags, const UserPerm& perms); + int ll_unlink(Inode *in, const char *name, const UserPerm& perm); + int ll_rmdir(Inode *in, const char *name, const UserPerm& perms); + int ll_rename(Inode *parent, const char *name, Inode *newparent, + const char *newname, const UserPerm& perm); + int ll_link(Inode *in, Inode *newparent, const char *newname, + const UserPerm& perm); + int ll_open(Inode *in, int flags, Fh **fh, const UserPerm& perms); + int _ll_create(Inode *parent, const char *name, mode_t mode, + int flags, InodeRef *in, int caps, Fh **fhp, + const UserPerm& perms); + int ll_create(Inode *parent, const char *name, mode_t mode, int flags, + struct stat *attr, Inode **out, Fh **fhp, + const UserPerm& perms); + int ll_createx(Inode *parent, const char *name, mode_t mode, + int oflags, Inode **outp, Fh **fhp, + struct ceph_statx *stx, unsigned want, unsigned lflags, + const UserPerm& perms); + int ll_read_block(Inode *in, uint64_t blockid, char *buf, uint64_t offset, + uint64_t length, file_layout_t* layout); + + int ll_write_block(Inode *in, uint64_t blockid, + char* buf, uint64_t offset, + uint64_t length, file_layout_t* layout, + uint64_t snapseq, uint32_t sync); + int ll_commit_blocks(Inode *in, uint64_t offset, uint64_t length); + + int ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms); + int ll_walk(const char* name, Inode **i, struct ceph_statx *stx, + unsigned int want, unsigned int flags, const UserPerm& perms); + uint32_t ll_stripe_unit(Inode *in); + int ll_file_layout(Inode *in, file_layout_t *layout); + uint64_t ll_snap_seq(Inode *in); + + int ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl); + int ll_write(Fh *fh, loff_t off, loff_t len, const char *data); + int64_t ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off); + int64_t ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off); + loff_t ll_lseek(Fh *fh, loff_t offset, int whence); + int ll_flush(Fh *fh); + int ll_fsync(Fh *fh, bool syncdataonly); + int ll_sync_inode(Inode *in, bool syncdataonly); + int ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length); + int ll_release(Fh *fh); + int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner); + int ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep); + int ll_flock(Fh *fh, int cmd, uint64_t owner); + int ll_lazyio(Fh *fh, int enable); + int ll_file_layout(Fh *fh, file_layout_t *layout); + void ll_interrupt(void *d); + bool ll_handle_umask() { + return acl_type != NO_ACL; + } + + int ll_get_stripe_osd(struct Inode *in, uint64_t blockno, + file_layout_t* layout); + uint64_t ll_get_internal_offset(struct Inode *in, uint64_t blockno); + + int ll_num_osds(void); + int ll_osdaddr(int osd, uint32_t *addr); + int ll_osdaddr(int osd, char* buf, size_t size); + + void _ll_register_callbacks(struct ceph_client_callback_args *args); + void ll_register_callbacks(struct ceph_client_callback_args *args); // deprecated + int ll_register_callbacks2(struct ceph_client_callback_args *args); + std::pair<int, bool> test_dentry_handling(bool can_invalidate); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + uint32_t get_deleg_timeout() { return deleg_timeout; } + int set_deleg_timeout(uint32_t timeout); + int ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv); + + entity_name_t get_myname() { return messenger->get_myname(); } + void wait_on_list(std::list<ceph::condition_variable*>& ls); + void signal_cond_list(std::list<ceph::condition_variable*>& ls); + + void set_filer_flags(int flags); + void clear_filer_flags(int flags); + + void tear_down_cache(); + + void update_metadata(std::string const &k, std::string const &v); + + client_t get_nodeid() { return whoami; } + + inodeno_t get_root_ino(); + Inode *get_root(); + + virtual int init(); + virtual void shutdown(); + + // messaging + void cancel_commands(const MDSMap& newmap); + void handle_mds_map(const MConstRef<MMDSMap>& m); + void handle_fs_map(const MConstRef<MFSMap>& m); + void handle_fs_map_user(const MConstRef<MFSMapUser>& m); + void handle_osd_map(const MConstRef<MOSDMap>& m); + + void handle_lease(const MConstRef<MClientLease>& m); + + // inline data + int uninline_data(Inode *in, Context *onfinish); + + // file caps + void check_cap_issue(Inode *in, unsigned issued); + void add_update_cap(Inode *in, MetaSession *session, uint64_t cap_id, + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, + inodeno_t realm, int flags, const UserPerm& perms); + void remove_cap(Cap *cap, bool queue_release); + void remove_all_caps(Inode *in); + void remove_session_caps(MetaSession *session, int err); + int mark_caps_flushing(Inode *in, ceph_tid_t *ptid); + void adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s); + void flush_caps_sync(); + void kick_flushing_caps(Inode *in, MetaSession *session); + void kick_flushing_caps(MetaSession *session); + void early_kick_flushing_caps(MetaSession *session); + int get_caps(Fh *fh, int need, int want, int *have, loff_t endoff); + int get_caps_used(Inode *in); + + void maybe_update_snaprealm(SnapRealm *realm, snapid_t snap_created, snapid_t snap_highwater, + std::vector<snapid_t>& snaps); + + void handle_quota(const MConstRef<MClientQuota>& m); + void handle_snap(const MConstRef<MClientSnap>& m); + void handle_caps(const MConstRef<MClientCaps>& m); + void handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m); + void handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m); + void handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m); + void handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m); + void handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m); + void handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m); + void cap_delay_requeue(Inode *in); + + void send_cap(Inode *in, MetaSession *session, Cap *cap, int flags, + int used, int want, int retain, int flush, + ceph_tid_t flush_tid); + + void send_flush_snap(Inode *in, MetaSession *session, snapid_t follows, CapSnap& capsnap); + + void flush_snaps(Inode *in); + void get_cap_ref(Inode *in, int cap); + void put_cap_ref(Inode *in, int cap); + void wait_sync_caps(Inode *in, ceph_tid_t want); + void wait_sync_caps(ceph_tid_t want); + void queue_cap_snap(Inode *in, SnapContext &old_snapc); + void finish_cap_snap(Inode *in, CapSnap &capsnap, int used); + + void _schedule_invalidate_dentry_callback(Dentry *dn, bool del); + void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, std::string& name); + void _try_to_trim_inode(Inode *in, bool sched_inval); + + void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len); + void _invalidate_inode_cache(Inode *in); + void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len); + void _async_invalidate(vinodeno_t ino, int64_t off, int64_t len); + + void _schedule_ino_release_callback(Inode *in); + void _async_inode_release(vinodeno_t ino); + + bool _release(Inode *in); + + /** + * Initiate a flush of the data associated with the given inode. + * If you specify a Context, you are responsible for holding an inode + * reference for the duration of the flush. If not, _flush() will + * take the reference for you. + * @param in The Inode whose data you wish to flush. + * @param c The Context you wish us to complete once the data is + * flushed. If already flushed, this will be called in-line. + * + * @returns true if the data was already flushed, false otherwise. + */ + bool _flush(Inode *in, Context *c); + void _flush_range(Inode *in, int64_t off, uint64_t size); + void _flushed(Inode *in); + void flush_set_callback(ObjectCacher::ObjectSet *oset); + + void close_release(Inode *in); + void close_safe(Inode *in); + + void lock_fh_pos(Fh *f); + void unlock_fh_pos(Fh *f); + + // metadata cache + void update_dir_dist(Inode *in, DirStat *st, mds_rank_t from); + + void clear_dir_complete_and_ordered(Inode *diri, bool complete); + void insert_readdir_results(MetaRequest *request, MetaSession *session, + Inode *diri, Inode *diri_other); + Inode* insert_trace(MetaRequest *request, MetaSession *session); + void update_inode_file_size(Inode *in, int issued, uint64_t size, + uint64_t truncate_seq, uint64_t truncate_size); + void update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq, + utime_t ctime, utime_t mtime, utime_t atime); + + Inode *add_update_inode(InodeStat *st, utime_t ttl, MetaSession *session, + const UserPerm& request_perms); + Dentry *insert_dentry_inode(Dir *dir, const std::string& dname, LeaseStat *dlease, + Inode *in, utime_t from, MetaSession *session, + Dentry *old_dentry = NULL); + void update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session); + + bool use_faked_inos() { return _use_faked_inos; } + vinodeno_t map_faked_ino(ino_t ino); + + //notify the mds to flush the mdlog + void flush_mdlog_sync(Inode *in); + void flush_mdlog_sync(); + void flush_mdlog(MetaSession *session); + + void renew_caps(); + void renew_caps(MetaSession *session); + void flush_cap_releases(); + void renew_and_flush_cap_releases(); + void tick(); + void start_tick_thread(); + + void update_read_io_size(size_t size) { + total_read_ops++; + total_read_size += size; + } + + void update_write_io_size(size_t size) { + total_write_ops++; + total_write_size += size; + } + + void inc_dentry_nr() { + ++dentry_nr; + } + void dec_dentry_nr() { + --dentry_nr; + } + void dlease_hit() { + ++dlease_hits; + } + void dlease_miss() { + ++dlease_misses; + } + std::tuple<uint64_t, uint64_t, uint64_t> get_dlease_hit_rates() { + return std::make_tuple(dlease_hits, dlease_misses, dentry_nr); + } + + void cap_hit() { + ++cap_hits; + } + void cap_miss() { + ++cap_misses; + } + std::pair<uint64_t, uint64_t> get_cap_hit_rates() { + return std::make_pair(cap_hits, cap_misses); + } + + void inc_opened_files() { + ++opened_files; + } + void dec_opened_files() { + --opened_files; + } + std::pair<uint64_t, uint64_t> get_opened_files_rates() { + return std::make_pair(opened_files, inode_map.size()); + } + + void inc_pinned_icaps() { + ++pinned_icaps; + } + void dec_pinned_icaps(uint64_t nr=1) { + pinned_icaps -= nr; + } + std::pair<uint64_t, uint64_t> get_pinned_icaps_rates() { + return std::make_pair(pinned_icaps, inode_map.size()); + } + + void inc_opened_inodes() { + ++opened_inodes; + } + void dec_opened_inodes() { + --opened_inodes; + } + std::pair<uint64_t, uint64_t> get_opened_inodes_rates() { + return std::make_pair(opened_inodes, inode_map.size()); + } + + /* timer_lock for 'timer' */ + ceph::mutex timer_lock = ceph::make_mutex("Client::timer_lock"); + SafeTimer timer; + + /* tick thread */ + std::thread upkeeper; + ceph::condition_variable upkeep_cond; + bool tick_thread_stopped = false; + + std::unique_ptr<PerfCounters> logger; + std::unique_ptr<MDSMap> mdsmap; + + bool fuse_default_permissions; + bool _collect_and_send_global_metrics; + +protected: + std::list<ceph::condition_variable*> waiting_for_reclaim; + /* Flags for check_caps() */ + static const unsigned CHECK_CAPS_NODELAY = 0x1; + static const unsigned CHECK_CAPS_SYNCHRONOUS = 0x2; + + void check_caps(Inode *in, unsigned flags); + + void set_cap_epoch_barrier(epoch_t e); + + void handle_command_reply(const MConstRef<MCommandReply>& m); + int fetch_fsmap(bool user); + int resolve_mds( + const std::string &mds_spec, + std::vector<mds_gid_t> *targets); + + void get_session_metadata(std::map<std::string, std::string> *meta) const; + bool have_open_session(mds_rank_t mds); + void got_mds_push(MetaSession *s); + MetaSessionRef _get_mds_session(mds_rank_t mds, Connection *con); ///< return session for mds *and* con; null otherwise + MetaSessionRef _get_or_open_mds_session(mds_rank_t mds); + MetaSessionRef _open_mds_session(mds_rank_t mds); + void _close_mds_session(MetaSession *s); + void _closed_mds_session(MetaSession *s, int err=0, bool rejected=false); + bool _any_stale_sessions() const; + void _kick_stale_sessions(); + void handle_client_session(const MConstRef<MClientSession>& m); + void send_reconnect(MetaSession *s); + void resend_unsafe_requests(MetaSession *s); + void wait_unsafe_requests(); + + void dump_mds_requests(Formatter *f); + void dump_mds_sessions(Formatter *f, bool cap_dump=false); + + int make_request(MetaRequest *req, const UserPerm& perms, + InodeRef *ptarget = 0, bool *pcreated = 0, + mds_rank_t use_mds=-1, bufferlist *pdirbl=0, + size_t feature_needed=ULONG_MAX); + void put_request(MetaRequest *request); + void unregister_request(MetaRequest *request); + + int verify_reply_trace(int r, MetaSession *session, MetaRequest *request, + const MConstRef<MClientReply>& reply, + InodeRef *ptarget, bool *pcreated, + const UserPerm& perms); + void encode_cap_releases(MetaRequest *request, mds_rank_t mds); + int encode_inode_release(Inode *in, MetaRequest *req, + mds_rank_t mds, int drop, + int unless,int force=0); + void encode_dentry_release(Dentry *dn, MetaRequest *req, + mds_rank_t mds, int drop, int unless); + mds_rank_t choose_target_mds(MetaRequest *req, Inode** phash_diri=NULL); + void connect_mds_targets(mds_rank_t mds); + void send_request(MetaRequest *request, MetaSession *session, + bool drop_cap_releases=false); + MRef<MClientRequest> build_client_request(MetaRequest *request, mds_rank_t mds); + void kick_requests(MetaSession *session); + void kick_requests_closed(MetaSession *session); + void handle_client_request_forward(const MConstRef<MClientRequestForward>& reply); + void handle_client_reply(const MConstRef<MClientReply>& reply); + bool is_dir_operation(MetaRequest *request); + + int path_walk(const filepath& fp, struct walk_dentry_result* result, const UserPerm& perms, bool followsym=true, int mask=0, + InodeRef dirinode=nullptr); + int path_walk(const filepath& fp, InodeRef *end, const UserPerm& perms, + bool followsym=true, int mask=0, InodeRef dirinode=nullptr); + + // fake inode number for 32-bits ino_t + void _assign_faked_ino(Inode *in); + void _assign_faked_root(Inode *in); + void _release_faked_ino(Inode *in); + void _reset_faked_inos(); + vinodeno_t _map_faked_ino(ino_t ino); + + // Optional extra metadata about me to send to the MDS + void populate_metadata(const std::string &mount_root); + + SnapRealm *get_snap_realm(inodeno_t r); + SnapRealm *get_snap_realm_maybe(inodeno_t r); + void put_snap_realm(SnapRealm *realm); + bool adjust_realm_parent(SnapRealm *realm, inodeno_t parent); + void update_snap_trace(MetaSession *session, const bufferlist& bl, SnapRealm **realm_ret, bool must_flush=true); + void invalidate_snaprealm_and_children(SnapRealm *realm); + + void refresh_snapdir_attrs(Inode *in, Inode *diri); + Inode *open_snapdir(Inode *diri); + + int get_fd() { + int fd = free_fd_set.range_start(); + free_fd_set.erase(fd, 1); + return fd; + } + void put_fd(int fd) { + free_fd_set.insert(fd, 1); + } + + /* + * Resolve file descriptor, or return NULL. + */ + Fh *get_filehandle(int fd) { + auto it = fd_map.find(fd); + if (it == fd_map.end()) + return NULL; + return it->second; + } + int get_fd_inode(int fd, InodeRef *in); + + // helpers + void wake_up_session_caps(MetaSession *s, bool reconnect); + + void wait_on_context_list(std::list<Context*>& ls); + void signal_context_list(std::list<Context*>& ls); + + // -- metadata cache stuff + + // decrease inode ref. delete if dangling. + void _put_inode(Inode *in, int n); + void delay_put_inodes(bool wakeup=false); + void put_inode(Inode *in, int n=1); + void close_dir(Dir *dir); + + int subscribe_mdsmap(const std::string &fs_name=""); + + void _abort_mds_sessions(int err); + + // same as unmount() but for when the client_lock is already held + void _unmount(bool abort); + + //int get_cache_size() { return lru.lru_get_size(); } + + /** + * Don't call this with in==NULL, use get_or_create for that + * leave dn set to default NULL unless you're trying to add + * a new inode to a pre-created Dentry + */ + Dentry* link(Dir *dir, const std::string& name, Inode *in, Dentry *dn); + void unlink(Dentry *dn, bool keepdir, bool keepdentry); + + int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0); + int fill_stat(InodeRef& in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0) { + return fill_stat(in.get(), st, dirstat, rstat); + } + + void fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx); + void fill_statx(InodeRef& in, unsigned int mask, struct ceph_statx *stx) { + return fill_statx(in.get(), mask, stx); + } + + void touch_dn(Dentry *dn); + + // trim cache. + void trim_cache(bool trim_kernel_dcache=false); + void trim_cache_for_reconnect(MetaSession *s); + void trim_dentry(Dentry *dn); + void trim_caps(MetaSession *s, uint64_t max); + void _invalidate_kernel_dcache(); + void _trim_negative_child_dentries(InodeRef& in); + + void dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected); + void dump_cache(Formatter *f); // debug + + // force read-only + void force_session_readonly(MetaSession *s); + + void dump_status(Formatter *f); // debug + + bool ms_dispatch2(const MessageRef& m) override; + + void ms_handle_connect(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override; + bool ms_handle_refused(Connection *con) override; + + int authenticate(); + + Inode* get_quota_root(Inode *in, const UserPerm& perms, quota_max_t type=QUOTA_ANY); + bool check_quota_condition(Inode *in, const UserPerm& perms, + std::function<bool (const Inode &)> test); + bool is_quota_files_exceeded(Inode *in, const UserPerm& perms); + bool is_quota_bytes_exceeded(Inode *in, int64_t new_bytes, + const UserPerm& perms); + bool is_quota_bytes_approaching(Inode *in, const UserPerm& perms); + + int check_pool_perm(Inode *in, int need); + + void handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply); + + /** + * Call this when an OSDMap is seen with a full flag (global or per pool) + * set. + * + * @param pool the pool ID affected, or -1 if all. + */ + void _handle_full_flag(int64_t pool); + + void _close_sessions(); + + void _pre_init(); + + /** + * The basic housekeeping parts of init (perf counters, admin socket) + * that is independent of how objecters/monclient/messengers are + * being set up. + */ + void _finish_init(); + + // global client lock + // - protects Client and buffer cache both! + ceph::mutex client_lock = ceph::make_mutex("Client::client_lock"); + + std::map<snapid_t, int> ll_snap_ref; + + InodeRef root = nullptr; + map<Inode*, InodeRef> root_parents; + Inode* root_ancestor = nullptr; + LRU lru; // lru list of Dentry's in our local metadata cache. + + InodeRef cwd; + + std::unique_ptr<Filer> filer; + std::unique_ptr<ObjectCacher> objectcacher; + std::unique_ptr<WritebackHandler> writeback_handler; + + Messenger *messenger; + MonClient *monclient; + Objecter *objecter; + + client_t whoami; + + /* The state migration mechanism */ + enum _state { + /* For the initialize_state */ + CLIENT_NEW, // The initial state for the initialize_state or after Client::shutdown() + CLIENT_INITIALIZING, // At the beginning of the Client::init() + CLIENT_INITIALIZED, // At the end of CLient::init() + + /* For the mount_state */ + CLIENT_UNMOUNTED, // The initial state for the mount_state or after unmounted + CLIENT_MOUNTING, // At the beginning of Client::mount() + CLIENT_MOUNTED, // At the end of Client::mount() + CLIENT_UNMOUNTING, // At the beginning of the Client::_unmout() + }; + + typedef enum _state state_t; + using RWRef_t = RWRef<state_t>; + + struct mount_state_t : public RWRefState<state_t> { + public: + bool is_valid_state(state_t state) const override { + switch (state) { + case Client::CLIENT_MOUNTING: + case Client::CLIENT_MOUNTED: + case Client::CLIENT_UNMOUNTING: + case Client::CLIENT_UNMOUNTED: + return true; + default: + return false; + } + } + + int check_reader_state(state_t require) const override { + if (require == Client::CLIENT_MOUNTING && + (state == Client::CLIENT_MOUNTING || state == Client::CLIENT_MOUNTED)) + return true; + else + return false; + } + + /* The state migration check */ + int check_writer_state(state_t require) const override { + if (require == Client::CLIENT_MOUNTING && + state == Client::CLIENT_UNMOUNTED) + return true; + else if (require == Client::CLIENT_MOUNTED && + state == Client::CLIENT_MOUNTING) + return true; + else if (require == Client::CLIENT_UNMOUNTING && + state == Client::CLIENT_MOUNTED) + return true; + else if (require == Client::CLIENT_UNMOUNTED && + state == Client::CLIENT_UNMOUNTING) + return true; + else + return false; + } + + mount_state_t(state_t state, const char *lockname, uint64_t reader_cnt=0) + : RWRefState (state, lockname, reader_cnt) {} + ~mount_state_t() {} + }; + + struct initialize_state_t : public RWRefState<state_t> { + public: + bool is_valid_state(state_t state) const override { + switch (state) { + case Client::CLIENT_NEW: + case Client::CLIENT_INITIALIZING: + case Client::CLIENT_INITIALIZED: + return true; + default: + return false; + } + } + + int check_reader_state(state_t require) const override { + if (require == Client::CLIENT_INITIALIZED && + state >= Client::CLIENT_INITIALIZED) + return true; + else + return false; + } + + /* The state migration check */ + int check_writer_state(state_t require) const override { + if (require == Client::CLIENT_INITIALIZING && + (state == Client::CLIENT_NEW)) + return true; + else if (require == Client::CLIENT_INITIALIZED && + (state == Client::CLIENT_INITIALIZING)) + return true; + else if (require == Client::CLIENT_NEW && + (state == Client::CLIENT_INITIALIZED)) + return true; + else + return false; + } + + initialize_state_t(state_t state, const char *lockname, uint64_t reader_cnt=0) + : RWRefState (state, lockname, reader_cnt) {} + ~initialize_state_t() {} + }; + + struct mount_state_t mount_state; + struct initialize_state_t initialize_state; + +private: + struct C_Readahead : public Context { + C_Readahead(Client *c, Fh *f); + ~C_Readahead() override; + void finish(int r) override; + + Client *client; + Fh *f; + }; + + /* + * These define virtual xattrs exposing the recursive directory + * statistics and layout metadata. + */ + struct VXattr { + const std::string name; + size_t (Client::*getxattr_cb)(Inode *in, char *val, size_t size); + int (Client::*setxattr_cb)(Inode *in, const void *val, size_t size, + const UserPerm& perms); + bool readonly; + bool (Client::*exists_cb)(Inode *in); + unsigned int flags; + }; + + enum { + NO_ACL = 0, + POSIX_ACL, + }; + + enum { + MAY_EXEC = 1, + MAY_WRITE = 2, + MAY_READ = 4, + }; + + typedef std::function<void(dir_result_t*, MetaRequest*, InodeRef&, frag_t)> fill_readdir_args_cb_t; + + std::unique_ptr<CephContext, std::function<void(CephContext*)>> cct_deleter; + + /* Flags for VXattr */ + static const unsigned VXATTR_RSTAT = 0x1; + static const unsigned VXATTR_DIRSTAT = 0x2; + + static const VXattr _dir_vxattrs[]; + static const VXattr _file_vxattrs[]; + static const VXattr _common_vxattrs[]; + + + bool is_reserved_vino(vinodeno_t &vino); + + void fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off); + + int _opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms); + void _readdir_drop_dirp_buffer(dir_result_t *dirp); + bool _readdir_have_frag(dir_result_t *dirp); + void _readdir_next_frag(dir_result_t *dirp); + void _readdir_rechoose_frag(dir_result_t *dirp); + int _readdir_get_frag(int op, dir_result_t *dirp, + fill_readdir_args_cb_t fill_req_cb); + int _readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, int caps, bool getref); + int _readdir_r_cb(int op, + dir_result_t* d, + add_dirent_cb_t cb, + fill_readdir_args_cb_t fill_cb, + void* p, + unsigned want, + unsigned flags, + bool getref, + bool bypass_cache); + + void _closedir(dir_result_t *dirp); + + // other helpers + void _fragmap_remove_non_leaves(Inode *in); + void _fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds); + + void _ll_get(Inode *in); + int _ll_put(Inode *in, uint64_t num); + void _ll_drop_pins(); + + Fh *_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms); + int _release_fh(Fh *fh); + void _put_fh(Fh *fh); + + std::pair<int, bool> _do_remount(bool retry_on_error); + + int _read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, bool *checkeof); + int _read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl); + + bool _dentry_valid(const Dentry *dn); + + // internal interface + // call these with client_lock held! + int _do_lookup(Inode *dir, const std::string& name, int mask, InodeRef *target, + const UserPerm& perms); + + int _lookup(Inode *dir, const std::string& dname, int mask, InodeRef *target, + const UserPerm& perm, std::string* alternate_name=nullptr, + bool is_rename=false); + + int _link(Inode *in, Inode *dir, const char *name, const UserPerm& perm, std::string alternate_name, + InodeRef *inp = 0); + int _unlink(Inode *dir, const char *name, const UserPerm& perm); + int _rename(Inode *olddir, const char *oname, Inode *ndir, const char *nname, const UserPerm& perm, std::string alternate_name); + int _mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm, + InodeRef *inp = 0, const std::map<std::string, std::string> &metadata={}, + std::string alternate_name=""); + int _rmdir(Inode *dir, const char *name, const UserPerm& perms); + int _symlink(Inode *dir, const char *name, const char *target, + const UserPerm& perms, std::string alternate_name, InodeRef *inp = 0); + int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, + const UserPerm& perms, InodeRef *inp = 0); + int _do_setattr(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp, + std::vector<uint8_t>* aux=nullptr); + void stat_to_statx(struct stat *st, struct ceph_statx *stx); + int __setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp = 0); + int _setattrx(InodeRef &in, struct ceph_statx *stx, int mask, + const UserPerm& perms); + int _setattr(InodeRef &in, struct stat *attr, int mask, + const UserPerm& perms); + int _ll_setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp = 0); + int _getattr(Inode *in, int mask, const UserPerm& perms, bool force=false); + int _getattr(InodeRef &in, int mask, const UserPerm& perms, bool force=false) { + return _getattr(in.get(), mask, perms, force); + } + int _readlink(Inode *in, char *buf, size_t size); + int _getxattr(Inode *in, const char *name, void *value, size_t len, + const UserPerm& perms); + int _getxattr(InodeRef &in, const char *name, void *value, size_t len, + const UserPerm& perms); + int _getvxattr(Inode *in, const UserPerm& perms, const char *attr_name, + ssize_t size, void *value, mds_rank_t rank); + int _listxattr(Inode *in, char *names, size_t len, const UserPerm& perms); + int _do_setxattr(Inode *in, const char *name, const void *value, size_t len, + int flags, const UserPerm& perms); + int _setxattr(Inode *in, const char *name, const void *value, size_t len, + int flags, const UserPerm& perms); + int _setxattr(InodeRef &in, const char *name, const void *value, size_t len, + int flags, const UserPerm& perms); + int _setxattr_check_data_pool(std::string& name, std::string& value, const OSDMap *osdmap); + void _setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t len); + int _removexattr(Inode *in, const char *nm, const UserPerm& perms); + int _removexattr(InodeRef &in, const char *nm, const UserPerm& perms); + int _open(Inode *in, int flags, mode_t mode, Fh **fhp, + const UserPerm& perms); + int _renew_caps(Inode *in); + int _create(Inode *in, const char *name, int flags, mode_t mode, InodeRef *inp, + Fh **fhp, int stripe_unit, int stripe_count, int object_size, + const char *data_pool, bool *created, const UserPerm &perms, + std::string alternate_name); + + loff_t _lseek(Fh *fh, loff_t offset, int whence); + int64_t _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl); + int64_t _write(Fh *fh, int64_t offset, uint64_t size, const char *buf, + const struct iovec *iov, int iovcnt); + int64_t _preadv_pwritev_locked(Fh *fh, const struct iovec *iov, + unsigned iovcnt, int64_t offset, + bool write, bool clamp_to_int); + int _preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, + int64_t offset, bool write); + int _flush(Fh *fh); + int _fsync(Fh *fh, bool syncdataonly); + int _fsync(Inode *in, bool syncdataonly); + int _sync_fs(); + int clear_suid_sgid(Inode *in, const UserPerm& perms, bool defer=false); + int _fallocate(Fh *fh, int mode, int64_t offset, int64_t length); + int _getlk(Fh *fh, struct flock *fl, uint64_t owner); + int _setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep); + int _flock(Fh *fh, int cmd, uint64_t owner); + int _lazyio(Fh *fh, int enable); + + Dentry *get_or_create(Inode *dir, const char* name); + + int xattr_permission(Inode *in, const char *name, unsigned want, + const UserPerm& perms); + int may_setattr(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms); + int may_open(Inode *in, int flags, const UserPerm& perms); + int may_lookup(Inode *dir, const UserPerm& perms); + int may_create(Inode *dir, const UserPerm& perms); + int may_delete(Inode *dir, const char *name, const UserPerm& perms); + int may_hardlink(Inode *in, const UserPerm& perms); + + int _getattr_for_perm(Inode *in, const UserPerm& perms); + + vinodeno_t _get_vino(Inode *in); + + bool _vxattrcb_fscrypt_auth_exists(Inode *in); + size_t _vxattrcb_fscrypt_auth(Inode *in, char *val, size_t size); + int _vxattrcb_fscrypt_auth_set(Inode *in, const void *val, size_t size, const UserPerm& perms); + bool _vxattrcb_fscrypt_file_exists(Inode *in); + size_t _vxattrcb_fscrypt_file(Inode *in, char *val, size_t size); + int _vxattrcb_fscrypt_file_set(Inode *in, const void *val, size_t size, const UserPerm& perms); + bool _vxattrcb_quota_exists(Inode *in); + size_t _vxattrcb_quota(Inode *in, char *val, size_t size); + size_t _vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size); + size_t _vxattrcb_quota_max_files(Inode *in, char *val, size_t size); + + bool _vxattrcb_layout_exists(Inode *in); + size_t _vxattrcb_layout(Inode *in, char *val, size_t size); + size_t _vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size); + size_t _vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size); + size_t _vxattrcb_layout_object_size(Inode *in, char *val, size_t size); + size_t _vxattrcb_layout_pool(Inode *in, char *val, size_t size); + size_t _vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_entries(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_files(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_subdirs(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_rentries(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_rfiles(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_rbytes(Inode *in, char *val, size_t size); + size_t _vxattrcb_dir_rctime(Inode *in, char *val, size_t size); + + bool _vxattrcb_dir_pin_exists(Inode *in); + size_t _vxattrcb_dir_pin(Inode *in, char *val, size_t size); + + bool _vxattrcb_snap_btime_exists(Inode *in); + size_t _vxattrcb_snap_btime(Inode *in, char *val, size_t size); + + size_t _vxattrcb_caps(Inode *in, char *val, size_t size); + + bool _vxattrcb_mirror_info_exists(Inode *in); + size_t _vxattrcb_mirror_info(Inode *in, char *val, size_t size); + + size_t _vxattrcb_cluster_fsid(Inode *in, char *val, size_t size); + size_t _vxattrcb_client_id(Inode *in, char *val, size_t size); + + static const VXattr *_get_vxattrs(Inode *in); + static const VXattr *_match_vxattr(Inode *in, const char *name); + + int _do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep, + struct flock *fl, uint64_t owner, bool removing=false); + int _interrupt_filelock(MetaRequest *req); + void _encode_filelocks(Inode *in, bufferlist& bl); + void _release_filelocks(Fh *fh); + void _update_lock_state(struct flock *fl, uint64_t owner, ceph_lock_state_t *lock_state); + + int _posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl, + const UserPerm& perms); + int _posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms); + int _posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want); + + mds_rank_t _get_random_up_mds() const; + + int _ll_getattr(Inode *in, int caps, const UserPerm& perms); + int _lookup_parent(Inode *in, const UserPerm& perms, Inode **parent=NULL); + int _lookup_name(Inode *in, Inode *parent, const UserPerm& perms); + int _lookup_vino(vinodeno_t ino, const UserPerm& perms, Inode **inode=NULL); + bool _ll_forget(Inode *in, uint64_t count); + + void collect_and_send_metrics(); + void collect_and_send_global_metrics(); + + void update_io_stat_metadata(utime_t latency); + void update_io_stat_read(utime_t latency); + void update_io_stat_write(utime_t latency); + + uint32_t deleg_timeout = 0; + + client_switch_interrupt_callback_t switch_interrupt_cb = nullptr; + client_remount_callback_t remount_cb = nullptr; + client_ino_callback_t ino_invalidate_cb = nullptr; + client_dentry_callback_t dentry_invalidate_cb = nullptr; + client_umask_callback_t umask_cb = nullptr; + client_ino_release_t ino_release_cb = nullptr; + void *callback_handle = nullptr; + bool can_invalidate_dentries = false; + + Finisher async_ino_invalidator; + Finisher async_dentry_invalidator; + Finisher interrupt_finisher; + Finisher remount_finisher; + Finisher async_ino_releasor; + Finisher objecter_finisher; + + ceph::coarse_mono_time last_cap_renew; + + CommandHook m_command_hook; + + int user_id, group_id; + int acl_type = NO_ACL; + + epoch_t cap_epoch_barrier = 0; + + // mds sessions + map<mds_rank_t, MetaSessionRef> mds_sessions; // mds -> push seq + std::set<mds_rank_t> mds_ranks_closing; // mds ranks currently tearing down sessions + std::list<ceph::condition_variable*> waiting_for_mdsmap; + + // FSMap, for when using mds_command + std::list<ceph::condition_variable*> waiting_for_fsmap; + std::unique_ptr<FSMap> fsmap; + std::unique_ptr<FSMapUser> fsmap_user; + + // This mutex only protects command_table + ceph::mutex command_lock = ceph::make_mutex("Client::command_lock"); + // MDS command state + CommandTable<MDSCommandOp> command_table; + + bool _use_faked_inos; + + // Cluster fsid + fs_cluster_id_t fscid; + + // file handles, etc. + interval_set<int> free_fd_set; // unused fds + ceph::unordered_map<int, Fh*> fd_map; + set<Fh*> ll_unclosed_fh_set; + ceph::unordered_set<dir_result_t*> opened_dirs; + uint64_t fd_gen = 1; + + bool mount_aborted = false; + bool blocklisted = false; + + ceph::unordered_map<vinodeno_t, Inode*> inode_map; + ceph::unordered_map<ino_t, vinodeno_t> faked_ino_map; + interval_set<ino_t> free_faked_inos; + ino_t last_used_faked_ino; + ino_t last_used_faked_root; + + int local_osd = -CEPHFS_ENXIO; + epoch_t local_osd_epoch = 0; + + // mds requests + ceph_tid_t last_tid = 0; + ceph_tid_t oldest_tid = 0; // oldest incomplete mds request, excluding setfilelock requests + map<ceph_tid_t, MetaRequest*> mds_requests; + + // cap flushing + ceph_tid_t last_flush_tid = 1; + + xlist<Inode*> delayed_list; + int num_flushing_caps = 0; + ceph::unordered_map<inodeno_t,SnapRealm*> snap_realms; + std::map<std::string, std::string> metadata; + + ceph::coarse_mono_time last_auto_reconnect; + std::chrono::seconds caps_release_delay, mount_timeout; + // trace generation + std::ofstream traceout; + + ceph::condition_variable mount_cond, sync_cond; + + std::map<std::pair<int64_t,std::string>, int> pool_perms; + std::list<ceph::condition_variable*> waiting_for_pool_perm; + + std::list<ceph::condition_variable*> waiting_for_rename; + + uint64_t retries_on_invalidate = 0; + + // state reclaim + int reclaim_errno = 0; + epoch_t reclaim_osd_epoch = 0; + entity_addrvec_t reclaim_target_addrs; + + // dentry lease metrics + uint64_t dentry_nr = 0; + uint64_t dlease_hits = 0; + uint64_t dlease_misses = 0; + + uint64_t cap_hits = 0; + uint64_t cap_misses = 0; + + uint64_t opened_files = 0; + uint64_t pinned_icaps = 0; + uint64_t opened_inodes = 0; + + uint64_t total_read_ops = 0; + uint64_t total_read_size = 0; + + uint64_t total_write_ops = 0; + uint64_t total_write_size = 0; + + ceph::spinlock delay_i_lock; + std::map<Inode*,int> delay_i_release; + + uint64_t nr_metadata_request = 0; + uint64_t nr_read_request = 0; + uint64_t nr_write_request = 0; +}; + +/** + * Specialization of Client that manages its own Objecter instance + * and handles init/shutdown of messenger/monclient + */ +class StandaloneClient : public Client +{ +public: + StandaloneClient(Messenger *m, MonClient *mc, boost::asio::io_context& ictx); + + ~StandaloneClient() override; + + int init() override; + void shutdown() override; +}; + +#endif diff --git a/src/client/ClientSnapRealm.cc b/src/client/ClientSnapRealm.cc new file mode 100644 index 000000000..2ddd86f8a --- /dev/null +++ b/src/client/ClientSnapRealm.cc @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ClientSnapRealm.h" +#include "common/Formatter.h" + +using std::set; +using std::vector; + +void SnapRealm::build_snap_context() +{ + set<snapid_t> snaps; + snapid_t max_seq = seq; + + // start with prior_parents? + for (unsigned i=0; i<prior_parent_snaps.size(); i++) + snaps.insert(prior_parent_snaps[i]); + + // current parent's snaps + if (pparent) { + const SnapContext& psnapc = pparent->get_snap_context(); + for (unsigned i=0; i<psnapc.snaps.size(); i++) + if (psnapc.snaps[i] >= parent_since) + snaps.insert(psnapc.snaps[i]); + if (psnapc.seq > max_seq) + max_seq = psnapc.seq; + } + + // my snaps + for (unsigned i=0; i<my_snaps.size(); i++) + snaps.insert(my_snaps[i]); + + // ok! + cached_snap_context.seq = max_seq; + cached_snap_context.snaps.resize(0); + cached_snap_context.snaps.reserve(snaps.size()); + for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p) + cached_snap_context.snaps.push_back(*p); +} + +void SnapRealm::dump(Formatter *f) const +{ + f->dump_stream("ino") << ino; + f->dump_int("nref", nref); + f->dump_stream("created") << created; + f->dump_stream("seq") << seq; + f->dump_stream("parent_ino") << parent; + f->dump_stream("parent_since") << parent_since; + + f->open_array_section("prior_parent_snaps"); + for (vector<snapid_t>::const_iterator p = prior_parent_snaps.begin(); p != prior_parent_snaps.end(); ++p) + f->dump_stream("snapid") << *p; + f->close_section(); + f->open_array_section("my_snaps"); + for (vector<snapid_t>::const_iterator p = my_snaps.begin(); p != my_snaps.end(); ++p) + f->dump_stream("snapid") << *p; + f->close_section(); + + f->open_array_section("children"); + for (set<SnapRealm*>::const_iterator p = pchildren.begin(); p != pchildren.end(); ++p) + f->dump_stream("child") << (*p)->ino; + f->close_section(); +} diff --git a/src/client/ClientSnapRealm.h b/src/client/ClientSnapRealm.h new file mode 100644 index 000000000..17f45dbc9 --- /dev/null +++ b/src/client/ClientSnapRealm.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_SNAPREALM_H +#define CEPH_CLIENT_SNAPREALM_H + +#include "include/types.h" +#include "common/snap_types.h" +#include "include/xlist.h" + +struct Inode; + +struct SnapRealm { + inodeno_t ino; + int nref; + snapid_t created; + snapid_t seq; + + inodeno_t parent; + snapid_t parent_since; + std::vector<snapid_t> prior_parent_snaps; // snaps prior to parent_since + std::vector<snapid_t> my_snaps; + + SnapRealm *pparent; + std::set<SnapRealm*> pchildren; + utime_t last_modified; + uint64_t change_attr; + +private: + SnapContext cached_snap_context; // my_snaps + parent snaps + past_parent_snaps + friend std::ostream& operator<<(std::ostream& out, const SnapRealm& r); + +public: + xlist<Inode*> inodes_with_caps; + + explicit SnapRealm(inodeno_t i) : + ino(i), nref(0), created(0), seq(0), + pparent(NULL), last_modified(utime_t()), change_attr(0) { } + + void build_snap_context(); + void invalidate_cache() { + cached_snap_context.clear(); + } + + const SnapContext& get_snap_context() { + if (cached_snap_context.seq == 0) + build_snap_context(); + return cached_snap_context; + } + + void dump(Formatter *f) const; +}; + +inline std::ostream& operator<<(std::ostream& out, const SnapRealm& r) { + return out << "snaprealm(" << r.ino << " nref=" << r.nref << " c=" << r.created << " seq=" << r.seq + << " parent=" << r.parent + << " my_snaps=" << r.my_snaps + << " cached_snapc=" << r.cached_snap_context + << " last_modified=" << r.last_modified + << " change_attr=" << r.change_attr + << ")"; +} + +#endif diff --git a/src/client/Delegation.cc b/src/client/Delegation.cc new file mode 100644 index 000000000..0c0ac1b35 --- /dev/null +++ b/src/client/Delegation.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "common/Clock.h" +#include "common/Timer.h" + +#include "Client.h" +#include "Inode.h" +#include "Fh.h" +#include "Delegation.h" + +class C_Deleg_Timeout : public Context { + Delegation *deleg; +public: + explicit C_Deleg_Timeout(Delegation *d) : deleg(d) {} + void finish(int r) override { + Inode *in = deleg->get_fh()->inode.get(); + Client *client = in->client; + + lsubdout(client->cct, client, 0) << __func__ << + ": delegation return timeout for inode 0x" << + std::hex << in->ino << ". Forcibly unmounting client. "<< + client << std::dec << dendl; + client->_unmount(false); + } +}; + +/** + * ceph_deleg_caps_for_type - what caps are necessary for a delegation? + * @type: delegation request type + * + * Determine what caps are necessary in order to grant a delegation of a given + * type. For read delegations, we need whatever we require in order to do + * cached reads, plus AsLs to cover metadata changes that should trigger a + * recall. We also grab Xs since changing xattrs usually alters the mtime and + * so would trigger a recall. + * + * For write delegations, we need whatever read delegations need plus the + * caps to allow writing to the file (Fbwx). + */ +int ceph_deleg_caps_for_type(unsigned type) +{ + int caps = CEPH_CAP_PIN; + + switch (type) { + case CEPH_DELEGATION_WR: + caps |= CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER; + /* Fallthrough */ + case CEPH_DELEGATION_RD: + caps |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | + CEPH_CAP_XATTR_SHARED | + CEPH_CAP_LINK_SHARED | CEPH_CAP_AUTH_SHARED; + break; + default: + // Should never happen + ceph_abort(); + } + return caps; +} + +/* + * A delegation is a container for holding caps on behalf of a client that + * wants to be able to rely on them until recalled. + */ +Delegation::Delegation(Fh *_fh, unsigned _type, ceph_deleg_cb_t _cb, void *_priv) + : fh(_fh), priv(_priv), type(_type), recall_cb(_cb), + recall_time(utime_t()), timeout_event(nullptr) +{ + Inode *inode = _fh->inode.get(); + inode->client->get_cap_ref(inode, ceph_deleg_caps_for_type(_type)); +}; + +Delegation::~Delegation() +{ + disarm_timeout(); + Inode *inode = fh->inode.get(); + inode->client->put_cap_ref(inode, ceph_deleg_caps_for_type(type)); +} + +void Delegation::reinit(unsigned _type, ceph_deleg_cb_t _recall_cb, void *_priv) +{ + /* update cap refs -- note that we do a get first to avoid any going to 0 */ + if (type != _type) { + Inode *inode = fh->inode.get(); + + inode->client->get_cap_ref(inode, ceph_deleg_caps_for_type(_type)); + inode->client->put_cap_ref(inode, ceph_deleg_caps_for_type(type)); + type = _type; + } + + recall_cb = _recall_cb; + priv = _priv; +} + +void Delegation::arm_timeout() +{ + Client *client = fh->inode.get()->client; + + std::scoped_lock l(client->timer_lock); + if (timeout_event) + return; + + timeout_event = new C_Deleg_Timeout(this); + client->timer.add_event_after(client->get_deleg_timeout(), timeout_event); +} + +void Delegation::disarm_timeout() +{ + Client *client = fh->inode.get()->client; + + std::scoped_lock l(client->timer_lock); + if (!timeout_event) + return; + + client->timer.cancel_event(timeout_event); + timeout_event = nullptr; +} + +void Delegation::recall(bool skip_read) +{ + /* If skip_read is true, don't break read delegations */ + if (skip_read && type == CEPH_DELEGATION_RD) + return; + + if (!is_recalled()) { + recall_cb(fh, priv); + recall_time = ceph_clock_now(); + arm_timeout(); + } +} diff --git a/src/client/Delegation.h b/src/client/Delegation.h new file mode 100644 index 000000000..d24a02487 --- /dev/null +++ b/src/client/Delegation.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef _CEPH_CLIENT_DELEGATION_H +#define _CEPH_CLIENT_DELEGATION_H + +#include "common/Clock.h" +#include "common/Timer.h" +#include "include/cephfs/ceph_ll_client.h" + +/* Commands for manipulating delegation state */ +#ifndef CEPH_DELEGATION_NONE +# define CEPH_DELEGATION_NONE 0 +# define CEPH_DELEGATION_RD 1 +# define CEPH_DELEGATION_WR 2 +#endif + +/* Converts CEPH_DELEGATION_* to cap mask */ +int ceph_deleg_caps_for_type(unsigned type); + +/* + * A delegation is a container for holding caps on behalf of a client that + * wants to be able to rely on them until recalled. + */ +class Delegation { +public: + Delegation(Fh *_fh, unsigned _type, ceph_deleg_cb_t _cb, void *_priv); + ~Delegation(); + Fh *get_fh() { return fh; } + unsigned get_type() { return type; } + bool is_recalled() { return !recall_time.is_zero(); } + + void reinit(unsigned _type, ceph_deleg_cb_t _recall_cb, void *_priv); + void recall(bool skip_read); +private: + // Filehandle against which it was acquired + Fh *fh; + + // opaque token that will be passed to the callback + void *priv; + + // CEPH_DELEGATION_* type + unsigned type; + + // callback into application to recall delegation + ceph_deleg_cb_t recall_cb; + + // time of first recall + utime_t recall_time; + + // timer for unreturned delegations + Context *timeout_event; + + void arm_timeout(); + void disarm_timeout(); +}; + +#endif /* _CEPH_CLIENT_DELEGATION_H */ diff --git a/src/client/Dentry.cc b/src/client/Dentry.cc new file mode 100644 index 000000000..f8741050a --- /dev/null +++ b/src/client/Dentry.cc @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "include/utime.h" + +#include "Dentry.h" +#include "Dir.h" +#include "Inode.h" + +#include "common/Formatter.h" + +void Dentry::dump(Formatter *f) const +{ + f->dump_string("name", name); + f->dump_stream("dir") << dir->parent_inode->ino; + if (inode) + f->dump_stream("ino") << inode->ino; + f->dump_int("ref", ref); + f->dump_int("offset", offset); + if (lease_mds >= 0) { + f->dump_int("lease_mds", lease_mds); + f->dump_stream("lease_ttl") << lease_ttl; + f->dump_unsigned("lease_gen", lease_gen); + f->dump_unsigned("lease_seq", lease_seq); + } + f->dump_int("cap_shared_gen", cap_shared_gen); +} + +std::ostream &operator<<(std::ostream &oss, const Dentry &dn) +{ + return oss << dn.dir->parent_inode->vino() << "[\"" << dn.name << "\"]"; +} diff --git a/src/client/Dentry.h b/src/client/Dentry.h new file mode 100644 index 000000000..8003dfed3 --- /dev/null +++ b/src/client/Dentry.h @@ -0,0 +1,100 @@ +#ifndef CEPH_CLIENT_DENTRY_H +#define CEPH_CLIENT_DENTRY_H + +#include "include/lru.h" +#include "include/xlist.h" + +#include "mds/mdstypes.h" +#include "Inode.h" +#include "InodeRef.h" +#include "Dir.h" + +class Dentry : public LRUObject { +public: + explicit Dentry(Dir *_dir, const std::string &_name) : + dir(_dir), name(_name), inode_xlist_link(this) + { + auto r = dir->dentries.insert(make_pair(name, this)); + ceph_assert(r.second); + dir->num_null_dentries++; + } + ~Dentry() { + ceph_assert(ref == 0); + ceph_assert(dir == nullptr); + } + + /* + * ref==1 -> cached, unused + * ref >1 -> pinned in lru + */ + void get() { + ceph_assert(ref > 0); + if (++ref == 2) + lru_pin(); + //cout << "dentry.get on " << this << " " << name << " now " << ref << std::endl; + } + void put() { + ceph_assert(ref > 0); + if (--ref == 1) + lru_unpin(); + //cout << "dentry.put on " << this << " " << name << " now " << ref << std::endl; + if (ref == 0) + delete this; + } + void link(InodeRef in) { + inode = in; + inode->dentries.push_back(&inode_xlist_link); + if (inode->is_dir()) { + if (inode->dir) + get(); // dir -> dn pin + if (inode->ll_ref) + get(); // ll_ref -> dn pin + } + dir->num_null_dentries--; + } + void unlink(void) { + if (inode->is_dir()) { + if (inode->dir) + put(); // dir -> dn pin + if (inode->ll_ref) + put(); // ll_ref -> dn pin + } + ceph_assert(inode_xlist_link.get_list() == &inode->dentries); + inode_xlist_link.remove_myself(); + inode.reset(); + dir->num_null_dentries++; + } + void mark_primary() { + if (inode && inode->dentries.front() != this) + inode->dentries.push_front(&inode_xlist_link); + } + void detach(void) { + ceph_assert(!inode); + auto p = dir->dentries.find(name); + ceph_assert(p != dir->dentries.end()); + dir->dentries.erase(p); + dir->num_null_dentries--; + dir = nullptr; + } + + void dump(Formatter *f) const; + friend std::ostream &operator<<(std::ostream &oss, const Dentry &Dentry); + + Dir *dir; + const std::string name; + InodeRef inode; + int ref = 1; // 1 if there's a dir beneath me. + int64_t offset = 0; + mds_rank_t lease_mds = -1; + utime_t lease_ttl; + uint64_t lease_gen = 0; + ceph_seq_t lease_seq = 0; + int cap_shared_gen = 0; + std::string alternate_name; + bool is_renaming = false; + +private: + xlist<Dentry *>::item inode_xlist_link; +}; + +#endif diff --git a/src/client/Dir.h b/src/client/Dir.h new file mode 100644 index 000000000..f98782e43 --- /dev/null +++ b/src/client/Dir.h @@ -0,0 +1,23 @@ +#ifndef CEPH_CLIENT_DIR_H +#define CEPH_CLIENT_DIR_H + +#include <string> +#include <vector> + +class Dentry; +struct Inode; + +class Dir { + public: + Inode *parent_inode; // my inode + ceph::unordered_map<std::string, Dentry*> dentries; + unsigned num_null_dentries = 0; + + std::vector<Dentry*> readdir_cache; + + explicit Dir(Inode* in) { parent_inode = in; } + + bool is_empty() { return dentries.empty(); } +}; + +#endif diff --git a/src/client/Fh.cc b/src/client/Fh.cc new file mode 100644 index 000000000..e7dd10b36 --- /dev/null +++ b/src/client/Fh.cc @@ -0,0 +1,32 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "Inode.h" + +#include "Fh.h" + +Fh::Fh(InodeRef in, int flags, int cmode, uint64_t _gen, const UserPerm &perms) : + inode(in), flags(flags), gen(_gen), actor_perms(perms), mode(cmode), + readahead() +{ + inode->add_fh(this); +} + +Fh::~Fh() +{ + inode->rm_fh(this); +} + diff --git a/src/client/Fh.h b/src/client/Fh.h new file mode 100644 index 000000000..434716fee --- /dev/null +++ b/src/client/Fh.h @@ -0,0 +1,65 @@ +#ifndef CEPH_CLIENT_FH_H +#define CEPH_CLIENT_FH_H + +#include "common/Readahead.h" +#include "include/types.h" +#include "InodeRef.h" +#include "UserPerm.h" +#include "mds/flock.h" + +class Inode; + +// file handle for any open file state + +struct Fh { + InodeRef inode; + int flags; + uint64_t gen; + UserPerm actor_perms; // perms I opened the file with + + // the members above once ininitalized in the constructor + // they won't change, and putting them under the client_lock + // makes no sense. + + int _ref = 1; + loff_t pos = 0; + int mode; // the mode i opened the file with + + bool pos_locked = false; // pos is currently in use + std::list<ceph::condition_variable*> pos_waiters; // waiters for pos + + Readahead readahead; + + // file lock + std::unique_ptr<ceph_lock_state_t> fcntl_locks; + std::unique_ptr<ceph_lock_state_t> flock_locks; + + bool has_any_filelocks() { + return + (fcntl_locks && !fcntl_locks->empty()) || + (flock_locks && !flock_locks->empty()); + } + + // IO error encountered by any writeback on this Inode while + // this Fh existed (i.e. an fsync on another Fh will still show + // up as an async_err here because it could have been the same + // bytes we wrote via this Fh). + int async_err = {0}; + + int take_async_err() + { + int e = async_err; + async_err = 0; + return e; + } + + Fh() = delete; + Fh(InodeRef in, int flags, int cmode, uint64_t gen, const UserPerm &perms); + ~Fh(); + + void get() { ++_ref; } + int put() { return --_ref; } +}; + + +#endif diff --git a/src/client/Inode.cc b/src/client/Inode.cc new file mode 100644 index 000000000..0c19bef5e --- /dev/null +++ b/src/client/Inode.cc @@ -0,0 +1,814 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Client.h" +#include "Inode.h" +#include "Dentry.h" +#include "Dir.h" +#include "Fh.h" +#include "MetaSession.h" +#include "ClientSnapRealm.h" +#include "Delegation.h" + +#include "mds/flock.h" + +using std::dec; +using std::list; +using std::oct; +using std::ostream; +using std::string; + +Inode::~Inode() +{ + delay_cap_item.remove_myself(); + dirty_cap_item.remove_myself(); + snaprealm_item.remove_myself(); + + if (snapdir_parent) { + snapdir_parent->flags &= ~I_SNAPDIR_OPEN; + snapdir_parent.reset(); + } + + if (!oset.objects.empty()) { + lsubdout(client->cct, client, 0) << __func__ << ": leftover objects on inode 0x" + << std::hex << ino << std::dec << dendl; + ceph_assert(oset.objects.empty()); + } + + if (!delegations.empty()) { + lsubdout(client->cct, client, 0) << __func__ << ": leftover delegations on inode 0x" + << std::hex << ino << std::dec << dendl; + ceph_assert(delegations.empty()); + } +} + +ostream& operator<<(ostream &out, const Inode &in) +{ + out << in.vino() << "(" + << "faked_ino=" << in.faked_ino + << " nref=" << in.get_nref() + << " ll_ref=" << in.ll_ref + << " cap_refs=" << in.cap_refs + << " open=" << in.open_by_mode + << " mode=" << oct << in.mode << dec + << " size=" << in.size << "/" << in.max_size + << " nlink=" << in.nlink + << " btime=" << in.btime + << " mtime=" << in.mtime + << " ctime=" << in.ctime + << " change_attr=" << in.change_attr + << " caps=" << ccap_string(in.caps_issued()); + if (!in.caps.empty()) { + out << "("; + bool first = true; + for (const auto &pair : in.caps) { + if (!first) + out << ','; + out << pair.first << '=' << ccap_string(pair.second.issued); + first = false; + } + out << ")"; + } + if (in.dirty_caps) + out << " dirty_caps=" << ccap_string(in.dirty_caps); + if (in.flushing_caps) + out << " flushing_caps=" << ccap_string(in.flushing_caps); + + if (in.flags & I_COMPLETE) + out << " COMPLETE"; + + if (in.is_file()) + out << " " << in.oset; + + if (!in.dentries.empty()) + out << " parents=" << in.dentries; + + if (in.is_dir() && in.has_dir_layout()) + out << " has_dir_layout"; + + if (in.quota.is_enabled()) + out << " " << in.quota; + + out << ' ' << &in << ")"; + return out; +} + + +void Inode::make_long_path(filepath& p) +{ + if (!dentries.empty()) { + Dentry *dn = get_first_parent(); + ceph_assert(dn->dir && dn->dir->parent_inode); + dn->dir->parent_inode->make_long_path(p); + p.push_dentry(dn->name); + } else if (snapdir_parent) { + make_nosnap_relative_path(p); + } else + p = filepath(ino); +} + +void Inode::make_short_path(filepath& p) +{ + if (!dentries.empty()) { + Dentry *dn = get_first_parent(); + ceph_assert(dn->dir && dn->dir->parent_inode); + p = filepath(dn->name, dn->dir->parent_inode->ino); + } else if (snapdir_parent) { + make_nosnap_relative_path(p); + } else + p = filepath(ino); +} + +/* + * make a filepath suitable for an mds request: + * - if we are non-snapped/live, the ino is sufficient, e.g. #1234 + * - if we are snapped, make filepath relative to first non-snapped parent. + */ +void Inode::make_nosnap_relative_path(filepath& p) +{ + if (snapid == CEPH_NOSNAP) { + p = filepath(ino); + } else if (snapdir_parent) { + snapdir_parent->make_nosnap_relative_path(p); + string empty; + p.push_dentry(empty); + } else if (!dentries.empty()) { + Dentry *dn = get_first_parent(); + ceph_assert(dn->dir && dn->dir->parent_inode); + dn->dir->parent_inode->make_nosnap_relative_path(p); + p.push_dentry(dn->name); + } else { + p = filepath(ino); + } +} + +void Inode::get_open_ref(int mode) +{ + client->inc_opened_files(); + if (open_by_mode[mode] == 0) { + client->inc_opened_inodes(); + } + open_by_mode[mode]++; + break_deleg(!(mode & CEPH_FILE_MODE_WR)); +} + +bool Inode::put_open_ref(int mode) +{ + //cout << "open_by_mode[" << mode << "] " << open_by_mode[mode] << " -> " << (open_by_mode[mode]-1) << std::endl; + auto& ref = open_by_mode.at(mode); + ceph_assert(ref > 0); + client->dec_opened_files(); + if (--ref == 0) { + client->dec_opened_inodes(); + return true; + } + return false; +} + +void Inode::get_cap_ref(int cap) +{ + int n = 0; + while (cap) { + if (cap & 1) { + int c = 1 << n; + cap_refs[c]++; + //cout << "inode " << *this << " get " << cap_string(c) << " " << (cap_refs[c]-1) << " -> " << cap_refs[c] << std::endl; + } + cap >>= 1; + n++; + } +} + +int Inode::put_cap_ref(int cap) +{ + int last = 0; + int n = 0; + while (cap) { + if (cap & 1) { + int c = 1 << n; + if (cap_refs[c] <= 0) { + lderr(client->cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl; + ceph_assert(cap_refs[c] > 0); + } + if (--cap_refs[c] == 0) + last |= c; + //cout << "inode " << *this << " put " << cap_string(c) << " " << (cap_refs[c]+1) << " -> " << cap_refs[c] << std::endl; + } + cap >>= 1; + n++; + } + return last; +} + +bool Inode::is_any_caps() +{ + return !caps.empty() || snap_caps; +} + +bool Inode::cap_is_valid(const Cap &cap) const +{ + /*cout << "cap_gen " << cap->session-> cap_gen << std::endl + << "session gen " << cap->gen << std::endl + << "cap expire " << cap->session->cap_ttl << std::endl + << "cur time " << ceph_clock_now(cct) << std::endl;*/ + if ((cap.session->cap_gen <= cap.gen) + && (ceph_clock_now() < cap.session->cap_ttl)) { + return true; + } + return false; +} + +int Inode::caps_issued(int *implemented) const +{ + int c = snap_caps; + int i = 0; + for (const auto &[mds, cap] : caps) { + if (cap_is_valid(cap)) { + c |= cap.issued; + i |= cap.implemented; + } + } + // exclude caps issued by non-auth MDS, but are been revoking by + // the auth MDS. The non-auth MDS should be revoking/exporting + // these caps, but the message is delayed. + if (auth_cap) + c &= ~auth_cap->implemented | auth_cap->issued; + + if (implemented) + *implemented = i; + return c; +} + +void Inode::try_touch_cap(mds_rank_t mds) +{ + auto it = caps.find(mds); + if (it != caps.end()) { + it->second.touch(); + } +} + +/** + * caps_issued_mask - check whether we have all of the caps in the mask + * @mask: mask to check against + * @allow_impl: whether the caller can also use caps that are implemented but not issued + * + * This is the bog standard "check whether we have the required caps" operation. + * Typically, we only check against the capset that is currently "issued". + * In other words, we ignore caps that have been revoked but not yet released. + * Also account capability hit/miss stats. + * + * Some callers (particularly those doing attribute retrieval) can also make + * use of the full set of "implemented" caps to satisfy requests from the + * cache. + * + * Those callers should refrain from taking new references to implemented + * caps! + */ +bool Inode::caps_issued_mask(unsigned mask, bool allow_impl) +{ + int c = snap_caps; + int i = 0; + + if ((c & mask) == mask) + return true; + // prefer auth cap + if (auth_cap && + cap_is_valid(*auth_cap) && + (auth_cap->issued & mask) == mask) { + auth_cap->touch(); + client->cap_hit(); + return true; + } + // try any cap + for (auto &pair : caps) { + Cap &cap = pair.second; + if (cap_is_valid(cap)) { + if ((cap.issued & mask) == mask) { + cap.touch(); + client->cap_hit(); + return true; + } + c |= cap.issued; + i |= cap.implemented; + } + } + + if (allow_impl) + c |= i; + + if ((c & mask) == mask) { + // bah.. touch them all + for (auto &pair : caps) { + pair.second.touch(); + } + client->cap_hit(); + return true; + } + + client->cap_miss(); + return false; +} + +int Inode::caps_used() +{ + int w = 0; + for (const auto &[cap, cnt] : cap_refs) + if (cnt) + w |= cap; + return w; +} + +int Inode::caps_file_wanted() +{ + int want = 0; + for (const auto &[mode, cnt] : open_by_mode) + if (cnt) + want |= ceph_caps_for_mode(mode); + return want; +} + +int Inode::caps_wanted() +{ + int want = caps_file_wanted() | caps_used(); + if (want & CEPH_CAP_FILE_BUFFER) + want |= CEPH_CAP_FILE_EXCL; + return want; +} + +int Inode::caps_mds_wanted() +{ + int want = 0; + for (const auto &pair : caps) { + want |= pair.second.wanted; + } + return want; +} + +int Inode::caps_dirty() +{ + return dirty_caps | flushing_caps; +} + +const UserPerm* Inode::get_best_perms() +{ + const UserPerm *perms = NULL; + for (const auto &pair : caps) { + const UserPerm& iperm = pair.second.latest_perms; + if (!perms) { // we don't have any, take what's present + perms = &iperm; + } else if (iperm.uid() == uid) { + if (iperm.gid() == gid) { // we have the best possible, return + return &iperm; + } + if (perms->uid() != uid) { // take uid > gid every time + perms = &iperm; + } + } else if (perms->uid() != uid && iperm.gid() == gid) { + perms = &iperm; // a matching gid is better than nothing + } + } + return perms; +} + +bool Inode::have_valid_size() +{ + // RD+RDCACHE or WR+WRBUFFER => valid size + if (caps_issued() & (CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL)) + return true; + return false; +} + +// open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). +Dir *Inode::open_dir() +{ + if (!dir) { + dir = new Dir(this); + lsubdout(client->cct, client, 15) << "open_dir " << dir << " on " << this << dendl; + ceph_assert(dentries.size() < 2); // dirs can't be hard-linked + if (!dentries.empty()) + get_first_parent()->get(); // pin dentry + iget(); // pin inode + } + return dir; +} + +bool Inode::check_mode(const UserPerm& perms, unsigned want) +{ + if (uid == perms.uid()) { + // if uid is owner, owner entry determines access + want = want << 6; + } else if (perms.gid_in_groups(gid)) { + // if a gid or sgid matches the owning group, group entry determines access + want = want << 3; + } + + return (mode & want) == want; +} + +void Inode::dump(Formatter *f) const +{ + f->dump_stream("ino") << ino; + f->dump_stream("snapid") << snapid; + if (rdev) + f->dump_unsigned("rdev", rdev); + f->dump_stream("ctime") << ctime; + f->dump_stream("btime") << btime; + f->dump_stream("mode") << '0' << std::oct << mode << std::dec; + f->dump_unsigned("uid", uid); + f->dump_unsigned("gid", gid); + f->dump_int("nlink", nlink); + + f->dump_unsigned("size", size); + f->dump_unsigned("max_size", max_size); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_stream("mtime") << mtime; + f->dump_stream("atime") << atime; + f->dump_unsigned("time_warp_seq", time_warp_seq); + f->dump_unsigned("change_attr", change_attr); + + f->dump_object("layout", layout); + if (is_dir()) { + f->open_object_section("dir_layout"); + ::dump(dir_layout, f); + f->close_section(); + + f->dump_bool("complete", flags & I_COMPLETE); + f->dump_bool("ordered", flags & I_DIR_ORDERED); + + /* FIXME when wip-mds-encoding is merged *** + f->open_object_section("dir_stat"); + dirstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + */ + } + + f->dump_unsigned("version", version); + f->dump_unsigned("xattr_version", xattr_version); + f->dump_unsigned("flags", flags); + + if (is_dir()) { + f->dump_int("dir_hashed", (int)dir_hashed); + f->dump_int("dir_replicated", (int)dir_replicated); + if (dir_replicated) { + f->open_array_section("dirfrags"); + for (const auto &frag : frag_repmap) { + f->open_object_section("frags"); + CachedStackStringStream css; + *css << std::hex << frag.first.value() << "/" << std::dec << frag.first.bits(); + f->dump_string("frag", css->strv()); + + f->open_array_section("repmap"); + for (const auto &mds : frag.second) { + f->dump_int("mds", mds); + } + f->close_section(); + + f->close_section(); + } + f->close_section(); + } + } + + f->open_array_section("caps"); + for (const auto &pair : caps) { + f->open_object_section("cap"); + if (&pair.second == auth_cap) + f->dump_int("auth", 1); + pair.second.dump(f); + f->close_section(); + } + f->close_section(); + if (auth_cap) + f->dump_int("auth_cap", auth_cap->session->mds_num); + + f->dump_stream("dirty_caps") << ccap_string(dirty_caps); + if (flushing_caps) { + f->dump_stream("flushings_caps") << ccap_string(flushing_caps); + f->open_object_section("flushing_cap_tid"); + for (map<ceph_tid_t, int>::const_iterator p = flushing_cap_tids.begin(); + p != flushing_cap_tids.end(); + ++p) { + string n(ccap_string(p->second)); + f->dump_unsigned(n.c_str(), p->first); + } + f->close_section(); + } + f->dump_int("shared_gen", shared_gen); + f->dump_int("cache_gen", cache_gen); + if (snap_caps) { + f->dump_int("snap_caps", snap_caps); + f->dump_int("snap_cap_refs", snap_cap_refs); + } + + f->dump_stream("hold_caps_until") << hold_caps_until; + + if (snaprealm) { + f->open_object_section("snaprealm"); + snaprealm->dump(f); + f->close_section(); + } + if (!cap_snaps.empty()) { + for (const auto &p : cap_snaps) { + f->open_object_section("cap_snap"); + f->dump_stream("follows") << p.first; + p.second.dump(f); + f->close_section(); + } + } + + // open + if (!open_by_mode.empty()) { + f->open_array_section("open_by_mode"); + for (map<int,int>::const_iterator p = open_by_mode.begin(); p != open_by_mode.end(); ++p) { + f->open_object_section("ref"); + f->dump_int("mode", p->first); + f->dump_int("refs", p->second); + f->close_section(); + } + f->close_section(); + } + if (!cap_refs.empty()) { + f->open_array_section("cap_refs"); + for (map<int,int>::const_iterator p = cap_refs.begin(); p != cap_refs.end(); ++p) { + f->open_object_section("cap_ref"); + f->dump_stream("cap") << ccap_string(p->first); + f->dump_int("refs", p->second); + f->close_section(); + } + f->close_section(); + } + + f->dump_unsigned("reported_size", reported_size); + if (wanted_max_size != max_size) + f->dump_unsigned("wanted_max_size", wanted_max_size); + if (requested_max_size != max_size) + f->dump_unsigned("requested_max_size", requested_max_size); + + f->dump_int("nref", get_nref()); + f->dump_int("ll_ref", ll_ref); + + if (!dentries.empty()) { + f->open_array_section("parents"); + for (const auto &&dn : dentries) { + f->open_object_section("dentry"); + f->dump_stream("dir_ino") << dn->dir->parent_inode->ino; + f->dump_string("name", dn->name); + f->close_section(); + } + f->close_section(); + } +} + +void Cap::dump(Formatter *f) const +{ + f->dump_int("mds", session->mds_num); + f->dump_stream("ino") << inode.ino; + f->dump_unsigned("cap_id", cap_id); + f->dump_stream("issued") << ccap_string(issued); + if (implemented != issued) + f->dump_stream("implemented") << ccap_string(implemented); + f->dump_stream("wanted") << ccap_string(wanted); + f->dump_unsigned("seq", seq); + f->dump_unsigned("issue_seq", issue_seq); + f->dump_unsigned("mseq", mseq); + f->dump_unsigned("gen", gen); +} + +void CapSnap::dump(Formatter *f) const +{ + f->dump_stream("ino") << in->ino; + f->dump_stream("issued") << ccap_string(issued); + f->dump_stream("dirty") << ccap_string(dirty); + f->dump_unsigned("size", size); + f->dump_stream("ctime") << ctime; + f->dump_stream("mtime") << mtime; + f->dump_stream("atime") << atime; + f->dump_int("time_warp_seq", time_warp_seq); + f->dump_stream("mode") << '0' << std::oct << mode << std::dec; + f->dump_unsigned("uid", uid); + f->dump_unsigned("gid", gid); + if (!xattrs.empty()) { + f->open_object_section("xattr_lens"); + for (map<string,bufferptr>::const_iterator p = xattrs.begin(); p != xattrs.end(); ++p) + f->dump_int(p->first.c_str(), p->second.length()); + f->close_section(); + } + f->dump_unsigned("xattr_version", xattr_version); + f->dump_int("writing", (int)writing); + f->dump_int("dirty_data", (int)dirty_data); + f->dump_unsigned("flush_tid", flush_tid); +} + +void Inode::set_async_err(int r) +{ + for (const auto &fh : fhs) { + fh->async_err = r; + } +} + +bool Inode::has_recalled_deleg() +{ + if (delegations.empty()) + return false; + + // Either all delegations are recalled or none are. Just check the first. + Delegation& deleg = delegations.front(); + return deleg.is_recalled(); +} + +void Inode::recall_deleg(bool skip_read) +{ + if (delegations.empty()) + return; + + // Issue any recalls + for (list<Delegation>::iterator d = delegations.begin(); + d != delegations.end(); ++d) { + + Delegation& deleg = *d; + deleg.recall(skip_read); + } +} + +bool Inode::delegations_broken(bool skip_read) +{ + if (delegations.empty()) { + lsubdout(client->cct, client, 10) << + __func__ << ": delegations empty on " << *this << dendl; + return true; + } + + if (skip_read) { + Delegation& deleg = delegations.front(); + lsubdout(client->cct, client, 10) << + __func__ << ": read delegs only on " << *this << dendl; + if (deleg.get_type() == CEPH_FILE_MODE_RD) { + return true; + } + } + lsubdout(client->cct, client, 10) << + __func__ << ": not broken" << *this << dendl; + return false; +} + +void Inode::break_deleg(bool skip_read) +{ + lsubdout(client->cct, client, 10) << + __func__ << ": breaking delegs on " << *this << dendl; + + recall_deleg(skip_read); + + while (!delegations_broken(skip_read)) + client->wait_on_list(waitfor_deleg); +} + +/** + * set_deleg: request a delegation on an open Fh + * @fh: filehandle on which to acquire it + * @type: delegation request type + * @cb: delegation recall callback function + * @priv: private pointer to be passed to callback + * + * Attempt to acquire a delegation on an open file handle. If there are no + * conflicts and we have the right caps, allocate a new delegation, fill it + * out and return 0. Return an error if we can't get one for any reason. + */ +int Inode::set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv) +{ + lsubdout(client->cct, client, 10) << + __func__ << ": inode " << *this << dendl; + + /* + * 0 deleg timeout means that they haven't been explicitly enabled. Don't + * allow it, with an unusual error to make it clear. + */ + if (!client->get_deleg_timeout()) + return -CEPHFS_ETIME; + + // Just say no if we have any recalled delegs still outstanding + if (has_recalled_deleg()) { + lsubdout(client->cct, client, 10) << __func__ << + ": has_recalled_deleg" << dendl; + return -CEPHFS_EAGAIN; + } + + // check vs. currently open files on this inode + switch (type) { + case CEPH_DELEGATION_RD: + if (open_count_for_write()) { + lsubdout(client->cct, client, 10) << __func__ << + ": open for write" << dendl; + return -CEPHFS_EAGAIN; + } + break; + case CEPH_DELEGATION_WR: + if (open_count() > 1) { + lsubdout(client->cct, client, 10) << __func__ << ": open" << dendl; + return -CEPHFS_EAGAIN; + } + break; + default: + return -CEPHFS_EINVAL; + } + + /* + * A delegation is essentially a long-held container for cap references that + * we delegate to the client until recalled. The caps required depend on the + * type of delegation (read vs. rw). This is entirely an opportunistic thing. + * If we don't have the necessary caps for the delegation, then we just don't + * grant one. + * + * In principle we could request the caps from the MDS, but a delegation is + * usually requested just after an open. If we don't have the necessary caps + * already, then it's likely that there is some sort of conflicting access. + * + * In the future, we may need to add a way to have this request caps more + * aggressively -- for instance, to handle WANT_DELEGATION for NFSv4.1+. + */ + int need = ceph_deleg_caps_for_type(type); + if (!caps_issued_mask(need)) { + lsubdout(client->cct, client, 10) << __func__ << ": cap mismatch, have=" + << ccap_string(caps_issued()) << " need=" << ccap_string(need) << dendl; + return -CEPHFS_EAGAIN; + } + + for (list<Delegation>::iterator d = delegations.begin(); + d != delegations.end(); ++d) { + Delegation& deleg = *d; + if (deleg.get_fh() == fh) { + deleg.reinit(type, cb, priv); + return 0; + } + } + + delegations.emplace_back(fh, type, cb, priv); + return 0; +} + +/** + * unset_deleg - remove a delegation that was previously set + * @fh: file handle to clear delegation of + * + * Unlink delegation from the Inode (if there is one), put caps and free it. + */ +void Inode::unset_deleg(Fh *fh) +{ + for (list<Delegation>::iterator d = delegations.begin(); + d != delegations.end(); ++d) { + Delegation& deleg = *d; + if (deleg.get_fh() == fh) { + delegations.erase(d); + client->signal_cond_list(waitfor_deleg); + break; + } + } +} + +/** +* mark_caps_dirty - mark some caps dirty +* @caps: the dirty caps +* +* note that if there is no dirty and flushing caps before, we need to pin this inode. +* it will be unpined by handle_cap_flush_ack when there are no dirty and flushing caps. +*/ +void Inode::mark_caps_dirty(int caps) +{ + /* + * If auth_cap is nullptr means the reonnecting is not finished or + * already rejected. + */ + if (!auth_cap) { + ceph_assert(!dirty_caps); + + lsubdout(client->cct, client, 1) << __func__ << " " << *this << " dirty caps '" << ccap_string(caps) + << "', but no auth cap." << dendl; + return; + } + + lsubdout(client->cct, client, 10) << __func__ << " " << *this << " " << ccap_string(dirty_caps) << " -> " + << ccap_string(dirty_caps | caps) << dendl; + + if (caps && !caps_dirty()) + iget(); + + dirty_caps |= caps; + auth_cap->session->get_dirty_list().push_back(&dirty_cap_item); + client->cap_delay_requeue(this); +} + +/** +* mark_caps_clean - only clean the dirty_caps and caller should start flushing the dirty caps. +*/ +void Inode::mark_caps_clean() +{ + lsubdout(client->cct, client, 10) << __func__ << " " << *this << dendl; + dirty_caps = 0; + dirty_cap_item.remove_myself(); +} + + diff --git a/src/client/Inode.h b/src/client/Inode.h new file mode 100644 index 000000000..d28ff2e08 --- /dev/null +++ b/src/client/Inode.h @@ -0,0 +1,366 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_INODE_H +#define CEPH_CLIENT_INODE_H + +#include <numeric> + +#include "include/compat.h" +#include "include/ceph_assert.h" +#include "include/types.h" +#include "include/xlist.h" + +#include "mds/flock.h" +#include "mds/mdstypes.h" // hrm +#include "include/cephfs/types.h" + +#include "osdc/ObjectCacher.h" + +#include "InodeRef.h" +#include "MetaSession.h" +#include "UserPerm.h" +#include "Delegation.h" + +class Client; +class Dentry; +class Dir; +struct SnapRealm; +struct Inode; +class MetaRequest; +class filepath; +class Fh; + +class Cap { +public: + Cap() = delete; + Cap(Inode &i, MetaSession *s) : inode(i), + session(s), + gen(s->cap_gen), + cap_item(this) + { + s->caps.push_back(&cap_item); + } + ~Cap() { + cap_item.remove_myself(); + } + + void touch(void) { + // move to back of LRU + session->caps.push_back(&cap_item); + } + + void dump(Formatter *f) const; + + Inode &inode; + MetaSession *session; + uint64_t cap_id = 0; + unsigned issued = 0; + unsigned implemented = 0; + unsigned wanted = 0; // as known to mds. + uint64_t seq = 0; + uint64_t issue_seq = 0; + __u32 mseq = 0; // migration seq + __u32 gen; + UserPerm latest_perms; + +private: + /* Note that this Cap will not move (see Inode::caps): + * + * Section 23.1.2#8 + * The insert members shall not affect the validity of iterators and + * references to the container, and the erase members shall invalidate only + * iterators and references to the erased elements. + */ + xlist<Cap *>::item cap_item; +}; + +struct CapSnap { + //snapid_t follows; // map key + InodeRef in; + SnapContext context; + int issued = 0, dirty = 0; + + uint64_t size = 0; + utime_t ctime, btime, mtime, atime; + version_t time_warp_seq = 0; + uint64_t change_attr = 0; + uint32_t mode = 0; + uid_t uid = 0; + gid_t gid = 0; + std::map<std::string,bufferptr> xattrs; + version_t xattr_version = 0; + + bufferlist inline_data; + version_t inline_version = 0; + + bool writing = false, dirty_data = false; + uint64_t flush_tid = 0; + + int64_t cap_dirtier_uid = -1; + int64_t cap_dirtier_gid = -1; + + explicit CapSnap(Inode *i) + : in(i) + {} + + void dump(Formatter *f) const; +}; + +// inode flags +#define I_COMPLETE (1 << 0) +#define I_DIR_ORDERED (1 << 1) +#define I_SNAPDIR_OPEN (1 << 2) +#define I_KICK_FLUSH (1 << 3) +#define I_CAP_DROPPED (1 << 4) +#define I_ERROR_FILELOCK (1 << 5) + +struct Inode : RefCountedObject { + ceph::coarse_mono_time hold_caps_until; + Client *client; + + // -- the actual inode -- + inodeno_t ino; // ORDER DEPENDENCY: oset + snapid_t snapid; + ino_t faked_ino = 0; + + uint32_t rdev = 0; // if special file + + // affected by any inode change... + utime_t ctime; // inode change time + utime_t btime; // birth time + + // perm (namespace permissions) + uint32_t mode = 0; + uid_t uid = 0; + gid_t gid = 0; + + // nlink + int32_t nlink = 0; + + // file (data access) + ceph_dir_layout dir_layout{}; + file_layout_t layout; + uint64_t size = 0; // on directory, # dentries + uint32_t truncate_seq = 1; + uint64_t truncate_size = -1; + utime_t mtime; // file data modify time. + utime_t atime; // file data access time. + uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes()) + uint64_t change_attr = 0; + + uint64_t max_size = 0; // max size we can write to + + // dirfrag, recursive accountin + frag_info_t dirstat; + nest_info_t rstat; + + // special stuff + version_t version = 0; // auth only + version_t xattr_version = 0; + utime_t snap_btime; // snapshot creation (birth) time + std::map<std::string, std::string> snap_metadata; + + // inline data + version_t inline_version = 0; + bufferlist inline_data; + + std::vector<uint8_t> fscrypt_auth; + std::vector<uint8_t> fscrypt_file; + bool is_fscrypt_enabled() { + return !!fscrypt_auth.size(); + } + + bool is_root() const { return ino == CEPH_INO_ROOT; } + bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; } + bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; } + bool is_file() const { return (mode & S_IFMT) == S_IFREG; } + + bool has_dir_layout() const { + return layout != file_layout_t(); + } + + __u32 hash_dentry_name(const std::string &dn) { + int which = dir_layout.dl_dir_hash; + if (!which) + which = CEPH_STR_HASH_LINUX; + ceph_assert(ceph_str_hash_valid(which)); + return ceph_str_hash(which, dn.data(), dn.length()); + } + + unsigned flags = 0; + + quota_info_t quota; + + bool is_complete_and_ordered() { + static const unsigned wants = I_COMPLETE | I_DIR_ORDERED; + return (flags & wants) == wants; + } + + // about the dir (if this is one!) + Dir *dir = 0; // if i'm a dir. + fragtree_t dirfragtree; + uint64_t dir_release_count = 1; + uint64_t dir_ordered_count = 1; + bool dir_hashed = false; + bool dir_replicated = false; + + // per-mds caps + std::map<mds_rank_t, Cap> caps; // mds -> Cap + Cap *auth_cap = 0; + int64_t cap_dirtier_uid = -1; + int64_t cap_dirtier_gid = -1; + unsigned dirty_caps = 0; + unsigned flushing_caps = 0; + std::map<ceph_tid_t, int> flushing_cap_tids; + int shared_gen = 0; + int cache_gen = 0; + int snap_caps = 0; + int snap_cap_refs = 0; + xlist<Inode*>::item delay_cap_item, dirty_cap_item, flushing_cap_item; + + SnapRealm *snaprealm = 0; + xlist<Inode*>::item snaprealm_item; + InodeRef snapdir_parent; // only if we are a snapdir inode + std::map<snapid_t,CapSnap> cap_snaps; // pending flush to mds + + //int open_by_mode[CEPH_FILE_MODE_NUM]; + std::map<int,int> open_by_mode; + std::map<int,int> cap_refs; + + ObjectCacher::ObjectSet oset; // ORDER DEPENDENCY: ino + + uint64_t reported_size = 0; + uint64_t wanted_max_size = 0; + uint64_t requested_max_size = 0; + + uint64_t ll_ref = 0; // separate ref count for ll client + xlist<Dentry *> dentries; // if i'm linked to a dentry. + std::string symlink; // symlink content, if it's a symlink + std::map<std::string,bufferptr> xattrs; + std::map<frag_t,int> fragmap; // known frag -> mds mappings + std::map<frag_t, std::vector<mds_rank_t>> frag_repmap; // non-auth mds mappings + + std::list<ceph::condition_variable*> waitfor_caps; + std::list<ceph::condition_variable*> waitfor_commit; + std::list<ceph::condition_variable*> waitfor_deleg; + + Dentry *get_first_parent() { + ceph_assert(!dentries.empty()); + return *dentries.begin(); + } + + void make_long_path(filepath& p); + void make_short_path(filepath& p); + void make_nosnap_relative_path(filepath& p); + + // The ref count. 1 for each dentry, fh, inode_map, + // cwd that links to me. + void iget() { get(); } + void iput(int n=1) { ceph_assert(n >= 0); while (n--) put(); } + + void ll_get() { + ll_ref++; + } + void ll_put(uint64_t n=1) { + ceph_assert(ll_ref >= n); + ll_ref -= n; + } + + // file locks + std::unique_ptr<ceph_lock_state_t> fcntl_locks; + std::unique_ptr<ceph_lock_state_t> flock_locks; + + bool has_any_filelocks() { + return + (fcntl_locks && !fcntl_locks->empty()) || + (flock_locks && !flock_locks->empty()); + } + + std::list<Delegation> delegations; + + xlist<MetaRequest*> unsafe_ops; + + std::set<Fh*> fhs; + + mds_rank_t dir_pin = MDS_RANK_NONE; + + Inode() = delete; + Inode(Client *c, vinodeno_t vino, file_layout_t *newlayout) + : client(c), ino(vino.ino), snapid(vino.snapid), delay_cap_item(this), + dirty_cap_item(this), flushing_cap_item(this), snaprealm_item(this), + oset((void *)this, newlayout->pool_id, this->ino) {} + ~Inode(); + + vinodeno_t vino() const { return vinodeno_t(ino, snapid); } + + struct Compare { + bool operator() (Inode* const & left, Inode* const & right) { + if (left->ino.val < right->ino.val) { + return (left->snapid.val < right->snapid.val); + } + return false; + } + }; + + bool check_mode(const UserPerm& perms, unsigned want); + + // CAPS -------- + void get_open_ref(int mode); + bool put_open_ref(int mode); + + void get_cap_ref(int cap); + int put_cap_ref(int cap); + bool is_any_caps(); + bool cap_is_valid(const Cap &cap) const; + int caps_issued(int *implemented = 0) const; + void try_touch_cap(mds_rank_t mds); + bool caps_issued_mask(unsigned mask, bool allow_impl=false); + int caps_used(); + int caps_file_wanted(); + int caps_wanted(); + int caps_mds_wanted(); + int caps_dirty(); + const UserPerm *get_best_perms(); + + bool have_valid_size(); + Dir *open_dir(); + + void add_fh(Fh *f) {fhs.insert(f);} + void rm_fh(Fh *f) {fhs.erase(f);} + void set_async_err(int r); + void dump(Formatter *f) const; + + void break_all_delegs() { break_deleg(false); }; + + void recall_deleg(bool skip_read); + bool has_recalled_deleg(); + int set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv); + void unset_deleg(Fh *fh); + + void mark_caps_dirty(int caps); + void mark_caps_clean(); +private: + // how many opens for write on this Inode? + long open_count_for_write() + { + return (long)(open_by_mode[CEPH_FILE_MODE_RDWR] + + open_by_mode[CEPH_FILE_MODE_WR]); + }; + + // how many opens of any sort on this inode? + long open_count() + { + return (long) std::accumulate(open_by_mode.begin(), open_by_mode.end(), 0, + [] (int value, const std::map<int, int>::value_type& p) + { return value + p.second; }); + }; + + void break_deleg(bool skip_read); + bool delegations_broken(bool skip_read); + +}; + +std::ostream& operator<<(std::ostream &out, const Inode &in); + +#endif diff --git a/src/client/InodeRef.h b/src/client/InodeRef.h new file mode 100644 index 000000000..822ec0ffc --- /dev/null +++ b/src/client/InodeRef.h @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_INODEREF_H +#define CEPH_CLIENT_INODEREF_H + +#include <boost/intrusive_ptr.hpp> +class Inode; +void intrusive_ptr_add_ref(Inode *in); +void intrusive_ptr_release(Inode *in); +typedef boost::intrusive_ptr<Inode> InodeRef; +#endif diff --git a/src/client/MetaRequest.cc b/src/client/MetaRequest.cc new file mode 100644 index 000000000..6d709db58 --- /dev/null +++ b/src/client/MetaRequest.cc @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "client/MetaRequest.h" +#include "client/Dentry.h" +#include "client/Inode.h" +#include "messages/MClientReply.h" +#include "common/Formatter.h" + +void MetaRequest::dump(Formatter *f) const +{ + auto age = std::chrono::duration<double>(ceph_clock_now() - op_stamp); + + f->dump_unsigned("tid", tid); + f->dump_string("op", ceph_mds_op_name(head.op)); + f->dump_stream("path") << path; + f->dump_stream("path2") << path2; + if (_inode) + f->dump_stream("ino") << _inode->ino; + if (_old_inode) + f->dump_stream("old_ino") << _old_inode->ino; + if (_other_inode) + f->dump_stream("other_ino") << _other_inode->ino; + if (target) + f->dump_stream("target_ino") << target->ino; + if (_dentry) + f->dump_string("dentry", _dentry->name); + if (_old_dentry) + f->dump_string("old_dentry", _old_dentry->name); + f->dump_stream("hint_ino") << inodeno_t(head.ino); + + f->dump_stream("sent_stamp") << sent_stamp; + f->dump_float("age", age.count()); + f->dump_int("mds", mds); + f->dump_int("resend_mds", resend_mds); + f->dump_int("send_to_auth", send_to_auth); + f->dump_unsigned("sent_on_mseq", sent_on_mseq); + f->dump_int("retry_attempt", retry_attempt); + + f->dump_int("got_unsafe", got_unsafe); + + f->dump_unsigned("uid", head.caller_uid); + f->dump_unsigned("gid", head.caller_gid); + + f->dump_unsigned("oldest_client_tid", head.oldest_client_tid); + f->dump_unsigned("mdsmap_epoch", head.mdsmap_epoch); + f->dump_unsigned("flags", head.flags); + f->dump_unsigned("num_retry", head.ext_num_retry); + f->dump_unsigned("num_fwd", head.ext_num_fwd); + f->dump_unsigned("num_releases", head.num_releases); + + f->dump_int("abort_rc", abort_rc); + + f->dump_unsigned("owner_uid", head.owner_uid); + f->dump_unsigned("owner_gid", head.owner_gid); +} + +MetaRequest::~MetaRequest() +{ + if (_dentry) + _dentry->put(); + if (_old_dentry) + _old_dentry->put(); +} + +void MetaRequest::set_dentry(Dentry *d) { + ceph_assert(_dentry == NULL); + _dentry = d; + _dentry->get(); +} +Dentry *MetaRequest::dentry() { + return _dentry; +} + +void MetaRequest::set_old_dentry(Dentry *d) { + ceph_assert(_old_dentry == NULL); + _old_dentry = d; + _old_dentry->get(); +} +Dentry *MetaRequest::old_dentry() { + return _old_dentry; +} diff --git a/src/client/MetaRequest.h b/src/client/MetaRequest.h new file mode 100644 index 000000000..49ee6dc6e --- /dev/null +++ b/src/client/MetaRequest.h @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_METAREQUEST_H +#define CEPH_CLIENT_METAREQUEST_H + + +#include "include/types.h" +#include "include/xlist.h" +#include "include/filepath.h" +#include "mds/mdstypes.h" +#include "InodeRef.h" +#include "UserPerm.h" + +#include "messages/MClientRequest.h" +#include "messages/MClientReply.h" + +class Dentry; +class dir_result_t; + +struct MetaRequest { +private: + InodeRef _inode, _old_inode, _other_inode; + Dentry *_dentry = NULL; //associated with path + Dentry *_old_dentry = NULL; //associated with path2 + int abort_rc = 0; +public: + ceph::coarse_mono_time created = ceph::coarse_mono_clock::zero(); + uint64_t tid = 0; + utime_t op_stamp; + ceph_mds_request_head head; + filepath path, path2; + std::string alternate_name; + std::vector<uint8_t> fscrypt_auth; + std::vector<uint8_t> fscrypt_file; + bufferlist data; + int inode_drop = 0; //the inode caps this operation will drop + int inode_unless = 0; //unless we have these caps already + int old_inode_drop = 0, old_inode_unless = 0; + int dentry_drop = 0, dentry_unless = 0; + int old_dentry_drop = 0, old_dentry_unless = 0; + int other_inode_drop = 0, other_inode_unless = 0; + std::vector<MClientRequest::Release> cap_releases; + + int regetattr_mask = 0; // getattr mask if i need to re-stat after a traceless reply + + utime_t sent_stamp; + mds_rank_t mds = -1; // who i am asking + mds_rank_t resend_mds = -1; // someone wants you to (re)send the request here + bool send_to_auth = false; // must send to auth mds + __u32 sent_on_mseq = 0; // mseq at last submission of this request + int num_fwd = 0; // # of times i've been forwarded + int retry_attempt = 0; + std::atomic<uint64_t> ref = { 1 }; + + ceph::cref_t<MClientReply> reply = NULL; // the reply + bool kick = false; + bool success = false; + + // readdir result + dir_result_t *dirp = NULL; + + //possible responses + bool got_unsafe = false; + + xlist<MetaRequest*>::item item; + xlist<MetaRequest*>::item unsafe_item; + xlist<MetaRequest*>::item unsafe_dir_item; + xlist<MetaRequest*>::item unsafe_target_item; + + ceph::condition_variable *caller_cond = NULL; // who to take up + ceph::condition_variable *dispatch_cond = NULL; // who to kick back + std::list<ceph::condition_variable*> waitfor_safe; + + InodeRef target; + UserPerm perms; + + explicit MetaRequest(int op) : + item(this), unsafe_item(this), unsafe_dir_item(this), + unsafe_target_item(this) { + memset(&head, 0, sizeof(head)); + head.op = op; + head.owner_uid = -1; + head.owner_gid = -1; + } + ~MetaRequest(); + + /** + * Prematurely terminate the request, such that callers + * to make_request will receive `rc` as their result. + */ + void abort(int rc) + { + ceph_assert(rc != 0); + abort_rc = rc; + } + + /** + * Whether abort() has been called for this request + */ + inline bool aborted() const + { + return abort_rc != 0; + } + + /** + * Given that abort() has been called for this request, what `rc` was + * passed into it? + */ + int get_abort_code() const + { + return abort_rc; + } + + void set_inode(Inode *in) { + _inode = in; + } + Inode *inode() { + return _inode.get(); + } + void take_inode(InodeRef *out) { + out->swap(_inode); + } + void set_old_inode(Inode *in) { + _old_inode = in; + } + Inode *old_inode() { + return _old_inode.get(); + } + void take_old_inode(InodeRef *out) { + out->swap(_old_inode); + } + void set_other_inode(Inode *in) { + _other_inode = in; + } + Inode *other_inode() { + return _other_inode.get(); + } + void take_other_inode(InodeRef *out) { + out->swap(_other_inode); + } + void set_dentry(Dentry *d); + Dentry *dentry(); + void set_old_dentry(Dentry *d); + Dentry *old_dentry(); + + MetaRequest* get() { + ref++; + return this; + } + + /// psuedo-private put method; use Client::put_request() + bool _put() { + int v = --ref; + return v == 0; + } + + void set_inode_owner_uid_gid(unsigned u, unsigned g) { + /* it makes sense to set owner_{u,g}id only for OPs which create inodes */ + ceph_assert(IS_CEPH_MDS_OP_NEWINODE(head.op)); + head.owner_uid = u; + head.owner_gid = g; + } + + // normal fields + void set_tid(ceph_tid_t t) { tid = t; } + void set_oldest_client_tid(ceph_tid_t t) { head.oldest_client_tid = t; } + void inc_num_fwd() { head.ext_num_fwd = head.ext_num_fwd + 1; } + void set_retry_attempt(int a) { head.ext_num_retry = a; } + void set_filepath(const filepath& fp) { path = fp; } + void set_filepath2(const filepath& fp) { path2 = fp; } + void set_alternate_name(std::string an) { alternate_name = an; } + void set_string2(const char *s) { path2.set_path(std::string_view(s), 0); } + void set_caller_perms(const UserPerm& _perms) { + perms = _perms; + head.caller_uid = perms.uid(); + head.caller_gid = perms.gid(); + } + uid_t get_uid() { return perms.uid(); } + uid_t get_gid() { return perms.gid(); } + void set_data(const bufferlist &d) { data = d; } + void set_dentry_wanted() { + head.flags = head.flags | CEPH_MDS_FLAG_WANT_DENTRY; + } + int get_op() { return head.op; } + ceph_tid_t get_tid() { return tid; } + filepath& get_filepath() { return path; } + filepath& get_filepath2() { return path2; } + + bool is_write() { + return + (head.op & CEPH_MDS_OP_WRITE) || + (head.op == CEPH_MDS_OP_OPEN && (head.args.open.flags & (O_CREAT|O_TRUNC))); + } + bool can_forward() { + if ((head.op & CEPH_MDS_OP_WRITE) || + head.op == CEPH_MDS_OP_OPEN) // do not forward _any_ open request. + return false; + return true; + } + bool auth_is_best(int issued) { + if (send_to_auth) + return true; + + /* Any write op ? */ + if (head.op & CEPH_MDS_OP_WRITE) + return true; + + switch (head.op) { + case CEPH_MDS_OP_OPEN: + case CEPH_MDS_OP_READDIR: + return true; + case CEPH_MDS_OP_GETATTR: + /* + * If any 'x' caps is issued we can just choose the auth MDS + * instead of the random replica MDSes. Because only when the + * Locker is in LOCK_EXEC state will the loner client could + * get the 'x' caps. And if we send the getattr requests to + * any replica MDS it must auth pin and tries to rdlock from + * the auth MDS, and then the auth MDS need to do the Locker + * state transition to LOCK_SYNC. And after that the lock state + * will change back. + * + * This cost much when doing the Locker state transition and + * usually will need to revoke caps from clients. + * + * And for the 'Xs' caps for getxattr we will also choose the + * auth MDS, because the MDS side code is buggy due to setxattr + * won't notify the replica MDSes when the values changed and + * the replica MDS will return the old values. Though we will + * fix it in MDS code, but this still makes sense for old ceph. + */ + if (((head.args.getattr.mask & CEPH_CAP_ANY_SHARED) && + (issued & CEPH_CAP_ANY_EXCL)) || + (head.args.getattr.mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) + return true; + default: + return false; + } + } + + void dump(Formatter *f) const; + +}; + +#endif diff --git a/src/client/MetaSession.cc b/src/client/MetaSession.cc new file mode 100644 index 000000000..b5160a843 --- /dev/null +++ b/src/client/MetaSession.cc @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "messages/MClientCapRelease.h" + +#include "MetaSession.h" +#include "Inode.h" + +#include "common/Formatter.h" + +const char *MetaSession::get_state_name() const +{ + switch (state) { + case STATE_NEW: return "new"; + case STATE_OPENING: return "opening"; + case STATE_OPEN: return "open"; + case STATE_CLOSING: return "closing"; + case STATE_CLOSED: return "closed"; + case STATE_STALE: return "stale"; + default: return "unknown"; + } +} + +void MetaSession::dump(Formatter *f, bool cap_dump) const +{ + f->dump_int("mds", mds_num); + f->dump_object("addrs", addrs); + f->dump_unsigned("seq", seq); + f->dump_unsigned("cap_gen", cap_gen); + f->dump_stream("cap_ttl") << cap_ttl; + f->dump_stream("last_cap_renew_request") << last_cap_renew_request; + f->dump_unsigned("cap_renew_seq", cap_renew_seq); + f->dump_int("num_caps", caps.size()); + if (cap_dump) { + f->open_array_section("caps"); + for (const auto& cap : caps) { + f->dump_object("cap", *cap); + } + f->close_section(); + } + f->dump_string("state", get_state_name()); +} + +void MetaSession::enqueue_cap_release(inodeno_t ino, uint64_t cap_id, ceph_seq_t iseq, + ceph_seq_t mseq, epoch_t osd_barrier) +{ + if (!release) { + release = ceph::make_message<MClientCapRelease>(); + } + + if (osd_barrier > release->osd_epoch_barrier) { + release->osd_epoch_barrier = osd_barrier; + } + + ceph_mds_cap_item i; + i.ino = ino; + i.cap_id = cap_id; + i.seq = iseq; + i.migrate_seq = mseq; + release->caps.push_back(i); +} diff --git a/src/client/MetaSession.h b/src/client/MetaSession.h new file mode 100644 index 000000000..301306263 --- /dev/null +++ b/src/client/MetaSession.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_METASESSION_H +#define CEPH_CLIENT_METASESSION_H + +#include "include/types.h" +#include "include/utime.h" +#include "include/xlist.h" +#include "mds/MDSMap.h" +#include "mds/mdstypes.h" +#include "messages/MClientCapRelease.h" + +struct Cap; +struct Inode; +struct CapSnap; +struct MetaRequest; + +struct MetaSession { + mds_rank_t mds_num; + ConnectionRef con; + version_t seq = 0; + uint64_t cap_gen = 0; + utime_t cap_ttl, last_cap_renew_request; + uint64_t cap_renew_seq = 0; + entity_addrvec_t addrs; + feature_bitset_t mds_features; + feature_bitset_t mds_metric_flags; + + enum { + STATE_NEW, // Unused + STATE_OPENING, + STATE_OPEN, + STATE_CLOSING, + STATE_CLOSED, + STATE_STALE, + STATE_REJECTED, + } state = STATE_OPENING; + + enum { + RECLAIM_NULL, + RECLAIMING, + RECLAIM_OK, + RECLAIM_FAIL, + } reclaim_state = RECLAIM_NULL; + + int mds_state = MDSMap::STATE_NULL; + bool readonly = false; + + std::list<Context*> waiting_for_open; + + xlist<Cap*> caps; + // dirty_list keeps all the dirty inodes before flushing in current session. + xlist<Inode*> dirty_list; + xlist<Inode*> flushing_caps; + xlist<MetaRequest*> requests; + xlist<MetaRequest*> unsafe_requests; + std::set<ceph_tid_t> flushing_caps_tids; + + ceph::ref_t<MClientCapRelease> release; + + MetaSession(mds_rank_t mds_num, ConnectionRef con, const entity_addrvec_t& addrs) + : mds_num(mds_num), con(con), addrs(addrs) { + } + ~MetaSession() { + ceph_assert(caps.empty()); + ceph_assert(dirty_list.empty()); + ceph_assert(flushing_caps.empty()); + ceph_assert(requests.empty()); + ceph_assert(unsafe_requests.empty()); + } + + xlist<Inode*> &get_dirty_list() { return dirty_list; } + + const char *get_state_name() const; + + void dump(Formatter *f, bool cap_dump=false) const; + + void enqueue_cap_release(inodeno_t ino, uint64_t cap_id, ceph_seq_t iseq, + ceph_seq_t mseq, epoch_t osd_barrier); +}; + +using MetaSessionRef = std::shared_ptr<MetaSession>; +#endif diff --git a/src/client/ObjecterWriteback.h b/src/client/ObjecterWriteback.h new file mode 100644 index 000000000..867ef5aa0 --- /dev/null +++ b/src/client/ObjecterWriteback.h @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OSDC_OBJECTERWRITEBACKHANDLER_H +#define CEPH_OSDC_OBJECTERWRITEBACKHANDLER_H + +#include "osdc/Objecter.h" +#include "osdc/WritebackHandler.h" + +class ObjecterWriteback : public WritebackHandler { + public: + ObjecterWriteback(Objecter *o, Finisher *fin, ceph::mutex *lock) + : m_objecter(o), + m_finisher(fin), + m_lock(lock) { } + ~ObjecterWriteback() override {} + + void read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, uint64_t off, uint64_t len, + snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) override { + m_objecter->read_trunc(oid, oloc, off, len, snapid, pbl, 0, + trunc_size, trunc_seq, + new C_OnFinisher(new C_Lock(m_lock, onfinish), + m_finisher)); + } + + bool may_copy_on_write(const object_t& oid, uint64_t read_off, + uint64_t read_len, snapid_t snapid) override { + return false; + } + + ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, uint64_t trunc_size, + __u32 trunc_seq, ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) override { + return m_objecter->write_trunc(oid, oloc, off, len, snapc, bl, mtime, 0, + trunc_size, trunc_seq, + new C_OnFinisher(new C_Lock(m_lock, + oncommit), + m_finisher)); + } + + bool can_scattered_write() override { return true; } + using WritebackHandler::write; + ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, + std::vector<std::pair<uint64_t, bufferlist> >& io_vec, + const SnapContext& snapc, ceph::real_time mtime, + uint64_t trunc_size, __u32 trunc_seq, + Context *oncommit) override { + ObjectOperation op; + for (auto& [offset, bl] : io_vec) + op.write(offset, bl, trunc_size, trunc_seq); + + return m_objecter->mutate(oid, oloc, op, snapc, mtime, 0, + new C_OnFinisher(new C_Lock(m_lock, oncommit), + m_finisher)); + } + + private: + Objecter *m_objecter; + Finisher *m_finisher; + ceph::mutex *m_lock; +}; + +#endif diff --git a/src/client/RWRef.h b/src/client/RWRef.h new file mode 100644 index 000000000..9035a0937 --- /dev/null +++ b/src/client/RWRef.h @@ -0,0 +1,244 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + * ============ + * + * This is a common read/write reference framework, which will work + * simliarly to a RW lock, the difference here is that for the "readers" + * they won't hold any lock but will increase a reference instead when + * the "require" state is matched, or set a flag to tell the callers + * that the "require" state is not matched and also there is no any + * wait mechanism for "readers" to wait the state until it matches. It + * will let the callers determine what to do next. + * + * The usage, such as in libcephfs's client/Client.cc case: + * + * The Readers: + * + * For the ll_read()/ll_write(), etc fucntions, they will work as + * "readers", in the beginning they just need to define a RWRef + * object and in RWRef constructor it will check if the state is + * MOUNTED or MOUTING, if not it will fail and return directly with + * doing nothing, or it will increase the reference and continue. + * And when destructing the RWRef object, in the RWRef destructor + * it will decrease the reference and notify the "writers" who maybe + * waiting. + * + * The Writers: + * + * And for the _unmount() function , as a "writer", in the beginning + * it will also just need to define a RWRef object and in RWRef + * constructor it will update the state to next stage first, which then + * will fail all the new comming "readers", and then wait for all the + * "readers" to finish. + * + * With this we can get rid of the locks for all the "readers" and they + * can run in parallel. And we won't have any potential deadlock issue + * with RWRef, such as: + * + * With RWLock: + * + * ThreadA: ThreadB: + * + * write_lock<RWLock1>.lock(); another_lock.lock(); + * state = NEXT_STATE; ... + * another_lock.lock(); read_lock<RWLock1>.lock(); + * ... if (state == STATE) { + * ... + * } + * ... + * + * With RWRef: + * + * ThreadA: ThreadB: + * + * w = RWRef(myS, NEXT_STATE, false); another_lock.lock(); + * another_lock.lock(); r = RWRef(myS, STATE); + * ... if (r.is_state_satisfied()) { + * ... + * } + * ... + * + * And also in ThreadA, if it needs to do the cond.wait(&another_lock), + * it will goto sleep by holding the write_lock<RWLock1> for the RWLock + * case, if the ThreadBs are for some IOs, they may stuck for a very long + * time that may get timedout in the uplayer which may keep retrying. + * With the RWRef, the ThreadB will fail or continue directly without any + * stuck, and the uplayer will knew what next to do quickly. + */ + +#ifndef CEPH_RWRef_Posix__H +#define CEPH_RWRef_Posix__H + +#include <string> +#include "include/ceph_assert.h" +#include "common/ceph_mutex.h" + +/* The status mechanism info */ +template<typename T> +struct RWRefState { + public: + template <typename T1> friend class RWRef; + + /* + * This will be status mechanism. Currently you need to define + * it by yourself. + */ + T state; + + /* + * User defined method to check whether the "require" state + * is in the proper range we need. + * + * For example for the client/Client.cc: + * In some reader operation cases we need to make sure the + * client state is in mounting or mounted states, then it + * will set the "require = mounting" in class RWRef's constructor. + * Then the check_reader_state() should return truth if the + * state is already in mouting or mounted state. + */ + virtual int check_reader_state(T require) const = 0; + + /* + * User defined method to check whether the "require" state + * is in the proper range we need. + * + * This will usually be the state migration check. + */ + virtual int check_writer_state(T require) const = 0; + + /* + * User defined method to check whether the "require" + * state is valid or not. + */ + virtual bool is_valid_state(T require) const = 0; + + int64_t get_state() const { + std::scoped_lock l{lock}; + return state; + } + + bool check_current_state(T require) const { + ceph_assert(is_valid_state(require)); + + std::scoped_lock l{lock}; + return state == require; + } + + RWRefState(T init_state, const char *lockname, uint64_t _reader_cnt=0) + : state(init_state), lock(ceph::make_mutex(lockname)), reader_cnt(_reader_cnt) {} + virtual ~RWRefState() {} + + private: + mutable ceph::mutex lock; + ceph::condition_variable cond; + uint64_t reader_cnt = 0; +}; + +template<typename T> +class RWRef { +public: + RWRef(const RWRef& other) = delete; + const RWRef& operator=(const RWRef& other) = delete; + + RWRef(RWRefState<T> &s, T require, bool ir=true) + :S(s), is_reader(ir) { + ceph_assert(S.is_valid_state(require)); + + std::scoped_lock l{S.lock}; + if (likely(is_reader)) { // Readers will update the reader_cnt + if (S.check_reader_state(require)) { + S.reader_cnt++; + satisfied = true; + } + } else { // Writers will update the state + is_reader = false; + + /* + * If the current state is not the same as "require" + * then update the state and we are the first writer. + * + * Or if there already has one writer running or + * finished, it will let user to choose to continue + * or just break. + */ + if (S.check_writer_state(require)) { + first_writer = true; + S.state = require; + } + satisfied = true; + } + } + + /* + * Whether the "require" state is in the proper range of + * the states. + */ + bool is_state_satisfied() const { + return satisfied; + } + + /* + * Update the state, and only the writer could do the update. + */ + void update_state(T new_state) { + ceph_assert(!is_reader); + ceph_assert(S.is_valid_state(new_state)); + + std::scoped_lock l{S.lock}; + S.state = new_state; + } + + /* + * For current state whether we are the first writer or not + */ + bool is_first_writer() const { + return first_writer; + } + + /* + * Will wait for all the in-flight "readers" to finish + */ + void wait_readers_done() { + // Only writers can wait + ceph_assert(!is_reader); + + std::unique_lock l{S.lock}; + + S.cond.wait(l, [this] { + return !S.reader_cnt; + }); + } + + ~RWRef() { + std::scoped_lock l{S.lock}; + if (!is_reader) + return; + + if (!satisfied) + return; + + /* + * Decrease the refcnt and notify the waiters + */ + if (--S.reader_cnt == 0) + S.cond.notify_all(); + } + +private: + RWRefState<T> &S; + bool satisfied = false; + bool first_writer = false; + bool is_reader = true; +}; + +#endif // !CEPH_RWRef_Posix__H diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc new file mode 100644 index 000000000..3b408dd3f --- /dev/null +++ b/src/client/SyntheticClient.cc @@ -0,0 +1,3438 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" + +#include <iostream> +#include <sstream> + + +#include "common/config.h" +#include "SyntheticClient.h" +#include "osdc/Objecter.h" +#include "osdc/Filer.h" + + +#include "include/filepath.h" +#include "common/perf_counters.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <utime.h> +#include <math.h> +#include <sys/statvfs.h> + +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "include/cephfs/ceph_ll_client.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_client +#undef dout_prefix +#define dout_prefix *_dout << "client." << (whoami >= 0 ? whoami:client->get_nodeid()) << " " + +using namespace std; +// traces +//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); +//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); + +int num_client = 1; +list<int> syn_modes; +list<int> syn_iargs; +list<string> syn_sargs; +int syn_filer_flags = 0; + +void parse_syn_options(vector<const char*>& args) +{ + vector<const char*> nargs; + + for (unsigned i=0; i<args.size(); i++) { + if (strcmp(args[i],"--num-client") == 0) { + num_client = atoi(args[++i]); + continue; + } + if (strcmp(args[i],"--syn") == 0) { + ++i; + + if (strcmp(args[i], "mksnap") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_MKSNAP); + syn_sargs.push_back(args[++i]); // path + syn_sargs.push_back(args[++i]); // name + } + else if (strcmp(args[i], "rmsnap") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_RMSNAP); + syn_sargs.push_back(args[++i]); // path + syn_sargs.push_back(args[++i]); // name + } else if (strcmp(args[i], "mksnapfile") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_MKSNAPFILE); + syn_sargs.push_back(args[++i]); // path + } else if (strcmp(args[i],"rmfile") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_RMFILE ); + } else if (strcmp(args[i],"writefile") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_WRITEFILE ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"wrshared") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_WRSHARED ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"writebatch") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_WRITEBATCH ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"readfile") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_READFILE ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"readwriterandom") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_RDWRRANDOM ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"readwriterandom_ex") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_RDWRRANDOM_EX ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"overloadosd0") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OVERLOAD_OSD_0 ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"readshared") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_READSHARED ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"rw") == 0) { + int a = atoi(args[++i]); + int b = atoi(args[++i]); + syn_modes.push_back( SYNCLIENT_MODE_WRITEFILE ); + syn_iargs.push_back( a ); + syn_iargs.push_back( b ); + syn_modes.push_back( SYNCLIENT_MODE_READFILE ); + syn_iargs.push_back( a ); + syn_iargs.push_back( b ); + } else if (strcmp(args[i],"dumpplacement") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_DUMP ); + syn_sargs.push_back( args[++i] ); + } else if (strcmp(args[i],"dropcache") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_DROPCACHE ); + } else if (strcmp(args[i],"makedirs") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRS ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"makedirmess") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRMESS ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"statdirs") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_STATDIRS ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"readdirs") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_READDIRS ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"makefiles") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_MAKEFILES ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"makefiles2") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_MAKEFILES2 ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"linktest") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_LINKTEST ); + } else if (strcmp(args[i],"createshared") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_CREATESHARED ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"openshared") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OPENSHARED ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"createobjects") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_CREATEOBJECTS ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"objectrw") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OBJECTRW ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"walk") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_FULLWALK ); + //syn_sargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"randomwalk") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_RANDOMWALK ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"trace") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_TRACE ); + syn_sargs.push_back( args[++i] ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back(1);// data + } else if (strcmp(args[i],"mtrace") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_TRACE ); + syn_sargs.push_back( args[++i] ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back(0);// no data + } else if (strcmp(args[i],"thrashlinks") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_THRASHLINKS ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"foo") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_FOO ); + } else if (strcmp(args[i],"until") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_UNTIL ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"sleepuntil") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_SLEEPUNTIL ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"only") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_ONLY ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"onlyrange") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_ONLYRANGE ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"sleep") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_SLEEP ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"randomsleep") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_RANDOMSLEEP ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"opentest") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OPENTEST ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"optest") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OPTEST ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"truncate") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_TRUNCATE ); + syn_sargs.push_back(args[++i]); + syn_iargs.push_back(atoi(args[++i])); + } else if (strcmp(args[i],"importfind") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_IMPORTFIND); + syn_sargs.push_back(args[++i]); + syn_sargs.push_back(args[++i]); + syn_iargs.push_back(atoi(args[++i])); + } else if (strcmp(args[i], "lookuphash") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_LOOKUPHASH); + syn_sargs.push_back(args[++i]); + syn_sargs.push_back(args[++i]); + syn_sargs.push_back(args[++i]); + } else if (strcmp(args[i], "lookupino") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_LOOKUPINO); + syn_sargs.push_back(args[++i]); + } else if (strcmp(args[i], "chunkfile") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_CHUNK); + syn_sargs.push_back(args[++i]); + } else { + cerr << "unknown syn arg " << args[i] << std::endl; + ceph_abort(); + } + } + else if (strcmp(args[i], "localize_reads") == 0) { + cerr << "set CEPH_OSD_FLAG_LOCALIZE_READS" << std::endl; + syn_filer_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; + } + else { + nargs.push_back(args[i]); + } + } + + args = nargs; +} + + +SyntheticClient::SyntheticClient(StandaloneClient *client, int w) +{ + this->client = client; + whoami = w; + thread_id = 0; + + did_readdir = false; + + run_only = -1; + exclude = -1; + + this->modes = syn_modes; + this->iargs = syn_iargs; + this->sargs = syn_sargs; + + run_start = ceph_clock_now(); +} + + + + +#define DBL 2 + +void *synthetic_client_thread_entry(void *ptr) +{ + SyntheticClient *sc = static_cast<SyntheticClient*>(ptr); + //int r = + sc->run(); + return 0;//(void*)r; +} + +string SyntheticClient::get_sarg(int seq) +{ + string a; + if (!sargs.empty()) { + a = sargs.front(); + sargs.pop_front(); + } + if (a.length() == 0 || a == "~") { + char s[30]; + snprintf(s, sizeof(s), "syn.%lld.%d", (long long)client->whoami.v, seq); + a = s; + } + return a; +} + +int SyntheticClient::run() +{ + UserPerm perms = client->pick_my_perms(); + dout(15) << "initing" << dendl; + int err = client->init(); + if (err < 0) { + dout(0) << "failed to initialize: " << cpp_strerror(err) << dendl; + return -1; + } + + dout(15) << "mounting" << dendl; + err = client->mount("", perms); + if (err < 0) { + dout(0) << "failed to mount: " << cpp_strerror(err) << dendl; + client->shutdown(); + return -1; + } + + //run_start = ceph_clock_now(client->cct); + run_until = utime_t(0,0); + dout(5) << "run" << dendl; + + int seq = 0; + + for (list<int>::iterator it = modes.begin(); + it != modes.end(); + ++it) { + int mode = *it; + dout(3) << "mode " << mode << dendl; + + switch (mode) { + + + // WHO? + + case SYNCLIENT_MODE_ONLY: + { + run_only = iargs.front(); + iargs.pop_front(); + if (run_only == client->get_nodeid()) + dout(2) << "only " << run_only << dendl; + } + break; + case SYNCLIENT_MODE_ONLYRANGE: + { + int first = iargs.front(); + iargs.pop_front(); + int last = iargs.front(); + iargs.pop_front(); + if (first <= client->get_nodeid() && + last > client->get_nodeid()) { + run_only = client->get_nodeid(); + dout(2) << "onlyrange [" << first << ", " << last << ") includes me" << dendl; + } else + run_only = client->get_nodeid().v+1; // not me + } + break; + case SYNCLIENT_MODE_EXCLUDE: + { + exclude = iargs.front(); + iargs.pop_front(); + if (exclude == client->get_nodeid()) { + run_only = client->get_nodeid().v + 1; + dout(2) << "not running " << exclude << dendl; + } else + run_only = -1; + } + break; + + // HOW LONG? + + case SYNCLIENT_MODE_UNTIL: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + if (iarg1) { + dout(2) << "until " << iarg1 << dendl; + utime_t dur(iarg1,0); + run_until = run_start + dur; + } else { + dout(2) << "until " << iarg1 << " (no limit)" << dendl; + run_until = utime_t(0,0); + } + } + } + break; + + + // ... + + case SYNCLIENT_MODE_FOO: + if (run_me()) { + foo(); + } + did_run_me(); + break; + + case SYNCLIENT_MODE_RANDOMSLEEP: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + srand(time(0) + getpid() + client->whoami.v); + sleep(rand() % iarg1); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_SLEEP: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + dout(2) << "sleep " << iarg1 << dendl; + sleep(iarg1); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_SLEEPUNTIL: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (iarg1 && run_me()) { + dout(2) << "sleepuntil " << iarg1 << dendl; + utime_t at = ceph_clock_now() - run_start; + if (at.sec() < iarg1) + sleep(iarg1 - at.sec()); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_RANDOMWALK: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + dout(2) << "randomwalk " << iarg1 << dendl; + random_walk(iarg1); + } + did_run_me(); + } + break; + + + case SYNCLIENT_MODE_DROPCACHE: + { + client->sync_fs(); + client->drop_caches(); + } + break; + + case SYNCLIENT_MODE_DUMP: + { + string sarg1 = get_sarg(0); + if (run_me()) { + dout(2) << "placement dump " << sarg1 << dendl; + dump_placement(sarg1); + } + did_run_me(); + } + break; + + + case SYNCLIENT_MODE_MAKEDIRMESS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makedirmess " << sarg1 << " " << iarg1 << dendl; + make_dir_mess(sarg1.c_str(), iarg1); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_MAKEDIRS: + { + string sarg1 = get_sarg(seq++); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; + make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_STATDIRS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; + stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_READDIRS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; + read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); + } + did_run_me(); + } + break; + + + case SYNCLIENT_MODE_THRASHLINKS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + int iarg4 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; + thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_LINKTEST: + { + if (run_me()) { + link_test(); + } + did_run_me(); + } + break; + + + case SYNCLIENT_MODE_MAKEFILES: + { + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + int priv = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makefiles " << num << " " << count << " " << priv << dendl; + make_files(num, count, priv, false); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_MAKEFILES2: + { + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + int priv = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makefiles2 " << num << " " << count << " " << priv << dendl; + make_files(num, count, priv, true); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_CREATESHARED: + { + string sarg1 = get_sarg(0); + int num = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "createshared " << num << dendl; + create_shared(num); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_OPENSHARED: + { + string sarg1 = get_sarg(0); + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "openshared " << num << dendl; + open_shared(num, count); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_CREATEOBJECTS: + { + int count = iargs.front(); iargs.pop_front(); + int size = iargs.front(); iargs.pop_front(); + int inflight = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "createobjects " << count << " of " << size << " bytes" + << ", " << inflight << " in flight" << dendl; + create_objects(count, size, inflight); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_OBJECTRW: + { + int count = iargs.front(); iargs.pop_front(); + int size = iargs.front(); iargs.pop_front(); + int wrpc = iargs.front(); iargs.pop_front(); + int overlap = iargs.front(); iargs.pop_front(); + int rskew = iargs.front(); iargs.pop_front(); + int wskew = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "objectrw " << count << " " << size << " " << wrpc + << " " << overlap << " " << rskew << " " << wskew << dendl; + object_rw(count, size, wrpc, overlap, rskew, wskew); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_FULLWALK: + { + string sarg1;// = get_sarg(0); + if (run_me()) { + dout(2) << "fullwalk" << sarg1 << dendl; + full_walk(sarg1); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_REPEATWALK: + { + string sarg1 = get_sarg(0); + if (run_me()) { + dout(2) << "repeatwalk " << sarg1 << dendl; + while (full_walk(sarg1) == 0) ; + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_RMFILE: + { + string sarg1 = get_sarg(0); + if (run_me()) { + rm_file(sarg1); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_WRITEFILE: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + dout(1) << "WRITING SYN CLIENT" << dendl; + if (run_me()) { + write_file(sarg1, iarg1, iarg2); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_CHUNK: + if (run_me()) { + string sarg1 = get_sarg(0); + chunk_file(sarg1); + } + did_run_me(); + break; + + + case SYNCLIENT_MODE_OVERLOAD_OSD_0: + { + dout(1) << "OVERLOADING OSD 0" << dendl; + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + overload_osd_0(iarg1, iarg2, iarg3); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_WRSHARED: + { + string sarg1 = "shared"; + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + if (run_me()) { + write_file(sarg1, iarg1, iarg2); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_READSHARED: + { + string sarg1 = "shared"; + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + if (run_me()) { + read_file(sarg1, iarg1, iarg2, true); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_WRITEBATCH: + { + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + + if (run_me()) { + write_batch(iarg1, iarg2, iarg3); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_READFILE: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + + dout(1) << "READING SYN CLIENT" << dendl; + if (run_me()) { + read_file(sarg1, iarg1, iarg2); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_RDWRRANDOM: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + + dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; + if (run_me()) { + read_random(sarg1, iarg1, iarg2); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_RDWRRANDOM_EX: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + + dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; + if (run_me()) { + read_random_ex(sarg1, iarg1, iarg2); + } + did_run_me(); + } + break; + case SYNCLIENT_MODE_TRACE: + { + string tfile = get_sarg(0); + sargs.push_front(string("~")); + int iarg1 = iargs.front(); iargs.pop_front(); + int playdata = iargs.front(); iargs.pop_front(); + string prefix = get_sarg(0); + char realtfile[100]; + snprintf(realtfile, sizeof(realtfile), tfile.c_str(), (int)client->get_nodeid().v); + + if (run_me()) { + dout(0) << "trace " << tfile << " prefix=" << prefix << " count=" << iarg1 << " data=" << playdata << dendl; + + Trace t(realtfile); + + if (iarg1 == 0) iarg1 = 1; // play trace at least once! + + for (int i=0; i<iarg1; i++) { + utime_t start = ceph_clock_now(); + + if (time_to_stop()) break; + play_trace(t, prefix, !playdata); + if (time_to_stop()) break; + if (iarg1 > 1) clean_dir(prefix); // clean only if repeat + + utime_t lat = ceph_clock_now(); + lat -= start; + + dout(0) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << dendl; + if (client->logger + && i > 0 + && i < iarg1-1 + ) { + //client->logger->finc("trsum", (double)lat); + //client->logger->inc("trnum"); + } + } + dout(1) << "done " << dendl; + } + did_run_me(); + } + break; + + + case SYNCLIENT_MODE_OPENTEST: + { + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + for (int i=0; i<count; i++) { + int fd = client->open("test", (rand()%2) ? + (O_WRONLY|O_CREAT) : O_RDONLY, + perms); + if (fd > 0) client->close(fd); + } + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_OPTEST: + { + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + client->mknod("test", 0777, perms); + struct stat st; + for (int i=0; i<count; i++) { + client->lstat("test", &st, perms); + client->chmod("test", 0777, perms); + } + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_TRUNCATE: + { + string file = get_sarg(0); + sargs.push_front(file); + int iarg1 = iargs.front(); iargs.pop_front(); + if (run_me()) { + client->truncate(file.c_str(), iarg1, perms); + } + did_run_me(); + } + break; + + + case SYNCLIENT_MODE_IMPORTFIND: + { + string base = get_sarg(0); + string find = get_sarg(0); + int data = get_iarg(); + if (run_me()) { + import_find(base.c_str(), find.c_str(), data); + } + did_run_me(); + } + break; + + case SYNCLIENT_MODE_LOOKUPHASH: + { + inodeno_t ino; + string iname = get_sarg(0); + sscanf(iname.c_str(), "%llx", (long long unsigned*)&ino.val); + inodeno_t dirino; + string diname = get_sarg(0); + sscanf(diname.c_str(), "%llx", (long long unsigned*)&dirino.val); + string name = get_sarg(0); + if (run_me()) { + lookup_hash(ino, dirino, name.c_str(), perms); + } + } + break; + case SYNCLIENT_MODE_LOOKUPINO: + { + inodeno_t ino; + string iname = get_sarg(0); + sscanf(iname.c_str(), "%llx", (long long unsigned*)&ino.val); + if (run_me()) { + lookup_ino(ino, perms); + } + } + break; + + case SYNCLIENT_MODE_MKSNAP: + { + string base = get_sarg(0); + string name = get_sarg(0); + if (run_me()) + mksnap(base.c_str(), name.c_str(), perms); + did_run_me(); + } + break; + case SYNCLIENT_MODE_RMSNAP: + { + string base = get_sarg(0); + string name = get_sarg(0); + if (run_me()) + rmsnap(base.c_str(), name.c_str(), perms); + did_run_me(); + } + break; + case SYNCLIENT_MODE_MKSNAPFILE: + { + string base = get_sarg(0); + if (run_me()) + mksnapfile(base.c_str()); + did_run_me(); + } + break; + + default: + ceph_abort(); + } + } + dout(1) << "syn done, unmounting " << dendl; + + client->unmount(); + client->shutdown(); + return 0; +} + + +int SyntheticClient::start_thread() +{ + ceph_assert(!thread_id); + + pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); + ceph_assert(thread_id); + ceph_pthread_setname(thread_id, "client"); + return 0; +} + +int SyntheticClient::join_thread() +{ + ceph_assert(thread_id); + void *rv; + pthread_join(thread_id, &rv); + return 0; +} + + +bool roll_die(float p) +{ + float r = (float)(rand() % 100000) / 100000.0; + if (r < p) + return true; + else + return false; +} + +void SyntheticClient::init_op_dist() +{ + op_dist.clear(); +#if 0 + op_dist.add( CEPH_MDS_OP_STAT, 610 ); + op_dist.add( CEPH_MDS_OP_UTIME, 0 ); + op_dist.add( CEPH_MDS_OP_CHMOD, 1 ); + op_dist.add( CEPH_MDS_OP_CHOWN, 1 ); +#endif + + op_dist.add( CEPH_MDS_OP_READDIR, 2 ); + op_dist.add( CEPH_MDS_OP_MKNOD, 30 ); + op_dist.add( CEPH_MDS_OP_LINK, 0 ); + op_dist.add( CEPH_MDS_OP_UNLINK, 20 ); + op_dist.add( CEPH_MDS_OP_RENAME, 40 ); + + op_dist.add( CEPH_MDS_OP_MKDIR, 10 ); + op_dist.add( CEPH_MDS_OP_RMDIR, 20 ); + op_dist.add( CEPH_MDS_OP_SYMLINK, 20 ); + + op_dist.add( CEPH_MDS_OP_OPEN, 200 ); + //op_dist.add( CEPH_MDS_OP_READ, 0 ); + //op_dist.add( CEPH_MDS_OP_WRITE, 0 ); + //op_dist.add( CEPH_MDS_OP_TRUNCATE, 0 ); + //op_dist.add( CEPH_MDS_OP_FSYNC, 0 ); + //op_dist.add( CEPH_MDS_OP_RELEASE, 200 ); + op_dist.normalize(); +} + +void SyntheticClient::up() +{ + cwd = cwd.prefixpath(cwd.depth()-1); + dout(DBL) << "cd .. -> " << cwd << dendl; + clear_dir(); +} + +int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) +{ + dout(4) << "play trace prefix '" << prefix << "'" << dendl; + UserPerm perms = client->pick_my_perms(); + t.start(); + + string buf; + string buf2; + + utime_t start = ceph_clock_now(); + + ceph::unordered_map<int64_t, int64_t> open_files; + ceph::unordered_map<int64_t, dir_result_t*> open_dirs; + + ceph::unordered_map<int64_t, Fh*> ll_files; + ceph::unordered_map<int64_t, dir_result_t*> ll_dirs; + ceph::unordered_map<uint64_t, int64_t> ll_inos; + + Inode *i1, *i2; + + ll_inos[1] = 1; // root inode is known. + + // prefix? + const char *p = prefix.c_str(); + if (prefix.length()) { + client->mkdir(prefix.c_str(), 0755, perms); + struct ceph_statx stx; + i1 = client->ll_get_inode(vinodeno_t(1, CEPH_NOSNAP)); + if (client->ll_lookupx(i1, prefix.c_str(), &i2, &stx, CEPH_STATX_INO, 0, perms) == 0) { + ll_inos[1] = stx.stx_ino; + dout(5) << "'root' ino is " << inodeno_t(stx.stx_ino) << dendl; + client->ll_put(i1); + } else { + dout(0) << "warning: play_trace couldn't lookup up my per-client directory" << dendl; + } + } else + (void) client->ll_get_inode(vinodeno_t(1, CEPH_NOSNAP)); + + utime_t last_status = start; + + int n = 0; + + // for object traces + ceph::mutex lock = ceph::make_mutex("synclient foo"); + ceph::condition_variable cond; + bool ack; + + while (!t.end()) { + + if (++n == 100) { + n = 00; + utime_t now = last_status; + if (now - last_status > 1.0) { + last_status = now; + dout(1) << "play_trace at line " << t.get_line() << dendl; + } + } + + if (time_to_stop()) break; + + // op + const char *op = t.get_string(buf, 0); + dout(4) << (t.get_line()-1) << ": trace op " << op << dendl; + + if (op[0] == '@') { + // timestamp... ignore it! + t.get_int(); // sec + t.get_int(); // usec + op = t.get_string(buf, 0); + } + + // high level ops --------------------- + UserPerm perms = client->pick_my_perms(); + if (strcmp(op, "link") == 0) { + const char *a = t.get_string(buf, p); + const char *b = t.get_string(buf2, p); + client->link(a, b, perms); + } else if (strcmp(op, "unlink") == 0) { + const char *a = t.get_string(buf, p); + client->unlink(a, perms); + } else if (strcmp(op, "rename") == 0) { + const char *a = t.get_string(buf, p); + const char *b = t.get_string(buf2, p); + client->rename(a,b, perms); + } else if (strcmp(op, "mkdir") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + client->mkdir(a, b, perms); + } else if (strcmp(op, "rmdir") == 0) { + const char *a = t.get_string(buf, p); + client->rmdir(a, perms); + } else if (strcmp(op, "symlink") == 0) { + const char *a = t.get_string(buf, p); + const char *b = t.get_string(buf2, p); + client->symlink(a, b, perms); + } else if (strcmp(op, "readlink") == 0) { + const char *a = t.get_string(buf, p); + char buf[100]; + client->readlink(a, buf, 100, perms); + } else if (strcmp(op, "lstat") == 0) { + struct stat st; + const char *a = t.get_string(buf, p); + if (strcmp(a, p) != 0 && + strcmp(a, "/") != 0 && + strcmp(a, "/lib") != 0 && // or /lib.. that would be a lookup. hack. + a[0] != 0) // stop stating the root directory already + client->lstat(a, &st, perms); + } else if (strcmp(op, "chmod") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + client->chmod(a, b, perms); + } else if (strcmp(op, "chown") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + int64_t c = t.get_int(); + client->chown(a, b, c, perms); + } else if (strcmp(op, "utime") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + int64_t c = t.get_int(); + struct utimbuf u; + u.actime = b; + u.modtime = c; + client->utime(a, &u, perms); + } else if (strcmp(op, "mknod") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + int64_t c = t.get_int(); + client->mknod(a, b, perms, c); + } else if (strcmp(op, "oldmknod") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + client->mknod(a, b, perms, 0); + } else if (strcmp(op, "getdir") == 0) { + const char *a = t.get_string(buf, p); + list<string> contents; + int r = client->getdir(a, contents, perms); + if (r < 0) { + dout(1) << "getdir on " << a << " returns " << r << dendl; + } + } else if (strcmp(op, "opendir") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + dir_result_t *dirp; + client->opendir(a, &dirp, perms); + if (dirp) open_dirs[b] = dirp; + } else if (strcmp(op, "closedir") == 0) { + int64_t a = t.get_int(); + client->closedir(open_dirs[a]); + open_dirs.erase(a); + } else if (strcmp(op, "open") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + int64_t c = t.get_int(); + int64_t d = t.get_int(); + int64_t fd = client->open(a, b, perms, c); + if (fd > 0) open_files[d] = fd; + } else if (strcmp(op, "oldopen") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + int64_t d = t.get_int(); + int64_t fd = client->open(a, b, perms, 0755); + if (fd > 0) open_files[d] = fd; + } else if (strcmp(op, "close") == 0) { + int64_t id = t.get_int(); + int64_t fh = open_files[id]; + if (fh > 0) client->close(fh); + open_files.erase(id); + } else if (strcmp(op, "lseek") == 0) { + int64_t f = t.get_int(); + int fd = open_files[f]; + int64_t off = t.get_int(); + int64_t whence = t.get_int(); + client->lseek(fd, off, whence); + } else if (strcmp(op, "read") == 0) { + int64_t f = t.get_int(); + int64_t size = t.get_int(); + int64_t off = t.get_int(); + int64_t fd = open_files[f]; + if (!metadata_only) { + char *b = new char[size]; + client->read(fd, b, size, off); + delete[] b; + } + } else if (strcmp(op, "write") == 0) { + int64_t f = t.get_int(); + int64_t fd = open_files[f]; + int64_t size = t.get_int(); + int64_t off = t.get_int(); + if (!metadata_only) { + char *b = new char[size]; + memset(b, 1, size); // let's write 1's! + client->write(fd, b, size, off); + delete[] b; + } else { + client->write(fd, NULL, 0, size+off); + } + } else if (strcmp(op, "truncate") == 0) { + const char *a = t.get_string(buf, p); + int64_t l = t.get_int(); + client->truncate(a, l, perms); + } else if (strcmp(op, "ftruncate") == 0) { + int64_t f = t.get_int(); + int fd = open_files[f]; + int64_t l = t.get_int(); + client->ftruncate(fd, l, perms); + } else if (strcmp(op, "fsync") == 0) { + int64_t f = t.get_int(); + int64_t b = t.get_int(); + int fd = open_files[f]; + client->fsync(fd, b); + } else if (strcmp(op, "chdir") == 0) { + const char *a = t.get_string(buf, p); + // Client users should remember their path, but since this + // is just a synthetic client we ignore it. + std::string ignore; + client->chdir(a, ignore, perms); + } else if (strcmp(op, "statfs") == 0) { + struct statvfs stbuf; + client->statfs("/", &stbuf, perms); + } + + // low level ops --------------------- + else if (strcmp(op, "ll_lookup") == 0) { + int64_t i = t.get_int(); + const char *name = t.get_string(buf, p); + int64_t r = t.get_int(); + struct ceph_statx stx; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_lookupx(i1, name, &i2, &stx, CEPH_STATX_INO, 0, perms) == 0) + ll_inos[r] = stx.stx_ino; + client->ll_put(i1); + } + } else if (strcmp(op, "ll_forget") == 0) { + int64_t i = t.get_int(); + int64_t n = t.get_int(); + if (ll_inos.count(i) && + client->ll_forget( + client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)), n)) + ll_inos.erase(i); + } else if (strcmp(op, "ll_getattr") == 0) { + int64_t i = t.get_int(); + struct stat attr; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + client->ll_getattr(i1, &attr, perms); + client->ll_put(i1); + } + } else if (strcmp(op, "ll_setattr") == 0) { + int64_t i = t.get_int(); + struct stat attr; + memset(&attr, 0, sizeof(attr)); + attr.st_mode = t.get_int(); + attr.st_uid = t.get_int(); + attr.st_gid = t.get_int(); + attr.st_size = t.get_int(); + attr.st_mtime = t.get_int(); + attr.st_atime = t.get_int(); + int mask = t.get_int(); + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + client->ll_setattr(i1, &attr, mask, perms); + client->ll_put(i1); + } + } else if (strcmp(op, "ll_readlink") == 0) { + int64_t i = t.get_int(); + if (ll_inos.count(i)) { + char buf[PATH_MAX]; + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + client->ll_readlink(i1, buf, sizeof(buf), perms); + client->ll_put(i1); + } + } else if (strcmp(op, "ll_mknod") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int m = t.get_int(); + int r = t.get_int(); + int64_t ri = t.get_int(); + struct stat attr; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_mknod(i1, n, m, r, &attr, &i2, perms) == 0) + ll_inos[ri] = attr.st_ino; + client->ll_put(i1); + } + } else if (strcmp(op, "ll_mkdir") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int m = t.get_int(); + int64_t ri = t.get_int(); + struct stat attr; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_mkdir(i1, n, m, &attr, &i2, perms) == 0) + ll_inos[ri] = attr.st_ino; + client->ll_put(i1); + } + } else if (strcmp(op, "ll_symlink") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + const char *v = t.get_string(buf2, p); + int64_t ri = t.get_int(); + struct stat attr; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_symlink(i1, n, v, &attr, &i2, perms) == 0) + ll_inos[ri] = attr.st_ino; + client->ll_put(i1); + } + } else if (strcmp(op, "ll_unlink") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + client->ll_unlink(i1, n, perms); + client->ll_put(i1); + } + } else if (strcmp(op, "ll_rmdir") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + client->ll_rmdir(i1, n, perms); + client->ll_put(i1); + } + } else if (strcmp(op, "ll_rename") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int64_t ni = t.get_int(); + const char *nn = t.get_string(buf2, p); + if (ll_inos.count(i) && + ll_inos.count(ni)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + i2 = client->ll_get_inode(vinodeno_t(ll_inos[ni],CEPH_NOSNAP)); + client->ll_rename(i1, n, i2, nn, perms); + client->ll_put(i1); + client->ll_put(i2); + } + } else if (strcmp(op, "ll_link") == 0) { + int64_t i = t.get_int(); + int64_t ni = t.get_int(); + const char *nn = t.get_string(buf, p); + if (ll_inos.count(i) && + ll_inos.count(ni)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + i2 = client->ll_get_inode(vinodeno_t(ll_inos[ni],CEPH_NOSNAP)); + client->ll_link(i1, i2, nn, perms); + client->ll_put(i1); + client->ll_put(i2); + } + } else if (strcmp(op, "ll_opendir") == 0) { + int64_t i = t.get_int(); + int64_t r = t.get_int(); + dir_result_t *dirp; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_opendir(i1, O_RDONLY, &dirp, perms) == 0) + ll_dirs[r] = dirp; + client->ll_put(i1); + } + } else if (strcmp(op, "ll_releasedir") == 0) { + int64_t f = t.get_int(); + if (ll_dirs.count(f)) { + client->ll_releasedir(ll_dirs[f]); + ll_dirs.erase(f); + } + } else if (strcmp(op, "ll_open") == 0) { + int64_t i = t.get_int(); + int64_t f = t.get_int(); + int64_t r = t.get_int(); + Fh *fhp; + if (ll_inos.count(i)) { + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_open(i1, f, &fhp, perms) == 0) + ll_files[r] = fhp; + client->ll_put(i1); + } + } else if (strcmp(op, "ll_create") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int64_t m = t.get_int(); + int64_t f = t.get_int(); + int64_t r = t.get_int(); + int64_t ri = t.get_int(); + struct stat attr; + if (ll_inos.count(i)) { + Fh *fhp; + i1 = client->ll_get_inode(vinodeno_t(ll_inos[i],CEPH_NOSNAP)); + if (client->ll_create(i1, n, m, f, &attr, NULL, &fhp, perms) == 0) { + ll_inos[ri] = attr.st_ino; + ll_files[r] = fhp; + } + client->ll_put(i1); + } + } else if (strcmp(op, "ll_read") == 0) { + int64_t f = t.get_int(); + int64_t off = t.get_int(); + int64_t size = t.get_int(); + if (ll_files.count(f) && + !metadata_only) { + bufferlist bl; + client->ll_read(ll_files[f], off, size, &bl); + } + } else if (strcmp(op, "ll_write") == 0) { + int64_t f = t.get_int(); + int64_t off = t.get_int(); + int64_t size = t.get_int(); + if (ll_files.count(f)) { + if (!metadata_only) { + bufferlist bl; + bufferptr bp(size); + bl.push_back(bp); + bp.zero(); + client->ll_write(ll_files[f], off, size, bl.c_str()); + } else { + client->ll_write(ll_files[f], off+size, 0, NULL); + } + } + } else if (strcmp(op, "ll_flush") == 0) { + int64_t f = t.get_int(); + if (!metadata_only && + ll_files.count(f)) + client->ll_flush(ll_files[f]); + } else if (strcmp(op, "ll_fsync") == 0) { + int64_t f = t.get_int(); + if (!metadata_only && + ll_files.count(f)) + client->ll_fsync(ll_files[f], false); // FIXME dataonly param + } else if (strcmp(op, "ll_release") == 0) { + int64_t f = t.get_int(); + if (ll_files.count(f)) { + client->ll_release(ll_files[f]); + ll_files.erase(f); + } + } else if (strcmp(op, "ll_statfs") == 0) { + int64_t i = t.get_int(); + if (ll_inos.count(i)) + {} //client->ll_statfs(vinodeno_t(ll_inos[i],CEPH_NOSNAP), perms); + } + + + // object-level traces + + else if (strcmp(op, "o_stat") == 0) { + int64_t oh = t.get_int(); + int64_t ol = t.get_int(); + object_t oid = file_object_t(oh, ol); + std::unique_lock locker{lock}; + object_locator_t oloc(SYNCLIENT_FIRST_POOL); + uint64_t size; + ceph::real_time mtime; + client->objecter->stat(oid, oloc, CEPH_NOSNAP, &size, &mtime, 0, + new C_SafeCond(lock, cond, &ack)); + cond.wait(locker, [&ack] { return ack; }); + } + else if (strcmp(op, "o_read") == 0) { + int64_t oh = t.get_int(); + int64_t ol = t.get_int(); + int64_t off = t.get_int(); + int64_t len = t.get_int(); + object_t oid = file_object_t(oh, ol); + object_locator_t oloc(SYNCLIENT_FIRST_POOL); + std::unique_lock locker{lock}; + bufferlist bl; + client->objecter->read(oid, oloc, off, len, CEPH_NOSNAP, &bl, 0, + new C_SafeCond(lock, cond, &ack)); + cond.wait(locker, [&ack] { return ack; }); + } + else if (strcmp(op, "o_write") == 0) { + int64_t oh = t.get_int(); + int64_t ol = t.get_int(); + int64_t off = t.get_int(); + int64_t len = t.get_int(); + object_t oid = file_object_t(oh, ol); + object_locator_t oloc(SYNCLIENT_FIRST_POOL); + std::unique_lock locker{lock}; + bufferptr bp(len); + bufferlist bl; + bl.push_back(bp); + SnapContext snapc; + client->objecter->write(oid, oloc, off, len, snapc, bl, + ceph::real_clock::now(), 0, + new C_SafeCond(lock, cond, &ack)); + cond.wait(locker, [&ack] { return ack; }); + } + else if (strcmp(op, "o_zero") == 0) { + int64_t oh = t.get_int(); + int64_t ol = t.get_int(); + int64_t off = t.get_int(); + int64_t len = t.get_int(); + object_t oid = file_object_t(oh, ol); + object_locator_t oloc(SYNCLIENT_FIRST_POOL); + std::unique_lock locker{lock}; + SnapContext snapc; + client->objecter->zero(oid, oloc, off, len, snapc, + ceph::real_clock::now(), 0, + new C_SafeCond(lock, cond, &ack)); + cond.wait(locker, [&ack] { return ack; }); + } + + + else { + dout(0) << (t.get_line()-1) << ": *** trace hit unrecognized symbol '" << op << "' " << dendl; + ceph_abort(); + } + } + + dout(10) << "trace finished on line " << t.get_line() << dendl; + + // close open files + for (ceph::unordered_map<int64_t, int64_t>::iterator fi = open_files.begin(); + fi != open_files.end(); + ++fi) { + dout(1) << "leftover close " << fi->second << dendl; + if (fi->second > 0) client->close(fi->second); + } + for (ceph::unordered_map<int64_t, dir_result_t*>::iterator fi = open_dirs.begin(); + fi != open_dirs.end(); + ++fi) { + dout(1) << "leftover closedir " << fi->second << dendl; + if (fi->second != 0) client->closedir(fi->second); + } + for (ceph::unordered_map<int64_t,Fh*>::iterator fi = ll_files.begin(); + fi != ll_files.end(); + ++fi) { + dout(1) << "leftover ll_release " << fi->second << dendl; + if (fi->second) client->ll_release(fi->second); + } + for (ceph::unordered_map<int64_t,dir_result_t*>::iterator fi = ll_dirs.begin(); + fi != ll_dirs.end(); + ++fi) { + dout(1) << "leftover ll_releasedir " << fi->second << dendl; + if (fi->second) client->ll_releasedir(fi->second); + } + + return 0; +} + + + +int SyntheticClient::clean_dir(string& basedir) +{ + // read dir + list<string> contents; + UserPerm perms = client->pick_my_perms(); + int r = client->getdir(basedir.c_str(), contents, perms); + if (r < 0) { + dout(1) << "getdir on " << basedir << " returns " << r << dendl; + return r; + } + + for (list<string>::iterator it = contents.begin(); + it != contents.end(); + ++it) { + if (*it == ".") continue; + if (*it == "..") continue; + string file = basedir + "/" + *it; + + if (time_to_stop()) break; + + struct stat st; + int r = client->lstat(file.c_str(), &st, perms); + if (r < 0) { + dout(1) << "stat error on " << file << " r=" << r << dendl; + continue; + } + + if ((st.st_mode & S_IFMT) == S_IFDIR) { + clean_dir(file); + client->rmdir(file.c_str(), perms); + } else { + client->unlink(file.c_str(), perms); + } + } + + return 0; + +} + + +int SyntheticClient::full_walk(string& basedir) +{ + if (time_to_stop()) return -1; + + list<string> dirq; + list<frag_info_t> statq; + dirq.push_back(basedir); + frag_info_t empty; + statq.push_back(empty); + + ceph::unordered_map<inodeno_t, int> nlink; + ceph::unordered_map<inodeno_t, int> nlink_seen; + + UserPerm perms = client->pick_my_perms(); + while (!dirq.empty()) { + string dir = dirq.front(); + frag_info_t expect = statq.front(); + dirq.pop_front(); + statq.pop_front(); + + frag_info_t actual = empty; + + // read dir + list<string> contents; + int r = client->getdir(dir.c_str(), contents, perms); + if (r < 0) { + dout(1) << "getdir on " << dir << " returns " << r << dendl; + continue; + } + + for (list<string>::iterator it = contents.begin(); + it != contents.end(); + ++it) { + if (*it == "." || + *it == "..") + continue; + string file = dir + "/" + *it; + + struct stat st; + frag_info_t dirstat; + int r = client->lstat(file.c_str(), &st, perms, &dirstat); + if (r < 0) { + dout(1) << "stat error on " << file << " r=" << r << dendl; + continue; + } + + nlink_seen[st.st_ino]++; + nlink[st.st_ino] = st.st_nlink; + + if (S_ISDIR(st.st_mode)) + actual.nsubdirs++; + else + actual.nfiles++; + + // print + char *tm = ctime(&st.st_mtime); + tm[strlen(tm)-1] = 0; + printf("%llx %c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8llu %12s %s\n", + (long long)st.st_ino, + S_ISDIR(st.st_mode) ? 'd':'-', + (st.st_mode & 0400) ? 'r':'-', + (st.st_mode & 0200) ? 'w':'-', + (st.st_mode & 0100) ? 'x':'-', + (st.st_mode & 040) ? 'r':'-', + (st.st_mode & 020) ? 'w':'-', + (st.st_mode & 010) ? 'x':'-', + (st.st_mode & 04) ? 'r':'-', + (st.st_mode & 02) ? 'w':'-', + (st.st_mode & 01) ? 'x':'-', + (int)st.st_nlink, + (int)st.st_uid, (int)st.st_gid, + (long long unsigned)st.st_size, + tm, + file.c_str()); + + + if ((st.st_mode & S_IFMT) == S_IFDIR) { + dirq.push_back(file); + statq.push_back(dirstat); + } + } + + if (dir != "" && + (actual.nsubdirs != expect.nsubdirs || + actual.nfiles != expect.nfiles)) { + dout(0) << dir << ": expected " << expect << dendl; + dout(0) << dir << ": got " << actual << dendl; + } + } + + for (ceph::unordered_map<inodeno_t,int>::iterator p = nlink.begin(); p != nlink.end(); ++p) { + if (nlink_seen[p->first] != p->second) + dout(0) << p->first << " nlink " << p->second << " != " << nlink_seen[p->first] << "seen" << dendl; + } + + return 0; +} + + + +int SyntheticClient::dump_placement(string& fn) { + + UserPerm perms = client->pick_my_perms(); + + // open file + int fd = client->open(fn.c_str(), O_RDONLY, perms); + dout(5) << "reading from " << fn << " fd " << fd << dendl; + if (fd < 0) return fd; + + + // How big is it? + struct stat stbuf; + int lstat_result = client->lstat(fn.c_str(), &stbuf, perms); + if (lstat_result < 0) { + dout(0) << "lstat error for file " << fn << dendl; + client->close(fd); + return lstat_result; + } + + off_t filesize = stbuf.st_size; + + // grab the placement info + vector<ObjectExtent> extents; + off_t offset = 0; + client->enumerate_layout(fd, extents, filesize, offset); + client->close(fd); + + + // run through all the object extents + dout(0) << "file size is " << filesize << dendl; + dout(0) << "(osd, start, length) tuples for file " << fn << dendl; + for (const auto& x : extents) { + int osd = client->objecter->with_osdmap([&](const OSDMap& o) { + return o.get_pg_acting_primary(o.object_locator_to_pg(x.oid, x.oloc)); + }); + + // run through all the buffer extents + for (const auto& be : x.buffer_extents) + dout(0) << "OSD " << osd << ", offset " << be.first + << ", length " << be.second << dendl; + } + return 0; +} + + + +int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) +{ + if (time_to_stop()) return 0; + + UserPerm perms = client->pick_my_perms(); + // make sure base dir exists + int r = client->mkdir(basedir, 0755, perms); + if (r != 0) { + dout(1) << "can't make base dir? " << basedir << dendl; + //return -1; + } + + // children + char d[500]; + dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; + for (int i=0; i<files; i++) { + snprintf(d, sizeof(d), "%s/file.%d", basedir, i); + client->mknod(d, 0644, perms); + } + + if (depth == 0) return 0; + + for (int i=0; i<dirs; i++) { + snprintf(d, sizeof(d), "%s/dir.%d", basedir, i); + make_dirs(d, dirs, files, depth-1); + } + + return 0; +} + +int SyntheticClient::stat_dirs(const char *basedir, int dirs, int files, int depth) +{ + if (time_to_stop()) return 0; + + UserPerm perms = client->pick_my_perms(); + + // make sure base dir exists + struct stat st; + int r = client->lstat(basedir, &st, perms); + if (r != 0) { + dout(1) << "can't make base dir? " << basedir << dendl; + return -1; + } + + // children + char d[500]; + dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; + for (int i=0; i<files; i++) { + snprintf(d, sizeof(d), "%s/file.%d", basedir, i); + client->lstat(d, &st, perms); + } + + if (depth == 0) return 0; + + for (int i=0; i<dirs; i++) { + snprintf(d, sizeof(d), "%s/dir.%d", basedir, i); + stat_dirs(d, dirs, files, depth-1); + } + + return 0; +} +int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int depth) +{ + if (time_to_stop()) return 0; + + struct stat st; + + // children + char d[500]; + dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; + + list<string> contents; + UserPerm perms = client->pick_my_perms(); + utime_t s = ceph_clock_now(); + int r = client->getdir(basedir, contents, perms); + utime_t e = ceph_clock_now(); + e -= s; + if (r < 0) { + dout(0) << "getdir couldn't readdir " << basedir << ", stopping" << dendl; + return -1; + } + + for (int i=0; i<files; i++) { + snprintf(d, sizeof(d), "%s/file.%d", basedir, i); + utime_t s = ceph_clock_now(); + if (client->lstat(d, &st, perms) < 0) { + dout(2) << "read_dirs failed stat on " << d << ", stopping" << dendl; + return -1; + } + utime_t e = ceph_clock_now(); + e -= s; + } + + if (depth > 0) + for (int i=0; i<dirs; i++) { + snprintf(d, sizeof(d), "%s/dir.%d", basedir, i); + if (read_dirs(d, dirs, files, depth-1) < 0) return -1; + } + + return 0; +} + + +int SyntheticClient::make_files(int num, int count, int priv, bool more) +{ + int whoami = client->get_nodeid().v; + char d[255]; + UserPerm perms = client->pick_my_perms(); + + if (priv) { + for (int c=0; c<count; c++) { + snprintf(d, sizeof(d), "dir.%d.run%d", whoami, c); + client->mkdir(d, 0755, perms); + } + } else { + // shared + if (true || whoami == 0) { + for (int c=0; c<count; c++) { + snprintf(d, sizeof(d), "dir.%d.run%d", 0, c); + client->mkdir(d, 0755, perms); + } + } else { + sleep(2); + } + } + + // files + struct stat st; + utime_t start = ceph_clock_now(); + for (int c=0; c<count; c++) { + for (int n=0; n<num; n++) { + snprintf(d, sizeof(d), "dir.%d.run%d/file.client%d.%d", priv ? whoami:0, c, whoami, n); + + client->mknod(d, 0644, perms); + + if (more) { + client->lstat(d, &st, perms); + int fd = client->open(d, O_RDONLY, perms); + client->unlink(d, perms); + client->close(fd); + } + + if (time_to_stop()) return 0; + } + } + utime_t end = ceph_clock_now(); + end -= start; + dout(0) << "makefiles time is " << end << " or " << ((double)end / (double)num) <<" per file" << dendl; + + return 0; +} + +int SyntheticClient::link_test() +{ + char d[255]; + char e[255]; + + UserPerm perms = client->pick_my_perms(); + + // create files + int num = 200; + + client->mkdir("orig", 0755, perms); + client->mkdir("copy", 0755, perms); + + utime_t start = ceph_clock_now(); + for (int i=0; i<num; i++) { + snprintf(d, sizeof(d), "orig/file.%d", i); + client->mknod(d, 0755, perms); + } + utime_t end = ceph_clock_now(); + end -= start; + + dout(0) << "orig " << end << dendl; + + // link + start = ceph_clock_now(); + for (int i=0; i<num; i++) { + snprintf(d, sizeof(d), "orig/file.%d", i); + snprintf(e, sizeof(e), "copy/file.%d", i); + client->link(d, e, perms); + } + end = ceph_clock_now(); + end -= start; + dout(0) << "copy " << end << dendl; + + return 0; +} + + +int SyntheticClient::create_shared(int num) +{ + // files + UserPerm perms = client->pick_my_perms(); + char d[255]; + client->mkdir("test", 0755, perms); + for (int n=0; n<num; n++) { + snprintf(d, sizeof(d), "test/file.%d", n); + client->mknod(d, 0644, perms); + } + + return 0; +} + +int SyntheticClient::open_shared(int num, int count) +{ + // files + char d[255]; + UserPerm perms = client->pick_my_perms(); + for (int c=0; c<count; c++) { + // open + list<int> fds; + for (int n=0; n<num; n++) { + snprintf(d, sizeof(d), "test/file.%d", n); + int fd = client->open(d, O_RDONLY, perms); + if (fd > 0) fds.push_back(fd); + } + + if (false && client->get_nodeid() == 0) + for (int n=0; n<num; n++) { + snprintf(d, sizeof(d), "test/file.%d", n); + client->unlink(d, perms); + } + + while (!fds.empty()) { + int fd = fds.front(); + fds.pop_front(); + client->close(fd); + } + } + + return 0; +} + + +// Hits OSD 0 with writes to various files with OSD 0 as the primary. +int SyntheticClient::overload_osd_0(int n, int size, int wrsize) { + UserPerm perms = client->pick_my_perms(); + // collect a bunch of files starting on OSD 0 + int left = n; + int tried = 0; + while (left < 0) { + + + // pull open a file + dout(0) << "in OSD overload" << dendl; + string filename = get_sarg(tried); + dout(1) << "OSD Overload workload: trying file " << filename << dendl; + int fd = client->open(filename.c_str(), O_RDWR|O_CREAT, perms); + ++tried; + + // only use the file if its first primary is OSD 0 + int primary_osd = check_first_primary(fd); + if (primary_osd != 0) { + client->close(fd); + dout(1) << "OSD Overload workload: SKIPPING file " << filename << + " with OSD " << primary_osd << " as first primary. " << dendl; + continue; + } + dout(1) << "OSD Overload workload: USING file " << filename << + " with OSD 0 as first primary. " << dendl; + + + --left; + // do whatever operation we want to do on the file. How about a write? + write_fd(fd, size, wrsize); + } + return 0; +} + + +// See what the primary is for the first object in this file. +int SyntheticClient::check_first_primary(int fh) +{ + vector<ObjectExtent> extents; + client->enumerate_layout(fh, extents, 1, 0); + return client->objecter->with_osdmap([&](const OSDMap& o) { + return o.get_pg_acting_primary( + o.object_locator_to_pg(extents.begin()->oid, extents.begin()->oloc)); + }); +} + +int SyntheticClient::rm_file(string& fn) +{ + UserPerm perms = client->pick_my_perms(); + return client->unlink(fn.c_str(), perms); +} + +int SyntheticClient::write_file(string& fn, int size, loff_t wrsize) // size is in MB, wrsize in bytes +{ + //uint64_t wrsize = 1024*256; + char *buf = new char[wrsize+100]; // 1 MB + memset(buf, 7, wrsize); + int64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; + UserPerm perms = client->pick_my_perms(); + + int fd = client->open(fn.c_str(), O_RDWR|O_CREAT, perms); + dout(5) << "writing to " << fn << " fd " << fd << dendl; + if (fd < 0) { + delete[] buf; + return fd; + } + + utime_t from = ceph_clock_now(); + utime_t start = from; + uint64_t bytes = 0, total = 0; + + + for (loff_t i=0; i<chunks; i++) { + if (time_to_stop()) { + dout(0) << "stopping" << dendl; + break; + } + dout(2) << "writing block " << i << "/" << chunks << dendl; + + // fill buf with a 16 byte fingerprint + // 64 bits : file offset + // 64 bits : client id + // = 128 bits (16 bytes) + uint64_t *p = (uint64_t*)buf; + while ((char*)p < buf + wrsize) { + *p = (uint64_t)i*(uint64_t)wrsize + (uint64_t)((char*)p - buf); + p++; + *p = client->get_nodeid().v; + p++; + } + + client->write(fd, buf, wrsize, i*wrsize); + bytes += wrsize; + total += wrsize; + + utime_t now = ceph_clock_now(); + if (now - from >= 1.0) { + double el = now - from; + dout(0) << "write " << (bytes / el / 1048576.0) << " MB/sec" << dendl; + from = now; + bytes = 0; + } + } + + client->fsync(fd, true); + + utime_t stop = ceph_clock_now(); + double el = stop - start; + dout(0) << "write total " << (total / el / 1048576.0) << " MB/sec (" + << total << " bytes in " << el << " seconds)" << dendl; + + client->close(fd); + delete[] buf; + + return 0; +} + +int SyntheticClient::write_fd(int fd, int size, int wrsize) // size is in MB, wrsize in bytes +{ + //uint64_t wrsize = 1024*256; + char *buf = new char[wrsize+100]; // 1 MB + memset(buf, 7, wrsize); + uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; + + //dout(5) << "SyntheticClient::write_fd: writing to fd " << fd << dendl; + if (fd < 0) { + delete[] buf; + return fd; + } + + for (unsigned i=0; i<chunks; i++) { + if (time_to_stop()) { + dout(0) << "stopping" << dendl; + break; + } + dout(2) << "writing block " << i << "/" << chunks << dendl; + + // fill buf with a 16 byte fingerprint + // 64 bits : file offset + // 64 bits : client id + // = 128 bits (16 bytes) + uint64_t *p = (uint64_t*)buf; + while ((char*)p < buf + wrsize) { + *p = (uint64_t)i*(uint64_t)wrsize + (uint64_t)((char*)p - buf); + p++; + *p = client->get_nodeid().v; + p++; + } + + client->write(fd, buf, wrsize, i*wrsize); + } + client->close(fd); + delete[] buf; + + return 0; +} + + +int SyntheticClient::write_batch(int nfile, int size, int wrsize) +{ + for (int i=0; i<nfile; i++) { + string sarg1 = get_sarg(i); + dout(0) << "Write file " << sarg1 << dendl; + write_file(sarg1, size, wrsize); + } + return 0; +} + +// size is in MB, wrsize in bytes +int SyntheticClient::read_file(const std::string& fn, int size, + int rdsize, bool ignoreprint) +{ + char *buf = new char[rdsize]; + memset(buf, 1, rdsize); + uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)rdsize; + UserPerm perms = client->pick_my_perms(); + + int fd = client->open(fn.c_str(), O_RDONLY, perms); + dout(5) << "reading from " << fn << " fd " << fd << dendl; + if (fd < 0) { + delete[] buf; + return fd; + } + + utime_t from = ceph_clock_now(); + utime_t start = from; + uint64_t bytes = 0, total = 0; + + for (unsigned i=0; i<chunks; i++) { + if (time_to_stop()) break; + dout(2) << "reading block " << i << "/" << chunks << dendl; + int r = client->read(fd, buf, rdsize, i*rdsize); + if (r < rdsize) { + dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; + break; + } + + bytes += rdsize; + total += rdsize; + + utime_t now = ceph_clock_now(); + if (now - from >= 1.0) { + double el = now - from; + dout(0) << "read " << (bytes / el / 1048576.0) << " MB/sec" << dendl; + from = now; + bytes = 0; + } + + // verify fingerprint + int bad = 0; + uint64_t *p = (uint64_t*)buf; + while ((char*)p + 32 < buf + rdsize) { + uint64_t readoff = *p; + uint64_t wantoff = (uint64_t)i*(uint64_t)rdsize + (uint64_t)((char*)p - buf); + p++; + int64_t readclient = *p; + p++; + if (readoff != wantoff || + readclient != client->get_nodeid()) { + if (!bad && !ignoreprint) + dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient + << ", should be offset " << wantoff << " client " << client->get_nodeid() + << dendl; + bad++; + } + } + if (bad && !ignoreprint) + dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; + } + + utime_t stop = ceph_clock_now(); + double el = stop - start; + dout(0) << "read total " << (total / el / 1048576.0) << " MB/sec (" + << total << " bytes in " << el << " seconds)" << dendl; + + client->close(fd); + delete[] buf; + + return 0; +} + + + + +class C_Ref : public Context { + ceph::mutex& lock; + ceph::condition_variable& cond; + int *ref; +public: + C_Ref(ceph::mutex &l, ceph::condition_variable &c, int *r) + : lock(l), cond(c), ref(r) { + lock_guard locker{lock}; + (*ref)++; + } + void finish(int) override { + lock_guard locker{lock}; + (*ref)--; + cond.notify_all(); + } +}; + +int SyntheticClient::create_objects(int nobj, int osize, int inflight) +{ + // divy up + int numc = num_client ? num_client : 1; + + int start, inc, end; + + if (1) { + // strided + start = client->get_nodeid().v; //nobjs % numc; + inc = numc; + end = start + nobj; + } else { + // segments + start = nobj * client->get_nodeid().v / numc; + inc = 1; + end = nobj * (client->get_nodeid().v+1) / numc; + } + + dout(5) << "create_objects " << nobj << " size=" << osize + << " .. doing [" << start << "," << end << ") inc " << inc + << dendl; + + bufferptr bp(osize); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + + ceph::mutex lock = ceph::make_mutex("create_objects lock"); + ceph::condition_variable cond; + + int unsafe = 0; + + list<utime_t> starts; + + for (int i=start; i<end; i += inc) { + if (time_to_stop()) break; + + object_t oid = file_object_t(999, i); + object_locator_t oloc(SYNCLIENT_FIRST_POOL); + SnapContext snapc; + + if (i % inflight == 0) { + dout(6) << "create_objects " << i << "/" << (nobj+1) << dendl; + } + dout(10) << "writing " << oid << dendl; + + starts.push_back(ceph_clock_now()); + { + std::lock_guard locker{client->client_lock}; + client->objecter->write(oid, oloc, 0, osize, snapc, bl, + ceph::real_clock::now(), 0, + new C_Ref(lock, cond, &unsafe)); + } + { + std::unique_lock locker{lock}; + cond.wait(locker, [&unsafe, inflight, this] { + if (unsafe > inflight) { + dout(20) << "waiting for " << unsafe << " unsafe" << dendl; + } + return unsafe <= inflight; + }); + } + utime_t lat = ceph_clock_now(); + lat -= starts.front(); + starts.pop_front(); + } + { + std::unique_lock locker{lock}; + cond.wait(locker, [&unsafe, this] { + if (unsafe > 0) { + dout(10) << "waiting for " << unsafe << " unsafe" << dendl; + } + return unsafe <= 0; + }); + } + dout(5) << "create_objects done" << dendl; + return 0; +} + +int SyntheticClient::object_rw(int nobj, int osize, int wrpc, + int overlappc, + double rskew, double wskew) +{ + dout(5) << "object_rw " << nobj << " size=" << osize << " with " + << wrpc << "% writes" + << ", " << overlappc << "% overlap" + << ", rskew = " << rskew + << ", wskew = " << wskew + << dendl; + + bufferptr bp(osize); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + + // start with odd number > nobj + rjhash<uint32_t> h; + unsigned prime = nobj + 1; // this is the minimum! + prime += h(nobj) % (3*nobj); // bump it up some + prime |= 1; // make it odd + + while (true) { + unsigned j; + for (j=2; j*j<=prime; j++) + if (prime % j == 0) break; + if (j*j > prime) { + break; + //cout << "prime " << prime << endl; + } + prime += 2; + } + + ceph::mutex lock = ceph::make_mutex("lock"); + ceph::condition_variable cond; + + int unack = 0; + + while (1) { + if (time_to_stop()) break; + + // read or write? + bool write = (rand() % 100) < wrpc; + + // choose object + double r = drand48(); // [0..1) + long o; + if (write) { + o = (long)trunc(pow(r, wskew) * (double)nobj); // exponentially skew towards 0 + int pnoremap = (long)(r * 100.0); + if (pnoremap >= overlappc) + o = (o*prime) % nobj; // remap + } else { + o = (long)trunc(pow(r, rskew) * (double)nobj); // exponentially skew towards 0 + } + object_t oid = file_object_t(999, o); + object_locator_t oloc(SYNCLIENT_FIRST_POOL); + SnapContext snapc; + + client->client_lock.lock(); + utime_t start = ceph_clock_now(); + if (write) { + dout(10) << "write to " << oid << dendl; + + ObjectOperation m; + OSDOp op; + op.op.op = CEPH_OSD_OP_WRITE; + op.op.extent.offset = 0; + op.op.extent.length = osize; + op.indata = bl; + m.ops.push_back(op); + client->objecter->mutate(oid, oloc, m, snapc, + ceph::real_clock::now(), 0, + new C_Ref(lock, cond, &unack)); + } else { + dout(10) << "read from " << oid << dendl; + bufferlist inbl; + client->objecter->read(oid, oloc, 0, osize, CEPH_NOSNAP, &inbl, 0, + new C_Ref(lock, cond, &unack)); + } + client->client_lock.unlock(); + + { + std::unique_lock locker{lock}; + cond.wait(locker, [&unack, this] { + if (unack > 0) { + dout(20) << "waiting for " << unack << " unack" << dendl; + } + return unack <= 0; + }); + } + + utime_t lat = ceph_clock_now(); + lat -= start; + } + + return 0; +} + + + + + +int SyntheticClient::read_random(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes +{ + UserPerm perms = client->pick_my_perms(); + uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)rdsize; + int fd = client->open(fn.c_str(), O_RDWR, perms); + dout(5) << "reading from " << fn << " fd " << fd << dendl; + + if (fd < 0) return fd; + int offset = 0; + char * buf = NULL; + + for (unsigned i=0; i<2000; i++) { + if (time_to_stop()) break; + + bool read=false; + + time_t seconds; + time( &seconds); + srand(seconds); + + // use rand instead ?? + double x = drand48(); + + // cleanup before call 'new' + if (buf != NULL) { + delete[] buf; + buf = NULL; + } + if (x < 0.5) { + buf = new char[rdsize]; + memset(buf, 1, rdsize); + read=true; + } else { + buf = new char[rdsize+100]; // 1 MB + memset(buf, 7, rdsize); + } + + if (read) { + offset=(rand())%(chunks+1); + dout(2) << "reading block " << offset << "/" << chunks << dendl; + + int r = client->read(fd, buf, rdsize, offset*rdsize); + if (r < rdsize) { + dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; + } + } else { + dout(2) << "writing block " << offset << "/" << chunks << dendl; + + // fill buf with a 16 byte fingerprint + // 64 bits : file offset + // 64 bits : client id + // = 128 bits (16 bytes) + + offset=(rand())%(chunks+1); + uint64_t *p = (uint64_t*)buf; + while ((char*)p < buf + rdsize) { + *p = offset*rdsize + (char*)p - buf; + p++; + *p = client->get_nodeid().v; + p++; + } + + client->write(fd, buf, rdsize, + offset*rdsize); + } + + // verify fingerprint + if (read) { + int bad = 0; + int64_t *p = (int64_t*)buf; + while ((char*)p + 32 < buf + rdsize) { + int64_t readoff = *p; + int64_t wantoff = offset*rdsize + (int64_t)((char*)p - buf); + p++; + int64_t readclient = *p; + p++; + if (readoff != wantoff || readclient != client->get_nodeid()) { + if (!bad) + dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient + << ", should be offset " << wantoff << " client " << client->get_nodeid() + << dendl; + bad++; + } + } + if (bad) + dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; + } + } + + client->close(fd); + delete[] buf; + + return 0; +} + +int normdist(int min, int max, int stdev) /* specifies input values */ +{ + /* min: Minimum value; max: Maximum value; stdev: degree of deviation */ + + //int min, max, stdev; { + time_t seconds; + time( &seconds); + srand(seconds); + + int range, iterate, result; + /* declare range, iterate and result as integers, to avoid the need for + floating point math*/ + + result = 0; + /* ensure result is initialized to 0 */ + + range = max -min; + /* calculate range of possible values between the max and min values */ + + iterate = range / stdev; + /* this number of iterations ensures the proper shape of the resulting + curve */ + + stdev += 1; /* compensation for integer vs. floating point math */ + for (int c = iterate; c != 0; c--) /* loop through iterations */ + { + // result += (uniform (1, 100) * stdev) / 100; /* calculate and + result += ( (rand()%100 + 1) * stdev) / 100; + // printf("result=%d\n", result ); + } + printf("\n final result=%d\n", result ); + return result + min; /* send final result back */ +} + +int SyntheticClient::read_random_ex(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes +{ + uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)rdsize; + UserPerm perms = client->pick_my_perms(); + int fd = client->open(fn.c_str(), O_RDWR, perms); + dout(5) << "reading from " << fn << " fd " << fd << dendl; + + if (fd < 0) return fd; + int offset = 0; + char * buf = NULL; + + for (unsigned i=0; i<2000; i++) { + if (time_to_stop()) break; + + bool read=false; + + time_t seconds; + time( &seconds); + srand(seconds); + + // use rand instead ?? + double x = drand48(); + + // cleanup before call 'new' + if (buf != NULL) { + delete[] buf; + buf = NULL; + } + if (x < 0.5) { + buf = new char[rdsize]; + memset(buf, 1, rdsize); + read=true; + } else { + buf = new char[rdsize+100]; // 1 MB + memset(buf, 7, rdsize); + } + + if (read) { + dout(2) << "reading block " << offset << "/" << chunks << dendl; + + int r = client->read(fd, buf, rdsize, + offset*rdsize); + if (r < rdsize) { + dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; + } + } else { + dout(2) << "writing block " << offset << "/" << chunks << dendl; + + // fill buf with a 16 byte fingerprint + // 64 bits : file offset + // 64 bits : client id + // = 128 bits (16 bytes) + + int count = rand()%10; + + for ( int j=0;j<count; j++ ) { + offset=(rand())%(chunks+1); + uint64_t *p = (uint64_t*)buf; + while ((char*)p < buf + rdsize) { + *p = offset*rdsize + (char*)p - buf; + p++; + *p = client->get_nodeid().v; + p++; + } + + client->write(fd, buf, rdsize, offset*rdsize); + } + } + + // verify fingerprint + if (read) { + int bad = 0; + int64_t *p = (int64_t*)buf; + while ((char*)p + 32 < buf + rdsize) { + int64_t readoff = *p; + int64_t wantoff = offset*rdsize + (int64_t)((char*)p - buf); + p++; + int64_t readclient = *p; + p++; + if (readoff != wantoff || readclient != client->get_nodeid()) { + if (!bad) + dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient + << ", should be offset " << wantoff << " client " << client->get_nodeid() + << dendl; + bad++; + } + } + if (bad) + dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; + } + } + + client->close(fd); + delete[] buf; + + return 0; +} + + +int SyntheticClient::random_walk(int num_req) +{ + int left = num_req; + + //dout(1) << "random_walk() will do " << left << " ops" << dendl; + + init_op_dist(); // set up metadata op distribution + + UserPerm perms = client->pick_my_perms(); + while (left > 0) { + left--; + + if (time_to_stop()) break; + + // ascend? + if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { + dout(DBL) << "die says up" << dendl; + up(); + continue; + } + + // descend? + if (roll_die(::pow((double).9,(double)cwd.depth())) && !subdirs.empty()) { + string s = get_random_subdir(); + cwd.push_dentry( s ); + dout(DBL) << "cd " << s << " -> " << cwd << dendl; + clear_dir(); + continue; + } + + int op = 0; + filepath path; + + if (contents.empty() && roll_die(.3)) { + if (did_readdir) { + dout(DBL) << "empty dir, up" << dendl; + up(); + } else + op = CEPH_MDS_OP_READDIR; + } else { + op = op_dist.sample(); + } + //dout(DBL) << "op is " << op << dendl; + + int r = 0; + + // do op + if (op == CEPH_MDS_OP_UNLINK) { + if (contents.empty()) + op = CEPH_MDS_OP_READDIR; + else + r = client->unlink(get_random_sub(), perms); // will fail on dirs + } + + if (op == CEPH_MDS_OP_RENAME) { + if (contents.empty()) + op = CEPH_MDS_OP_READDIR; + else { + r = client->rename(get_random_sub(), make_sub("ren"), perms); + } + } + + if (op == CEPH_MDS_OP_MKDIR) { + r = client->mkdir(make_sub("mkdir"), 0755, perms); + } + + if (op == CEPH_MDS_OP_RMDIR) { + if (!subdirs.empty()) + r = client->rmdir(get_random_subdir(), perms); + else + r = client->rmdir(cwd.c_str(), perms); // will pbly fail + } + + if (op == CEPH_MDS_OP_SYMLINK) { + } + /* + if (op == CEPH_MDS_OP_CHMOD) { + if (contents.empty()) + op = CEPH_MDS_OP_READDIR; + else + r = client->chmod(get_random_sub(), rand() & 0755, perms); + } + + if (op == CEPH_MDS_OP_CHOWN) { + if (contents.empty()) r = client->chown(cwd.c_str(), rand(), rand(), perms); + else + r = client->chown(get_random_sub(), rand(), rand(), perms); + } + + if (op == CEPH_MDS_OP_UTIME) { + struct utimbuf b; + memset(&b, 1, sizeof(b)); + if (contents.empty()) + r = client->utime(cwd.c_str(), &b, perms); + else + r = client->utime(get_random_sub(), &b, perms); + } + */ + if (op == CEPH_MDS_OP_LINK) { + } + + if (op == CEPH_MDS_OP_MKNOD) { + r = client->mknod(make_sub("mknod"), 0644, perms); + } + + if (op == CEPH_MDS_OP_OPEN) { + if (contents.empty()) + op = CEPH_MDS_OP_READDIR; + else { + r = client->open(get_random_sub(), O_RDONLY, perms); + if (r > 0) { + ceph_assert(open_files.count(r) == 0); + open_files.insert(r); + } + } + } + + /*if (op == CEPH_MDS_OP_RELEASE) { // actually, close + if (open_files.empty()) + op = CEPH_MDS_OP_STAT; + else { + int fh = get_random_fh(); + r = client->close( fh ); + if (r == 0) open_files.erase(fh); + } + } + */ + + if (op == CEPH_MDS_OP_GETATTR) { + struct stat st; + if (contents.empty()) { + if (did_readdir) { + if (roll_die(.1)) { + dout(DBL) << "stat in empty dir, up" << dendl; + up(); + } else { + op = CEPH_MDS_OP_MKNOD; + } + } else + op = CEPH_MDS_OP_READDIR; + } else + r = client->lstat(get_random_sub(), &st, perms); + } + + if (op == CEPH_MDS_OP_READDIR) { + clear_dir(); + + list<string> c; + r = client->getdir(cwd.c_str(), c, perms); + + for (list<string>::iterator it = c.begin(); + it != c.end(); + ++it) { + //dout(DBL) << " got " << *it << dendl; + ceph_abort(); + /*contents[*it] = it->second; + if (it->second && + S_ISDIR(it->second->st_mode)) + subdirs.insert(*it); + */ + } + + did_readdir = true; + } + + // errors? + if (r < 0) { + // reevaluate cwd. + //while (cwd.depth()) { + //if (client->lookup(cwd)) break; // it's in the cache + + //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << dendl; + dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." << dendl; + up(); + //} + } + } + + // close files + dout(DBL) << "closing files" << dendl; + while (!open_files.empty()) { + int fh = get_random_fh(); + int r = client->close( fh ); + if (r == 0) open_files.erase(fh); + } + + dout(DBL) << "done" << dendl; + return 0; +} + + + + +void SyntheticClient::make_dir_mess(const char *basedir, int n) +{ + UserPerm perms = client->pick_my_perms(); + vector<string> dirs; + + dirs.push_back(basedir); + dirs.push_back(basedir); + + client->mkdir(basedir, 0755, perms); + + // motivation: + // P(dir) ~ subdirs_of(dir) + 2 + // from 5-year metadata workload paper in fast'07 + + // create dirs + for (int i=0; i<n; i++) { + // pick a dir + int k = rand() % dirs.size(); + string parent = dirs[k]; + + // pick a name + std::stringstream ss; + ss << parent << "/" << i; + string dir = ss.str(); + + // update dirs + dirs.push_back(parent); + dirs.push_back(dir); + dirs.push_back(dir); + + // do it + client->mkdir(dir.c_str(), 0755, perms); + } + + +} + + + +void SyntheticClient::foo() +{ + UserPerm perms = client->pick_my_perms(); + + if (1) { + // make 2 parallel dirs, link/unlink between them. + char a[100], b[100]; + client->mkdir("/a", 0755, perms); + client->mkdir("/b", 0755, perms); + for (int i=0; i<10; i++) { + snprintf(a, sizeof(a), "/a/%d", i); + client->mknod(a, 0644, perms); + } + while (1) { + for (int i=0; i<10; i++) { + snprintf(a, sizeof(a), "/a/%d", i); + snprintf(b, sizeof(b), "/b/%d", i); + client->link(a, b, perms); + } + for (int i=0; i<10; i++) { + snprintf(b, sizeof(b), "/b/%d", i); + client->unlink(b, perms); + } + } + return; + } + if (1) { + // bug1.cpp + const char *fn = "blah"; + char buffer[8192]; + client->unlink(fn, perms); + int handle = client->open(fn, O_CREAT|O_RDWR, perms, S_IRWXU); + ceph_assert(handle>=0); + int r=client->write(handle,buffer,8192); + ceph_assert(r>=0); + r=client->close(handle); + ceph_assert(r>=0); + + handle = client->open(fn, O_RDWR, perms); // open the same file, it must have some data already + ceph_assert(handle>=0); + r=client->read(handle,buffer,8192); + ceph_assert(r==8192); // THIS ASSERTION FAILS with disabled cache + r=client->close(handle); + ceph_assert(r>=0); + + return; + } + if (1) { + dout(0) << "first" << dendl; + int fd = client->open("tester", O_WRONLY|O_CREAT, perms); + client->write(fd, "hi there", 0, 8); + client->close(fd); + dout(0) << "sleep" << dendl; + sleep(10); + dout(0) << "again" << dendl; + fd = client->open("tester", O_WRONLY|O_CREAT, perms); + client->write(fd, "hi there", 0, 8); + client->close(fd); + return; + } + if (1) { + // open some files + srand(0); + for (int i=0; i<20; i++) { + int s = 5; + int a = rand() % s; + int b = rand() % s; + int c = rand() % s; + char src[80]; + snprintf(src, sizeof(src), "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); + //int fd = + client->open(src, O_RDONLY, perms); + } + + return; + } + + if (0) { + // rename fun + for (int i=0; i<100; i++) { + int s = 5; + int a = rand() % s; + int b = rand() % s; + int c = rand() % s; + int d = rand() % s; + int e = rand() % s; + int f = rand() % s; + char src[80]; + char dst[80]; + snprintf(src, sizeof(src), "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); + snprintf(dst, sizeof(dst), "syn.0.0/dir.%d/dir.%d/file.%d", d, e, f); + client->rename(src, dst, perms); + } + return; + } + + if (1) { + // link fun + srand(0); + for (int i=0; i<100; i++) { + int s = 5; + int a = rand() % s; + int b = rand() % s; + int c = rand() % s; + int d = rand() % s; + int e = rand() % s; + int f = rand() % s; + char src[80]; + char dst[80]; + snprintf(src, sizeof(src), "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); + snprintf(dst, sizeof(dst), "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); + client->link(src, dst, perms); + } + srand(0); + for (int i=0; i<100; i++) { + int s = 5; + int a = rand() % s; + int b = rand() % s; + int c = rand() % s; + int d = rand() % s; + int e = rand() % s; + int f = rand() % s; + char src[80]; + char dst[80]; + snprintf(src, sizeof(src), "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); + snprintf(dst, sizeof(dst), "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); + client->unlink(dst, perms); + } + + + return; + } + + // link fun + client->mknod("one", 0755, perms); + client->mknod("two", 0755, perms); + client->link("one", "three", perms); + client->mkdir("dir", 0755, perms); + client->link("two", "/dir/twolink", perms); + client->link("dir/twolink", "four", perms); + + // unlink fun + client->mknod("a", 0644, perms); + client->unlink("a", perms); + client->mknod("b", 0644, perms); + client->link("b", "c", perms); + client->unlink("c", perms); + client->mkdir("d", 0755, perms); + client->unlink("d", perms); + client->rmdir("d", perms); + + // rename fun + client->mknod("p1", 0644, perms); + client->mknod("p2", 0644, perms); + client->rename("p1","p2", perms); + client->mknod("p3", 0644, perms); + client->rename("p3","p4", perms); + + // check dest dir ambiguity thing + client->mkdir("dir1", 0755, perms); + client->mkdir("dir2", 0755, perms); + client->rename("p2", "dir1/p2", perms); + client->rename("dir1/p2", "dir2/p2", perms); + client->rename("dir2/p2", "/p2", perms); + + // check primary+remote link merging + client->link("p2","p2.l", perms); + client->link("p4","p4.l", perms); + client->rename("p2.l", "p2", perms); + client->rename("p4", "p4.l", perms); + + // check anchor updates + client->mknod("dir1/a", 0644, perms); + client->link("dir1/a", "da1", perms); + client->link("dir1/a", "da2", perms); + client->link("da2","da3", perms); + client->rename("dir1/a", "dir2/a", perms); + client->rename("dir2/a", "da2", perms); + client->rename("da1", "da2", perms); + client->rename("da2", "da3", perms); + + // check directory renames + client->mkdir("dir3", 0755, perms); + client->mknod("dir3/asdf", 0644, perms); + client->mkdir("dir4", 0755, perms); + client->mkdir("dir5", 0755, perms); + client->mknod("dir5/asdf", 0644, perms); + client->rename("dir3", "dir4", perms); // ok + client->rename("dir4", "dir5", perms); // fail +} + +int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) +{ + dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth + << " links " << n + << dendl; + + if (time_to_stop()) return 0; + + UserPerm perms = client->pick_my_perms(); + + srand(0); + if (1) { + bool renames = true; // thrash renames too? + for (int k=0; k<n; k++) { + + if (renames && rand() % 10 == 0) { + // rename some directories. whee! + int dep = (rand() % depth) + 1; + string src = basedir; + { + char t[80]; + for (int d=0; d<dep; d++) { + int a = rand() % dirs; + snprintf(t, sizeof(t), "/dir.%d", a); + src += t; + } + } + string dst = basedir; + { + char t[80]; + for (int d=0; d<dep; d++) { + int a = rand() % dirs; + snprintf(t, sizeof(t), "/dir.%d", a); + dst += t; + } + } + + if (client->rename(dst.c_str(), "/tmp", perms) == 0) { + client->rename(src.c_str(), dst.c_str(), perms); + client->rename("/tmp", src.c_str(), perms); + } + continue; + } + + // pick a dest dir + string src = basedir; + { + char t[80]; + for (int d=0; d<depth; d++) { + int a = rand() % dirs; + snprintf(t, sizeof(t), "/dir.%d", a); + src += t; + } + int a = rand() % files; + snprintf(t, sizeof(t), "/file.%d", a); + src += t; + } + string dst = basedir; + { + char t[80]; + for (int d=0; d<depth; d++) { + int a = rand() % dirs; + snprintf(t, sizeof(t), "/dir.%d", a); + dst += t; + } + int a = rand() % files; + snprintf(t, sizeof(t), "/file.%d", a); + dst += t; + } + + int o = rand() % 4; + switch (o) { + case 0: + client->mknod(src.c_str(), 0755, perms); + if (renames) client->rename(src.c_str(), dst.c_str(), perms); + break; + case 1: + client->mknod(src.c_str(), 0755, perms); + client->unlink(dst.c_str(), perms); + client->link(src.c_str(), dst.c_str(), perms); + break; + case 2: client->unlink(src.c_str(), perms); break; + case 3: client->unlink(dst.c_str(), perms); break; + //case 4: client->mknod(src.c_str(), 0755, perms); break; + //case 5: client->mknod(dst.c_str(), 0755, perms); break; + } + } + return 0; + } + + if (1) { + // now link shit up + for (int i=0; i<n; i++) { + if (time_to_stop()) return 0; + + char f[20]; + + // pick a file + string file = basedir; + + if (depth) { + int d = rand() % (depth+1); + for (int k=0; k<d; k++) { + snprintf(f, sizeof(f), "/dir.%d", rand() % dirs); + file += f; + } + } + snprintf(f, sizeof(f), "/file.%d", rand() % files); + file += f; + + // pick a dir for our link + string ln = basedir; + if (depth) { + int d = rand() % (depth+1); + for (int k=0; k<d; k++) { + snprintf(f, sizeof(f), "/dir.%d", rand() % dirs); + ln += f; + } + } + snprintf(f, sizeof(f), "/ln.%d", i); + ln += f; + + client->link(file.c_str(), ln.c_str(), perms); + } + } + return 0; +} + + + + +void SyntheticClient::import_find(const char *base, const char *find, bool data) +{ + dout(1) << "import_find " << base << " from " << find << " data=" << data << dendl; + + /* use this to gather the static trace: + * + * find . -exec ls -dilsn --time-style=+%s \{\} \; + * or if it's wafl, + * find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; + * + */ + + UserPerm process_perms = client->pick_my_perms(); + + if (base[0] != '-') + client->mkdir(base, 0755, process_perms); + + ifstream f(find); + ceph_assert(f.is_open()); + + int dirnum = 0; + + while (!f.eof()) { + uint64_t ino; + int dunno, nlink; + string modestring; + int uid, gid; + off_t size; + time_t mtime; + string filename; + f >> ino; + if (f.eof()) break; + f >> dunno; + f >> modestring; + f >> nlink; + f >> uid; + f >> gid; + f >> size; + f >> mtime; + f.seekg(1, ios::cur); + getline(f, filename); + UserPerm perms(uid, gid); + + // ignore "." + if (filename == ".") continue; + + // remove leading ./ + ceph_assert(filename[0] == '.' && filename[1] == '/'); + filename = filename.substr(2); + + // new leading dir? + int sp = filename.find("/"); + if (sp < 0) dirnum++; + + //dout(0) << "leading dir " << filename << " " << dirnum << dendl; + if (dirnum % num_client != client->get_nodeid()) { + dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl; + continue; + } + + // parse the mode + ceph_assert(modestring.length() == 10); + mode_t mode = 0; + switch (modestring[0]) { + case 'd': mode |= S_IFDIR; break; + case 'l': mode |= S_IFLNK; break; + default: + case '-': mode |= S_IFREG; break; + } + if (modestring[1] == 'r') mode |= 0400; + if (modestring[2] == 'w') mode |= 0200; + if (modestring[3] == 'x') mode |= 0100; + if (modestring[4] == 'r') mode |= 040; + if (modestring[5] == 'w') mode |= 020; + if (modestring[6] == 'x') mode |= 010; + if (modestring[7] == 'r') mode |= 04; + if (modestring[8] == 'w') mode |= 02; + if (modestring[9] == 'x') mode |= 01; + + dout(20) << " mode " << modestring << " to " << oct << mode << dec << dendl; + + if (S_ISLNK(mode)) { + // target vs destination + int pos = filename.find(" -> "); + ceph_assert(pos > 0); + string link; + if (base[0] != '-') { + link = base; + link += "/"; + } + link += filename.substr(0, pos); + string target; + if (filename[pos+4] == '/') { + if (base[0] != '-') + target = base; + target += filename.substr(pos + 4); + } else { + target = filename.substr(pos + 4); + } + dout(10) << "symlink from '" << link << "' -> '" << target << "'" << dendl; + client->symlink(target.c_str(), link.c_str(), perms); + } else { + string f; + if (base[0] != '-') { + f = base; + f += "/"; + } + f += filename; + if (S_ISDIR(mode)) { + client->mkdir(f.c_str(), mode, perms); + } else { + int fd = client->open(f.c_str(), O_WRONLY|O_CREAT, perms, mode & 0777); + ceph_assert(fd > 0); + if (data) { + client->write(fd, "", 0, size); + } else { + client->truncate(f.c_str(), size, perms); + } + client->close(fd); + + //client->chmod(f.c_str(), mode & 0777, perms, process_perms); + client->chown(f.c_str(), uid, gid, process_perms); + + struct utimbuf ut; + ut.modtime = mtime; + ut.actime = mtime; + client->utime(f.c_str(), &ut, perms); + } + } + } + + +} + + +int SyntheticClient::lookup_hash(inodeno_t ino, inodeno_t dirino, + const char *name, const UserPerm& perms) +{ + int r = client->lookup_hash(ino, dirino, name, perms); + dout(0) << "lookup_hash(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl; + return r; +} + +int SyntheticClient::lookup_ino(inodeno_t ino, const UserPerm& perms) +{ + int r = client->lookup_ino(ino, perms); + dout(0) << "lookup_ino(" << ino << ") = " << r << dendl; + return r; +} + +int SyntheticClient::chunk_file(string &filename) +{ + UserPerm perms = client->pick_my_perms(); + int fd = client->open(filename.c_str(), O_RDONLY, perms); + if (fd < 0) + return fd; + + struct stat st; + int ret = client->fstat(fd, &st, perms); + if (ret < 0) { + client->close(fd); + return ret; + } + uint64_t size = st.st_size; + dout(0) << "file " << filename << " size is " << size << dendl; + + inode_t inode{}; + inode.ino = st.st_ino; + ret = client->fdescribe_layout(fd, &inode.layout); + ceph_assert(ret == 0); // otherwise fstat did a bad thing + + uint64_t pos = 0; + bufferlist from_before; + while (pos < size) { + int get = std::min<int>(size - pos, 1048576); + + ceph::mutex flock = ceph::make_mutex("synclient chunk_file lock"); + ceph::condition_variable cond; + bool done; + bufferlist bl; + { + std::unique_lock locker{flock}; + Context *onfinish = new C_SafeCond(flock, cond, &done); + client->filer->read(inode.ino, &inode.layout, CEPH_NOSNAP, pos, get, &bl, 0, + onfinish); + cond.wait(locker, [&done] { return done; }); + } + dout(0) << "got " << bl.length() << " bytes at " << pos << dendl; + + if (from_before.length()) { + dout(0) << " including bit from previous block" << dendl; + pos -= from_before.length(); + from_before.claim_append(bl); + bl.swap(from_before); + } + + // .... + + // keep last 32 bytes around + from_before.clear(); + from_before.substr_of(bl, bl.length()-32, 32); + + pos += bl.length(); + } + + client->close(fd); + return 0; +} + + + +void SyntheticClient::mksnap(const char *base, const char *name, const UserPerm& perms) +{ + client->mksnap(base, name, perms); +} + +void SyntheticClient::rmsnap(const char *base, const char *name, const UserPerm& perms) +{ + client->rmsnap(base, name, perms); +} + +void SyntheticClient::mksnapfile(const char *dir) +{ + UserPerm perms = client->pick_my_perms(); + client->mkdir(dir, 0755, perms); + + string f = dir; + f += "/foo"; + int fd = client->open(f.c_str(), O_WRONLY|O_CREAT|O_TRUNC, perms); + + char buf[1048576*4]; + client->write(fd, buf, sizeof(buf), 0); + client->fsync(fd, true); + client->close(fd); + + string s = dir; + s += "/.snap/1"; + client->mkdir(s.c_str(), 0755, perms); + + fd = client->open(f.c_str(), O_WRONLY, perms); + client->write(fd, buf, 1048576*2, 1048576); + client->fsync(fd, true); + client->close(fd); +} diff --git a/src/client/SyntheticClient.h b/src/client/SyntheticClient.h new file mode 100644 index 000000000..2c1601637 --- /dev/null +++ b/src/client/SyntheticClient.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_SYNTHETICCLIENT_H +#define CEPH_SYNTHETICCLIENT_H + +#include <pthread.h> + +#include "Client.h" +#include "include/Distribution.h" + +#include "Trace.h" + +#define SYNCLIENT_FIRST_POOL 0 + +#define SYNCLIENT_MODE_RANDOMWALK 1 +#define SYNCLIENT_MODE_FULLWALK 2 +#define SYNCLIENT_MODE_REPEATWALK 3 + +#define SYNCLIENT_MODE_MAKEDIRMESS 7 +#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth +#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth +#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth + +#define SYNCLIENT_MODE_MAKEFILES 11 // num count private +#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private +#define SYNCLIENT_MODE_CREATESHARED 13 // num +#define SYNCLIENT_MODE_OPENSHARED 14 // num count + +#define SYNCLIENT_MODE_RMFILE 19 +#define SYNCLIENT_MODE_WRITEFILE 20 +#define SYNCLIENT_MODE_READFILE 21 +#define SYNCLIENT_MODE_WRITEBATCH 22 +#define SYNCLIENT_MODE_WRSHARED 23 +#define SYNCLIENT_MODE_READSHARED 24 +#define SYNCLIENT_MODE_RDWRRANDOM 25 +#define SYNCLIENT_MODE_RDWRRANDOM_EX 26 + +#define SYNCLIENT_MODE_LINKTEST 27 + +#define SYNCLIENT_MODE_OVERLOAD_OSD_0 28 // two args + +#define SYNCLIENT_MODE_DROPCACHE 29 + +#define SYNCLIENT_MODE_TRACE 30 + +#define SYNCLIENT_MODE_CREATEOBJECTS 35 +#define SYNCLIENT_MODE_OBJECTRW 36 + +#define SYNCLIENT_MODE_OPENTEST 40 +#define SYNCLIENT_MODE_OPTEST 41 + +#define SYNCLIENT_MODE_ONLY 50 +#define SYNCLIENT_MODE_ONLYRANGE 51 +#define SYNCLIENT_MODE_EXCLUDE 52 +#define SYNCLIENT_MODE_EXCLUDERANGE 53 + +#define SYNCLIENT_MODE_UNTIL 55 +#define SYNCLIENT_MODE_SLEEPUNTIL 56 + +#define SYNCLIENT_MODE_RANDOMSLEEP 61 +#define SYNCLIENT_MODE_SLEEP 62 + +#define SYNCLIENT_MODE_DUMP 63 + +#define SYNCLIENT_MODE_LOOKUPHASH 70 +#define SYNCLIENT_MODE_LOOKUPINO 71 + +#define SYNCLIENT_MODE_TRUNCATE 200 + +#define SYNCLIENT_MODE_FOO 100 +#define SYNCLIENT_MODE_THRASHLINKS 101 + +#define SYNCLIENT_MODE_IMPORTFIND 300 + +#define SYNCLIENT_MODE_CHUNK 400 + +#define SYNCLIENT_MODE_MKSNAP 1000 +#define SYNCLIENT_MODE_RMSNAP 1001 + +#define SYNCLIENT_MODE_MKSNAPFILE 1002 + + + +void parse_syn_options(std::vector<const char*>& args); +extern int num_client; + +class SyntheticClient { + StandaloneClient *client; + int whoami; + + pthread_t thread_id; + + Distribution op_dist; + + void init_op_dist(); + int get_op(); + + + filepath cwd; + std::map<std::string, struct stat*> contents; + std::set<std::string> subdirs; + bool did_readdir; + std::set<int> open_files; + + void up(); + + void clear_dir() { + contents.clear(); + subdirs.clear(); + did_readdir = false; + } + + int get_random_fh() { + int r = rand() % open_files.size(); + std::set<int>::iterator it = open_files.begin(); + while (r--) ++it; + return *it; + } + + + filepath n1; + const char *get_random_subdir() { + ceph_assert(!subdirs.empty()); + int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn + std::set<std::string>::iterator it = subdirs.begin(); + while (r--) ++it; + + n1 = cwd; + n1.push_dentry( *it ); + return n1.get_path().c_str(); + } + filepath n2; + const char *get_random_sub() { + ceph_assert(!contents.empty()); + int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn + if (cwd.depth() && cwd.last_dentry().length()) + r += cwd.last_dentry().c_str()[0]; // slightly permuted + r %= contents.size(); + + std::map<std::string,struct stat*>::iterator it = contents.begin(); + while (r--) ++it; + + n2 = cwd; + n2.push_dentry( it->first ); + return n2.get_path().c_str(); + } + + filepath sub; + char sub_s[50]; + const char *make_sub(const char *base) { + snprintf(sub_s, sizeof(sub_s), "%s.%d", base, rand() % 100); + std::string f = sub_s; + sub = cwd; + sub.push_dentry(f); + return sub.c_str(); + } + + public: + SyntheticClient(StandaloneClient *client, int w = -1); + + int start_thread(); + int join_thread(); + + int run(); + + bool run_me() { + if (run_only >= 0) { + if (run_only == client->get_nodeid()) + return true; + else + return false; + } + return true; + } + void did_run_me() { + run_only = -1; + run_until = utime_t(); + } + + // run() will do one of these things: + std::list<int> modes; + std::list<std::string> sargs; + std::list<int> iargs; + utime_t run_start; + utime_t run_until; + + client_t run_only; + client_t exclude; + + std::string get_sarg(int seq); + int get_iarg() { + int i = iargs.front(); + iargs.pop_front(); + return i; + } + + bool time_to_stop() { + utime_t now = ceph_clock_now(); + if (0) std::cout << "time_to_stop .. now " << now + << " until " << run_until + << " start " << run_start + << std::endl; + if (run_until.sec() && now > run_until) + return true; + else + return false; + } + + std::string compose_path(std::string& prefix, char *rest) { + return prefix + rest; + } + + int full_walk(std::string& fromdir); + int random_walk(int n); + + int dump_placement(std::string& fn); + + + int make_dirs(const char *basedir, int dirs, int files, int depth); + int stat_dirs(const char *basedir, int dirs, int files, int depth); + int read_dirs(const char *basedir, int dirs, int files, int depth); + int make_files(int num, int count, int priv, bool more); + int link_test(); + + int create_shared(int num); + int open_shared(int num, int count); + + int rm_file(std::string& fn); + int write_file(std::string& fn, int mb, loff_t chunk); + int write_fd(int fd, int size, int wrsize); + + int write_batch(int nfile, int mb, int chunk); + int read_file(const std::string& fn, int mb, int chunk, bool ignoreprint=false); + + int create_objects(int nobj, int osize, int inflight); + int object_rw(int nobj, int osize, int wrpc, int overlap, + double rskew, double wskew); + + int read_random(std::string& fn, int mb, int chunk); + int read_random_ex(std::string& fn, int mb, int chunk); + + int overload_osd_0(int n, int sie, int wrsize); + int check_first_primary(int fd); + + int clean_dir(std::string& basedir); + + int play_trace(Trace& t, std::string& prefix, bool metadata_only=false); + + void make_dir_mess(const char *basedir, int n); + void foo(); + + int thrash_links(const char *basedir, int dirs, int files, int depth, int n); + + void import_find(const char *basedir, const char *find, bool writedata); + + int lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name, + const UserPerm& perms); + int lookup_ino(inodeno_t ino, const UserPerm& perms); + + int chunk_file(std::string &filename); + + void mksnap(const char *base, const char *name, const UserPerm& perms); + void rmsnap(const char *base, const char *name, const UserPerm& perms); + void mksnapfile(const char *dir); + +}; + +#endif diff --git a/src/client/Trace.cc b/src/client/Trace.cc new file mode 100644 index 000000000..0d07da66c --- /dev/null +++ b/src/client/Trace.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "Trace.h" +#include "common/debug.h" + +#include <iostream> +#include <map> + +#include "common/config.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + + + + + +void Trace::start() +{ + //cout << "start" << std::endl; + delete fs; + + fs = new ifstream(); + fs->open(filename); + if (!fs->is_open()) { + //generic_dout(0) << "** unable to open trace file " << filename << dendl; + ceph_abort(); + } + //generic_dout(2) << "opened traced file '" << filename << "'" << dendl; + + // read first line + getline(*fs, line); + //cout << "first line is " << line << std::endl; + + _line = 1; +} + +const char *Trace::peek_string(string &buf, const char *prefix) +{ + //if (prefix) cout << "prefix '" << prefix << "' line '" << line << "'" << std::endl; + if (prefix && + strstr(line.c_str(), "/prefix") == line.c_str()) { + buf.clear(); + buf.append(prefix); + buf.append(line.c_str() + strlen("/prefix")); + } else { + buf = line; + } + return buf.c_str(); +} + + +const char *Trace::get_string(string &buf, const char *prefix) +{ + peek_string(buf, prefix); + + //cout << "buf is " << buf << std::endl; + // read next line (and detect eof early) + _line++; + getline(*fs, line); + //cout << "next line is " << line << std::endl; + + return buf.c_str(); +} diff --git a/src/client/Trace.h b/src/client/Trace.h new file mode 100644 index 000000000..2f90c4f7c --- /dev/null +++ b/src/client/Trace.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_CLIENT_TRACE_H +#define CEPH_CLIENT_TRACE_H + +#include <stdlib.h> + +#include <list> +#include <string> +#include <fstream> +using std::list; +using std::string; +using std::ifstream; + +/* + + this class is more like an iterator over a constant tokenlist (which + is protected by a mutex, see Trace.cc) + + */ + +class Trace { + int _line; + const char *filename; + ifstream *fs; + string line; + + public: + explicit Trace(const char* f) : _line(0), filename(f), fs(0) {} + ~Trace() { + delete fs; + } + + Trace(const Trace& other); + const Trace& operator=(const Trace& other); + + int get_line() { return _line; } + + void start(); + + const char *peek_string(string &buf, const char *prefix); + const char *get_string(string &buf, const char *prefix); + + int64_t get_int() { + string buf; + return atoll(get_string(buf, 0)); + } + bool end() { + return !fs || fs->eof(); + //return _cur == _end; + } +}; + +#endif diff --git a/src/client/UserPerm.h b/src/client/UserPerm.h new file mode 100644 index 000000000..bb7c25d3e --- /dev/null +++ b/src/client/UserPerm.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_CLIENT_USERPERM_H +#define CEPH_CLIENT_USERPERM_H + +struct UserPerm +{ +private: + uid_t m_uid; + gid_t m_gid; + int gid_count; + gid_t *gids; + bool alloced_gids; + void deep_copy_from(const UserPerm& b) { + if (alloced_gids) { + delete[] gids; + alloced_gids = false; + } + m_uid = b.m_uid; + m_gid = b.m_gid; + gid_count = b.gid_count; + if (gid_count > 0) { + gids = new gid_t[gid_count]; + alloced_gids = true; + for (int i = 0; i < gid_count; ++i) { + gids[i] = b.gids[i]; + } + } + } +public: + UserPerm() : m_uid(-1), m_gid(-1), gid_count(0), + gids(NULL), alloced_gids(false) {} + UserPerm(uid_t uid, gid_t gid, int ngids=0, gid_t *gidlist=NULL) : + m_uid(uid), m_gid(gid), gid_count(ngids), + gids(gidlist), alloced_gids(false) {} + UserPerm(const UserPerm& o) : UserPerm() { + deep_copy_from(o); + } + UserPerm(UserPerm && o) { + m_uid = o.m_uid; + m_gid = o.m_gid; + gid_count = o.gid_count; + gids = o.gids; + alloced_gids = o.alloced_gids; + o.gids = NULL; + o.gid_count = 0; + } + ~UserPerm() { + if (alloced_gids) + delete[] gids; + } + UserPerm& operator=(const UserPerm& o) { + deep_copy_from(o); + return *this; + } + + uid_t uid() const { return m_uid != (uid_t)-1 ? m_uid : ::geteuid(); } + gid_t gid() const { return m_gid != (gid_t)-1 ? m_gid : ::getegid(); } + bool gid_in_groups(gid_t id) const { + if (id == gid()) return true; + for (int i = 0; i < gid_count; ++i) { + if (id == gids[i]) return true; + } + return false; + } + int get_gids(const gid_t **_gids) const { *_gids = gids; return gid_count; } + void init_gids(gid_t* _gids, int count) { + gids = _gids; + gid_count = count; + alloced_gids = true; + } + void shallow_copy(const UserPerm& o) { + m_uid = o.m_uid; + m_gid = o.m_gid; + gid_count = o.gid_count; + gids = o.gids; + alloced_gids = false; + } +}; + +#endif diff --git a/src/client/barrier.cc b/src/client/barrier.cc new file mode 100644 index 000000000..dfdedaf58 --- /dev/null +++ b/src/client/barrier.cc @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * + * Copyright (C) 2012 CohortFS, LLC. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#if defined(__FreeBSD__) +#include <sys/param.h> +#endif + +#include "include/Context.h" +#include "Client.h" +#include "barrier.h" + +#undef dout_prefix +#define dout_prefix *_dout << "client." << whoami << " " + +#define dout_subsys ceph_subsys_client + +#define cldout(cl, v) dout_impl((cl)->cct, dout_subsys, v) \ + *_dout << "client." << cl->whoami << " " + +/* C_Block_Sync */ +class C_Block_Sync : public Context { +private: + Client *cl; + uint64_t ino; + barrier_interval iv; + enum CBlockSync_State state; + Barrier *barrier; + int *rval; /* see Cond.h */ + +public: + boost::intrusive::list_member_hook<> intervals_hook; + C_Block_Sync(Client *c, uint64_t i, barrier_interval iv, int *r); + void finish(int rval); + + friend class Barrier; + friend class BarrierContext; +}; + +C_Block_Sync::C_Block_Sync(Client *c, uint64_t i, barrier_interval iv, + int *r=0) : + cl(c), ino(i), iv(iv), rval(r) +{ + state = CBlockSync_State_None; + barrier = NULL; + + cldout(cl, 1) << "C_Block_Sync for " << ino << dendl; + + if (!cl->barriers[ino]) { + cl->barriers[ino] = new BarrierContext(cl, ino); + } + /* XXX current semantics aren't commit-ordered */ + cl->barriers[ino]->write_nobarrier(*this); +} + +void C_Block_Sync::finish(int r) { + cldout(cl, 1) << "C_Block_Sync::finish() for " << ino << " " + << iv << " r==" << r << dendl; + if (rval) + *rval = r; + cl->barriers[ino]->complete(*this); +} + +/* Barrier */ +Barrier::Barrier() +{ } + +Barrier::~Barrier() +{ } + +/* BarrierContext */ +BarrierContext::BarrierContext(Client *c, uint64_t ino) : + cl(c), ino(ino) +{ }; + +void BarrierContext::write_nobarrier(C_Block_Sync &cbs) +{ + std::lock_guard locker(lock); + cbs.state = CBlockSync_State_Unclaimed; + outstanding_writes.push_back(cbs); +} + +void BarrierContext::write_barrier(C_Block_Sync &cbs) +{ + std::unique_lock locker(lock); + barrier_interval &iv = cbs.iv; + + { /* find blocking commit--intrusive no help here */ + BarrierList::iterator iter; + bool done = false; + for (iter = active_commits.begin(); + !done && (iter != active_commits.end()); + ++iter) { + Barrier &barrier = *iter; + while (boost::icl::intersects(barrier.span, iv)) { + /* wait on this */ + barrier.cond.wait(locker); + done = true; + } + } + } + + cbs.state = CBlockSync_State_Unclaimed; + outstanding_writes.push_back(cbs); + +} /* write_barrier */ + +void BarrierContext::commit_barrier(barrier_interval &civ) +{ + std::unique_lock locker(lock); + + /* we commit outstanding writes--if none exist, we don't care */ + if (outstanding_writes.size() == 0) + return; + + boost::icl::interval_set<uint64_t> cvs; + cvs.insert(civ); + + Barrier *barrier = NULL; + BlockSyncList::iterator iter, iter2; + + iter = outstanding_writes.begin(); + while (iter != outstanding_writes.end()) { + barrier_interval &iv = iter->iv; + if (boost::icl::intersects(cvs, iv)) { + C_Block_Sync &a_write = *iter; + if (! barrier) + barrier = new Barrier(); + /* mark the callback */ + a_write.state = CBlockSync_State_Committing; + a_write.barrier = barrier; + iter2 = iter++; + outstanding_writes.erase(iter2); + barrier->write_list.push_back(a_write); + barrier->span.insert(iv); + /* avoid iter invalidate */ + } else { + ++iter; + } + } + + if (barrier) { + active_commits.push_back(*barrier); + /* and wait on this */ + barrier->cond.wait(locker); + } + +} /* commit_barrier */ + +void BarrierContext::complete(C_Block_Sync &cbs) +{ + std::lock_guard locker(lock); + BlockSyncList::iterator iter = + BlockSyncList::s_iterator_to(cbs); + + switch (cbs.state) { + case CBlockSync_State_Unclaimed: + /* cool, no waiting */ + outstanding_writes.erase(iter); + break; + case CBlockSync_State_Committing: + { + Barrier *barrier = iter->barrier; + barrier->write_list.erase(iter); + /* signal waiters */ + barrier->cond.notify_all(); + /* dispose cleared barrier */ + if (barrier->write_list.size() == 0) { + BarrierList::iterator iter2 = + BarrierList::s_iterator_to(*barrier); + active_commits.erase(iter2); + delete barrier; + } + } + break; + default: + ceph_abort(); + break; + } + + cbs.state = CBlockSync_State_Completed; + +} /* complete */ + +BarrierContext::~BarrierContext() +{ } diff --git a/src/client/barrier.h b/src/client/barrier.h new file mode 100644 index 000000000..289a3a521 --- /dev/null +++ b/src/client/barrier.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * + * Copyright (C) 2012 CohortFS, LLC. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef BARRIER_H +#define BARRIER_H + +#include "include/types.h" +#include <boost/intrusive/list.hpp> +#define BOOST_ICL_USE_STATIC_BOUNDED_INTERVALS +#include <boost/icl/interval_set.hpp> +#include "common/ceph_mutex.h" + +class Client; + +typedef boost::icl::interval<uint64_t>::type barrier_interval; + + +/* + * we keep count of uncommitted writes on the inode, so that + * ll_commit_blocks can do the right thing. + * + * This is just a hacked copy of Ceph's sync callback. + */ + +enum CBlockSync_State +{ + CBlockSync_State_None, /* initial state */ + CBlockSync_State_Unclaimed, /* outstanding write */ + CBlockSync_State_Committing, /* commit in progress */ + CBlockSync_State_Completed, +}; + +class BarrierContext; + +class C_Block_Sync; + +typedef boost::intrusive::list< C_Block_Sync, + boost::intrusive::member_hook< + C_Block_Sync, + boost::intrusive::list_member_hook<>, + &C_Block_Sync::intervals_hook > + > BlockSyncList; + +class Barrier +{ +private: + ceph::condition_variable cond; + boost::icl::interval_set<uint64_t> span; + BlockSyncList write_list; + +public: + boost::intrusive::list_member_hook<> active_commits_hook; + + Barrier(); + ~Barrier(); + + friend class BarrierContext; +}; + +typedef boost::intrusive::list< Barrier, + boost::intrusive::member_hook< + Barrier, + boost::intrusive::list_member_hook<>, + &Barrier::active_commits_hook > + > BarrierList; + +class BarrierContext +{ +private: + Client *cl; + uint64_t ino; + ceph::mutex lock = ceph::make_mutex("BarrierContext"); + + // writes not claimed by a commit + BlockSyncList outstanding_writes; + + // commits in progress, with their claimed writes + BarrierList active_commits; + +public: + BarrierContext(Client *c, uint64_t ino); + void write_nobarrier(C_Block_Sync &cbs); + void write_barrier(C_Block_Sync &cbs); + void commit_barrier(barrier_interval &civ); + void complete(C_Block_Sync &cbs); + ~BarrierContext(); +}; + +#endif diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc new file mode 100644 index 000000000..7f92dd668 --- /dev/null +++ b/src/client/fuse_ll.cc @@ -0,0 +1,1814 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <sys/file.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> + +#if defined(__linux__) +#include <libgen.h> +#include <sys/vfs.h> +#include <sys/xattr.h> +#include <linux/magic.h> +#endif + +// ceph +#include "common/errno.h" +#include "common/safe_io.h" +#include "include/types.h" +#include "Client.h" +#include "Fh.h" +#include "ioctl.h" +#include "common/config.h" +#include "include/ceph_assert.h" +#include "include/cephfs/ceph_ll_client.h" +#include "include/ceph_fuse.h" + +#include "fuse_ll.h" +#include <fuse_lowlevel.h> + +#define dout_context g_ceph_context + +#define FINO_INO(x) ((x) & ((1ull<<48)-1ull)) +#define FINO_STAG(x) ((x) >> 48) +#define MAKE_FINO(i,s) ((i) | ((int64_t)(s) << 48)) +#define STAG_MASK 0xffff +#define G_NOSNAP_STAG 0 // for all CEPH_NOSNAP +#define G_SNAPDIR_STAG 1 // for all CEPH_SNAPDIR + +#define MINORBITS 20 +#define MINORMASK ((1U << MINORBITS) - 1) + +#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) +#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) +#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) + +#if defined(__linux__) +#ifndef FUSE_SUPER_MAGIC +#define FUSE_SUPER_MAGIC 0x65735546 +#endif + +#define _CEPH_CLIENT_ID "ceph.client_id" +#endif + +/* + * The dedicated struct for snapid <-> stag map for each ceph + * inode, and the stag is a number in range [2, 0xffff], and + * the stag number 0 is reserved for CEPH_NOSNAP and 1 is + * reserved for CEPH_SNAPDIR. + */ +struct ceph_fuse_fake_inode_stag { + ceph::unordered_map<uint64_t,int> snap_stag_map; // <snapid, stagid> + ceph::unordered_map<int, uint64_t> stag_snap_map; // <stagid, snapid> + int last_stag = 1; +}; + +using namespace std; + +static const ceph::unordered_map<int,int> cephfs_errno_to_system_errno = { + {CEPHFS_EBLOCKLISTED, ESHUTDOWN}, + {CEPHFS_EPERM, EPERM}, + {CEPHFS_ESTALE, ESTALE}, + {CEPHFS_ENOSPC, ENOSPC}, + {CEPHFS_ETIMEDOUT, ETIMEDOUT}, + {CEPHFS_EIO, EIO}, + {CEPHFS_ENOTCONN, ENOTCONN}, + {CEPHFS_EEXIST, EEXIST}, + {CEPHFS_EINTR, EINTR}, + {CEPHFS_EINVAL, EINVAL}, + {CEPHFS_EBADF, EBADF}, + {CEPHFS_EROFS, EROFS}, + {CEPHFS_EAGAIN, EAGAIN}, + {CEPHFS_EACCES, EACCES}, + {CEPHFS_ELOOP, ELOOP}, + {CEPHFS_EISDIR, EISDIR}, + {CEPHFS_ENOENT, ENOENT}, + {CEPHFS_ENOTDIR, ENOTDIR}, + {CEPHFS_ENAMETOOLONG, ENAMETOOLONG}, + {CEPHFS_EBUSY, EBUSY}, + {CEPHFS_EDQUOT, EDQUOT}, + {CEPHFS_EFBIG, EFBIG}, + {CEPHFS_ERANGE, ERANGE}, + {CEPHFS_ENXIO, ENXIO}, + {CEPHFS_ECANCELED, ECANCELED}, + {CEPHFS_ENODATA, ENODATA}, + {CEPHFS_EOPNOTSUPP, EOPNOTSUPP}, + {CEPHFS_EXDEV, EXDEV}, + {CEPHFS_ENOMEM, ENOMEM}, + {CEPHFS_ENOTRECOVERABLE, ENOTRECOVERABLE}, + {CEPHFS_ENOSYS, ENOSYS}, + {CEPHFS_ENOTEMPTY, ENOTEMPTY}, + {CEPHFS_EDEADLK, EDEADLK}, + {CEPHFS_EDOM, EDOM}, + {CEPHFS_EMLINK, EMLINK}, + {CEPHFS_ETIME, ETIME}, + {CEPHFS_EOLDSNAPC, EIO} // forcing to EIO for now +}; + +/* Requirements: + * cephfs_errno >= 0 + */ +static int get_sys_errno(int cephfs_errno) +{ + if (cephfs_errno == 0) + return 0; + + auto it = cephfs_errno_to_system_errno.find(cephfs_errno); + if (it != cephfs_errno_to_system_errno.end()) + return it->second; + return EIO; +} + +static uint32_t new_encode_dev(dev_t dev) +{ + unsigned major = MAJOR(dev); + unsigned minor = MINOR(dev); + return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); +} + +static dev_t new_decode_dev(uint32_t dev) +{ + unsigned major = (dev & 0xfff00) >> 8; + unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); + return MKDEV(major, minor); +} + +class CephFuse::Handle { +public: + Handle(Client *c, int fd); + ~Handle(); + + int init(int argc, const char *argv[]); + int start(); + int loop(); + void finalize(); + + uint64_t fino_snap(uint64_t fino); + uint64_t make_fake_ino(inodeno_t ino, snapid_t snapid); + Inode * iget(fuse_ino_t fino); + void iput(Inode *in); + + int fd_on_success; + Client *client; + + struct fuse_session *se = nullptr; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + struct fuse_cmdline_opts opts; + struct fuse_conn_info_opts *conn_opts; +#else + struct fuse_chan *ch = nullptr; + char *mountpoint = nullptr; +#endif + + ceph::mutex stag_lock = ceph::make_mutex("fuse_ll.cc stag_lock"); + + // a map of <ceph ino, fino stag/snapid map> + ceph::unordered_map<uint64_t, struct ceph_fuse_fake_inode_stag> g_fino_maps; + + pthread_key_t fuse_req_key = 0; + void set_fuse_req(fuse_req_t); + fuse_req_t get_fuse_req(); + + struct fuse_args args; +}; + +#if defined(__linux__) +static int already_fuse_mounted(const char *path, bool &already_mounted) +{ + struct statx path_statx; + struct statx parent_statx; + char path_copy[PATH_MAX] = {0}; + char *parent_path = NULL; + int err = 0; + + already_mounted = false; + + strncpy(path_copy, path, sizeof(path_copy)-1); + parent_path = dirname(path_copy); + + // get stat information for original path + if (-1 == statx(AT_FDCWD, path, AT_STATX_DONT_SYNC, STATX_INO, &path_statx)) { + err = errno; + derr << "fuse_ll: already_fuse_mounted: statx(" << path << ") failed with error " + << cpp_strerror(err) << dendl; + return err; + } + + // if path isn't directory, then it can't be a mountpoint. + if (!(path_statx.stx_mode & S_IFDIR)) { + err = EINVAL; + derr << "fuse_ll: already_fuse_mounted: " + << path << " is not a directory" << dendl; + return err; + } + + // get stat information for parent path + if (-1 == statx(AT_FDCWD, parent_path, AT_STATX_DONT_SYNC, STATX_INO, &parent_statx)) { + err = errno; + derr << "fuse_ll: already_fuse_mounted: statx(" << parent_path << ") failed with error " + << cpp_strerror(err) << dendl; + return err; + } + + // if original path and parent have different device ids, + // then the path is a mount point + // or, if they refer to the same path, then it's probably + // the root directory '/' and therefore path is a mountpoint + if( path_statx.stx_dev_major != parent_statx.stx_dev_major || + path_statx.stx_dev_minor != parent_statx.stx_dev_minor || + ( path_statx.stx_dev_major == parent_statx.stx_dev_major && + path_statx.stx_dev_minor == parent_statx.stx_dev_minor && + path_statx.stx_ino == parent_statx.stx_ino + ) + ) { + struct statfs path_statfs; + if (-1 == statfs(path, &path_statfs)) { + err = errno; + derr << "fuse_ll: already_fuse_mounted: statfs(" << path << ") failed with error " + << cpp_strerror(err) << dendl; + return err; + } + + if(FUSE_SUPER_MAGIC == path_statfs.f_type) { + // if getxattr returns positive length means value exist for ceph.client_id + // then ceph fuse is already mounted on path + char client_id[128] = {0}; + if (getxattr(path, _CEPH_CLIENT_ID, &client_id, sizeof(client_id)) > 0) { + already_mounted = true; + derr << path << " already mounted by " << client_id << dendl; + } + } + } + + return err; +} +#else // non-linux platforms +static int already_fuse_mounted(const char *path, bool &already_mounted) +{ + already_mounted = false; + return 0; +} +#endif + +static int getgroups(fuse_req_t req, gid_t **sgids) +{ +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + ceph_assert(sgids); + int c = fuse_req_getgroups(req, 0, NULL); + if (c < 0) { + return c; + } + if (c == 0) { + return 0; + } + + gid_t *gids = new (std::nothrow) gid_t[c]; + if (!gids) { + return -get_sys_errno(CEPHFS_ENOMEM); + } + c = fuse_req_getgroups(req, c, gids); + if (c < 0) { + delete[] gids; + } else { + *sgids = gids; + } + return c; +#endif + return -get_sys_errno(CEPHFS_ENOSYS); +} + +static void get_fuse_groups(UserPerm& perms, fuse_req_t req) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + if (cfuse->client->cct->_conf.get_val<bool>("fuse_set_user_groups")) { + gid_t *gids = NULL; + int count = getgroups(req, &gids); + + if (count > 0) { + perms.init_gids(gids, count); + } else if (count < 0) { + derr << __func__ << ": getgroups failed: " << cpp_strerror(-count) + << dendl; + } + } +} + + +static CephFuse::Handle *fuse_ll_req_prepare(fuse_req_t req) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + cfuse->set_fuse_req(req); + return cfuse; +} + +static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + struct fuse_entry_param fe; + Inode *i2, *i1 = cfuse->iget(parent); // see below + int r; + UserPerm perms(ctx->uid, ctx->gid); + get_fuse_groups(perms, req); + + if (!i1) + { + r = cfuse->client->lookup_ino(parent, perms, &i1); + if (r < 0) { + fuse_reply_err(req, get_sys_errno(-r)); + return; + } + } + + memset(&fe, 0, sizeof(fe)); + r = cfuse->client->ll_lookup(i1, name, &fe.attr, &i2, perms); + if (r >= 0) { + fe.ino = cfuse->make_fake_ino(fe.attr.st_ino, fe.attr.st_dev); + fe.attr.st_rdev = new_encode_dev(fe.attr.st_rdev); + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + // XXX NB, we dont iput(i2) because FUSE will do so in a matching + // fuse_ll_forget() + cfuse->iput(i1); +} + +// fuse3 has changed forget function signature +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) +static void fuse_ll_forget(fuse_req_t req, fuse_ino_t ino, + uint64_t nlookup) +#else +static void fuse_ll_forget(fuse_req_t req, fuse_ino_t ino, + long unsigned nlookup) +#endif +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Inode *in = cfuse->iget(ino); + if (in) + cfuse->client->ll_forget(in, nlookup+1); + fuse_reply_none(req); +} + +static void fuse_ll_getattr(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + struct stat stbuf; + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + (void) fi; // XXX + + if (cfuse->client->ll_getattr(in, &stbuf, perms) + == 0) { + stbuf.st_ino = cfuse->make_fake_ino(stbuf.st_ino, stbuf.st_dev); + stbuf.st_rdev = new_encode_dev(stbuf.st_rdev); + fuse_reply_attr(req, &stbuf, 0); + } else + fuse_reply_err(req, ENOENT); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int to_set, struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int mask = 0; + if (to_set & FUSE_SET_ATTR_MODE) mask |= CEPH_SETATTR_MODE; + if (to_set & FUSE_SET_ATTR_UID) mask |= CEPH_SETATTR_UID; + if (to_set & FUSE_SET_ATTR_GID) mask |= CEPH_SETATTR_GID; + if (to_set & FUSE_SET_ATTR_MTIME) mask |= CEPH_SETATTR_MTIME; + if (to_set & FUSE_SET_ATTR_ATIME) mask |= CEPH_SETATTR_ATIME; + if (to_set & FUSE_SET_ATTR_SIZE) mask |= CEPH_SETATTR_SIZE; +#if !defined(__APPLE__) + if (to_set & FUSE_SET_ATTR_MTIME_NOW) mask |= CEPH_SETATTR_MTIME_NOW; + if (to_set & FUSE_SET_ATTR_ATIME_NOW) mask |= CEPH_SETATTR_ATIME_NOW; +#endif + + int r = cfuse->client->ll_setattr(in, attr, mask, perms); + if (r == 0) + fuse_reply_attr(req, attr, 0); + else + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +// XATTRS + +static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, + int flags +#if defined(__APPLE__) + ,uint32_t pos +#endif + ) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_setxattr(in, name, value, size, flags, perms); + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + char buf[size]; + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_listxattr(in, buf, size, perms); + if (size == 0 && r >= 0) + fuse_reply_xattr(req, r); + else if (r >= 0) + fuse_reply_buf(req, buf, r); + else + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + size_t size +#if defined(__APPLE__) + ,uint32_t position +#endif + ) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + char buf[size]; + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_getxattr(in, name, buf, size, perms); + if (size == 0 && r >= 0) + fuse_reply_xattr(req, r); + else if (r >= 0) + fuse_reply_buf(req, buf, r); + else + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_removexattr(fuse_req_t req, fuse_ino_t ino, + const char *name) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_removexattr(in, name, perms); + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_opendir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + void *dirp; + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_opendir(in, fi->flags, (dir_result_t **)&dirp, + perms); + if (r >= 0) { + fi->fh = (uint64_t)dirp; + fuse_reply_open(req, fi); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + cfuse->iput(in); // iput required +} + +static void fuse_ll_readlink(fuse_req_t req, fuse_ino_t ino) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + char buf[PATH_MAX + 1]; // leave room for a null terminator + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + int r = cfuse->client->ll_readlink(in, buf, sizeof(buf) - 1, perms); + if (r >= 0) { + buf[r] = '\0'; + fuse_reply_readlink(req, buf); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + cfuse->iput(in); // iput required +} + +static void fuse_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, dev_t rdev) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + struct fuse_entry_param fe; + UserPerm perms(ctx->uid, ctx->gid); + Inode *i2, *i1 = cfuse->iget(parent); + if (!i1) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + memset(&fe, 0, sizeof(fe)); + + int r = cfuse->client->ll_mknod(i1, name, mode, new_decode_dev(rdev), + &fe.attr, &i2, perms); + if (r == 0) { + fe.ino = cfuse->make_fake_ino(fe.attr.st_ino, fe.attr.st_dev); + fe.attr.st_rdev = new_encode_dev(fe.attr.st_rdev); + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + // XXX NB, we dont iput(i2) because FUSE will do so in a matching + // fuse_ll_forget() + cfuse->iput(i1); // iput required +} + +static void fuse_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + Inode *i2, *i1; + struct fuse_entry_param fe; + + memset(&fe, 0, sizeof(fe)); + UserPerm perm(ctx->uid, ctx->gid); + get_fuse_groups(perm, req); +#ifdef HAVE_SYS_SYNCFS + auto fuse_multithreaded = cfuse->client->cct->_conf.get_val<bool>( + "fuse_multithreaded"); + auto fuse_syncfs_on_mksnap = cfuse->client->cct->_conf.get_val<bool>( + "fuse_syncfs_on_mksnap"); + if (cfuse->fino_snap(parent) == CEPH_SNAPDIR && + fuse_multithreaded && fuse_syncfs_on_mksnap) { + int err = 0; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + int fd = ::open(cfuse->opts.mountpoint, O_RDONLY | O_DIRECTORY | O_CLOEXEC); +#else + int fd = ::open(cfuse->mountpoint, O_RDONLY | O_DIRECTORY | O_CLOEXEC); +#endif + if (fd < 0) { + err = errno; + } else { + int r = ::syncfs(fd); + if (r < 0) + err = errno; + ::close(fd); + } + if (err) { + fuse_reply_err(req, err); + return; + } + } +#endif + + i1 = cfuse->iget(parent); + if (!i1) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + int r = cfuse->client->ll_mkdir(i1, name, mode, &fe.attr, &i2, perm); + if (r == 0) { + fe.ino = cfuse->make_fake_ino(fe.attr.st_ino, fe.attr.st_dev); + fe.attr.st_rdev = new_encode_dev(fe.attr.st_rdev); + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + // XXX NB, we dont iput(i2) because FUSE will do so in a matching + // fuse_ll_forget() + cfuse->iput(i1); // iput required +} + +static void fuse_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perm(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(parent); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perm, req); + + int r = cfuse->client->ll_unlink(in, name, perm); + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(parent); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_rmdir(in, name, perms); + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_symlink(fuse_req_t req, const char *existing, + fuse_ino_t parent, const char *name) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + struct fuse_entry_param fe; + UserPerm perms(ctx->uid, ctx->gid); + Inode *i2, *i1 = cfuse->iget(parent); + if (!i1) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + memset(&fe, 0, sizeof(fe)); + + int r = cfuse->client->ll_symlink(i1, name, existing, &fe.attr, &i2, perms); + if (r == 0) { + fe.ino = cfuse->make_fake_ino(fe.attr.st_ino, fe.attr.st_dev); + fe.attr.st_rdev = new_encode_dev(fe.attr.st_rdev); + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + // XXX NB, we dont iput(i2) because FUSE will do so in a matching + // fuse_ll_forget() + cfuse->iput(i1); // iput required +} + +static void fuse_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_ino_t newparent, const char *newname +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , unsigned int flags +#endif + ) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perm(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(parent); + Inode *nin = cfuse->iget(newparent); + if (!in || !nin) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perm, req); + + int r = cfuse->client->ll_rename(in, name, nin, newname, perm); + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iputs required + cfuse->iput(nin); +} + +static void fuse_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, + const char *newname) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + struct fuse_entry_param fe; + Inode *in = cfuse->iget(ino); + Inode *nin = cfuse->iget(newparent); + if (!in || !nin) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + memset(&fe, 0, sizeof(fe)); + UserPerm perm(ctx->uid, ctx->gid); + get_fuse_groups(perm, req); + + /* + * Note that we could successfully link, but then fail the subsequent + * getattr and return an error. Perhaps we should ignore getattr errors, + * but then how do we tell FUSE that the attrs are bogus? + */ + int r = cfuse->client->ll_link(in, nin, newname, perm); + if (r == 0) { + r = cfuse->client->ll_getattr(in, &fe.attr, perm); + if (r == 0) { + fe.ino = cfuse->make_fake_ino(fe.attr.st_ino, fe.attr.st_dev); + fe.attr.st_rdev = new_encode_dev(fe.attr.st_rdev); + fuse_reply_entry(req, &fe); + } + } + + if (r != 0) { + /* + * Many ll operations in libcephfs return an extra inode reference, but + * ll_link currently does not. Still, FUSE needs one for the new dentry, + * so we commandeer the reference taken earlier when ll_link is successful. + * On error however, we must put that reference. + */ + cfuse->iput(in); + fuse_reply_err(req, get_sys_errno(-r)); + } + + cfuse->iput(nin); +} + +static void fuse_ll_open(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + Fh *fh = NULL; + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_open(in, fi->flags, &fh, perms); + if (r == 0) { + fi->fh = (uint64_t)fh; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + auto fuse_disable_pagecache = cfuse->client->cct->_conf.get_val<bool>( + "fuse_disable_pagecache"); + auto fuse_use_invalidate_cb = cfuse->client->cct->_conf.get_val<bool>( + "fuse_use_invalidate_cb"); + if (fuse_disable_pagecache) + fi->direct_io = 1; + else if (fuse_use_invalidate_cb) + fi->keep_cache = 1; +#endif + fuse_reply_open(req, fi); + } else { + fuse_reply_err(req, get_sys_errno(-r)); + } + + cfuse->iput(in); // iput required +} + +static void fuse_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + bufferlist bl; + int r = cfuse->client->ll_read(fh, off, size, &bl); + if (r >= 0) { + vector<iovec> iov; + size_t len; + struct fuse_bufvec *bufv; + + if (bl.get_num_buffers() > IOV_MAX) + bl.rebuild(); + + bl.prepare_iov(&iov); + len = sizeof(struct fuse_bufvec) + sizeof(struct fuse_buf) * (iov.size() - 1); + bufv = (struct fuse_bufvec *)calloc(1, len); + if (bufv) { + int i = 0; + bufv->count = iov.size(); + for (auto &v: iov) { + bufv->buf[i].mem = v.iov_base; + bufv->buf[i++].size = v.iov_len; + } + fuse_reply_data(req, bufv, FUSE_BUF_SPLICE_MOVE); + free(bufv); + return; + } + iov.insert(iov.begin(), {0}); // the first one is reserved for fuse_out_header + fuse_reply_iov(req, &iov[0], iov.size()); + } else + fuse_reply_err(req, get_sys_errno(-r)); +} + +static void fuse_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, + size_t size, off_t off, struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + int r = cfuse->client->ll_write(fh, off, size, buf); + if (r >= 0) + fuse_reply_write(req, r); + else + fuse_reply_err(req, get_sys_errno(-r)); +} + +static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + int r = cfuse->client->ll_flush(fh); + fuse_reply_err(req, get_sys_errno(-r)); +} + +#ifdef FUSE_IOCTL_COMPAT +static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 5) + unsigned int cmd, +#else + int cmd, +#endif + void *arg, struct fuse_file_info *fi, + unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (static_cast<unsigned>(cmd)) { + case CEPH_IOC_GET_LAYOUT: { + file_layout_t layout; + struct ceph_ioctl_layout l; + Fh *fh = (Fh*)fi->fh; + cfuse->client->ll_file_layout(fh, &layout); + l.stripe_unit = layout.stripe_unit; + l.stripe_count = layout.stripe_count; + l.object_size = layout.object_size; + l.data_pool = layout.pool_id; + fuse_reply_ioctl(req, 0, &l, sizeof(struct ceph_ioctl_layout)); + } + break; + default: + fuse_reply_err(req, EINVAL); + } +} +#endif + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) + +static void fuse_ll_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, + off_t offset, off_t length, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = (Fh*)fi->fh; + int r = cfuse->client->ll_fallocate(fh, mode, offset, length); + fuse_reply_err(req, get_sys_errno(-r)); +} + +#endif + +static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + int r = cfuse->client->ll_release(fh); + fuse_reply_err(req, get_sys_errno(-r)); +} + +static void fuse_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + int r = cfuse->client->ll_fsync(fh, datasync); + fuse_reply_err(req, get_sys_errno(-r)); +} + +struct readdir_context { + fuse_req_t req; + char *buf; + size_t size; + size_t pos; /* in buf */ + uint64_t snap; +}; + +/* + * return 0 on success, -1 if out of space + */ +static int fuse_ll_add_dirent(void *p, struct dirent *de, + struct ceph_statx *stx, off_t next_off, + Inode *in) +{ + struct readdir_context *c = (struct readdir_context *)p; + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(c->req); + + struct stat st; + st.st_ino = cfuse->make_fake_ino(stx->stx_ino, c->snap); + st.st_mode = stx->stx_mode; + st.st_rdev = new_encode_dev(stx->stx_rdev); + + size_t room = c->size - c->pos; + size_t entrysize = fuse_add_direntry(c->req, c->buf + c->pos, room, + de->d_name, &st, next_off); + if (entrysize > room) + return -ENOSPC; + + /* success */ + c->pos += entrysize; + return 0; +} + +static void fuse_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t off, struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + + dir_result_t *dirp = reinterpret_cast<dir_result_t*>(fi->fh); + cfuse->client->seekdir(dirp, off); + + struct readdir_context rc; + rc.req = req; + rc.snap = cfuse->fino_snap(ino); + if (rc.snap == CEPH_MAXSNAP) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + rc.buf = new char[size]; + rc.size = size; + rc.pos = 0; + + int r = cfuse->client->readdir_r_cb(dirp, fuse_ll_add_dirent, &rc); + if (r == 0 || r == -CEPHFS_ENOSPC) /* ignore ENOSPC from our callback */ + fuse_reply_buf(req, rc.buf, rc.pos); + else + fuse_reply_err(req, get_sys_errno(-r)); + delete[] rc.buf; +} + +static void fuse_ll_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + dir_result_t *dirp = reinterpret_cast<dir_result_t*>(fi->fh); + cfuse->client->ll_releasedir(dirp); + fuse_reply_err(req, 0); +} + +static void fuse_ll_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + dir_result_t *dirp = reinterpret_cast<dir_result_t*>(fi->fh); + int r = cfuse->client->ll_fsyncdir(dirp); + fuse_reply_err(req, get_sys_errno(-r)); +} + +static void fuse_ll_access(fuse_req_t req, fuse_ino_t ino, int mask) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->inode_permission(in, perms, mask); + fuse_reply_err(req, get_sys_errno(-r)); + cfuse->iput(in); +} + +static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + struct fuse_entry_param fe; + Fh *fh = NULL; + UserPerm perms(ctx->uid, ctx->gid); + Inode *i1 = cfuse->iget(parent), *i2; + if (!i1) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + memset(&fe, 0, sizeof(fe)); + + // pass &i2 for the created inode so that ll_create takes an initial ll_ref + int r = cfuse->client->ll_create(i1, name, mode, fi->flags, &fe.attr, &i2, + &fh, perms); + if (r == 0) { + fi->fh = (uint64_t)fh; + fe.ino = cfuse->make_fake_ino(fe.attr.st_ino, fe.attr.st_dev); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + auto fuse_disable_pagecache = cfuse->client->cct->_conf.get_val<bool>( + "fuse_disable_pagecache"); + auto fuse_use_invalidate_cb = cfuse->client->cct->_conf.get_val<bool>( + "fuse_use_invalidate_cb"); + if (fuse_disable_pagecache) + fi->direct_io = 1; + else if (fuse_use_invalidate_cb) + fi->keep_cache = 1; +#endif + fuse_reply_create(req, &fe, fi); + } else + fuse_reply_err(req, get_sys_errno(-r)); + // XXX NB, we dont iput(i2) because FUSE will do so in a matching + // fuse_ll_forget() + cfuse->iput(i1); // iput required +} + +static void fuse_ll_statfs(fuse_req_t req, fuse_ino_t ino) +{ + struct statvfs stbuf; + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + UserPerm perms(ctx->uid, ctx->gid); + Inode *in = cfuse->iget(ino); + if (!in) { + fuse_reply_err(req, get_sys_errno(CEPHFS_EINVAL)); + return; + } + + get_fuse_groups(perms, req); + + int r = cfuse->client->ll_statfs(in, &stbuf, perms); + if (r == 0) + fuse_reply_statfs(req, &stbuf); + else + fuse_reply_err(req, get_sys_errno(-r)); + + cfuse->iput(in); // iput required +} + +static void fuse_ll_getlk(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi, struct flock *lock) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + + int r = cfuse->client->ll_getlk(fh, lock, fi->lock_owner); + if (r == 0) + fuse_reply_lock(req, lock); + else + fuse_reply_err(req, get_sys_errno(-r)); +} + +static void fuse_ll_setlk(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi, struct flock *lock, int sleep) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = reinterpret_cast<Fh*>(fi->fh); + + // must use multithread if operation may block + auto fuse_multithreaded = cfuse->client->cct->_conf.get_val<bool>( + "fuse_multithreaded"); + if (!fuse_multithreaded && sleep && lock->l_type != F_UNLCK) { + fuse_reply_err(req, EDEADLK); + return; + } + + int r = cfuse->client->ll_setlk(fh, lock, fi->lock_owner, sleep); + fuse_reply_err(req, get_sys_errno(-r)); +} + +static void fuse_ll_interrupt(fuse_req_t req, void* data) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + cfuse->client->ll_interrupt(data); +} + +static void switch_interrupt_cb(void *handle, void* data) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_req_t req = cfuse->get_fuse_req(); + + if (data) + fuse_req_interrupt_func(req, fuse_ll_interrupt, data); + else + fuse_req_interrupt_func(req, NULL, NULL); +} + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) +static void fuse_ll_flock(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi, int cmd) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); + Fh *fh = (Fh*)fi->fh; + + // must use multithread if operation may block + auto fuse_multithreaded = cfuse->client->cct->_conf.get_val<bool>( + "fuse_multithreaded"); + if (!fuse_multithreaded && !(cmd & (LOCK_NB | LOCK_UN))) { + fuse_reply_err(req, EDEADLK); + return; + } + + int r = cfuse->client->ll_flock(fh, cmd, fi->lock_owner); + fuse_reply_err(req, get_sys_errno(-r)); +} +#endif + +#if !defined(__APPLE__) +static mode_t umask_cb(void *handle) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_req_t req = cfuse->get_fuse_req(); + const struct fuse_ctx *ctx = fuse_req_ctx(req); + return ctx->umask; +} +#endif + +static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, + int64_t len) +{ +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_ino_t fino = cfuse->make_fake_ino(vino.ino, vino.snapid); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + fuse_lowlevel_notify_inval_inode(cfuse->se, fino, off, len); +#else + fuse_lowlevel_notify_inval_inode(cfuse->ch, fino, off, len); +#endif +#endif +} + +static void dentry_invalidate_cb(void *handle, vinodeno_t dirino, + vinodeno_t ino, const char *name, size_t len) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) + fuse_ino_t fino = 0; + if (ino.ino != inodeno_t()) + fino = cfuse->make_fake_ino(ino.ino, ino.snapid); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + fuse_lowlevel_notify_delete(cfuse->se, fdirino, fino, name, len); +#else + fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name, len); +#endif +#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name, len); +#endif +} + +static int remount_cb(void *handle) +{ + // used for trimming kernel dcache. when remounting a file system, linux kernel + // trims all unused dentries in the file system + char cmd[128+PATH_MAX]; + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + snprintf(cmd, sizeof(cmd), "LIBMOUNT_FSTAB=/dev/null mount -i -o remount %s", +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + cfuse->opts.mountpoint); +#else + cfuse->mountpoint); +#endif + int r = system(cmd); + if (r != 0 && r != -1) { + r = WEXITSTATUS(r); + } + + return r; +} + +static void do_init(void *data, fuse_conn_info *conn) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)data; + Client *client = cfuse->client; + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + fuse_apply_conn_info_opts(cfuse->conn_opts, conn); +#endif + + if(conn->capable & FUSE_CAP_SPLICE_MOVE) + conn->want |= FUSE_CAP_SPLICE_MOVE; + +#if !defined(__APPLE__) + if (!client->fuse_default_permissions && client->ll_handle_umask()) { + // apply umask in userspace if posix acl is enabled + if(conn->capable & FUSE_CAP_DONT_MASK) + conn->want |= FUSE_CAP_DONT_MASK; + } + if(conn->capable & FUSE_CAP_EXPORT_SUPPORT) + conn->want |= FUSE_CAP_EXPORT_SUPPORT; +#endif + + if (cfuse->fd_on_success) { + //cout << "fuse init signaling on fd " << fd_on_success << std::endl; + // see Preforker::daemonize(), ceph-fuse's parent process expects a `-1` + // from a daemonized child process. + uint32_t r = -1; + int err = safe_write(cfuse->fd_on_success, &r, sizeof(r)); + if (err) { + derr << "fuse_ll: do_init: safe_write failed with error " + << cpp_strerror(err) << dendl; + ceph_abort(); + } + //cout << "fuse init done signaling on fd " << fd_on_success << std::endl; + + // close stdout, etc. + ::close(0); + ::close(1); + ::close(2); + } +} + +const static struct fuse_lowlevel_ops fuse_ll_oper = { + init: do_init, + destroy: 0, + lookup: fuse_ll_lookup, + forget: fuse_ll_forget, + getattr: fuse_ll_getattr, + setattr: fuse_ll_setattr, + readlink: fuse_ll_readlink, + mknod: fuse_ll_mknod, + mkdir: fuse_ll_mkdir, + unlink: fuse_ll_unlink, + rmdir: fuse_ll_rmdir, + symlink: fuse_ll_symlink, + rename: fuse_ll_rename, + link: fuse_ll_link, + open: fuse_ll_open, + read: fuse_ll_read, + write: fuse_ll_write, + flush: fuse_ll_flush, + release: fuse_ll_release, + fsync: fuse_ll_fsync, + opendir: fuse_ll_opendir, + readdir: fuse_ll_readdir, + releasedir: fuse_ll_releasedir, + fsyncdir: fuse_ll_fsyncdir, + statfs: fuse_ll_statfs, + setxattr: fuse_ll_setxattr, + getxattr: fuse_ll_getxattr, + listxattr: fuse_ll_listxattr, + removexattr: fuse_ll_removexattr, + access: fuse_ll_access, + create: fuse_ll_create, + getlk: fuse_ll_getlk, + setlk: fuse_ll_setlk, + bmap: 0, +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) +#ifdef FUSE_IOCTL_COMPAT + ioctl: fuse_ll_ioctl, +#else + ioctl: 0, +#endif + poll: 0, +#endif +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) + write_buf: 0, + retrieve_reply: 0, + forget_multi: 0, + flock: fuse_ll_flock, +#endif +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) + fallocate: fuse_ll_fallocate +#endif +}; + + +CephFuse::Handle::Handle(Client *c, int fd) : + fd_on_success(fd), + client(c) +{ + memset(&args, 0, sizeof(args)); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + memset(&opts, 0, sizeof(opts)); +#endif +} + +CephFuse::Handle::~Handle() +{ + fuse_opt_free_args(&args); +} + +void CephFuse::Handle::finalize() +{ +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + if (se) { + fuse_remove_signal_handlers(se); + fuse_session_unmount(se); + fuse_session_destroy(se); + } + if (conn_opts) + free(conn_opts); + if (opts.mountpoint) + free(opts.mountpoint); +#else + if (se) + fuse_remove_signal_handlers(se); + if (ch) + fuse_session_remove_chan(ch); + if (se) + fuse_session_destroy(se); + if (ch) + fuse_unmount(mountpoint, ch); +#endif + + pthread_key_delete(fuse_req_key); +} + +int CephFuse::Handle::init(int argc, const char *argv[]) +{ + + int r = pthread_key_create(&fuse_req_key, NULL); + if (r) { + derr << "pthread_key_create failed." << dendl; + return r; + } + + // set up fuse argc/argv + int newargc = 0; + const char **newargv = (const char **) malloc((argc + 17) * sizeof(char *)); + if(!newargv) + return ENOMEM; + + newargv[newargc++] = argv[0]; + newargv[newargc++] = "-f"; // stay in foreground + + auto fuse_allow_other = client->cct->_conf.get_val<bool>( + "fuse_allow_other"); + auto fuse_default_permissions = client->cct->_conf.get_val<bool>( + "fuse_default_permissions"); +#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0) + auto fuse_big_writes = client->cct->_conf.get_val<bool>( + "fuse_big_writes"); +#endif + auto fuse_max_write = client->cct->_conf.get_val<Option::size_t>( + "fuse_max_write"); + auto fuse_atomic_o_trunc = client->cct->_conf.get_val<bool>( + "fuse_atomic_o_trunc"); + auto fuse_splice_read = client->cct->_conf.get_val<bool>( + "fuse_splice_read"); + auto fuse_splice_write = client->cct->_conf.get_val<bool>( + "fuse_splice_write"); + auto fuse_splice_move = client->cct->_conf.get_val<bool>( + "fuse_splice_move"); + auto fuse_debug = client->cct->_conf.get_val<bool>( + "fuse_debug"); + + if (fuse_allow_other) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "allow_other"; + } + if (fuse_default_permissions) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "default_permissions"; + } +#if defined(__linux__) +#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0) + if (fuse_big_writes) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "big_writes"; + } +#endif + if (fuse_max_write > 0) { + char strsplice[65]; + newargv[newargc++] = "-o"; + sprintf(strsplice, "max_write=%zu", (size_t)fuse_max_write); + newargv[newargc++] = strsplice; + } + if (fuse_atomic_o_trunc) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "atomic_o_trunc"; + } + if (fuse_splice_read) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "splice_read"; + } + if (fuse_splice_write) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "splice_write"; + } + if (fuse_splice_move) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "splice_move"; + } +#endif + if (fuse_debug) + newargv[newargc++] = "-d"; + + for (int argctr = 1; argctr < argc; argctr++) + newargv[newargc++] = argv[argctr]; + + derr << "init, newargv = " << newargv << " newargc=" << newargc << dendl; + struct fuse_args a = FUSE_ARGS_INIT(newargc, (char**)newargv); + args = a; // Roundabout construction b/c FUSE_ARGS_INIT is for initialization not assignment + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + if (fuse_parse_cmdline(&args, &opts) == -1) { +#else + if (fuse_parse_cmdline(&args, &mountpoint, NULL, NULL) == -1) { +#endif + derr << "fuse_parse_cmdline failed." << dendl; + fuse_opt_free_args(&args); + free(newargv); + return EINVAL; + } + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + derr << "init, args.argv = " << args.argv << " args.argc=" << args.argc << dendl; + conn_opts = fuse_parse_conn_info_opts(&args); + if (!conn_opts) { + derr << "fuse_parse_conn_info_opts failed" << dendl; + fuse_opt_free_args(&args); + free(newargv); + return EINVAL; + } +#endif + + ceph_assert(args.allocated); // Checking fuse has realloc'd args so we can free newargv + free(newargv); + + struct ceph_client_callback_args cb_args = { + handle: this, + ino_cb: client->cct->_conf.get_val<bool>("fuse_use_invalidate_cb") ? + ino_invalidate_cb : NULL, + dentry_cb: dentry_invalidate_cb, + switch_intr_cb: switch_interrupt_cb, +#if defined(__linux__) + remount_cb: remount_cb, +#endif +#if !defined(__APPLE__) + umask_cb: umask_cb, +#endif + }; + r = client->ll_register_callbacks2(&cb_args); + if (r) { + derr << "registering callbacks failed: " << r << dendl; + return r; + } + + return 0; +} + +int CephFuse::Handle::start() +{ + bool is_mounted = false; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + int err = already_fuse_mounted(opts.mountpoint, is_mounted); +#else + int err = already_fuse_mounted(mountpoint, is_mounted); +#endif + if (err) { + return err; + } + + if (is_mounted) { + return EBUSY; + } + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + se = fuse_session_new(&args, &fuse_ll_oper, sizeof(fuse_ll_oper), this); + if (!se) { + derr << "fuse_session_new failed" << dendl; + return EDOM; + } +#else + ch = fuse_mount(mountpoint, &args); + if (!ch) { + derr << "fuse_mount(mountpoint=" << mountpoint << ") failed." << dendl; + return EIO; + } + + se = fuse_lowlevel_new(&args, &fuse_ll_oper, sizeof(fuse_ll_oper), this); + if (!se) { + derr << "fuse_lowlevel_new failed" << dendl; + return EDOM; + } +#endif + + signal(SIGTERM, SIG_DFL); + signal(SIGINT, SIG_DFL); + if (fuse_set_signal_handlers(se) == -1) { + derr << "fuse_set_signal_handlers failed" << dendl; + return ENOSYS; + } + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + if (fuse_session_mount(se, opts.mountpoint) != 0) { + derr << "fuse_session_mount failed" << dendl; + return ENOSYS; + } +#else + fuse_session_add_chan(se, ch); +#endif + + return 0; +} + +int CephFuse::Handle::loop() +{ + auto fuse_multithreaded = client->cct->_conf.get_val<bool>( + "fuse_multithreaded"); + if (fuse_multithreaded) { +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 12) + { + struct fuse_loop_config *conf = fuse_loop_cfg_create(); + ceph_assert(conf != nullptr); + + fuse_loop_cfg_set_clone_fd(conf, opts.clone_fd); + fuse_loop_cfg_set_idle_threads(conf, opts.max_idle_threads); + fuse_loop_cfg_set_max_threads(conf, opts.max_threads); + + int r = fuse_session_loop_mt(se, conf); + + fuse_loop_cfg_destroy(conf); + return r; + } +#elif FUSE_VERSION >= FUSE_MAKE_VERSION(3, 1) + { + struct fuse_loop_config conf = { + clone_fd: opts.clone_fd, + max_idle_threads: opts.max_idle_threads + }; + return fuse_session_loop_mt(se, &conf); + } +#elif FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + return fuse_session_loop_mt(se, opts.clone_fd); +#else + return fuse_session_loop_mt(se); +#endif + } else { + return fuse_session_loop(se); + } +} + +uint64_t CephFuse::Handle::fino_snap(uint64_t fino) +{ + if (fino == FUSE_ROOT_ID) + return CEPH_NOSNAP; + + if (client->use_faked_inos()) { + vinodeno_t vino = client->map_faked_ino(fino); + return vino.snapid; + } else { + std::lock_guard l(stag_lock); + uint64_t stag = FINO_STAG(fino); + if (stag == 0) + return CEPH_NOSNAP; + else if (stag == 1) + return CEPH_SNAPDIR; + + inodeno_t ino = FINO_INO(fino); + + // does the fino_maps for the ino exist ? + if (!g_fino_maps.count(ino)) + return CEPH_MAXSNAP; + + auto &fino_maps = g_fino_maps[ino]; + + // does the stagid <--> snapid map exist ? + if (!fino_maps.stag_snap_map.count(stag)) + return CEPH_MAXSNAP; + + // get the snapid + return fino_maps.stag_snap_map[stag]; + } +} + +Inode * CephFuse::Handle::iget(fuse_ino_t fino) +{ + if (fino == FUSE_ROOT_ID) + return client->get_root(); + + if (client->use_faked_inos()) { + return client->ll_get_inode((ino_t)fino); + } else { + uint64_t snap = fino_snap(fino); + if (snap == CEPH_MAXSNAP) + return NULL; + vinodeno_t vino(FINO_INO(fino), snap); + return client->ll_get_inode(vino); + } +} + +void CephFuse::Handle::iput(Inode *in) +{ + client->ll_put(in); +} + +uint64_t CephFuse::Handle::make_fake_ino(inodeno_t ino, snapid_t snapid) +{ + if (client->use_faked_inos()) { + // already faked by libcephfs + if (ino == client->get_root_ino()) + return FUSE_ROOT_ID; + + return ino; + } else { + if (snapid == CEPH_NOSNAP && ino == client->get_root_ino()) + return FUSE_ROOT_ID; + + int stag; + if (snapid == CEPH_NOSNAP) { + stag = G_NOSNAP_STAG; + } else if (snapid == CEPH_SNAPDIR) { + stag = G_SNAPDIR_STAG; + } else { + std::lock_guard l(stag_lock); + auto &fino_maps = g_fino_maps[ino]; // will insert it anyway if not exists + + // already exist ? + if (fino_maps.snap_stag_map.count(snapid)) { + inodeno_t fino = MAKE_FINO(ino, fino_maps.snap_stag_map[snapid]); + return fino; + } + + // create a new snapid <--> stagid map + int first = fino_maps.last_stag & STAG_MASK; + stag = (++fino_maps.last_stag) & STAG_MASK; + for (; stag != first; stag = (++fino_maps.last_stag) & STAG_MASK) { + // stag 0 is reserved for CEPH_NOSNAP and 1 for CEPH_SNAPDIR + if (stag == 0 || stag == 1) + continue; + + // the new stag is not used ? + if (!fino_maps.stag_snap_map.count(stag)) { + fino_maps.snap_stag_map[snapid] = stag; + fino_maps.stag_snap_map[stag] = snapid; + break; + } + + // the stag is already used by a snpaid, + // try to free it + auto _snapid = fino_maps.stag_snap_map[stag]; + if (!client->ll_get_snap_ref(_snapid)) { + fino_maps.snap_stag_map.erase(_snapid); + fino_maps.snap_stag_map[snapid] = stag; + fino_maps.stag_snap_map[stag] = snapid; + break; + } + } + if (stag == first) { + /* + * It shouldn't be here because the max snapshots for each + * directory is 4_K, and here we have around 64_K, which is + * from 0xffff - 2, stags could be used for each directory. + * + * More detail please see mds 'mds_max_snaps_per_dir' option. + */ + ceph_abort_msg("run out of stag"); + } + } + + inodeno_t fino = MAKE_FINO(ino, stag); + //cout << "make_fake_ino " << ino << "." << snapid << " -> " << fino << std::endl; + return fino; + } +} + +void CephFuse::Handle::set_fuse_req(fuse_req_t req) +{ + pthread_setspecific(fuse_req_key, (void*)req); +} + +fuse_req_t CephFuse::Handle::get_fuse_req() +{ + return (fuse_req_t) pthread_getspecific(fuse_req_key); +} + + +CephFuse::CephFuse(Client *c, int fd) : _handle(new CephFuse::Handle(c, fd)) +{ +} + +CephFuse::~CephFuse() +{ + delete _handle; +} + +int CephFuse::init(int argc, const char *argv[]) +{ + return _handle->init(argc, argv); +} + +int CephFuse::start() +{ + return _handle->start(); +} + +int CephFuse::loop() +{ + return _handle->loop(); +} + +void CephFuse::finalize() +{ + return _handle->finalize(); +} + +std::string CephFuse::get_mount_point() const +{ +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + if (_handle->opts.mountpoint) { + return _handle->opts.mountpoint; +#else + if (_handle->mountpoint) { + return _handle->mountpoint; +#endif + } else { + return ""; + } +} diff --git a/src/client/fuse_ll.h b/src/client/fuse_ll.h new file mode 100644 index 000000000..85e587baf --- /dev/null +++ b/src/client/fuse_ll.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +class CephFuse { +public: + CephFuse(Client *c, int fd); + ~CephFuse(); + int init(int argc, const char *argv[]); + int start(); + int mount(); + int loop(); + void finalize(); + class Handle; + std::string get_mount_point() const; +private: + CephFuse::Handle *_handle; +}; diff --git a/src/client/hypertable/CephBroker.cc b/src/client/hypertable/CephBroker.cc new file mode 100644 index 000000000..596e72287 --- /dev/null +++ b/src/client/hypertable/CephBroker.cc @@ -0,0 +1,526 @@ +/** -*- C++ -*- + * Copyright (C) 2009-2011 New Dream Network + * + * This file is part of Hypertable. + * + * Hypertable is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or any later version. + * + * Hypertable is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Hypertable. If not, see <http://www.gnu.org/licenses/> + * + * Authors: + * Gregory Farnum <gfarnum@gmail.com> + * Colin McCabe <cmccabe@alumni.cmu.edu> + */ + +#include "Common/Compat.h" + +#include "CephBroker.h" +#include "Common/Error.h" +#include "Common/FileUtils.h" +#include "Common/Filesystem.h" +#include "Common/System.h" + +#include <cephfs/libcephfs.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <string> +#include <sys/types.h> +#include <sys/uio.h> +#include <unistd.h> + +using namespace Hypertable; + +std::atomic<int> CephBroker::ms_next_fd{0}; + +/* A thread-safe version of strerror */ +static std::string cpp_strerror(int err) +{ + char buf[128]; + if (err < 0) + err = -err; + std::ostringstream oss; + oss << strerror_r(err, buf, sizeof(buf)); + return oss.str(); +} + +OpenFileDataCeph::OpenFileDataCeph(struct ceph_mount_info *cmount_, const String& fname, + int _fd, int _flags) + : cmount(cmount_), fd(_fd), flags(_flags), filename(fname) +{ +} + +OpenFileDataCeph::~OpenFileDataCeph() { + ceph_close(cmount, fd); +} + +CephBroker::CephBroker(PropertiesPtr& cfg) + : cmount(NULL) +{ + int ret; + String id(cfg->get_str("CephBroker.Id")); + m_verbose = cfg->get_bool("Hypertable.Verbose"); + m_root_dir = cfg->get_str("CephBroker.RootDir"); + String mon_addr(cfg->get_str("CephBroker.MonAddr")); + + HT_INFO("Calling ceph_create"); + ret = ceph_create(&cmount, id.empty() ? NULL : id.c_str()); + if (ret) { + throw Hypertable::Exception(ret, "ceph_create failed"); + } + ret = ceph_conf_set(cmount, "mon_host", mon_addr.c_str()); + if (ret) { + ceph_shutdown(cmount); + throw Hypertable::Exception(ret, "ceph_conf_set(mon_addr) failed"); + } + + // For Ceph debugging, uncomment these lines + //ceph_conf_set(cmount, "debug_client", "1"); + //ceph_conf_set(cmount, "debug_ms", "1"); + + HT_INFO("Calling ceph_mount"); + ret = ceph_mount(cmount, m_root_dir.empty() ? NULL : m_root_dir.c_str()); + if (ret) { + ceph_shutdown(cmount); + throw Hypertable::Exception(ret, "ceph_mount failed"); + } + HT_INFO("Mounted Ceph filesystem."); +} + +CephBroker::~CephBroker() +{ + ceph_shutdown(cmount); + cmount = NULL; +} + +void CephBroker::open(ResponseCallbackOpen *cb, const char *fname, + uint32_t flags, uint32_t bufsz) { + int fd, ceph_fd; + String abspath; + HT_DEBUGF("open file='%s' bufsz=%d", fname, bufsz); + + make_abs_path(fname, abspath); + + fd = atomic_inc_return(&ms_next_fd); + + if ((ceph_fd = ceph_open(cmount, abspath.c_str(), O_RDONLY, 0)) < 0) { + report_error(cb, -ceph_fd); + return; + } + HT_INFOF("open (%s) fd=%" PRIu32 " ceph_fd=%d", fname, fd, ceph_fd); + + { + struct sockaddr_in addr; + OpenFileDataCephPtr fdata(new OpenFileDataCeph(cmount, abspath, ceph_fd, O_RDONLY)); + + cb->get_address(addr); + + m_open_file_map.create(fd, addr, fdata); + + cb->response(fd); + } +} + +void CephBroker::create(ResponseCallbackOpen *cb, const char *fname, uint32_t flags, + int32_t bufsz, int16_t replication, int64_t blksz){ + int fd, ceph_fd; + int oflags; + String abspath; + + make_abs_path(fname, abspath); + HT_DEBUGF("create file='%s' flags=%u bufsz=%d replication=%d blksz=%lld", + fname, flags, bufsz, (int)replication, (Lld)blksz); + + fd = atomic_inc_return(&ms_next_fd); + + if (flags & Filesystem::OPEN_FLAG_OVERWRITE) + oflags = O_WRONLY | O_CREAT | O_TRUNC; + else + oflags = O_WRONLY | O_CREAT | O_APPEND; + + //make sure the directories in the path exist + String directory = abspath.substr(0, abspath.rfind('/')); + int r; + HT_INFOF("Calling mkdirs on %s", directory.c_str()); + if((r=ceph_mkdirs(cmount, directory.c_str(), 0644)) < 0 && r!=-CEPHFS_EEXIST) { + HT_ERRORF("create failed on mkdirs: dname='%s' - %d", directory.c_str(), -r); + report_error(cb, -r); + return; + } + + //create file + if ((ceph_fd = ceph_open(cmount, abspath.c_str(), oflags, 0644)) < 0) { + std::string errs(cpp_strerror(-ceph_fd)); + HT_ERRORF("open failed: file=%s - %s", abspath.c_str(), errs.c_str()); + report_error(cb, ceph_fd); + return; + } + + HT_INFOF("create %s = %d", fname, ceph_fd); + + { + struct sockaddr_in addr; + OpenFileDataCephPtr fdata (new OpenFileDataCeph(cmount, fname, ceph_fd, O_WRONLY)); + + cb->get_address(addr); + + m_open_file_map.create(fd, addr, fdata); + + cb->response(fd); + } +} + +void CephBroker::close(ResponseCallback *cb, uint32_t fd) { + if (m_verbose) { + HT_INFOF("close fd=%" PRIu32, fd); + } + OpenFileDataCephPtr fdata; + m_open_file_map.get(fd, fdata); + m_open_file_map.remove(fd); + cb->response_ok(); +} + +void CephBroker::read(ResponseCallbackRead *cb, uint32_t fd, uint32_t amount) { + OpenFileDataCephPtr fdata; + ssize_t nread; + int64_t offset; + StaticBuffer buf(new uint8_t [amount], amount); + + HT_DEBUGF("read fd=%" PRIu32 " amount = %d", fd, amount); + + if (!m_open_file_map.get(fd, fdata)) { + char errbuf[32]; + sprintf(errbuf, "%" PRIu32, fd); + cb->error(Error::DFSBROKER_BAD_FILE_HANDLE, errbuf); + HT_ERRORF("bad file handle: %" PRIu32, fd); + return; + } + + if ((offset = ceph_lseek(cmount, fdata->fd, 0, SEEK_CUR)) < 0) { + std::string errs(cpp_strerror(offset)); + HT_ERRORF("lseek failed: fd=%" PRIu32 " ceph_fd=%d offset=0 SEEK_CUR - %s", + fd, fdata->fd, errs.c_str()); + report_error(cb, offset); + return; + } + + if ((nread = ceph_read(cmount, fdata->fd, (char *)buf.base, amount, 0)) < 0 ) { + HT_ERRORF("read failed: fd=%" PRIu32 " ceph_fd=%d amount=%d", fd, fdata->fd, amount); + report_error(cb, -nread); + return; + } + + buf.size = nread; + cb->response((uint64_t)offset, buf); +} + +void CephBroker::append(ResponseCallbackAppend *cb, uint32_t fd, + uint32_t amount, const void *data, bool sync) +{ + OpenFileDataCephPtr fdata; + ssize_t nwritten; + int64_t offset; + + HT_DEBUG_OUT << "append fd="<< fd <<" amount="<< amount <<" data='" + << format_bytes(20, data, amount) <<" sync="<< sync << HT_END; + + if (!m_open_file_map.get(fd, fdata)) { + char errbuf[32]; + sprintf(errbuf, "%" PRIu32, fd); + cb->error(Error::DFSBROKER_BAD_FILE_HANDLE, errbuf); + return; + } + + if ((offset = ceph_lseek(cmount, fdata->fd, 0, SEEK_CUR)) < 0) { + std::string errs(cpp_strerror(offset)); + HT_ERRORF("lseek failed: fd=%" PRIu32 " ceph_fd=%d offset=0 SEEK_CUR - %s", fd, fdata->fd, + errs.c_str()); + report_error(cb, offset); + return; + } + + if ((nwritten = ceph_write(cmount, fdata->fd, (const char *)data, amount, 0)) < 0) { + std::string errs(cpp_strerror(nwritten)); + HT_ERRORF("write failed: fd=%" PRIu32 " ceph_fd=%d amount=%d - %s", + fd, fdata->fd, amount, errs.c_str()); + report_error(cb, -nwritten); + return; + } + + int r; + if (sync && ((r = ceph_fsync(cmount, fdata->fd, true)) != 0)) { + std::string errs(cpp_strerror(errno)); + HT_ERRORF("flush failed: fd=%" PRIu32 " ceph_fd=%d - %s", fd, fdata->fd, errs.c_str()); + report_error(cb, r); + return; + } + + cb->response((uint64_t)offset, nwritten); +} + +void CephBroker::seek(ResponseCallback *cb, uint32_t fd, uint64_t offset) { + OpenFileDataCephPtr fdata; + + HT_DEBUGF("seek fd=%" PRIu32 " offset=%llu", fd, (Llu)offset); + + if (!m_open_file_map.get(fd, fdata)) { + char errbuf[32]; + sprintf(errbuf, "%" PRIu32, fd); + cb->error(Error::DFSBROKER_BAD_FILE_HANDLE, errbuf); + return; + } + loff_t res = ceph_lseek(cmount, fdata->fd, offset, SEEK_SET); + if (res < 0) { + std::string errs(cpp_strerror((int)res)); + HT_ERRORF("lseek failed: fd=%" PRIu32 " ceph_fd=%d offset=%llu - %s", + fd, fdata->fd, (Llu)offset, errs.c_str()); + report_error(cb, offset); + return; + } + + cb->response_ok(); +} + +void CephBroker::remove(ResponseCallback *cb, const char *fname) { + String abspath; + + HT_DEBUGF("remove file='%s'", fname); + + make_abs_path(fname, abspath); + + int r; + if ((r = ceph_unlink(cmount, abspath.c_str())) < 0) { + std::string errs(cpp_strerror(r)); + HT_ERRORF("unlink failed: file='%s' - %s", abspath.c_str(), errs.c_str()); + report_error(cb, r); + return; + } + cb->response_ok(); +} + +void CephBroker::length(ResponseCallbackLength *cb, const char *fname, bool) { + int r; + struct ceph_statx stx; + + HT_DEBUGF("length file='%s'", fname); + + if ((r = ceph_statx(cmount, fname, &stx, CEPH_STATX_SIZE, AT_SYMLINK_NOFOLLOW)) < 0) { + String abspath; + make_abs_path(fname, abspath); + std::string errs(cpp_strerror(r)); + HT_ERRORF("length (stat) failed: file='%s' - %s", abspath.c_str(), errs.c_str()); + report_error(cb,- r); + return; + } + cb->response(stx.stx_size); +} + +void CephBroker::pread(ResponseCallbackRead *cb, uint32_t fd, uint64_t offset, + uint32_t amount, bool) { + OpenFileDataCephPtr fdata; + ssize_t nread; + StaticBuffer buf(new uint8_t [amount], amount); + + HT_DEBUGF("pread fd=%" PRIu32 " offset=%llu amount=%d", fd, (Llu)offset, amount); + + if (!m_open_file_map.get(fd, fdata)) { + char errbuf[32]; + sprintf(errbuf, "%" PRIu32, fd); + cb->error(Error::DFSBROKER_BAD_FILE_HANDLE, errbuf); + return; + } + + if ((nread = ceph_read(cmount, fdata->fd, (char *)buf.base, amount, offset)) < 0) { + std::string errs(cpp_strerror(nread)); + HT_ERRORF("pread failed: fd=%" PRIu32 " ceph_fd=%d amount=%d offset=%llu - %s", + fd, fdata->fd, amount, (Llu)offset, errs.c_str()); + report_error(cb, nread); + return; + } + + buf.size = nread; + + cb->response(offset, buf); +} + +void CephBroker::mkdirs(ResponseCallback *cb, const char *dname) { + String absdir; + + HT_DEBUGF("mkdirs dir='%s'", dname); + + make_abs_path(dname, absdir); + int r; + if((r=ceph_mkdirs(cmount, absdir.c_str(), 0644)) < 0 && r!=-CEPHFS_EEXIST) { + HT_ERRORF("mkdirs failed: dname='%s' - %d", absdir.c_str(), -r); + report_error(cb, -r); + return; + } + cb->response_ok(); +} + +void CephBroker::rmdir(ResponseCallback *cb, const char *dname) { + String absdir; + int r; + + make_abs_path(dname, absdir); + if((r = rmdir_recursive(absdir.c_str())) < 0) { + HT_ERRORF("failed to remove dir %s, got error %d", absdir.c_str(), r); + report_error(cb, -r); + return; + } + cb->response_ok(); +} + +int CephBroker::rmdir_recursive(const char *directory) { + struct ceph_dir_result *dirp; + struct dirent de; + struct ceph_statx stx; + int r; + if ((r = ceph_opendir(cmount, directory, &dirp)) < 0) + return r; //failed to open + while ((r = ceph_readdirplus_r(cmount, dirp, &de, &stx, CEPH_STATX_INO, AT_STATX_DONT_SYNC, NULL)) > 0) { + String new_dir = de.d_name; + if(!(new_dir.compare(".")==0 || new_dir.compare("..")==0)) { + new_dir = directory; + new_dir += '/'; + new_dir += de.d_name; + if (S_ISDIR(stx.stx_mode)) { //it's a dir, clear it out... + if((r=rmdir_recursive(new_dir.c_str())) < 0) return r; + } else { //delete this file + if((r=ceph_unlink(cmount, new_dir.c_str())) < 0) return r; + } + } + } + if (r < 0) return r; //we got an error + if ((r = ceph_closedir(cmount, dirp)) < 0) return r; + return ceph_rmdir(cmount, directory); +} + +void CephBroker::flush(ResponseCallback *cb, uint32_t fd) { + OpenFileDataCephPtr fdata; + + HT_DEBUGF("flush fd=%" PRIu32, fd); + + if (!m_open_file_map.get(fd, fdata)) { + char errbuf[32]; + sprintf(errbuf, "%" PRIu32, fd); + cb->error(Error::DFSBROKER_BAD_FILE_HANDLE, errbuf); + return; + } + + int r; + if ((r = ceph_fsync(cmount, fdata->fd, true)) != 0) { + std::string errs(cpp_strerror(r)); + HT_ERRORF("flush failed: fd=%" PRIu32 " ceph_fd=%d - %s", fd, fdata->fd, errs.c_str()); + report_error(cb, -r); + return; + } + + cb->response_ok(); +} + +void CephBroker::status(ResponseCallback *cb) { + cb->response_ok(); + /*perhaps a total cheat, but both the local and Kosmos brokers + included in Hypertable also do this. */ +} + +void CephBroker::shutdown(ResponseCallback *cb) { + m_open_file_map.remove_all(); + cb->response_ok(); + poll(0, 0, 2000); +} + +void CephBroker::readdir(ResponseCallbackReaddir *cb, const char *dname) { + std::vector<String> listing; + String absdir; + + HT_DEBUGF("Readdir dir='%s'", dname); + + //get from ceph in a buffer + make_abs_path(dname, absdir); + + struct ceph_dir_result *dirp; + ceph_opendir(cmount, absdir.c_str(), &dirp); + int r; + int buflen = 100; //good default? + char *buf = new char[buflen]; + String *ent; + int bufpos; + while (1) { + r = ceph_getdnames(cmount, dirp, buf, buflen); + if (r==-CEPHFS_ERANGE) { //expand the buffer + delete [] buf; + buflen *= 2; + buf = new char[buflen]; + continue; + } + if (r<=0) break; + + //if we make it here, we got at least one name, maybe more + bufpos = 0; + while (bufpos<r) {//make new strings and add them to listing + ent = new String(buf+bufpos); + if (ent->compare(".") && ent->compare("..")) + listing.push_back(*ent); + bufpos+=ent->size()+1; + delete ent; + } + } + delete [] buf; + ceph_closedir(cmount, dirp); + + if (r < 0) report_error(cb, -r); //Ceph shouldn't return r<0 on getdnames + //(except for ERANGE) so if it happens this is bad + cb->response(listing); +} + +void CephBroker::exists(ResponseCallbackExists *cb, const char *fname) { + String abspath; + struct ceph_statx stx; + + HT_DEBUGF("exists file='%s'", fname); + make_abs_path(fname, abspath); + cb->response(ceph_statx(cmount, abspath.c_str(), &stx, 0, AT_SYMLINK_NOFOLLOW) == 0); +} + +void CephBroker::rename(ResponseCallback *cb, const char *src, const char *dst) { + String src_abs; + String dest_abs; + int r; + + make_abs_path(src, src_abs); + make_abs_path(dst, dest_abs); + if ((r = ceph_rename(cmount, src_abs.c_str(), dest_abs.c_str())) <0 ) { + report_error(cb, r); + return; + } + cb->response_ok(); +} + +void CephBroker::debug(ResponseCallback *cb, int32_t command, + StaticBuffer &serialized_parameters) { + HT_ERROR("debug commands not implemented!"); + cb->error(Error::NOT_IMPLEMENTED, format("Debug commands not supported")); +} + +void CephBroker::report_error(ResponseCallback *cb, int error) { + char errbuf[128]; + errbuf[0] = 0; + + strerror_r(error, errbuf, 128); + + cb->error(Error::DFSBROKER_IO_ERROR, errbuf); +} + + diff --git a/src/client/hypertable/CephBroker.h b/src/client/hypertable/CephBroker.h new file mode 100644 index 000000000..d2df38909 --- /dev/null +++ b/src/client/hypertable/CephBroker.h @@ -0,0 +1,117 @@ +/** -*- C++ -*- + * Copyright (C) 2009-2011 New Dream Network + * + * This file is part of Hypertable. + * + * Hypertable is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or any later version. + * + * Hypertable is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Hypertable. If not, see <http://www.gnu.org/licenses/> + * + * Authors: + * Gregory Farnum <gfarnum@gmail.com> + * Colin McCabe <cmccabe@alumni.cmu.edu> + */ + +#ifndef HYPERTABLE_CEPHBROKER_H +#define HYPERTABLE_CEPHBROKER_H + +extern "C" { +#include <unistd.h> +} +#include <atomic> +#include "Common/String.h" +#include "Common/Properties.h" + +#include "DfsBroker/Lib/Broker.h" + +#include <cephfs/libcephfs.h> + +namespace Hypertable { + using namespace DfsBroker; + /** + * + */ + class OpenFileDataCeph : public OpenFileData { + public: + OpenFileDataCeph(struct ceph_mount_info *cmount_, const String& fname, + int _fd, int _flags); + virtual ~OpenFileDataCeph(); + struct ceph_mount_info *cmount; + int fd; + int flags; + String filename; + }; + + /** + * + */ + class OpenFileDataCephPtr : public OpenFileDataPtr { + public: + OpenFileDataCephPtr() : OpenFileDataPtr() { } + explicit OpenFileDataCephPtr(OpenFileDataCeph *ofdl) : OpenFileDataPtr(ofdl, true) { } + OpenFileDataCeph *operator->() const { return static_cast<OpenFileDataCeph *>(get()); } + }; + + /** + * + */ + class CephBroker : public DfsBroker::Broker { + public: + explicit CephBroker(PropertiesPtr& cfg); + virtual ~CephBroker(); + + virtual void open(ResponseCallbackOpen *cb, const char *fname, + uint32_t flags, uint32_t bufsz); + virtual void + create(ResponseCallbackOpen *cb, const char *fname, uint32_t flags, + int32_t bufsz, int16_t replication, int64_t blksz); + virtual void close(ResponseCallback *cb, uint32_t fd); + virtual void read(ResponseCallbackRead *cb, uint32_t fd, uint32_t amount); + virtual void append(ResponseCallbackAppend *cb, uint32_t fd, + uint32_t amount, const void *data, bool sync); + virtual void seek(ResponseCallback *cb, uint32_t fd, uint64_t offset); + virtual void remove(ResponseCallback *cb, const char *fname); + virtual void length(ResponseCallbackLength *cb, const char *fname, bool); + virtual void pread(ResponseCallbackRead *cb, uint32_t fd, uint64_t offset, + uint32_t amount, bool); + virtual void mkdirs(ResponseCallback *cb, const char *dname); + virtual void rmdir(ResponseCallback *cb, const char *dname); + virtual void flush(ResponseCallback *cb, uint32_t fd); + virtual void status(ResponseCallback *cb); + virtual void shutdown(ResponseCallback *cb); + virtual void readdir(ResponseCallbackReaddir *cb, const char *dname); + virtual void exists(ResponseCallbackExists *cb, const char *fname); + virtual void rename(ResponseCallback *cb, const char *src, const char *dst); + virtual void debug(ResponseCallback *, int32_t command, + StaticBuffer &serialized_parameters); + + private: + struct ceph_mount_info *cmount; + static std::atomic<int> ms_next_fd; + + virtual void report_error(ResponseCallback *cb, int error); + + void make_abs_path(const char *fname, String& abs) { + if (fname[0] == '/') + abs = fname; + else + abs = m_root_dir + "/" + fname; + } + + int rmdir_recursive(const char *directory); + + bool m_verbose; + String m_root_dir; + }; +} + +#endif //HYPERTABLE_CEPH_BROKER_H diff --git a/src/client/ioctl.h b/src/client/ioctl.h new file mode 100644 index 000000000..047543946 --- /dev/null +++ b/src/client/ioctl.h @@ -0,0 +1,51 @@ +#ifndef FS_CEPH_IOCTL_H +#define FS_CEPH_IOCTL_H + +#include "include/int_types.h" + +#if defined(__linux__) +#include <linux/ioctl.h> +#include <linux/types.h> +#elif defined(__APPLE__) || defined(__FreeBSD__) +#include <sys/ioctl.h> +#include <sys/types.h> +#endif + +#define CEPH_IOCTL_MAGIC 0x97 + +/* just use u64 to align sanely on all archs */ +struct ceph_ioctl_layout { + __u64 stripe_unit, stripe_count, object_size; + __u64 data_pool; + __s64 unused; +}; + +#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) + +/* + * Extract identity, address of the OSD and object storing a given + * file offset. + */ +struct ceph_ioctl_dataloc { + __u64 file_offset; /* in+out: file offset */ + __u64 object_offset; /* out: offset in object */ + __u64 object_no; /* out: object # */ + __u64 object_size; /* out: object size */ + char object_name[64]; /* out: object name */ + __u64 block_offset; /* out: offset in block */ + __u64 block_size; /* out: block length */ + __s64 osd; /* out: osd # */ + struct sockaddr_storage osd_addr; /* out: osd address */ +}; + +#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ + struct ceph_ioctl_dataloc) + +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +#endif diff --git a/src/client/posix_acl.cc b/src/client/posix_acl.cc new file mode 100644 index 000000000..909bb0c2b --- /dev/null +++ b/src/client/posix_acl.cc @@ -0,0 +1,288 @@ +#include "include/compat.h" +#include "include/types.h" +#include "include/fs_types.h" +#include <sys/stat.h> +#include "posix_acl.h" +#include "UserPerm.h" + +int posix_acl_check(const void *xattr, size_t size) +{ + const acl_ea_header *header; + if (size < sizeof(*header)) + return -1; + header = reinterpret_cast<const acl_ea_header*>(xattr); + ceph_le32 expected_version; + expected_version = ACL_EA_VERSION; + if (header->a_version != expected_version) + return -1; + + const acl_ea_entry *entry = header->a_entries; + size -= sizeof(*header); + if (size % sizeof(*entry)) + return -1; + + int count = size / sizeof(*entry); + if (count == 0) + return 0; + + int state = ACL_USER_OBJ; + int needs_mask = 0; + for (int i = 0; i < count; ++i) { + __u16 tag = entry->e_tag; + switch(tag) { + case ACL_USER_OBJ: + if (state == ACL_USER_OBJ) { + state = ACL_USER; + break; + } + return -1; + case ACL_USER: + if (state != ACL_USER) + return -1; + needs_mask = 1; + break; + case ACL_GROUP_OBJ: + if (state == ACL_USER) { + state = ACL_GROUP; + break; + } + return -1; + case ACL_GROUP: + if (state != ACL_GROUP) + return -1; + needs_mask = 1; + break; + case ACL_MASK: + if (state != ACL_GROUP) + return -1; + state = ACL_OTHER; + break; + case ACL_OTHER: + if (state == ACL_OTHER || + (state == ACL_GROUP && !needs_mask)) { + state = 0; + break; + } + // fall-thru + default: + return -1; + } + ++entry; + } + + return state == 0 ? count : -1; +} + +int posix_acl_equiv_mode(const void *xattr, size_t size, mode_t *mode_p) +{ + if (posix_acl_check(xattr, size) < 0) + return -CEPHFS_EINVAL; + + int not_equiv = 0; + mode_t mode = 0; + + const acl_ea_header *header = reinterpret_cast<const acl_ea_header*>(xattr); + const acl_ea_entry *entry = header->a_entries; + int count = (size - sizeof(*header)) / sizeof(*entry); + for (int i = 0; i < count; ++i) { + __u16 tag = entry->e_tag; + __u16 perm = entry->e_perm; + switch(tag) { + case ACL_USER_OBJ: + mode |= (perm & S_IRWXO) << 6; + break; + case ACL_GROUP_OBJ: + mode |= (perm & S_IRWXO) << 3; + break; + case ACL_OTHER: + mode |= perm & S_IRWXO; + break; + case ACL_MASK: + mode = (mode & ~S_IRWXG) | ((perm & S_IRWXO) << 3); + /* fall through */ + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + default: + return -CEPHFS_EINVAL; + } + ++entry; + } + if (mode_p) + *mode_p = (*mode_p & ~ACCESSPERMS) | mode; + return not_equiv; +} + +int posix_acl_inherit_mode(bufferptr& acl, mode_t *mode_p) +{ + if (posix_acl_check(acl.c_str(), acl.length()) <= 0) + return -CEPHFS_EIO; + + acl_ea_entry *group_entry = NULL, *mask_entry = NULL; + mode_t mode = *mode_p; + int not_equiv = 0; + + acl_ea_header *header = reinterpret_cast<acl_ea_header*>(acl.c_str()); + acl_ea_entry *entry = header->a_entries; + int count = (acl.length() - sizeof(*header)) / sizeof(*entry); + for (int i = 0; i < count; ++i) { + __u16 tag = entry->e_tag; + __u16 perm = entry->e_perm; + switch(tag) { + case ACL_USER_OBJ: + perm &= (mode >> 6) | ~S_IRWXO; + mode &= (perm << 6) | ~S_IRWXU; + entry->e_perm = perm; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + case ACL_GROUP_OBJ: + group_entry = entry; + break; + case ACL_OTHER: + perm &= mode | ~S_IRWXO; + mode &= perm | ~S_IRWXO; + entry->e_perm = perm; + break; + case ACL_MASK: + mask_entry = entry; + not_equiv = 1; + break; + default: + return -CEPHFS_EIO; + + } + ++entry; + } + + if (mask_entry) { + __u16 perm = mask_entry->e_perm; + perm &= (mode >> 3) | ~S_IRWXO; + mode &= (perm << 3) | ~S_IRWXG; + mask_entry->e_perm = perm; + } else { + if (!group_entry) + return -CEPHFS_EIO; + __u16 perm = group_entry->e_perm; + perm &= (mode >> 3) | ~S_IRWXO; + mode &= (perm << 3) | ~S_IRWXG; + group_entry->e_perm = perm; + } + + *mode_p = (*mode_p & ~ACCESSPERMS) | mode; + return not_equiv; +} + +int posix_acl_access_chmod(bufferptr& acl, mode_t mode) +{ + if (posix_acl_check(acl.c_str(), acl.length()) <= 0) + return -CEPHFS_EIO; + + acl_ea_entry *group_entry = NULL, *mask_entry = NULL; + + acl_ea_header *header = reinterpret_cast<acl_ea_header*>(acl.c_str()); + acl_ea_entry *entry = header->a_entries; + int count = (acl.length() - sizeof(*header)) / sizeof(*entry); + for (int i = 0; i < count; ++i) { + __u16 tag = entry->e_tag; + switch(tag) { + case ACL_USER_OBJ: + entry->e_perm = (mode & S_IRWXU) >> 6; + break; + case ACL_GROUP_OBJ: + group_entry = entry; + break; + case ACL_MASK: + mask_entry = entry; + break; + case ACL_OTHER: + entry->e_perm = mode & S_IRWXO; + break; + default: + break; + } + ++entry; + } + + if (mask_entry) { + mask_entry->e_perm = (mode & S_IRWXG) >> 3; + } else { + if (!group_entry) + return -CEPHFS_EIO; + group_entry->e_perm = (mode & S_IRWXG) >> 3; + } + return 0; +} + +int posix_acl_permits(const bufferptr& acl, uid_t i_uid, gid_t i_gid, + const UserPerm& perms, unsigned want) +{ + if (posix_acl_check(acl.c_str(), acl.length()) < 0) + return -CEPHFS_EIO; + + const acl_ea_header *header = reinterpret_cast<const acl_ea_header*>(acl.c_str()); + const acl_ea_entry *entry = header->a_entries; + const acl_ea_entry *next_entry; + __u16 perm, tag; + __u32 id; + int group_found = 0; + int idx; + int count = (acl.length() - sizeof(*header)) / sizeof(*entry); + for (idx = 0; idx < count; ++idx) { + tag = entry->e_tag; + perm = entry->e_perm; + switch(tag) { + case ACL_USER_OBJ: + if (i_uid == perms.uid()) + goto check_perm; + break; + case ACL_USER: + id = entry->e_id; + if (id == perms.uid()) + goto check_mask; + break; + case ACL_GROUP_OBJ: + /* fall through */ + case ACL_GROUP: + id = (tag == ACL_GROUP_OBJ) ? i_gid : entry->e_id; + if (perms.gid_in_groups(id)) { + group_found = 1; + if ((perm & want) == want) + goto check_mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (group_found) + return -CEPHFS_EACCES; + else + goto check_perm; + break; + default: + return -CEPHFS_EIO; + } + ++entry; + } + return -CEPHFS_EIO; + +check_mask: + next_entry = entry + 1; + for (++idx; idx < count; ++idx) { + tag = next_entry->e_tag; + if (tag == ACL_MASK) { + __u16 mask = next_entry->e_perm; + if ((perm & mask & want) == want) + return 0; + return -CEPHFS_EACCES; + } + ++next_entry; + } +check_perm: + if ((perm & want) == want) + return 0; + return -CEPHFS_EACCES; +} diff --git a/src/client/posix_acl.h b/src/client/posix_acl.h new file mode 100644 index 000000000..4afcc3fe2 --- /dev/null +++ b/src/client/posix_acl.h @@ -0,0 +1,35 @@ +#ifndef CEPH_POSIX_ACL +#define CEPH_POSIX_ACL + +#define ACL_EA_VERSION 0x0002 + +#define ACL_USER_OBJ 0x01 +#define ACL_USER 0x02 +#define ACL_GROUP_OBJ 0x04 +#define ACL_GROUP 0x08 +#define ACL_MASK 0x10 +#define ACL_OTHER 0x20 + +#define ACL_EA_ACCESS "system.posix_acl_access" +#define ACL_EA_DEFAULT "system.posix_acl_default" + +typedef struct { + ceph_le16 e_tag; + ceph_le16 e_perm; + ceph_le32 e_id; +} acl_ea_entry; + +typedef struct { + ceph_le32 a_version; + acl_ea_entry a_entries[0]; +} acl_ea_header; + +class UserPerm; + +int posix_acl_check(const void *xattr, size_t size); +int posix_acl_equiv_mode(const void *xattr, size_t size, mode_t *mode_p); +int posix_acl_inherit_mode(bufferptr& acl, mode_t *mode_p); +int posix_acl_access_chmod(bufferptr& acl, mode_t mode); +int posix_acl_permits(const bufferptr& acl, uid_t i_uid, gid_t i_gid, + const UserPerm& groups, unsigned want); +#endif diff --git a/src/client/test_ioctls.c b/src/client/test_ioctls.c new file mode 100644 index 000000000..23fa835c5 --- /dev/null +++ b/src/client/test_ioctls.c @@ -0,0 +1,122 @@ + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <limits.h> + +#include <sys/ioctl.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <netdb.h> + +#include "ioctl.h" + +char new_file_name[PATH_MAX]; + +int main(int argc, char **argv) +{ + char *fn; + int fd, err; + struct ceph_ioctl_layout l; + struct ceph_ioctl_dataloc dl; + + if (argc < 3) { + printf("usage: ceph_test_ioctls <filename> <offset>\n"); + return 1; + } + fn = argv[1]; + + fd = open(fn, O_CREAT|O_RDWR, 0644); + if (fd < 0) { + perror("couldn't open file"); + return 1; + } + + /* get layout */ + err = ioctl(fd, CEPH_IOC_GET_LAYOUT, (unsigned long)&l); + if (err < 0) { + perror("ioctl IOC_GET_LAYOUT error"); + return 1; + } + printf("layout:\n stripe_unit %lld\n stripe_count %lld\n object_size %lld\n data_pool %lld\n", + (long long)l.stripe_unit, (long long)l.stripe_count, (long long)l.object_size, (long long)l.data_pool); + + + /* set layout */ + l.stripe_unit = 1048576; + l.stripe_count = 2; + err = ioctl(fd, CEPH_IOC_SET_LAYOUT, (unsigned long)&l); + if (err < 0) { + perror("ioctl IOC_SET_LAYOUT error"); + return 1; + } + printf("set layout, writing to file\n"); + + printf("file %s\n", fn); + /* get layout again */ + err = ioctl(fd, CEPH_IOC_GET_LAYOUT, (unsigned long)&l); + if (err < 0) { + perror("ioctl IOC_GET_LAYOUT error"); + return 1; + } + printf("layout:\n stripe_unit %lld\n stripe_count %lld\n object_size %lld\n data_pool %lld\n", + (long long)l.stripe_unit, (long long)l.stripe_count, (long long)l.object_size, (long long)l.data_pool); + + /* dataloc */ + dl.file_offset = atoll(argv[2]); + err = ioctl(fd, CEPH_IOC_GET_DATALOC, (unsigned long)&dl); + if (err < 0) { + perror("ioctl IOC_GET_DATALOC error"); + return 1; + } + + printf("dataloc:\n"); + printf(" file_offset %lld (of object start)\n", (long long)dl.file_offset); + printf(" object '%s'\n object_offset %lld\n object_size %lld object_no %lld\n", + dl.object_name, (long long)dl.object_offset, (long long)dl.object_size, (long long)dl.object_no); + printf(" block_offset %lld\n block_size %lld\n", + (long long)dl.block_offset, (long long)dl.block_size); + + char buf[80]; + getnameinfo((struct sockaddr *)&dl.osd_addr, sizeof(dl.osd_addr), buf, sizeof(buf), 0, 0, NI_NUMERICHOST); + printf(" osd%lld %s\n", (long long)dl.osd, buf); + + if (argc < 4) + return 0; + + /* set dir default layout */ + printf("testing dir policy setting\n"); + fd = open(argv[3], O_RDONLY); + if (fd < 0) { + perror("couldn't open dir"); + return 1; + } + + l.object_size = 1048576; + l.stripe_count = 1; + err = ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, (unsigned long)&l); + if (err < 0) { + perror("ioctl IOC_SET_LAYOUT_POLICY error"); + return 1; + } + printf("set layout, creating file\n"); + + snprintf(new_file_name, sizeof(new_file_name), + "%s/testfile", argv[3]); + fd = open(new_file_name, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + perror("couldn't open file"); + return 1; + } + err = ioctl(fd, CEPH_IOC_GET_LAYOUT, (unsigned long)&l); + if (err < 0) { + perror("ioctl IOC_GET_LAYOUT error"); + return 1; + } + printf("layout:\n stripe_unit %lld\n stripe_count %lld\n object_size %lld\n data_pool %lld\n", + (long long)l.stripe_unit, (long long)l.stripe_count, (long long)l.object_size, (long long)l.data_pool); + return 0; +} |