diff options
Diffstat (limited to 'src/os/filestore')
32 files changed, 19818 insertions, 0 deletions
diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc new file mode 100644 index 00000000..2ff2000d --- /dev/null +++ b/src/os/filestore/BtrfsFileStoreBackend.cc @@ -0,0 +1,575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" +#include "include/types.h" + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/ceph_assert.h" + +#ifndef __CYGWIN__ +#include "os/fs/btrfs_ioctl.h" +#endif + +#include <iostream> +#include <fstream> +#include <sstream> + +#include "BtrfsFileStoreBackend.h" + +#include "common/errno.h" +#include "common/config.h" + +#if defined(__linux__) + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") " + +#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) +#define ALIGNED(x, by) (!((x) % (by))) +#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? 
(x) : (ALIGN_DOWN((x), (by)) + (by))) + +BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs): + GenericFileStoreBackend(fs), has_clone_range(false), + has_snap_create(false), has_snap_destroy(false), + has_snap_create_v2(false), has_wait_sync(false), stable_commits(false), + m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range), + m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { } + +int BtrfsFileStoreBackend::detect_features() +{ + int r; + + r = GenericFileStoreBackend::detect_features(); + if (r < 0) + return r; + + // clone_range? + if (m_filestore_btrfs_clone_range) { + int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600); + if (fd >= 0) { + if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) { + r = -errno; + dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: " + << cpp_strerror(r) << dendl; + } + btrfs_ioctl_clone_range_args clone_args; + memset(&clone_args, 0, sizeof(clone_args)); + clone_args.src_fd = -1; + r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args); + if (r < 0 && errno == EBADF) { + dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl; + has_clone_range = true; + } else { + r = -errno; + dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl; + } + TEMP_FAILURE_RETRY(::close(fd)); + } else { + r = -errno; + dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: " + << cpp_strerror(r) << dendl; + } + } else { + dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl; + } + + struct btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + + // create test source volume + vol_args.fd = 0; + strcpy(vol_args.name, "test_subvol"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args); + if (r != 0) { + r = -errno; + dout(0) << "detect_feature: failed to create simple subvolume " 
<< vol_args.name << ": " << cpp_strerror(r) << dendl; + } + int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC); + if (srcfd < 0) { + r = -errno; + dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl; + } + + // snap_create and snap_destroy? + vol_args.fd = srcfd; + strcpy(vol_args.name, "sync_snap_test"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + int err = errno; + if (r == 0 || errno == EEXIST) { + dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl; + has_snap_create = true; + + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r == 0) { + dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl; + has_snap_destroy = true; + } else { + err = -errno; + dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; + + if (err == -EPERM && getuid() != 0) { + dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl; + cerr << TEXT_YELLOW + << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed" + << TEXT_NORMAL << std::endl; + } else if (err == -EOPNOTSUPP) { + derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl; + } + } + } else { + dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl; + } + + if (m_filestore_btrfs_snap) { + if (has_snap_destroy) + stable_commits = true; + else + dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl; + } + + // start_sync? + __u64 transid = 0; + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid); + if (r < 0) { + int err = errno; + dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl; + } + if (r == 0 && transid > 0) { + dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl; + + // do we have wait_sync too? 
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid); + if (r == 0 || errno == ERANGE) { + dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl; + has_wait_sync = true; + } else { + int err = errno; + dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl; + } + } else { + int err = errno; + dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl; + } + + if (has_wait_sync) { + // async snap creation? + struct btrfs_ioctl_vol_args_v2 async_args; + memset(&async_args, 0, sizeof(async_args)); + async_args.fd = srcfd; + async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; + strcpy(async_args.name, "async_snap_test"); + + // remove old one, first + struct stat st; + strcpy(vol_args.name, async_args.name); + if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) { + dout(0) << "detect_feature: removing old async_snap_test" << dendl; + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r != 0) { + int err = errno; + dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl; + } + } + + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); + if (r == 0 || errno == EEXIST) { + dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl; + has_snap_create_v2 = true; + + // clean up + strcpy(vol_args.name, "async_snap_test"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r != 0) { + int err = errno; + dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; + } + } else { + int err = errno; + dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl; + } + } + + // clean up test subvol + if (srcfd >= 0) + TEMP_FAILURE_RETRY(::close(srcfd)); + + strcpy(vol_args.name, "test_subvol"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r < 0) { + r = -errno; + dout(0) << "detect_feature: failed to remove " 
<< vol_args.name << ": " << cpp_strerror(r) << dendl; + } + + if (m_filestore_btrfs_snap && !has_snap_create_v2) { + dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n" + << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n" + << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n" + << " performance.\n" + << TEXT_NORMAL; + } + + return 0; +} + +bool BtrfsFileStoreBackend::can_checkpoint() +{ + return stable_commits; +} + +int BtrfsFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "create_current: current/ exists but is not a directory" << dendl; + return -EINVAL; + } + + struct stat basest; + struct statfs currentfs; + ret = ::fstat(get_basedir_fd(), &basest); + if (ret < 0) { + ret = -errno; + dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl; + return ret; + } + ret = ::statfs(get_current_path().c_str(), ¤tfs); + if (ret < 0) { + ret = -errno; + dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl; + return ret; + } + if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) { + dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl; + stable_commits = true; + } + return 0; + } + + struct btrfs_ioctl_vol_args volargs; + memset(&volargs, 0, sizeof(volargs)); + + volargs.fd = 0; + strcpy(volargs.name, "current"); + if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) { + ret = -errno; + dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error " + << cpp_strerror(ret) << dendl; + return ret; + } + + dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl; + if 
(::chmod(get_current_path().c_str(), 0755) < 0) { + ret = -errno; + dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: " + << cpp_strerror(ret) << dendl; + return ret; + } + + stable_commits = true; + return 0; +} + +int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls) +{ + int ret, err = 0; + + struct stat basest; + ret = ::fstat(get_basedir_fd(), &basest); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl; + return ret; + } + + // get snap list + DIR *dir = ::opendir(get_basedir_path().c_str()); + if (!dir) { + ret = -errno; + dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: " + << cpp_strerror(ret) << dendl; + return ret; + } + + list<string> snaps; + char path[PATH_MAX]; + struct dirent *de; + while ((de = ::readdir(dir))) { + snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name); + + struct stat st; + ret = ::stat(path, &st); + if (ret < 0) { + err = -errno; + dout(0) << "list_checkpoints: stat '" << path << "' failed: " + << cpp_strerror(err) << dendl; + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + struct statfs fs; + ret = ::statfs(path, &fs); + if (ret < 0) { + err = -errno; + dout(0) << "list_checkpoints: statfs '" << path << "' failed: " + << cpp_strerror(err) << dendl; + break; + } + + if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) + snaps.push_back(string(de->d_name)); + } + + if (::closedir(dir) < 0) { + ret = -errno; + dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl; + if (!err) + err = ret; + } + + if (err) + return err; + + ls.swap(snaps); + return 0; +} + +int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (has_snap_create_v2 && transid) { + struct btrfs_ioctl_vol_args_v2 async_args; + memset(&async_args, 0, sizeof(async_args)); 
+ async_args.fd = get_current_fd(); + async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; + + size_t name_size = sizeof(async_args.name); + strncpy(async_args.name, name.c_str(), name_size); + async_args.name[name_size-1] = '\0'; + + int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); + if (r < 0) { + r = -errno; + dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl; + return r; + } + dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl; + *transid = async_args.transid; + } else { + struct btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = get_current_fd(); + + size_t name_size = sizeof(vol_args.name); + strncpy(vol_args.name, name.c_str(), name_size); + vol_args.name[name_size-1] = '\0'; + + int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + if (r < 0) { + r = -errno; + dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl; + return r; + } + if (transid) + *transid = 0; + } + return 0; +} + +int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid) +{ + // wait for commit + dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl; + int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid); + if (ret < 0) { + ret = -errno; + dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl; + return -errno; + } + dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl; + return 0; +} + +int BtrfsFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: to '" << name << "'" << dendl; + char s[PATH_MAX]; + btrfs_ioctl_vol_args vol_args; + + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = 0; + strcpy(vol_args.name, "current"); + + int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (ret && errno != ENOENT) { + dout(0) << "rollback_to: 
error removing old current subvol: " << cpp_strerror(ret) << dendl; + snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand()); + if (::rename(get_current_path().c_str(), s)) { + ret = -errno; + dout(0) << "rollback_to: error renaming old current subvol: " + << cpp_strerror(ret) << dendl; + return ret; + } + } + + snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str()); + + // roll back + vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC); + if (vol_args.fd < 0) { + ret = -errno; + dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl; + return ret; + } + ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + if (ret < 0 ) { + ret = -errno; + dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl; + } + TEMP_FAILURE_RETRY(::close(vol_args.fd)); + return ret; +} + +int BtrfsFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = 0; + strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name)); + + int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (ret) { + ret = -errno; + dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl; + return ret; + } + return 0; +} + +int BtrfsFileStoreBackend::syncfs() +{ + dout(15) << "syncfs" << dendl; + // do a full btrfs commit + int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC); + if (ret < 0) { + ret = -errno; + dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl; + } + return ret; +} + +int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl; + size_t blk_size = get_blksize(); + if (!has_clone_range || + srcoff % blk_size != dstoff % blk_size) { + dout(20) << 
"clone_range: using copy" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } + + int err = 0; + int r = 0; + + uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size); + uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size); + if (srcoffclone >= srcoff + len) { + dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } + + uint64_t lenclone = len - (srcoffclone - srcoff); + if (!ALIGNED(lenclone, blk_size)) { + struct stat from_stat, to_stat; + err = ::fstat(from, &from_stat); + if (err) return -errno; + err = ::fstat(to , &to_stat); + if (err) return -errno; + + if (srcoff + len != (uint64_t)from_stat.st_size || + dstoff + len < (uint64_t)to_stat.st_size) { + // Not to the end of the file, need to align length as well + lenclone = ALIGN_DOWN(lenclone, blk_size); + } + } + if (lenclone == 0) { + // too short + return _copy_range(from, to, srcoff, len, dstoff); + } + + dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone + << " to " << dstoffclone << " = " << r << dendl; + btrfs_ioctl_clone_range_args a; + a.src_fd = from; + a.src_offset = srcoffclone; + a.src_length = lenclone; + a.dest_offset = dstoffclone; + err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a); + if (err >= 0) { + r += err; + } else if (errno == EINVAL) { + // Still failed, might be compressed + dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } else { + return -errno; + } + + // Take care any trimmed from front + if (srcoffclone != srcoff) { + err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff); + if (err >= 0) { + r += err; + } else { + return err; + } + } + + // Copy end + if (srcoffclone + lenclone != srcoff + len) { + err = _copy_range(from, to, + srcoffclone + lenclone, + (srcoff + len) - (srcoffclone + lenclone), + dstoffclone + lenclone); + if (err >= 0) { + r += err; + } else { + return 
err; + } + } + dout(20) << "clone_range: finished " << srcoff << "~" << len + << " to " << dstoff << " = " << r << dendl; + return r; +} +#endif diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h new file mode 100644 index 00000000..0794be2d --- /dev/null +++ b/src/os/filestore/BtrfsFileStoreBackend.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_BTRFSFILESTOREBACKEDN_H +#define CEPH_BTRFSFILESTOREBACKEDN_H + +#if defined(__linux__) +#include "GenericFileStoreBackend.h" + +class BtrfsFileStoreBackend : public GenericFileStoreBackend { +private: + bool has_clone_range; ///< clone range ioctl is supported + bool has_snap_create; ///< snap create ioctl is supported + bool has_snap_destroy; ///< snap destroy ioctl is supported + bool has_snap_create_v2; ///< snap create v2 ioctl (async!) 
is supported + bool has_wait_sync; ///< wait sync ioctl is supported + bool stable_commits; + bool m_filestore_btrfs_clone_range; + bool m_filestore_btrfs_snap; +public: + explicit BtrfsFileStoreBackend(FileStore *fs); + ~BtrfsFileStoreBackend() override {} + const char *get_name() override { + return "btrfs"; + } + int detect_features() override; + bool can_checkpoint() override; + int create_current() override; + int list_checkpoints(list<string>& ls) override; + int create_checkpoint(const string& name, uint64_t *cid) override; + int sync_checkpoint(uint64_t cid) override; + int rollback_to(const string& name) override; + int destroy_checkpoint(const string& name) override; + int syncfs() override; + int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override; +}; +#endif +#endif diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h new file mode 100644 index 00000000..eb43e47d --- /dev/null +++ b/src/os/filestore/CollectionIndex.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef OS_COLLECTIONINDEX_H +#define OS_COLLECTIONINDEX_H + +#include <string> +#include <vector> + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/RWLock.h" + +/** + CollectionIndex provides an interface for manipulating indexed collections + */ +class CollectionIndex { +public: + CephContext* cct; +protected: + /** + * Object encapsulating a returned path. + * + * A path to an object (existent or non-existent) becomes invalid + * when a different object is created in the index. 
Path stores + * a shared_ptr to the CollectionIndex to keep the index alive + * during its lifetime. + * @see IndexManager + * @see self_ref + * @see set_ref + */ + class Path { + public: + /// Returned path + string full_path; + /// Ref to parent Index + CollectionIndex* parent_ref; + /// coll_t for parent Index + coll_t parent_coll; + + /// Normal Constructor + Path( + string path, ///< [in] Path to return. + CollectionIndex* ref) + : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {} + + /// Debugging Constructor + Path( + string path, ///< [in] Path to return. + const coll_t& coll) ///< [in] collection + : full_path(path), parent_coll(coll) {} + + /// Getter for the stored path. + const char *path() const { return full_path.c_str(); } + + /// Getter for collection + const coll_t& coll() const { return parent_coll; } + + /// Getter for parent + CollectionIndex* get_index() const { + return parent_ref; + } + }; + public: + + RWLock access_lock; + /// Type of returned paths + typedef std::shared_ptr<Path> IndexedPath; + + static IndexedPath get_testing_path(string path, coll_t collection) { + return std::make_shared<Path>(path, collection); + } + + static const uint32_t FLAT_INDEX_TAG = 0; + static const uint32_t HASH_INDEX_TAG = 1; + static const uint32_t HASH_INDEX_TAG_2 = 2; + static const uint32_t HOBJECT_WITH_POOL = 3; + /** + * For tracking Filestore collection versions. + * + * @return Collection version represented by the Index implementation + */ + virtual uint32_t collection_version() = 0; + + /** + * Returns the collection managed by this CollectionIndex + */ + virtual coll_t coll() const = 0; + + + /** + * Initializes the index. + * + * @return Error Code, 0 for success + */ + virtual int init() = 0; + + /** + * Cleanup before replaying journal + * + * Index implementations may need to perform compound operations + * which may leave the collection unstable if interrupted. 
cleanup + * is called on mount to allow the CollectionIndex implementation + * to stabilize. + * + * @see HashIndex + * @return Error Code, 0 for success + */ + virtual int cleanup() = 0; + + /** + * Call when a file is created using a path returned from lookup. + * + * @return Error Code, 0 for success + */ + virtual int created( + const ghobject_t &oid, ///< [in] Created object. + const char *path ///< [in] Path to created object. + ) = 0; + + /** + * Removes oid from the collection + * + * @return Error Code, 0 for success + */ + virtual int unlink( + const ghobject_t &oid ///< [in] Object to remove + ) = 0; + + /** + * Gets the IndexedPath for oid. + * + * @return Error Code, 0 for success + */ + virtual int lookup( + const ghobject_t &oid, ///< [in] Object to lookup + IndexedPath *path, ///< [out] Path to object + int *hardlink ///< [out] number of hard links of this object. *hardlink=0 mean object no-exist. + ) = 0; + + /** + * Moves objects matching @e match in the lsb @e bits + * + * dest and this must be the same subclass + * + * @return Error Code, 0 for success + */ + virtual int split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + CollectionIndex* dest //< [in] destination index + ) { ceph_abort(); return 0; } + + virtual int merge( + uint32_t bits, //< [in] common (target) bits + CollectionIndex* dest //< [in] destination index + ) { ceph_abort(); return 0; } + + + /// List contents of collection by hash + virtual int collection_list_partial( + const ghobject_t &start, ///< [in] object at which to start + const ghobject_t &end, ///< [in] list only objects < end + int max_count, ///< [in] return at most max_count objects + vector<ghobject_t> *ls, ///< [out] Listed objects + ghobject_t *next ///< [out] Next object to list + ) = 0; + + /// Call prior to removing directory + virtual int prep_delete() { return 0; } + + CollectionIndex(CephContext* cct, const coll_t& collection) + : cct(cct), 
access_lock("CollectionIndex::access_lock", true, false) {} + + /* + * Pre-hash the collection, this collection should map to a PG folder. + * + * @param pg_num - pg number of the pool this collection belongs to. + * @param expected_num_objs - expected number of objects in this collection. + * @Return 0 on success, an error code otherwise. + */ + virtual int pre_hash_collection( + uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to + uint64_t expected_num_objs ///< [in] expected number of objects this collection has + ) { ceph_abort(); return 0; } + + virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; } + + /// Read index-wide settings (should be called after construction) + virtual int read_settings() { return 0; } + + /// Virtual destructor + virtual ~CollectionIndex() {} +}; + +#endif diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc new file mode 100644 index 00000000..5a057014 --- /dev/null +++ b/src/os/filestore/DBObjectMap.cc @@ -0,0 +1,1415 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "include/int_types.h" +#include "include/buffer.h" + +#include <iostream> +#include <set> +#include <map> +#include <string> +#include <vector> + +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "DBObjectMap.h" +#include <errno.h> + +#include "common/debug.h" +#include "common/config.h" +#include "include/ceph_assert.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "filestore " + +const string DBObjectMap::USER_PREFIX = "_USER_"; +const string DBObjectMap::XATTR_PREFIX = "_AXATTR_"; +const string DBObjectMap::SYS_PREFIX = "_SYS_"; +const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; +const string DBObjectMap::HEADER_KEY = "HEADER"; +const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER"; +const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER"; +const string 
DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_"; + +// Legacy +const string DBObjectMap::LEAF_PREFIX = "_LEAF_"; +const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_"; + +static void append_escaped(const string &in, string *out) +{ + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i == '%') { + out->push_back('%'); + out->push_back('p'); + } else if (*i == '.') { + out->push_back('%'); + out->push_back('e'); + } else if (*i == '_') { + out->push_back('%'); + out->push_back('u'); + } else { + out->push_back(*i); + } + } +} + +int DBObjectMap::check(std::ostream &out, bool repair, bool force) +{ + int errors = 0, comp_errors = 0; + bool repaired = false; + map<uint64_t, uint64_t> parent_to_num_children; + map<uint64_t, uint64_t> parent_to_actual_num_children; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + _Header header; + bufferlist bl = iter->value(); + while (true) { + auto bliter = bl.cbegin(); + header.decode(bliter); + if (header.seq != 0) + parent_to_actual_num_children[header.seq] = header.num_children; + + if (state.v == 2 || force) { + // Check complete table + bool complete_error = false; + boost::optional<string> prev; + KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); + for (complete_iter->seek_to_first(); complete_iter->valid(); + complete_iter->next()) { + if (prev && prev >= complete_iter->key()) { + out << "Bad complete for " << header.oid << std::endl; + complete_error = true; + break; + } + prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1); + } + if (complete_error) { + out << "Complete mapping for " << header.seq << " :" << std::endl; + for (complete_iter->seek_to_first(); complete_iter->valid(); + complete_iter->next()) { + out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << 
std::endl; + } + if (repair) { + repaired = true; + KeyValueDB::Transaction t = db->get_transaction(); + t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); + db->submit_transaction(t); + out << "Cleared complete mapping to repair" << std::endl; + } else { + errors++; // Only count when not repaired + comp_errors++; // Track errors here for version update + } + } + } + + if (header.parent == 0) + break; + + if (!parent_to_num_children.count(header.parent)) + parent_to_num_children[header.parent] = 0; + parent_to_num_children[header.parent]++; + if (parent_to_actual_num_children.count(header.parent)) + break; + + set<string> to_get; + map<string, bufferlist> got; + to_get.insert(HEADER_KEY); + db->get(sys_parent_prefix(header), to_get, &got); + if (got.empty()) { + out << "Missing: seq " << header.parent << std::endl; + errors++; + break; + } else { + bl = got.begin()->second; + } + } + } + + for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin(); + i != parent_to_num_children.end(); + parent_to_num_children.erase(i++)) { + if (!parent_to_actual_num_children.count(i->first)) + continue; + if (parent_to_actual_num_children[i->first] != i->second) { + out << "Invalid: seq " << i->first << " recorded children: " + << parent_to_actual_num_children[i->first] << " found: " + << i->second << std::endl; + errors++; + } + parent_to_actual_num_children.erase(i->first); + } + + // Only advance the version from 2 to 3 here + // Mark as legacy because there are still older structures + // we don't update. The value of legacy is only used + // for internal assertions. 
+ if (comp_errors == 0 && state.v == 2 && repair) { + state.v = 3; + state.legacy = true; + set_state(); + } + + if (errors == 0 && repaired) + return -1; + return errors; +} + +string DBObjectMap::ghobject_key(const ghobject_t &oid) +{ + string out; + append_escaped(oid.hobj.oid.name, &out); + out.push_back('.'); + append_escaped(oid.hobj.get_key(), &out); + out.push_back('.'); + append_escaped(oid.hobj.nspace, &out); + out.push_back('.'); + + char snap_with_hash[1000]; + char *t = snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + + if (oid.hobj.pool == -1) + t += snprintf(t, end - t, ".none"); + else + t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool); + t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash()); + + if (oid.generation != ghobject_t::NO_GEN || + oid.shard_id != shard_id_t::NO_SHARD) { + t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation); + t += snprintf(t, end - t, ".%x", (int)oid.shard_id); + } + out += string(snap_with_hash); + return out; +} + +// ok: pglog%u3%efs1...0.none.0017B237 +// bad: plana8923501-10...4c.3.ffffffffffffffff.2 +// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2 +// returns 0 for false, 1 for true, negative for error +int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct, + const string &in) +{ + int dots = 5; // skip 5 .'s + const char *s = in.c_str(); + do { + while (*s && *s != '.') + ++s; + if (!*s) { + derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + ++s; + } while (*s && --dots); + if (!*s) { + derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + // we are now either at a hash value (32 bits, 8 chars) or a generation + // value 
(64 bits) '.' and shard id. count the dots! + int len = 0; + while (*s && *s != '.') { + ++s; + ++len; + } + if (*s == '\0') { + if (len != 8) { + derr << "hash value is not 8 chars" << dendl; + return -EINVAL; // the hash value is always 8 chars. + } + return 0; + } + if (*s != '.') { // the shard follows. + derr << "missing final . and shard id at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + return 1; +} + + +string DBObjectMap::map_header_key(const ghobject_t &oid) +{ + return ghobject_key(oid); +} + +string DBObjectMap::header_key(uint64_t seq) +{ + char buf[100]; + snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); + return string(buf); +} + +string DBObjectMap::complete_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX; +} + +string DBObjectMap::user_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + USER_PREFIX; +} + +string DBObjectMap::sys_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + SYS_PREFIX; +} + +string DBObjectMap::xattr_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX; +} + +string DBObjectMap::sys_parent_prefix(_Header header) +{ + return USER_PREFIX + header_key(header.parent) + SYS_PREFIX; +} + +int DBObjectMap::DBObjectMapIteratorImpl::init() +{ + invalid = false; + if (ready) { + return 0; + } + ceph_assert(!parent_iter); + if (header->parent) { + Header parent = map->lookup_parent(header); + if (!parent) { + ceph_abort(); + return -EINVAL; + } + parent_iter = std::make_shared<DBObjectMapIteratorImpl>(map, parent); + } + key_iter = map->db->get_iterator(map->user_prefix(header)); + ceph_assert(key_iter); + complete_iter = map->db->get_iterator(map->complete_prefix(header)); + ceph_assert(complete_iter); + cur_iter = key_iter; + ceph_assert(cur_iter); + ready = true; + return 0; +} + +ObjectMap::ObjectMapIterator DBObjectMap::get_iterator( + const ghobject_t &oid) +{ + MapHeaderLock 
hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return ObjectMapIterator(new EmptyIteratorImpl());
+  DBObjectMapIterator iter = _get_iterator(header);
+  // Transfer the per-oid MapHeaderLock into the iterator (see
+  // MapHeaderLock::swap) so it is held for the iterator's lifetime.
+  iter->hlock.swap(hl);
+  return iter;
+}
+
+// Position the merged (own keys + parent) iterator at the first visible
+// entry.  'r' is the iterator's sticky status member reported by status().
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->seek_to_first();
+    if (r < 0)
+      return r;
+  }
+  r = key_iter->seek_to_first();
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// Advances each sub-iterator one past its last entry (seek_to_last then
+// next), so adjust() ends up marking the merged iterator invalid — i.e.
+// this positions "at end" rather than on the last entry.
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->seek_to_last();
+    if (r < 0)
+      return r;
+    if (parent_iter->valid())
+      r = parent_iter->next();
+    if (r < 0)
+      return r;
+  }
+  r = key_iter->seek_to_last();
+  if (r < 0)
+    return r;
+  if (key_iter->valid())
+    r = key_iter->next();
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// Position at the first merged entry with key >= to.
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->lower_bound(to);
+    if (r < 0)
+      return r;
+  }
+  r = key_iter->lower_bound(to);
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// lower_bound, then skip forward to the first entry that is served from
+// the parent (if the current position is already local, advance with
+// next_parent).
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to)
+{
+  int r = lower_bound(to);
+  if (r < 0)
+    return r;
+  if (valid() && !on_parent())
+    return next_parent();
+  else
+    return r;
+}
+
+// Position at the first merged entry with key > after.
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->upper_bound(after);
+    if (r < 0)
+      return r;
+  }
+  r = key_iter->upper_bound(after);
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// Valid iff init() has run and adjust()/next_parent() have not flagged
+// the iterator invalid; whenever valid, cur_iter must itself be valid.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid()
+{
+  bool valid = !invalid && ready;
+  ceph_assert(!valid || cur_iter->valid());
+  return valid;
+}
+
+// True when the current merged position would come from the parent: the
+// parent iterator is valid and is positioned strictly before key_iter
+// (or key_iter is exhausted).
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
+{
+  if (parent_iter && parent_iter->valid() &&
+      (!key_iter->valid() || key_iter->key() > parent_iter->key()))
+    return true;
+  return false;
+}
+
+int 
DBObjectMap::DBObjectMapIteratorImpl::next()
+{
+  ceph_assert(cur_iter->valid());
+  ceph_assert(valid());
+  // Advance whichever sub-iterator is current, then re-resolve the
+  // merged position.
+  cur_iter->next();
+  return adjust();
+}
+
+// Advance to the next entry that is served from the parent.  Entries
+// satisfied locally are skipped by re-seeking to the parent's position;
+// when the parent is exhausted the iterator is flagged invalid.
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
+{
+  r = next();
+  if (r < 0)
+    return r;
+  while (parent_iter && parent_iter->valid() && !on_parent()) {
+    ceph_assert(valid());
+    r = lower_bound(parent_iter->key());
+    if (r < 0)
+      return r;
+  }
+
+  if (!parent_iter || !parent_iter->valid()) {
+    invalid = true;
+  }
+  return 0;
+}
+
+// Test whether to_test falls inside a completed interval recorded in the
+// COMPLETE_PREFIX space (each key->value entry denotes [key, value) with
+// "" meaning +infinity; see the header-file comment).  Returns 1 and
+// reports the interval through *begin/*end when it does, 0 otherwise —
+// declared int but used as a bool by callers.
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+							     string *begin,
+							     string *end)
+{
+  /* This is clumsy because one cannot call prev() on end(), nor can one
+   * test for == begin().
+   */
+  complete_iter->upper_bound(to_test);
+  if (complete_iter->valid()) {
+    complete_iter->prev();
+    if (!complete_iter->valid()) {
+      complete_iter->upper_bound(to_test);
+      return false;
+    }
+  } else {
+    complete_iter->seek_to_last();
+    if (!complete_iter->valid())
+      return false;
+  }
+
+  ceph_assert(complete_iter->key() <= to_test);
+  ceph_assert(complete_iter->value().length() >= 1);
+  // The stored value carries a trailing byte; strip it to recover the
+  // interval's exclusive upper bound.
+  string _end(complete_iter->value().c_str(),
+	      complete_iter->value().length() - 1);
+  if (_end.empty() || _end > to_test) {
+    if (begin)
+      *begin = complete_iter->key();
+    if (end)
+      *end = _end;
+    return true;
+  } else {
+    complete_iter->next();
+    ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test);
+    return false;
+  }
+}
+
+/**
+ * Moves parent_iter to the next position both out of the complete_region and
+ * not equal to key_iter.  Then, we set cur_iter to parent_iter if valid and
+ * less than key_iter and key_iter otherwise.
+ */ +int DBObjectMap::DBObjectMapIteratorImpl::adjust() +{ + string begin, end; + while (parent_iter && parent_iter->valid()) { + if (in_complete_region(parent_iter->key(), &begin, &end)) { + if (end.size() == 0) { + parent_iter->seek_to_last(); + if (parent_iter->valid()) + parent_iter->next(); + } else + parent_iter->lower_bound(end); + } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { + parent_iter->next(); + } else { + break; + } + } + if (valid_parent()) { + cur_iter = parent_iter; + } else if (key_iter->valid()) { + cur_iter = key_iter; + } else { + invalid = true; + } + ceph_assert(invalid || cur_iter->valid()); + return 0; +} + + +string DBObjectMap::DBObjectMapIteratorImpl::key() +{ + return cur_iter->key(); +} + +bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() +{ + return cur_iter->value(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::status() +{ + return r; +} + +int DBObjectMap::set_keys(const ghobject_t &oid, + const map<string, bufferlist> &set, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + + t->set(user_prefix(header), set); + + return db->submit_transaction(t); +} + +int DBObjectMap::set_header(const ghobject_t &oid, + const bufferlist &bl, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + _set_header(header, bl, t); + return db->submit_transaction(t); +} + +void DBObjectMap::_set_header(Header header, const bufferlist &bl, + KeyValueDB::Transaction t) +{ + map<string, bufferlist> to_set; + to_set[USER_HEADER_KEY] = bl; + t->set(sys_prefix(header), to_set); +} + +int DBObjectMap::get_header(const 
ghobject_t &oid, + bufferlist *bl) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) { + return 0; + } + return _get_header(header, bl); +} + +int DBObjectMap::_get_header(Header header, + bufferlist *bl) +{ + map<string, bufferlist> out; + while (true) { + out.clear(); + set<string> to_get; + to_get.insert(USER_HEADER_KEY); + int r = db->get(sys_prefix(header), to_get, &out); + if (r == 0 && !out.empty()) + break; + if (r < 0) + return r; + Header current(header); + if (!current->parent) + break; + header = lookup_parent(current); + } + + if (!out.empty()) + bl->swap(out.begin()->second); + return 0; +} + +int DBObjectMap::clear(const ghobject_t &oid, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + remove_map_header(hl, oid, header, t); + ceph_assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + return db->submit_transaction(t); +} + +int DBObjectMap::_clear(Header header, + KeyValueDB::Transaction t) +{ + while (1) { + if (header->num_children) { + set_header(header, t); + break; + } + clear_header(header, t); + if (!header->parent) + break; + Header parent = lookup_parent(header); + if (!parent) { + return -EINVAL; + } + ceph_assert(parent->num_children > 0); + parent->num_children--; + header.swap(parent); + } + return 0; +} + +int DBObjectMap::copy_up_header(Header header, + KeyValueDB::Transaction t) +{ + bufferlist bl; + int r = _get_header(header, &bl); + if (r < 0) + return r; + + _set_header(header, bl, t); + return 0; +} + +int DBObjectMap::rm_keys(const ghobject_t &oid, + const set<string> &to_clear, + const SequencerPosition *spos) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + 
KeyValueDB::Transaction t = db->get_transaction(); + if (check_spos(oid, header, spos)) + return 0; + t->rmkeys(user_prefix(header), to_clear); + if (!header->parent) { + return db->submit_transaction(t); + } + + ceph_assert(state.legacy); + + { + // We only get here for legacy (v2) stores + // Copy up all keys from parent excluding to_clear + // and remove parent + // This eliminates a v2 format use of complete for this oid only + map<string, bufferlist> to_write; + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first() ; iter->valid() ; iter->next()) { + if (iter->status()) + return iter->status(); + if (!to_clear.count(iter->key())) + to_write[iter->key()] = iter->value(); + } + t->set(user_prefix(header), to_write); + } // destruct iter which has parent in_use + + copy_up_header(header, t); + Header parent = lookup_parent(header); + if (!parent) + return -EINVAL; + parent->num_children--; + _clear(parent, t); + header->parent = 0; + set_map_header(hl, oid, *header, t); + t->rmkeys_by_prefix(complete_prefix(header)); + return db->submit_transaction(t); +} + +int DBObjectMap::clear_keys_header(const ghobject_t &oid, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + + // save old attrs + KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header)); + if (!iter) + return -EINVAL; + map<string, bufferlist> attrs; + for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) + attrs.insert(make_pair(iter->key(), iter->value())); + if (iter->status()) + return iter->status(); + + // remove current header + remove_map_header(hl, oid, header, t); + ceph_assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + + // create new header + Header newheader = 
generate_new_header(oid, Header()); + set_map_header(hl, oid, *newheader, t); + if (!attrs.empty()) + t->set(xattr_prefix(newheader), attrs); + return db->submit_transaction(t); +} + +int DBObjectMap::get(const ghobject_t &oid, + bufferlist *_header, + map<string, bufferlist> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + _get_header(header, _header); + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + out->insert(make_pair(iter->key(), iter->value())); + } + return 0; +} + +int DBObjectMap::get_keys(const ghobject_t &oid, + set<string> *keys) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + keys->insert(iter->key()); + } + return 0; +} + +int DBObjectMap::scan(Header header, + const set<string> &in_keys, + set<string> *out_keys, + map<string, bufferlist> *out_values) +{ + ObjectMapIterator db_iter = _get_iterator(header); + for (set<string>::const_iterator key_iter = in_keys.begin(); + key_iter != in_keys.end(); + ++key_iter) { + db_iter->lower_bound(*key_iter); + if (db_iter->status()) + return db_iter->status(); + if (db_iter->valid() && db_iter->key() == *key_iter) { + if (out_keys) + out_keys->insert(*key_iter); + if (out_values) + out_values->insert(make_pair(db_iter->key(), db_iter->value())); + } + } + return 0; +} + +int DBObjectMap::get_values(const ghobject_t &oid, + const set<string> &keys, + map<string, bufferlist> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return scan(header, keys, 0, out); +} + +int DBObjectMap::check_keys(const ghobject_t &oid, + const set<string> &keys, + 
set<string> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return scan(header, keys, out, 0); +} + +int DBObjectMap::get_xattrs(const ghobject_t &oid, + const set<string> &to_get, + map<string, bufferlist> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return db->get(xattr_prefix(header), to_get, out); +} + +int DBObjectMap::get_all_xattrs(const ghobject_t &oid, + set<string> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header)); + if (!iter) + return -EINVAL; + for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) + out->insert(iter->key()); + return iter->status(); +} + +int DBObjectMap::set_xattrs(const ghobject_t &oid, + const map<string, bufferlist> &to_set, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + t->set(xattr_prefix(header), to_set); + return db->submit_transaction(t); +} + +int DBObjectMap::remove_xattrs(const ghobject_t &oid, + const set<string> &to_remove, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + t->rmkeys(xattr_prefix(header), to_remove); + return db->submit_transaction(t); +} + +// ONLY USED FOR TESTING +// Set version to 2 to avoid asserts +int DBObjectMap::legacy_clone(const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos) +{ + state.legacy = true; + + if (oid == target) + return 0; + + MapHeaderLock 
_l1(this, std::min(oid, target)); + MapHeaderLock _l2(this, std::max(oid, target)); + MapHeaderLock *lsource, *ltarget; + if (oid > target) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, target); + if (destination) { + if (check_spos(target, destination, spos)) + return 0; + destination->num_children--; + remove_map_header(*ltarget, target, destination, t); + _clear(destination, t); + } + } + + Header parent = lookup_map_header(*lsource, oid); + if (!parent) + return db->submit_transaction(t); + + Header source = generate_new_header(oid, parent); + Header destination = generate_new_header(target, parent); + if (spos) + destination->spos = *spos; + + parent->num_children = 2; + set_header(parent, t); + set_map_header(*lsource, oid, *source, t); + set_map_header(*ltarget, target, *destination, t); + + map<string, bufferlist> to_set; + KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent)); + for (xattr_iter->seek_to_first(); + xattr_iter->valid(); + xattr_iter->next()) + to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value())); + t->set(xattr_prefix(source), to_set); + t->set(xattr_prefix(destination), to_set); + t->rmkeys_by_prefix(xattr_prefix(parent)); + return db->submit_transaction(t); +} + +int DBObjectMap::clone(const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos) +{ + if (oid == target) + return 0; + + MapHeaderLock _l1(this, std::min(oid, target)); + MapHeaderLock _l2(this, std::max(oid, target)); + MapHeaderLock *lsource, *ltarget; + if (oid > target) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, target); + if (destination) { + if (check_spos(target, destination, spos)) + return 0; + 
destination->num_children--; + remove_map_header(*ltarget, target, destination, t); + _clear(destination, t); + } + } + + Header source = lookup_map_header(*lsource, oid); + if (!source) + return db->submit_transaction(t); + + Header destination = generate_new_header(target, Header()); + if (spos) + destination->spos = *spos; + + set_map_header(*ltarget, target, *destination, t); + + bufferlist bl; + int r = _get_header(source, &bl); + if (r < 0) + return r; + _set_header(destination, bl, t); + + map<string, bufferlist> to_set; + KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source)); + for (xattr_iter->seek_to_first(); + xattr_iter->valid(); + xattr_iter->next()) + to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value())); + t->set(xattr_prefix(destination), to_set); + + map<string, bufferlist> to_write; + ObjectMapIterator iter = _get_iterator(source); + for (iter->seek_to_first() ; iter->valid() ; iter->next()) { + if (iter->status()) + return iter->status(); + to_write[iter->key()] = iter->value(); + } + t->set(user_prefix(destination), to_write); + + return db->submit_transaction(t); +} + +int DBObjectMap::upgrade_to_v2() +{ + dout(1) << __func__ << " start" << dendl; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + iter->seek_to_first(); + while (iter->valid()) { + unsigned count = 0; + KeyValueDB::Transaction t = db->get_transaction(); + set<string> remove; + map<string, bufferlist> add; + for (; + iter->valid() && count < 300; + iter->next()) { + dout(20) << __func__ << " key is " << iter->key() << dendl; + int r = is_buggy_ghobject_key_v1(cct, iter->key()); + if (r < 0) { + derr << __func__ << " bad key '" << iter->key() << "'" << dendl; + return r; + } + if (!r) { + dout(20) << __func__ << " " << iter->key() << " ok" << dendl; + continue; + } + + // decode header to get oid + _Header hdr; + bufferlist bl = iter->value(); + auto bliter = bl.cbegin(); + hdr.decode(bliter); + + string newkey(ghobject_key(hdr.oid)); + 
dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl; + add[newkey] = iter->value(); + remove.insert(iter->key()); + ++count; + } + + if (!remove.empty()) { + dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl; + t->rmkeys(HOBJECT_TO_SEQ, remove); + t->set(HOBJECT_TO_SEQ, add); + int r = db->submit_transaction(t); + if (r < 0) + return r; + } + } + + state.v = 2; + + set_state(); + return 0; +} + +void DBObjectMap::set_state() +{ + Mutex::Locker l(header_lock); + KeyValueDB::Transaction t = db->get_transaction(); + write_state(t); + int ret = db->submit_transaction_sync(t); + ceph_assert(ret == 0); + dout(1) << __func__ << " done" << dendl; + return; +} + +int DBObjectMap::get_state() +{ + map<string, bufferlist> result; + set<string> to_get; + to_get.insert(GLOBAL_STATE_KEY); + int r = db->get(SYS_PREFIX, to_get, &result); + if (r < 0) + return r; + if (!result.empty()) { + auto bliter = result.begin()->second.cbegin(); + state.decode(bliter); + } else { + // New store + state.v = State::CUR_VERSION; + state.seq = 1; + state.legacy = false; + } + return 0; +} + +int DBObjectMap::init(bool do_upgrade) +{ + int ret = get_state(); + if (ret < 0) + return ret; + if (state.v < 1) { + dout(1) << "DBObjectMap is *very* old; upgrade to an older version first" + << dendl; + return -ENOTSUP; + } + if (state.v < 2) { // Needs upgrade + if (!do_upgrade) { + dout(1) << "DOBjbectMap requires an upgrade," + << " set filestore_update_to" + << dendl; + return -ENOTSUP; + } else { + int r = upgrade_to_v2(); + if (r < 0) + return r; + } + } + ostringstream ss; + int errors = check(ss, true); + if (errors) { + derr << ss.str() << dendl; + if (errors > 0) + return -EINVAL; + } + dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl; + return 0; +} + +int DBObjectMap::sync(const ghobject_t *oid, + const SequencerPosition *spos) { + KeyValueDB::Transaction t = db->get_transaction(); + if (oid) { + ceph_assert(spos); + MapHeaderLock 
hl(this, *oid);
+    Header header = lookup_map_header(hl, *oid);
+    if (header) {
+      dout(10) << "oid: " << *oid << " setting spos to "
+	       << *spos << dendl;
+      header->spos = *spos;
+      set_map_header(hl, *oid, *header, t);
+    }
+    /* It may appear that this and the identical portion of the else
+     * block can combined below, but in this block, the transaction
+     * must be submitted under *both* the MapHeaderLock and the full
+     * header_lock.
+     *
+     * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+     */
+    Mutex::Locker l(header_lock);
+    write_state(t);
+    return db->submit_transaction_sync(t);
+  } else {
+    Mutex::Locker l(header_lock);
+    write_state(t);
+    return db->submit_transaction_sync(t);
+  }
+}
+
+// Serialize 'state' into SYS_PREFIX/GLOBAL_STATE_KEY.  When a transaction
+// is supplied, the write is queued onto it and the caller submits;
+// otherwise a private transaction is created and submitted here.
+// Caller must hold header_lock (asserted).
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+  ceph_assert(header_lock.is_locked_by_me());
+  dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+  KeyValueDB::Transaction t = _t ? _t : db->get_transaction();
+  bufferlist bl;
+  state.encode(bl);
+  map<string, bufferlist> to_write;
+  to_write[GLOBAL_STATE_KEY] = bl;
+  t->set(SYS_PREFIX, to_write);
+  return _t ? 
0 : db->submit_transaction(t); +} + + +DBObjectMap::Header DBObjectMap::_lookup_map_header( + const MapHeaderLock &l, + const ghobject_t &oid) +{ + ceph_assert(l.get_locked() == oid); + + _Header *header = new _Header(); + { + Mutex::Locker l(cache_lock); + if (caches.lookup(oid, header)) { + ceph_assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + return Header(header, RemoveOnDelete(this)); + } + } + + bufferlist out; + int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out); + if (r < 0 || out.length()==0) { + delete header; + return Header(); + } + + Header ret(header, RemoveOnDelete(this)); + auto iter = out.cbegin(); + ret->decode(iter); + { + Mutex::Locker l(cache_lock); + caches.add(oid, *ret); + } + + ceph_assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + return ret; +} + +DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid, + Header parent) +{ + Header header = Header(new _Header(), RemoveOnDelete(this)); + header->seq = state.seq++; + if (parent) { + header->parent = parent->seq; + header->spos = parent->spos; + } + header->num_children = 1; + header->oid = oid; + ceph_assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + + write_state(); + return header; +} + +DBObjectMap::Header DBObjectMap::lookup_parent(Header input) +{ + Mutex::Locker l(header_lock); + while (in_use.count(input->parent)) + header_cond.Wait(header_lock); + map<string, bufferlist> out; + set<string> keys; + keys.insert(HEADER_KEY); + + dout(20) << "lookup_parent: parent " << input->parent + << " for seq " << input->seq << dendl; + int r = db->get(sys_parent_prefix(input), keys, &out); + if (r < 0) { + ceph_abort(); + return Header(); + } + if (out.empty()) { + ceph_abort(); + return Header(); + } + + Header header = Header(new _Header(), RemoveOnDelete(this)); + auto iter = out.begin()->second.cbegin(); + header->decode(iter); + ceph_assert(header->seq == input->parent); + dout(20) << "lookup_parent: parent seq 
is " << header->seq << " with parent " + << header->parent << dendl; + in_use.insert(header->seq); + return header; +} + +DBObjectMap::Header DBObjectMap::lookup_create_map_header( + const MapHeaderLock &hl, + const ghobject_t &oid, + KeyValueDB::Transaction t) +{ + Mutex::Locker l(header_lock); + Header header = _lookup_map_header(hl, oid); + if (!header) { + header = _generate_new_header(oid, Header()); + set_map_header(hl, oid, *header, t); + } + return header; +} + +void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << "clear_header: clearing seq " << header->seq << dendl; + t->rmkeys_by_prefix(user_prefix(header)); + t->rmkeys_by_prefix(sys_prefix(header)); + if (state.legacy) + t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0 + t->rmkeys_by_prefix(xattr_prefix(header)); + set<string> keys; + keys.insert(header_key(header->seq)); + t->rmkeys(USER_PREFIX, keys); +} + +void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << "set_header: setting seq " << header->seq << dendl; + map<string, bufferlist> to_write; + header->encode(to_write[HEADER_KEY]); + t->set(sys_prefix(header), to_write); +} + +void DBObjectMap::remove_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + Header header, + KeyValueDB::Transaction t) +{ + ceph_assert(l.get_locked() == oid); + dout(20) << "remove_map_header: removing " << header->seq + << " oid " << oid << dendl; + set<string> to_remove; + to_remove.insert(map_header_key(oid)); + t->rmkeys(HOBJECT_TO_SEQ, to_remove); + { + Mutex::Locker l(cache_lock); + caches.clear(oid); + } +} + +void DBObjectMap::set_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, _Header header, + KeyValueDB::Transaction t) +{ + ceph_assert(l.get_locked() == oid); + dout(20) << "set_map_header: setting " << header.seq + << " oid " << oid << " parent seq " + << header.parent << dendl; + map<string, bufferlist> to_set; + 
header.encode(to_set[map_header_key(oid)]); + t->set(HOBJECT_TO_SEQ, to_set); + { + Mutex::Locker l(cache_lock); + caches.add(oid, header); + } +} + +bool DBObjectMap::check_spos(const ghobject_t &oid, + Header header, + const SequencerPosition *spos) +{ + if (!spos || *spos > header->spos) { + stringstream out; + if (spos) + dout(10) << "oid: " << oid << " not skipping op, *spos " + << *spos << dendl; + else + dout(10) << "oid: " << oid << " not skipping op, *spos " + << "empty" << dendl; + dout(10) << " > header.spos " << header->spos << dendl; + return false; + } else { + dout(10) << "oid: " << oid << " skipping op, *spos " << *spos + << " <= header.spos " << header->spos << dendl; + return true; + } +} + +int DBObjectMap::list_objects(vector<ghobject_t> *out) +{ + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + auto bliter = bl.cbegin(); + _Header header; + header.decode(bliter); + out->push_back(header.oid); + } + return 0; +} + +int DBObjectMap::list_object_headers(vector<_Header> *out) +{ + int error = 0; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + auto bliter = bl.cbegin(); + _Header header; + header.decode(bliter); + out->push_back(header); + while (header.parent) { + set<string> to_get; + map<string, bufferlist> got; + to_get.insert(HEADER_KEY); + db->get(sys_parent_prefix(header), to_get, &got); + if (got.empty()) { + dout(0) << "Missing: seq " << header.parent << dendl; + error = -ENOENT; + break; + } else { + bl = got.begin()->second; + auto bliter = bl.cbegin(); + header.decode(bliter); + out->push_back(header); + } + } + } + return error; +} + +ostream& operator<<(ostream& out, const DBObjectMap::_Header& h) +{ + out << "seq=" << h.seq << " parent=" << h.parent + << " num_children=" << h.num_children + << " ghobject=" << h.oid; 
+ return out; +} + +int DBObjectMap::rename(const ghobject_t &from, + const ghobject_t &to, + const SequencerPosition *spos) +{ + if (from == to) + return 0; + + MapHeaderLock _l1(this, std::min(from, to)); + MapHeaderLock _l2(this, std::max(from, to)); + MapHeaderLock *lsource, *ltarget; + if (from > to) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, to); + if (destination) { + if (check_spos(to, destination, spos)) + return 0; + destination->num_children--; + remove_map_header(*ltarget, to, destination, t); + _clear(destination, t); + } + } + + Header hdr = lookup_map_header(*lsource, from); + if (!hdr) + return db->submit_transaction(t); + + remove_map_header(*lsource, from, hdr, t); + hdr->oid = to; + set_map_header(*ltarget, to, *hdr, t); + + return db->submit_transaction(t); +} diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h new file mode 100644 index 00000000..e288df83 --- /dev/null +++ b/src/os/filestore/DBObjectMap.h @@ -0,0 +1,585 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#ifndef DBOBJECTMAP_DB_H +#define DBOBJECTMAP_DB_H + +#include "include/buffer_fwd.h" +#include <set> +#include <map> +#include <string> + +#include <vector> +#include <boost/scoped_ptr.hpp> + +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "osd/osd_types.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/simple_cache.hpp" +#include <boost/optional/optional_io.hpp> + +#include "SequencerPosition.h" + +/** + * DBObjectMap: Implements ObjectMap in terms of KeyValueDB + * + * Prefix space structure: + * + * @see complete_prefix + * @see user_prefix + * @see sys_prefix + * + * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and + * corresponding omap header + * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number + * 
 *                 @see State
 *                 @see write_state
 *                 @see init
 *                 @see generate_new_header
 * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
 *              : key->value for header->seq
 * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
 * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
 * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
 *              : USER_HEADER_KEY - omap header for header->seq
 *              : HEADER_KEY - encoding of header for header->seq
 *
 * For each node (represented by a header), we
 * store three mappings: the key mapping, the complete mapping, and the parent.
 * The complete mapping (COMPLETE_PREFIX space) is key->key.  Each x->y entry in
 * this mapping indicates that the key mapping contains all entries on [x,y).
 * Note, max string is represented by "", so ""->"" indicates that the parent
 * is unnecessary (@see rm_keys).  When looking up a key not contained in
 * the complete set, we have to check the parent if we don't find it in the
 * key set.  During rm_keys, we copy keys from the parent and update the
 * complete set to reflect the change @see rm_keys.
  /**
   * Takes the map_header_in_use entry in constructor, releases in
   * destructor
   *
   * Scoped per-object exclusion: while one MapHeaderLock holds an oid,
   * any other thread constructing a MapHeaderLock for the same oid blocks
   * on map_header_cond until this one is destroyed (or swapped away).
   */
  class MapHeaderLock {
    DBObjectMap *db;
    // oid currently held; empty when constructed with the db-only ctor
    boost::optional<ghobject_t> locked;

    // non-copyable: declared but never defined
    MapHeaderLock(const MapHeaderLock &);
    MapHeaderLock &operator=(const MapHeaderLock &);
  public:
    // Constructs an empty (non-locking) lock; a real lock can be moved in
    // later via swap().
    explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
    // Blocks until no other MapHeaderLock holds oid, then marks it in use.
    MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
      Mutex::Locker l(db->header_lock);
      while (db->map_header_in_use.count(*locked))
	db->map_header_cond.Wait(db->header_lock);
      db->map_header_in_use.insert(*locked);
    }

    // Precondition: this lock actually holds an oid.
    const ghobject_t &get_locked() const {
      ceph_assert(locked);
      return *locked;
    }

    // Exchange held oids with another lock on the same DBObjectMap.
    void swap(MapHeaderLock &o) {
      ceph_assert(db == o.db);

      // centos6's boost optional doesn't seem to have swap :(
      boost::optional<ghobject_t> _locked = o.locked;
      o.locked = locked;
      locked = _locked;
    }

    // Release the oid (if any) and wake one waiter.
    ~MapHeaderLock() {
      if (locked) {
	Mutex::Locker l(db->header_lock);
	ceph_assert(db->map_header_in_use.count(*locked));
	db->map_header_cond.Signal();
	db->map_header_in_use.erase(*locked);
      }
    }
  };
+ const ghobject_t &oid, + const SequencerPosition *spos=0 + ) override; + + int clear_keys_header( + const ghobject_t &oid, + const SequencerPosition *spos=0 + ) override; + + int rm_keys( + const ghobject_t &oid, + const set<string> &to_clear, + const SequencerPosition *spos=0 + ) override; + + int get( + const ghobject_t &oid, + bufferlist *header, + map<string, bufferlist> *out + ) override; + + int get_keys( + const ghobject_t &oid, + set<string> *keys + ) override; + + int get_values( + const ghobject_t &oid, + const set<string> &keys, + map<string, bufferlist> *out + ) override; + + int check_keys( + const ghobject_t &oid, + const set<string> &keys, + set<string> *out + ) override; + + int get_xattrs( + const ghobject_t &oid, + const set<string> &to_get, + map<string, bufferlist> *out + ) override; + + int get_all_xattrs( + const ghobject_t &oid, + set<string> *out + ) override; + + int set_xattrs( + const ghobject_t &oid, + const map<string, bufferlist> &to_set, + const SequencerPosition *spos=0 + ) override; + + int remove_xattrs( + const ghobject_t &oid, + const set<string> &to_remove, + const SequencerPosition *spos=0 + ) override; + + int clone( + const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos=0 + ) override; + + int rename( + const ghobject_t &from, + const ghobject_t &to, + const SequencerPosition *spos=0 + ); + + int legacy_clone( + const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos=0 + ); + + /// Read initial state from backing store + int get_state(); + /// Write current state settings to DB + void set_state(); + /// Read initial state and upgrade or initialize state + int init(bool upgrade = false); + + /// Upgrade store to current version + int upgrade_to_v2(); + + /// Consistency check, debug, there must be no parallel writes + int check(std::ostream &out, bool repair = false, bool force = false) override; + + /// Ensure that all previous operations are durable + int sync(const 
  /// persistent state for store @see generate_header
  ///
  /// Encoded under SYS_PREFIX/GLOBAL_STATE_KEY.  The encode/decode member
  /// order is the on-disk wire format and must not be reordered.
  struct State {
    static const __u8 CUR_VERSION = 3;
    // store format version; 0 for stores written before versioning existed
    __u8 v;
    // next header sequence number to hand out
    uint64_t seq;
    // legacy is false when complete regions never used
    bool legacy;
    State() : v(0), seq(1), legacy(false) {}
    explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}

    void encode(bufferlist &bl) const {
      ENCODE_START(3, 1, bl);
      encode(v, bl);
      encode(seq, bl);
      encode(legacy, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::const_iterator &bl) {
      DECODE_START(3, bl);
      // struct_v 1 predates the v field; default it to 0.
      if (struct_v >= 2)
	decode(v, bl);
      else
	v = 0;
      decode(seq, bl);
      // struct_v < 3 predates the legacy flag; default it to false.
      if (struct_v >= 3)
	decode(legacy, bl);
      else
	legacy = false;
      DECODE_FINISH(bl);
    }

    void dump(Formatter *f) const {
      f->dump_unsigned("v", v);
      f->dump_unsigned("seq", seq);
      f->dump_bool("legacy", legacy);
    }

    static void generate_test_instances(list<State*> &o) {
      o.push_back(new State(0));
      o.push_back(new State(20));
    }
  } state;
void encode(bufferlist &bl) const { + coll_t unused; + ENCODE_START(2, 1, bl); + encode(seq, bl); + encode(parent, bl); + encode(num_children, bl); + encode(unused, bl); + encode(oid, bl); + encode(spos, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + coll_t unused; + DECODE_START(2, bl); + decode(seq, bl); + decode(parent, bl); + decode(num_children, bl); + decode(unused, bl); + decode(oid, bl); + if (struct_v >= 2) + decode(spos, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("parent", parent); + f->dump_unsigned("num_children", num_children); + f->dump_stream("oid") << oid; + } + + static void generate_test_instances(list<_Header*> &o) { + o.push_back(new _Header); + o.push_back(new _Header); + o.back()->parent = 20; + o.back()->seq = 30; + } + + size_t length() { + return sizeof(_Header); + } + + _Header() : seq(0), parent(0), num_children(1) {} + }; + + /// String munging (public for testing) + static string ghobject_key(const ghobject_t &oid); + static string ghobject_key_v0(coll_t c, const ghobject_t &oid); + static int is_buggy_ghobject_key_v1(CephContext* cct, + const string &in); +private: + /// Implicit lock on Header->seq + typedef std::shared_ptr<_Header> Header; + Mutex cache_lock; + SimpleLRU<ghobject_t, _Header> caches; + + string map_header_key(const ghobject_t &oid); + string header_key(uint64_t seq); + string complete_prefix(Header header); + string user_prefix(Header header); + string sys_prefix(Header header); + string xattr_prefix(Header header); + string sys_parent_prefix(_Header header); + string sys_parent_prefix(Header header) { + return sys_parent_prefix(*header); + } + + class EmptyIteratorImpl : public ObjectMapIteratorImpl { + public: + int seek_to_first() override { return 0; } + int seek_to_last() { return 0; } + int upper_bound(const string &after) override { return 0; } + int lower_bound(const string &to) override { return 0; } + 
  /// Iterator
  ///
  /// Merges the object's own key space (key_iter) with its parent's key
  /// space (parent_iter), masking parent keys that fall inside ranges
  /// recorded in the complete mapping (complete_iter).  cur_iter always
  /// points at whichever underlying iterator currently holds the smaller
  /// key.
  class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
  public:
    DBObjectMap *map;

    /// NOTE: implicit lock hlock->get_locked() when returned out of the class
    MapHeaderLock hlock;
    /// NOTE: implicit lock on header->seq AND for all ancestors
    Header header;

    /// parent_iter == NULL iff no parent
    std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
    KeyValueDB::Iterator key_iter;
    KeyValueDB::Iterator complete_iter;

    /// cur_iter points to currently valid iterator
    std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
    // sticky error code from the last failed operation; 0 when healthy
    int r;

    /// init() called, key_iter, complete_iter, parent_iter filled in
    bool ready;
    /// past end
    bool invalid;

    DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
      map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
    int seek_to_first() override;
    int seek_to_last();
    int upper_bound(const string &after) override;
    int lower_bound(const string &to) override;
    bool valid() override;
    int next() override;
    string key() override;
    bufferlist value() override;
    int status() override;

    // true when the current position comes from the parent's key space
    bool on_parent() {
      return cur_iter == parent_iter;
    }

    /// skips to next valid parent entry
    int next_parent();

    /// first parent() >= to
    int lower_bound_parent(const string &to);

    /**
     * Tests whether to_test is in complete region
     *
     * postcondition: complete_iter will be max s.t. complete_iter->value >
     * to_test
     */
    int in_complete_region(const string &to_test, ///< [in] key to test
			   string *begin,         ///< [out] beginning of region
			   string *end            ///< [out] end of region
      ); ///< @returns true if to_test is in the complete region, else false

  private:
    int init();
    bool valid_parent();
    int adjust();
  };
  /**
   * Removes header seq lock and possibly object lock
   * once Header is out of scope
   * @see lookup_parent
   * @see generate_new_header
   *
   * Custom deleter installed on the shared_ptr<_Header> handles this class
   * gives out: when the last reference drops, release the per-seq in_use
   * reservation, wake one waiter, and free the header.
   */
  class RemoveOnDelete {
  public:
    DBObjectMap *db;
    explicit RemoveOnDelete(DBObjectMap *db) :
      db(db) {}
    void operator() (_Header *header) {
      Mutex::Locker l(db->header_lock);
      // the seq must have been reserved when the Header was handed out
      ceph_assert(db->in_use.count(header->seq));
      db->in_use.erase(header->seq);
      db->header_cond.Signal();
      delete header;
    }
  };
  friend class RemoveOnDelete;
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013 Inktank Storage, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_FDCACHE_H
#define CEPH_FDCACHE_H

#include <memory>
#include <errno.h>
#include <cstdio>
#include "common/config_obs.h"
#include "common/hobject.h"
#include "common/Mutex.h"
#include "common/Cond.h"
#include "common/shared_cache.hpp"
#include "include/compat.h"
#include "include/intarith.h"

/**
 * FD Cache
 *
 * LRU cache of open file descriptors keyed by object id, sharded by object
 * hash to reduce lock contention.  The total cache size is split evenly
 * across the shards and tracks the filestore_fd_cache_size config option.
 */
class FDCache : public md_config_obs_t {
public:
  /**
   * FD
   *
   * Wrapper for an fd.  Destructor closes the fd.
   */
  class FD {
  public:
    const int fd;
    explicit FD(int _fd) : fd(_fd) {
      ceph_assert(_fd >= 0);
    }
    int operator*() const {
      return fd;
    }
    ~FD() {
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  };

private:
  CephContext *cct;
  // shard count is fixed at construction; the config observer only resizes
  // per-shard capacity, it cannot change the number of shards
  const int registry_shards;
  SharedLRU<ghobject_t, FD> *registry;

public:
  explicit FDCache(CephContext *cct) : cct(cct),
  registry_shards(std::max<int64_t>(cct->_conf->filestore_fd_cache_shards, 1)) {
    ceph_assert(cct);
    cct->_conf.add_observer(this);
    registry = new SharedLRU<ghobject_t, FD>[registry_shards];
    for (int i = 0; i < registry_shards; ++i) {
      registry[i].set_cct(cct);
      // divide the configured cache size across shards, at least 1 each
      registry[i].set_size(
	std::max<int64_t>((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
    }
  }
  ~FDCache() override {
    cct->_conf.remove_observer(this);
    delete[] registry;
  }
  typedef std::shared_ptr<FD> FDRef;

  // Return the cached FD for hoid, or an empty FDRef if not cached.
  FDRef lookup(const ghobject_t &hoid) {
    int registry_id = hoid.hobj.get_hash() % registry_shards;
    return registry[registry_id].lookup(hoid);
  }

  // Insert fd for hoid (taking ownership); *existed reports whether an
  // entry was already present (in which case the existing one is returned).
  FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
    int registry_id = hoid.hobj.get_hash() % registry_shards;
    return registry[registry_id].add(hoid, new FD(fd), existed);
  }

  /// clear cached fd for hoid, subsequent lookups will get an empty FD
  void clear(const ghobject_t &hoid) {
    int registry_id = hoid.hobj.get_hash() % registry_shards;
    registry[registry_id].purge(hoid);
  }

  /// md_config_obs_t
  const char** get_tracked_conf_keys() const override {
    static const char* KEYS[] = {
      "filestore_fd_cache_size",
      NULL
    };
    return KEYS;
  }
  void handle_conf_change(const ConfigProxy& conf,
			  const std::set<std::string> &changed) override {
    if (changed.count("filestore_fd_cache_size")) {
      // re-split the new total size across the fixed shard count
      for (int i = 0; i < registry_shards; ++i)
	registry[i].set_size(
	  std::max<int64_t>((conf->filestore_fd_cache_size / registry_shards), 1));
    }
  }

};
typedef FDCache::FDRef FDRef;

#endif
#include "acconfig.h"

#include "common/debug.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "FileJournal.h"
#include "include/color.h"
#include "common/perf_counters.h"
#include "FileStore.h"

#include "include/compat.h"

#include <fcntl.h>
#include <limits.h>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>

#include "common/blkdev.h"
#if defined(__linux__)
#include "common/linux_version.h"
#endif

#if defined(__FreeBSD__)
#define O_DSYNC O_SYNC
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_journal
#undef dout_prefix
#define dout_prefix *_dout << "journal "

const static int64_t ONE_MEG(1 << 20);
const static int CEPH_DIRECTIO_ALIGNMENT(4096);


// Open (or create) the journal backing file/device and size it.
//
// @param forwrite  open read-write (with O_DIRECT|O_DSYNC when directio is
//                  enabled) rather than read-only
// @param create    pass O_CREAT, and allow a regular file to be extended
// @return 0 on success, negative errno on failure; on success fd, max_size
//         and block_size are set and max_size is truncated to a multiple of
//         block_size
int FileJournal::_open(bool forwrite, bool create)
{
  int flags, ret;

  if (forwrite) {
    flags = O_RDWR;
    if (directio)
      flags |= O_DIRECT | O_DSYNC;
  } else {
    flags = O_RDONLY;
  }
  if (create)
    flags |= O_CREAT;

  // Re-opening: close any previous fd first (failure is only logged).
  if (fd >= 0) {
    if (TEMP_FAILURE_RETRY(::close(fd))) {
      int err = errno;
      derr << "FileJournal::_open: error closing old fd: "
	   << cpp_strerror(err) << dendl;
    }
  }
  fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags|O_CLOEXEC, 0644));
  if (fd < 0) {
    int err = errno;
    dout(2) << "FileJournal::_open unable to open journal "
	    << fn << ": " << cpp_strerror(err) << dendl;
    return -err;
  }

  struct stat st;
  ret = ::fstat(fd, &st);
  if (ret) {
    ret = errno;
    derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
    ret = -ret;
    goto out_fd;
  }

  // Block devices and regular files are sized differently; anything else
  // is rejected.
  if (S_ISBLK(st.st_mode)) {
    ret = _open_block_device();
  } else if (S_ISREG(st.st_mode)) {
    // aio is only supported on block devices unless explicitly forced.
    if (aio && !force_aio) {
      derr << "FileJournal::_open: disabling aio for non-block journal.  Use "
	   << "journal_force_aio to force use of aio anyway" << dendl;
      aio = false;
    }
    ret = _open_file(st.st_size, st.st_blksize, create);
  } else {
    derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
	 << dendl;
    ret = -EINVAL;
  }

  if (ret)
    goto out_fd;

#ifdef HAVE_LIBAIO
  if (aio) {
    aio_ctx = 0;
    ret = io_setup(128, &aio_ctx);
    if (ret < 0) {
      switch (ret) {
	// Contrary to naive expectations -EAGIAN means ...
	case -EAGAIN:
	  derr << "FileJournal::_open: user's limit of aio events exceeded. "
	       << "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
	  break;
	default:
	  derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
	  break;
      }
      goto out_fd;
    }
  }
#endif

  /* We really want max_size to be a multiple of block_size. */
  max_size -= max_size % block_size;

  dout(1) << "_open " << fn << " fd " << fd
	  << ": " << max_size
	  << " bytes, block size " << block_size
	  << " bytes, directio = " << directio
	  << ", aio = " << aio
	  << dendl;
  return 0;

 out_fd:
  // common error path: close and invalidate the fd we just opened
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  fd = -1;
  return ret;
}
" + << "We'll use the entire block device (size: " << bdev_sz << ")" + << dendl; + max_size = bdev_sz; + + block_size = cct->_conf->journal_block_size; + + if (cct->_conf->journal_discard) { + discard = blkdev.support_discard(); + dout(10) << fn << " support discard: " << (int)discard << dendl; + } + + return 0; +} + +int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, + bool create) +{ + int ret; + int64_t conf_journal_sz(cct->_conf->osd_journal_size); + conf_journal_sz <<= 20; + + if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) { + derr << "I'm sorry, I don't know how large of a journal to create." + << "Please specify a block device to use as the journal OR " + << "set osd_journal_size in your ceph.conf" << dendl; + return -EINVAL; + } + + if (create && (oldsize < conf_journal_sz)) { + uint64_t newsize(conf_journal_sz); + dout(10) << __func__ << " _open extending to " << newsize << " bytes" << dendl; + ret = ::ftruncate(fd, newsize); + if (ret < 0) { + int err = errno; + derr << "FileJournal::_open_file : unable to extend journal to " + << newsize << " bytes: " << cpp_strerror(err) << dendl; + return -err; + } + ret = ceph_posix_fallocate(fd, 0, newsize); + if (ret) { + derr << "FileJournal::_open_file : unable to preallocation journal to " + << newsize << " bytes: " << cpp_strerror(ret) << dendl; + return -ret; + } + max_size = newsize; + } + else { + max_size = oldsize; + } + block_size = cct->_conf->journal_block_size; + + if (create && cct->_conf->journal_zero_on_create) { + derr << "FileJournal::_open_file : zeroing journal" << dendl; + uint64_t write_size = 1 << 20; + char *buf; + ret = ::posix_memalign((void **)&buf, block_size, write_size); + if (ret != 0) { + return -ret; + } + memset(static_cast<void*>(buf), 0, write_size); + uint64_t i = 0; + for (; (i + write_size) <= (uint64_t)max_size; i += write_size) { + ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i); + if (ret < 0) { + free(buf); + return -errno; + } + } + 
if (i < (uint64_t)max_size) { + ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i); + if (ret < 0) { + free(buf); + return -errno; + } + } + free(buf); + } + + + dout(10) << "_open journal is not a block device, NOT checking disk " + << "write cache on '" << fn << "'" << dendl; + + return 0; +} + +// This can not be used on an active journal +int FileJournal::check() +{ + int ret; + + ceph_assert(fd == -1); + ret = _open(false, false); + if (ret) + return ret; + + ret = read_header(&header); + if (ret < 0) + goto done; + + if (header.fsid != fsid) { + derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid + << ", invalid (someone else's?) journal" << dendl; + ret = -EINVAL; + goto done; + } + + dout(1) << "check: header looks ok" << dendl; + ret = 0; + + done: + close(); + return ret; +} + + +int FileJournal::create() +{ + void *buf = 0; + int64_t needed_space; + int ret; + buffer::ptr bp; + dout(2) << "create " << fn << " fsid " << fsid << dendl; + + ret = _open(true, true); + if (ret) + goto done; + + // write empty header + header = header_t(); + header.flags = header_t::FLAG_CRC; // enable crcs on any new journal. + header.fsid = fsid; + header.max_size = max_size; + header.block_size = block_size; + if (cct->_conf->journal_block_align || directio) + header.alignment = block_size; + else + header.alignment = 16; // at least stay word aligned on 64bit machines... + + header.start = get_top(); + header.start_seq = 0; + + print_header(header); + + // static zeroed buffer for alignment padding + delete [] zero_buf; + zero_buf = new char[header.alignment]; + memset(zero_buf, 0, header.alignment); + + bp = prepare_header(); + if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) { + ret = -errno; + derr << "FileJournal::create : create write header error " + << cpp_strerror(ret) << dendl; + goto close_fd; + } + + // zero first little bit, too. 
+ ret = posix_memalign(&buf, block_size, block_size); + if (ret) { + ret = -ret; + derr << "FileJournal::create: failed to allocate " << block_size + << " bytes of memory: " << cpp_strerror(ret) << dendl; + goto close_fd; + } + memset(buf, 0, block_size); + if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) { + ret = -errno; + derr << "FileJournal::create: error zeroing first " << block_size + << " bytes " << cpp_strerror(ret) << dendl; + goto free_buf; + } + + needed_space = cct->_conf->osd_max_write_size << 20; + needed_space += (2 * sizeof(entry_header_t)) + get_top(); + if (header.max_size - header.start < needed_space) { + derr << "FileJournal::create: OSD journal is not large enough to hold " + << "osd_max_write_size bytes!" << dendl; + ret = -ENOSPC; + goto free_buf; + } + + dout(2) << "create done" << dendl; + ret = 0; + +free_buf: + free(buf); + buf = 0; +close_fd: + if (TEMP_FAILURE_RETRY(::close(fd)) < 0) { + ret = -errno; + derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret) + << dendl; + } +done: + fd = -1; + return ret; +} + +// This can not be used on an active journal +int FileJournal::peek_fsid(uuid_d& fsid) +{ + ceph_assert(fd == -1); + int r = _open(false, false); + if (r) + return r; + r = read_header(&header); + if (r < 0) + goto out; + fsid = header.fsid; +out: + close(); + return r; +} + +int FileJournal::open(uint64_t fs_op_seq) +{ + dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl; + + uint64_t next_seq = fs_op_seq + 1; + uint64_t seq = -1; + + int err = _open(false); + if (err) + return err; + + // assume writeable, unless... + read_pos = 0; + write_pos = get_top(); + + // read header? 
+ err = read_header(&header); + if (err < 0) + goto out; + + // static zeroed buffer for alignment padding + delete [] zero_buf; + zero_buf = new char[header.alignment]; + memset(zero_buf, 0, header.alignment); + + dout(10) << "open header.fsid = " << header.fsid + //<< " vs expected fsid = " << fsid + << dendl; + if (header.fsid != fsid) { + derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid + << ", invalid (someone else's?) journal" << dendl; + err = -EINVAL; + goto out; + } + if (header.max_size > max_size) { + dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl; + err = -EINVAL; + goto out; + } + if (header.block_size != block_size) { + dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl; + err = -EINVAL; + goto out; + } + if (header.max_size % header.block_size) { + dout(2) << "open journal max size " << header.max_size + << " not a multiple of block size " << header.block_size << dendl; + err = -EINVAL; + goto out; + } + if (header.alignment != block_size && directio) { + dout(0) << "open journal alignment " << header.alignment << " does not match block size " + << block_size << " (required for direct_io journal mode)" << dendl; + err = -EINVAL; + goto out; + } + if ((header.alignment % CEPH_DIRECTIO_ALIGNMENT) && directio) { + dout(0) << "open journal alignment " << header.alignment + << " is not multiple of minimum directio alignment " + << CEPH_DIRECTIO_ALIGNMENT << " (required for direct_io journal mode)" + << dendl; + err = -EINVAL; + goto out; + } + + // looks like a valid header. + write_pos = 0; // not writeable yet + + journaled_seq = header.committed_up_to; + + // find next entry + read_pos = header.start; + seq = header.start_seq; + + while (1) { + bufferlist bl; + off64_t old_pos = read_pos; + if (!read_entry(bl, seq)) { + dout(10) << "open reached end of journal." 
<< dendl; + break; + } + if (seq > next_seq) { + dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq + << ", ignoring journal contents" + << dendl; + read_pos = -1; + last_committed_seq = 0; + return 0; + } + if (seq == next_seq) { + dout(10) << "open reached seq " << seq << dendl; + read_pos = old_pos; + break; + } + seq++; // next event should follow. + } + + return 0; +out: + close(); + return err; +} + +void FileJournal::_close(int fd) const +{ + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + +void FileJournal::close() +{ + dout(1) << "close " << fn << dendl; + + // stop writer thread + stop_writer(); + + // close + ceph_assert(writeq_empty()); + ceph_assert(!must_write_header); + ceph_assert(fd >= 0); + _close(fd); + fd = -1; +} + + +int FileJournal::dump(ostream& out) +{ + return _dump(out, false); +} + +int FileJournal::simple_dump(ostream& out) +{ + return _dump(out, true); +} + +int FileJournal::_dump(ostream& out, bool simple) +{ + JSONFormatter f(true); + int ret = _fdump(f, simple); + f.flush(out); + return ret; +} + +int FileJournal::_fdump(Formatter &f, bool simple) +{ + dout(10) << "_fdump" << dendl; + + ceph_assert(fd == -1); + int err = _open(false, false); + if (err) + return err; + + err = read_header(&header); + if (err < 0) { + close(); + return err; + } + + off64_t next_pos = header.start; + + f.open_object_section("journal"); + + f.open_object_section("header"); + f.dump_unsigned("flags", header.flags); + ostringstream os; + os << header.fsid; + f.dump_string("fsid", os.str()); + f.dump_unsigned("block_size", header.block_size); + f.dump_unsigned("alignment", header.alignment); + f.dump_int("max_size", header.max_size); + f.dump_int("start", header.start); + f.dump_unsigned("committed_up_to", header.committed_up_to); + f.dump_unsigned("start_seq", header.start_seq); + f.close_section(); + + f.open_array_section("entries"); + uint64_t seq = header.start_seq; + while (1) { + bufferlist bl; + off64_t pos = 
// Spawn the journal writer thread (and, in aio mode, the aio completion
// thread).  Clears the stop flags first so the threads can be restarted
// after stop_writer().
void FileJournal::start_writer()
{
  write_stop = false;
  aio_stop = false;
  write_thread.create("journal_write");
#ifdef HAVE_LIBAIO
  // completion thread only exists when aio is in use
  if (aio)
    write_finish_thread.create("journal_wrt_fin");
#endif
}
+ commit_cond.Signal(); + } + write_thread.join(); + + // write journal header now so that we have less to replay on remount + write_header_sync(); + } + +#ifdef HAVE_LIBAIO + // stop aio completeion thread *after* writer thread has stopped + // and has submitted all of its io + if (aio && !aio_stop) { + aio_lock.Lock(); + aio_stop = true; + aio_cond.Signal(); + write_finish_cond.Signal(); + aio_lock.Unlock(); + write_finish_thread.join(); + } +#endif +} + + + +void FileJournal::print_header(const header_t &header) const +{ + dout(10) << "header: block_size " << header.block_size + << " alignment " << header.alignment + << " max_size " << header.max_size + << dendl; + dout(10) << "header: start " << header.start << dendl; + dout(10) << " write_pos " << write_pos << dendl; +} + +int FileJournal::read_header(header_t *hdr) const +{ + dout(10) << "read_header" << dendl; + bufferlist bl; + + buffer::ptr bp = buffer::create_small_page_aligned(block_size); + char* bpdata = bp.c_str(); + int r = ::pread(fd, bpdata, bp.length(), 0); + + if (r < 0) { + int err = errno; + dout(0) << "read_header got " << cpp_strerror(err) << dendl; + return -err; + } + + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + if (bp.length() != (size_t)r) { + // r will be always less or equal than bp.length + bpdata += r; + memset(bpdata, 0, bp.length() - r); + } + + bl.push_back(std::move(bp)); + + try { + auto p = bl.cbegin(); + decode(*hdr, p); + } + catch (buffer::error& e) { + derr << "read_header error decoding journal header" << dendl; + return -EINVAL; + } + + + /* + * Unfortunately we weren't initializing the flags field for new + * journals! Aie. This is safe(ish) now that we have only one + * flag. Probably around when we add the next flag we need to + * remove this or else this (eventually old) code will clobber newer + * code's flags. 
+ */ + if (hdr->flags > 3) { + derr << "read_header appears to have gibberish flags; assuming 0" << dendl; + hdr->flags = 0; + } + + print_header(*hdr); + + return 0; +} + +bufferptr FileJournal::prepare_header() +{ + bufferlist bl; + { + Mutex::Locker l(finisher_lock); + header.committed_up_to = journaled_seq; + } + encode(header, bl); + bufferptr bp = buffer::create_small_page_aligned(get_top()); + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + char* data = bp.c_str(); + memcpy(data, bl.c_str(), bl.length()); + data += bl.length(); + memset(data, 0, bp.length()-bl.length()); + return bp; +} + +void FileJournal::write_header_sync() +{ + Mutex::Locker locker(write_lock); + must_write_header = true; + bufferlist bl; + do_write(bl); + dout(20) << __func__ << " finish" << dendl; +} + +int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size) +{ + // already full? + if (full_state != FULL_NOTFULL) + return -ENOSPC; + + // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL. 
  // free space in the ring between pos and header.start (wrapping past
  // max_size back to get_top())
  off64_t room;
  if (pos >= header.start)
    room = (header.max_size - pos) + (header.start - get_top()) - 1;
  else
    room = header.start - pos - 1;
  dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
	   << " top " << get_top() << dendl;

  if (do_sync_cond) {
    if (room >= (header.max_size >> 1) &&
	room - size < (header.max_size >> 1)) {
      dout(10) << " passing half full mark, triggering commit" << dendl;
      do_sync_cond->SloppySignal();  // initiate a real commit so we can trim
    }
  }

  if (room >= size) {
    dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
    // wrapping past the end of the ring requires a fresh header write
    if (pos + size > header.max_size)
      must_write_header = true;
    return 0;
  }

  // full
  dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
	  << pos << " >= " << room
	  << " (max_size " << header.max_size << " start " << header.start << ")"
	  << dendl;

  off64_t max = header.max_size - get_top();
  if (size > max)
    dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;

  return -ENOSPC;
}

// Drain the write queue into a single contiguous buffer `bl`, bounded by
// journal_max_write_entries / journal_max_write_bytes.  On success returns
// 0 with orig_ops/orig_bytes counting the batched items; returns -ENOSPC
// only when not even the first item fits.
int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
{
  // gather queued writes
  off64_t queue_pos = write_pos;

  int eleft = cct->_conf->journal_max_write_entries;
  unsigned bmax = cct->_conf->journal_max_write_bytes;

  if (full_state != FULL_NOTFULL)
    return -ENOSPC;

  while (!writeq_empty()) {
    list<write_item> items;
    batch_pop_write(items);
    list<write_item>::iterator it = items.begin();
    while (it != items.end()) {
      uint64_t bytes = it->bl.length();
      int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
      if (r == 0) { // prepare ok, delete it
	items.erase(it++);
#ifdef HAVE_LIBAIO
	{
	  Mutex::Locker locker(aio_lock);
	  ceph_assert(aio_write_queue_ops > 0);
	  aio_write_queue_ops--;
	  ceph_assert(aio_write_queue_bytes >= bytes);
	  aio_write_queue_bytes -= bytes;
	}
#else
	(void)bytes;
#endif
      }
      if (r == -ENOSPC) {
	// the journal may be full; put the remaining items back on writeq
	batch_unpop_write(items);
	if (orig_ops)
	  goto out;         // commit what we have

	if (logger)
	  logger->inc(l_filestore_journal_full);

	if (wait_on_full) {
	  dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
	} else {
	  dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;

	  // throw out what we have so far
	  full_state = FULL_FULL;
	  while (!writeq_empty()) {
	    complete_write(1, peek_write().orig_len);
	    pop_write();
	  }
	  print_header(header);
	}

	return -ENOSPC;  // hrm, full on first op
      }
      if (eleft) {
	if (--eleft == 0) {
	  dout(20) << "prepare_multi_write hit max events per write "
		   << cct->_conf->journal_max_write_entries << dendl;
	  batch_unpop_write(items);
	  goto out;
	}
      }
      if (bmax) {
	if (bl.length() >= bmax) {
	  dout(20) << "prepare_multi_write hit max write size "
		   << cct->_conf->journal_max_write_bytes << dendl;
	  batch_unpop_write(items);
	  goto out;
	}
      }
    }
  }

out:
  dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
  // queue_pos either advanced linearly or wrapped exactly once
  ceph_assert((write_pos + bl.length() == queue_pos) ||
	      (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
  return 0;
}

/*
void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
{
  writing_seq.push_back(seq);
  if (!waiting_for_notfull.empty()) {
    // make sure previously unjournaled stuff waiting for UNFULL triggers
    // _before_ newly journaled stuff does
    dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
	     << " until after UNFULL" << dendl;
    C_Gather *g = new C_Gather(writeq.front().fin);
    writing_fin.push_back(g->new_sub());
    waiting_for_notfull.push_back(g->new_sub());
  } else {
    writing_fin.push_back(writeq.front().fin);
    dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
  }
}
*/

// Hand every completion with seq <= `seq` to the finisher, recording
// per-item journal latency.  Caller must hold finisher_lock.
void FileJournal::queue_completions_thru(uint64_t seq)
{
  ceph_assert(finisher_lock.is_locked());
  utime_t now = ceph_clock_now();
  list<completion_item> items;
  batch_pop_completions(items);
  list<completion_item>::iterator it = items.begin();
  while (it != items.end()) {
    completion_item& next = *it;
    if (next.seq > seq)
      break;
    utime_t lat = now;
    lat -= next.start;
    dout(10) << "queue_completions_thru seq " << seq
	     << " queueing seq " << next.seq
	     << " " << next.finish
	     << " lat " << lat << dendl;
    if (logger) {
      logger->tinc(l_filestore_journal_latency, lat);
    }
    if (next.finish)
      finisher->queue(next.finish);
    if (next.tracked_op) {
      next.tracked_op->mark_event("journaled_completion_queued");
      next.tracked_op->journal_trace.event("queued completion");
      next.tracked_op->journal_trace.keyval("completed through", seq);
    }
    items.erase(it++);
  }
  // put back any completions newer than seq
  batch_unpop_completions(items);
  finisher_cond.Signal();
}


// Stamp the entry's header/footer (seq, position magic, fsid magic) in
// place and append it to the batch buffer `bl`, advancing queue_pos with
// ring wrap-around.  Returns 0, or -ENOSPC if the entry does not fit.
int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
{
  uint64_t seq = next_write.seq;
  bufferlist &ebl = next_write.bl;
  off64_t size = ebl.length();

  int r = check_for_full(seq, queue_pos, size);
  if (r < 0)
    return r;   // ENOSPC or EAGAIN

  uint32_t orig_len = next_write.orig_len;
  orig_bytes += orig_len;
  orig_ops++;

  // add to write buffer
  dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
	   << " len " << orig_len << " -> " << size << dendl;

  unsigned seq_offset = offsetof(entry_header_t, seq);
  unsigned magic1_offset = offsetof(entry_header_t, magic1);
  unsigned magic2_offset = offsetof(entry_header_t, magic2);

  // patch the entry_header_t at the front of the entry buffer; magic1 is
  // the on-disk position, magic2 is derived from seq/len/fsid
  bufferptr headerptr = ebl.buffers().front();
  uint64_t _seq = seq;
  uint64_t _queue_pos = queue_pos;
  uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
  headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq);
  headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
  headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2);

  // same stamps in the trailing copy of the header (the footer)
  bufferptr footerptr = ebl.buffers().back();
  unsigned post_offset  = footerptr.length() - sizeof(entry_header_t);
  footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq);
  footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
  footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);

  bl.claim_append(ebl);
  if (next_write.tracked_op) {
    next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
    next_write.tracked_op->journal_trace.event("prepare_single_write");
  }

  journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
  writing_seq = seq;

  queue_pos += size;
  if (queue_pos >= header.max_size)
    queue_pos = queue_pos + get_top() - header.max_size;

  return 0;
}

// Abort if a direct-io write buffer is not properly aligned.
void FileJournal::check_align(off64_t pos, bufferlist& bl)
{
  // make sure list segments are page aligned
  if (directio && !bl.is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT)) {
    ceph_assert((bl.length() & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
    ceph_assert((pos & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
    ceph_abort_msg("bl was not aligned");
  }
}

// Write `bl` at `pos`, advancing pos (wrapping to get_top() when the end
// of the ring is reached exactly).  Returns 0 or a negative errno.
int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
{
  int ret;

  off64_t spos = ::lseek64(fd, pos, SEEK_SET);
  if (spos < 0) {
    ret = -errno;
    derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl;
    return ret;
  }
  ret = bl.write_fd(fd);
  if (ret) {
    derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  pos += bl.length();
  if (pos == header.max_size)
    pos = get_top();
  return 0;
}

// Synchronous write path: write the batched entries (and header if due)
// at write_pos, handling ring wrap-around, then flush the disk cache.
// Called with write_lock held; the lock is dropped around the actual I/O.
void FileJournal::do_write(bufferlist& bl)
{
  // nothing to do?
  if (bl.length() == 0 && !must_write_header)
    return;

  buffer::ptr hbp;
  // periodically refresh the on-disk header so replay starts closer to
  // the committed point
  if (cct->_conf->journal_write_header_frequency &&
      (((++journaled_since_start) %
	cct->_conf->journal_write_header_frequency) == 0)) {
    must_write_header = true;
  }

  if (must_write_header) {
    must_write_header = false;
    hbp = prepare_header();
  }

  dout(15) << "do_write writing " << write_pos << "~" << bl.length()
	   << (hbp.length() ? " + header":"")
	   << dendl;

  utime_t from = ceph_clock_now();

  // entry
  off64_t pos = write_pos;

  // Adjust write_pos
  write_pos += bl.length();
  if (write_pos >= header.max_size)
    write_pos = write_pos - header.max_size + get_top();

  // drop write_lock for the duration of the blocking I/O (re-taken below)
  write_lock.Unlock();

  // split?
  off64_t split = 0;
  if (pos + bl.length() > header.max_size) {
    bufferlist first, second;
    split = header.max_size - pos;
    first.substr_of(bl, 0, split);
    second.substr_of(bl, split, bl.length() - split);
    ceph_assert(first.length() + second.length() == bl.length());
    dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
	     << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;

    //Save pos to write first piece second
    off64_t first_pos = pos;
    off64_t orig_pos;
    pos = get_top();
    // header too?
    if (hbp.length()) {
      // be sneaky: include the header in the second fragment
      bufferlist tmp;
      tmp.push_back(hbp);
      tmp.claim_append(second);
      second.swap(tmp);
      pos = 0;          // we included the header
    }
    // Write the second portion first (possibly with the header), so
    // do_read_entry() won't even get a valid entry_header_t if there
    // is a crash between the two writes.
    orig_pos = pos;
    if (write_bl(pos, second)) {
      derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
	   << ") failed" << dendl;
      check_align(pos, second);
      ceph_abort();
    }
    orig_pos = first_pos;
    if (write_bl(first_pos, first)) {
      derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
	   << ") failed" << dendl;
      check_align(first_pos, first);
      ceph_abort();
    }
    ceph_assert(first_pos == get_top());
  } else {
    // header too?
    if (hbp.length()) {
      if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
	int err = errno;
	derr << "FileJournal::do_write: pwrite(fd=" << fd
	     << ", hbp.length=" << hbp.length() << ") failed :"
	     << cpp_strerror(err) << dendl;
	ceph_abort();
      }
    }

    if (write_bl(pos, bl)) {
      derr << "FileJournal::do_write: write_bl(pos=" << pos
	   << ") failed" << dendl;
      check_align(pos, bl);
      ceph_abort();
    }
  }

  if (!directio) {
    dout(20) << "do_write fsync" << dendl;

    /*
     * We'd really love to have a fsync_range or fdatasync_range and do a:
     *
     * if (split) {
     *    ::fsync_range(fd, header.max_size - split, split)l
     *    ::fsync_range(fd, get_top(), bl.length() - split);
     * else
     *    ::fsync_range(fd, write_pos, bl.length())
     *
     * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
     * too hard given all the underlying infrastructure already exist.
     *
     * NOTE: using sync_file_range here would not be safe as it does not
     * flush disk caches or commits any sort of metadata.
     */
    int ret = 0;
#if defined(__APPLE__) || defined(__FreeBSD__)
    ret = ::fsync(fd);
#else
    ret = ::fdatasync(fd);
#endif
    if (ret < 0) {
      derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
      ceph_abort();
    }
#ifdef HAVE_POSIX_FADVISE
    if (cct->_conf->filestore_fadvise)
      posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#endif
  }

  utime_t lat = ceph_clock_now() - from;
  dout(20) << "do_write latency " << lat << dendl;

  write_lock.Lock();

  ceph_assert(write_pos == pos);
  ceph_assert(write_pos % header.alignment == 0);

  {
    Mutex::Locker locker(finisher_lock);
    journaled_seq = writing_seq;

    // kick finisher?
    //  only if we haven't filled up recently!
    if (full_state != FULL_NOTFULL) {
      dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
	       << ", full_commit_seq|full_restart_seq" << dendl;
    } else {
      if (plug_journal_completions) {
	dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
		 << " due to completion plug" << dendl;
      } else {
	dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
	queue_completions_thru(journaled_seq);
      }
    }
  }
}

// Block until every queued completion has been handed to the finisher
// and the finisher itself has drained.
void FileJournal::flush()
{
  dout(10) << "waiting for completions to empty" << dendl;
  {
    Mutex::Locker l(finisher_lock);
    while (!completions_empty())
      finisher_cond.Wait(finisher_lock);
  }
  dout(10) << "flush waiting for finisher" << dendl;
  finisher->wait_for_empty();
  dout(10) << "flush done" << dendl;
}


// Writer thread main loop: sleep until work is queued, batch it via
// prepare_multi_write(), and push it to disk via do_write()/do_aio_write().
// Exits when write_stop is set and the queue is empty.
void FileJournal::write_thread_entry()
{
  dout(10) << "write_thread_entry start" << dendl;
  while (1) {
    {
      Mutex::Locker locker(writeq_lock);
      if (writeq.empty() && !must_write_header) {
	if (write_stop)
	  break;
	dout(20) << "write_thread_entry going to sleep" << dendl;
	writeq_cond.Wait(writeq_lock);
	dout(20) << "write_thread_entry woke up" << dendl;
	continue;
      }
    }

#ifdef HAVE_LIBAIO
    if (aio) {
      Mutex::Locker locker(aio_lock);
      // should we back off to limit aios in flight?  try to do this
      // adaptively so that we submit larger aios once we have lots of
      // them in flight.
      //
      // NOTE: our condition here is based on aio_num (protected by
      // aio_lock) and throttle_bytes (part of the write queue).  when
      // we sleep, we *only* wait for aio_num to change, and do not
      // wake when more data is queued.  this is not strictly correct,
      // but should be fine given that we will have plenty of aios in
      // flight if we hit this limit to ensure we keep the device
      // saturated.
      while (aio_num > 0) {
	int exp = std::min<int>(aio_num * 2, 24);
	long unsigned min_new = 1ull << exp;
	uint64_t cur = aio_write_queue_bytes;
	dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
		 << " ... exp " << exp << " min_new " << min_new
		 << " ... pending " << cur << dendl;
	if (cur >= min_new)
	  break;
	dout(20) << "write_thread_entry deferring until more aios complete: "
		 << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
		 << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
	aio_cond.Wait(aio_lock);
	dout(20) << "write_thread_entry woke up" << dendl;
      }
    }
#endif

    Mutex::Locker locker(write_lock);
    uint64_t orig_ops = 0;
    uint64_t orig_bytes = 0;

    bufferlist bl;
    int r = prepare_multi_write(bl, orig_ops, orig_bytes);
    // Don't care about journal full if stopping, so drop queue and
    // possibly let header get written and loop above to notice stop
    if (r == -ENOSPC) {
      if (write_stop) {
	dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
	while (!writeq_empty()) {
	  complete_write(1, peek_write().orig_len);
	  pop_write();
	}
	print_header(header);
	r = 0;
      } else {
	dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
	commit_cond.Wait(write_lock);
	dout(20) << "write_thread_entry woke up" << dendl;
	continue;
      }
    }
    ceph_assert(r ==
0);

    if (logger) {
      logger->inc(l_filestore_journal_wr);
      logger->inc(l_filestore_journal_wr_bytes, bl.length());
    }

#ifdef HAVE_LIBAIO
    if (aio)
      do_aio_write(bl);
    else
      do_write(bl);
#else
    do_write(bl);
#endif
    complete_write(orig_ops, orig_bytes);
  }

  dout(10) << "write_thread_entry finish" << dendl;
}

#ifdef HAVE_LIBAIO
// Asynchronous write path: like do_write(), but submits the batch (and
// header if due) via libaio.  The completion thread updates journaled_seq
// once the aios land.  Called with write_lock held.
void FileJournal::do_aio_write(bufferlist& bl)
{

  // periodically refresh the on-disk header (same policy as do_write)
  if (cct->_conf->journal_write_header_frequency &&
      (((++journaled_since_start) %
	cct->_conf->journal_write_header_frequency) == 0)) {
    must_write_header = true;
  }

  // nothing to do?
  if (bl.length() == 0 && !must_write_header)
    return;

  buffer::ptr hbp;
  if (must_write_header) {
    must_write_header = false;
    hbp = prepare_header();
  }

  // entry
  off64_t pos = write_pos;

  dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
	   << (hbp.length() ? " + header":"")
	   << dendl;

  // split?
  off64_t split = 0;
  if (pos + bl.length() > header.max_size) {
    // batch wraps around the end of the ring: submit it as two pieces
    bufferlist first, second;
    split = header.max_size - pos;
    first.substr_of(bl, 0, split);
    second.substr_of(bl, split, bl.length() - split);
    ceph_assert(first.length() + second.length() == bl.length());
    dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;

    if (write_aio_bl(pos, first, 0)) {
      derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
	   << ") failed" << dendl;
      ceph_abort();
    }
    ceph_assert(pos == header.max_size);
    if (hbp.length()) {
      // be sneaky: include the header in the second fragment
      bufferlist tmp;
      tmp.push_back(hbp);
      tmp.claim_append(second);
      second.swap(tmp);
      pos = 0;          // we included the header
    } else
      pos = get_top();  // no header, start after that
    // only the last aio carries writing_seq, so journaled_seq advances
    // only when the whole batch is on disk
    if (write_aio_bl(pos, second, writing_seq)) {
      derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
	   << ") failed" << dendl;
      ceph_abort();
    }
  } else {
    // header too?
    if (hbp.length()) {
      bufferlist hbl;
      hbl.push_back(hbp);
      loff_t pos = 0;
      if (write_aio_bl(pos, hbl, 0)) {
	derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
	ceph_abort();
      }
    }

    if (write_aio_bl(pos, bl, writing_seq)) {
      derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
	   << ") failed" << dendl;
      ceph_abort();
    }
  }

  write_pos = pos;
  if (write_pos == header.max_size)
    write_pos = get_top();
  ceph_assert(write_pos % header.alignment == 0);
}

/**
 * write a buffer using aio
 *
 * @param seq seq to trigger when this aio completes.  if 0, do not update any state
 * on completion.
 */
int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
{
  dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;

  while (bl.length() > 0) {
    // build an iovec covering up to IOV_MAX-1 buffer segments
    int max = std::min<int>(bl.get_num_buffers(), IOV_MAX-1);
    iovec *iov = new iovec[max];
    int n = 0;
    unsigned len = 0;
    for (auto p = std::cbegin(bl.buffers()); n < max; ++p, ++n) {
      ceph_assert(p != std::cend(bl.buffers()));
      iov[n].iov_base = const_cast<void*>(static_cast<const void*>(p->c_str()));
      iov[n].iov_len = p->length();
      len += p->length();
    }

    bufferlist tbl;
    bl.splice(0, len, &tbl);  // move bytes from bl -> tbl

    // lock only aio_queue, current aio, aio_num, aio_bytes, which may be
    // modified in check_aio_completion
    aio_lock.Lock();
    // attach seq only to the final chunk of this buffer
    aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
    aio_info& aio = aio_queue.back();
    aio.iov = iov;

    io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);

    dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
	     << " in " << n << dendl;

    aio_num++;
    aio_bytes += aio.len;

    // need to save current aio len to update write_pos later because current
    // aio could be erased from aio_queue once it is done
    uint64_t cur_len = aio.len;
    // unlock aio_lock because following io_submit might take time to return
    aio_lock.Unlock();

    iocb *piocb = &aio.iocb;

    // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
    int attempts = 16;
    int delay = 125;
    do {
      int r = io_submit(aio_ctx, 1, &piocb);
      dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
      if (r < 0) {
	derr << "io_submit to " << aio.off << "~" << cur_len
	     << " got " << cpp_strerror(r) << dendl;
	// EAGAIN: kernel aio queue full; back off exponentially and retry
	if (r == -EAGAIN && attempts-- > 0) {
	  usleep(delay);
	  delay *= 2;
	  continue;
	}
	check_align(pos, tbl);
	ceph_abort_msg("io_submit got unexpected error");
      } else {
	break;
      }
    } while (true);
    pos += cur_len;
  }
  aio_lock.Lock();
  write_finish_cond.Signal();
  aio_lock.Unlock();
  return 0;
}
#endif

// Aio completion thread: reap finished aios with io_getevents, mark them
// done, and fold them into journal state via check_aio_completion().
void FileJournal::write_finish_thread_entry()
{
#ifdef HAVE_LIBAIO
  dout(10) << __func__ << " enter" << dendl;
  while (true) {
    {
      Mutex::Locker locker(aio_lock);
      if (aio_queue.empty()) {
	if (aio_stop)
	  break;
	dout(20) << __func__ << " sleeping" << dendl;
	write_finish_cond.Wait(aio_lock);
	continue;
      }
    }

    dout(20) << __func__ << " waiting for aio(s)" << dendl;
    io_event event[16];
    int r = io_getevents(aio_ctx, 1, 16, event, NULL);
    if (r < 0) {
      if (r == -EINTR) {
	dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
	continue;
      }
      derr << "io_getevents got " << cpp_strerror(r) << dendl;
      if (r == -EIO) {
	note_io_error_event(devname.c_str(), fn.c_str(), -EIO, 0, 0, 0);
      }
      ceph_abort_msg("got unexpected error from io_getevents");
    }

    {
      Mutex::Locker locker(aio_lock);
      for (int i=0; i<r; i++) {
	aio_info *ai = (aio_info *)event[i].obj;
	// short or failed write is fatal for the journal
	if (event[i].res != ai->len) {
	  derr << "aio to " << ai->off << "~" << ai->len
	       << " returned: " << (int)event[i].res << dendl;
	  ceph_abort_msg("unexpected aio error");
	}
	dout(10) << __func__ << " aio " << ai->off
		 << "~" << ai->len << " done" << dendl;
	ai->done = true;
      }
      check_aio_completion();
    }
  }
  dout(10) << __func__ << " exit" << dendl;
#endif
}

#ifdef HAVE_LIBAIO
/**
 * check aio_wait for completed aio, and update state appropriately.
 */
void FileJournal::check_aio_completion()
{
  ceph_assert(aio_lock.is_locked());
  dout(20) << "check_aio_completion" << dendl;

  bool completed_something = false, signal = false;
  uint64_t new_journaled_seq = 0;

  // retire aios strictly in submission order; stop at the first one that
  // hasn't completed yet so journaled_seq never runs ahead of the disk
  list<aio_info>::iterator p = aio_queue.begin();
  while (p != aio_queue.end() && p->done) {
    dout(20) << "check_aio_completion completed seq " << p->seq << " "
	     << p->off << "~" << p->len << dendl;
    if (p->seq) {
      new_journaled_seq = p->seq;
      completed_something = true;
    }
    aio_num--;
    aio_bytes -= p->len;
    aio_queue.erase(p++);
    signal = true;
  }

  if (completed_something) {
    // kick finisher?
    //  only if we haven't filled up recently!
    Mutex::Locker locker(finisher_lock);
    journaled_seq = new_journaled_seq;
    if (full_state != FULL_NOTFULL) {
      dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
	       << ", full_commit_seq|full_restart_seq" << dendl;
    } else {
      if (plug_journal_completions) {
	dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
		 << " due to completion plug" << dendl;
      } else {
	dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
	queue_completions_thru(journaled_seq);
      }
    }
  }
  if (signal) {
    // maybe write queue was waiting for aio count to drop?
    aio_cond.Signal();
  }
}
#endif

// Serialize a vector of transactions into an on-disk journal entry:
// entry_header_t + pre-pad (for data alignment) + payload + post-pad (to
// header.alignment) + a trailing copy of the header.  The result replaces
// *tbl; returns the payload length.
int FileJournal::prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) {
  dout(10) << "prepare_entry " << tls << dendl;
  int data_len = cct->_conf->journal_align_min_size - 1;
  int data_align = -1; // -1 indicates that we don't care about the alignment
  bufferlist bl;
  for (vector<ObjectStore::Transaction>::iterator p = tls.begin();
       p != tls.end(); ++p) {
    // align to the largest data blob above the configured minimum
    if ((int)(*p).get_data_length() > data_len) {
      data_len = (*p).get_data_length();
      data_align = ((*p).get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
    }
    encode(*p, bl);
  }
  if (tbl->length()) {
    bl.claim_append(*tbl);
  }
  // build the entry header for this payload
  entry_header_t h;
  unsigned head_size = sizeof(entry_header_t);
  off64_t base_size = 2*head_size + bl.length();
  memset(&h, 0, sizeof(h));
  if (data_align >= 0)
    h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
  off64_t size = round_up_to(base_size + h.pre_pad, header.alignment);
  unsigned post_pad = size - base_size - h.pre_pad;
  h.len = bl.length();
  h.post_pad = post_pad;
  h.crc32c = bl.crc32c(0);
  dout(10) << " len " << bl.length() << " -> " << size
	   << " (head " << head_size << " pre_pad " << h.pre_pad
	   << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
	   << " (bl alignment " << data_align << ")"
	   << dendl;
  bufferlist ebl;
  // header
  ebl.append((const char*)&h, sizeof(h));
  if (h.pre_pad) {
    ebl.push_back(buffer::create_static(h.pre_pad, zero_buf));
  }
  // payload
  ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
  if (h.post_pad) {
    ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
  }
  // footer
  ebl.append((const char*)&h, sizeof(h));
  if (directio)
    ebl.rebuild_aligned(CEPH_DIRECTIO_ALIGNMENT);
  tbl->claim(ebl);
  return h.len;
}

// Queue a prepared journal entry for the writer thread.  Registers the
// completion (fired once seq is durable), accounts throttle/perf counters,
// and wakes the writer if it was idle.
void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
			       Context *oncommit,
			       TrackedOpRef osd_op)
{
  // dump on queue
  dout(5) << "submit_entry seq " << seq
	  << " len " << e.length()
	  << " (" << oncommit << ")" << dendl;
  ceph_assert(e.length() > 0);
  ceph_assert(e.length() < header.max_size);

  if (logger) {
    logger->inc(l_filestore_journal_queue_bytes, orig_len);
    logger->inc(l_filestore_journal_queue_ops, 1);
  }

  throttle.register_throttle_seq(seq, e.length());
  if (logger) {
    logger->inc(l_filestore_journal_ops, 1);
    logger->inc(l_filestore_journal_bytes, e.length());
  }

  if (osd_op) {
    osd_op->mark_event("commit_queued_for_journal_write");
    if (osd_op->store_trace) {
      osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace);
      osd_op->journal_trace.event("submit_entry");
      osd_op->journal_trace.keyval("seq", seq);
    }
  }
  {
    // NOTE: lock order here (writeq, aio, completions) must be respected
    // by any other path taking more than one of these locks
    Mutex::Locker l1(writeq_lock);
#ifdef HAVE_LIBAIO
    Mutex::Locker l2(aio_lock);
#endif
    Mutex::Locker l3(completions_lock);

#ifdef HAVE_LIBAIO
    aio_write_queue_ops++;
    aio_write_queue_bytes += e.length();
    aio_cond.Signal();
#endif

    completions.push_back(
      completion_item(
	seq, oncommit, ceph_clock_now(), osd_op));
    if (writeq.empty())
      writeq_cond.Signal();
    writeq.push_back(write_item(seq, e, orig_len, osd_op));
    if (osd_op)
      osd_op->journal_trace.keyval("queue depth", writeq.size());
  }
}

bool FileJournal::writeq_empty()
{
  Mutex::Locker locker(writeq_lock);
  return writeq.empty();
}

// Peek at the oldest queued write.  Caller must hold write_lock.
FileJournal::write_item &FileJournal::peek_write()
{
  ceph_assert(write_lock.is_locked());
  Mutex::Locker locker(writeq_lock);
  return writeq.front();
}

// Drop the oldest queued write and release its queue accounting.
void FileJournal::pop_write()
{
  ceph_assert(write_lock.is_locked());
  Mutex::Locker locker(writeq_lock);
  if (logger) {
    logger->dec(l_filestore_journal_queue_bytes, writeq.front().orig_len);
    logger->dec(l_filestore_journal_queue_ops, 1);
  }
  writeq.pop_front();
}

// Move the entire write queue into `items` (and release its accounting);
// batch_unpop_write() is the inverse.
void FileJournal::batch_pop_write(list<write_item> &items)
{
  ceph_assert(write_lock.is_locked());
  {
    Mutex::Locker locker(writeq_lock);
    writeq.swap(items);
  }
  for (auto &&i : items) {
    if (logger) {
      logger->dec(l_filestore_journal_queue_bytes, i.orig_len);
      logger->dec(l_filestore_journal_queue_ops, 1);
    }
  }
}

// Put unprocessed items back at the front of the write queue, restoring
// their queue accounting.
void FileJournal::batch_unpop_write(list<write_item> &items)
{
  ceph_assert(write_lock.is_locked());
  for (auto &&i : items) {
    if (logger) {
      logger->inc(l_filestore_journal_queue_bytes, i.orig_len);
      logger->inc(l_filestore_journal_queue_ops, 1);
    }
  }
  Mutex::Locker locker(writeq_lock);
  writeq.splice(writeq.begin(), items);
}

// A filestore commit is starting: advance the journal-full state machine
// (FULL_FULL -> FULL_WAIT once the commit covers everything journaled,
// FULL_WAIT -> FULL_NOTFULL with the completion plug set).
void FileJournal::commit_start(uint64_t seq)
{
  dout(10) << "commit_start" << dendl;

  // was full?
  switch (full_state) {
  case FULL_NOTFULL:
    break; // all good

  case FULL_FULL:
    if (seq >= journaled_seq) {
      dout(1) << " FULL_FULL -> FULL_WAIT.  commit_start on seq "
	      << seq << " > journaled_seq " << journaled_seq
	      << ", moving to FULL_WAIT."
	      << dendl;
      full_state = FULL_WAIT;
    } else {
      dout(1) << "FULL_FULL commit_start on seq "
	      << seq << " < journaled_seq " << journaled_seq
	      << ", remaining in FULL_FULL"
	      << dendl;
    }
    break;

  case FULL_WAIT:
    dout(1) << " FULL_WAIT -> FULL_NOTFULL.  journal now active, setting completion plug."
	    << dendl;
    full_state = FULL_NOTFULL;
    plug_journal_completions = true;
    break;
  }
}

/*
 * send discard command to journal block device
 */
void FileJournal::do_discard(int64_t offset, int64_t end)
{
  dout(10) << __func__ << " trim(" << offset << ", " << end << dendl;

  // shrink [offset, end) inward to block_size boundaries; skip if nothing
  // whole remains to discard
  offset = round_up_to(offset, block_size);
  if (offset >= end)
    return;
  end = round_up_to(end - block_size, block_size);
  ceph_assert(end >= offset);
  if (offset < end) {
    BlkDev blkdev(fd);
    if (blkdev.discard(offset, end - offset) < 0) {
      // best-effort: log and continue if the device rejects the discard
      dout(1) << __func__ << "ioctl(BLKDISCARD) error:" << cpp_strerror(errno) << dendl;
    }
  }
}

// The filestore has committed through `seq`: release throttle budget,
// fire completions, advance header.start past consumed entries, discard
// the freed region, and wake a writer blocked on journal-full.
void FileJournal::committed_thru(uint64_t seq)
{
  Mutex::Locker locker(write_lock);

  auto released = throttle.flush(seq);
  if (logger) {
    logger->dec(l_filestore_journal_ops, released.first);
    logger->dec(l_filestore_journal_bytes, released.second);
  }

  if (seq < last_committed_seq) {
    dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
    ceph_assert(seq >= last_committed_seq);
    return;
  }
  if (seq == last_committed_seq) {
    dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
    return;
  }

  dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
  last_committed_seq = seq;

  // completions!
  {
    Mutex::Locker locker(finisher_lock);
    queue_completions_thru(seq);
    if (plug_journal_completions && seq >= header.start_seq) {
      dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
      plug_journal_completions = false;
      queue_completions_thru(journaled_seq);
    }
  }

  // adjust start pointer
  while (!journalq.empty() && journalq.front().first <= seq) {
    journalq.pop_front();
  }

  int64_t old_start = header.start;
  if (!journalq.empty()) {
    header.start = journalq.front().second;
    header.start_seq = journalq.front().first;
  } else {
    // nothing outstanding; the journal restarts at the write position
    header.start = write_pos;
    header.start_seq = seq + 1;
  }

  if (discard) {
    dout(10) << __func__  << " will trim (" << old_start << ", " << header.start << ")" << dendl;
    if (old_start < header.start)
      do_discard(old_start, header.start - 1);
    else {
      // freed region wraps around the end of the ring
      do_discard(old_start, header.max_size - 1);
      do_discard(get_top(), header.start - 1);
    }
  }

  must_write_header = true;
  print_header(header);

  // committed but unjournaled items
  while (!writeq_empty() && peek_write().seq <= seq) {
    dout(15) << " dropping committed but unwritten seq " << peek_write().seq
	     << " len " << peek_write().bl.length()
	     << dendl;
    complete_write(1, peek_write().orig_len);
    pop_write();
  }

  // wake a writer blocked in write_thread_entry waiting for space
  commit_cond.Signal();

  dout(10) << "committed_thru done" << dendl;
}


// Log that a batch of writes finished (counters only).
void FileJournal::complete_write(uint64_t ops, uint64_t bytes)
{
  dout(5) << __func__ << " finished " << ops << " ops and "
	  << bytes << " bytes" << dendl;
}

// Transition the journal from replay to writeable: position write_pos
// after the last replayed entry (or at the top of an empty journal) and
// start the writer thread(s).
int FileJournal::make_writeable()
{
  dout(10) << __func__ << dendl;
  int r = set_throttle_params();
  if (r < 0)
    return r;

  r = _open(true);
  if (r < 0)
    return r;

  if (read_pos > 0)
    write_pos = read_pos;
  else
    write_pos = get_top();
  read_pos = 0;

  must_write_header = true;

  start_writer();
  return 0;
}

// Push the journal_throttle_* / filestore_expected_throughput_bytes config
// values into the throttle; returns 0 or -EINVAL on invalid parameters.
int FileJournal::set_throttle_params()
{
  stringstream ss;
  bool valid = throttle.set_params(
    cct->_conf->journal_throttle_low_threshhold,
    cct->_conf->journal_throttle_high_threshhold,
    cct->_conf->filestore_expected_throughput_bytes,
    cct->_conf->journal_throttle_high_multiple,
    cct->_conf->journal_throttle_max_multiple,
    header.max_size - get_top(),
    &ss);

  if (!valid) {
    derr << "tried to set invalid params: "
	 << ss.str()
	 << dendl;
  }
  return valid ? 0 : -EINVAL;
}

// Config observer hook: the keys whose changes this class wants to see.
const char** FileJournal::get_tracked_conf_keys() const
{
  static const char *KEYS[] = {
    "journal_throttle_low_threshhold",
    "journal_throttle_high_threshhold",
    "journal_throttle_high_multiple",
    "journal_throttle_max_multiple",
    "filestore_expected_throughput_bytes",
    NULL};
  return KEYS;
}

// Read `olen` bytes starting at ring offset `pos`, transparently wrapping
// past header.max_size back to get_top().  Appends the data to *bl and
// reports the post-read (wrapped) offset via *out_pos.  Aborts on a short
// or failed read.
void FileJournal::wrap_read_bl(
  off64_t pos,
  int64_t olen,
  bufferlist* bl,
  off64_t *out_pos
  ) const
{
  while (olen > 0) {
    while (pos >= header.max_size)
      pos = pos + get_top() - header.max_size;

    int64_t len;
    if (pos + olen > header.max_size)
      len = header.max_size - pos;        // partial
    else
      len = olen;                         // rest

    int64_t actual = ::lseek64(fd, pos, SEEK_SET);
    ceph_assert(actual == pos);

    bufferptr bp = buffer::create(len);
    int r = safe_read_exact(fd, bp.c_str(), len);
    if (r) {
      derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned "
	   << cpp_strerror(r) << dendl;
      ceph_abort();
    }
    bl->push_back(std::move(bp));
    pos += len;
    olen -= len;
  }
  if (pos >= header.max_size)
    pos = pos + get_top() - header.max_size;
  if (out_pos)
    *out_pos = pos;
}

// Replay-time read of the next journal entry at read_pos.  On success
// fills `bl`, advances read_pos, updates next_seq and throttle accounting,
// and returns true.  Returns false at the end of the journal; sets
// *corrupt (when journal_ignore_corruption is set) or aborts if the header
// claims more committed data than we can read.
bool FileJournal::read_entry(
  bufferlist &bl,
  uint64_t &next_seq,
  bool *corrupt)
{
  if (corrupt)
    *corrupt = false;
  uint64_t seq = next_seq;

  if (!read_pos) {
    dout(2) << "read_entry -- not readable" << dendl;
    return false;
  }

  off64_t pos = read_pos;
  off64_t next_pos = pos;
  stringstream ss;
  read_entry_result result = do_read_entry(
    pos,
    &next_pos,
    &bl,
    &seq,
    &ss);
  if (result == SUCCESS) {
    journalq.push_back(
      pair<uint64_t,off64_t>(seq, pos));
    // bytes consumed from the ring, accounting for wrap-around
    uint64_t amount_to_take =
      next_pos > pos ?
      next_pos - pos :
      (header.max_size - pos) + (next_pos - get_top());
    throttle.take(amount_to_take);
    throttle.register_throttle_seq(next_seq, amount_to_take);
    if (logger) {
      logger->inc(l_filestore_journal_ops, 1);
      logger->inc(l_filestore_journal_bytes, amount_to_take);
    }
    if (next_seq > seq) {
      // entry is older than what we're looking for; stop replay
      return false;
    } else {
      read_pos = next_pos;
      next_seq = seq;
      if (seq > journaled_seq)
	journaled_seq = seq;
      return true;
    }
  } else {
    derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
  }

  if (seq && seq < header.committed_up_to) {
    derr << "Unable to read past sequence " << seq
	 << " but header indicates the journal has committed up through "
	 << header.committed_up_to << ", journal is corrupt" << dendl;
    if (cct->_conf->journal_ignore_corruption) {
      if (corrupt)
	*corrupt = true;
      return false;
    } else {
      ceph_abort();
    }
  }

  dout(2) << "No further valid entries found, journal is most likely valid"
	  << dendl;
  return false;
}

// Read and validate one entry at init_pos: check the header magic, then
// read pre-pad + payload + post-pad.
// NOTE(review): this function continues past the end of this hunk; only
// the visible portion is documented.
FileJournal::read_entry_result FileJournal::do_read_entry(
  off64_t init_pos,
  off64_t *next_pos,
  bufferlist *bl,
  uint64_t *seq,
  ostream *ss,
  entry_header_t *_h) const
{
  off64_t cur_pos = init_pos;
  bufferlist _bl;
  if (!bl)
    bl = &_bl;

  // header
  entry_header_t *h;
  bufferlist hbl;
  off64_t _next_pos;
  wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos);
  h = reinterpret_cast<entry_header_t *>(hbl.c_str());

  if (!h->check_magic(cur_pos, header.get_fsid64())) {
    dout(25) << "read_entry " << init_pos
	     << " : bad header magic, end of journal" << dendl;
    if (ss)
      *ss << "bad header magic";
    if (next_pos)
      *next_pos = init_pos + (4<<10); // check 4k ahead
    return MAYBE_CORRUPT;
  }
  cur_pos = _next_pos;

  // pad + body + pad
  if (h->pre_pad)
    cur_pos += h->pre_pad;

  bl->clear();
  wrap_read_bl(cur_pos, h->len, bl, &cur_pos);

  if (h->post_pad)
    cur_pos +=
h->post_pad; + + // footer + entry_header_t *f; + bufferlist fbl; + wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos); + f = reinterpret_cast<entry_header_t *>(fbl.c_str()); + if (memcmp(f, h, sizeof(*f))) { + if (ss) + *ss << "bad footer magic, partial entry"; + if (next_pos) + *next_pos = cur_pos; + return MAYBE_CORRUPT; + } + + if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal) + h->crc32c != 0) { // newer entry in old journal + uint32_t actual_crc = bl->crc32c(0); + if (actual_crc != h->crc32c) { + if (ss) + *ss << "header crc (" << h->crc32c + << ") doesn't match body crc (" << actual_crc << ")"; + if (next_pos) + *next_pos = cur_pos; + return MAYBE_CORRUPT; + } + } + + // yay! + dout(2) << "read_entry " << init_pos << " : seq " << h->seq + << " " << h->len << " bytes" + << dendl; + + // ok! + if (seq) + *seq = h->seq; + + + if (next_pos) + *next_pos = cur_pos; + + if (_h) + *_h = *h; + + ceph_assert(cur_pos % header.alignment == 0); + return SUCCESS; +} + +void FileJournal::reserve_throttle_and_backoff(uint64_t count) +{ + throttle.get(count); +} + +void FileJournal::get_header( + uint64_t wanted_seq, + off64_t *_pos, + entry_header_t *h) +{ + off64_t pos = header.start; + off64_t next_pos = pos; + bufferlist bl; + uint64_t seq = 0; + dout(2) << __func__ << dendl; + while (1) { + bl.clear(); + pos = next_pos; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + 0, + h); + if (result == FAILURE || result == MAYBE_CORRUPT) + ceph_abort(); + if (seq == wanted_seq) { + if (_pos) + *_pos = pos; + return; + } + } + ceph_abort(); // not reachable +} + +void FileJournal::corrupt( + int wfd, + off64_t corrupt_at) +{ + dout(2) << __func__ << dendl; + if (corrupt_at >= header.max_size) + corrupt_at = corrupt_at + get_top() - header.max_size; + + int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET); + ceph_assert(actual == corrupt_at); + + char buf[10]; + int r = safe_read_exact(fd, buf, 1); + ceph_assert(r == 
0); + + actual = ::lseek64(wfd, corrupt_at, SEEK_SET); + ceph_assert(actual == corrupt_at); + + buf[0]++; + r = safe_write(wfd, buf, 1); + ceph_assert(r == 0); +} + +void FileJournal::corrupt_payload( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + sizeof(entry_header_t) + h.pre_pad; + corrupt(wfd, corrupt_at); +} + + +void FileJournal::corrupt_footer_magic( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + sizeof(entry_header_t) + h.pre_pad + + h.len + h.post_pad + + (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h)); + corrupt(wfd, corrupt_at); +} + + +void FileJournal::corrupt_header_magic( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + + (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h)); + corrupt(wfd, corrupt_at); +} + +off64_t FileJournal::get_journal_size_estimate() +{ + off64_t size, start = header.start; + if (write_pos < start) { + size = (max_size - start) + write_pos; + } else { + size = write_pos - start; + } + dout(20) << __func__ << " journal size=" << size << dendl; + return size; +} + +void FileJournal::get_devices(set<string> *ls) +{ + string dev_node; + BlkDev blkdev(fd); + if (int rc = blkdev.wholedisk(&dev_node); rc) { + return; + } + get_raw_devices(dev_node, ls); +} + +void FileJournal::collect_metadata(map<string,string> *pm) +{ + BlkDev blkdev(fd); + char partition_path[PATH_MAX]; + char dev_node[PATH_MAX]; + if (blkdev.partition(partition_path, PATH_MAX)) { + (*pm)["backend_filestore_journal_partition_path"] = "unknown"; + } else { + (*pm)["backend_filestore_journal_partition_path"] = string(partition_path); + } + if (blkdev.wholedisk(dev_node, PATH_MAX)) { + 
(*pm)["backend_filestore_journal_dev_node"] = "unknown"; + } else { + (*pm)["backend_filestore_journal_dev_node"] = string(dev_node); + devname = dev_node; + } +} diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h new file mode 100644 index 00000000..2313b4b8 --- /dev/null +++ b/src/os/filestore/FileJournal.h @@ -0,0 +1,556 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILEJOURNAL_H +#define CEPH_FILEJOURNAL_H + +#include <stdlib.h> +#include <deque> +using std::deque; + +#include "Journal.h" +#include "common/config_fwd.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Thread.h" +#include "common/Throttle.h" +#include "JournalThrottle.h" +#include "common/zipkin_trace.h" + +#ifdef HAVE_LIBAIO +# include <libaio.h> +#endif + +// re-include our assert to clobber the system one; fix dout: +#include "include/ceph_assert.h" + +/** + * Implements journaling on top of block device or file. 
+ * + * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock) + */ +class FileJournal : + public Journal, + public md_config_obs_t { +public: + /// Protected by finisher_lock + struct completion_item { + uint64_t seq; + Context *finish; + utime_t start; + TrackedOpRef tracked_op; + completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref) + : seq(o), finish(c), start(s), tracked_op(opref) {} + completion_item() : seq(0), finish(0), start(0) {} + }; + struct write_item { + uint64_t seq; + bufferlist bl; + uint32_t orig_len; + TrackedOpRef tracked_op; + ZTracer::Trace trace; + write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) : + seq(s), orig_len(ol), tracked_op(opref) { + bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy + } + write_item() : seq(0), orig_len(0) {} + }; + + Mutex finisher_lock; + Cond finisher_cond; + uint64_t journaled_seq; + bool plug_journal_completions; + + Mutex writeq_lock; + Cond writeq_cond; + list<write_item> writeq; + bool writeq_empty(); + write_item &peek_write(); + void pop_write(); + void batch_pop_write(list<write_item> &items); + void batch_unpop_write(list<write_item> &items); + + Mutex completions_lock; + list<completion_item> completions; + bool completions_empty() { + Mutex::Locker l(completions_lock); + return completions.empty(); + } + void batch_pop_completions(list<completion_item> &items) { + Mutex::Locker l(completions_lock); + completions.swap(items); + } + void batch_unpop_completions(list<completion_item> &items) { + Mutex::Locker l(completions_lock); + completions.splice(completions.begin(), items); + } + completion_item completion_peek_front() { + Mutex::Locker l(completions_lock); + ceph_assert(!completions.empty()); + return completions.front(); + } + void completion_pop_front() { + Mutex::Locker l(completions_lock); + ceph_assert(!completions.empty()); + completions.pop_front(); + } + + int prepare_entry(vector<ObjectStore::Transaction>& 
tls, bufferlist* tbl) override; + + void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len, + Context *oncommit, + TrackedOpRef osd_op = TrackedOpRef()) override; + /// End protected by finisher_lock + + /* + * journal header + */ + struct header_t { + enum { + FLAG_CRC = (1<<0), + // NOTE: remove kludgey weirdness in read_header() next time a flag is added. + }; + + uint64_t flags; + uuid_d fsid; + __u32 block_size; + __u32 alignment; + int64_t max_size; // max size of journal ring buffer + int64_t start; // offset of first entry + uint64_t committed_up_to; // committed up to + + /** + * start_seq + * + * entry at header.start has sequence >= start_seq + * + * Generally, the entry at header.start will have sequence + * start_seq if it exists. The only exception is immediately + * after journal creation since the first sequence number is + * not known. + * + * If the first read on open fails, we can assume corruption + * if start_seq > committed_up_to because the entry would have + * a sequence >= start_seq and therefore > committed_up_to. + */ + uint64_t start_seq; + + header_t() : + flags(0), block_size(0), alignment(0), max_size(0), start(0), + committed_up_to(0), start_seq(0) {} + + void clear() { + start = block_size; + } + + uint64_t get_fsid64() const { + return *(uint64_t*)fsid.bytes(); + } + + void encode(bufferlist& bl) const { + using ceph::encode; + __u32 v = 4; + encode(v, bl); + bufferlist em; + { + encode(flags, em); + encode(fsid, em); + encode(block_size, em); + encode(alignment, em); + encode(max_size, em); + encode(start, em); + encode(committed_up_to, em); + encode(start_seq, em); + } + encode(em, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + __u32 v; + decode(v, bl); + if (v < 2) { // normally 0, but conceivably 1 + // decode old header_t struct (pre v0.40). 
+ bl.advance(4u); // skip __u32 flags (it was unused by any old code) + flags = 0; + uint64_t tfsid; + decode(tfsid, bl); + *(uint64_t*)&fsid.bytes()[0] = tfsid; + *(uint64_t*)&fsid.bytes()[8] = tfsid; + decode(block_size, bl); + decode(alignment, bl); + decode(max_size, bl); + decode(start, bl); + committed_up_to = 0; + start_seq = 0; + return; + } + bufferlist em; + decode(em, bl); + auto t = em.cbegin(); + decode(flags, t); + decode(fsid, t); + decode(block_size, t); + decode(alignment, t); + decode(max_size, t); + decode(start, t); + + if (v > 2) + decode(committed_up_to, t); + else + committed_up_to = 0; + + if (v > 3) + decode(start_seq, t); + else + start_seq = 0; + } + } header; + + struct entry_header_t { + uint64_t seq; // fs op seq # + uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer. + uint32_t len; + uint32_t pre_pad, post_pad; + uint64_t magic1; + uint64_t magic2; + + static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) { + return (fsid ^ seq ^ len); + } + bool check_magic(off64_t pos, uint64_t fsid) { + return + magic1 == (uint64_t)pos && + magic2 == (fsid ^ seq ^ len); + } + } __attribute__((__packed__, aligned(4))); + + bool journalq_empty() { return journalq.empty(); } + +private: + string fn; + + char *zero_buf; + off64_t max_size; + size_t block_size; + bool directio, aio, force_aio; + bool must_write_header; + off64_t write_pos; // byte where the next entry to be written will go + off64_t read_pos; // + bool discard; //for block journal whether support discard + +#ifdef HAVE_LIBAIO + /// state associated with an in-flight aio request + /// Protected by aio_lock + struct aio_info { + struct iocb iocb {}; + bufferlist bl; + struct iovec *iov; + bool done; + uint64_t off, len; ///< these are for debug only + uint64_t seq; ///< seq number to complete on aio completion, if non-zero + + aio_info(bufferlist& b, uint64_t o, uint64_t s) + : iov(NULL), done(false), off(o), len(b.length()), seq(s) { + 
bl.claim(b); + } + ~aio_info() { + delete[] iov; + } + }; + Mutex aio_lock; + Cond aio_cond; + Cond write_finish_cond; + io_context_t aio_ctx; + list<aio_info> aio_queue; + int aio_num, aio_bytes; + uint64_t aio_write_queue_ops; + uint64_t aio_write_queue_bytes; + /// End protected by aio_lock +#endif + + uint64_t last_committed_seq; + uint64_t journaled_since_start; + + string devname; + + /* + * full states cycle at the beginning of each commit epoch, when commit_start() + * is called. + * FULL - we just filled up during this epoch. + * WAIT - we filled up last epoch; now we have to wait until everything during + * that epoch commits to the fs before we can start writing over it. + * NOTFULL - all good, journal away. + */ + enum { + FULL_NOTFULL = 0, + FULL_FULL = 1, + FULL_WAIT = 2, + } full_state; + + int fd; + + // in journal + deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later. + uint64_t writing_seq; + + + // throttle + int set_throttle_params(); + const char** get_tracked_conf_keys() const override; + void handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) override { + for (const char **i = get_tracked_conf_keys(); + *i; + ++i) { + if (changed.count(string(*i))) { + set_throttle_params(); + return; + } + } + } + + void complete_write(uint64_t ops, uint64_t bytes); + JournalThrottle throttle; + + // write thread + Mutex write_lock; + bool write_stop; + bool aio_stop; + + Cond commit_cond; + + int _open(bool wr, bool create=false); + int _open_block_device(); + void _close(int fd) const; + int _open_file(int64_t oldsize, blksize_t blksize, bool create); + int _dump(ostream& out, bool simple); + void print_header(const header_t &hdr) const; + int read_header(header_t *hdr) const; + bufferptr prepare_header(); + void start_writer(); + void stop_writer(); + void write_thread_entry(); + + void queue_completions_thru(uint64_t seq); + + int check_for_full(uint64_t seq, off64_t pos, off64_t 
size); + int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee); + int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, + uint64_t& orig_ops, uint64_t& orig_bytes); + void do_write(bufferlist& bl); + + void write_finish_thread_entry(); + void check_aio_completion(); + void do_aio_write(bufferlist& bl); + int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq); + + + void check_align(off64_t pos, bufferlist& bl); + int write_bl(off64_t& pos, bufferlist& bl); + + /// read len from journal starting at in_pos and wrapping up to len + void wrap_read_bl( + off64_t in_pos, ///< [in] start position + int64_t len, ///< [in] length to read + bufferlist* bl, ///< [out] result + off64_t *out_pos ///< [out] next position to read, will be wrapped + ) const; + + void do_discard(int64_t offset, int64_t end); + + class Writer : public Thread { + FileJournal *journal; + public: + explicit Writer(FileJournal *fj) : journal(fj) {} + void *entry() override { + journal->write_thread_entry(); + return 0; + } + } write_thread; + + class WriteFinisher : public Thread { + FileJournal *journal; + public: + explicit WriteFinisher(FileJournal *fj) : journal(fj) {} + void *entry() override { + journal->write_finish_thread_entry(); + return 0; + } + } write_finish_thread; + + off64_t get_top() const { + return round_up_to(sizeof(header), block_size); + } + + ZTracer::Endpoint trace_endpoint; + + public: + FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond, + const char *f, bool dio=false, bool ai=true, bool faio=false) : + Journal(cct, fsid, fin, sync_cond), + finisher_lock("FileJournal::finisher_lock", false, true, false), + journaled_seq(0), + plug_journal_completions(false), + writeq_lock("FileJournal::writeq_lock", false, true, false), + completions_lock( + "FileJournal::completions_lock", false, true, false), + fn(f), + zero_buf(NULL), + max_size(0), block_size(0), + directio(dio), aio(ai), 
force_aio(faio), + must_write_header(false), + write_pos(0), read_pos(0), + discard(false), +#ifdef HAVE_LIBAIO + aio_lock("FileJournal::aio_lock"), + aio_ctx(0), + aio_num(0), aio_bytes(0), + aio_write_queue_ops(0), + aio_write_queue_bytes(0), +#endif + last_committed_seq(0), + journaled_since_start(0), + full_state(FULL_NOTFULL), + fd(-1), + writing_seq(0), + throttle(cct->_conf->filestore_caller_concurrency), + write_lock("FileJournal::write_lock", false, true, false), + write_stop(true), + aio_stop(true), + write_thread(this), + write_finish_thread(this), + trace_endpoint("0.0.0.0", 0, "FileJournal") { + + if (aio && !directio) { + lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl; + aio = false; + } +#ifndef HAVE_LIBAIO + if (aio && ::getenv("CEPH_DEV") == NULL) { + lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl; + aio = false; + } +#endif + + cct->_conf.add_observer(this); + } + ~FileJournal() override { + ceph_assert(fd == -1); + delete[] zero_buf; + cct->_conf.remove_observer(this); + } + + int check() override; + int create() override; + int open(uint64_t fs_op_seq) override; + void close() override; + int peek_fsid(uuid_d& fsid); + + int dump(ostream& out) override; + int simple_dump(ostream& out); + int _fdump(Formatter &f, bool simple); + + void flush() override; + + void get_devices(set<string> *ls) override; + void collect_metadata(map<string,string> *pm) override; + + void reserve_throttle_and_backoff(uint64_t count) override; + + bool is_writeable() override { + return read_pos == 0; + } + int make_writeable() override; + + // writes + void commit_start(uint64_t seq) override; + void committed_thru(uint64_t seq) override; + bool should_commit_now() override { + return full_state != FULL_NOTFULL && !write_stop; + } + + void write_header_sync(); + + void set_wait_on_full(bool b) { wait_on_full = b; } + + off64_t get_journal_size_estimate(); + + // reads + + /// 
Result code for read_entry + enum read_entry_result { + SUCCESS, + FAILURE, + MAYBE_CORRUPT + }; + + /** + * read_entry + * + * Reads next entry starting at pos. If the entry appears + * clean, *bl will contain the payload, *seq will contain + * the sequence number, and *out_pos will reflect the next + * read position. If the entry is invalid *ss will contain + * debug text, while *seq, *out_pos, and *bl will be unchanged. + * + * If the entry suggests a corrupt log, *ss will contain debug + * text, *out_pos will contain the next index to check. If + * we find an entry in this way that returns SUCCESS, the journal + * is most likely corrupt. + */ + read_entry_result do_read_entry( + off64_t pos, ///< [in] position to read + off64_t *next_pos, ///< [out] next position to read + bufferlist* bl, ///< [out] payload for successful read + uint64_t *seq, ///< [out] seq of successful read + ostream *ss, ///< [out] error output + entry_header_t *h = 0 ///< [out] header + ) const; ///< @return result code + + bool read_entry( + bufferlist &bl, + uint64_t &last_seq, + bool *corrupt + ); + + bool read_entry( + bufferlist &bl, + uint64_t &last_seq) override { + return read_entry(bl, last_seq, 0); + } + + // Debug/Testing + void get_header( + uint64_t wanted_seq, + off64_t *_pos, + entry_header_t *h); + void corrupt( + int wfd, + off64_t corrupt_at); + void corrupt_payload( + int wfd, + uint64_t seq); + void corrupt_footer_magic( + int wfd, + uint64_t seq); + void corrupt_header_magic( + int wfd, + uint64_t seq); +}; + +WRITE_CLASS_ENCODER(FileJournal::header_t) + +#endif diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc new file mode 100644 index 00000000..d387947e --- /dev/null +++ b/src/os/filestore/FileStore.cc @@ -0,0 +1,6425 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright 
(c) 2015 Hewlett-Packard Development Company, L.P. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "include/compat.h" +#include "include/int_types.h" +#include "boost/tuple/tuple.hpp" + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/file.h> +#include <errno.h> +#include <dirent.h> +#include <sys/ioctl.h> + +#if defined(__linux__) +#include <linux/fs.h> +#include <linux/falloc.h> +#endif + +#include <iostream> +#include <map> + +#include "include/linux_fiemap.h" + +#include "common/xattr.h" +#include "chain_xattr.h" + +#if defined(__APPLE__) || defined(__FreeBSD__) +#include <sys/param.h> +#include <sys/mount.h> +#endif + + +#include <fstream> +#include <sstream> + +#include "FileStore.h" +#include "GenericFileStoreBackend.h" +#include "BtrfsFileStoreBackend.h" +#include "XfsFileStoreBackend.h" +#include "ZFSFileStoreBackend.h" +#include "common/BackTrace.h" +#include "include/types.h" +#include "FileJournal.h" + +#include "osd/osd_types.h" +#include "include/color.h" +#include "include/buffer.h" + +#include "common/Timer.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/run_cmd.h" +#include "common/safe_io.h" +#include "common/perf_counters.h" +#include "common/sync_filesystem.h" +#include "common/fd.h" +#include "HashIndex.h" +#include "DBObjectMap.h" +#include "kv/KeyValueDB.h" + +#include "common/ceph_crypto.h" +using ceph::crypto::SHA1; + +#include "include/ceph_assert.h" + +#include "common/config.h" +#include "common/blkdev.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/objectstore.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) 
+#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "filestore(" << basedir << ") " + +#define COMMIT_SNAP_ITEM "snap_%llu" +#define CLUSTER_SNAP_ITEM "clustersnap_%s" + +#define REPLAY_GUARD_XATTR "user.cephos.seq" +#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq" + +// XATTR_SPILL_OUT_NAME as a xattr is used to maintain that indicates whether +// xattrs spill over into DBObjectMap, if XATTR_SPILL_OUT_NAME exists in file +// xattrs and the value is "no", it indicates no xattrs in DBObjectMap +#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out" +#define XATTR_NO_SPILL_OUT "0" +#define XATTR_SPILL_OUT "1" +#define __FUNC__ __func__ << "(" << __LINE__ << ")" + +//Initial features in new superblock. +static CompatSet get_fs_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this FileStore supports. +static CompatSet get_fs_supported_compat_set() { + CompatSet compat = get_fs_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + return compat; +} + +int FileStore::validate_hobject_key(const hobject_t &obj) const +{ + unsigned len = LFNIndex::get_max_escaped_name_len(obj); + return len > m_filestore_max_xattr_value_size ? 
-ENAMETOOLONG : 0; +} + +int FileStore::get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid) +{ + // make sure we don't try to use aio or direct_io (and get annoying + // error messages from failing to do so); performance implications + // should be irrelevant for this use + FileJournal j(cct, *fsid, 0, 0, path.c_str(), false, false); + return j.peek_fsid(*fsid); +} + +void FileStore::FSPerfTracker::update_from_perfcounters( + PerfCounters &logger) +{ + os_commit_latency_ns.consume_next( + logger.get_tavg_ns( + l_filestore_journal_latency)); + os_apply_latency_ns.consume_next( + logger.get_tavg_ns( + l_filestore_apply_latency)); +} + + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s) +{ + return out << "osr(" << s.cid << ")"; +} + +int FileStore::get_cdir(const coll_t& cid, char *s, int len) +{ + const string &cid_str(cid.to_str()); + return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str()); +} + +void FileStore::handle_eio() +{ + // don't try to map this back to an offset; too hard since there is + // a file system in between. we also don't really know whether this + // was a read or a write, since we have so many layers beneath us. + // don't even try. 
+ note_io_error_event(devname.c_str(), basedir.c_str(), -EIO, 0, 0, 0); + ceph_abort_msg("unexpected eio error"); +} + +int FileStore::get_index(const coll_t& cid, Index *index) +{ + int r = index_manager.get_index(cid, basedir, index); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::init_index(const coll_t& cid) +{ + char path[PATH_MAX]; + get_cdir(cid, path, sizeof(path)); + int r = index_manager.init_index(cid, path, target_version); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path) +{ + IndexedPath path2; + if (!path) + path = &path2; + int r, exist; + ceph_assert(index.index); + r = (index.index)->lookup(oid, path, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (!exist) + return -ENOENT; + return 0; +} + +int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length) +{ + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) + return r; + r = ::ftruncate(**fd, length); + if (r < 0) + r = -errno; + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_truncate(**fd, length); + ceph_assert(rc >= 0); + } + lfn_close(fd); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf) +{ + IndexedPath path; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = lfn_find(oid, index, &path); + if (r < 0) + return r; + r = ::stat(path->path(), buf); + if (r < 0) + r = -errno; + return r; +} + +int FileStore::lfn_open(const coll_t& cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index) +{ + ceph_assert(outfd); + int r = 0; + bool need_lock = true; + int flags = O_RDWR; + + if (create) + flags |= O_CREAT; 
+ if (cct->_conf->filestore_odsync_write) { + flags |= O_DSYNC; + } + + Index index2; + if (!index) { + index = &index2; + } + if (!((*index).index)) { + r = get_index(cid, index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + return r; + } + } else { + need_lock = false; + } + + int fd, exist; + ceph_assert((*index).index); + if (need_lock) { + ((*index).index)->access_lock.get_write(); + } + if (!replaying) { + *outfd = fdcache.lookup(oid); + if (*outfd) { + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + return 0; + } + } + + + IndexedPath path2; + IndexedPath *path = &path2; + + r = (*index)->lookup(oid, path, &exist); + if (r < 0) { + derr << "could not find " << oid << " in index: " + << cpp_strerror(-r) << dendl; + goto fail; + } + + r = ::open((*path)->path(), flags|O_CLOEXEC, 0644); + if (r < 0) { + r = -errno; + dout(10) << "error opening file " << (*path)->path() << " with flags=" + << flags << ": " << cpp_strerror(-r) << dendl; + goto fail; + } + fd = r; + if (create && (!exist)) { + r = (*index)->created(oid, (*path)->path()); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error creating " << oid << " (" << (*path)->path() + << ") in index: " << cpp_strerror(-r) << dendl; + goto fail; + } + r = chain_fsetxattr<true, true>( + fd, XATTR_SPILL_OUT_NAME, + XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path() + << "):" << cpp_strerror(-r) << dendl; + goto fail; + } + } + + if (!replaying) { + bool existed; + *outfd = fdcache.add(oid, fd, &existed); + if (existed) { + TEMP_FAILURE_RETRY(::close(fd)); + } + } else { + *outfd = std::make_shared<FDCache::FD>(fd); + } + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + return 0; + + fail: + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + if (r == -EIO && 
m_filestore_fail_eio) handle_eio(); + return r; +} + +void FileStore::lfn_close(FDRef fd) +{ +} + +int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) +{ + Index index_new, index_old; + IndexedPath path_new, path_old; + int exist; + int r; + bool index_same = false; + if (c < newcid) { + r = get_index(newcid, &index_new); + if (r < 0) + return r; + r = get_index(c, &index_old); + if (r < 0) + return r; + } else if (c == newcid) { + r = get_index(c, &index_old); + if (r < 0) + return r; + index_new = index_old; + index_same = true; + } else { + r = get_index(c, &index_old); + if (r < 0) + return r; + r = get_index(newcid, &index_new); + if (r < 0) + return r; + } + + ceph_assert(index_old.index); + ceph_assert(index_new.index); + + if (!index_same) { + + RWLock::RLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (!exist) + return -ENOENT; + + RWLock::WLocker l2((index_new.index)->access_lock); + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << __FUNC__ << ": path_old: " << path_old << dendl; + dout(25) << __FUNC__ << ": path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + } else { + RWLock::WLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (!exist) + return -ENOENT; + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if 
(exist) + return -EEXIST; + + dout(25) << __FUNC__ << ": path_old: " << path_old << dendl; + dout(25) << __FUNC__ << ": path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + // make sure old fd for unlinked/overwritten file is gone + fdcache.clear(newoid); + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + } + return 0; +} + +int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o, + const SequencerPosition &spos, + bool force_clear_omap) +{ + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl; + return r; + } + + ceph_assert(index.index); + RWLock::WLocker l((index.index)->access_lock); + + { + IndexedPath path; + int hardlink; + r = index->lookup(o, &path, &hardlink); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + + if (!force_clear_omap) { + if (hardlink == 0 || hardlink == 1) { + force_clear_omap = true; + } + } + if (force_clear_omap) { + dout(20) << __FUNC__ << ": clearing omap on " << o + << " in cid " << cid << dendl; + r = object_map->clear(o, &spos); + if (r < 0 && r != -ENOENT) { + dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (cct->_conf->filestore_debug_inject_read_err) { + debug_obj_on_delete(o); + } + if (!m_disable_wbthrottle) { + wbthrottle.clear_object(o); // should be only non-cache ref + } + fdcache.clear(o); + } else { + /* Ensure that replay of this op doesn't result in the object_map + * going away. 
+ */ + if (!backend->can_checkpoint()) + object_map->sync(&o, &spos); + } + if (hardlink == 0) { + if (!m_disable_wbthrottle) { + wbthrottle.clear_object(o); // should be only non-cache ref + } + return 0; + } + } + r = index->unlink(o); + if (r < 0) { + dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +FileStore::FileStore(CephContext* cct, const std::string &base, + const std::string &jdev, osflagbits_t flags, + const char *name, bool do_update) : + JournalingObjectStore(cct, base), + internal_name(name), + basedir(base), journalpath(jdev), + generic_flags(flags), + blk_size(0), + fsid_fd(-1), op_fd(-1), + basedir_fd(-1), current_fd(-1), + backend(nullptr), + index_manager(cct, do_update), + lock("FileStore::lock"), + force_sync(false), + sync_entry_timeo_lock("FileStore::sync_entry_timeo_lock"), + timer(cct, sync_entry_timeo_lock), + stop(false), sync_thread(this), + coll_lock("FileStore::coll_lock"), + fdcache(cct), + wbthrottle(cct), + next_osr_id(0), + m_disable_wbthrottle(cct->_conf->filestore_odsync_write || + !cct->_conf->filestore_wbthrottle_enable), + throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency), + throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency), + m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads), + m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads), + op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"), + op_wq(this, cct->_conf->filestore_op_thread_timeout, + cct->_conf->filestore_op_thread_suicide_timeout, &op_tp), + logger(nullptr), + trace_endpoint("0.0.0.0", 0, "FileStore"), + read_error_lock("FileStore::read_error_lock"), + m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout), + m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ), + m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing), + 
m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead), + m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold), + m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval), + m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval), + m_filestore_fail_eio(cct->_conf->filestore_fail_eio), + m_filestore_fadvise(cct->_conf->filestore_fadvise), + do_update(do_update), + m_journal_dio(cct->_conf->journal_dio), + m_journal_aio(cct->_conf->journal_aio), + m_journal_force_aio(cct->_conf->journal_force_aio), + m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap), + m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap), + m_filestore_do_dump(false), + m_filestore_dump_fmt(true), + m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc), + m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size), + m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size), + m_fs_type(0), + m_filestore_max_inline_xattr_size(0), + m_filestore_max_inline_xattrs(0), + m_filestore_max_xattr_value_size(0) +{ + m_filestore_kill_at = cct->_conf->filestore_kill_at; + for (int i = 0; i < m_ondisk_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-ondisk-" << i; + Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore"); + ondisk_finishers.push_back(f); + } + for (int i = 0; i < m_apply_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-apply-" << i; + Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore"); + apply_finishers.push_back(f); + } + + ostringstream oss; + oss << basedir << "/current"; + current_fn = oss.str(); + + ostringstream sss; + sss << basedir << "/current/commit_op_seq"; + current_op_seq_fn = sss.str(); + + ostringstream omss; + if (cct->_conf->filestore_omap_backend_path != "") { + omap_dir = cct->_conf->filestore_omap_backend_path; + } else { + omss << basedir << "/current/omap"; + omap_dir = omss.str(); + } + + // initialize logger 
+ PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last); + + plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue"); + plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied"); + plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue"); + plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied"); + plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency", + NULL, PerfCountersBuilder::PRIO_USEFUL); + plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs"); + plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written"); + plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue"); + plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue"); + plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store"); + plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue"); + plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue"); + plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store"); + plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency"); + plb.add_u64(l_filestore_committing, "committing", "Is currently committing"); + + plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles"); + plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits"); + plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit"); + plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full"); + plb.add_time_avg(l_filestore_queue_transaction_latency_avg, 
"queue_transaction_latency_avg", + "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL); + plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs"); + + logger = plb.create_perf_counters(); + + cct->get_perfcounters_collection()->add(logger); + cct->_conf.add_observer(this); + + superblock.compat_features = get_fs_initial_compat_set(); +} + +FileStore::~FileStore() +{ + for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + delete *it; + *it = nullptr; + } + for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + delete *it; + *it = nullptr; + } + cct->_conf.remove_observer(this); + cct->get_perfcounters_collection()->remove(logger); + + if (journal) + journal->logger = nullptr; + delete logger; + logger = nullptr; + + if (m_filestore_do_dump) { + dump_stop(); + } +} + +static void get_attrname(const char *name, char *buf, int len) +{ + snprintf(buf, len, "user.ceph.%s", name); +} + +bool parse_attrname(char **name) +{ + if (strncmp(*name, "user.ceph.", 10) == 0) { + *name += 10; + return true; + } + return false; +} + +void FileStore::collect_metadata(map<string,string> *pm) +{ + char partition_path[PATH_MAX]; + char dev_node[PATH_MAX]; + + (*pm)["filestore_backend"] = backend->get_name(); + ostringstream ss; + ss << "0x" << std::hex << m_fs_type << std::dec; + (*pm)["filestore_f_type"] = ss.str(); + + if (cct->_conf->filestore_collect_device_partition_information) { + int rc = 0; + BlkDev blkdev(fsid_fd); + if (rc = blkdev.partition(partition_path, PATH_MAX); rc) { + (*pm)["backend_filestore_partition_path"] = "unknown"; + } else { + (*pm)["backend_filestore_partition_path"] = string(partition_path); + } + if (rc = blkdev.wholedisk(dev_node, PATH_MAX); rc) { + (*pm)["backend_filestore_dev_node"] = "unknown"; + } else { + (*pm)["backend_filestore_dev_node"] = string(dev_node); + devname = 
dev_node; + } + if (rc == 0 && vdo_fd >= 0) { + (*pm)["vdo"] = "true"; + (*pm)["vdo_physical_size"] = + stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks")); + } + if (journal) { + journal->collect_metadata(pm); + } + } +} + +int FileStore::get_devices(set<string> *ls) +{ + string dev_node; + BlkDev blkdev(fsid_fd); + if (int rc = blkdev.wholedisk(&dev_node); rc) { + return rc; + } + get_raw_devices(dev_node, ls); + if (journal) { + journal->get_devices(ls); + } + return 0; +} + +int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts) +{ + struct statfs buf; + buf0->reset(); + if (alerts) { + alerts->clear(); // returns nothing for now + } + if (::statfs(basedir.c_str(), &buf) < 0) { + int r = -errno; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + ceph_assert(r != -ENOENT); + return r; + } + + uint64_t bfree = buf.f_bavail * buf.f_bsize; + + // assume all of leveldb/rocksdb is omap. + { + map<string,uint64_t> kv_usage; + buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage); + } + + uint64_t thin_total, thin_avail; + if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) { + buf0->total = thin_total; + bfree = std::min(bfree, thin_avail); + buf0->allocated = thin_total - thin_avail; + buf0->data_stored = bfree; + } else { + buf0->total = buf.f_blocks * buf.f_bsize; + buf0->allocated = bfree; + buf0->data_stored = bfree; + } + buf0->available = bfree; + + // FIXME: we don't know how to populate buf->internal_metadata; XFS doesn't + // tell us what its internal overhead is. 
+ + // Adjust for writes pending in the journal + if (journal) { + uint64_t estimate = journal->get_journal_size_estimate(); + buf0->internally_reserved = estimate; + if (buf0->available > estimate) + buf0->available -= estimate; + else + buf0->available = 0; + } + + return 0; +} + +int FileStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) +{ + return -ENOTSUP; +} + +void FileStore::new_journal() +{ + if (journalpath.length()) { + dout(10) << "open_journal at " << journalpath << dendl; + journal = new FileJournal(cct, fsid, &finisher, &sync_cond, + journalpath.c_str(), + m_journal_dio, m_journal_aio, + m_journal_force_aio); + if (journal) + journal->logger = logger; + } + return; +} + +int FileStore::dump_journal(ostream& out) +{ + int r; + + if (!journalpath.length()) + return -EINVAL; + + FileJournal *journal = new FileJournal(cct, fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio); + r = journal->dump(out); + delete journal; + journal = nullptr; + return r; +} + +FileStoreBackend *FileStoreBackend::create(unsigned long f_type, FileStore *fs) +{ + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + return new BtrfsFileStoreBackend(fs); +# ifdef HAVE_LIBXFS + case XFS_SUPER_MAGIC: + return new XfsFileStoreBackend(fs); +# endif +#endif +#ifdef HAVE_LIBZFS + case ZFS_SUPER_MAGIC: + return new ZFSFileStoreBackend(fs); +#endif + default: + return new GenericFileStoreBackend(fs); + } +} + +void FileStore::create_backend(unsigned long f_type) +{ + m_fs_type = f_type; + + ceph_assert(!backend); + backend = FileStoreBackend::create(f_type, this); + + dout(0) << "backend " << backend->get_name() + << " (magic 0x" << std::hex << f_type << std::dec << ")" + << dendl; + + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + if (!m_disable_wbthrottle){ + wbthrottle.set_fs(WBThrottle::BTRFS); + } + break; + + case XFS_SUPER_MAGIC: + // wbthrottle is constructed with fs(WBThrottle::XFS) + break; +#endif + } + + 
set_xattr_limits_via_conf(); +} + +int FileStore::mkfs() +{ + int ret = 0; + char fsid_fn[PATH_MAX]; + char fsid_str[40]; + uuid_d old_fsid; + uuid_d old_omap_fsid; + + dout(1) << "mkfs in " << basedir << dendl; + basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC); + if (basedir_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + // open+lock fsid + snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + if (lock_fsid() < 0) { + ret = -EBUSY; + goto close_fsid_fd; + } + + if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl; + } else { + dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl; + } + + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << __FUNC__ << ": failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (::fsync(fsid_fd) < 0) { + ret = -errno; + derr << __FUNC__ << ": close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + dout(10) << __FUNC__ << ": fsid is " << fsid << dendl; + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + fsid = old_fsid; + dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl; + } + + // version 
stamp + ret = write_version_stamp(); + if (ret < 0) { + derr << __FUNC__ << ": write_version_stamp() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // superblock + superblock.omap_backend = cct->_conf->filestore_omap_backend; + ret = write_superblock(); + if (ret < 0) { + derr << __FUNC__ << ": write_superblock() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + struct statfs basefs; + ret = ::fstatfs(basedir_fd, &basefs); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": cannot fstatfs basedir " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + +#if defined(__linux__) + if (basefs.f_type == BTRFS_SUPER_MAGIC && + !g_ceph_context->check_experimental_feature_enabled("btrfs")) { + derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl; + goto close_fsid_fd; + } +#endif + + create_backend(basefs.f_type); + + ret = backend->create_current(); + if (ret < 0) { + derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // write initial op_seq + { + uint64_t initial_seq = 0; + int fd = read_op_seq(&initial_seq); + if (fd < 0) { + ret = fd; + derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (initial_seq == 0) { + ret = write_op_seq(fd, 1); + if (ret < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (backend->can_checkpoint()) { + // create snap_1 too + current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC); + ceph_assert(current_fd >= 0); + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull); + ret = backend->create_checkpoint(s, nullptr); + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + if (ret < 0 && ret != -EEXIST) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << __FUNC__ << ": failed to 
create snap_1: " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + } + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir); + if (ret < 0) { + derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl; + goto close_fsid_fd; + } + // create fsid under omap + // open+lock fsid + int omap_fsid_fd; + char omap_fsid_fn[PATH_MAX]; + snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str()); + omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644); + if (omap_fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) { + ceph_assert(!fsid.is_zero()); + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(omap_fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + ret = safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << __FUNC__ << ": failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl; + if (::fsync(omap_fsid_fd) < 0) { + ret = -errno; + derr << __FUNC__ << ": close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + dout(10) << "mkfs omap fsid is " << fsid << dendl; + } else { + if (fsid != old_omap_fsid) { + derr << __FUNC__ << ": " << omap_fsid_fn + << " has existed omap fsid " << old_omap_fsid + << " != expected osd fsid " << fsid + << dendl; + ret = -EINVAL; + goto close_omap_fsid_fd; + } + dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl; + } + + dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl; 
+ + // journal? + ret = mkjournal(); + if (ret) + goto close_omap_fsid_fd; + + ret = write_meta("type", "filestore"); + if (ret) + goto close_omap_fsid_fd; + + dout(1) << "mkfs done in " << basedir << dendl; + ret = 0; + + close_omap_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd)); + close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + delete backend; + backend = nullptr; + return ret; +} + +int FileStore::mkjournal() +{ + // read fsid + int ret; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC, 0644); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl; + return -err; + } + ret = read_fsid(fd, &fsid); + if (ret < 0) { + derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + ret = 0; + + new_journal(); + if (journal) { + ret = journal->check(); + if (ret < 0) { + ret = journal->create(); + if (ret) + derr << __FUNC__ << ": error creating journal on " << journalpath + << ": " << cpp_strerror(ret) << dendl; + else + dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl; + } + delete journal; + journal = nullptr; + } + return ret; +} + +int FileStore::read_fsid(int fd, uuid_d *uuid) +{ + char fsid_str[40]; + memset(fsid_str, 0, sizeof(fsid_str)); + int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) + return ret; + if (ret == 8) { + // old 64-bit fsid... mirror it. 
+ *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; + *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; + return 0; + } + + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) + return -EINVAL; + return 0; +} + +int FileStore::lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? " + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool FileStore::test_mount_in_use() +{ + dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + + // verify fs isn't in use + + fsid_fd = ::open(fn, O_RDWR|O_CLOEXEC, 0644); + if (fsid_fd < 0) + return 0; // no fsid, ok. + bool inuse = lock_fsid() < 0; + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + return inuse; +} + +bool FileStore::is_rotational() +{ + bool rotational; + if (backend) { + rotational = backend->is_rotational(); + } else { + int fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return true; + struct statfs st; + int r = ::fstatfs(fd, &st); + ::close(fd); + if (r < 0) { + return true; + } + create_backend(st.f_type); + rotational = backend->is_rotational(); + delete backend; + backend = nullptr; + } + dout(10) << __func__ << " " << (int)rotational << dendl; + return rotational; +} + +bool FileStore::is_journal_rotational() +{ + bool journal_rotational; + if (backend) { + journal_rotational = backend->is_journal_rotational(); + } else { + int fd = ::open(journalpath.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return true; + struct statfs st; + int r = ::fstatfs(fd, &st); + ::close(fd); + if (r < 0) { + return true; + } + create_backend(st.f_type); + 
journal_rotational = backend->is_journal_rotational(); + delete backend; + backend = nullptr; + } + dout(10) << __func__ << " " << (int)journal_rotational << dendl; + return journal_rotational; +} + +int FileStore::_detect_fs() +{ + struct statfs st; + int r = ::fstatfs(basedir_fd, &st); + if (r < 0) + return -errno; + + blk_size = st.f_bsize; + +#if defined(__linux__) + if (st.f_type == BTRFS_SUPER_MAGIC && + !g_ceph_context->check_experimental_feature_enabled("btrfs")) { + derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl; + return -EPERM; + } +#endif + + create_backend(st.f_type); + + r = backend->detect_features(); + if (r < 0) { + derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl; + return r; + } + + // vdo + { + char dev_node[PATH_MAX]; + if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) { + vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name); + if (vdo_fd >= 0) { + dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node + << dendl; + } + } + } + + // test xattrs + char fn[PATH_MAX]; + int x = rand(); + int y = x+1; + snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str()); + int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0700); + if (tmpfd < 0) { + int ret = -errno; + derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x)); + if (ret >= 0) + ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y)); + if ((ret < 0) || (x != y)) { + derr << "Extended attributes don't appear to work. "; + if (ret) + *_dout << "Got error " + cpp_strerror(ret) + ". "; + *_dout << "If you are using ext3 or ext4, be sure to mount the underlying " + << "file system with the 'user_xattr' option." 
<< dendl; + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + return -ENOTSUP; + } + + char buf[1000]; + memset(buf, 0, sizeof(buf)); // shut up valgrind + chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); + ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); + if (ret == -ENOSPC) { + dout(0) << "limited size xattrs" << dendl; + } + chain_fremovexattr(tmpfd, "user.test"); + chain_fremovexattr(tmpfd, "user.test2"); + chain_fremovexattr(tmpfd, "user.test3"); + chain_fremovexattr(tmpfd, "user.test4"); + chain_fremovexattr(tmpfd, "user.test5"); + + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + + return 0; +} + +int FileStore::_sanity_check_fs() +{ + // sanity check(s) + + if (((int)m_filestore_journal_writeahead + + (int)m_filestore_journal_parallel + + (int)m_filestore_journal_trailing) > 1) { + dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl; + cerr << TEXT_RED + << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n" + << " is enabled in ceph.conf. You must choose a single journal mode." + << TEXT_NORMAL << std::endl; + return -EINVAL; + } + + if (!backend->can_checkpoint()) { + if (!journal || !m_filestore_journal_writeahead) { + dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl; + cerr << TEXT_RED + << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n" + << " For non-btrfs volumes, a writeahead journal is required to\n" + << " maintain on-disk consistency in the event of a crash. 
Your conf\n" + << " should include something like:\n" + << " osd journal = /path/to/journal_device_or_file\n" + << " filestore journal writeahead = true\n" + << TEXT_NORMAL; + } + } + + if (!journal) { + dout(0) << "mount WARNING: no journal" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: No osd journal is configured: write latency may be high.\n" + << " If you will not be using an osd journal, write latency may be\n" + << " relatively high. It can be reduced somewhat by lowering\n" + << " filestore_max_sync_interval, but lower values mean lower write\n" + << " throughput, especially with spinning disks.\n" + << TEXT_NORMAL; + } + + return 0; +} + +int FileStore::write_superblock() +{ + bufferlist bl; + encode(superblock, bl); + return safe_write_file(basedir.c_str(), "superblock", + bl.c_str(), bl.length(), 0600); +} + +int FileStore::read_superblock() +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "superblock", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) { + // If the file doesn't exist write initial CompatSet + return write_superblock(); + } + return ret; + } + + bufferlist bl; + bl.push_back(std::move(bp)); + auto i = bl.cbegin(); + decode(superblock, i); + return 0; +} + +int FileStore::update_version_stamp() +{ + return write_version_stamp(); +} + +int FileStore::version_stamp_is_valid(uint32_t *version) +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "store_version", + bp.c_str(), bp.length()); + if (ret < 0) { + return ret; + } + bufferlist bl; + bl.push_back(std::move(bp)); + auto i = bl.cbegin(); + decode(*version, i); + dout(10) << __FUNC__ << ": was " << *version << " vs target " + << target_version << dendl; + if (*version == target_version) + return 1; + else + return 0; +} + +int FileStore::flush_cache(ostream *os) +{ + string drop_caches_file = "/proc/sys/vm/drop_caches"; + int drop_caches_fd = ::open(drop_caches_file.c_str(), O_WRONLY|O_CLOEXEC), ret = 0; + char buf[2] = 
"3"; + size_t len = strlen(buf); + + if (drop_caches_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << drop_caches_file << ": " << cpp_strerror(ret) << dendl; + if (os) { + *os << "FileStore flush_cache: failed to open " << drop_caches_file << ": " << cpp_strerror(ret); + } + return ret; + } + + if (::write(drop_caches_fd, buf, len) < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to write to " << drop_caches_file << ": " << cpp_strerror(ret) << dendl; + if (os) { + *os << "FileStore flush_cache: failed to write to " << drop_caches_file << ": " << cpp_strerror(ret); + } + goto out; + } + +out: + ::close(drop_caches_fd); + return ret; +} + +int FileStore::write_version_stamp() +{ + dout(1) << __FUNC__ << ": " << target_version << dendl; + bufferlist bl; + encode(target_version, bl); + + return safe_write_file(basedir.c_str(), "store_version", + bl.c_str(), bl.length(), 0600); +} + +int FileStore::upgrade() +{ + dout(1) << __FUNC__ << dendl; + uint32_t version; + int r = version_stamp_is_valid(&version); + + if (r == -ENOENT) { + derr << "The store_version file doesn't exist." << dendl; + return -EINVAL; + } + if (r < 0) + return r; + if (r == 1) + return 0; + + if (version < 3) { + derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl; + return -EINVAL; + } + + // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to + // open up DBObjectMap with the do_upgrade flag, which we already did. 
  update_version_stamp();
  return 0;
}

// Read the persisted commit op sequence number from current_op_seq_fn.
// On success: stores the parsed value in *seq and returns the open
// O_RDWR fd (ownership passes to the caller, who must close it).
// On failure: returns a negative errno (and may abort on EIO if
// m_filestore_fail_eio is set).
int FileStore::read_op_seq(uint64_t *seq)
{
  int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR|O_CLOEXEC, 0644);
  if (op_fd < 0) {
    int r = -errno;
    if (r == -EIO && m_filestore_fail_eio) handle_eio();
    return r;
  }
  char s[40];
  memset(s, 0, sizeof(s));
  // leave one byte of NUL padding so atoll() always sees a terminator
  int ret = safe_read(op_fd, s, sizeof(s) - 1);
  if (ret < 0) {
    derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl;
    VOID_TEMP_FAILURE_RETRY(::close(op_fd));
    ceph_assert(!m_filestore_fail_eio || ret != -EIO);
    return ret;
  }
  *seq = atoll(s);
  return op_fd;
}

// Persist `seq` as a newline-terminated decimal string at offset 0 of `fd`.
// Returns the byte count written by pwrite(2), or a negative errno.
// NOTE(review): the format uses PRId64 (signed) for a uint64_t value;
// PRIu64 would be the matching formatter — confirm whether seq can ever
// exceed INT64_MAX in practice.
int FileStore::write_op_seq(int fd, uint64_t seq)
{
  char s[30];
  snprintf(s, sizeof(s), "%" PRId64 "\n", seq);
  int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0));
  if (ret < 0) {
    ret = -errno;
    ceph_assert(!m_filestore_fail_eio || ret != -EIO);
  }
  return ret;
}

// Bring the store online.  Phases, in order:
//   1. validate basedir / fsid / version stamp / superblock compat set
//   2. detect the backing fs (btrfs/xfs/generic) and enumerate snapshots
//   3. optionally roll back to a checkpoint (or refuse if current/ is
//      marked non-snapshotted via the "nosnap" file)
//   4. open current/, read the op sequence, cross-check the omap fsid
//   5. initialize the omap KV store, journal, wbthrottle, sync thread
//   6. replay the journal, create temp collections, start thread pools
// Error handling is a classic goto-unwind ladder: each label undoes
// everything opened after the previous label, so the order of the
// labels at the bottom mirrors the acquisition order above.
// Returns 0 on success, negative errno otherwise.
int FileStore::mount()
{
  int ret;
  char buf[PATH_MAX];
  uint64_t initial_op_seq;
  uuid_d omap_fsid;
  set<string> cluster_snaps;
  CompatSet supported_compat_set = get_fs_supported_compat_set();

  dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;

  ret = set_throttle_params();
  if (ret != 0)
    goto done;

  // make sure global base dir exists
  if (::access(basedir.c_str(), R_OK | W_OK)) {
    ret = -errno;
    derr << __FUNC__ << ": unable to access basedir '" << basedir << "': "
         << cpp_strerror(ret) << dendl;
    goto done;
  }

  // get fsid
  snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
  fsid_fd = ::open(buf, O_RDWR|O_CLOEXEC, 0644);
  if (fsid_fd < 0) {
    ret = -errno;
    derr << __FUNC__ << ": error opening '" << buf << "': "
         << cpp_strerror(ret) << dendl;
    goto done;
  }

  ret = read_fsid(fsid_fd, &fsid);
  if (ret < 0) {
    derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret)
         << dendl;
    goto close_fsid_fd;
  }

  // advisory lock on the fsid file guards against two daemons
  // mounting the same store concurrently
  if (lock_fsid() < 0) {
    derr << __FUNC__ << ": lock_fsid failed" << dendl;
    ret = -EBUSY;
    goto close_fsid_fd;
  }

  dout(10) << "mount fsid is " << fsid << dendl;


  uint32_t version_stamp;
  ret = version_stamp_is_valid(&version_stamp);
  if (ret < 0) {
    derr << __FUNC__ << ": error in version_stamp_is_valid: "
         << cpp_strerror(ret) << dendl;
    goto close_fsid_fd;
  } else if (ret == 0) {
    // stale on-disk format: upgrade only if explicitly permitted
    if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) {
      derr << __FUNC__ << ": stale version stamp detected: "
           << version_stamp
           << ". Proceeding, do_update "
           << "is set, performing disk format upgrade."
           << dendl;
      do_update = true;
    } else {
      ret = -EINVAL;
      derr << __FUNC__ << ": stale version stamp " << version_stamp
           << ". Please run the FileStore update script before starting the "
           << "OSD, or set filestore_update_to to " << target_version
           << " (currently " << cct->_conf->filestore_update_to << ")"
           << dendl;
      goto close_fsid_fd;
    }
  }

  ret = read_superblock();
  if (ret < 0) {
    goto close_fsid_fd;
  }

  // Check if this FileStore supports all the necessary features to mount
  if (supported_compat_set.compare(superblock.compat_features) == -1) {
    derr << __FUNC__ << ": Incompatible features set "
         << superblock.compat_features << dendl;
    ret = -EINVAL;
    goto close_fsid_fd;
  }

  // open some dir handles
  basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
  if (basedir_fd < 0) {
    ret = -errno;
    derr << __FUNC__ << ": failed to open " << basedir << ": "
         << cpp_strerror(ret) << dendl;
    basedir_fd = -1;
    goto close_fsid_fd;
  }

  // test for btrfs, xattrs, etc.
  ret = _detect_fs();
  if (ret < 0) {
    derr << __FUNC__ << ": error in _detect_fs: "
         << cpp_strerror(ret) << dendl;
    goto close_basedir_fd;
  }

  {
    // enumerate existing checkpoints; commit snaps are collected into
    // `snaps` (ascending), cluster snaps into `cluster_snaps`
    list<string> ls;
    ret = backend->list_checkpoints(ls);
    if (ret < 0) {
      derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl;
      goto close_basedir_fd;
    }

    long long unsigned c, prev = 0;
    char clustersnap[NAME_MAX];
    for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
      if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
        ceph_assert(c > prev);
        prev = c;
        snaps.push_back(c);
      } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
        cluster_snaps.insert(*it);
    }
  }

  if (m_osd_rollback_to_cluster_snap.length() &&
      cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
    derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
    ret = -ENOENT;
    goto close_basedir_fd;
  }

  char nosnapfn[200];
  snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());

  if (backend->can_checkpoint()) {
    if (snaps.empty()) {
      dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
    } else {
      char s[NAME_MAX];
      uint64_t curr_seq = 0;

      if (m_osd_rollback_to_cluster_snap.length()) {
        derr << TEXT_RED
             << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
             << TEXT_NORMAL
             << dendl;
        ceph_assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
        snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
      } else {
        {
          int fd = read_op_seq(&curr_seq);
          if (fd >= 0) {
            VOID_TEMP_FAILURE_RETRY(::close(fd));
          }
        }
        if (curr_seq)
          dout(10) << " current/ seq was " << curr_seq << dendl;
        else
          dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;

        uint64_t cp = snaps.back();
        dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;

        // if current/ is marked as non-snapshotted, refuse to roll
        // back (without clear direction) to avoid throwing out new
        // data.
        struct stat st;
        if (::stat(nosnapfn, &st) == 0) {
          if (!m_osd_use_stale_snap) {
            derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
            derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
            derr << "config option for --osd-use-stale-snap startup argument." << dendl;
            ret = -ENOTSUP;
            goto close_basedir_fd;
          }
          derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
               << ", newest snap is " << cp << dendl;
          cerr << TEXT_YELLOW
               << " ** WARNING: forcing the use of stale snapshot data **"
               << TEXT_NORMAL << std::endl;
        }

        dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl;
        snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
      }

      // drop current?
      ret = backend->rollback_to(s);
      if (ret) {
        derr << __FUNC__ << ": error rolling back to " << s << ": "
             << cpp_strerror(ret) << dendl;
        goto close_basedir_fd;
      }
    }
  }
  initial_op_seq = 0;

  current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
  if (current_fd < 0) {
    ret = -errno;
    derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
    goto close_basedir_fd;
  }

  ceph_assert(current_fd >= 0);

  op_fd = read_op_seq(&initial_op_seq);
  if (op_fd < 0) {
    ret = op_fd;
    derr << __FUNC__ << ": read_op_seq failed" << dendl;
    goto close_current_fd;
  }

  dout(5) << "mount op_seq is " << initial_op_seq << dendl;
  if (initial_op_seq == 0) {
    derr << "mount initial op seq is 0; something is wrong" << dendl;
    ret = -EINVAL;
    goto close_current_fd;
  }

  if (!backend->can_checkpoint()) {
    // mark current/ as non-snapshotted so that we don't rollback away
    // from it.
    int r = ::creat(nosnapfn, 0644);
    if (r < 0) {
      ret = -errno;
      derr << __FUNC__ << ": failed to create current/nosnap" << dendl;
      goto close_current_fd;
    }
    VOID_TEMP_FAILURE_RETRY(::close(r));
  } else {
    // clear nosnap marker, if present.
    ::unlink(nosnapfn);
  }

  // check fsid with omap
  // get omap fsid
  char omap_fsid_buf[PATH_MAX];
  struct ::stat omap_fsid_stat;
  snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str());
  // if osd_uuid not exists, assume as this omap matchs corresponding osd
  if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0){
    dout(10) << __FUNC__ << ": osd_uuid not found under omap, "
             << "assume as matched."
             << dendl;
  } else {
    int omap_fsid_fd;
    // if osd_uuid exists, compares osd_uuid with fsid
    omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY|O_CLOEXEC, 0644);
    if (omap_fsid_fd < 0) {
        ret = -errno;
        derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': "
             << cpp_strerror(ret)
             << dendl;
        goto close_current_fd;
    }
    ret = read_fsid(omap_fsid_fd, &omap_fsid);
    VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
    if (ret < 0) {
      derr << __FUNC__ << ": error reading omap_fsid_fd"
           << ", omap_fsid = " << omap_fsid
           << cpp_strerror(ret)
           << dendl;
      goto close_current_fd;
    }
    if (fsid != omap_fsid) {
      derr << __FUNC__ << ": " << omap_fsid_buf
           << " has existed omap fsid " << omap_fsid
           << " != expected osd fsid " << fsid
           << dendl;
      ret = -EINVAL;
      goto close_current_fd;
    }
  }

  dout(0) << "start omap initiation" << dendl;
  if (!(generic_flags & SKIP_MOUNT_OMAP)) {
    KeyValueDB * omap_store = KeyValueDB::create(cct,
                                                 superblock.omap_backend,
                                                 omap_dir);
    if (!omap_store)
    {
      derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl;
      ret = -1;
      goto close_current_fd;
    }

    if (superblock.omap_backend == "rocksdb")
      ret = omap_store->init(cct->_conf->filestore_rocksdb_options);
    else
      ret = omap_store->init();

    if (ret < 0) {
      derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl;
      goto close_current_fd;
    }

    stringstream err;
    if (omap_store->create_and_open(err)) {
      delete omap_store;
      omap_store = nullptr;
      derr << __FUNC__ << ": Error initializing " << superblock.omap_backend
           << " : " << err.str() << dendl;
      ret = -1;
      goto close_current_fd;
    }

    // DBObjectMap takes ownership of omap_store from here on
    DBObjectMap *dbomap = new DBObjectMap(cct, omap_store);
    ret = dbomap->init(do_update);
    if (ret < 0) {
      delete dbomap;
      dbomap = nullptr;
      derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl;
      goto close_current_fd;
    }
    stringstream err2;

    if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
      derr << err2.str() << dendl;
      delete dbomap;
      dbomap = nullptr;
      ret = -EINVAL;
      goto close_current_fd;
    }
    object_map.reset(dbomap);
  }

  // journal
  new_journal();

  // select journal mode?
  // Without an explicit config choice: writeahead when the backend
  // cannot checkpoint, parallel when it can.
  if (journal) {
    if (!m_filestore_journal_writeahead &&
        !m_filestore_journal_parallel &&
        !m_filestore_journal_trailing) {
      if (!backend->can_checkpoint()) {
        m_filestore_journal_writeahead = true;
        dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
      } else {
        m_filestore_journal_parallel = true;
        dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
      }
    } else {
      if (m_filestore_journal_writeahead)
        dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
      if (m_filestore_journal_parallel)
        dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl;
      if (m_filestore_journal_trailing)
        dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl;
    }
    if (m_filestore_journal_writeahead)
      journal->set_wait_on_full(true);
  } else {
    dout(0) << __FUNC__ << ": no journal" << dendl;
  }

  ret = _sanity_check_fs();
  if (ret) {
    derr << __FUNC__ << ": _sanity_check_fs failed with error "
         << ret << dendl;
    goto close_current_fd;
  }

  // Cleanup possibly invalid collections
  {
    vector<coll_t> collections;
    ret = list_collections(collections, true);
    if (ret < 0) {
      derr << "Error " << ret << " while listing collections" << dendl;
      goto close_current_fd;
    }
    for (vector<coll_t>::iterator i = collections.begin();
         i != collections.end();
         ++i) {
      Index index;
      ret = get_index(*i, &index);
      if (ret < 0) {
        derr << "Unable to mount index " << *i
             << " with error: " << ret << dendl;
        goto close_current_fd;
      }
      ceph_assert(index.index);
      RWLock::WLocker l((index.index)->access_lock);

      index->cleanup();
    }
  }
  if (!m_disable_wbthrottle) {
    wbthrottle.start();
  } else {
    dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl;
    if (cct->_conf->filestore_odsync_write) {
      dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl;
    }
  }
  // sync thread must be running before journal replay so replayed ops
  // can be committed; failures past this point unwind through stop_sync
  sync_thread.create("filestore_sync");

  if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
    ret = journal_replay(initial_op_seq);
    if (ret < 0) {
      derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
      if (ret == -ENOTTY) {
        derr << "maybe journal is not pointing to a block device and its size "
             << "wasn't configured?" << dendl;
      }

      goto stop_sync;
    }
  }

  {
    stringstream err2;
    if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) {
      derr << err2.str() << dendl;
      ret = -EINVAL;
      goto stop_sync;
    }
  }

  init_temp_collections();

  journal_start();

  op_tp.start();
  for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
    (*it)->start();
  }
  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
    (*it)->start();
  }

  timer.init();

  // upgrade?
  if (cct->_conf->filestore_update_to >= (int)get_target_version()) {
    int err = upgrade();
    if (err < 0) {
      derr << "error converting store" << dendl;
      umount();
      return err;
    }
  }

  // all okay.
  return 0;

stop_sync:
  // stop sync thread
  lock.Lock();
  stop = true;
  sync_cond.Signal();
  lock.Unlock();
  sync_thread.join();
  if (!m_disable_wbthrottle) {
    wbthrottle.stop();
  }
close_current_fd:
  VOID_TEMP_FAILURE_RETRY(::close(current_fd));
  current_fd = -1;
close_basedir_fd:
  VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
  basedir_fd = -1;
close_fsid_fd:
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
done:
  ceph_assert(!m_filestore_fail_eio || ret != -EIO);
  delete backend;
  backend = nullptr;
  object_map.reset();
  return ret;
}

// Reconcile on-disk temp collections with their parent collections:
// create an OpSequencer for every non-temp collection, create any
// missing <coll>_temp companion, and recursively remove temp
// collections whose parent no longer exists.  Called from mount()
// after journal replay; any failure here is fatal (asserts).
void FileStore::init_temp_collections()
{
  dout(10) << __FUNC__ << dendl;
  vector<coll_t> ls;
  int r = list_collections(ls, true);
  ceph_assert(r >= 0);

  dout(20) << " ls " << ls << dendl;

  SequencerPosition spos;

  set<coll_t> temps;
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p)
    if (p->is_temp())
      temps.insert(*p);
  dout(20) << " temps " << temps << dendl;

  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    if (p->is_temp())
      continue;
    coll_map[*p] = new OpSequencer(cct, ++next_osr_id, *p);
    if (p->is_meta())
      continue;
    coll_t temp = p->get_temp();
    if (temps.count(temp)) {
      temps.erase(temp);
    } else {
      dout(10) << __FUNC__ << ": creating " << temp << dendl;
      r = _create_collection(temp, 0, spos);
      ceph_assert(r == 0);
    }
  }

  // whatever is left in `temps` has no live parent collection
  for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) {
    dout(10) << __FUNC__ << ": removing stray " << *p << dendl;
    r = _collection_remove_recursive(*p, spos);
    ceph_assert(r == 0);
  }
}

// Tear down the store: flush and force a final sync, stop the sync
// thread / wbthrottle / op threadpool / journal / finishers (in that
// order, mirroring mount()), close every cached fd, and release the
// backend and object map.  Always returns 0.
int FileStore::umount()
{
  dout(5) << __FUNC__ << ": " << basedir << dendl;

  flush();
  sync();
  do_force_sync();

  {
    Mutex::Locker l(coll_lock);
    coll_map.clear();
  }

  lock.Lock();
  stop = true;
  sync_cond.Signal();
  lock.Unlock();
  sync_thread.join();
  if (!m_disable_wbthrottle){
    wbthrottle.stop();
  }
  op_tp.stop();

  journal_stop();
  if (!(generic_flags & SKIP_JOURNAL_REPLAY))
    journal_write_close();

  for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
    (*it)->stop();
  }
  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
    (*it)->stop();
  }

  if (vdo_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
    vdo_fd = -1;
  }
  if (fsid_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
    fsid_fd = -1;
  }
  if (op_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(op_fd));
    op_fd = -1;
  }
  if (current_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(current_fd));
    current_fd = -1;
  }
  if (basedir_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
    basedir_fd = -1;
  }

  force_sync = false;

  delete backend;
  backend = nullptr;

  object_map.reset();

  {
    Mutex::Locker l(sync_entry_timeo_lock);
    timer.shutdown();
  }

  // nothing
  return 0;
}


/// -----------------------------

// keep OpSequencer handles alive for all time so that a sequence
// that removes a collection and creates a new one will not allow
// two sequencers for the same collection to be alive at once.

// Look up the OpSequencer for an existing collection.  Returns an
// empty handle if the collection is unknown.
ObjectStore::CollectionHandle FileStore::open_collection(const coll_t& c)
{
  Mutex::Locker l(coll_lock);
  auto p = coll_map.find(c);
  if (p == coll_map.end()) {
    return CollectionHandle();
  }
  return p->second;
}

// Get-or-create the OpSequencer for a collection; new sequencers get
// the next monotonically increasing osr id.
ObjectStore::CollectionHandle FileStore::create_new_collection(const coll_t& c)
{
  Mutex::Locker l(coll_lock);
  auto p = coll_map.find(c);
  if (p == coll_map.end()) {
    auto *r = new OpSequencer(cct, ++next_osr_id, c);
    coll_map[c] = r;
    return r;
  } else {
    return p->second;
  }
}


/// -----------------------------

// Package a batch of transactions plus completion callbacks into a
// heap-allocated Op.  Note: `tls` is moved from — the caller's vector
// is left empty.  Op/byte totals are accumulated for throttling and
// perf counters.  o->op (the sequence number) is assigned later by
// the caller.
FileStore::Op *FileStore::build_op(vector<Transaction>& tls,
                                   Context *onreadable,
                                   Context *onreadable_sync,
                                   TrackedOpRef osd_op)
{
  uint64_t bytes = 0, ops = 0;
  for (vector<Transaction>::iterator p = tls.begin();
       p != tls.end();
       ++p) {
    bytes += (*p).get_num_bytes();
    ops += (*p).get_num_ops();
  }

  Op *o = new Op;
  o->start = ceph_clock_now();
  o->tls = std::move(tls);
  o->onreadable = onreadable;
  o->onreadable_sync = onreadable_sync;
  o->ops = ops;
  o->bytes = bytes;
  o->osd_op = osd_op;
  return o;
}



// Hand an Op to the apply threadpool via its sequencer.
void FileStore::queue_op(OpSequencer *osr, Op *o)
{
  // queue op on sequencer, then queue sequencer for the threadpool,
  // so that regardless of which order the threads pick up the
  // sequencer, the op order will be preserved.

  osr->queue(o);
  o->trace.event("queued");

  logger->inc(l_filestore_ops);
  logger->inc(l_filestore_bytes, o->bytes);

  dout(5) << __FUNC__ << ": " << o << " seq " << o->op
          << " " << *osr
          << " " << o->bytes << " bytes"
          << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
          << dendl;
  op_wq.queue(osr);
}

// Acquire op-queue throttle budget (may block until capacity is
// available); released in op_queue_release_throttle().
void FileStore::op_queue_reserve_throttle(Op *o)
{
  throttle_ops.get();
  throttle_bytes.get(o->bytes);

  logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
  logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
}

// Return the budget reserved by op_queue_reserve_throttle().
void FileStore::op_queue_release_throttle(Op *o)
{
  throttle_ops.put();
  throttle_bytes.put(o->bytes);
  logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
  logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
}

// Threadpool worker: apply the next Op on this sequencer to the
// filesystem.  Takes osr->apply_lock and deliberately leaves it held;
// _finish_op() (run later for the same Op) releases it.
void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
{
  if (!m_disable_wbthrottle) {
    wbthrottle.throttle();
  }
  // inject a stall?
  if (cct->_conf->filestore_inject_stall) {
    int orig = cct->_conf->filestore_inject_stall;
    dout(5) << __FUNC__ << ": filestore_inject_stall " << orig << ", sleeping" << dendl;
    sleep(orig);
    cct->_conf.set_val("filestore_inject_stall", "0");
    dout(5) << __FUNC__ << ": done stalling" << dendl;
  }

  osr->apply_lock.Lock();
  Op *o = osr->peek_queue();
  o->trace.event("op_apply_start");
  apply_manager.op_apply_start(o->op);
  dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " start" << dendl;
  o->trace.event("_do_transactions start");
  int r = _do_transactions(o->tls, o->op, &handle, osr->osr_name);
  o->trace.event("op_apply_finish");
  apply_manager.op_apply_finish(o->op);
  dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " r = " << r
           << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl;
}

// Threadpool completion for _do_op(): dequeue the applied Op, release
// apply_lock and throttle budget, record latency, fire the readable
// callbacks, and free the Op.
void FileStore::_finish_op(OpSequencer *osr)
{
  list<Context*> to_queue;
  Op *o = osr->dequeue(&to_queue);

  o->tls.clear();

  utime_t lat = ceph_clock_now();
  lat -= o->start;

  dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " lat " << lat << dendl;
  osr->apply_lock.Unlock();  // locked in _do_op
  o->trace.event("_finish_op");

  // called with tp lock held
  op_queue_release_throttle(o);

  logger->tinc(l_filestore_apply_latency, lat);

  // onreadable_sync runs inline; onreadable is deferred to a finisher
  // thread chosen by sequencer id
  if (o->onreadable_sync) {
    o->onreadable_sync->complete(0);
  }
  if (o->onreadable) {
    apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
  }
  if (!to_queue.empty()) {
    apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
  }
  delete o;
  o = nullptr;
}

// Journal completion context for writeahead mode: fires once the
// journal entry is durable and forwards to _journaled_ahead().
struct C_JournaledAhead : public Context {
  FileStore *fs;
  FileStore::OpSequencer *osr;
  FileStore::Op *o;
  Context *ondisk;

  C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk):
    fs(f), osr(os), o(o), ondisk(ondisk) { }
  void finish(int r) override {
    fs->_journaled_ahead(osr, o, ondisk);
  }
};

// Entry point for submitting transactions.  Three paths, chosen by
// journal mode:
//   parallel   - journal and apply concurrently
//   writeahead - journal first; apply is queued from _journaled_ahead()
//                once the journal entry commits
//   trailing   - apply synchronously here, then journal
// A nullptr journal degenerates to direct queueing with ondisk waiters
// registered on the apply manager.  Sequence numbers are allocated and
// published under the submit_manager lock to keep submission ordered.
int FileStore::queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
                                  TrackedOpRef osd_op,
                                  ThreadPool::TPHandle *handle)
{
  Context *onreadable;
  Context *ondisk;
  Context *onreadable_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &onreadable, &ondisk, &onreadable_sync);

  if (cct->_conf->objectstore_blackhole) {
    dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction"
            << dendl;
    delete ondisk;
    ondisk = nullptr;
    delete onreadable;
    onreadable = nullptr;
    delete onreadable_sync;
    onreadable_sync = nullptr;
    return 0;
  }

  utime_t start = ceph_clock_now();

  OpSequencer *osr = static_cast<OpSequencer*>(ch.get());
  dout(5) << __FUNC__ << ": osr " << osr << " " << *osr << dendl;

  ZTracer::Trace trace;
  if (osd_op && osd_op->pg_trace) {
    osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace);
    trace = osd_op->store_trace;
  }

  if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
    Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);

    //prepare and encode transactions data out of lock
    bufferlist tbl;
    int orig_len = journal->prepare_entry(o->tls, &tbl);

    if (handle)
      handle->suspend_tp_timeout();

    op_queue_reserve_throttle(o);
    journal->reserve_throttle_and_backoff(tbl.length());

    if (handle)
      handle->reset_tp_timeout();

    uint64_t op_num = submit_manager.op_submit_start();
    o->op = op_num;
    trace.keyval("opnum", op_num);

    if (m_filestore_do_dump)
      dump_transactions(o->tls, o->op, osr);

    if (m_filestore_journal_parallel) {
      dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl;

      trace.keyval("journal mode", "parallel");
      trace.event("journal started");
      _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);

      // queue inside submit_manager op submission lock
      queue_op(osr, o);
      trace.event("op queued");
    } else if (m_filestore_journal_writeahead) {
      dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl;

      osr->queue_journal(o);

      trace.keyval("journal mode", "writeahead");
      trace.event("journal started");
      _op_journal_transactions(tbl, orig_len, o->op,
                               new C_JournaledAhead(this, osr, o, ondisk),
                               osd_op);
    } else {
      ceph_abort();
    }
    submit_manager.op_submit_finish(op_num);
    utime_t end = ceph_clock_now();
    logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
    return 0;
  }

  if (!journal) {
    Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
    dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl;

    if (handle)
      handle->suspend_tp_timeout();

    op_queue_reserve_throttle(o);

    if (handle)
      handle->reset_tp_timeout();

    uint64_t op_num = submit_manager.op_submit_start();
    o->op = op_num;

    if (m_filestore_do_dump)
      dump_transactions(o->tls, o->op, osr);

    queue_op(osr, o);
    trace.keyval("opnum", op_num);
    trace.keyval("journal mode", "none");
    trace.event("op queued");

    if (ondisk)
      apply_manager.add_waiter(op_num, ondisk);
    submit_manager.op_submit_finish(op_num);
    utime_t end = ceph_clock_now();
    logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
    return 0;
  }

  // trailing mode: apply now, journal afterwards
  ceph_assert(journal);
  //prepare and encode transactions data out of lock
  bufferlist tbl;
  int orig_len = -1;
  if (journal->is_writeable()) {
    orig_len = journal->prepare_entry(tls, &tbl);
  }
  uint64_t op = submit_manager.op_submit_start();
  dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl;

  if (m_filestore_do_dump)
    dump_transactions(tls, op, osr);

  trace.event("op_apply_start");
  trace.keyval("opnum", op);
  trace.keyval("journal mode", "trailing");
  apply_manager.op_apply_start(op);
  trace.event("do_transactions");
  int r = do_transactions(tls, op);

  if (r >= 0) {
    trace.event("journal started");
    _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
  } else {
    delete ondisk;
    ondisk = nullptr;
  }

  // start on_readable finisher after we queue journal item, as on_readable callback
  // is allowed to delete the Transaction
  if (onreadable_sync) {
    onreadable_sync->complete(r);
  }
  apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);

  submit_manager.op_submit_finish(op);
  trace.event("op_apply_finish");
  apply_manager.op_apply_finish(op);

  utime_t end = ceph_clock_now();
  logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
  return r;
}

// Writeahead-mode journal completion: the entry is durable, so queue
// the Op for apply and fire the ondisk callbacks (asynchronously, via
// the ondisk finishers).
void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
{
  dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;

  o->trace.event("writeahead journal finished");

  // this should queue in order because the journal does it's completions in order.
  queue_op(osr, o);

  list<Context*> to_queue;
  osr->dequeue_journal(&to_queue);

  // do ondisk completions async, to prevent any onreadable_sync completions
  // getting blocked behind an ondisk completion.
  if (ondisk) {
    dout(10) << " queueing ondisk " << ondisk << dendl;
    ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
  }
  if (!to_queue.empty()) {
    ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue);
  }
}

// Apply each transaction in the batch in order, renewing the
// threadpool heartbeat between transactions.  Always returns 0;
// per-op errors are handled inside _do_transaction().
int FileStore::_do_transactions(
  vector<Transaction> &tls,
  uint64_t op_seq,
  ThreadPool::TPHandle *handle,
  const char *osr_name)
{
  int trans_num = 0;

  for (vector<Transaction>::iterator p = tls.begin();
       p != tls.end();
       ++p, trans_num++) {
    _do_transaction(*p, op_seq, trans_num, handle, osr_name);
    if (handle)
      handle->reset_tp_timeout();
  }

  return 0;
}

// Record a collection-wide replay guard: after syncing the omap and
// the whole filesystem, stamp the collection directory with the
// sequencer position in GLOBAL_REPLAY_GUARD_XATTR and fsync it.
// No-op on checkpointing backends (rollback makes guards unnecessary).
// Any failure aborts the process.
void FileStore::_set_global_replay_guard(const coll_t& cid,
                                         const SequencerPosition &spos)
{
  if (backend->can_checkpoint())
    return;

  // sync all previous operations on this sequencer
  int ret = object_map->sync();
  if (ret < 0) {
    derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl;
    ceph_abort_msg("_set_global_replay_guard failed");
  }
  ret = sync_filesystem(basedir_fd);
  if (ret < 0) {
    derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl;
    ceph_abort_msg("_set_global_replay_guard failed");
  }

  char fn[PATH_MAX];
  get_cdir(cid, fn, sizeof(fn));
  int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
  if (fd < 0) {
    int err = errno;
    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
    ceph_abort_msg("_set_global_replay_guard failed");
  }

  _inject_failure();

  // then record that we did it
  bufferlist v;
  encode(spos, v);
  int r = chain_fsetxattr<true, true>(
    fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
  if (r < 0) {
    derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
         << " got " << cpp_strerror(r) << dendl;
    ceph_abort_msg("fsetxattr failed");
  }

  // and make sure our xattr is durable.
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
    ceph_abort();
  }

  _inject_failure();

  VOID_TEMP_FAILURE_RETRY(::close(fd));
  dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
}

// Compare `spos` against the collection's global replay guard.
// Returns 1 when replay may proceed (collection or xattr missing, or
// spos >= recorded position), -1 when the recorded position is newer.
int FileStore::_check_global_replay_guard(const coll_t& cid,
                                          const SequencerPosition& spos)
{
  char fn[PATH_MAX];
  get_cdir(cid, fn, sizeof(fn));
  int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
  if (fd < 0) {
    dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
    return 1; // if collection does not exist, there is no guard, and we can replay.
  }

  char buf[100];
  int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf));
  if (r < 0) {
    dout(20) << __FUNC__ << ": no xattr" << dendl;
    if (r == -EIO && m_filestore_fail_eio) handle_eio();
    VOID_TEMP_FAILURE_RETRY(::close(fd));
    return 1; // no xattr
  }
  bufferlist bl;
  bl.append(buf, r);

  SequencerPosition opos;
  auto p = bl.cbegin();
  decode(opos, p);

  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return spos >= opos ? 1 : -1;
}


// Convenience wrapper: open the collection directory and set its
// replay guard (hoid=0, i.e. collection-level).  Aborts on failure.
// NOTE(review): the default argument `in_progress=false` appears here
// on the out-of-class definition — presumably the declaration omits
// it, otherwise this would not compile; confirm against the header.
void FileStore::_set_replay_guard(const coll_t& cid,
                                  const SequencerPosition &spos,
                                  bool in_progress=false)
{
  char fn[PATH_MAX];
  get_cdir(cid, fn, sizeof(fn));
  int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
  if (fd < 0) {
    int err = errno;
    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
    ceph_abort_msg("_set_replay_guard failed");
  }
  _set_replay_guard(fd, spos, 0, in_progress);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
}


// Durably stamp REPLAY_GUARD_XATTR on `fd` with (spos, in_progress):
// fsync to commit prior work, optionally sync the object map, write
// the xattr, then fsync again so the guard itself is durable.
// `_inject_failure()` calls are testing hooks.  No-op on checkpointing
// backends; aborts on any error.
void FileStore::_set_replay_guard(int fd,
                                  const SequencerPosition& spos,
                                  const ghobject_t *hoid,
                                  bool in_progress)
{
  if (backend->can_checkpoint())
    return;

  dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl;

  _inject_failure();

  // first make sure the previous operation commits
  int r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
    ceph_abort();
  }

  if (!in_progress) {
    // sync object_map too. even if this object has a header or keys,
    // it have had them in the past and then removed them, so always
    // sync.
    object_map->sync(hoid, &spos);
  }

  _inject_failure();

  // then record that we did it
  bufferlist v(40);
  encode(spos, v);
  encode(in_progress, v);
  r = chain_fsetxattr<true, true>(
    fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
  if (r < 0) {
    derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
    ceph_abort_msg("fsetxattr failed");
  }

  // and make sure our xattr is durable.
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
    ceph_abort();
  }

  _inject_failure();

  dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
}

// Convenience wrapper: open the collection directory and close its
// replay guard.  Aborts on failure to open.
void FileStore::_close_replay_guard(const coll_t& cid,
                                    const SequencerPosition &spos)
{
  char fn[PATH_MAX];
  get_cdir(cid, fn, sizeof(fn));
  int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
  if (fd < 0) {
    int err = errno;
    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
    ceph_abort_msg("_close_replay_guard failed");
  }
  _close_replay_guard(fd, spos);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
}

// Mark the in-progress operation recorded by _set_replay_guard() as
// complete: rewrite REPLAY_GUARD_XATTR with in_progress=false and
// fsync it durable.  No-op on checkpointing backends.
void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos,
                                    const ghobject_t *hoid)
{
  if (backend->can_checkpoint())
    return;

  dout(10) << __FUNC__ << ": " << spos << dendl;

  _inject_failure();

  // sync object_map too. even if this object has a header or keys,
  // it have had them in the past and then removed them, so always
  // sync.
  object_map->sync(hoid, &spos);

  // then record that we are done with this operation
  bufferlist v(40);
  encode(spos, v);
  bool in_progress = false;
  encode(in_progress, v);
  int r = chain_fsetxattr<true, true>(
    fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
  if (r < 0) {
    derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
    ceph_abort_msg("fsetxattr failed");
  }

  // and make sure our xattr is durable.
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
    ceph_abort();
  }

  _inject_failure();

  dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
}

// Per-object replay check during journal replay.  Contract (shared by
// all three overloads): 1 = replay the op, 0 = conditional replay
// (guard set with in_progress=true at exactly this position),
// -1 = skip (already applied at or after this position).
int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid,
                                   const SequencerPosition& spos)
{
  if (!replaying || backend->can_checkpoint())
    return 1;

  // collection-wide guard takes precedence over the per-object guard
  int r = _check_global_replay_guard(cid, spos);
  if (r < 0)
    return r;

  FDRef fd;
  r = lfn_open(cid, oid, false, &fd);
  if (r < 0) {
    dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl;
    return 1; // if file does not exist, there is no guard, and we can replay.
  }
  int ret = _check_replay_guard(**fd, spos);
  lfn_close(fd);
  return ret;
}

// Collection-level variant of the replay check; same 1/0/-1 contract.
int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos)
{
  if (!replaying || backend->can_checkpoint())
    return 1;

  char fn[PATH_MAX];
  get_cdir(cid, fn, sizeof(fn));
  int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
  if (fd < 0) {
    dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
    return 1; // if collection does not exist, there is no guard, and we can replay.
  }
  int ret = _check_replay_guard(fd, spos);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return ret;
}

// Core replay check: decode (opos, in_progress) from REPLAY_GUARD_XATTR
// on `fd` and compare against `spos`; same 1/0/-1 contract as above.
int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
{
  if (!replaying || backend->can_checkpoint())
    return 1;

  char buf[100];
  int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf));
  if (r < 0) {
    dout(20) << __FUNC__ << ": no xattr" << dendl;
    if (r == -EIO && m_filestore_fail_eio) handle_eio();
    return 1; // no xattr
  }
  bufferlist bl;
  bl.append(buf, r);

  SequencerPosition opos;
  auto p = bl.cbegin();
  decode(opos, p);
  bool in_progress = false;
  if (!p.end()) // older journals don't have this
    decode(in_progress, p);
  if (opos > spos) {
    dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos
             << ", now or in future, SKIPPING REPLAY" << dendl;
    return -1;
  } else if (opos == spos) {
    if (in_progress) {
      dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
               << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
      return 0;
    } else {
      dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
               << ", in_progress=false, SKIPPING REPLAY" << dendl;
      return -1;
    }
  } else {
    dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos
             << ", in past, will replay" << dendl;
    return 1;
  }
}

// Decode and execute every op in a single Transaction.  Each op is
// guarded by _check_replay_guard() during replay; ops targeting temp
// objects are redirected to the collection's _temp companion.
void FileStore::_do_transaction(
  Transaction& t, uint64_t op_seq, int trans_num,
  ThreadPool::TPHandle *handle,
  const char *osr_name)
{
  dout(10) << __FUNC__ << ": on " << &t << dendl;

  Transaction::iterator i = t.begin();

  SequencerPosition spos(op_seq, trans_num, 0);
  while (i.have_op()) {
    if (handle)
      handle->reset_tp_timeout();

    Transaction::Op *op = i.decode_op();
    int r = 0;

    _inject_failure();

    switch (op->op) {
    case Transaction::OP_NOP:
      break;
    case Transaction::OP_TOUCH:
      {
        const coll_t &_cid = i.get_cid(op->cid);
        const ghobject_t &oid = i.get_oid(op->oid);
        const coll_t &cid
= !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, touch_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _touch(cid, oid); + tracepoint(objectstore, touch_exit, r); + } + break; + + case Transaction::OP_WRITE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, write_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _write(cid, oid, off, len, bl, fadvise_flags); + tracepoint(objectstore, write_exit, r); + } + break; + + case Transaction::OP_ZERO: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, zero_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _zero(cid, oid, off, len); + tracepoint(objectstore, zero_exit, r); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + tracepoint(objectstore, truncate_enter, osr_name, off); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _truncate(cid, oid, off); + tracepoint(objectstore, truncate_exit, r); + } + break; + + case Transaction::OP_REMOVE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + tracepoint(objectstore, remove_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _remove(cid, oid, spos); + tracepoint(objectstore, remove_exit, r); + } + break; + + case Transaction::OP_SETATTR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, setattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) { + map<string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set, spos); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid + << " name " << name << " size " << bl.length() << dendl; + } + tracepoint(objectstore, setattr_exit, r); + } + break; + + case Transaction::OP_SETATTRS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + map<string, bufferptr> aset; + i.decode_attrset(aset); + tracepoint(objectstore, setattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _setattrs(cid, oid, aset, spos); + tracepoint(objectstore, setattrs_exit, r); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; + } + break; + + case Transaction::OP_RMATTR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + string name = i.decode_string(); + tracepoint(objectstore, rmattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattr(cid, oid, name.c_str(), spos); + tracepoint(objectstore, rmattr_exit, r); + } + break; + + case Transaction::OP_RMATTRS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, rmattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattrs(cid, oid, spos); + tracepoint(objectstore, rmattrs_exit, r); + } + break; + + case Transaction::OP_CLONE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const ghobject_t &noid = i.get_oid(op->dest_oid); + tracepoint(objectstore, clone_enter, osr_name); + r = _clone(cid, oid, noid, spos); + tracepoint(objectstore, clone_exit, r); + } + break; + + case Transaction::OP_CLONERANGE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, clone_range_enter, osr_name, len); + r = _clone_range(cid, oid, ncid, noid, off, len, off, spos); + tracepoint(objectstore, clone_range_exit, r); + } + break; + + case Transaction::OP_CLONERANGE2: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ? + _cid : _cid.get_temp(); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + tracepoint(objectstore, clone_range2_enter, osr_name, len); + r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos); + tracepoint(objectstore, clone_range2_exit, r); + } + break; + + case Transaction::OP_MKCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + tracepoint(objectstore, mkcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _create_collection(cid, op->split_bits, spos); + tracepoint(objectstore, mkcoll_exit, r); + } + break; + + case Transaction::OP_COLL_SET_BITS: + { + const coll_t &cid = i.get_cid(op->cid); + int bits = op->split_bits; + r = _collection_set_bits(cid, bits); + } + break; + + case Transaction::OP_COLL_HINT: + { + const coll_t &cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + if (_check_replay_guard(cid, spos) > 0) { + r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos); + } + } else { + // Ignore the hint + dout(10) << "Unrecognized collection hint type: " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + tracepoint(objectstore, rmcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _destroy_collection(cid); + tracepoint(objectstore, rmcoll_exit, r); + } + break; + + case Transaction::OP_COLL_ADD: + { + const coll_t &ocid = i.get_cid(op->cid); + const coll_t &ncid = i.get_cid(op->dest_cid); + const ghobject_t &oid = i.get_oid(op->oid); + + ceph_assert(oid.hobj.pool >= -1); + + // always followed by OP_COLL_REMOVE + Transaction::Op *op2 = i.decode_op(); + const coll_t 
&ocid2 = i.get_cid(op2->cid);
+	const ghobject_t &oid2 = i.get_oid(op2->oid);
+	ceph_assert(op2->op == Transaction::OP_COLL_REMOVE);
+	ceph_assert(ocid2 == ocid);
+	ceph_assert(oid2 == oid);
+
+	tracepoint(objectstore, coll_add_enter);
+	r = _collection_add(ncid, ocid, oid, spos);
+	tracepoint(objectstore, coll_add_exit, r);
+	// the paired OP_COLL_REMOVE was consumed above, so advance spos past
+	// the add before executing the remove at its own position
+	spos.op++;
+	if (r < 0)
+	  break;
+	tracepoint(objectstore, coll_remove_enter, osr_name);
+	if (_check_replay_guard(ocid, oid, spos) > 0)
+	  r = _remove(ocid, oid, spos);
+	tracepoint(objectstore, coll_remove_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      {
+	// WARNING: this is deprecated and buggy; only here to replay old journals.
+	const coll_t &ocid = i.get_cid(op->cid);
+	const coll_t &ncid = i.get_cid(op->dest_cid);
+	const ghobject_t &oid = i.get_oid(op->oid);
+	tracepoint(objectstore, coll_move_enter);
+	r = _collection_add(ocid, ncid, oid, spos);
+	if (r == 0 &&
+	    (_check_replay_guard(ocid, oid, spos) > 0))
+	  r = _remove(ocid, oid, spos);
+	tracepoint(objectstore, coll_move_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+	const coll_t &_oldcid = i.get_cid(op->cid);
+	const ghobject_t &oldoid = i.get_oid(op->oid);
+	const coll_t &_newcid = i.get_cid(op->dest_cid);
+	const ghobject_t &newoid = i.get_oid(op->dest_oid);
+	const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ?
+	  _oldcid : _oldcid.get_temp();
+	// NOTE(review): the non-temp branch yields _oldcid rather than
+	// _newcid; presumably callers always pass _oldcid == _newcid here —
+	// confirm, otherwise a non-temp rename targets the wrong collection.
+	const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ?
+	  _oldcid : _newcid.get_temp();
+	tracepoint(objectstore, coll_move_rename_enter);
+	r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+	tracepoint(objectstore, coll_move_rename_exit, r);
+      }
+      break;
+
+    case Transaction::OP_TRY_RENAME:
+      {
+	const coll_t &_cid = i.get_cid(op->cid);
+	const ghobject_t &oldoid = i.get_oid(op->oid);
+	const ghobject_t &newoid = i.get_oid(op->dest_oid);
+	const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ? 
+ _cid : _cid.get_temp(); + const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, coll_try_rename_enter); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true); + tracepoint(objectstore, coll_try_rename_exit, r); + } + break; + + case Transaction::OP_COLL_SETATTR: + case Transaction::OP_COLL_RMATTR: + ceph_abort_msg("collection attr methods no longer implemented"); + break; + + case Transaction::OP_COLL_RENAME: + { + r = -EOPNOTSUPP; + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, omap_clear_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_clear(cid, oid, spos); + tracepoint(objectstore, omap_clear_exit, r); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + map<string, bufferlist> aset; + i.decode_attrset(aset); + tracepoint(objectstore, omap_setkeys_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_setkeys(cid, oid, aset, spos); + tracepoint(objectstore, omap_setkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + set<string> keys; + i.decode_keyset(keys); + tracepoint(objectstore, omap_rmkeys_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_rmkeys(cid, oid, keys, spos); + tracepoint(objectstore, omap_rmkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + tracepoint(objectstore, omap_rmkeyrange_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_rmkeyrange(cid, oid, first, last, spos); + tracepoint(objectstore, omap_rmkeyrange_exit, r); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, omap_setheader_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_setheader(cid, oid, bl, spos); + tracepoint(objectstore, omap_setheader_exit, r); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + { + ceph_abort_msg("not legacy journal; upgrade to firefly first"); + } + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + tracepoint(objectstore, split_coll2_enter, osr_name); + r = _split_collection(cid, bits, rem, dest, spos); + tracepoint(objectstore, split_coll2_exit, r); + } + break; + + case Transaction::OP_MERGE_COLLECTION: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + coll_t dest = i.get_cid(op->dest_cid); + tracepoint(objectstore, merge_coll_enter, osr_name); + r = _merge_collection(cid, bits, dest, spos); + tracepoint(objectstore, merge_coll_exit, r); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + tracepoint(objectstore, setallochint_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _set_alloc_hint(cid, oid, expected_object_size, + expected_write_size); + tracepoint(objectstore, setallochint_exit, r); + } + break; + + default: + derr << "bad op " << op->op << dendl; + ceph_abort(); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD || + op->op == Transaction::OP_SETATTR || + op->op == Transaction::OP_SETATTRS || + op->op == Transaction::OP_RMATTR || + op->op == Transaction::OP_OMAP_SETKEYS || + op->op == Transaction::OP_OMAP_RMKEYS || + op->op == Transaction::OP_OMAP_RMKEYRANGE || + op->op == Transaction::OP_OMAP_SETHEADER)) + // -ENOENT is normally okay + // ...including on a replayed OP_RMCOLL with checkpoint mode + ok = true; + if (r == -ENODATA) + ok = true; + + if (op->op == Transaction::OP_SETALLOCHINT) + // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most + // cases means invalid hint size (e.g. too big, not a multiple + // of block size, etc) or, at least on xfs, an attempt to set + // or change it when the file is not empty. However, + // OP_SETALLOCHINT is advisory, so ignore all errors. 
+ ok = true; + + if (replaying && !backend->can_checkpoint()) { + if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -ERANGE) { + dout(10) << "tolerating ERANGE on replay" << dendl; + ok = true; + } + if (r == -ENOENT) { + dout(10) << "tolerating ENOENT on replay" << dendl; + ok = true; + } + } + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) { + msg = "ENOENT on clone suggests osd bug"; + } else if (r == -ENOSPC) { + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. 
+ msg = "ENOSPC from disk filesystem, misconfigured cluster"; + } else if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } else if (r == -EPERM) { + msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption"; + } + + derr << " error " << cpp_strerror(r) << " not handled on operation " << op + << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + + if (r == -EMFILE) { + dump_open_fds(cct); + } + + ceph_abort_msg("unexpected error"); + } + } + + spos.op++; + } + + _inject_failure(); +} + + /*********************************************/ + + + +// -------------------- +// objects + +bool FileStore::exists(CollectionHandle& ch, const ghobject_t& oid) +{ + tracepoint(objectstore, exists_enter, ch->cid.c_str()); + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + struct stat st; + bool retval = stat(ch, oid, &st) == 0; + tracepoint(objectstore, exists_exit, retval); + return retval; +} + +int FileStore::stat( + CollectionHandle& ch, const ghobject_t& oid, struct stat *st, bool allow_eio) +{ + tracepoint(objectstore, stat_enter, ch->cid.c_str()); + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? 
ch->cid : ch->cid.get_temp(); + int r = lfn_stat(cid, oid, st); + ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + if (r < 0) { + dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid + << " = " << r << dendl; + } else { + dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid + << " = " << r + << " (size " << st->st_size << ")" << dendl; + } + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, stat_exit, r); + return r; + } +} + +int FileStore::set_collection_opts( + CollectionHandle& ch, + const pool_opts_t& opts) +{ + return -EOPNOTSUPP; +} + +int FileStore::read( + CollectionHandle& ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags) +{ + int got; + tracepoint(objectstore, read_enter, ch->cid.c_str(), offset, len); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp(); + + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: " + << cpp_strerror(r) << dendl; + return r; + } + + if (offset == 0 && len == 0) { + struct stat st; + memset(&st, 0, sizeof(struct stat)); + int r = ::fstat(**fd, &st); + ceph_assert(r == 0); + len = st.st_size; + } + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM) + posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM); + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) + posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL); +#endif + + bufferptr bptr(len); // prealloc space for entire read + got = safe_pread(**fd, bptr.c_str(), len, offset); + if (got < 0) { + dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl; + 
lfn_close(fd);
+    return got;
+  }
+  bptr.set_length(got);   // properly size the buffer
+  bl.clear();
+  bl.push_back(std::move(bptr));	  // put it in the target bufferlist
+
+#ifdef HAVE_POSIX_FADVISE
+  // drop / reset page-cache hints now that the data has been copied out
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+    posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
+  if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL))
+    posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
+#endif
+
+  if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+    ostringstream ss;
+    int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+    if (errors != 0) {
+      dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+	      << got << " ... BAD CRC:\n" << ss.str() << dendl;
+      ceph_abort_msg("bad crc on read");
+    }
+  }
+
+  lfn_close(fd);
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+	   << got << "/" << len << dendl;
+  if (cct->_conf->filestore_debug_inject_read_err &&
+      debug_data_eio(oid)) {
+    return -EIO;
+  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
+	     cct->_conf->filestore_debug_random_read_err &&
+	     (rand() % (int)(cct->_conf->filestore_debug_random_read_err *
+			     100.0)) == 0) {
+    dout(0) << __func__ << ": inject random EIO" << dendl;
+    return -EIO;
+  } else {
+    tracepoint(objectstore, read_exit, got);
+    return got;
+  }
+}
+
+// Populate *m with {logical offset -> length} for the allocated extents of
+// [offset, offset+len) using the backend's FIEMAP ioctl.  Adjacent extents
+// are coalesced, the first extent is clipped to start at 'offset' and the
+// final one is clipped to end at offset+len.  One ioctl may not cover the
+// whole range, so we loop via 'more:' until the kernel flags the last
+// extent (FIEMAP_EXTENT_LAST) or the remaining length reaches zero.
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+			  map<uint64_t, uint64_t> *m)
+{
+  uint64_t i;
+  struct fiemap_extent *extent = nullptr;
+  struct fiemap *fiemap = nullptr;
+  int r = 0;
+
+more:
+  r = backend->do_fiemap(fd, offset, len, &fiemap);
+  if (r < 0)
+    return r;
+
+  if (fiemap->fm_mapped_extents == 0) {
+    free(fiemap);   // backend allocates; caller of do_fiemap must free
+    return r;
+  }
+
+  extent = &fiemap->fm_extents[0];
+
+  /* start where we were asked to start */
+  if (extent->fe_logical < offset) {
+    extent->fe_length -= offset - extent->fe_logical;
+    extent->fe_logical = offset;
+  }
+
+  i = 0;
+
+  struct fiemap_extent *last = nullptr;
+  while (i < fiemap->fm_mapped_extents) {
+    struct fiemap_extent *next = extent + 1;
+
+    dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents
+	     << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+
+    /* try to merge extents: fold this extent's length/start into 'next'
+       while they are physically contiguous in logical space */
+    while ((i < fiemap->fm_mapped_extents - 1) &&
+	   (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+      next->fe_length += extent->fe_length;
+      next->fe_logical = extent->fe_logical;
+      extent = next;
+      next = extent + 1;
+      i++;
+    }
+
+    if (extent->fe_logical + extent->fe_length > offset + len)
+      extent->fe_length = offset + len - extent->fe_logical;
+    (*m)[extent->fe_logical] = extent->fe_length;
+    i++;
+    last = extent++;
+  }
+  // advance past the last extent we recorded and shrink the remaining window
+  uint64_t xoffset = last->fe_logical + last->fe_length - offset;
+  offset = last->fe_logical + last->fe_length;
+  len -= xoffset;
+  const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0);
+  free(fiemap);
+  if (!is_last) {
+    goto more;
+  }
+
+  return r;
+}
+
+// Same contract as _do_fiemap, but implemented with lseek(SEEK_DATA) /
+// lseek(SEEK_HOLE) instead of the FIEMAP ioctl.
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+				  map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+  off_t hole_pos, data_pos;
+  int r = 0;
+
+  // If lseek fails with errno setting to be ENXIO, this means the current
+  // file offset is beyond the end of the file.
+  off_t start = offset;
+  while(start < (off_t)(offset + len)) {
+    // find the next data region at or after 'start'
+    data_pos = lseek(fd, start, SEEK_DATA);
+    if (data_pos < 0) {
+      if (errno == ENXIO)
+	break;    // past EOF: no more data in range
+      else {
+	r = -errno;
+	dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+	return r;
+      }
+    } else if (data_pos > (off_t)(offset + len)) {
+      break;      // next data starts beyond the requested window
+    }
+
+    // find where that data region ends (the next hole)
+    hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+    if (hole_pos < 0) {
+      if (errno == ENXIO) {
+	break;
+      } else {
+	r = -errno;
+	dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+	return r;
+      }
+    }
+
+    if (hole_pos >= (off_t)(offset + len)) {
+      // data extends past the window: clip it and finish
+      (*m)[data_pos] = offset + len - data_pos;
+      break;
+    }
+    (*m)[data_pos] = hole_pos - data_pos;
+    start = hole_pos;
+  }
+
+  return r;
+#else
+  // no SEEK_DATA/SEEK_HOLE support: report the whole range as data
+  (*m)[offset] = len;
+  return 0;
+#endif
+}
+
+// bufferlist flavor: run the map-based fiemap() and encode the resulting
+// {offset -> length} extent map into 'bl'.
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+		    uint64_t offset, size_t len,
+		    bufferlist& bl)
+{
+  map<uint64_t, uint64_t> exomap;
+  int r = fiemap(ch, oid, offset, len, exomap);
+  if (r >= 0) {
+    encode(exomap, bl);
+  }
+  return r;
+}
+
+// Fill 'destmap' with the allocated extents of oid within [offset,
+// offset+len), preferring SEEK_DATA/SEEK_HOLE over FIEMAP.  Small requests
+// (<= filestore_fiemap_threshold) or missing kernel support short-circuit
+// to a single {offset -> len} entry.
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+		    uint64_t offset, size_t len,
+		    map<uint64_t, uint64_t>& destmap)
+{
+  tracepoint(objectstore, fiemap_enter, ch->cid.c_str(), offset, len);
+  const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? 
ch->cid : ch->cid.get_temp(); + destmap.clear(); + + if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) || + len <= (size_t)m_filestore_fiemap_threshold) { + destmap[offset] = len; + return 0; + } + + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl; + goto done; + } + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_seek_hole_data(**fd, offset, len, &destmap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_fiemap(**fd, offset, len, &destmap); + } + + lfn_close(fd); + +done: + + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + tracepoint(objectstore, fiemap_exit, r); + return r; +} + +int FileStore::_remove(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + int r = lfn_unlink(cid, oid, spos); + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl; + int r = lfn_truncate(cid, oid, size); + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << r << dendl; + return r; +} + + +int FileStore::_touch(const coll_t& cid, const ghobject_t& oid) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + 
+ FDRef fd; + int r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + return r; + } else { + lfn_close(fd); + } + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int r; + + FDRef fd; + r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + dout(0) << __FUNC__ << ": couldn't open " << cid << "/" + << oid << ": " + << cpp_strerror(r) << dendl; + goto out; + } + + // write + r = bl.write_fd(**fd, offset); + if (r < 0) { + derr << __FUNC__ << ": write_fd on " << cid << "/" << oid + << " error: " << cpp_strerror(r) << dendl; + lfn_close(fd); + goto out; + } + r = bl.length(); + + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_write(**fd, offset, len, bl); + ceph_assert(rc >= 0); + } + + if (replaying || m_disable_wbthrottle) { + if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) { +#ifdef HAVE_POSIX_FADVISE + posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + } + } else { + wbthrottle.queue_wb(fd, oid, offset, len, + fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + } + + lfn_close(fd); + + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl; + return r; +} + +int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int ret = 0; + + if (cct->_conf->filestore_punch_hole) { +#ifdef CEPH_HAVE_FALLOCATE +# if !defined(__APPLE__) && !defined(__FreeBSD__) +# ifdef FALLOC_FL_KEEP_SIZE + // first try to punch a hole. 
+  FDRef fd;
+  ret = lfn_open(cid, oid, false, &fd);
+  if (ret < 0) {
+    goto out;
+  }
+
+  struct stat st;
+  ret = ::fstat(**fd, &st);
+  if (ret < 0) {
+    ret = -errno;
+    lfn_close(fd);
+    goto out;
+  }
+
+  // first try fallocate
+  ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+		  offset, len);
+  if (ret < 0) {
+    ret = -errno;
+  } else {
+    // ensure we extend file size, if needed
+    if (len > 0 && offset + len > (uint64_t)st.st_size) {
+      ret = ::ftruncate(**fd, offset + len);
+      if (ret < 0) {
+	ret = -errno;
+	lfn_close(fd);
+	goto out;
+      }
+    }
+  }
+  lfn_close(fd);
+
+  // NOTE(review): **fd is used here after lfn_close(fd) above; presumably
+  // the FDCache keeps the descriptor open, but confirm — otherwise this
+  // CRC update runs on a closed/recycled fd.
+  if (ret >= 0 && m_filestore_sloppy_crc) {
+    int rc = backend->_crc_update_zero(**fd, offset, len);
+    ceph_assert(rc >= 0);
+  }
+
+  if (ret == 0)
+    goto out;  // yay!
+  if (ret != -EOPNOTSUPP)
+    goto out;  // some other error
+# endif
+# endif
+#endif
+  }
+
+  // lame, kernel is old and doesn't support it.
+  // write zeros.. yuck!
+  dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl;
+  {
+    bufferlist bl;
+    bl.append_zero(len);
+    ret = _write(cid, oid, offset, len, bl);
+  }
+
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ out:
+# endif
+# endif
+#endif
+  dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+  return ret;
+}
+
+// Clone oldoid to newoid within the same collection: data (via the
+// backend's clone_range), omap (via object_map->clone) and xattrs
+// (including the spill-out marker).  Clone is non-idempotent, so a replay
+// guard is checked up front and stamped on the new object afterwards.
+int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+		      const SequencerPosition& spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
+
+  if (_check_replay_guard(cid, newoid, spos) < 0)
+    return 0;
+
+  int r;
+  FDRef o, n;
+  {
+    Index index;
+    r = lfn_open(cid, oldoid, false, &o, &index);
+    if (r < 0) {
+      goto out2;
+    }
+    ceph_assert(index.index);
+    // hold the index write lock across create + data/omap clone
+    RWLock::WLocker l((index.index)->access_lock);
+
+    r = lfn_open(cid, newoid, true, &n, &index);
+    if (r < 0) {
+      goto out;
+    }
+    r = ::ftruncate(**n, 0);
+    if (r < 0) {
+      r = -errno;
+      goto out3;
+    }
+    struct stat st;
+    r = ::fstat(**o, &st);
+    if (r < 0) {
+      r = -errno;
+      goto out3;
+    }
+
+    r = _do_clone_range(**o, **n, 0, st.st_size, 0);
+    if (r < 0) {
+      goto out3;
+    }
+
+    dout(20) << "objectmap clone" << dendl;
+    r = object_map->clone(oldoid, newoid, &spos);
+    if (r < 0 && r != -ENOENT)   // source may legitimately have no omap
+      goto out3;
+  }
+
+  {
+    char buf[2];
+    map<string, bufferptr> aset;
+    r = _fgetattrs(**o, aset);
+    if (r < 0)
+      goto out3;
+
+    // propagate the spill-out marker so the clone looks for overflow
+    // xattrs in the same place the source kept them
+    r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+    if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+      r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+				      sizeof(XATTR_NO_SPILL_OUT));
+    } else {
+      r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+				      sizeof(XATTR_SPILL_OUT));
+    }
+    if (r < 0)
+      goto out3;
+
+    r = _fsetattrs(**n, aset);
+    if (r < 0)
+      goto out3;
+  }
+
+  // clone is non-idempotent; record our work.
+  _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+  lfn_close(n);
+ out:
+  lfn_close(o);
+ out2:
+  dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  return r;
+}
+
+// Thin wrapper: delegate the byte-range copy to the backend (btrfs may
+// reflink; generic backend copies).
+int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+  return backend->clone_range(from, to, srcoff, len, dstoff);
+}
+
+int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+  int r = 0;
+  map<uint64_t, uint64_t> exomap;
+  // fiemap doesn't allow zero length
+  if (len == 0)
+    return 0;
+
+  if (backend->has_seek_data_hole()) {
+    dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+    r = _do_seek_hole_data(from, srcoff, len, &exomap);
+  } else if (backend->has_fiemap()) 
{ + dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl; + r = _do_fiemap(from, srcoff, len, &exomap); + } + + + int64_t written = 0; + if (r < 0) + goto out; + + for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) { + uint64_t it_off = miter->first - srcoff + dstoff; + r = _do_copy_range(from, to, miter->first, miter->second, it_off, true); + if (r < 0) { + derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second + << " to " << it_off << ", " << cpp_strerror(r) << dendl; + break; + } + written += miter->second; + } + + if (r >= 0) { + if (m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + ceph_assert(rc >= 0); + } + struct stat st; + r = ::fstat(to, &st); + if (r < 0) { + r = -errno; + derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl; + goto out; + } + if (st.st_size < (int)(dstoff + len)) { + r = ::ftruncate(to, dstoff + len); + if (r < 0) { + r = -errno; + derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl; + goto out; + } + } + r = written; + } + + out: + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc) +{ + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + loff_t pos = srcoff; + loff_t end = srcoff + len; + int buflen = 4096 * 16; //limit by pipe max size.see fcntl + +#ifdef CEPH_HAVE_SPLICE + if (backend->has_splice()) { + int pipefd[2]; + if (pipe_cloexec(pipefd) < 0) { + int e = errno; + derr << " pipe " << " got " << cpp_strerror(e) << dendl; + return -e; + } + + loff_t dstpos = dstoff; + while (pos < end) { + int l = std::min<int>(end-pos, buflen); + r = safe_splice(from, &pos, pipefd[1], nullptr, l, 
SPLICE_F_NONBLOCK); + dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + if (r == 0) { + // hrm, bad source range, wtf. + r = -ERANGE; + derr << __FUNC__ << ": got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + + r = safe_splice(pipefd[0], nullptr, to, &dstpos, r, 0); + dout(10) << " safe_splice write to " << to << " len " << r + << " got " << r << dendl; + if (r < 0) { + derr << __FUNC__ << ": write error at " << pos << "~" + << r << ", " << cpp_strerror(r) << dendl; + break; + } + } + close(pipefd[0]); + close(pipefd[1]); + } else +#endif + { + int64_t actual; + + actual = ::lseek64(from, srcoff, SEEK_SET); + if (actual != (int64_t)srcoff) { + if (actual < 0) + r = -errno; + else + r = -EINVAL; + derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl; + return r; + } + actual = ::lseek64(to, dstoff, SEEK_SET); + if (actual != (int64_t)dstoff) { + if (actual < 0) + r = -errno; + else + r = -EINVAL; + derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl; + return r; + } + + char buf[buflen]; + while (pos < end) { + int l = std::min<int>(end-pos, buflen); + r = ::read(from, buf, l); + dout(25) << " read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + if (errno == EINTR) { + continue; + } else { + r = -errno; + derr << __FUNC__ << ": read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + } + if (r == 0) { + // hrm, bad source range, wtf. 
+ r = -ERANGE; + derr << __FUNC__ << ": got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + int op = 0; + while (op < r) { + int r2 = safe_write(to, buf+op, r-op); + dout(25) << " write to " << to << " len " << (r-op) + << " got " << r2 << dendl; + if (r2 < 0) { + r = r2; + derr << __FUNC__ << ": write error at " << pos << "~" + << r-op << ", " << cpp_strerror(r) << dendl; + + break; + } + op += (r-op); + } + if (r < 0) + break; + pos += r; + } + } + + if (r < 0 && replaying) { + ceph_assert(r == -ERANGE); + derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl; + r = len; + } + ceph_assert(replaying || pos == end); + if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + ceph_assert(rc >= 0); + } + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl; + + if (_check_replay_guard(newcid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + r = lfn_open(oldcid, oldoid, false, &o); + if (r < 0) { + goto out2; + } + r = lfn_open(newcid, newoid, true, &n); + if (r < 0) { + goto out; + } + r = _do_clone_range(**o, **n, srcoff, len, dstoff); + if (r < 0) { + goto out3; + } + + // clone is non-idempotent; record our work. 
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " + << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +class SyncEntryTimeout : public Context { +public: + CephContext* cct; + explicit SyncEntryTimeout(CephContext* cct, int commit_timeo) + : cct(cct), m_commit_timeo(commit_timeo) + { + } + + void finish(int r) override { + BackTrace *bt = new BackTrace(1); + generic_dout(-1) << "FileStore: sync_entry timed out after " + << m_commit_timeo << " seconds.\n"; + bt->print(*_dout); + *_dout << dendl; + delete bt; + bt = nullptr; + ceph_abort(); + } +private: + int m_commit_timeo; +}; + +void FileStore::sync_entry() +{ + lock.Lock(); + while (!stop) { + utime_t max_interval; + max_interval.set_from_double(m_filestore_max_sync_interval); + utime_t min_interval; + min_interval.set_from_double(m_filestore_min_sync_interval); + + utime_t startwait = ceph_clock_now(); + if (!force_sync) { + dout(20) << __FUNC__ << ": waiting for max_interval " << max_interval << dendl; + sync_cond.WaitInterval(lock, max_interval); + } else { + dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl; + } + + if (force_sync) { + dout(20) << __FUNC__ << ": force_sync set" << dendl; + force_sync = false; + } else if (stop) { + dout(20) << __FUNC__ << ": stop set" << dendl; + break; + } else { + // wait for at least the min interval + utime_t woke = ceph_clock_now(); + woke -= startwait; + dout(20) << __FUNC__ << ": woke after " << woke << dendl; + if (woke < min_interval) { + utime_t t = min_interval; + t -= woke; + dout(20) << __FUNC__ << ": waiting for another " << t + << " to reach min interval " << min_interval << dendl; + sync_cond.WaitInterval(lock, t); + } + } + + list<Context*> fin; + again: + fin.swap(sync_waiters); + lock.Unlock(); + + op_tp.pause(); + if (apply_manager.commit_start()) { + utime_t 
start = ceph_clock_now(); + uint64_t cp = apply_manager.get_committing_seq(); + + sync_entry_timeo_lock.Lock(); + SyncEntryTimeout *sync_entry_timeo = + new SyncEntryTimeout(cct, m_filestore_commit_timeout); + if (!timer.add_event_after(m_filestore_commit_timeout, + sync_entry_timeo)) { + sync_entry_timeo = nullptr; + } + sync_entry_timeo_lock.Unlock(); + + logger->set(l_filestore_committing, 1); + + dout(15) << __FUNC__ << ": committing " << cp << dendl; + stringstream errstream; + if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) { + derr << errstream.str() << dendl; + ceph_abort(); + } + + if (backend->can_checkpoint()) { + int err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + ceph_abort_msg("error during write_op_seq"); + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + uint64_t cid = 0; + err = backend->create_checkpoint(s, &cid); + if (err < 0) { + int err = errno; + derr << "snap create '" << s << "' got error " << err << dendl; + ceph_assert(err == 0); + } + + snaps.push_back(cp); + apply_manager.commit_started(); + op_tp.unpause(); + + if (cid > 0) { + dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl; + err = backend->sync_checkpoint(cid); + if (err < 0) { + derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; + ceph_abort_msg("wait_sync got error"); + } + dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl; + } + } else { + apply_manager.commit_started(); + op_tp.unpause(); + + int err = object_map->sync(); + if (err < 0) { + derr << "object_map sync got " << cpp_strerror(err) << dendl; + ceph_abort_msg("object_map sync returned error"); + } + + err = backend->syncfs(); + if (err < 0) { + derr << "syncfs got " << cpp_strerror(err) << dendl; + ceph_abort_msg("syncfs returned error"); + } + + err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error 
during write_op_seq: " << cpp_strerror(err) << dendl; + ceph_abort_msg("error during write_op_seq"); + } + err = ::fsync(op_fd); + if (err < 0) { + derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl; + ceph_abort_msg("error during fsync of op_seq"); + } + } + + utime_t done = ceph_clock_now(); + utime_t lat = done - start; + utime_t dur = done - startwait; + dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl; + utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat); + if (max_pause_lat < dur - lat) { + logger->tinc(l_filestore_sync_pause_max_lat, dur - lat); + } + + logger->inc(l_filestore_commitcycle); + logger->tinc(l_filestore_commitcycle_latency, lat); + logger->tinc(l_filestore_commitcycle_interval, dur); + + apply_manager.commit_finish(); + if (!m_disable_wbthrottle) { + wbthrottle.clear(); + } + + logger->set(l_filestore_committing, 0); + + // remove old snaps? + if (backend->can_checkpoint()) { + char s[NAME_MAX]; + while (snaps.size() > 2) { + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front()); + snaps.pop_front(); + dout(10) << "removing snap '" << s << "'" << dendl; + int r = backend->destroy_checkpoint(s); + if (r) { + int err = errno; + derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl; + } + } + } + + dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl; + + if (sync_entry_timeo) { + Mutex::Locker lock(sync_entry_timeo_lock); + timer.cancel_event(sync_entry_timeo); + } + } else { + op_tp.unpause(); + } + + lock.Lock(); + finish_contexts(cct, fin, 0); + fin.clear(); + if (!sync_waiters.empty()) { + dout(10) << __FUNC__ << ": more waiters, committing again" << dendl; + goto again; + } + if (!stop && journal && journal->should_commit_now()) { + dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl; + goto again; + } + } + stop = false; + lock.Unlock(); +} + +void 
FileStore::do_force_sync() +{ + dout(10) << __FUNC__ << dendl; + Mutex::Locker l(lock); + force_sync = true; + sync_cond.Signal(); +} + +void FileStore::start_sync(Context *onsafe) +{ + Mutex::Locker l(lock); + sync_waiters.push_back(onsafe); + sync_cond.Signal(); + force_sync = true; + dout(10) << __FUNC__ << dendl; +} + +void FileStore::sync() +{ + Mutex l("FileStore::sync"); + Cond c; + bool done; + C_SafeCond *fin = new C_SafeCond(&l, &c, &done); + + start_sync(fin); + + l.Lock(); + while (!done) { + dout(10) << "sync waiting" << dendl; + c.Wait(l); + } + l.Unlock(); + dout(10) << "sync done" << dendl; +} + +void FileStore::_flush_op_queue() +{ + dout(10) << __FUNC__ << ": draining op tp" << dendl; + op_wq.drain(); + dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl; + for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } +} + +/* + * flush - make every queued write readable + */ +void FileStore::flush() +{ + dout(10) << __FUNC__ << dendl; + + if (cct->_conf->filestore_blackhole) { + // wait forever + Mutex lock("FileStore::flush::lock"); + Cond cond; + lock.Lock(); + while (true) + cond.Wait(lock); + ceph_abort(); + } + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl; + for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } + } + + _flush_op_queue(); + dout(10) << __FUNC__ << ": complete" << dendl; +} + +/* + * sync_and_flush - make every queued write readable AND committed to disk + */ +void FileStore::sync_and_flush() +{ + dout(10) << __FUNC__ << dendl; + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + _flush_op_queue(); + } else { + // includes m_filestore_journal_parallel + _flush_op_queue(); + sync(); + } + dout(10) << __FUNC__ << ": done" << dendl; +} + +int 
FileStore::flush_journal() +{ + dout(10) << __FUNC__ << dendl; + sync_and_flush(); + sync(); + return 0; +} + +int FileStore::snapshot(const string& name) +{ + dout(10) << __FUNC__ << ": " << name << dendl; + sync_and_flush(); + + if (!backend->can_checkpoint()) { + dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl; + return -EOPNOTSUPP; + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str()); + + int r = backend->create_checkpoint(s, nullptr); + if (r) { + derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(r) << dendl; + } + + return r; +} + +// ------------------------------- +// attributes + +int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp) +{ + char val[CHAIN_XATTR_MAX_BLOCK_LEN]; + int l = chain_fgetxattr(fd, name, val, sizeof(val)); + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), val, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, name, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, name, bp.c_str(), l); + } + } + ceph_assert(!m_filestore_fail_eio || l != -EIO); + return l; +} + +int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset) +{ + // get attr list + char names1[100]; + int len = chain_flistxattr(fd, names1, sizeof(names1)-1); + char *names2 = 0; + char *name = 0; + if (len == -ERANGE) { + len = chain_flistxattr(fd, 0, 0); + if (len < 0) { + ceph_assert(!m_filestore_fail_eio || len != -EIO); + return len; + } + dout(10) << " -ERANGE, len is " << len << dendl; + names2 = new char[len+1]; + len = chain_flistxattr(fd, names2, len); + dout(10) << " -ERANGE, got " << len << dendl; + if (len < 0) { + ceph_assert(!m_filestore_fail_eio || len != -EIO); + delete[] names2; + return len; + } + name = names2; + } else if (len < 0) { + ceph_assert(!m_filestore_fail_eio || len != -EIO); + return len; + } else { + name = names1; + } + name[len] = 0; + + char *end = name + len; + while (name < end) { + char *attrname = name; + 
if (parse_attrname(&name)) { + if (*name) { + dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl; + int r = _fgetattr(fd, attrname, aset[name]); + if (r < 0) { + delete[] names2; + return r; + } + } + } + name += strlen(name) + 1; + } + + delete[] names2; + return 0; +} + +int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset) +{ + for (map<string, bufferptr>::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + const char *val; + if (p->second.length()) + val = p->second.c_str(); + else + val = ""; + // ??? Why do we skip setting all the other attrs if one fails? + int r = chain_fsetxattr(fd, n, val, p->second.length()); + if (r < 0) { + derr << __FUNC__ << ": chain_setxattr returned " << r << dendl; + return r; + } + } + return 0; +} + +// debug EIO injection +void FileStore::inject_data_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": init error on " << oid << dendl; + data_error_set.insert(oid); +} +void FileStore::inject_mdata_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": init error on " << oid << dendl; + mdata_error_set.insert(oid); +} + +void FileStore::debug_obj_on_delete(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": clear error on " << oid << dendl; + data_error_set.erase(oid); + mdata_error_set.erase(oid); +} +bool FileStore::debug_data_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (data_error_set.count(oid)) { + dout(10) << __FUNC__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} +bool FileStore::debug_mdata_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (mdata_error_set.count(oid)) { + dout(10) << __FUNC__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } 
+} + + +// objects + +int FileStore::getattr(CollectionHandle& ch, const ghobject_t& oid, const char *name, bufferptr &bp) +{ + tracepoint(objectstore, getattr_enter, ch->cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = _fgetattr(**fd, n, bp); + lfn_close(fd); + if (r == -ENODATA) { + map<string, bufferlist> got; + set<string> to_get; + to_get.insert(string(name)); + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out; + } + r = object_map->get_xattrs(oid, to_get, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl; + goto out; + } + if (got.empty()) { + dout(10) << __FUNC__ << ": got.size() is 0" << dendl; + return -ENODATA; + } + bp = bufferptr(got.begin()->second.c_str(), + got.begin()->second.length()); + r = bp.length(); + } + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattr_exit, r); + return r < 0 ? r : 0; + } +} + +int FileStore::getattrs(CollectionHandle& ch, const ghobject_t& oid, map<string,bufferptr>& aset) +{ + tracepoint(objectstore, getattrs_enter, ch->cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? 
ch->cid : ch->cid.get_temp(); + set<string> omap_attrs; + map<string, bufferlist> omap_aset; + Index index; + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + bool spill_out = true; + char buf[2]; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = false; + + r = _fgetattrs(**fd, aset); + lfn_close(fd); + fd = FDRef(); // defensive + if (r < 0) { + goto out; + } + + if (!spill_out) { + dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl; + goto out; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + goto out; + } + + r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + goto out; + } + if (r == -ENOENT) + r = 0; + } + ceph_assert(omap_attrs.size() == omap_aset.size()); + for (map<string, bufferlist>::iterator i = omap_aset.begin(); + i != omap_aset.end(); + ++i) { + string key(i->first); + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); + } + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattrs_exit, r); + return r; + } +} + +int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset, + const SequencerPosition 
&spos) +{ + map<string, bufferlist> omap_set; + set<string> omap_remove; + map<string, bufferptr> inline_set; + map<string, bufferptr> inline_to_set; + FDRef fd; + int spill_out = -1; + bool incomplete_inline = false; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = 0; + else + spill_out = 1; + + r = _fgetattrs(**fd, inline_set); + incomplete_inline = (r == -E2BIG); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + dout(15) << __FUNC__ << ": " << cid << "/" << oid + << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "") + << dendl; + + for (map<string,bufferptr>::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + + if (incomplete_inline) { + chain_fremovexattr(**fd, n); // ignore any error + omap_set[p->first].push_back(p->second); + continue; + } + + if (p->second.length() > m_filestore_max_inline_xattr_size) { + if (inline_set.count(p->first)) { + inline_set.erase(p->first); + r = chain_fremovexattr(**fd, n); + if (r < 0) + goto out_close; + } + omap_set[p->first].push_back(p->second); + continue; + } + + if (!inline_set.count(p->first) && + inline_set.size() >= m_filestore_max_inline_xattrs) { + omap_set[p->first].push_back(p->second); + continue; + } + omap_remove.insert(p->first); + inline_set.insert(*p); + + inline_to_set.insert(*p); + } + + if (spill_out != 1 && !omap_set.empty()) { + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT)); + } + + r = _fsetattrs(**fd, inline_to_set); + if (r < 0) + goto out_close; + + if (spill_out && !omap_remove.empty()) { + r = object_map->remove_xattrs(oid, omap_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove_xattrs r = " 
<< r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } else { + r = 0; // don't confuse the debug output + } + } + + if (!omap_set.empty()) { + r = object_map->set_xattrs(oid, omap_set, &spos); + if (r < 0) { + dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + +int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl; + FDRef fd; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r == -ENODATA && spill_out) { + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out_close; + } + set<string> to_remove; + to_remove.insert(string(name)); + r = object_map->remove_xattrs(oid, to_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + return r; +} + +int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + map<string,bufferptr> 
aset; + FDRef fd; + set<string> omap_attrs; + Index index; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + r = _fgetattrs(**fd, aset); + if (r >= 0) { + for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r < 0) { + dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl; + goto out_close; + } + } + } + + if (!spill_out) { + dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl; + goto out_close; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out_close; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } + r = object_map->remove_xattrs(oid, omap_attrs, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl; + goto out_close; + } + if (r == -ENOENT) + r = 0; + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT)); + } + + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + + + +int FileStore::_collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos) +{ + struct stat st; + int r = collection_stat(cid, &st); + if (r < 0) { + if (r == -ENOENT) + return 0; + return r; + } + + vector<ghobject_t> objects; + ghobject_t max; + while (!max.is_max()) { + r = 
collection_list(cid, max, ghobject_t::get_max(), + 300, &objects, &max); + if (r < 0) + return r; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + ceph_assert(_check_replay_guard(cid, *i, spos)); + r = _remove(cid, *i, spos); + if (r < 0) + return r; + } + objects.clear(); + } + return _destroy_collection(cid); +} + +// -------------------------- +// collections + +int FileStore::list_collections(vector<coll_t>& ls) +{ + return list_collections(ls, false); +} + +int FileStore::list_collections(vector<coll_t>& ls, bool include_temp) +{ + tracepoint(objectstore, list_collections_enter); + dout(10) << __FUNC__ << dendl; + + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/current", basedir.c_str()); + + int r = 0; + DIR *dir = ::opendir(fn); + if (!dir) { + r = -errno; + derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + + struct dirent *de = nullptr; + while ((de = ::readdir(dir))) { + if (de->d_type == DT_UNKNOWN) { + // d_type not supported (non-ext[234], btrfs), must stat + struct stat sb; + char filename[PATH_MAX]; + if (int n = snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name); + n >= static_cast<int>(sizeof(filename))) { + derr << __func__ << " path length overrun: " << n << dendl; + ceph_abort(); + } + + r = ::stat(filename, &sb); + if (r < 0) { + r = -errno; + derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + break; + } + if (!S_ISDIR(sb.st_mode)) { + continue; + } + } else if (de->d_type != DT_DIR) { + continue; + } + if (strcmp(de->d_name, "omap") == 0) { + continue; + } + if (de->d_name[0] == '.' && + (de->d_name[1] == '\0' || + (de->d_name[1] == '.' 
&& + de->d_name[2] == '\0'))) + continue; + coll_t cid; + if (!cid.parse(de->d_name)) { + derr << "ignoring invalid collection '" << de->d_name << "'" << dendl; + continue; + } + if (!cid.is_temp() || include_temp) + ls.push_back(cid); + } + + if (r > 0) { + derr << "trying readdir " << fn << ": " << cpp_strerror(r) << dendl; + r = -r; + } + + ::closedir(dir); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + tracepoint(objectstore, list_collections_exit, r); + return r; +} + +int FileStore::collection_stat(const coll_t& c, struct stat *st) +{ + tracepoint(objectstore, collection_stat_enter, c.c_str()); + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r = ::stat(fn, st); + if (r < 0) + r = -errno; + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + tracepoint(objectstore, collection_stat_exit, r); + return r; +} + +bool FileStore::collection_exists(const coll_t& c) +{ + tracepoint(objectstore, collection_exists_enter, c.c_str()); + struct stat st; + bool ret = collection_stat(c, &st) == 0; + tracepoint(objectstore, collection_exists_exit, ret); + return ret; +} + +int FileStore::collection_empty(const coll_t& cid, bool *empty) +{ + tracepoint(objectstore, collection_empty_enter, cid.c_str()); + dout(15) << __FUNC__ << ": " << cid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) { + derr << __FUNC__ << ": get_index returned: " << cpp_strerror(r) + << dendl; + return r; + } + + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + + vector<ghobject_t> ls; + r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), + 1, &ls, nullptr); + if (r < 0) { + derr << __FUNC__ << ": collection_list_partial returned: " + << cpp_strerror(r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + *empty = ls.empty(); + tracepoint(objectstore, collection_empty_exit, 
*empty); + return 0; +} + +int FileStore::_collection_set_bits(const coll_t& c, int bits) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(10) << __FUNC__ << ": " << fn << " " << bits << dendl; + char n[PATH_MAX]; + int r; + int32_t v = bits; + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto out; + } + get_attrname("bits", n, PATH_MAX); + r = chain_fsetxattr(fd, n, (char*)&v, sizeof(v)); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << __FUNC__ << ": " << fn << " " << bits << " = " << r << dendl; + return r; +} + +int FileStore::collection_bits(CollectionHandle& ch) +{ + char fn[PATH_MAX]; + get_cdir(ch->cid, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r; + char n[PATH_MAX]; + int32_t bits; + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + bits = r = -errno; + goto out; + } + get_attrname("bits", n, PATH_MAX); + r = chain_fgetxattr(fd, n, (char*)&bits, sizeof(bits)); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + if (r < 0) { + bits = r; + goto out; + } + out: + dout(10) << __FUNC__ << ": " << fn << " = " << bits << dendl; + return bits; +} + +int FileStore::collection_list(const coll_t& c, + const ghobject_t& orig_start, + const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) +{ + ghobject_t start = orig_start; + if (start.is_max()) + return 0; + + ghobject_t temp_next; + if (!next) + next = &temp_next; + // figure out the pool id. we need this in order to generate a + // meaningful 'next' value. + int64_t pool = -1; + shard_id_t shard; + { + spg_t pgid; + if (c.is_temp(&pgid)) { + pool = -2 - pgid.pool(); + shard = pgid.shard; + } else if (c.is_pg(&pgid)) { + pool = pgid.pool(); + shard = pgid.shard; + } else if (c.is_meta()) { + pool = -1; + shard = shard_id_t::NO_SHARD; + } else { + // hrm, the caller is test code! we should get kill it off. for now, + // tolerate it. 
// List up to `max` objects in collection `c` in [orig_start, end).
// For a regular pg collection this first drains the parallel *_TEMP
// collection (temp objects sort before pool>=0 objects), then falls
// through to the main collection.  `*next` is set to the resume cursor.
int FileStore::collection_list(const coll_t& c,
			       const ghobject_t& orig_start,
			       const ghobject_t& end,
			       int max,
			       vector<ghobject_t> *ls, ghobject_t *next)
{
  ghobject_t start = orig_start;
  if (start.is_max())
    return 0;

  ghobject_t temp_next;
  if (!next)
    next = &temp_next;
  // figure out the pool id.  we need this in order to generate a
  // meaningful 'next' value.
  int64_t pool = -1;
  shard_id_t shard;
  {
    spg_t pgid;
    if (c.is_temp(&pgid)) {
      // temp collections use the mirrored negative pool namespace
      pool = -2 - pgid.pool();
      shard = pgid.shard;
    } else if (c.is_pg(&pgid)) {
      pool = pgid.pool();
      shard = pgid.shard;
    } else if (c.is_meta()) {
      pool = -1;
      shard = shard_id_t::NO_SHARD;
    } else {
      // hrm, the caller is test code!  we should get kill it off.  for now,
      // tolerate it.
      pool = 0;
      shard = shard_id_t::NO_SHARD;
    }
    dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard
	     << " pgid " << pgid << dendl;
  }
  // sep marks the boundary between temp objects (pool < -1) and the rest
  ghobject_t sep;
  sep.hobj.pool = -1;
  sep.set_shard(shard);
  if (!c.is_temp() && !c.is_meta()) {
    if (start < sep) {
      dout(10) << __FUNC__ << ": first checking temp pool" << dendl;
      coll_t temp = c.get_temp();
      int r = collection_list(temp, start, end, max, ls, next);
      if (r < 0)
	return r;
      if (*next != ghobject_t::get_max())
	return r;   // temp listing not exhausted; resume there next call
      start = sep;
      dout(10) << __FUNC__ << ": fall through to non-temp collection, start "
	       << start << dendl;
    } else {
      dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl;
    }
  }

  Index index;
  int r = get_index(c, &index);
  if (r < 0)
    return r;

  ceph_assert(index.index);
  RWLock::RLocker l((index.index)->access_lock);

  r = index->collection_list_partial(start, end, max, ls, next);

  if (r < 0) {
    if (r == -EIO && m_filestore_fail_eio) handle_eio();
    return r;
  }
  dout(20) << "objects: " << *ls << dendl;

  // HashIndex doesn't know the pool when constructing a 'next' value
  if (!next->is_max()) {
    next->hobj.pool = pool;
    next->set_shard(shard);
    dout(20) << " next " << *next << dendl;
  }

  return 0;
}
ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get(hoid, header, out); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + tracepoint(objectstore, omap_get_exit, 0); + return 0; +} + +int FileStore::omap_get_header( + CollectionHandle& ch, + const ghobject_t &hoid, + bufferlist *bl, + bool allow_eio) +{ + tracepoint(objectstore, omap_get_header_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_header(hoid, bl); + if (r < 0 && r != -ENOENT) { + ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_header_exit, 0); + return 0; +} + +int FileStore::omap_get_keys(CollectionHandle& ch, const ghobject_t &hoid, set<string> *keys) +{ + tracepoint(objectstore, omap_get_keys_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? 
ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_keys(hoid, keys); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + tracepoint(objectstore, omap_get_keys_exit, 0); + return 0; +} + +int FileStore::omap_get_values(CollectionHandle& ch, const ghobject_t &hoid, + const set<string> &keys, + map<string, bufferlist> *out) +{ + tracepoint(objectstore, omap_get_values_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + const char *where = "()"; + int r = get_index(c, &index); + if (r < 0) { + where = " (get_index)"; + goto out; + } + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + where = " (lfn_find)"; + goto out; + } + } + r = object_map->get_values(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + where = " (get_values)"; + goto out; + } + r = 0; + out: + tracepoint(objectstore, omap_get_values_exit, r); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r + << where << dendl; + return r; +} + +int FileStore::omap_check_keys(CollectionHandle& ch, const ghobject_t &hoid, + const set<string> &keys, + set<string> *out) +{ + tracepoint(objectstore, omap_check_keys_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? 
ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->check_keys(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + tracepoint(objectstore, omap_check_keys_exit, 0); + return 0; +} + +ObjectMap::ObjectMapIterator FileStore::get_omap_iterator( + CollectionHandle& ch, + const ghobject_t &oid) +{ + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + return get_omap_iterator(ch->cid, oid); +} + +ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c, + const ghobject_t &hoid) +{ + tracepoint(objectstore, get_omap_iterator, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? 
_c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 " + << "(get_index failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 " + << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + } + return object_map->get_iterator(hoid); +} + +int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": collection: " << c << " pg number: " + << pg_num << " expected number of objects: " << expected_num_objs << dendl; + + bool empty; + int ret = collection_empty(c, &empty); + if (ret < 0) + return ret; + if (!empty && !replaying) { + dout(0) << "Failed to give an expected number of objects hint to collection : " + << c << ", only empty collection can take such type of hint. 
" << dendl; + return 0; + } + + Index index; + ret = get_index(c, &index); + if (ret < 0) + return ret; + // Pre-hash the collection + ret = index->pre_hash_collection(pg_num, expected_num_objs); + dout(10) << "pre_hash_collection " << c << " = " << ret << dendl; + if (ret < 0) + return ret; + _set_replay_guard(c, spos); + + return 0; +} + +int FileStore::_create_collection( + const coll_t& c, + int bits, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r == -EEXIST && replaying) + r = 0; + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + + if (r < 0) + return r; + r = init_index(c); + if (r < 0) + return r; + r = _collection_set_bits(c, bits); + if (r < 0) + return r; + // create parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + r = _create_collection(temp, 0, spos); + if (r < 0) + return r; + } + + _set_replay_guard(c, spos); + return 0; +} + +int FileStore::_destroy_collection(const coll_t& c) +{ + int r = 0; + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + { + Index from; + r = get_index(c, &from); + if (r < 0) + goto out; + ceph_assert(from.index); + RWLock::WLocker l((from.index)->access_lock); + + r = from->prep_delete(); + if (r < 0) + goto out; + } + r = ::rmdir(fn); + if (r < 0) { + r = -errno; + goto out; + } + + out: + // destroy parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + int r2 = _destroy_collection(temp); + if (r2 < 0) { + r = r2; + goto out_final; + } + } + + out_final: + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + return r; +} + + +int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << 
oldcid << "/" << o << dendl; + + int dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + return 0; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + int srccmp = _check_replay_guard(oldcid, o, spos); + if (srccmp < 0) + return 0; + + // open guard on object so we don't any previous operations on the + // new name that will modify the source inode. + FDRef fd; + int r = lfn_open(oldcid, o, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + ceph_assert(replaying); + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << o << " (dne, continue replay) " << dendl; + return 0; + } + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, o, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + _inject_failure(); + + // close guard on object so we don't do this again + if (r == 0) { + _close_replay_guard(**fd, spos); + } + lfn_close(fd); + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl; + return r; +} + +int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos, + bool allow_enoent) +{ + dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl; + int r = 0; + int dstcmp, srccmp; + + if (replaying) { + /* If the destination collection doesn't exist during replay, + * we need to delete the src object and continue on + */ + if (!collection_exists(c)) + goto out_rm_src; + } + + dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + goto out_rm_src; + + // check the src name too; it might have a newer guard, and we don't + // want to 
// Move+rename oldcid/oldoid to c/o: link the new name, move the omap
// content, then unlink the old name.  Replay guards on both names keep
// the multi-step sequence idempotent across a crash at any point.
int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
				       coll_t c, const ghobject_t& o,
				       const SequencerPosition& spos,
				       bool allow_enoent)
{
  dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
  int r = 0;
  int dstcmp, srccmp;

  if (replaying) {
    /* If the destination collection doesn't exist during replay,
     * we need to delete the src object and continue on
     */
    if (!collection_exists(c))
      goto out_rm_src;
  }

  dstcmp = _check_replay_guard(c, o, spos);
  if (dstcmp < 0)
    goto out_rm_src;   // destination already past this op; just drop the src

  // check the src name too; it might have a newer guard, and we don't
  // want to clobber it
  srccmp = _check_replay_guard(oldcid, oldoid, spos);
  if (srccmp < 0)
    return 0;

  {
    // open guard on object so we don't any previous operations on the
    // new name that will modify the source inode.
    FDRef fd;
    r = lfn_open(oldcid, oldoid, 0, &fd);
    if (r < 0) {
      // the source collection/object does not exist. If we are replaying, we
      // should be safe, so just return 0 and move on.
      if (replaying) {
	dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
		 << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
      } else if (allow_enoent) {
	dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
		 << oldcid << "/" << oldoid << " (dne, ignoring enoent)"
		 << dendl;
      } else {
	ceph_abort_msg("ERROR: source must exist");
      }

      if (!replaying) {
	return 0;
      }
      if (allow_enoent && dstcmp > 0) { // if dstcmp == 0, try_rename was started.
	return 0;
      }

      r = 0; // don't know if object_map was cloned
    } else {
      if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
	_set_replay_guard(**fd, spos, &o, true);
      }

      r = lfn_link(oldcid, c, oldoid, o);
      if (replaying && !backend->can_checkpoint() &&
	  r == -EEXIST)    // crashed between link() and set_replay_guard()
	r = 0;

      lfn_close(fd);
      fd = FDRef();

      _inject_failure();
    }

    if (r == 0) {
      // the name changed; link the omap content
      r = object_map->rename(oldoid, o, &spos);
      if (r == -ENOENT)
	r = 0;   // a crash may have renamed the omap already
    }

    _inject_failure();

    if (r == 0)
      r = lfn_unlink(oldcid, oldoid, spos, true);

    if (r == 0)
      r = lfn_open(c, o, 0, &fd);

    // close guard on object so we don't do this again
    if (r == 0) {
      _close_replay_guard(**fd, spos, &o);
      lfn_close(fd);
    }
  }

  dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
	   << " = " << r << dendl;
  return r;

 out_rm_src:
  // remove source
  if (_check_replay_guard(oldcid, oldoid, spos) > 0) {
    r = lfn_unlink(oldcid, oldoid, spos, true);
  }

  dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
	   << " = " << r << dendl;
  return r;
}
dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; +} + +void FileStore::_inject_failure() +{ + if (m_filestore_kill_at) { + int final = --m_filestore_kill_at; + dout(5) << __FUNC__ << ": " << (final+1) << " -> " << final << dendl; + if (final == 0) { + derr << __FUNC__ << ": KILLING" << dendl; + cct->_log->flush(); + _exit(1); + } + } +} + +int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->clear_keys_header(hoid, &spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid, + const map<string, bufferlist> &aset, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) { + dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(r) << dendl; + return r; + } + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(r) << dendl; + return r; + } + } +skip: + if (g_conf()->subsys.should_gather<ceph_subsys_filestore, 20>()) { + for (auto& p : aset) { + dout(20) << __FUNC__ << ": set " << p.first << dendl; + } + } + r = object_map->set_keys(hoid, aset, &spos); + dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << r << dendl; + return r; +} + +int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid, + const set<string> &keys, + 
const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } +skip: + r = object_map->rm_keys(hoid, keys, &spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid, + const string& first, const string& last, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl; + set<string> keys; + { + ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); + if (!iter) + return -ENOENT; + for (iter->lower_bound(first); iter->valid() && iter->key() < last; + iter->next()) { + keys.insert(iter->key()); + } + } + return _omap_rmkeys(cid, hoid, keys, spos); +} + +int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid, + const bufferlist &bl, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + return object_map->set_header(hoid, bl, &spos); +} + +int FileStore::_merge_collection(const coll_t& cid, + uint32_t bits, + coll_t dest, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << " " << dest + << " bits " << bits << dendl; + int r = 0; + + if (!collection_exists(cid)) { + dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + if (!collection_exists(dest)) { + dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl; + 
ceph_assert(replaying); + return 0; + } + + // set bits + if (_check_replay_guard(cid, spos) > 0) + _collection_set_bits(dest, bits); + + spg_t pgid; + bool is_pg = dest.is_pg(&pgid); + ceph_assert(is_pg); + + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_global_replay_guard(cid, spos); + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + // main collection + { + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) { + ceph_assert(from.index); + RWLock::WLocker l1((from.index)->access_lock); + + ceph_assert(to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->merge(bits, to.index); + } + } + + // temp too + { + Index from; + r = get_index(cid.get_temp(), &from); + + Index to; + if (!r) + r = get_index(dest.get_temp(), &to); + + if (!r) { + ceph_assert(from.index); + RWLock::WLocker l1((from.index)->access_lock); + + ceph_assert(to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->merge(bits, to.index); + } + } + + // remove source + _destroy_collection(cid); + + _close_replay_guard(dest, spos); + _close_replay_guard(dest.get_temp(), spos); + // no need to close guards on cid... it's removed. 
+ + if (!r && cct->_conf->filestore_debug_verify_split) { + vector<ghobject_t> objects; + ghobject_t next; + while (1) { + collection_list( + dest, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (!i->match(bits, pgid.pgid.ps())) { + dout(20) << __FUNC__ << ": " << *i << " does not belong in " + << cid << dendl; + ceph_assert(i->match(bits, pgid.pgid.ps())); + } + } + objects.clear(); + } + } + + dout(15) << __FUNC__ << ": " << cid << " " << dest << " bits " << bits + << " = " << r << dendl; + return r; +} + +int FileStore::_split_collection(const coll_t& cid, + uint32_t bits, + uint32_t rem, + coll_t dest, + const SequencerPosition &spos) +{ + int r; + { + dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl; + if (!collection_exists(cid)) { + dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + if (!collection_exists(dest)) { + dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_global_replay_guard(cid, spos); + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) { + ceph_assert(from.index); + RWLock::WLocker l1((from.index)->access_lock); + + ceph_assert(to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->split(rem, bits, to.index); + } + + _close_replay_guard(cid, spos); + _close_replay_guard(dest, spos); + } + _collection_set_bits(cid, bits); + if (!r && cct->_conf->filestore_debug_verify_split) { + vector<ghobject_t> objects; + ghobject_t next; + while (1) { + collection_list( + cid, + next, 
ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __FUNC__ << ": " << *i << " still in source " + << cid << dendl; + ceph_assert(!i->match(bits, rem)); + } + objects.clear(); + } + next = ghobject_t(); + while (1) { + collection_list( + dest, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __FUNC__ << ": " << *i << " now in dest " + << *i << dendl; + ceph_assert(i->match(bits, rem)); + } + objects.clear(); + } + } + return r; +} + +int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl; + + FDRef fd; + int ret = 0; + + if (expected_object_size == 0 || expected_write_size == 0) + goto out; + + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) + goto out; + + { + // TODO: a more elaborate hint calculation + uint64_t hint = std::min<uint64_t>(expected_write_size, m_filestore_max_alloc_hint_size); + + ret = backend->set_alloc_hint(**fd, hint); + dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl; + } + + lfn_close(fd); +out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl; + ceph_assert(!m_filestore_fail_eio || ret != -EIO); + return ret; +} + +const char** FileStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "filestore_max_inline_xattr_size", + "filestore_max_inline_xattr_size_xfs", + "filestore_max_inline_xattr_size_btrfs", + "filestore_max_inline_xattr_size_other", + 
"filestore_max_inline_xattrs", + "filestore_max_inline_xattrs_xfs", + "filestore_max_inline_xattrs_btrfs", + "filestore_max_inline_xattrs_other", + "filestore_max_xattr_value_size", + "filestore_max_xattr_value_size_xfs", + "filestore_max_xattr_value_size_btrfs", + "filestore_max_xattr_value_size_other", + "filestore_min_sync_interval", + "filestore_max_sync_interval", + "filestore_queue_max_ops", + "filestore_queue_max_bytes", + "filestore_expected_throughput_bytes", + "filestore_expected_throughput_ops", + "filestore_queue_low_threshhold", + "filestore_queue_high_threshhold", + "filestore_queue_high_delay_multiple", + "filestore_queue_max_delay_multiple", + "filestore_commit_timeout", + "filestore_dump_file", + "filestore_kill_at", + "filestore_fail_eio", + "filestore_fadvise", + "filestore_sloppy_crc", + "filestore_sloppy_crc_block_size", + "filestore_max_alloc_hint_size", + NULL + }; + return KEYS; +} + +void FileStore::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("filestore_max_inline_xattr_size") || + changed.count("filestore_max_inline_xattr_size_xfs") || + changed.count("filestore_max_inline_xattr_size_btrfs") || + changed.count("filestore_max_inline_xattr_size_other") || + changed.count("filestore_max_inline_xattrs") || + changed.count("filestore_max_inline_xattrs_xfs") || + changed.count("filestore_max_inline_xattrs_btrfs") || + changed.count("filestore_max_inline_xattrs_other") || + changed.count("filestore_max_xattr_value_size") || + changed.count("filestore_max_xattr_value_size_xfs") || + changed.count("filestore_max_xattr_value_size_btrfs") || + changed.count("filestore_max_xattr_value_size_other")) { + if (backend) { + Mutex::Locker l(lock); + set_xattr_limits_via_conf(); + } + } + + if (changed.count("filestore_queue_max_bytes") || + changed.count("filestore_queue_max_ops") || + changed.count("filestore_expected_throughput_bytes") || + changed.count("filestore_expected_throughput_ops") 
|| + changed.count("filestore_queue_low_threshhold") || + changed.count("filestore_queue_high_threshhold") || + changed.count("filestore_queue_high_delay_multiple") || + changed.count("filestore_queue_max_delay_multiple")) { + Mutex::Locker l(lock); + set_throttle_params(); + } + + if (changed.count("filestore_min_sync_interval") || + changed.count("filestore_max_sync_interval") || + changed.count("filestore_kill_at") || + changed.count("filestore_fail_eio") || + changed.count("filestore_sloppy_crc") || + changed.count("filestore_sloppy_crc_block_size") || + changed.count("filestore_max_alloc_hint_size") || + changed.count("filestore_fadvise")) { + Mutex::Locker l(lock); + m_filestore_min_sync_interval = conf->filestore_min_sync_interval; + m_filestore_max_sync_interval = conf->filestore_max_sync_interval; + m_filestore_kill_at = conf->filestore_kill_at; + m_filestore_fail_eio = conf->filestore_fail_eio; + m_filestore_fadvise = conf->filestore_fadvise; + m_filestore_sloppy_crc = conf->filestore_sloppy_crc; + m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size; + m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size; + } + if (changed.count("filestore_commit_timeout")) { + Mutex::Locker l(sync_entry_timeo_lock); + m_filestore_commit_timeout = conf->filestore_commit_timeout; + } + if (changed.count("filestore_dump_file")) { + if (conf->filestore_dump_file.length() && + conf->filestore_dump_file != "-") { + dump_start(conf->filestore_dump_file); + } else { + dump_stop(); + } + } +} + +int FileStore::set_throttle_params() +{ + stringstream ss; + bool valid = throttle_bytes.set_params( + cct->_conf->filestore_queue_low_threshhold, + cct->_conf->filestore_queue_high_threshhold, + cct->_conf->filestore_expected_throughput_bytes, + cct->_conf->filestore_queue_high_delay_multiple? + cct->_conf->filestore_queue_high_delay_multiple: + cct->_conf->filestore_queue_high_delay_multiple_bytes, + cct->_conf->filestore_queue_max_delay_multiple? 
+ cct->_conf->filestore_queue_max_delay_multiple: + cct->_conf->filestore_queue_max_delay_multiple_bytes, + cct->_conf->filestore_queue_max_bytes, + &ss); + + valid &= throttle_ops.set_params( + cct->_conf->filestore_queue_low_threshhold, + cct->_conf->filestore_queue_high_threshhold, + cct->_conf->filestore_expected_throughput_ops, + cct->_conf->filestore_queue_high_delay_multiple? + cct->_conf->filestore_queue_high_delay_multiple: + cct->_conf->filestore_queue_high_delay_multiple_ops, + cct->_conf->filestore_queue_max_delay_multiple? + cct->_conf->filestore_queue_max_delay_multiple: + cct->_conf->filestore_queue_max_delay_multiple_ops, + cct->_conf->filestore_queue_max_ops, + &ss); + + logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max()); + logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max()); + + if (!valid) { + derr << "tried to set invalid params: " + << ss.str() + << dendl; + } + return valid ? 0 : -EINVAL; +} + +void FileStore::dump_start(const std::string& file) +{ + dout(10) << __FUNC__ << ": " << file << dendl; + if (m_filestore_do_dump) { + dump_stop(); + } + m_filestore_dump_fmt.reset(); + m_filestore_dump_fmt.open_array_section("dump"); + m_filestore_dump.open(file.c_str()); + m_filestore_do_dump = true; +} + +void FileStore::dump_stop() +{ + dout(10) << __FUNC__ << dendl; + m_filestore_do_dump = false; + if (m_filestore_dump.is_open()) { + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); + m_filestore_dump.close(); + } +} + +void FileStore::dump_transactions(vector<ObjectStore::Transaction>& ls, uint64_t seq, OpSequencer *osr) +{ + m_filestore_dump_fmt.open_array_section("transactions"); + unsigned trans_num = 0; + for (vector<ObjectStore::Transaction>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { + m_filestore_dump_fmt.open_object_section("transaction"); + m_filestore_dump_fmt.dump_stream("osr") << osr->cid; + 
m_filestore_dump_fmt.dump_unsigned("seq", seq); + m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num); + (*i).dump(&m_filestore_dump_fmt); + m_filestore_dump_fmt.close_section(); + } + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); +} + +void FileStore::get_db_statistics(Formatter* f) +{ + object_map->db->get_statistics(f); +} + +void FileStore::set_xattr_limits_via_conf() +{ + uint32_t fs_xattr_size; + uint32_t fs_xattrs; + uint32_t fs_xattr_max_value_size; + + switch (m_fs_type) { +#if defined(__linux__) + case XFS_SUPER_MAGIC: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_xfs; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_xfs; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_xfs; + break; + case BTRFS_SUPER_MAGIC: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_btrfs; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_btrfs; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_btrfs; + break; +#endif + default: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_other; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_other; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_other; + break; + } + + // Use override value if set + if (cct->_conf->filestore_max_inline_xattr_size) + m_filestore_max_inline_xattr_size = cct->_conf->filestore_max_inline_xattr_size; + else + m_filestore_max_inline_xattr_size = fs_xattr_size; + + // Use override value if set + if (cct->_conf->filestore_max_inline_xattrs) + m_filestore_max_inline_xattrs = cct->_conf->filestore_max_inline_xattrs; + else + m_filestore_max_inline_xattrs = fs_xattrs; + + // Use override value if set + if (cct->_conf->filestore_max_xattr_value_size) + m_filestore_max_xattr_value_size = cct->_conf->filestore_max_xattr_value_size; + else + m_filestore_max_xattr_value_size = fs_xattr_max_value_size; + + if 
(m_filestore_max_xattr_value_size < cct->_conf->osd_max_object_name_len) { + derr << "WARNING: max attr value size (" + << m_filestore_max_xattr_value_size + << ") is smaller than osd_max_object_name_len (" + << cct->_conf->osd_max_object_name_len + << "). Your backend filesystem appears to not support attrs large " + << "enough to handle the configured max rados name size. You may get " + << "unexpected ENAMETOOLONG errors on rados operations or buggy " + << "behavior" + << dendl; + } +} + +uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects) +{ + uint64_t res = num_objects * blk_size / 2; //assumes that each object uses ( in average ) additional 1/2 block due to FS allocation granularity. + return res; +} + +int FileStore::apply_layout_settings(const coll_t &cid, int target_level) +{ + dout(20) << __FUNC__ << ": " << cid << " target level: " + << target_level << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r) + << dendl; + return r; + } + + return index->apply_layout_settings(target_level); +} + + +// -- FSSuperblock -- + +void FSSuperblock::encode(bufferlist &bl) const +{ + ENCODE_START(2, 1, bl); + compat_features.encode(bl); + encode(omap_backend, bl); + ENCODE_FINISH(bl); +} + +void FSSuperblock::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(2, bl); + compat_features.decode(bl); + if (struct_v >= 2) + decode(omap_backend, bl); + else + omap_backend = "leveldb"; + DECODE_FINISH(bl); +} + +void FSSuperblock::dump(Formatter *f) const +{ + f->open_object_section("compat"); + compat_features.dump(f); + f->dump_string("omap_backend", omap_backend); + f->close_section(); +} + +void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o) +{ + FSSuperblock z; + o.push_back(new FSSuperblock(z)); + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + 
feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + z.compat_features = CompatSet(feature_compat, feature_ro_compat, + feature_incompat); + o.push_back(new FSSuperblock(z)); + z.omap_backend = "rocksdb"; + o.push_back(new FSSuperblock(z)); +} + +#undef dout_prefix +#define dout_prefix *_dout << "filestore.osr(" << this << ") " + +void FileStore::OpSequencer::_register_apply(Op *o) +{ + if (o->registered_apply) { + dout(20) << __func__ << " " << o << " already registered" << dendl; + return; + } + o->registered_apply = true; + for (auto& t : o->tls) { + for (auto& i : t.get_object_index()) { + uint32_t key = i.first.hobj.get_hash(); + applying.emplace(make_pair(key, &i.first)); + dout(20) << __func__ << " " << o << " " << i.first << " (" + << &i.first << ")" << dendl; + } + } +} + +void FileStore::OpSequencer::_unregister_apply(Op *o) +{ + ceph_assert(o->registered_apply); + for (auto& t : o->tls) { + for (auto& i : t.get_object_index()) { + uint32_t key = i.first.hobj.get_hash(); + auto p = applying.find(key); + bool removed = false; + while (p != applying.end() && + p->first == key) { + if (p->second == &i.first) { + dout(20) << __func__ << " " << o << " " << i.first << " (" + << &i.first << ")" << dendl; + applying.erase(p); + removed = true; + break; + } + ++p; + } + ceph_assert(removed); + } + } +} + +void FileStore::OpSequencer::wait_for_apply(const ghobject_t& oid) +{ + Mutex::Locker l(qlock); + uint32_t key = oid.hobj.get_hash(); +retry: + while (true) { + // search all items in hash slot for a matching object + auto p = applying.find(key); + while (p != applying.end() && + p->first == key) { + if (*p->second == oid) { + dout(20) << __func__ << " " << oid << " waiting on " << p->second + << dendl; + cond.Wait(qlock); + goto retry; + } + ++p; + } + break; + } + dout(20) << __func__ << " " << oid << " done" << dendl; +} diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h new file mode 100644 index 00000000..e09b9e04 --- /dev/null 
+++ b/src/os/filestore/FileStore.h @@ -0,0 +1,938 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILESTORE_H +#define CEPH_FILESTORE_H + +#include "include/types.h" + +#include <map> +#include <deque> +#include <atomic> +#include <fstream> + + +#include <boost/scoped_ptr.hpp> + +#include "include/unordered_map.h" + +#include "include/ceph_assert.h" + +#include "os/ObjectStore.h" +#include "JournalingObjectStore.h" + +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "common/perf_counters.h" +#include "common/zipkin_trace.h" + +#include "common/Mutex.h" +#include "HashIndex.h" +#include "IndexManager.h" +#include "os/ObjectMap.h" +#include "SequencerPosition.h" +#include "FDCache.h" +#include "WBThrottle.h" + +#include "include/uuid.h" + +#if defined(__linux__) +# ifndef BTRFS_SUPER_MAGIC +#define BTRFS_SUPER_MAGIC 0x9123683EUL +# endif +# ifndef XFS_SUPER_MAGIC +#define XFS_SUPER_MAGIC 0x58465342UL +# endif +# ifndef ZFS_SUPER_MAGIC +#define ZFS_SUPER_MAGIC 0x2fc12fc1UL +# endif +#endif + + +class FileStoreBackend; + +#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") + +enum { + l_filestore_first = 84000, + l_filestore_journal_queue_ops, + l_filestore_journal_queue_bytes, + l_filestore_journal_ops, + l_filestore_journal_bytes, + l_filestore_journal_latency, + l_filestore_journal_wr, + l_filestore_journal_wr_bytes, + l_filestore_journal_full, + l_filestore_committing, + l_filestore_commitcycle, + l_filestore_commitcycle_interval, + l_filestore_commitcycle_latency, + l_filestore_op_queue_max_ops, + 
l_filestore_op_queue_ops, + l_filestore_ops, + l_filestore_op_queue_max_bytes, + l_filestore_op_queue_bytes, + l_filestore_bytes, + l_filestore_apply_latency, + l_filestore_queue_transaction_latency_avg, + l_filestore_sync_pause_max_lat, + l_filestore_last, +}; + +class FSSuperblock { +public: + CompatSet compat_features; + string omap_backend; + + FSSuperblock() { } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<FSSuperblock*>& o); +}; +WRITE_CLASS_ENCODER(FSSuperblock) + +inline ostream& operator<<(ostream& out, const FSSuperblock& sb) +{ + return out << "sb(" << sb.compat_features << "): " + << sb.omap_backend; +} + +class FileStore : public JournalingObjectStore, + public md_config_obs_t +{ + static const uint32_t target_version = 4; +public: + uint32_t get_target_version() { + return target_version; + } + + static int get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid); + struct FSPerfTracker { + PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns; + PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns; + + objectstore_perf_stat_t get_cur_stats() const { + objectstore_perf_stat_t ret; + ret.os_commit_latency_ns = os_commit_latency_ns.current_avg(); + ret.os_apply_latency_ns = os_apply_latency_ns.current_avg(); + return ret; + } + + void update_from_perfcounters(PerfCounters &logger); + } perf_tracker; + objectstore_perf_stat_t get_cur_stats() override { + perf_tracker.update_from_perfcounters(*logger); + return perf_tracker.get_cur_stats(); + } + const PerfCounters* get_perf_counters() const override { + return logger; + } + +private: + string internal_name; ///< internal name, used to name the perfcounter instance + string basedir, journalpath; + osflagbits_t generic_flags; + std::string current_fn; + std::string current_op_seq_fn; + std::string omap_dir; + uuid_d fsid; + + size_t blk_size; ///< fs block size + + 
int fsid_fd, op_fd, basedir_fd, current_fd; + + FileStoreBackend *backend; + + void create_backend(unsigned long f_type); + + string devname; + + int vdo_fd = -1; + string vdo_name; + + deque<uint64_t> snaps; + + // Indexed Collections + IndexManager index_manager; + int get_index(const coll_t& c, Index *index); + int init_index(const coll_t& c); + + bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) { + // - normal temp case: cid is pg, object is temp (pool < -1) + // - hammer temp case: cid is pg (or already temp), object pool is -1 + return cid.is_pg() && oid.hobj.pool <= -1; + } + void init_temp_collections(); + + void handle_eio(); + + // ObjectMap + boost::scoped_ptr<ObjectMap> object_map; + + // helper fns + int get_cdir(const coll_t& cid, char *s, int len); + + /// read a uuid from fd + int read_fsid(int fd, uuid_d *uuid); + + /// lock fsid_fd + int lock_fsid(); + + // sync thread + Mutex lock; + bool force_sync; + Cond sync_cond; + + Mutex sync_entry_timeo_lock; + SafeTimer timer; + + list<Context*> sync_waiters; + bool stop; + void sync_entry(); + struct SyncThread : public Thread { + FileStore *fs; + explicit SyncThread(FileStore *f) : fs(f) {} + void *entry() override { + fs->sync_entry(); + return 0; + } + } sync_thread; + + // -- op workqueue -- + struct Op { + utime_t start; + uint64_t op; + vector<Transaction> tls; + Context *onreadable, *onreadable_sync; + uint64_t ops, bytes; + TrackedOpRef osd_op; + ZTracer::Trace trace; + bool registered_apply = false; + }; + class OpSequencer : public CollectionImpl { + CephContext *cct; + Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) + list<Op*> q; + list<uint64_t> jq; + list<pair<uint64_t, Context*> > flush_commit_waiters; + Cond cond; + string osr_name_str; + /// hash of pointers to ghobject_t's for in-flight writes + unordered_multimap<uint32_t,const ghobject_t*> applying; + public: + Mutex apply_lock; // for apply mutual exclusion + int 
id; + const char *osr_name; + + /// get_max_uncompleted + bool _get_max_uncompleted( + uint64_t *seq ///< [out] max uncompleted seq + ) { + ceph_assert(qlock.is_locked()); + ceph_assert(seq); + *seq = 0; + if (q.empty() && jq.empty()) + return true; + + if (!q.empty()) + *seq = q.back()->op; + if (!jq.empty() && jq.back() > *seq) + *seq = jq.back(); + + return false; + } /// @returns true if both queues are empty + + /// get_min_uncompleted + bool _get_min_uncompleted( + uint64_t *seq ///< [out] min uncompleted seq + ) { + ceph_assert(qlock.is_locked()); + ceph_assert(seq); + *seq = 0; + if (q.empty() && jq.empty()) + return true; + + if (!q.empty()) + *seq = q.front()->op; + if (!jq.empty() && jq.front() < *seq) + *seq = jq.front(); + + return false; + } /// @returns true if both queues are empty + + void _wake_flush_waiters(list<Context*> *to_queue) { + uint64_t seq; + if (_get_min_uncompleted(&seq)) + seq = -1; + + for (list<pair<uint64_t, Context*> >::iterator i = + flush_commit_waiters.begin(); + i != flush_commit_waiters.end() && i->first < seq; + flush_commit_waiters.erase(i++)) { + to_queue->push_back(i->second); + } + } + + void queue_journal(Op *o) { + Mutex::Locker l(qlock); + jq.push_back(o->op); + _register_apply(o); + } + void dequeue_journal(list<Context*> *to_queue) { + Mutex::Locker l(qlock); + jq.pop_front(); + cond.Signal(); + _wake_flush_waiters(to_queue); + } + void queue(Op *o) { + Mutex::Locker l(qlock); + q.push_back(o); + _register_apply(o); + o->trace.keyval("queue depth", q.size()); + } + void _register_apply(Op *o); + void _unregister_apply(Op *o); + void wait_for_apply(const ghobject_t& oid); + Op *peek_queue() { + Mutex::Locker l(qlock); + ceph_assert(apply_lock.is_locked()); + return q.front(); + } + + Op *dequeue(list<Context*> *to_queue) { + ceph_assert(to_queue); + ceph_assert(apply_lock.is_locked()); + Mutex::Locker l(qlock); + Op *o = q.front(); + q.pop_front(); + cond.Signal(); + _unregister_apply(o); + 
_wake_flush_waiters(to_queue); + return o; + } + + void flush() override { + Mutex::Locker l(qlock); + + while (cct->_conf->filestore_blackhole) + cond.Wait(qlock); // wait forever + + + // get max for journal _or_ op queues + uint64_t seq = 0; + if (!q.empty()) + seq = q.back()->op; + if (!jq.empty() && jq.back() > seq) + seq = jq.back(); + + if (seq) { + // everything prior to our watermark to drain through either/both queues + while ((!q.empty() && q.front()->op <= seq) || + (!jq.empty() && jq.front() <= seq)) + cond.Wait(qlock); + } + } + bool flush_commit(Context *c) override { + Mutex::Locker l(qlock); + uint64_t seq = 0; + if (_get_max_uncompleted(&seq)) { + return true; + } else { + flush_commit_waiters.push_back(make_pair(seq, c)); + return false; + } + } + + OpSequencer(CephContext* cct, int i, coll_t cid) + : CollectionImpl(cid), + cct(cct), + qlock("FileStore::OpSequencer::qlock", false, false), + osr_name_str(stringify(cid)), + apply_lock("FileStore::OpSequencer::apply_lock", false, false), + id(i), + osr_name(osr_name_str.c_str()) {} + ~OpSequencer() override { + ceph_assert(q.empty()); + } + }; + typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef; + + Mutex coll_lock; + map<coll_t,OpSequencerRef> coll_map; + + friend ostream& operator<<(ostream& out, const OpSequencer& s); + + FDCache fdcache; + WBThrottle wbthrottle; + + std::atomic<int64_t> next_osr_id = { 0 }; + bool m_disable_wbthrottle; + deque<OpSequencer*> op_queue; + BackoffThrottle throttle_ops, throttle_bytes; + const int m_ondisk_finisher_num; + const int m_apply_finisher_num; + vector<Finisher*> ondisk_finishers; + vector<Finisher*> apply_finishers; + + ThreadPool op_tp; + struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> { + FileStore *store; + OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp) + : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {} + + bool _enqueue(OpSequencer *osr) override { + 
store->op_queue.push_back(osr); + return true; + } + void _dequeue(OpSequencer *o) override { + ceph_abort(); + } + bool _empty() override { + return store->op_queue.empty(); + } + OpSequencer *_dequeue() override { + if (store->op_queue.empty()) + return nullptr; + OpSequencer *osr = store->op_queue.front(); + store->op_queue.pop_front(); + return osr; + } + void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override { + store->_do_op(osr, handle); + } + void _process_finish(OpSequencer *osr) override { + store->_finish_op(osr); + } + void _clear() override { + ceph_assert(store->op_queue.empty()); + } + } op_wq; + + void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle); + void _finish_op(OpSequencer *o); + Op *build_op(vector<Transaction>& tls, + Context *onreadable, Context *onreadable_sync, + TrackedOpRef osd_op); + void queue_op(OpSequencer *osr, Op *o); + void op_queue_reserve_throttle(Op *o); + void op_queue_release_throttle(Op *o); + void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk); + friend struct C_JournaledAhead; + + void new_journal(); + + PerfCounters *logger; + + ZTracer::Endpoint trace_endpoint; + +public: + int lfn_find(const ghobject_t& oid, const Index& index, + IndexedPath *path = nullptr); + int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length); + int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf); + int lfn_open( + const coll_t& cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index = nullptr); + + void lfn_close(FDRef fd); + int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ; + int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos, + bool force_clear_omap=false); + +public: + FileStore(CephContext* cct, const std::string &base, const std::string &jdev, + osflagbits_t flags = 0, + const char *internal_name = "filestore", bool update_to=false); + ~FileStore() override; + + 
string get_type() override { + return "filestore"; + } + + int _detect_fs(); + int _sanity_check_fs(); + + bool test_mount_in_use() override; + int read_op_seq(uint64_t *seq); + int write_op_seq(int, uint64_t seq); + int mount() override; + int umount() override; + + int validate_hobject_key(const hobject_t &obj) const override; + + unsigned get_max_attr_name_length() override { + // xattr limit is 128; leave room for our prefixes (user.ceph._), + // some margin, and cap at 100 + return 100; + } + int mkfs() override; + int mkjournal() override; + bool wants_journal() override { + return true; + } + bool allows_journal() override { + return true; + } + bool needs_journal() override { + return false; + } + + bool is_sync_onreadable() const override { + return false; + } + + bool is_rotational() override; + bool is_journal_rotational() override; + + void dump_perf_counters(Formatter *f) override { + f->open_object_section("perf_counters"); + logger->dump_formatted(f, false); + f->close_section(); + } + + int flush_cache(ostream *os = NULL) override; + int write_version_stamp(); + int version_stamp_is_valid(uint32_t *version); + int update_version_stamp(); + int upgrade() override; + + bool can_sort_nibblewise() override { + return true; // i support legacy sort order + } + + void collect_metadata(map<string,string> *pm) override; + int get_devices(set<string> *ls) override; + + int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) override; + int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override; + + int _do_transactions( + vector<Transaction> &tls, uint64_t op_seq, + ThreadPool::TPHandle *handle, + const char *osr_name); + int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override { + return _do_transactions(tls, op_seq, nullptr, "replay"); + } + void _do_transaction( + Transaction& t, uint64_t op_seq, int trans_num, + ThreadPool::TPHandle *handle, const char *osr_name); + + CollectionHandle open_collection(const 
coll_t& c) override; + CollectionHandle create_new_collection(const coll_t& c) override; + void set_collection_commit_queue(const coll_t& cid, + ContextQueue *commit_queue) override { + } + + int queue_transactions(CollectionHandle& ch, vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = nullptr) override; + + /** + * set replay guard xattr on given file + * + * This will ensure that we will not replay this (or any previous) operation + * against this particular inode/object. + * + * @param fd open file descriptor for the file/object + * @param spos sequencer position of the last operation we should not replay + */ + void _set_replay_guard(int fd, + const SequencerPosition& spos, + const ghobject_t *oid=0, + bool in_progress=false); + void _set_replay_guard(const coll_t& cid, + const SequencerPosition& spos, + bool in_progress); + void _set_global_replay_guard(const coll_t& cid, + const SequencerPosition &spos); + + /// close a replay guard opened with in_progress=true + void _close_replay_guard(int fd, const SequencerPosition& spos, + const ghobject_t *oid=0); + void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos); + + /** + * check replay guard xattr on given file + * + * Check the current position against any marker on the file that + * indicates which operations have already been applied. If the + * current or a newer operation has been marked as applied, we + * should not replay the current operation again. + * + * If we are not replaying the journal, we already return true. It + * is only on replay that we might return false, indicated that the + * operation should not be performed (again). 
+ * + * @param fd open fd on the file/object in question + * @param spos sequencerposition for an operation we could apply/replay + * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress + */ + int _check_replay_guard(int fd, const SequencerPosition& spos); + int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos); + int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos); + int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos); + + // ------------------ + // objects + int pick_object_revision_lt(ghobject_t& oid) { + return 0; + } + using ObjectStore::exists; + bool exists(CollectionHandle& c, const ghobject_t& oid) override; + using ObjectStore::stat; + int stat( + CollectionHandle& c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) override; + using ObjectStore::set_collection_opts; + int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) override; + using ObjectStore::read; + int read( + CollectionHandle& c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) override; + int _do_fiemap(int fd, uint64_t offset, size_t len, + map<uint64_t, uint64_t> *m); + int _do_seek_hole_data(int fd, uint64_t offset, size_t len, + map<uint64_t, uint64_t> *m); + using ObjectStore::fiemap; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override; + + int _touch(const coll_t& cid, const ghobject_t& oid); + int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags = 0); + int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len); + int _truncate(const coll_t& cid, const 
ghobject_t& oid, uint64_t size); + int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid, + const SequencerPosition& spos); + int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos); + int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); + int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); + int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false); + int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos); + + int _fgetattr(int fd, const char *name, bufferptr& bp); + int _fgetattrs(int fd, map<string,bufferptr>& aset); + int _fsetattrs(int fd, map<string, bufferptr> &aset); + + void do_force_sync(); + void start_sync(Context *onsafe); + void sync(); + void _flush_op_queue(); + void flush(); + void sync_and_flush(); + + int flush_journal() override; + int dump_journal(ostream& out) override; + + void set_fsid(uuid_d u) override { + fsid = u; + } + uuid_d get_fsid() override { return fsid; } + + uint64_t estimate_objects_overhead(uint64_t num_objects) override; + + // DEBUG read error injection, an object is removed from both on delete() + Mutex read_error_lock; + set<ghobject_t> data_error_set; // read() will return -EIO + set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO + void inject_data_error(const ghobject_t &oid) override; + void inject_mdata_error(const ghobject_t &oid) override; + + void compact() override { + ceph_assert(object_map); + object_map->compact(); + } + + bool has_builtin_csum() const override { + return false; + } + + void debug_obj_on_delete(const ghobject_t &oid); + bool debug_data_eio(const ghobject_t &oid); + bool debug_mdata_eio(const ghobject_t &oid); + + int snapshot(const string& name) override; + 
+ // attrs + using ObjectStore::getattr; + using ObjectStore::getattrs; + int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr &bp) override; + int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override; + + int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset, + const SequencerPosition &spos); + int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos); + int _rmattrs(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos); + + int _collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos); + + int _collection_set_bits(const coll_t& cid, int bits); + + // collections + using ObjectStore::collection_list; + int collection_bits(CollectionHandle& c) override; + int collection_list(CollectionHandle& c, + const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *next) override { + c->flush(); + return collection_list(c->cid, start, end, max, ls, next); + } + int collection_list(const coll_t& cid, + const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *next); + int list_collections(vector<coll_t>& ls) override; + int list_collections(vector<coll_t>& ls, bool include_temp); + int collection_stat(const coll_t& c, struct stat *st); + bool collection_exists(const coll_t& c) override; + int collection_empty(CollectionHandle& c, bool *empty) override { + c->flush(); + return collection_empty(c->cid, empty); + } + int collection_empty(const coll_t& cid, bool *empty); + + // omap (see ObjectStore.h for documentation) + using ObjectStore::omap_get; + int omap_get(CollectionHandle& c, const ghobject_t &oid, bufferlist *header, + map<string, bufferlist> *out) override; + using ObjectStore::omap_get_header; + int omap_get_header( + CollectionHandle& c, + const ghobject_t &oid, + bufferlist *out, + bool allow_eio = false) 
override; + using ObjectStore::omap_get_keys; + int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, set<string> *keys) override; + using ObjectStore::omap_get_values; + int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys, + map<string, bufferlist> *out) override; + using ObjectStore::omap_check_keys; + int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys, + set<string> *out) override; + using ObjectStore::get_omap_iterator; + ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override; + ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid); + + int _create_collection(const coll_t& c, int bits, + const SequencerPosition &spos); + int _destroy_collection(const coll_t& c); + /** + * Give an expected number of objects hint to the collection. + * + * @param c - collection id. + * @param pg_num - pg number of the pool this collection belongs to + * @param expected_num_objs - expected number of objects in this collection + * @param spos - sequence position + * + * @return 0 on success, an error code otherwise + */ + int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos); + int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid, + const SequencerPosition& spos); + int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos, + bool ignore_enoent = false); + + int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size); + + void dump_start(const std::string& file); + void dump_stop(); + void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr); + + virtual int apply_layout_settings(const coll_t &cid, int target_level); + + void 
get_db_statistics(Formatter* f) override; + +private: + void _inject_failure(); + + // omap + int _omap_clear(const coll_t& cid, const ghobject_t &oid, + const SequencerPosition &spos); + int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, + const map<string, bufferlist> &aset, + const SequencerPosition &spos); + int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys, + const SequencerPosition &spos); + int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid, + const string& first, const string& last, + const SequencerPosition &spos); + int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl, + const SequencerPosition &spos); + int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest, + const SequencerPosition &spos); + int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest, + const SequencerPosition &spos); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + int set_throttle_params(); + float m_filestore_commit_timeout; + bool m_filestore_journal_parallel; + bool m_filestore_journal_trailing; + bool m_filestore_journal_writeahead; + int m_filestore_fiemap_threshold; + double m_filestore_max_sync_interval; + double m_filestore_min_sync_interval; + bool m_filestore_fail_eio; + bool m_filestore_fadvise; + int do_update; + bool m_journal_dio, m_journal_aio, m_journal_force_aio; + std::string m_osd_rollback_to_cluster_snap; + bool m_osd_use_stale_snap; + bool m_filestore_do_dump; + std::ofstream m_filestore_dump; + JSONFormatter m_filestore_dump_fmt; + std::atomic<int64_t> m_filestore_kill_at = { 0 }; + bool m_filestore_sloppy_crc; + int m_filestore_sloppy_crc_block_size; + uint64_t m_filestore_max_alloc_hint_size; + unsigned long m_fs_type; + + //Determined xattr handling based on fs type + void set_xattr_limits_via_conf(); + uint32_t 
m_filestore_max_inline_xattr_size; + uint32_t m_filestore_max_inline_xattrs; + uint32_t m_filestore_max_xattr_value_size; + + FSSuperblock superblock; + + /** + * write_superblock() + * + * Write superblock to persisent storage + * + * return value: 0 on success, otherwise negative errno + */ + int write_superblock(); + + /** + * read_superblock() + * + * Fill in FileStore::superblock by reading persistent storage + * + * return value: 0 on success, otherwise negative errno + */ + int read_superblock(); + + friend class FileStoreBackend; + friend class TestFileStore; +}; + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s); + +struct fiemap; + +class FileStoreBackend { +private: + FileStore *filestore; +protected: + int get_basedir_fd() { + return filestore->basedir_fd; + } + int get_current_fd() { + return filestore->current_fd; + } + int get_op_fd() { + return filestore->op_fd; + } + size_t get_blksize() { + return filestore->blk_size; + } + const string& get_basedir_path() { + return filestore->basedir; + } + const string& get_journal_path() { + return filestore->journalpath; + } + const string& get_current_path() { + return filestore->current_fn; + } + int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { + if (has_fiemap() || has_seek_data_hole()) { + return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff); + } else { + return filestore->_do_copy_range(from, to, srcoff, len, dstoff); + } + } + int get_crc_block_size() { + return filestore->m_filestore_sloppy_crc_block_size; + } + +public: + explicit FileStoreBackend(FileStore *fs) : filestore(fs) {} + virtual ~FileStoreBackend() {} + + CephContext* cct() const { + return filestore->cct; + } + + static FileStoreBackend *create(unsigned long f_type, FileStore *fs); + + virtual const char *get_name() = 0; + virtual int detect_features() = 0; + virtual int create_current() = 0; + virtual bool can_checkpoint() = 0; + virtual int 
list_checkpoints(list<string>& ls) = 0; + virtual int create_checkpoint(const string& name, uint64_t *cid) = 0; + virtual int sync_checkpoint(uint64_t id) = 0; + virtual int rollback_to(const string& name) = 0; + virtual int destroy_checkpoint(const string& name) = 0; + virtual int syncfs() = 0; + virtual bool has_fiemap() = 0; + virtual bool has_seek_data_hole() = 0; + virtual bool is_rotational() = 0; + virtual bool is_journal_rotational() = 0; + virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0; + virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0; + virtual int set_alloc_hint(int fd, uint64_t hint) = 0; + virtual bool has_splice() const = 0; + + // hooks for (sloppy) crc tracking + virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0; + virtual int _crc_update_truncate(int fd, loff_t off) = 0; + virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0; + virtual int _crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) = 0; + virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) = 0; +}; + +#endif diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc new file mode 100644 index 00000000..a75d501f --- /dev/null +++ b/src/os/filestore/GenericFileStoreBackend.cc @@ -0,0 +1,468 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/int_types.h" +#include "include/types.h" + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> + +#if defined(__linux__) +#include <linux/fs.h> +#endif + +#include "include/compat.h" +#include "include/linux_fiemap.h" + +#include <iostream> +#include <fstream> +#include <sstream> + +#include "GenericFileStoreBackend.h" + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" +#include "common/blkdev.h" + +#include "common/SloppyCRCMap.h" +#include "os/filestore/chain_xattr.h" + +#define SLOPPY_CRC_XATTR "user.cephos.scrc" + + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " + +#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) +#define ALIGNED(x, by) (!((x) % (by))) +#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by))) + +GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): + FileStoreBackend(fs), + ioctl_fiemap(false), + seek_data_hole(false), + use_splice(false), + m_filestore_fiemap(cct()->_conf->filestore_fiemap), + m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole), + m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data), + m_filestore_splice(cct()->_conf->filestore_splice) +{ + // rotational? + { + // NOTE: the below won't work on btrfs; we'll assume rotational. + string fn = get_basedir_path(); + int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) { + return; + } + BlkDev blkdev(fd); + m_rotational = blkdev.is_rotational(); + dout(20) << __func__ << " basedir " << fn + << " rotational " << (int)m_rotational << dendl; + ::close(fd); + } + // journal rotational? + { + // NOTE: the below won't work on btrfs; we'll assume rotational. 
+ string fn = get_journal_path(); + int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) { + return; + } + BlkDev blkdev(fd); + m_journal_rotational = blkdev.is_rotational(); + dout(20) << __func__ << " journal filename " << fn.c_str() + << " journal rotational " << (int)m_journal_rotational << dendl; + ::close(fd); + } +} + +int GenericFileStoreBackend::detect_features() +{ + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); + + int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644); + if (fd < 0) { + fd = -errno; + derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; + return fd; + } + + // ext4 has a bug in older kernels where fiemap will return an empty + // result in some cases. this is a file layout that triggers the bug + // on 2.6.34-rc5. + int v[] = { + 0x0000000000016000, 0x0000000000007000, + 0x000000000004a000, 0x0000000000007000, + 0x0000000000060000, 0x0000000000001000, + 0x0000000000061000, 0x0000000000008000, + 0x0000000000069000, 0x0000000000007000, + 0x00000000000a3000, 0x000000000000c000, + 0x000000000024e000, 0x000000000000c000, + 0x000000000028b000, 0x0000000000009000, + 0x00000000002b1000, 0x0000000000003000, + 0, 0 + }; + for (int i=0; v[i]; i++) { + int off = v[i++]; + int len = v[i]; + + // write a large extent + char buf[len]; + memset(buf, 1, sizeof(buf)); + int r = ::lseek(fd, off, SEEK_SET); + if (r < 0) { + r = -errno; + derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + r = write(fd, buf, sizeof(buf)); + if (r < 0) { + derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + } + + // fiemap an extent inside that + if (!m_filestore_fiemap) { + dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; + ioctl_fiemap = 
false; + } else { + struct fiemap *fiemap; + int r = do_fiemap(fd, 2430421, 59284, &fiemap); + if (r < 0) { + dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; + ioctl_fiemap = false; + } else { + if (fiemap->fm_mapped_extents == 0) { + dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; + ioctl_fiemap = false; + } else { + dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; + ioctl_fiemap = true; + } + free(fiemap); + } + } + + // SEEK_DATA/SEEK_HOLE detection + if (!m_filestore_seek_data_hole) { + dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; + seek_data_hole = false; + } else { +#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) + // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running + // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. + // Fall back to use fiemap. 
+ off_t hole_pos; + + hole_pos = lseek(fd, 0, SEEK_HOLE); + if (hole_pos < 0) { + if (errno == EINVAL) { + dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; + seek_data_hole = false; + } else { + derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return -errno; + } + } else { + dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; + seek_data_hole = true; + } +#endif + } + + //splice detection +#ifdef CEPH_HAVE_SPLICE + if (!m_filestore_splice) { + dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl; + use_splice = false; + } else { + int pipefd[2]; + loff_t off_in = 0; + int r; + if (pipe_cloexec(pipefd) < 0) { + int e = errno; + dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl; + } else { + lseek(fd, 0, SEEK_SET); + r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); + if (!(r < 0 && errno == EINVAL)) { + use_splice = true; + dout(0) << "detect_features: splice is supported" << dendl; + } else + dout(0) << "detect_features: splice is NOT supported" << dendl; + close(pipefd[0]); + close(pipefd[1]); + } + } +#endif + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + + bool have_syncfs = false; +#ifdef HAVE_SYS_SYNCFS + if (::syncfs(get_basedir_fd()) == 0) { + dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; + } +#elif defined(SYS_syncfs) + if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { + dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#elif defined(__NR_syncfs) + if (syscall(__NR_syncfs, 
get_basedir_fd()) == 0) { + dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#endif + if (!have_syncfs) { + dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; + if (m_filestore_fsync_flushes_journal_data) { + dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; + } else { + dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; + dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; + } + } + + return 0; +} + +int GenericFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "_create_current: current/ exists but is not a directory" << dendl; + ret = -EINVAL; + } + } else { + ret = ::mkdir(get_current_path().c_str(), 0755); + if (ret < 0) { + ret = -errno; + dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; + } + } + return ret; +} + +int GenericFileStoreBackend::syncfs() +{ + int ret; + if (m_filestore_fsync_flushes_journal_data) { + dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; + // make the file system's journal commit. 
+ // this works with ext3, but NOT ext4 + ret = ::fsync(get_op_fd()); + if (ret < 0) + ret = -errno; + } else { + dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; + ret = sync_filesystem(get_current_fd()); + } + return ret; +} + +int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) +{ + struct fiemap *fiemap = NULL; + struct fiemap *_realloc_fiemap = NULL; + int size; + int ret; + + fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); + if (!fiemap) + return -ENOMEM; + /* + * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), + * the result is (logical=4096, len=4096). It leak the [3990, 4096). + * Commit:"xfs: fix rounding error of fiemap length parameter + * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. + * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. + */ + fiemap->fm_start = start - start % CEPH_PAGE_SIZE; + fiemap->fm_length = len + start % CEPH_PAGE_SIZE; + fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ + +#if defined(__APPLE__) || defined(__FreeBSD__) + ret = -ENOTSUP; + goto done_err; +#else + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + ret = -errno; + goto done_err; + } +#endif + size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); + + _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); + if (!_realloc_fiemap) { + ret = -ENOMEM; + goto done_err; + } else { + fiemap = _realloc_fiemap; + } + + memset(fiemap->fm_extents, 0, size); + + fiemap->fm_extent_count = fiemap->fm_mapped_extents; + fiemap->fm_mapped_extents = 0; + +#if defined(__APPLE__) || defined(__FreeBSD__) + ret = -ENOTSUP; + goto done_err; +#else + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + ret = -errno; + goto done_err; + } + *pfiemap = fiemap; +#endif + return 0; + +done_err: + *pfiemap = NULL; + free(fiemap); + return ret; +} + + +int GenericFileStoreBackend::_crc_load_or_init(int fd, 
SloppyCRCMap *cm) +{ + char buf[100]; + bufferptr bp; + int r = 0; + int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); + if (l == -ENODATA) { + return 0; + } + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), buf, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); + } + } + bufferlist bl; + bl.append(std::move(bp)); + auto p = bl.cbegin(); + try { + decode(*cm, p); + } + catch (buffer::error &e) { + r = -EIO; + } + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; +} + +int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) +{ + bufferlist bl; + encode(*cm, bl); + int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; +} + +int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + ostringstream ss; + scm.write(off, len, bl, &ss); + dout(30) << __func__ << "\n" << ss.str() << dendl; + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + scm.truncate(off); + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + scm.zero(off, len); + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) +{ + SloppyCRCMap scm_src(get_crc_block_size()); + SloppyCRCMap 
scm_dst(get_crc_block_size()); + int r = _crc_load_or_init(srcfd, &scm_src); + if (r < 0) + return r; + r = _crc_load_or_init(destfd, &scm_dst); + if (r < 0) + return r; + ostringstream ss; + scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); + dout(30) << __func__ << "\n" << ss.str() << dendl; + r = _crc_save(destfd, &scm_dst); + return r; +} + +int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + return scm.read(off, len, bl, out); +} diff --git a/src/os/filestore/GenericFileStoreBackend.h b/src/os/filestore/GenericFileStoreBackend.h new file mode 100644 index 00000000..207c3d0d --- /dev/null +++ b/src/os/filestore/GenericFileStoreBackend.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_GENERICFILESTOREBACKEDN_H +#define CEPH_GENERICFILESTOREBACKEDN_H + +#include "FileStore.h" + +class SloppyCRCMap; + +class GenericFileStoreBackend : public FileStoreBackend { +private: + bool ioctl_fiemap; + bool seek_data_hole; + bool use_splice; + bool m_filestore_fiemap; + bool m_filestore_seek_data_hole; + bool m_filestore_fsync_flushes_journal_data; + bool m_filestore_splice; + bool m_rotational = true; + bool m_journal_rotational = true; +public: + explicit GenericFileStoreBackend(FileStore *fs); + ~GenericFileStoreBackend() override {} + + const char *get_name() override { + return "generic"; + } + int detect_features() override; + int create_current() override; + bool can_checkpoint() override { return false; } + bool is_rotational() override { + return m_rotational; + } + bool is_journal_rotational() override { + return m_journal_rotational; + } + int list_checkpoints(list<string>& ls) override { return 0; } + int create_checkpoint(const string& name, uint64_t *cid) override { return -EOPNOTSUPP; } + int sync_checkpoint(uint64_t id) override { return -EOPNOTSUPP; } + int rollback_to(const string& name) override { return -EOPNOTSUPP; } + int destroy_checkpoint(const string& name) override { return -EOPNOTSUPP; } + int syncfs() override; + bool has_fiemap() override { return ioctl_fiemap; } + bool has_seek_data_hole() override { return seek_data_hole; } + int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) override; + int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override { + return _copy_range(from, to, srcoff, len, dstoff); + } + int set_alloc_hint(int fd, uint64_t hint) override { return -EOPNOTSUPP; } + bool has_splice() const override { return use_splice; } +private: + int _crc_load_or_init(int fd, SloppyCRCMap *cm); + int _crc_save(int fd, SloppyCRCMap *cm); +public: + int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) override; + int 
_crc_update_truncate(int fd, loff_t off) override; + int _crc_update_zero(int fd, loff_t off, size_t len) override; + int _crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) override; + int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) override; +}; +#endif diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc new file mode 100644 index 00000000..ab56b43c --- /dev/null +++ b/src/os/filestore/HashIndex.cc @@ -0,0 +1,1195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "include/types.h" +#include "include/buffer.h" +#include "osd/osd_types.h" +#include <errno.h> + +#include "HashIndex.h" + +#include "common/errno.h" +#include "common/debug.h" +#define dout_context cct +#define dout_subsys ceph_subsys_filestore + +const string HashIndex::SUBDIR_ATTR = "contents"; +const string HashIndex::SETTINGS_ATTR = "settings"; +const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op"; + +/// hex digit to integer value +int hex_to_int(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + ceph_abort(); +} + +/// int value to hex digit +char int_to_hex(int v) +{ + ceph_assert(v < 16); + if (v < 10) + return '0' + v; + return 'A' + v - 10; +} + +/// reverse bits in a nibble (0..15) +int reverse_nibble_bits(int in) +{ + ceph_assert(in < 16); + return + ((in & 8) >> 3) | + ((in & 4) >> 1) | + ((in & 2) << 1) | + ((in & 1) << 3); +} + +/// reverse nibble bits in a hex digit +char 
reverse_hexdigit_bits(char c) +{ + return int_to_hex(reverse_nibble_bits(hex_to_int(c))); +} + +/// reverse nibble bits in a hex string +string reverse_hexdigit_bits_string(string s) +{ + for (unsigned i=0; i<s.size(); ++i) + s[i] = reverse_hexdigit_bits(s[i]); + return s; +} + +/// compare hex digit (as length 1 string) bitwise +bool cmp_hexdigit_bitwise(const string& l, const string& r) +{ + ceph_assert(l.length() == 1 && r.length() == 1); + int lv = hex_to_int(l[0]); + int rv = hex_to_int(r[0]); + ceph_assert(lv < 16); + ceph_assert(rv < 16); + return reverse_nibble_bits(lv) < reverse_nibble_bits(rv); +} + +/// compare hex digit string bitwise +bool cmp_hexdigit_string_bitwise(const string& l, const string& r) +{ + string ll = reverse_hexdigit_bits_string(l); + string rr = reverse_hexdigit_bits_string(r); + return ll < rr; +} + +int HashIndex::cleanup() { + bufferlist bl; + int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) { + // No in progress operations! 
+ return 0; + } + auto i = bl.cbegin(); + InProgressOp in_progress(i); + subdir_info_s info; + r = get_info(in_progress.path, &info); + if (r == -ENOENT) { + return end_split_or_merge(in_progress.path); + } else if (r < 0) { + return r; + } + + if (in_progress.is_split()) + return complete_split(in_progress.path, info); + else if (in_progress.is_merge()) + return complete_merge(in_progress.path, info); + else if (in_progress.is_col_split()) { + for (vector<string>::iterator i = in_progress.path.begin(); + i != in_progress.path.end(); + ++i) { + vector<string> path(in_progress.path.begin(), i); + int r = reset_attr(path); + if (r < 0) + return r; + } + return 0; + } + else + return -EINVAL; +} + +int HashIndex::reset_attr( + const vector<string> &path) +{ + int exists = 0; + int r = path_exists(path, &exists); + if (r < 0) + return r; + if (!exists) + return 0; + map<string, ghobject_t> objects; + vector<string> subdirs; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + subdir_info_s info; + info.hash_level = path.size(); + info.objs = objects.size(); + info.subdirs = subdirs.size(); + return set_info(path, info); +} + +int HashIndex::col_split_level( + HashIndex &from, + HashIndex &to, + const vector<string> &path, + uint32_t inbits, + uint32_t match, + unsigned *mkdirred) +{ + /* For each subdir, move, recurse, or ignore based on comparing the low order + * bits of the hash represented by the subdir path with inbits, match passed + * in. 
+ */ + vector<string> subdirs; + int r = from.list_subdirs(path, &subdirs); + if (r < 0) + return r; + map<string, ghobject_t> objects; + r = from.list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + + set<string> to_move; + for (vector<string>::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + uint32_t bits = 0; + uint32_t hash = 0; + vector<string> sub_path(path.begin(), path.end()); + sub_path.push_back(*i); + path_to_hobject_hash_prefix(sub_path, &bits, &hash); + if (bits < inbits) { + if (hobject_t::match_hash(hash, bits, match)) { + r = col_split_level( + from, + to, + sub_path, + inbits, + match, + mkdirred); + if (r < 0) + return r; + if (*mkdirred > path.size()) + *mkdirred = path.size(); + } // else, skip, doesn't need to be moved or recursed into + } else { + if (hobject_t::match_hash(hash, inbits, match)) { + to_move.insert(*i); + } + } // else, skip, doesn't need to be moved or recursed into + } + + /* Then, do the same for each object */ + map<string, ghobject_t> objs_to_move; + for (map<string, ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->second.match(inbits, match)) { + objs_to_move.insert(*i); + } + } + + if (objs_to_move.empty() && to_move.empty()) + return 0; + + // Make parent directories as needed + while (*mkdirred < path.size()) { + ++*mkdirred; + int exists = 0; + vector<string> creating_path(path.begin(), path.begin()+*mkdirred); + r = to.path_exists(creating_path, &exists); + if (r < 0) + return r; + if (exists) + continue; + subdir_info_s info; + info.objs = 0; + info.subdirs = 0; + info.hash_level = creating_path.size(); + if (*mkdirred < path.size() - 1) + info.subdirs = 1; + r = to.start_col_split(creating_path); + if (r < 0) + return r; + r = to.create_path(creating_path); + if (r < 0) + return r; + r = to.set_info(creating_path, info); + if (r < 0) + return r; + r = to.end_split_or_merge(creating_path); + if (r < 0) + return r; + } + + subdir_info_s from_info; + 
subdir_info_s to_info; + r = from.get_info(path, &from_info); + if (r < 0) + return r; + r = to.get_info(path, &to_info); + if (r < 0) + return r; + + from.start_col_split(path); + to.start_col_split(path); + + // Do subdir moves + for (set<string>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + from_info.subdirs--; + to_info.subdirs++; + r = move_subdir(from, to, path, *i); + if (r < 0) + return r; + } + + for (map<string, ghobject_t>::iterator i = objs_to_move.begin(); + i != objs_to_move.end(); + ++i) { + from_info.objs--; + to_info.objs++; + r = move_object(from, to, path, *i); + if (r < 0) + return r; + } + + + r = to.set_info(path, to_info); + if (r < 0) + return r; + r = from.set_info(path, from_info); + if (r < 0) + return r; + from.end_split_or_merge(path); + to.end_split_or_merge(path); + return 0; +} + +int HashIndex::_merge( + uint32_t bits, + CollectionIndex* dest) { + dout(20) << __func__ << " bits " << bits << dendl; + ceph_assert(collection_version() == dest->collection_version()); + + vector<string> emptypath; + + // pre-split to common/target level so that any shared prefix DIR_? + // directories already exist at the destination. 
Since each + // directory is a nibble (4 bits), + unsigned shared = bits / 4; + dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl; + if (shared) { + split_dirs(emptypath, shared); + ((HashIndex*)dest)->split_dirs(emptypath, shared); + } + + // now merge the contents + _merge_dirs(*this, *(HashIndex*)dest, emptypath); + + return 0; +} + +int HashIndex::_merge_dirs( + HashIndex& from, + HashIndex& to, + const vector<string>& path) +{ + dout(20) << __func__ << " path " << path << dendl; + int r; + + vector<string> src_subs, dst_subs; + r = from.list_subdirs(path, &src_subs); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "from.list_subdirs" + << dendl; + return r; + } + r = to.list_subdirs(path, &dst_subs); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "to.list_subdirs" + << dendl; + return r; + } + + for (auto& i : src_subs) { + if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) { + // move it + r = move_subdir(from, to, path, i); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "move_subdir(...," + << path << "," << i << ")" + << dendl; + return r; + } + } else { + // common, recurse! 
+ vector<string> nested = path; + nested.push_back(i); + r = _merge_dirs(from, to, nested); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "rec _merge_dirs" + << dendl; + return r; + } + + // now remove it + r = remove_path(nested); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "remove_path " + << nested + << dendl; + return r; + } + } + } + + // objects + map<string, ghobject_t> objects; + r = from.list_objects(path, 0, 0, &objects); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "from.list_objects" + << dendl; + return r; + } + + for (auto& i : objects) { + r = move_object(from, to, path, i); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "move_object(...," + << path << "," << i << ")" + << dendl; + return r; + } + } + + return 0; +} + + +int HashIndex::_split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest) { + ceph_assert(collection_version() == dest->collection_version()); + unsigned mkdirred = 0; + + return col_split_level( + *this, + *static_cast<HashIndex*>(dest), + vector<string>(), + bits, + match, + &mkdirred); +} + +int HashIndex::split_dirs(const vector<string> &path, int target_level) { + dout(20) << __func__ << " " << path << " target level: " + << target_level << dendl; + subdir_info_s info; + int r = get_info(path, &info); + if (r < 0) { + dout(10) << "error looking up info for " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + + if (must_split(info, target_level)) { + dout(1) << __func__ << " " << path << " has " << info.objs + << " objects, " << info.hash_level + << " level, starting split in pg " << coll() << "." 
<< dendl; + r = initiate_split(path, info); + if (r < 0) { + dout(10) << "error initiating split on " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + + r = complete_split(path, info); + dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "." + << dendl; + if (r < 0) { + dout(10) << "error completing split on " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + } + + vector<string> subdirs; + r = list_subdirs(path, &subdirs); + if (r < 0) { + dout(10) << "error listing subdirs of " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + for (vector<string>::const_iterator it = subdirs.begin(); + it != subdirs.end(); ++it) { + vector<string> subdir_path(path); + subdir_path.push_back(*it); + r = split_dirs(subdir_path, target_level); + if (r < 0) { + return r; + } + } + + return r; +} + +int HashIndex::apply_layout_settings(int target_level) { + vector<string> path; + dout(10) << __func__ << " split multiple = " << split_multiplier + << " merge threshold = " << merge_threshold + << " split rand factor = " << cct->_conf->filestore_split_rand_factor + << " target level = " << target_level + << dendl; + int r = write_settings(); + if (r < 0) + return r; + return split_dirs(path, target_level); +} + +int HashIndex::_init() { + subdir_info_s info; + vector<string> path; + int r = set_info(path, info); + if (r < 0) + return r; + return write_settings(); +} + +int HashIndex::write_settings() { + if (cct->_conf->filestore_split_rand_factor > 0) { + settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor; + } else { + settings.split_rand_factor = 0; + } + vector<string> path; + bufferlist bl; + settings.encode(bl); + return add_attr_path(path, SETTINGS_ATTR, bl); +} + +int HashIndex::read_settings() { + vector<string> path; + bufferlist bl; + int r = get_attr_path(path, SETTINGS_ATTR, bl); + if (r == -ENODATA) + return 0; + if (r < 0) { + derr << __func__ << " error reading settings: " << 
cpp_strerror(r) << dendl; + return r; + } + auto it = bl.cbegin(); + settings.decode(it); + dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl; + return 0; +} + +/* LFNIndex virtual method implementations */ +int HashIndex::_created(const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name) { + subdir_info_s info; + int r; + r = get_info(path, &info); + if (r < 0) + return r; + info.objs++; + r = set_info(path, info); + if (r < 0) + return r; + + if (must_split(info)) { + dout(1) << __func__ << " " << path << " has " << info.objs + << " objects, starting split in pg " << coll() << "." << dendl; + int r = initiate_split(path, info); + if (r < 0) + return r; + r = complete_split(path, info); + dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "." + << dendl; + return r; + } else { + return 0; + } +} + +int HashIndex::_remove(const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name) { + int r; + r = remove_object(path, oid); + if (r < 0) + return r; + subdir_info_s info; + r = get_info(path, &info); + if (r < 0) + return r; + info.objs--; + r = set_info(path, info); + if (r < 0) + return r; + if (must_merge(info)) { + r = initiate_merge(path, info); + if (r < 0) + return r; + return complete_merge(path, info); + } else { + return 0; + } +} + +int HashIndex::_lookup(const ghobject_t &oid, + vector<string> *path, + string *mangled_name, + int *hardlink) { + vector<string> path_comp; + get_path_components(oid, &path_comp); + vector<string>::iterator next = path_comp.begin(); + int exists; + while (1) { + int r = path_exists(*path, &exists); + if (r < 0) + return r; + if (!exists) { + if (path->empty()) + return -ENOENT; + path->pop_back(); + break; + } + if (next == path_comp.end()) + break; + path->push_back(*(next++)); + } + return get_mangled_name(*path, oid, mangled_name, hardlink); +} + +int HashIndex::_collection_list_partial(const ghobject_t &start, + 
                                        const ghobject_t &end,
                                        int max_count,
                                        vector<ghobject_t> *ls,
                                        ghobject_t *next) {
  vector<string> path;
  ghobject_t _next;
  // Callers may pass next == NULL; use a local so list_by_hash always has
  // a cursor to advance.
  if (!next)
    next = &_next;
  *next = start;
  dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
  return list_by_hash(path, end, max_count, next, ls);
}

// Remove the entire index tree (but keep the root directory itself,
// see _recursive_remove with top == true).
int HashIndex::prep_delete() {
  return recursive_remove(vector<string>());
}

// Pre-create the hash-directory tree for a collection that is expected to
// hold expected_num_objs objects, so splits do not happen at runtime.
int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
  int ret;
  vector<string> path;
  subdir_info_s root_info;
  // Make sure there is neither objects nor sub-folders
  // in this collection
  // NOTE(review): only the presence/validity of the root info xattr is
  // checked here; the emptiness claim above is not verified by this code.
  ret = get_info(path, &root_info);
  if (ret < 0)
    return ret;

  // Do the folder splitting first
  ret = pre_split_folder(pg_num, expected_num_objs);
  if (ret < 0)
    return ret;
  // Initialize the folder info starting from root
  return init_split_folder(path, 0);
}

// Compute how deep and how wide the directory tree must be for the
// expected object count, and create the directories.  No-op when merging
// is enabled or the expected count fits in one leaf.
int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
{
  // If folder merging is enabled (by setting the threshold positive),
  // no need to split
  if (merge_threshold > 0)
    return 0;
  const coll_t c = coll();
  // Do not split if the expected number of objects in this collection is zero (by default)
  if (expected_num_objs == 0)
    return 0;

  // Calculate the number of leaf folders (which actually store files)
  // need to be created
  const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
  // "leavies" = number of leaf directories needed (sic).
  uint64_t leavies = expected_num_objs / objs_per_folder ;
  // No need to split
  if (leavies == 0 || expected_num_objs == objs_per_folder)
    return 0;

  spg_t spgid;
  // Only PG collections have a hash-prefixed layout we can pre-split.
  if (!c.is_pg_prefix(&spgid))
    return -EINVAL;
  const ps_t ps = spgid.pgid.ps();

  // the most significant bits of pg_num
  const int pg_num_bits = calc_num_bits(pg_num - 1);
  ps_t tmp_id = ps;
  // calculate the number of levels we only create one sub folder
  int num = pg_num_bits / 4;
  // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
  // so that splitting starts at level 3
  if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
    --num;
  }

  int ret;
  // Start with creation that only has one subfolder
  // (these levels are fully determined by the PG's own hash nibbles).
  vector<string> paths;
  int dump_num = num;
  while (num-- > 0) {
    ps_t v = tmp_id & 0x0000000f;
    paths.push_back(to_hex(v));
    ret = create_path(paths);
    if (ret < 0 && ret != -EEXIST)
      return ret;
    tmp_id = tmp_id >> 4;
  }

  // Starting from here, we can split by creating multiple subfolders
  const int left_bits = pg_num_bits - dump_num * 4;
  // this variable denotes how many bits (for this level) that can be
  // used for sub folder splitting
  int split_bits = 4 - left_bits;
  // the below logic is inspired by rados.h#ceph_stable_mod,
  // it basically determines how many sub-folders should we
  // create for splitting
  ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
  if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
    ++split_bits;
  }
  const uint32_t subs = (1 << split_bits);
  // Calculate how many levels we create starting from here
  int level = 0;
  int level_limit = MAX_HASH_LEVEL - dump_num - 1;
  uint64_t actual_leaves = subs;
  while (actual_leaves < leavies && level < level_limit) {
    ++level;
    actual_leaves <<= 4;  // each extra level multiplies leaves by 16
  }
  for (uint32_t i = 0; i < subs; ++i) {
    ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
    int v = tmp_id | (i << ((4 - split_bits) % 4));
    paths.push_back(to_hex(v));
    ret = create_path(paths);
    if (ret < 0 && ret != -EEXIST)
      return ret;
    ret = recursive_create_path(paths, level);
    if (ret < 0)
      return ret;
    paths.pop_back();
  }
  return 0;
}

int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
{
  // Get the number of sub directories for the current path
  vector<string> subdirs;
  int ret = list_subdirs(path, &subdirs);
  if (ret < 0)
    return ret;
  subdir_info_s info;
  info.subdirs =
subdirs.size();
  info.hash_level = hash_level;
  ret = set_info(path, info);
  if (ret < 0)
    return ret;
  // Persist the directory before descending so the info xattr is durable.
  ret = fsync_dir(path);
  if (ret < 0)
    return ret;

  // Do the same for subdirs
  vector<string>::const_iterator iter;
  for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
    path.push_back(*iter);
    ret = init_split_folder(path, hash_level + 1);
    if (ret < 0)
      return ret;
    path.pop_back();
  }
  return 0;
}

// Create a full 16-way subtree of |level| additional levels under |path|.
// |path| is used as scratch but restored before returning.
int HashIndex::recursive_create_path(vector<string>& path, int level)
{
  if (level == 0)
    return 0;
  for (int i = 0; i < 16; ++i) {
    path.push_back(to_hex(i));
    int ret = create_path(path);
    if (ret < 0 && ret != -EEXIST)  // already-existing dirs are fine
      return ret;
    ret = recursive_create_path(path, level - 1);
    if (ret < 0)
      return ret;
    path.pop_back();
  }
  return 0;
}

// Public entry point: remove everything below |path| but keep |path| itself.
int HashIndex::recursive_remove(const vector<string> &path) {
  return _recursive_remove(path, true);
}

// Depth-first removal of the subtree at |path|.  Fails with -ENOTEMPTY if
// any directory still contains objects.  When top is true the directory
// |path| itself is preserved (only its children are removed).
int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
  vector<string> subdirs;
  dout(20) << __func__ << " path=" << path << dendl;
  int r = list_subdirs(path, &subdirs);
  if (r < 0)
    return r;
  map<string, ghobject_t> objects;
  r = list_objects(path, 0, 0, &objects);
  if (r < 0)
    return r;
  if (!objects.empty())
    return -ENOTEMPTY;
  vector<string> subdir(path);
  for (vector<string>::iterator i = subdirs.begin();
       i != subdirs.end();
       ++i) {
    subdir.push_back(*i);
    r = _recursive_remove(subdir, false);
    if (r < 0)
      return r;
    subdir.pop_back();
  }
  if (top)
    return 0;
  else
    return remove_path(path);
}

// Tag the index root with an in-progress COL_SPLIT marker and fsync it,
// so a crash mid-operation can be detected and recovered on restart.
int HashIndex::start_col_split(const vector<string> &path) {
  bufferlist bl;
  InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
  op_tag.encode(bl);
  int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
  if (r < 0)
    return r;
  return fsync_dir(vector<string>());
}

// Tag the index root with an in-progress SPLIT marker for |path|.
int HashIndex::start_split(const vector<string> &path) {
  bufferlist bl;
  InProgressOp op_tag(InProgressOp::SPLIT, path);
  op_tag.encode(bl);
  int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
  if (r < 0)
    return r;
  return fsync_dir(vector<string>());
}

// Tag the index root with an in-progress MERGE marker for |path|.
int HashIndex::start_merge(const vector<string> &path) {
  bufferlist bl;
  InProgressOp op_tag(InProgressOp::MERGE, path);
  op_tag.encode(bl);
  int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
  if (r < 0)
    return r;
  return fsync_dir(vector<string>());
}

// Clear the in-progress marker from the index root.  |path| is unused;
// the tag always lives on the root directory.
int HashIndex::end_split_or_merge(const vector<string> &path) {
  return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
}

// Read and decode the subdir_info_s xattr of the directory at |path|.
int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
  bufferlist buf;
  int r = get_attr_path(path, SUBDIR_ATTR, buf);
  if (r < 0)
    return r;
  auto bufiter = buf.cbegin();
  info->decode(bufiter);
  // The stored hash level must match the directory's actual depth.
  ceph_assert(path.size() == (unsigned)info->hash_level);
  return 0;
}

// Encode and write the subdir_info_s xattr of the directory at |path|.
int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
  bufferlist buf;
  ceph_assert(path.size() == (unsigned)info.hash_level);
  info.encode(buf);
  return add_attr_path(path, SUBDIR_ATTR, buf);
}

// A subdir is merged into its parent when merging is enabled
// (merge_threshold > 0), it is not the root, it has no subdirs of its
// own, and its object count has dropped below the threshold.
bool HashIndex::must_merge(const subdir_info_s &info) {
  return (info.hash_level > 0 &&
          merge_threshold > 0 &&
          info.objs < (unsigned)merge_threshold &&
          info.subdirs == 0);
}

bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
  // target_level is used by ceph-objectstore-tool to split dirs offline.
  // If it is set (default is 0) and the current hash level < target_level,
  // this dir is split no matter how many objects it holds.
  return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
          ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
           (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
}

// Phase 1 of a merge: durably record that a merge of |path| is underway.
int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
  return start_merge(path);
}

// Phase 2 of a merge: move the objects of |path| into its parent, drop the
// emptied directory, and cascade the merge upward if the parent now also
// qualifies.  Designed to be re-runnable after a crash: if |path| no longer
// exists the move/remove steps are skipped.
int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
  vector<string> dst = path;
  dst.pop_back();  // parent directory
  subdir_info_s dstinfo;
  int r, exists;
  r = path_exists(path, &exists);
  if (r < 0)
    return r;
  r = get_info(dst, &dstinfo);
  if (r < 0)
    return r;
  if (exists) {
    r = move_objects(path, dst);
    if (r < 0)
      return r;
    // Recount the parent's contents now that it absorbed our objects.
    r = reset_attr(dst);
    if (r < 0)
      return r;
    r = remove_path(path);
    if (r < 0)
      return r;
  }
  if (must_merge(dstinfo)) {
    // Parent is itself below threshold: chain the merge one level up.
    r = initiate_merge(dst, dstinfo);
    if (r < 0)
      return r;
    r = fsync_dir(dst);
    if (r < 0)
      return r;
    return complete_merge(dst, dstinfo);
  }
  r = fsync_dir(dst);
  if (r < 0)
    return r;
  return end_split_or_merge(path);
}

// Phase 1 of a split: durably record that a split of |path| is underway.
int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
  return start_split(path);
}

// Phase 2 of a split: distribute the objects of |path| into per-nibble
// child directories.  The ordering of link / fsync / set_info below is
// deliberate — the presence of a child's info xattr implies all of its
// objects were fully copied, which is what makes a crashed split
// re-runnable.  Children whose share of objects would immediately merge
// back are left in place.
int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
  int level = info.hash_level;
  map<string, ghobject_t> objects;
  vector<string> dst = path;
  int r;
  dst.push_back("");  // placeholder for the child component, set per bucket
  r = list_objects(path, 0, 0, &objects);
  if (r < 0)
    return r;
  vector<string> subdirs_vec;
  r = list_subdirs(path, &subdirs_vec);
  if (r < 0)
    return r;
  set<string> subdirs;
  subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
  // Bucket each object by the hash nibble that selects its child dir.
  map<string, map<string, ghobject_t> > mapped;
  map<string, ghobject_t> moved;
  int num_moved = 0;
  for (map<string, ghobject_t>::iterator i = objects.begin();
       i != objects.end();
       ++i) {
    vector<string> new_path;
    get_path_components(i->second, &new_path);
    mapped[new_path[level]][i->first] = i->second;
  }
  for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
       i != mapped.end();
       ) {
    dst[level] = i->first;
    /* If the info already exists, it must be correct,
     * we may be picking up a partially finished split */
    subdir_info_s temp;
    // subdir has already been fully copied
    if (subdirs.count(i->first) && !get_info(dst, &temp)) {
      for (map<string, ghobject_t>::iterator j = i->second.begin();
           j != i->second.end();
           ++j) {
        moved[j->first] = j->second;
        num_moved++;
        objects.erase(j->first);
      }
      ++i;
      continue;
    }

    subdir_info_s info_new;
    info_new.objs = i->second.size();
    info_new.subdirs = 0;
    info_new.hash_level = level + 1;
    // Don't create a child that would immediately be merged back.
    if (must_merge(info_new) && !subdirs.count(i->first)) {
      mapped.erase(i++);
      continue;
    }

    // Subdir doesn't yet exist
    if (!subdirs.count(i->first)) {
      info.subdirs += 1;
      r = create_path(dst);
      if (r < 0)
        return r;
    } // else subdir has been created but only partially copied

    for (map<string, ghobject_t>::iterator j = i->second.begin();
         j != i->second.end();
         ++j) {
      moved[j->first] = j->second;
      num_moved++;
      objects.erase(j->first);
      r = link_object(path, dst, j->second, j->first);
      // May be a partially finished split
      if (r < 0 && r != -EEXIST) {
        return r;
      }
    }

    r = fsync_dir(dst);
    if (r < 0)
      return r;

    // Presence of info must imply that all objects have been copied
    r = set_info(dst, info_new);
    if (r < 0)
      return r;

    r = fsync_dir(dst);
    if (r < 0)
      return r;

    ++i;
  }
  // Unlink the originals only after every child was durably populated.
  r = remove_objects(path, moved, &objects);
  if (r < 0)
    return r;
  info.objs = objects.size();
  r = reset_attr(path);
  if (r < 0)
    return r;
  r = fsync_dir(path);
  if (r < 0)
    return r;
  return end_split_or_merge(path);
}

void HashIndex::get_path_components(const ghobject_t &oid,
                                    vector<string> *path) {
  char buf[MAX_HASH_LEVEL + 1];
  snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());

  // Path components are the hex characters of
oid.hobj.hash, least
  // significant first
  for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
    path->push_back(string(&buf[i], 1));
  }
}

// Render |hash| as its 8 hex digits, least-significant nibble first
// (the same order the directory levels use).
string HashIndex::get_hash_str(uint32_t hash) {
  char buf[MAX_HASH_LEVEL + 1];
  // "%.*X" prints exactly MAX_HASH_LEVEL upper-case hex digits.
  snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
  string retval;
  for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
    retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
  }
  return retval;
}

// Hash-path string for an object; undefined for the max sentinel object.
string HashIndex::get_path_str(const ghobject_t &oid) {
  ceph_assert(!oid.is_max());
  return get_hash_str(oid.hobj.get_hash());
}

// Inverse of get_hash_str for a (possibly short) prefix: pad with '0',
// parse as hex, then nibble-reverse back into hash order.
uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
  while (prefix.size() < sizeof(uint32_t) * 2) {
    prefix.push_back('0');
  }
  uint32_t hash;
  sscanf(prefix.c_str(), "%x", &hash);
  // nibble reverse
  hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
  hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
  hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
  return hash;
}

// Collect, for one directory level: the hash prefixes of all objects and
// subdirs (bitwise-sorted) plus the (prefix, object) pairs themselves,
// skipping anything strictly before *next_object.
int HashIndex::get_path_contents_by_hash_bitwise(
  const vector<string> &path,
  const ghobject_t *next_object,
  set<string, CmpHexdigitStringBitwise> *hash_prefixes,
  set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
{
  map<string, ghobject_t> rev_objects;
  int r;
  r = list_objects(path, 0, 0, &rev_objects);
  if (r < 0)
    return r;
  // bitwise sort
  for (map<string, ghobject_t>::iterator i = rev_objects.begin();
       i != rev_objects.end();
       ++i) {
    if (next_object && i->second < *next_object)
      continue;
    string hash_prefix = get_path_str(i->second);
    hash_prefixes->insert(hash_prefix);
    objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
  }
  vector<string> subdirs;
  r = list_subdirs(path, &subdirs);
  if (r < 0)
    return r;

  // sort subdirs bitwise (by reversing hex digit nibbles)
  std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);

  // Local to this function, we will convert the prefix strings
  // (previously simply the reversed hex digits) to also have each
  // digit's nibbles reversed. This will make the strings sort
  // bitwise.
  string cur_prefix;
  for (vector<string>::const_iterator i = path.begin();
       i != path.end();
       ++i) {
    cur_prefix.append(reverse_hexdigit_bits_string(*i));
  }
  string next_object_string;
  if (next_object)
    next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
  for (vector<string>::iterator i = subdirs.begin();
       i != subdirs.end();
       ++i) {
    string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
    if (next_object) {
      if (next_object->is_max())
        continue;
      // Skip subdirs entirely before the resume point.
      if (candidate < next_object_string.substr(0, candidate.size()))
        continue;
    }
    // re-reverse the hex digit nibbles for the caller
    hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
  }
  return 0;
}

// Dispatch to the bitwise listing implementation.
int HashIndex::list_by_hash(const vector<string> &path,
                            const ghobject_t &end,
                            int max_count,
                            ghobject_t *next,
                            vector<ghobject_t> *out)
{
  ceph_assert(out);
  return list_by_hash_bitwise(path, end, max_count, next, out);
}

// Recursive bitwise-ordered listing.  For each prefix at this level:
// either the objects at this level are emitted directly, or (when the
// prefix names a subdir) the function recurses one level deeper.  Stops
// at |end| or after max_count entries, leaving *next as the resume cursor;
// *next is set to the max object once the whole subtree is exhausted.
int HashIndex::list_by_hash_bitwise(
  const vector<string> &path,
  const ghobject_t& end,
  int max_count,
  ghobject_t *next,
  vector<ghobject_t> *out)
{
  vector<string> next_path = path;
  next_path.push_back("");
  set<string, CmpHexdigitStringBitwise> hash_prefixes;
  set<pair<string, ghobject_t>, CmpPairBitwise> objects;
  int r = get_path_contents_by_hash_bitwise(path,
                                            next,
                                            &hash_prefixes,
                                            &objects);
  if (r < 0)
    return r;
  for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
       i != hash_prefixes.end();
       ++i) {
    dout(20) << __func__ << " prefix " << *i << dendl;
    set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
      make_pair(*i, ghobject_t()));
    if (j == objects.end() || j->first != *i) {
      // No object carries this prefix at this level: it is a subdir.
      // Recurse using the prefix's last hex digit as the next component.
      *(next_path.rbegin()) = *(i->rbegin());
      ghobject_t next_recurse;
      if (next)
        next_recurse = *next;
      r = list_by_hash_bitwise(next_path,
                               end,
                               max_count,
                               &next_recurse,
                               out);

      if (r < 0)
        return r;
      if (!next_recurse.is_max()) {
        // Recursion stopped early (count or end reached): propagate cursor.
        if (next)
          *next = next_recurse;
        return 0;
      }
    } else {
      while (j != objects.end() && j->first == *i) {
        if (max_count > 0 && out->size() == (unsigned)max_count) {
          if (next)
            *next = j->second;
          return 0;
        }
        if (j->second >= end) {
          if (next)
            *next = j->second;
          return 0;
        }
        if (!next || j->second >= *next) {
          dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
          out->push_back(j->second);
        }
        ++j;
      }
    }
  }
  if (next)
    *next = ghobject_t::get_max();
  return 0;
}

diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h
new file mode 100644
index 00000000..7e34d155
--- /dev/null
+++ b/src/os/filestore/HashIndex.h
@@ -0,0 +1,462 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_HASHINDEX_H
#define CEPH_HASHINDEX_H

#include "include/buffer_fwd.h"
#include "include/encoding.h"
#include "LFNIndex.h"

extern string reverse_hexdigit_bits_string(string l);

/**
 * Implements collection prehashing.
 *
 * @verbatim
 *   (root) - 0 - 0
 *              - 1
 *              - E
 *          - 1
 *          - 2 - D - 0
 *              .
 *              .
 *              .
 *          - F - 0
 * @endverbatim
 *
 * A file is located at the longest existing directory from the root
 * given by the hex characters in the hash beginning with the least
 * significant.
 *
 * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
 * would be located in (root)/2/D/0/
 *
 * Subdirectories are created when the number of objects in a
 * directory exceed 16 * (abs(merge_threshold) * split_multiplier +
 * split_rand_factor). The number of objects in a directory is encoded
 * as subdir_info_s in an xattr on the directory.
 */
class HashIndex : public LFNIndex {
private:
  /// Attribute name for storing subdir info @see subdir_info_s
  static const string SUBDIR_ATTR;
  /// Attribute name for storing index-wide settings
  static const string SETTINGS_ATTR;
  /// Attribute name for storing in progress op tag
  static const string IN_PROGRESS_OP_TAG;
  /// Size (bits) in object hash
  static const int PATH_HASH_LEN = 32;
  /// Max length of hashed path
  static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);

  /**
   * Merges occur when the number of object drops below
   * merge_threshold and splits occur when the number of objects
   * exceeds:
   *
   *   16 * (abs(merge_threshold) * split_multiplier + split_rand_factor)
   *
   * Please note if merge_threshold is less than zero, it will never
   * do merging
   */
  int merge_threshold;
  int split_multiplier;

  /// Encodes current subdir state for determining when to split/merge.
  struct subdir_info_s {
    uint64_t objs;       ///< Objects in subdir.
    uint32_t subdirs;    ///< Subdirs in subdir.
    uint32_t hash_level; ///< Hashlevel of subdir.

    subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}

    void encode(bufferlist &bl) const
    {
      using ceph::encode;
      __u8 v = 1;   // struct version, bump on layout change
      encode(v, bl);
      encode(objs, bl);
      encode(subdirs, bl);
      encode(hash_level, bl);
    }

    void decode(bufferlist::const_iterator &bl)
    {
      using ceph::decode;
      __u8 v;
      decode(v, bl);
      ceph_assert(v == 1);
      decode(objs, bl);
      decode(subdirs, bl);
      decode(hash_level, bl);
    }
  };

  /// Index-wide tunables persisted on the collection root.
  struct settings_s {
    uint32_t split_rand_factor; ///< random factor added to split threshold (only on root of collection)
    settings_s() : split_rand_factor(0) {}
    void encode(bufferlist &bl) const
    {
      using ceph::encode;
      __u8 v = 1;   // struct version
      encode(v, bl);
      encode(split_rand_factor, bl);
    }
    void decode(bufferlist::const_iterator &bl)
    {
      using ceph::decode;
      __u8 v;
      decode(v, bl);
      decode(split_rand_factor, bl);
    }
  } settings;

  /// Encodes in progress split or merge
  struct InProgressOp {
    static const int SPLIT = 0;
    static const int MERGE = 1;
    static const int COL_SPLIT = 2;
    int op;               ///< one of SPLIT/MERGE/COL_SPLIT
    vector<string> path;  ///< directory the operation targets

    InProgressOp(int op, const vector<string> &path)
      : op(op), path(path) {}

    explicit InProgressOp(bufferlist::const_iterator &bl) {
      decode(bl);
    }

    bool is_split() const { return op == SPLIT; }
    bool is_col_split() const { return op == COL_SPLIT; }
    bool is_merge() const { return op == MERGE; }

    void encode(bufferlist &bl) const {
      using ceph::encode;
      __u8 v = 1;   // struct version
      encode(v, bl);
      encode(op, bl);
      encode(path, bl);
    }

    void decode(bufferlist::const_iterator &bl) {
      using ceph::decode;
      __u8 v;
      decode(v, bl);
      ceph_assert(v == 1);
      decode(op, bl);
      decode(path, bl);
    }
  };


public:
  /// Constructor.
  HashIndex(
    CephContext* cct,
    coll_t collection,      ///< [in] Collection
    const char *base_path,  ///< [in] Path to the index root.
    int merge_at,           ///< [in] Merge threshold.
    int split_multiple,     ///< [in] Split threshold.
    uint32_t index_version, ///< [in] Index version
    double retry_probability=0) ///< [in] retry probability
    : LFNIndex(cct, collection, base_path, index_version, retry_probability),
      merge_threshold(merge_at),
      split_multiplier(split_multiple)
  {}

  int read_settings() override;

  /// @see CollectionIndex
  uint32_t collection_version() override { return index_version; }

  /// @see CollectionIndex
  int cleanup() override;

  /// @see CollectionIndex
  int prep_delete() override;

  /// @see CollectionIndex
  int _split(
    uint32_t match,
    uint32_t bits,
    CollectionIndex* dest
    ) override;

  /// @see CollectionIndex
  int _merge(
    uint32_t bits,
    CollectionIndex* dest
    ) override;

  int _merge_dirs(
    HashIndex& from,
    HashIndex& to,
    const vector<string>& path);

  /// @see CollectionIndex
  int apply_layout_settings(int target_level) override;

protected:
  int _init() override;

  int _created(
    const vector<string> &path,
    const ghobject_t &oid,
    const string &mangled_name
    ) override;
  int _remove(
    const vector<string> &path,
    const ghobject_t &oid,
    const string &mangled_name
    ) override;
  int _lookup(
    const ghobject_t &oid,
    vector<string> *path,
    string *mangled_name,
    int *hardlink
    ) override;

  /**
   * Pre-hash the collection to create folders according to the expected number
   * of objects in this collection.
   */
  int _pre_hash_collection(
    uint32_t pg_num,
    uint64_t expected_num_objs
    ) override;

  int _collection_list_partial(
    const ghobject_t &start,
    const ghobject_t &end,
    int max_count,
    vector<ghobject_t> *ls,
    ghobject_t *next
    ) override;
private:
  /// Internal recursively remove path and its subdirs
  int _recursive_remove(
    const vector<string> &path, ///< [in] path to remove
    bool top                    ///< [in] internal tracking of first caller
    ); /// @return Error Code, 0 on success
  /// Recursively remove path and its subdirs
  int recursive_remove(
    const vector<string> &path ///< [in] path to remove
    ); /// @return Error Code, 0 on success
  /// Tag root directory at beginning of col_split
  int start_col_split(
    const vector<string> &path ///< [in] path to split
    ); ///< @return Error Code, 0 on success
  /// Tag root directory at beginning of split
  int start_split(
    const vector<string> &path ///< [in] path to split
    ); ///< @return Error Code, 0 on success
  /// Tag root directory at beginning of merge
  int start_merge(
    const vector<string> &path ///< [in] path to merge
    ); ///< @return Error Code, 0 on success
  /// Remove tag at end of split or merge
  int end_split_or_merge(
    const vector<string> &path ///< [in] path to split or merged
    ); ///< @return Error Code, 0 on success
  /// Gets info from the xattr on the subdir represented by path
  int get_info(
    const vector<string> &path, ///< [in] Path from which to read attribute.
    subdir_info_s *info         ///< [out] Attribute value
    ); /// @return Error Code, 0 on success

  /// Sets info to the xattr on the subdir represented by path
  int set_info(
    const vector<string> &path, ///< [in] Path on which to set attribute.
    const subdir_info_s &info   ///< [in] Value to set
    ); /// @return Error Code, 0 on success

  /// Encapsulates logic for when to merge.
  bool must_merge(
    const subdir_info_s &info ///< [in] Info to check
    ); /// @return True if info must be merged, False otherwise

  /// Encapsulates logic for when to split.
  bool must_split(
    const subdir_info_s &info, ///< [in] Info to check
    int target_level = 0
    ); /// @return True if info must be split, False otherwise

  /// Initiates merge
  int initiate_merge(
    const vector<string> &path, ///< [in] Subdir to merge
    subdir_info_s info          ///< [in] Info attached to path
    ); /// @return Error Code, 0 on success

  /// Completes merge
  int complete_merge(
    const vector<string> &path, ///< [in] Subdir to merge
    subdir_info_s info          ///< [in] Info attached to path
    ); /// @return Error Code, 0 on success

  /// Resets attr to match actual subdir contents
  int reset_attr(
    const vector<string> &path ///< [in] path to cleanup
    );

  /// Initiate Split
  int initiate_split(
    const vector<string> &path, ///< [in] Subdir to split
    subdir_info_s info          ///< [in] Info attached to path
    ); /// @return Error Code, 0 on success

  /// Completes Split
  int complete_split(
    const vector<string> &path, ///< [in] Subdir to split
    subdir_info_s info          ///< [in] Info attached to path
    ); /// @return Error Code, 0 on success

  /// Determine path components from hoid hash
  void get_path_components(
    const ghobject_t &oid, ///< [in] Object for which to get path components
    vector<string> *path   ///< [out] Path components for hoid.
    );

  /// Pre-hash and split folders to avoid runtime splitting
  /// according to the given expected object number.
  int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);

  /// Initialize the folder (dir info) with the given hash
  /// level and number of its subdirs.
  int init_split_folder(vector<string> &path, uint32_t hash_level);

  /// do collection split for path
  static int col_split_level(
    HashIndex &from,            ///< [in] from index
    HashIndex &dest,            ///< [in] to index
    const vector<string> &path, ///< [in] path to split
    uint32_t bits,              ///< [in] num bits to match
    uint32_t match,             ///< [in] bits to match
    unsigned *mkdirred          ///< [in,out] path[:mkdirred] has been mkdirred
    );


  /**
   * Get string representation of ghobject_t/hash
   *
   * e.g: 0x01234567 -> "76543210"
   */
  static string get_path_str(
    const ghobject_t &oid ///< [in] Object to get hash string for
    ); ///< @return Hash string for hoid.

  /// Get string from hash, @see get_path_str
  static string get_hash_str(
    uint32_t hash ///< [in] Hash to convert to a string.
    ); ///< @return String representation of hash

  /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00
  static uint32_t hash_prefix_to_hash(
    string prefix ///< [in] string to convert
    ); ///< @return Hash

  /// Get hash mod from path
  static void path_to_hobject_hash_prefix(
    const vector<string> &path,///< [in] path to convert
    uint32_t *bits,            ///< [out] bits
    uint32_t *hash             ///< [out] hash
    ) {
    string hash_str;
    for (vector<string>::const_iterator i = path.begin();
         i != path.end();
         ++i) {
      hash_str.push_back(*i->begin());
    }
    uint32_t rev_hash = hash_prefix_to_hash(hash_str);
    if (hash)
      *hash = rev_hash;
    if (bits)
      *bits = path.size() * 4;  // one hex digit per path level
  }

  /// Calculate the number of bits.
  static int calc_num_bits(uint64_t n) {
    int ret = 0;
    while (n > 0) {
      n = n >> 1;
      ret++;
    }
    return ret;
  }

  /// Convert a number to hex string (upper case).
  static string to_hex(int n) {
    ceph_assert(n >= 0 && n < 16);
    char c = (n <= 9 ?
('0' + n) : ('A' + n - 10));
    string str;
    str.append(1, c);
    return str;
  }

  /// Orders (prefix, object) pairs: by prefix first, then bitwise object order.
  struct CmpPairBitwise {
    bool operator()(const pair<string, ghobject_t>& l,
                    const pair<string, ghobject_t>& r) const
    {
      if (l.first < r.first)
        return true;
      if (l.first > r.first)
        return false;
      if (cmp(l.second, r.second) < 0)
        return true;
      return false;
    }
  };

  /// Orders hex-digit strings as if each digit's nibble bits were reversed.
  struct CmpHexdigitStringBitwise {
    bool operator()(const string& l, const string& r) const {
      return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
    }
  };

  /// Get path contents by hash
  int get_path_contents_by_hash_bitwise(
    const vector<string> &path,                           /// [in] Path to list
    const ghobject_t *next_object,                        /// [in] list > *next_object
    set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
    set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
    );

  /// List objects in collection in ghobject_t order
  int list_by_hash(
    const vector<string> &path, /// [in] Path to list
    const ghobject_t &end,      /// [in] List only objects < end
    int max_count,              /// [in] List at most max_count
    ghobject_t *next,           /// [in,out] List objects >= *next
    vector<ghobject_t> *out     /// [out] Listed objects
    ); ///< @return Error Code, 0 on success
  /// List objects in collection in ghobject_t order
  int list_by_hash_bitwise(
    const vector<string> &path, /// [in] Path to list
    const ghobject_t &end,      /// [in] List only objects < end
    int max_count,              /// [in] List at most max_count
    ghobject_t *next,           /// [in,out] List objects >= *next
    vector<ghobject_t> *out     /// [out] Listed objects
    ); ///< @return Error Code, 0 on success

  /// Create the given levels of sub directories from the given root.
  /// The contents of *path* is not changed after calling this function.
+ int recursive_create_path(vector<string>& path, int level); + + /// split each dir below the given path + int split_dirs(const vector<string> &path, int target_level = 0); + + int write_settings(); +}; + +#endif diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc new file mode 100644 index 00000000..73095026 --- /dev/null +++ b/src/os/filestore/IndexManager.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/unordered_map.h" + +#if defined(__FreeBSD__) +#include <sys/param.h> +#endif + +#include <errno.h> + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/buffer.h" + +#include "IndexManager.h" +#include "HashIndex.h" +#include "CollectionIndex.h" + +#include "chain_xattr.h" + +static int set_version(const char *path, uint32_t version) { + bufferlist bl; + encode(version, bl); + return chain_setxattr<true, true>( + path, "user.cephos.collection_version", bl.c_str(), + bl.length()); +} + +static int get_version(const char *path, uint32_t *version) { + bufferptr bp(PATH_MAX); + int r = chain_getxattr(path, "user.cephos.collection_version", + bp.c_str(), bp.length()); + if (r < 0) { + if (r != -ENOENT) { + *version = 0; + return 0; + } else { + return r; + } + } + bp.set_length(r); + bufferlist bl; + bl.push_back(bp); + auto i = bl.cbegin(); + decode(*version, i); + return 0; +} + +IndexManager::~IndexManager() { + + for (ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.begin(); + it != col_indices.end(); ++it) { + + 
delete it->second; + it->second = NULL; + } + col_indices.clear(); +} + + +int IndexManager::init_index(coll_t c, const char *path, uint32_t version) { + RWLock::WLocker l(lock); + int r = set_version(path, version); + if (r < 0) + return r; + HashIndex index(cct, c, path, cct->_conf->filestore_merge_threshold, + cct->_conf->filestore_split_multiple, + version, + cct->_conf->filestore_index_retry_probability); + r = index.init(); + if (r < 0) + return r; + return index.read_settings(); +} + +int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) { + if (upgrade) { + // Need to check the collection generation + int r; + uint32_t version = 0; + r = get_version(path, &version); + if (r < 0) + return r; + + switch (version) { + case CollectionIndex::FLAT_INDEX_TAG: + case CollectionIndex::HASH_INDEX_TAG: // fall through + case CollectionIndex::HASH_INDEX_TAG_2: // fall through + case CollectionIndex::HOBJECT_WITH_POOL: { + // Must be a HashIndex + *index = new HashIndex(cct, c, path, + cct->_conf->filestore_merge_threshold, + cct->_conf->filestore_split_multiple, + version); + return (*index)->read_settings(); + } + default: ceph_abort(); + } + + } else { + // No need to check + *index = new HashIndex(cct, c, path, cct->_conf->filestore_merge_threshold, + cct->_conf->filestore_split_multiple, + CollectionIndex::HOBJECT_WITH_POOL, + cct->_conf->filestore_index_retry_probability); + return (*index)->read_settings(); + } +} + +bool IndexManager::get_index_optimistic(coll_t c, Index *index) { + RWLock::RLocker l(lock); + ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c); + if (it == col_indices.end()) + return false; + index->index = it->second; + return true; +} + +int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) { + if (get_index_optimistic(c, index)) + return 0; + RWLock::WLocker l(lock); + ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c); + if (it == 
col_indices.end()) { + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str()); + CollectionIndex* colIndex = NULL; + int r = build_index(c, path, &colIndex); + if (r < 0) + return r; + col_indices[c] = colIndex; + index->index = colIndex; + } else { + index->index = it->second; + } + return 0; +} diff --git a/src/os/filestore/IndexManager.h b/src/os/filestore/IndexManager.h new file mode 100644 index 00000000..19cd2926 --- /dev/null +++ b/src/os/filestore/IndexManager.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef OS_INDEXMANAGER_H +#define OS_INDEXMANAGER_H + +#include "include/unordered_map.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/config.h" +#include "common/debug.h" + +#include "CollectionIndex.h" +#include "HashIndex.h" + + +/// Public type for Index +struct Index { + CollectionIndex *index; + + Index() : index(NULL) {} + explicit Index(CollectionIndex* index) : index(index) {} + + CollectionIndex *operator->() { return index; } + CollectionIndex &operator*() { return *index; } +}; + + +/** + * Encapsulates mutual exclusion for CollectionIndexes. + * + * Allowing a modification (removal or addition of an object) to occur + * while a read is occurring (lookup of an object's path and use of + * that path) may result in the path becoming invalid. Thus, during + * the lifetime of a CollectionIndex object and any paths returned + * by it, no other concurrent accesses may be allowed. 
+ * This is enforced by using CollectionIndex::access_lock + */ +class IndexManager { + CephContext* cct; + RWLock lock; ///< Lock for Index Manager + bool upgrade; + ceph::unordered_map<coll_t, CollectionIndex* > col_indices; + + /** + * Index factory + * + * Encapsulates logic for handling legacy FileStore + * layouts + * + * @param [in] c Collection for which to get index + * @param [in] path Path to collection + * @param [out] index Index for c + * @return error code + */ + int build_index(coll_t c, const char *path, CollectionIndex **index); + bool get_index_optimistic(coll_t c, Index *index); +public: + /// Constructor + explicit IndexManager(CephContext* cct, + bool upgrade) : cct(cct), + lock("IndexManager lock"), + upgrade(upgrade) {} + + ~IndexManager(); + + /** + * Reserve and return index for c + * + * @param [in] c Collection for which to get index + * @param [in] baseDir base directory of collections + * @param [out] index Index for c + * @return error code + */ + int get_index(coll_t c, const string& baseDir, Index *index); + + /** + * Initialize index for collection c at path + * + * @param [in] c Collection for which to init Index + * @param [in] path Path to collection + * @param [in] filestore_version version of containing FileStore + * @return error code + */ + int init_index(coll_t c, const char *path, uint32_t filestore_version); +}; + +#endif diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h new file mode 100644 index 00000000..cfb667d8 --- /dev/null +++ b/src/os/filestore/Journal.h @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_JOURNAL_H +#define CEPH_JOURNAL_H + +#include <errno.h> + +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "common/Finisher.h" +#include "common/TrackedOp.h" +#include "os/ObjectStore.h" +#include "common/zipkin_trace.h" + +class PerfCounters; + +class Journal { +protected: + uuid_d fsid; + Finisher *finisher; +public: + CephContext* cct; + PerfCounters *logger; +protected: + Cond *do_sync_cond; + bool wait_on_full; + +public: + Journal(CephContext* cct, uuid_d f, Finisher *fin, Cond *c=0) : + fsid(f), finisher(fin), cct(cct), logger(NULL), + do_sync_cond(c), + wait_on_full(false) { } + virtual ~Journal() { } + + virtual int check() = 0; ///< check if journal appears valid + virtual int create() = 0; ///< create a fresh journal + virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal + virtual void close() = 0; ///< close an open journal + + virtual void flush() = 0; + + virtual void get_devices(set<string> *ls) {} + virtual void collect_metadata(map<string,string> *pm) {} + /** + * reserve_throttle_and_backoff + * + * Implementation may throttle or backoff based on ops + * reserved here but not yet released using committed_thru. 
+ */ + virtual void reserve_throttle_and_backoff(uint64_t count) = 0; + + virtual int dump(ostream& out) { return -EOPNOTSUPP; } + + void set_wait_on_full(bool b) { wait_on_full = b; } + + // writes + virtual bool is_writeable() = 0; + virtual int make_writeable() = 0; + virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, + Context *oncommit, + TrackedOpRef osd_op = TrackedOpRef()) = 0; + virtual void commit_start(uint64_t seq) = 0; + virtual void committed_thru(uint64_t seq) = 0; + + /// Read next journal entry - asserts on invalid journal + virtual bool read_entry( + bufferlist &bl, ///< [out] payload on successful read + uint64_t &seq ///< [in,out] sequence number on last successful read + ) = 0; ///< @return true on successful read, false on journal end + + virtual bool should_commit_now() = 0; + + virtual int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) = 0; + + virtual off64_t get_journal_size_estimate() { return 0; } + + // reads/recovery + +}; + +#endif diff --git a/src/os/filestore/JournalThrottle.cc b/src/os/filestore/JournalThrottle.cc new file mode 100644 index 00000000..8475bbbf --- /dev/null +++ b/src/os/filestore/JournalThrottle.cc @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "JournalThrottle.h" +#include "include/ceph_assert.h" + +bool JournalThrottle::set_params( + double _low_threshhold, + double _high_threshhold, + double _expected_throughput, + double _high_multiple, + double _max_multiple, + uint64_t _throttle_max, + std::ostream *errstream) +{ + return throttle.set_params( + _low_threshhold, + _high_threshhold, + _expected_throughput, + _high_multiple, + _max_multiple, + _throttle_max, + errstream); +} + +std::chrono::duration<double> JournalThrottle::get(uint64_t c) +{ + return throttle.get(c); +} + +uint64_t JournalThrottle::take(uint64_t c) +{ + return throttle.take(c); +} + +void 
JournalThrottle::register_throttle_seq(uint64_t seq, uint64_t c) +{ + locker l(lock); + journaled_ops.push_back(std::make_pair(seq, c)); +} + +std::pair<uint64_t, uint64_t> JournalThrottle::flush(uint64_t mono_id) +{ + uint64_t to_put_bytes = 0; + uint64_t to_put_ops = 0; + { + locker l(lock); + while (!journaled_ops.empty() && + journaled_ops.front().first <= mono_id) { + to_put_bytes += journaled_ops.front().second; + to_put_ops++; + journaled_ops.pop_front(); + } + } + throttle.put(to_put_bytes); + return make_pair(to_put_ops, to_put_bytes); +} + +uint64_t JournalThrottle::get_current() +{ + return throttle.get_current(); +} + +uint64_t JournalThrottle::get_max() +{ + return throttle.get_max(); +} diff --git a/src/os/filestore/JournalThrottle.h b/src/os/filestore/JournalThrottle.h new file mode 100644 index 00000000..75485d6d --- /dev/null +++ b/src/os/filestore/JournalThrottle.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_JOURNAL_THROTTLE_H +#define CEPH_JOURNAL_THROTTLE_H + +#include "common/Throttle.h" + +#include <list> +#include <deque> +#include <condition_variable> +#include <thread> +#include <vector> +#include <chrono> +#include <iostream> + +/** + * JournalThrottle + * + * Throttle designed to implement dynamic throttling as the journal fills + * up. The goal is to not delay ops at all when the journal is relatively + * empty, delay ops somewhat as the journal begins to fill (with the delay + * getting linearly longer as the journal fills up to a high water mark), + * and to delay much more aggressively (though still linearly with usage) + * until we hit the max value. + * + * The implementation simply wraps BackoffThrottle with a queue of + * journaled but not synced ops. 
+ * + * The usage pattern is as follows: + * 1) Call get(seq, bytes) before taking the op_queue_throttle + * 2) Once the journal is flushed, flush(max_op_id_flushed) + */ +class JournalThrottle { + BackoffThrottle throttle; + + std::mutex lock; + /// deque<id, count> + std::deque<std::pair<uint64_t, uint64_t> > journaled_ops; + using locker = std::unique_lock<std::mutex>; + +public: + /** + * set_params + * + * Sets params. If the params are invalid, returns false + * and populates errstream (if non-null) with a user comprehensible + * explanation. + */ + bool set_params( + double low_threshhold, + double high_threshhold, + double expected_throughput, + double high_multiple, + double max_multiple, + uint64_t throttle_max, + std::ostream *errstream); + + /** + * gets specified throttle for id mono_id, waiting as necessary + * + * @param c [in] amount to take + * @return duration waited + */ + std::chrono::duration<double> get(uint64_t c); + + /** + * take + * + * Takes specified throttle without waiting + */ + uint64_t take(uint64_t c); + + /** + * register_throttle_seq + * + * Registers a sequence number with an amount of throttle to + * release upon flush() + * + * @param seq [in] seq + */ + void register_throttle_seq(uint64_t seq, uint64_t c); + + + /** + * Releases throttle held by ids <= mono_id + * + * @param mono_id [in] id up to which to flush + * @returns pair<ops_flushed, bytes_flushed> + */ + std::pair<uint64_t, uint64_t> flush(uint64_t mono_id); + + uint64_t get_current(); + uint64_t get_max(); + + JournalThrottle( + unsigned expected_concurrency ///< [in] determines size of conds + ) : throttle(g_ceph_context, "filestore_journal", expected_concurrency) {} +}; + +#endif diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc new file mode 100644 index 00000000..714d0935 --- /dev/null +++ b/src/os/filestore/JournalingObjectStore.cc @@ -0,0 +1,271 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- + +#include "JournalingObjectStore.h" + +#include "common/errno.h" +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_journal +#undef dout_prefix +#define dout_prefix *_dout << "journal " + + + +void JournalingObjectStore::journal_start() +{ + dout(10) << "journal_start" << dendl; + finisher.start(); +} + +void JournalingObjectStore::journal_stop() +{ + dout(10) << "journal_stop" << dendl; + finisher.wait_for_empty(); + finisher.stop(); +} + +// A journal_replay() makes journal writeable, this closes that out. +void JournalingObjectStore::journal_write_close() +{ + if (journal) { + journal->close(); + delete journal; + journal = 0; + } + apply_manager.reset(); +} + +int JournalingObjectStore::journal_replay(uint64_t fs_op_seq) +{ + dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl; + + if (cct->_conf->journal_replay_from) { + dout(0) << "journal_replay forcing replay from " + << cct->_conf->journal_replay_from + << " instead of " << fs_op_seq << dendl; + // the previous op is the last one committed + fs_op_seq = cct->_conf->journal_replay_from - 1; + } + + uint64_t op_seq = fs_op_seq; + apply_manager.init_seq(fs_op_seq); + + if (!journal) { + submit_manager.set_op_seq(op_seq); + return 0; + } + + int err = journal->open(op_seq); + if (err < 0) { + dout(3) << "journal_replay open failed with " + << cpp_strerror(err) << dendl; + delete journal; + journal = 0; + return err; + } + + replaying = true; + + int count = 0; + while (1) { + bufferlist bl; + uint64_t seq = op_seq + 1; + if (!journal->read_entry(bl, seq)) { + dout(3) << "journal_replay: end of journal, done." 
<< dendl; + break; + } + + if (seq <= op_seq) { + dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl; + continue; + } + ceph_assert(op_seq == seq-1); + + dout(3) << "journal_replay: applying op seq " << seq << dendl; + auto p = bl.cbegin(); + vector<ObjectStore::Transaction> tls; + while (!p.end()) { + tls.emplace_back(Transaction(p)); + } + + apply_manager.op_apply_start(seq); + int r = do_transactions(tls, seq); + apply_manager.op_apply_finish(seq); + + op_seq = seq; + count++; + + dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl; + } + + if (count) + dout(3) << "journal_replay: total = " << count << dendl; + + replaying = false; + + submit_manager.set_op_seq(op_seq); + + // done reading, make writeable. + err = journal->make_writeable(); + if (err < 0) + return err; + + if (!count) + journal->committed_thru(fs_op_seq); + + return count; +} + + +// ------------------------------------ + +uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op) +{ + Mutex::Locker l(apply_lock); + while (blocked) { + dout(10) << "op_apply_start blocked, waiting" << dendl; + blocked_cond.Wait(apply_lock); + } + dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> " + << (open_ops+1) << dendl; + ceph_assert(!blocked); + ceph_assert(op > committed_seq); + open_ops++; + return op; +} + +void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op) +{ + Mutex::Locker l(apply_lock); + dout(10) << "op_apply_finish " << op << " open_ops " << open_ops << " -> " + << (open_ops-1) << ", max_applied_seq " << max_applied_seq << " -> " + << std::max(op, max_applied_seq) << dendl; + --open_ops; + ceph_assert(open_ops >= 0); + + // signal a blocked commit_start + if (blocked) { + blocked_cond.Signal(); + } + + // there can be multiple applies in flight; track the max value we + // note. 
note that we can't _read_ this value and learn anything + // meaningful unless/until we've quiesced all in-flight applies. + if (op > max_applied_seq) + max_applied_seq = op; +} + +uint64_t JournalingObjectStore::SubmitManager::op_submit_start() +{ + lock.Lock(); + uint64_t op = ++op_seq; + dout(10) << "op_submit_start " << op << dendl; + return op; +} + +void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op) +{ + dout(10) << "op_submit_finish " << op << dendl; + if (op != op_submitted + 1) { + dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1) + << ", OUT OF ORDER" << dendl; + ceph_abort_msg("out of order op_submit_finish"); + } + op_submitted = op; + lock.Unlock(); +} + + +// ------------------------------------------ + +void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c) +{ + Mutex::Locker l(com_lock); + ceph_assert(c); + commit_waiters[op].push_back(c); +} + +bool JournalingObjectStore::ApplyManager::commit_start() +{ + bool ret = false; + + { + Mutex::Locker l(apply_lock); + dout(10) << "commit_start max_applied_seq " << max_applied_seq + << ", open_ops " << open_ops << dendl; + blocked = true; + while (open_ops > 0) { + dout(10) << "commit_start waiting for " << open_ops + << " open ops to drain" << dendl; + blocked_cond.Wait(apply_lock); + } + ceph_assert(open_ops == 0); + dout(10) << "commit_start blocked, all open_ops have completed" << dendl; + { + Mutex::Locker l(com_lock); + if (max_applied_seq == committed_seq) { + dout(10) << "commit_start nothing to do" << dendl; + blocked = false; + ceph_assert(commit_waiters.empty()); + goto out; + } + + committing_seq = max_applied_seq; + + dout(10) << "commit_start committing " << committing_seq + << ", still blocked" << dendl; + } + } + ret = true; + + if (journal) + journal->commit_start(committing_seq); // tell the journal too + out: + return ret; +} + +void JournalingObjectStore::ApplyManager::commit_started() +{ + Mutex::Locker 
l(apply_lock); + // allow new ops. (underlying fs should now be committing all prior ops) + dout(10) << "commit_started committing " << committing_seq << ", unblocking" + << dendl; + blocked = false; + blocked_cond.Signal(); +} + +void JournalingObjectStore::ApplyManager::commit_finish() +{ + Mutex::Locker l(com_lock); + dout(10) << "commit_finish thru " << committing_seq << dendl; + + if (journal) + journal->committed_thru(committing_seq); + + committed_seq = committing_seq; + + map<version_t, vector<Context*> >::iterator p = commit_waiters.begin(); + while (p != commit_waiters.end() && + p->first <= committing_seq) { + finisher.queue(p->second); + commit_waiters.erase(p++); + } +} + +void JournalingObjectStore::_op_journal_transactions( + bufferlist& tbl, uint32_t orig_len, uint64_t op, + Context *onjournal, TrackedOpRef osd_op) +{ + if (osd_op.get()) + dout(10) << "op_journal_transactions " << op << " reqid_t " + << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl; + else + dout(10) << "op_journal_transactions " << op << dendl; + + if (journal && journal->is_writeable()) { + journal->submit_entry(op, tbl, orig_len, onjournal, osd_op); + } else if (onjournal) { + apply_manager.add_waiter(op, onjournal); + } +} diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h new file mode 100644 index 00000000..a289d0e8 --- /dev/null +++ b/src/os/filestore/JournalingObjectStore.h @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_JOURNALINGOBJECTSTORE_H +#define CEPH_JOURNALINGOBJECTSTORE_H + +#include "os/ObjectStore.h" +#include "Journal.h" +#include "FileJournal.h" +#include "common/RWLock.h" +#include "osd/OpRequest.h" + +class JournalingObjectStore : public ObjectStore { +protected: + Journal *journal; + Finisher finisher; + + + class SubmitManager { + CephContext* cct; + Mutex lock; + uint64_t op_seq; + uint64_t op_submitted; + public: + SubmitManager(CephContext* cct) : + cct(cct), lock("JOS::SubmitManager::lock", false, true, false), + op_seq(0), op_submitted(0) + {} + uint64_t op_submit_start(); + void op_submit_finish(uint64_t op); + void set_op_seq(uint64_t seq) { + Mutex::Locker l(lock); + op_submitted = op_seq = seq; + } + uint64_t get_op_seq() { + return op_seq; + } + } submit_manager; + + class ApplyManager { + CephContext* cct; + Journal *&journal; + Finisher &finisher; + + Mutex apply_lock; + bool blocked; + Cond blocked_cond; + int open_ops; + uint64_t max_applied_seq; + + Mutex com_lock; + map<version_t, vector<Context*> > commit_waiters; + uint64_t committing_seq, committed_seq; + + public: + ApplyManager(CephContext* cct, Journal *&j, Finisher &f) : + cct(cct), journal(j), finisher(f), + apply_lock("JOS::ApplyManager::apply_lock", false, true, false), + blocked(false), + open_ops(0), + max_applied_seq(0), + com_lock("JOS::ApplyManager::com_lock", false, true, false), + committing_seq(0), committed_seq(0) {} + void reset() { + ceph_assert(open_ops == 0); + ceph_assert(blocked == false); + max_applied_seq = 0; + committing_seq = 0; + committed_seq = 0; + } + void add_waiter(uint64_t, Context*); + uint64_t op_apply_start(uint64_t op); + void op_apply_finish(uint64_t op); + bool commit_start(); + void commit_started(); + void commit_finish(); + bool is_committing() { + Mutex::Locker l(com_lock); + return committing_seq != committed_seq; + } + uint64_t get_committed_seq() { + Mutex::Locker l(com_lock); + return committed_seq; + } + uint64_t 
get_committing_seq() { + Mutex::Locker l(com_lock); + return committing_seq; + } + void init_seq(uint64_t fs_op_seq) { + { + Mutex::Locker l(com_lock); + committed_seq = fs_op_seq; + committing_seq = fs_op_seq; + } + { + Mutex::Locker l(apply_lock); + max_applied_seq = fs_op_seq; + } + } + } apply_manager; + + bool replaying; + +protected: + void journal_start(); + void journal_stop(); + void journal_write_close(); + int journal_replay(uint64_t fs_op_seq); + + void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op, + Context *onjournal, TrackedOpRef osd_op); + + virtual int do_transactions(vector<ObjectStore::Transaction>& tls, uint64_t op_seq) = 0; + +public: + bool is_committing() { + return apply_manager.is_committing(); + } + uint64_t get_committed_seq() { + return apply_manager.get_committed_seq(); + } + +public: + JournalingObjectStore(CephContext* cct, const std::string& path) + : ObjectStore(cct, path), + journal(NULL), + finisher(cct, "JournalObjectStore", "fn_jrn_objstore"), + submit_manager(cct), + apply_manager(cct, journal, finisher), + replaying(false) {} + + ~JournalingObjectStore() override { + } +}; + +#endif diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc new file mode 100644 index 00000000..2451ae8c --- /dev/null +++ b/src/os/filestore/LFNIndex.cc @@ -0,0 +1,1407 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <string> +#include <map> +#include <set> +#include <vector> +#include <errno.h> +#include <string.h> + +#if defined(__FreeBSD__) +#include <sys/param.h> +#endif + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/buffer.h" +#include "common/ceph_crypto.h" +#include "common/errno.h" +#include "include/compat.h" +#include "chain_xattr.h" + +#include "LFNIndex.h" +using ceph::crypto::SHA1; + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") " + + +const string LFNIndex::LFN_ATTR = "user.cephos.lfn"; +const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash."; +const string LFNIndex::SUBDIR_PREFIX = "DIR_"; +const string LFNIndex::FILENAME_COOKIE = "long"; +const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN - + FILENAME_COOKIE.size() - + FILENAME_EXTRA; +void LFNIndex::maybe_inject_failure() +{ + if (error_injection_enabled) { + if (current_failure > last_failure && + (((double)(rand() % 10000))/((double)(10000)) + < error_injection_probability)) { + last_failure = current_failure; + current_failure = 0; + throw RetryException(); + } + ++current_failure; + } +} + +// Helper to close fd's when we leave scope. This is useful when used +// in combination with RetryException, thrown by the above. 
+struct FDCloser { + int fd; + explicit FDCloser(int f) : fd(f) {} + ~FDCloser() { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } +}; + + +/* Public methods */ + +uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj) +{ + ghobject_t ghobj(obj); + ghobj.shard_id = shard_id_t(0); + ghobj.generation = 0; + ghobj.hobj.snap = 0; + return lfn_generate_object_name_current(ghobj).size(); +} + +int LFNIndex::init() +{ + return _init(); +} + +int LFNIndex::created(const ghobject_t &oid, const char *path) +{ + WRAP_RETRY( + vector<string> path_comp; + string short_name; + r = decompose_full_path(path, &path_comp, 0, &short_name); + if (r < 0) + goto out; + r = lfn_created(path_comp, oid, short_name); + if (r < 0) { + if (failed) { + /* This is hacky, but the only way we get ENOENT from lfn_created here is + * if we did a failure injection in _created below AND actually started the + * split or merge. In that case, lfn_created already succeeded, and + * WRAP_RETRY already cleaned it up and we are actually done. In a real + * failure, the filestore itself would have ended up calling this with + * the new path, not the old one, so we'd find it. 
+ */ + r = 0; + } + goto out; + } + r = _created(path_comp, oid, short_name); + if (r < 0) + goto out; + ); +} + +int LFNIndex::unlink(const ghobject_t &oid) +{ + WRAP_RETRY( + vector<string> path; + string short_name; + r = _lookup(oid, &path, &short_name, NULL); + if (r < 0) { + goto out; + } + r = _remove(path, oid, short_name); + if (r < 0) { + goto out; + } + ); +} + +int LFNIndex::lookup(const ghobject_t &oid, + IndexedPath *out_path, + int *hardlink) +{ + WRAP_RETRY( + vector<string> path; + string short_name; + r = _lookup(oid, &path, &short_name, hardlink); + if (r < 0) + goto out; + string full_path = get_full_path(path, short_name); + *out_path = std::make_shared<Path>(full_path, this); + r = 0; + ); +} + +int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) +{ + return _pre_hash_collection(pg_num, expected_num_objs); +} + + +int LFNIndex::collection_list_partial(const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next) +{ + return _collection_list_partial(start, end, max_count, ls, next); +} + +/* Derived class utility methods */ + +int LFNIndex::fsync_dir(const vector<string> &path) +{ + maybe_inject_failure(); + int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + FDCloser f(fd); + maybe_inject_failure(); + int r = ::fsync(fd); + maybe_inject_failure(); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + return 0; +} + +int LFNIndex::link_object(const vector<string> &from, + const vector<string> &to, + const ghobject_t &oid, + const string &from_short_name) +{ + int r; + string from_path = get_full_path(from, from_short_name); + string to_path; + maybe_inject_failure(); + r = lfn_get_name(to, oid, 0, &to_path, 0); + if (r < 0) + return r; + maybe_inject_failure(); + r = ::link(from_path.c_str(), to_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return 
-errno; + else + return 0; +} + +int LFNIndex::remove_objects(const vector<string> &dir, + const map<string, ghobject_t> &to_remove, + map<string, ghobject_t> *remaining) +{ + set<string> clean_chains; + for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin(); + to_clean != to_remove.end(); + ++to_clean) { + if (!lfn_is_hashed_filename(to_clean->first)) { + maybe_inject_failure(); + int r = ::unlink(get_full_path(dir, to_clean->first).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + continue; + } + if (clean_chains.count(lfn_get_short_name(to_clean->second, 0))) + continue; + set<int> holes; + map<int, pair<string, ghobject_t> > chain; + for (int i = 0; ; ++i) { + string short_name = lfn_get_short_name(to_clean->second, i); + if (remaining->count(short_name)) { + chain[i] = *(remaining->find(short_name)); + } else if (to_remove.count(short_name)) { + holes.insert(i); + } else { + break; + } + } + + map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin(); + for (set<int>::iterator i = holes.begin(); + i != holes.end(); + ++i) { + if (candidate == chain.rend() || *i > candidate->first) { + string remove_path_name = + get_full_path(dir, lfn_get_short_name(to_clean->second, *i)); + maybe_inject_failure(); + int r = ::unlink(remove_path_name.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + continue; + } + string from = get_full_path(dir, candidate->second.first); + string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i)); + maybe_inject_failure(); + int r = ::rename(from.c_str(), to.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + remaining->erase(candidate->second.first); + remaining->insert(pair<string, ghobject_t>( + lfn_get_short_name(candidate->second.second, *i), + candidate->second.second)); + ++candidate; + } + if (!holes.empty()) + clean_chains.insert(lfn_get_short_name(to_clean->second, 0)); + } + return 0; +} + +int 
LFNIndex::move_objects(const vector<string> &from, + const vector<string> &to) +{ + map<string, ghobject_t> to_move; + int r; + r = list_objects(from, 0, NULL, &to_move); + if (r < 0) + return r; + for (map<string,ghobject_t>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + string from_path = get_full_path(from, i->first); + string to_path, to_name; + r = lfn_get_name(to, i->second, &to_name, &to_path, 0); + if (r < 0) + return r; + maybe_inject_failure(); + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0 && errno != EEXIST) + return -errno; + maybe_inject_failure(); + r = lfn_created(to, i->second, to_name); + maybe_inject_failure(); + if (r < 0) + return r; + } + r = fsync_dir(to); + if (r < 0) + return r; + for (map<string,ghobject_t>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + maybe_inject_failure(); + r = ::unlink(get_full_path(from, i->first).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + return fsync_dir(from); +} + +int LFNIndex::remove_object(const vector<string> &from, + const ghobject_t &oid) +{ + string short_name; + int r, exist; + maybe_inject_failure(); + r = get_mangled_name(from, oid, &short_name, &exist); + maybe_inject_failure(); + if (r < 0) + return r; + if (exist == 0) + return -ENOENT; + return lfn_unlink(from, oid, short_name); +} + +int LFNIndex::get_mangled_name(const vector<string> &from, + const ghobject_t &oid, + string *mangled_name, int *hardlink) +{ + return lfn_get_name(from, oid, mangled_name, 0, hardlink); +} + +int LFNIndex::move_subdir( + LFNIndex &from, + LFNIndex &dest, + const vector<string> &path, + string dir + ) +{ + vector<string> sub_path(path.begin(), path.end()); + sub_path.push_back(dir); + string from_path(from.get_full_path_subdir(sub_path)); + string to_path(dest.get_full_path_subdir(sub_path)); + int r = ::rename(from_path.c_str(), to_path.c_str()); + if (r < 0) + return -errno; + return 0; +} + +int LFNIndex::move_object( + LFNIndex &from, + 
LFNIndex &dest, + const vector<string> &path, + const pair<string, ghobject_t> &obj + ) +{ + string from_path(from.get_full_path(path, obj.first)); + string to_path; + string to_name; + int exists; + int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists); + if (r < 0) + return r; + if (!exists) { + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0) + return r; + } + r = dest.lfn_created(path, obj.second, to_name); + if (r < 0) + return r; + r = dest.fsync_dir(path); + if (r < 0) + return r; + r = from.remove_object(path, obj.second); + if (r < 0) + return r; + return from.fsync_dir(path); +} + + +static int get_hobject_from_oinfo(const char *dir, const char *file, + ghobject_t *o) +{ + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/%s", dir, file); + // Hack, user.ceph._ is the attribute used to store the object info + bufferptr bp; + int r = chain_getxattr_buf( + path, + "user.ceph._", + &bp); + if (r < 0) + return r; + bufferlist bl; + if (r > 0) + bl.push_back(bp); + object_info_t oi(bl); + *o = ghobject_t(oi.soid); + return 0; +} + + +int LFNIndex::list_objects(const vector<string> &to_list, int max_objs, + long *handle, map<string, ghobject_t> *out) +{ + string to_list_path = get_full_path_subdir(to_list); + DIR *dir = ::opendir(to_list_path.c_str()); + if (!dir) { + return -errno; + } + + if (handle && *handle) { + seekdir(dir, *handle); + } + + struct dirent *de = nullptr; + int r = 0; + int listed = 0; + bool end = true; + while ((de = ::readdir(dir))) { + end = false; + if (max_objs > 0 && listed >= max_objs) { + break; + } + if (de->d_name[0] == '.') + continue; + string short_name(de->d_name); + ghobject_t obj; + if (lfn_is_object(short_name)) { + r = lfn_translate(to_list, short_name, &obj); + if (r == -EINVAL) { + continue; + } else if (r < 0) { + goto cleanup; + } else { + string long_name = lfn_generate_object_name(obj); + if (!lfn_must_hash(long_name)) { + ceph_assert(long_name == short_name); + } + if 
(index_version == HASH_INDEX_TAG) + get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj); + + out->insert(pair<string, ghobject_t>(short_name, obj)); + ++listed; + } + } + } + + if (handle && !end) { + *handle = telldir(dir); + } + + r = 0; + cleanup: + ::closedir(dir); + return r; +} + +int LFNIndex::list_subdirs(const vector<string> &to_list, + vector<string> *out) +{ + string to_list_path = get_full_path_subdir(to_list); + DIR *dir = ::opendir(to_list_path.c_str()); + if (!dir) + return -errno; + + struct dirent *de = nullptr; + while ((de = ::readdir(dir))) { + string short_name(de->d_name); + string demangled_name; + if (lfn_is_subdir(short_name, &demangled_name)) { + out->push_back(demangled_name); + } + } + + ::closedir(dir); + return 0; +} + +int LFNIndex::create_path(const vector<string> &to_create) +{ + maybe_inject_failure(); + int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::remove_path(const vector<string> &to_remove) +{ + maybe_inject_failure(); + int r = ::rmdir(get_full_path_subdir(to_remove).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::path_exists(const vector<string> &to_check, int *exists) +{ + string full_path = get_full_path_subdir(to_check); + struct stat buf; + if (::stat(full_path.c_str(), &buf)) { + int r = -errno; + if (r == -ENOENT) { + *exists = 0; + return 0; + } else { + return r; + } + } else { + *exists = 1; + return 0; + } +} + +int LFNIndex::add_attr_path(const vector<string> &path, + const string &attr_name, + bufferlist &attr_value) +{ + string full_path = get_full_path_subdir(path); + maybe_inject_failure(); + return chain_setxattr<false, true>( + full_path.c_str(), mangle_attr_name(attr_name).c_str(), + reinterpret_cast<void *>(attr_value.c_str()), + attr_value.length()); +} + +int LFNIndex::get_attr_path(const vector<string> &path, + const 
string &attr_name, + bufferlist &attr_value) +{ + string full_path = get_full_path_subdir(path); + bufferptr bp; + int r = chain_getxattr_buf( + full_path.c_str(), + mangle_attr_name(attr_name).c_str(), + &bp); + if (r > 0) + attr_value.push_back(bp); + return r; +} + +int LFNIndex::remove_attr_path(const vector<string> &path, + const string &attr_name) +{ + string full_path = get_full_path_subdir(path); + string mangled_attr_name = mangle_attr_name(attr_name); + maybe_inject_failure(); + return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str()); +} + +string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid) +{ + char s[FILENAME_MAX_LEN]; + char *end = s + sizeof(s); + char *t = s; + + ceph_assert(oid.generation == ghobject_t::NO_GEN); + const char *i = oid.hobj.oid.name.c_str(); + // Escape subdir prefix + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + *t++ = '\\'; + *t++ = 'd'; + i += 4; + } + while (*i && t < end) { + if (*i == '\\') { + *t++ = '\\'; + *t++ = '\\'; + } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading . 
static void append_escaped(string::const_iterator begin,
			   string::const_iterator end,
			   string *out)
{
  // Escape characters that have structural meaning in generated long
  // object names:
  //   '\'  -> "\\"  (the escape character itself)
  //   '/'  -> "\s"  (path separator)
  //   '_'  -> "\u"  (field separator in long names)
  //   '\0' -> "\n"  (cannot appear in a filename)
  for (string::const_iterator p = begin; p != end; ++p) {
    switch (*p) {
    case '\\':
      out->append("\\\\");
      break;
    case '/':
      out->append("\\s");
      break;
    case '_':
      out->append("\\u");
      break;
    case '\0':
      out->append("\\n");
      break;
    default:
      out->push_back(*p);
    }
  }
}
+= snprintf(t, end - t, "none"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool); + full_name.append(buf, t); + + if (oid.generation != ghobject_t::NO_GEN || + oid.shard_id != shard_id_t::NO_SHARD) { + full_name.append("_"); + + t = buf; + t += snprintf(t, end - buf, "%llx", (long long unsigned)oid.generation); + full_name.append(buf, t); + + full_name.append("_"); + + t = buf; + t += snprintf(t, end - buf, "%x", (int)oid.shard_id); + full_name.append(buf, t); + } + + return full_name; +} + +string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid) +{ + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(oid); + + ceph_assert(oid.generation == ghobject_t::NO_GEN); + string full_name; + string::const_iterator i = oid.hobj.oid.name.begin(); + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + full_name.append("\\d"); + i += 4; + } else if (oid.hobj.oid.name[0] == '.') { + full_name.append("\\."); + ++i; + } + append_escaped(i, oid.hobj.oid.name.end(), &full_name); + full_name.append("_"); + append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name); + full_name.append("_"); + + char snap_with_hash[PATH_MAX]; + char *t = snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + full_name += string(snap_with_hash); + return full_name; +} + +int LFNIndex::lfn_get_name(const vector<string> &path, + const ghobject_t &oid, + string *mangled_name, string *out_path, + int *hardlink) +{ + string full_name = lfn_generate_object_name(oid); + int r; + + if (!lfn_must_hash(full_name)) { + if (mangled_name) + *mangled_name = full_name; + if (out_path) + *out_path = 
get_full_path(path, full_name); + if (hardlink) { + struct stat buf; + string full_path = get_full_path(path, full_name); + maybe_inject_failure(); + r = ::stat(full_path.c_str(), &buf); + if (r < 0) { + if (errno == ENOENT) + *hardlink = 0; + else + return -errno; + } else { + *hardlink = buf.st_nlink; + } + } + return 0; + } + + int i = 0; + string candidate; + string candidate_path; + for ( ; ; ++i) { + candidate = lfn_get_short_name(oid, i); + candidate_path = get_full_path(path, candidate); + bufferptr bp; + r = chain_getxattr_buf( + candidate_path.c_str(), + get_lfn_attr().c_str(), + &bp); + if (r < 0) { + if (errno != ENODATA && errno != ENOENT) + return -errno; + if (errno == ENODATA) { + // Left over from incomplete transaction, it'll be replayed + maybe_inject_failure(); + r = ::unlink(candidate_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) + *hardlink = 0; + return 0; + } + ceph_assert(r > 0); + string lfn(bp.c_str(), bp.length()); + if (lfn == full_name) { + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) { + struct stat st; + r = ::stat(candidate_path.c_str(), &st); + if (r < 0) { + if (errno == ENOENT) + *hardlink = 0; + else + return -errno; + } else { + *hardlink = st.st_nlink; + } + } + return 0; + } + bp = bufferptr(); + r = chain_getxattr_buf( + candidate_path.c_str(), + get_alt_lfn_attr().c_str(), + &bp); + if (r > 0) { + // only consider alt name if nlink > 1 + struct stat st; + int rc = ::stat(candidate_path.c_str(), &st); + if (rc < 0) + return -errno; + if (st.st_nlink <= 1) { + // left over from incomplete unlink, remove + maybe_inject_failure(); + dout(20) << __func__ << " found extra alt attr for " << candidate_path + << ", long name " << string(bp.c_str(), bp.length()) << dendl; + rc = chain_removexattr(candidate_path.c_str(), + 
// Record (in an xattr) the long name that a hashed short filename now
// stands for.  Must be called after every create/link of a hashed file.
// Returns 0 on success or a negative error code from setxattr.
int LFNIndex::lfn_created(const vector<string> &path,
			  const ghobject_t &oid,
			  const string &mangled_name)
{
  // Non-hashed names encode the object fully; no attr bookkeeping needed.
  if (!lfn_is_hashed_filename(mangled_name))
    return 0;
  string full_path = get_full_path(path, mangled_name);
  string full_name = lfn_generate_object_name(oid);
  maybe_inject_failure();

  // if the main attr exists and is different, move it to the alt attr.
  // This keeps the previous owner's long name resolvable while a rename
  // or relink is in flight (the alt attr is cleaned up on unlink).
  bufferptr bp;
  int r = chain_getxattr_buf(
    full_path.c_str(),
    get_lfn_attr().c_str(),
    &bp);
  if (r > 0) {
    string lfn(bp.c_str(), bp.length());
    if (lfn != full_name) {
      dout(20) << __func__ << " " << mangled_name
	       << " moving old name to alt attr "
	       << lfn
	       << ", new name is " << full_name << dendl;
      r = chain_setxattr<false, true>(
	full_path.c_str(), get_alt_lfn_attr().c_str(),
	bp.c_str(), bp.length());
      if (r < 0)
	return r;
    }
  }

  // Finally publish the new long name as the primary attr.
  return chain_setxattr<false, true>(
    full_path.c_str(), get_lfn_attr().c_str(),
    full_name.c_str(), full_name.size());
}
lfn_get_short_name(oid, i); + string to_check_path = get_full_path(path, to_check); + int r = ::stat(to_check_path.c_str(), &buf); + if (r < 0) { + if (errno == ENOENT) { + break; + } else { + return -errno; + } + } + } + string full_path = get_full_path(path, mangled_name); + int fd = ::open(full_path.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + FDCloser f(fd); + if (i == removed_index + 1) { + maybe_inject_failure(); + int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } else { + string& rename_to = full_path; + string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1)); + maybe_inject_failure(); + int r = ::rename(rename_from.c_str(), rename_to.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + struct stat st; + int r = ::fstat(fd, &st); + if (r == 0 && st.st_nlink > 0) { + // remove alt attr + dout(20) << __func__ << " removing alt attr from " << full_path << dendl; + fsync_dir(path); + chain_fremovexattr(fd, get_alt_lfn_attr().c_str()); + } + return r; +} + +int LFNIndex::lfn_translate(const vector<string> &path, + const string &short_name, + ghobject_t *out) +{ + if (!lfn_is_hashed_filename(short_name)) { + return lfn_parse_object_name(short_name, out); + } + string full_path = get_full_path(path, short_name); + // First, check alt attr + bufferptr bp; + int r = chain_getxattr_buf( + full_path.c_str(), + get_alt_lfn_attr().c_str(), + &bp); + if (r > 0) { + // There is an alt attr, does it match? 
+ string lfn(bp.c_str(), bp.length()); + if (short_name_matches(short_name.c_str(), lfn.c_str())) { + return lfn_parse_object_name(lfn, out); + } + } + + // Get lfn_attr + bp = bufferptr(); + r = chain_getxattr_buf( + full_path.c_str(), + get_lfn_attr().c_str(), + &bp); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + string long_name(bp.c_str(), bp.length()); + return lfn_parse_object_name(long_name, out); +} + +bool LFNIndex::lfn_is_object(const string &short_name) +{ + return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0); +} + +bool LFNIndex::lfn_is_subdir(const string &name, string *demangled) +{ + if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) { + if (demangled) + *demangled = demangle_path_component(name); + return 1; + } + return 0; +} + +static int parse_object(const char *s, ghobject_t& o) +{ + const char *hash = s + strlen(s) - 1; + while (*hash != '_' && + hash > s) + hash--; + const char *bar = hash - 1; + while (*bar != '_' && + bar > s) + bar--; + if (*bar == '_') { + char buf[bar-s + 1]; + char *t = buf; + const char *i = s; + while (i < bar) { + if (*i == '\\') { + i++; + switch (*i) { + case '\\': *t++ = '\\'; break; + case '.': *t++ = '.'; break; + case 's': *t++ = '/'; break; + case 'd': { + *t++ = 'D'; + *t++ = 'I'; + *t++ = 'R'; + *t++ = '_'; + break; + } + default: ceph_abort(); + } + } else { + *t++ = *i; + } + i++; + } + *t = 0; + o.hobj.oid.name = string(buf, t-buf); + if (strncmp(bar+1, "head", 4) == 0) + o.hobj.snap = CEPH_NOSNAP; + else if (strncmp(bar+1, "snapdir", 7) == 0) + o.hobj.snap = CEPH_SNAPDIR; + else + o.hobj.snap = strtoull(bar+1, NULL, 16); + + uint32_t hobject_hash_input; + sscanf(hash, "_%X", &hobject_hash_input); + o.hobj.set_hash(hobject_hash_input); + + return 1; + } + return 0; +} + +int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out) +{ + int r = parse_object(long_name.c_str(), *out); + int64_t pool = -1; + spg_t pg; + if 
// Inverse of append_escaped(): decode "\\", "\s", "\n", "\u" back into
// '\\', '/', '\0', '_'.  Returns false on an unknown escape or on a
// dangling trailing backslash (the old code incremented past 'end' and
// dereferenced the end iterator in that case — undefined behavior).
static bool append_unescaped(string::const_iterator begin,
			     string::const_iterator end,
			     string *out)
{
  for (string::const_iterator i = begin; i != end; ++i) {
    if (*i == '\\') {
      ++i;
      if (i == end)
	return false;  // truncated escape sequence
      if (*i == '\\')
	out->append("\\");
      else if (*i == 's')
	out->append("/");
      else if (*i == 'n')
	(*out) += '\0';
      else if (*i == 'u')
	out->append("_");
      else
	return false;
    } else {
      out->append(i, i+1);
    }
  }
  return true;
}
"%X", &hash); + + + int64_t pool = -1; + spg_t pg; + if (coll().is_pg_prefix(&pg)) + pool = (int64_t)pg.pgid.pool(); + (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, "")); + return 0; +} + + +int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out) +{ + string name; + string key; + string ns; + uint32_t hash; + snapid_t snap; + uint64_t pool; + gen_t generation = ghobject_t::NO_GEN; + shard_id_t shard_id = shard_id_t::NO_SHARD; + + if (index_version == HASH_INDEX_TAG) + return lfn_parse_object_name_keyless(long_name, out); + if (index_version == HASH_INDEX_TAG_2) + return lfn_parse_object_name_poolless(long_name, out); + + string::const_iterator current = long_name.begin(); + if (*current == '\\') { + ++current; + if (current == long_name.end()) { + return -EINVAL; + } else if (*current == 'd') { + name.append("DIR_"); + ++current; + } else if (*current == '.') { + name.append("."); + ++current; + } else { + --current; + } + } + + string::const_iterator end = current; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &name)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &key)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + string snap_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + string hash_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &ns)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + string pstring(current, end); + + 
// Render 'len' bytes of 'buf' as lowercase hex into 'str'.  'str' must
// have room for 2*len characters plus the terminating NUL.
static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
{
  static const char digits[] = "0123456789abcdef";
  char *p = str;
  for (int idx = 0; idx < len; ++idx) {
    *p++ = digits[buf[idx] >> 4];
    *p++ = digits[buf[idx] & 0x0f];
  }
  *p = '\0';
}
// Build the hashed short filename for attempt 'i' from a long name:
//   <prefix>_<hash>_<i>_<cookie>
// where <prefix> is the leading FILENAME_PREFIX_LEN bytes of the long
// name and <hash> is hash_filename() of the whole long name.  'filename'
// must have at least FILENAME_SHORT_LEN + 4 bytes of room.
void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
{
  char hash[FILENAME_HASH_LEN + 1];

  ceph_assert(len >= FILENAME_SHORT_LEN + 4);

  strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
  filename[FILENAME_PREFIX_LEN] = '\0';
  // Short long-names (ending within the prefix) are used verbatim:
  // no hashing needed if the prefix already contains the whole name.
  if ((int)strlen(filename) < FILENAME_PREFIX_LEN)
    return;
  if (old_filename[FILENAME_PREFIX_LEN] == '\0')
    return;

  hash_filename(old_filename, hash, sizeof(hash));
  int ofs = FILENAME_PREFIX_LEN;
  // Append the suffix; if the result exceeds FILENAME_SHORT_LEN, keep
  // shaving one byte off the prefix (decrementing ofs) until it fits.
  // The '!ofs' clause guarantees termination even for huge suffixes.
  while (1) {
    int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
    if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
      break;
    ofs--;
  }
}
// Split an absolute path under this index's base into its mangled
// subdir components (demangled into *out) and the final filename
// (*shortname).  If 'oid' is non-NULL, also translate the short name
// back into a ghobject; returns lfn_translate's error in that case.
// NOTE(review): assumes 'in' begins with get_base_path() followed by
// '/'-separated non-empty components — callers pass paths this index
// itself generated; verify before reusing elsewhere.
int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
				  ghobject_t *oid, string *shortname)
{
  const char *beginning = in + get_base_path().size();
  const char *end = beginning;
  while (1) {
    end++;                    // step over the '/' preceding the component
    beginning = end++;        // component starts here; scan from next byte
    for ( ; *end != '\0' && *end != '/'; ++end) ;
    if (*end != '\0') {
      // Interior component: a mangled subdir name.
      out->push_back(demangle_path_component(string(beginning, end - beginning)));
      continue;
    } else {
      break;                  // last component is the filename
    }
  }
  *shortname = string(beginning, end - beginning);
  if (oid) {
    int r = lfn_translate(*out, *shortname, oid);
    if (r < 0)
      return r;
  }
  return 0;
}
+ * + */ + + +#ifndef OS_LFNINDEX_H +#define OS_LFNINDEX_H + +#include <string> +#include <map> +#include <set> +#include <vector> +#include <exception> + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/ceph_crypto.h" + +#include "CollectionIndex.h" + +/** + * LFNIndex also encapsulates logic for manipulating + * subdirectories of a collection as well as the long filename + * logic. + * + * The protected methods provide machinery for derived classes to + * manipulate subdirectories and objects. + * + * The virtual methods are to be overridden to provide the actual + * hashed layout. + * + * User must call created when an object is created. + * + * Synchronization: Calling code must ensure that there are no object + * creations or deletions during the lifetime of a Path object (except + * of an object at that path). + * + * Unless otherwise noted, methods which return an int return 0 on success + * and a negative error code on failure. + */ +#define WRAP_RETRY(x) { \ + bool failed = false; \ + int r = 0; \ + init_inject_failure(); \ + while (1) { \ + try { \ + if (failed) { \ + r = cleanup(); \ + ceph_assert(r == 0); \ + } \ + { x } \ + out: \ + complete_inject_failure(); \ + return r; \ + } catch (RetryException&) { \ + failed = true; \ + } catch (...) { \ + ceph_abort(); \ + } \ + } \ + return -1; \ + } \ + + + +class LFNIndex : public CollectionIndex { + /// Hash digest output size. + static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE; + /// Length of filename hash. + static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE; + /// Max filename size. + static const int FILENAME_MAX_LEN = 4096; + /// Length of hashed filename. + static const int FILENAME_SHORT_LEN = 255; + /// Length of hashed filename prefix. + static const int FILENAME_PREFIX_LEN; + /// Length of hashed filename cookie. + static const int FILENAME_EXTRA = 4; + /// Lfn cookie value. 
+ static const string FILENAME_COOKIE; + /// Name of LFN attribute for storing full name. + static const string LFN_ATTR; + /// Prefix for subdir index attributes. + static const string PHASH_ATTR_PREFIX; + /// Prefix for index subdirectories. + static const string SUBDIR_PREFIX; + + /// Path to Index base. + const string base_path; + +protected: + const uint32_t index_version; + + /// true if retry injection is enabled + struct RetryException : public exception {}; + bool error_injection_enabled; + bool error_injection_on; + double error_injection_probability; + uint64_t last_failure; + uint64_t current_failure; + void init_inject_failure() { + if (error_injection_on) { + error_injection_enabled = true; + last_failure = current_failure = 0; + } + } + void maybe_inject_failure(); + void complete_inject_failure() { + error_injection_enabled = false; + } + +private: + string lfn_attribute, lfn_alt_attribute; + coll_t collection; + +public: + /// Constructor + LFNIndex( + CephContext* cct, + coll_t collection, + const char *base_path, ///< [in] path to Index root + uint32_t index_version, + double _error_injection_probability=0) + : CollectionIndex(cct, collection), + base_path(base_path), + index_version(index_version), + error_injection_enabled(false), + error_injection_on(_error_injection_probability != 0), + error_injection_probability(_error_injection_probability), + last_failure(0), current_failure(0), + collection(collection) { + if (index_version == HASH_INDEX_TAG) { + lfn_attribute = LFN_ATTR; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", index_version); + lfn_attribute = LFN_ATTR + string(buf); + lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt"; + } + } + + coll_t coll() const override { return collection; } + + /// Virtual destructor + ~LFNIndex() override {} + + /// @see CollectionIndex + int init() override; + + /// @see CollectionIndex + int cleanup() override = 0; + + /// @see CollectionIndex + int created( + const ghobject_t &oid, + 
const char *path + ) override; + + /// @see CollectionIndex + int unlink( + const ghobject_t &oid + ) override; + + /// @see CollectionIndex + int lookup( + const ghobject_t &oid, + IndexedPath *path, + int *hardlink + ) override; + + /// @see CollectionIndex; + int pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ) override; + + /// @see CollectionIndex + int collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next + ) override; + + virtual int _split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + CollectionIndex* dest //< [in] destination index + ) = 0; + virtual int _merge( + uint32_t bits, //< [in] bits for target + CollectionIndex* dest //< [in] destination index + ) = 0; + + /// @see CollectionIndex + int split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest + ) override { + WRAP_RETRY( + r = _split(match, bits, dest); + goto out; + ); + } + + /// @see CollectionIndex + int merge( + uint32_t bits, + CollectionIndex* dest + ) override { + WRAP_RETRY( + r = _merge(bits, dest); + goto out; + ); + } + + /** + * Returns the length of the longest escaped name which could result + * from any clone, shard, or rollback object of this object + */ + static uint64_t get_max_escaped_name_len(const hobject_t &obj); + +protected: + virtual int _init() = 0; + + /// Will be called upon object creation + virtual int _created( + const vector<string> &path, ///< [in] Path to subdir. + const ghobject_t &oid, ///< [in] Object created. + const string &mangled_name ///< [in] Mangled filename. + ) = 0; + + /// Will be called to remove an object + virtual int _remove( + const vector<string> &path, ///< [in] Path to subdir. + const ghobject_t &oid, ///< [in] Object to remove. + const string &mangled_name ///< [in] Mangled filename. + ) = 0; + + /// Return the path and mangled_name for oid. 
+ virtual int _lookup( + const ghobject_t &oid,///< [in] Object for lookup. + vector<string> *path, ///< [out] Path to the object. + string *mangled_name, ///< [out] Mangled filename. + int *exists ///< [out] True if the object exists. + ) = 0; + + /// Pre-hash the collection with the given pg number and + /// expected number of objects in the collection. + virtual int _pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ) = 0; + + /// @see CollectionIndex + virtual int _collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next + ) = 0; + +protected: + + /* Non-virtual utility methods */ + + /// Sync a subdirectory + int fsync_dir( + const vector<string> &path ///< [in] Path to sync + ); ///< @return Error Code, 0 on success + + /// Link an object from from into to + int link_object( + const vector<string> &from, ///< [in] Source subdirectory. + const vector<string> &to, ///< [in] Dest subdirectory. + const ghobject_t &oid, ///< [in] Object to move. + const string &from_short_name ///< [in] Mangled filename of oid. + ); ///< @return Error Code, 0 on success + + /** + * Efficiently remove objects from a subdirectory + * + * remove_object invalidates mangled names in the directory requiring + * the mangled name of each additional object to be looked up a second + * time. remove_objects removes the need for additional lookups + * + * @param [in] dir Directory from which to remove. + * @param [in] map of objects to remove to mangle names + * @param [in,out] map of filenames to objects + * @return Error Code, 0 on success. + */ + int remove_objects( + const vector<string> &dir, + const map<string, ghobject_t> &to_remove, + map<string, ghobject_t> *remaining + ); + + + /** + * Moves contents of from into to. + * + * Invalidates mangled names in to. If interrupted, all objects will be + * present in to before objects are removed from from. 
Ignores EEXIST + * while linking into to. + * @return Error Code, 0 on success + */ + int move_objects( + const vector<string> &from, ///< [in] Source subdirectory. + const vector<string> &to ///< [in] Dest subdirectory. + ); + + /** + * Remove an object from from. + * + * Invalidates mangled names in from. + * @return Error Code, 0 on success + */ + int remove_object( + const vector<string> &from, ///< [in] Directory from which to remove. + const ghobject_t &to_remove ///< [in] Object to remove. + ); + + /** + * Gets the filename corresponding to oid in from. + * + * The filename may differ between subdirectories. Furthermore, + * file creations ore removals in from may invalidate the name. + * @return Error code on failure, 0 on success + */ + int get_mangled_name( + const vector<string> &from, ///< [in] Subdirectory + const ghobject_t &oid, ///< [in] Object + string *mangled_name, ///< [out] Filename + int *hardlink ///< [out] hardlink for this file, hardlink=0 mean no-exist + ); + + /// do move subdir from from to dest + static int move_subdir( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path containing dir + string dir ///< [in] dir to move + ); + + /// do move object from from to dest + static int move_object( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path to split + const pair<string, ghobject_t> &obj ///< [in] obj to move + ); + + /** + * Lists objects in to_list. + * + * @param [in] to_list Directory to list. + * @param [in] max_objects Max number to list. + * @param [in,out] handle Cookie for continuing the listing. + * Initialize to zero to start at the beginning of the directory. + * @param [out] out Mapping of listed object filenames to objects. 
+ * @return Error code on failure, 0 on success + */ + int list_objects( + const vector<string> &to_list, + int max_objects, + long *handle, + map<string, ghobject_t> *out + ); + + /// Lists subdirectories. + int list_subdirs( + const vector<string> &to_list, ///< [in] Directory to list. + vector<string> *out ///< [out] Subdirectories listed. + ); + + /// Create subdirectory. + int create_path( + const vector<string> &to_create ///< [in] Subdirectory to create. + ); + + /// Remove subdirectory. + int remove_path( + const vector<string> &to_remove ///< [in] Subdirectory to remove. + ); + + /// Check whether to_check exists. + int path_exists( + const vector<string> &to_check, ///< [in] Subdirectory to check. + int *exists ///< [out] 1 if it exists, 0 else + ); + + /// Save attr_value to attr_name attribute on path. + int add_attr_path( + const vector<string> &path, ///< [in] Path to modify. + const string &attr_name, ///< [in] Name of attribute. + bufferlist &attr_value ///< [in] Value to save. + ); + + /// Read into attr_value attribute attr_name on path. + int get_attr_path( + const vector<string> &path, ///< [in] Path to read. + const string &attr_name, ///< [in] Attribute to read. + bufferlist &attr_value ///< [out] Attribute value read. + ); + + /// Remove attr from path + int remove_attr_path( + const vector<string> &path, ///< [in] path from which to remove attr + const string &attr_name ///< [in] attr to remove + ); ///< @return Error code, 0 on success + +private: + /* lfn translation functions */ + + /** + * Gets the version specific lfn attribute tag + */ + const string &get_lfn_attr() const { + return lfn_attribute; + } + const string &get_alt_lfn_attr() const { + return lfn_alt_attribute; + } + + /** + * Gets the filename corresponding to oid in path. + * + * @param [in] path Path in which to get filename for oid. + * @param [in] oid Object for which to get filename. + * @param [out] mangled_name Filename for oid, pass NULL if not needed. 
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed. + * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if + * not needed + * @return Error Code, 0 on success. + */ + int lfn_get_name( + const vector<string> &path, + const ghobject_t &oid, + string *mangled_name, + string *full_path, + int *hardlink + ); + + /// Adjusts path contents when oid is created at name mangled_name. + int lfn_created( + const vector<string> &path, ///< [in] Path to adjust. + const ghobject_t &oid, ///< [in] Object created. + const string &mangled_name ///< [in] Filename of created object. + ); + + /// Removes oid from path while adjusting path contents + int lfn_unlink( + const vector<string> &path, ///< [in] Path containing oid. + const ghobject_t &oid, ///< [in] Object to remove. + const string &mangled_name ///< [in] Filename of object to remove. + ); + + ///Transate a file into and ghobject_t. + int lfn_translate( + const vector<string> &path, ///< [in] Path containing the file. + const string &short_name, ///< [in] Filename to translate. + ghobject_t *out ///< [out] Object found. + ); ///< @return Negative error code on error, 0 if not an object, 1 else + + /* manglers/demanglers */ + /// Filters object filenames + bool lfn_is_object( + const string &short_name ///< [in] Filename to check + ); ///< True if short_name is an object, false otherwise + + /// Filters subdir filenames + bool lfn_is_subdir( + const string &short_name, ///< [in] Filename to check. + string *demangled_name ///< [out] Demangled subdir name. + ); ///< @return True if short_name is a subdir, false otherwise + + /// Generate object name + string lfn_generate_object_name_keyless( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + string lfn_generate_object_name_poolless( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. 
+ + /// Generate object name + static string lfn_generate_object_name_current( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + string lfn_generate_object_name( + const ghobject_t &oid ///< [in] Object for which to generate. + ) { + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(oid); + if (index_version == HASH_INDEX_TAG_2) + return lfn_generate_object_name_poolless(oid); + else + return lfn_generate_object_name_current(oid); + } ///< @return Generated object name. + + /// Parse object name + int lfn_parse_object_name_keyless( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Parse object name + int lfn_parse_object_name_poolless( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Parse object name + int lfn_parse_object_name( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Checks whether short_name is a hashed filename. + bool lfn_is_hashed_filename( + const string &short_name ///< [in] Name to check. + ); ///< @return True if short_name is hashed, False otherwise. + + /// Checks whether long_name must be hashed. + bool lfn_must_hash( + const string &long_name ///< [in] Name to check. + ); ///< @return True if long_name must be hashed, False otherwise. + + /// Generate hashed name. + string lfn_get_short_name( + const ghobject_t &oid, ///< [in] Object for which to generate. + int i ///< [in] Index of hashed name to generate. + ); ///< @return Hashed filename. 
+ + /* other common methods */ + /// Gets the base path + const string &get_base_path(); ///< @return Index base_path + + /// Get full path the subdir + string get_full_path_subdir( + const vector<string> &rel ///< [in] The subdir. + ); ///< @return Full path to rel. + + /// Get full path to object + string get_full_path( + const vector<string> &rel, ///< [in] Path to object. + const string &name ///< [in] Filename of object. + ); ///< @return Fullpath to object at name in rel. + + /// Get mangled path component + string mangle_path_component( + const string &component ///< [in] Component to mangle + ); /// @return Mangled component + + /// Demangle component + string demangle_path_component( + const string &component ///< [in] Subdir name to demangle + ); ///< @return Demangled path component. + + /// Decompose full path into object name and filename. + int decompose_full_path( + const char *in, ///< [in] Full path to object. + vector<string> *out, ///< [out] Path to object at in. + ghobject_t *oid, ///< [out] Object at in. + string *shortname ///< [out] Filename of object at in. + ); ///< @return Error Code, 0 on success. + + /// Mangle attribute name + string mangle_attr_name( + const string &attr ///< [in] Attribute to mangle. + ); ///< @return Mangled attribute name. + + /// checks whether long_name could hash to short_name + bool short_name_matches( + const char *short_name, ///< [in] name to check against + const char *cand_long_name ///< [in] candidate long name + ); + + /// Builds hashed filename + void build_filename( + const char *old_filename, ///< [in] Filename to convert. + int i, ///< [in] Index of hash. + char *filename, ///< [out] Resulting filename. + int len ///< [in] Size of buffer for filename + ); ///< @return Error Code, 0 on success + + /// Get hash of filename + int hash_filename( + const char *filename, ///< [in] Filename to hash. + char *hash, ///< [out] Hash of filename. + int len ///< [in] Size of hash buffer. 
+ ); ///< @return Error Code, 0 on success. + + friend class TestWrapLFNIndex; +}; +typedef LFNIndex::IndexedPath IndexedPath; + +#endif diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h new file mode 100644 index 00000000..164112ee --- /dev/null +++ b/src/os/filestore/SequencerPosition.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CEPH_OS_SEQUENCERPOSITION_H +#define __CEPH_OS_SEQUENCERPOSITION_H + +#include "include/types.h" +#include "include/cmp.h" +#include "include/encoding.h" +#include "common/Formatter.h" + +#include <ostream> + +/** + * transaction and op offset + */ +struct SequencerPosition { + uint64_t seq; ///< seq + uint32_t trans; ///< transaction in that seq (0-based) + uint32_t op; ///< op in that transaction (0-based) + + SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(seq, bl); + encode(trans, bl); + encode(op, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(seq, p); + decode(trans, p); + decode(op, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("trans", trans); + f->dump_unsigned("op", op); + } + static void generate_test_instances(list<SequencerPosition*>& o) { + o.push_back(new SequencerPosition); + o.push_back(new SequencerPosition(1, 2, 3)); + o.push_back(new SequencerPosition(4, 5, 6)); + } +}; +WRITE_CLASS_ENCODER(SequencerPosition) + +inline ostream& operator<<(ostream& out, const SequencerPosition& t) { + return out << t.seq << "." << t.trans << "." 
<< t.op; +} + +WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op) +WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op) + + +#endif diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc new file mode 100644 index 00000000..ba2ed131 --- /dev/null +++ b/src/os/filestore/WBThrottle.cc @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "acconfig.h" + +#include "os/filestore/WBThrottle.h" +#include "common/perf_counters.h" +#include "common/errno.h" + +WBThrottle::WBThrottle(CephContext *cct) : + cur_ios(0), cur_size(0), + cct(cct), + logger(NULL), + stopping(true), + lock("WBThrottle::lock", false, true, false), + fs(XFS) +{ + { + Mutex::Locker l(lock); + set_from_conf(); + } + ceph_assert(cct); + PerfCountersBuilder b( + cct, string("WBThrottle"), + l_wbthrottle_first, l_wbthrottle_last); + b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations"); + b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations"); + b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write"); + b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries"); + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i) + logger->set(i, 0); + + cct->_conf.add_observer(this); +} + +WBThrottle::~WBThrottle() { + ceph_assert(cct); + cct->get_perfcounters_collection()->remove(logger); + delete logger; + cct->_conf.remove_observer(this); +} + +void WBThrottle::start() +{ + { + Mutex::Locker l(lock); + stopping = false; + } + create("wb_throttle"); +} + +void WBThrottle::stop() +{ + { + Mutex::Locker l(lock); + stopping = true; + cond.Signal(); + } + + 
join(); +} + +const char** WBThrottle::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "filestore_wbthrottle_btrfs_bytes_start_flusher", + "filestore_wbthrottle_btrfs_bytes_hard_limit", + "filestore_wbthrottle_btrfs_ios_start_flusher", + "filestore_wbthrottle_btrfs_ios_hard_limit", + "filestore_wbthrottle_btrfs_inodes_start_flusher", + "filestore_wbthrottle_btrfs_inodes_hard_limit", + "filestore_wbthrottle_xfs_bytes_start_flusher", + "filestore_wbthrottle_xfs_bytes_hard_limit", + "filestore_wbthrottle_xfs_ios_start_flusher", + "filestore_wbthrottle_xfs_ios_hard_limit", + "filestore_wbthrottle_xfs_inodes_start_flusher", + "filestore_wbthrottle_xfs_inodes_hard_limit", + NULL + }; + return KEYS; +} + +void WBThrottle::set_from_conf() +{ + ceph_assert(lock.is_locked()); + if (fs == BTRFS) { + size_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher; + size_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit; + io_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher; + io_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit; + fd_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher; + fd_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit; + } else if (fs == XFS) { + size_limits.first = + cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher; + size_limits.second = + cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit; + io_limits.first = + cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher; + io_limits.second = + cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit; + fd_limits.first = + cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher; + fd_limits.second = + cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit; + } else { + ceph_abort_msg("invalid value for fs"); + } + cond.Signal(); +} + +void WBThrottle::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + 
Mutex::Locker l(lock); + for (const char** i = get_tracked_conf_keys(); *i; ++i) { + if (changed.count(*i)) { + set_from_conf(); + return; + } + } +} + +bool WBThrottle::get_next_should_flush( + boost::tuple<ghobject_t, FDRef, PendingWB> *next) +{ + ceph_assert(lock.is_locked()); + ceph_assert(next); + while (!stopping && (!beyond_limit() || pending_wbs.empty())) + cond.Wait(lock); + if (stopping) + return false; + ceph_assert(!pending_wbs.empty()); + ghobject_t obj(pop_object()); + + ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i = + pending_wbs.find(obj); + *next = boost::make_tuple(obj, i->second.second, i->second.first); + pending_wbs.erase(i); + return true; +} + + +void *WBThrottle::entry() +{ + Mutex::Locker l(lock); + boost::tuple<ghobject_t, FDRef, PendingWB> wb; + while (get_next_should_flush(&wb)) { + clearing = wb.get<0>(); + cur_ios -= wb.get<2>().ios; + logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios); + logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios); + cur_size -= wb.get<2>().size; + logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size); + logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size); + logger->dec(l_wbthrottle_inodes_dirtied); + logger->inc(l_wbthrottle_inodes_wb); + lock.Unlock(); +#if defined(HAVE_FDATASYNC) + int r = ::fdatasync(**wb.get<1>()); +#else + int r = ::fsync(**wb.get<1>()); +#endif + if (r < 0) { + lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +#ifdef HAVE_POSIX_FADVISE + if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) { + int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED); + ceph_assert(fa_r == 0); + } +#endif + lock.Lock(); + clearing = ghobject_t(); + cond.Signal(); + wb = boost::tuple<ghobject_t, FDRef, PendingWB>(); + } + return 0; +} + +void WBThrottle::queue_wb( + FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len, + bool nocache) +{ + Mutex::Locker l(lock); + ceph::unordered_map<ghobject_t, 
pair<PendingWB, FDRef> >::iterator wbiter = + pending_wbs.find(hoid); + if (wbiter == pending_wbs.end()) { + wbiter = pending_wbs.insert( + make_pair(hoid, + make_pair( + PendingWB(), + fd))).first; + logger->inc(l_wbthrottle_inodes_dirtied); + } else { + remove_object(hoid); + } + + cur_ios++; + logger->inc(l_wbthrottle_ios_dirtied); + cur_size += len; + logger->inc(l_wbthrottle_bytes_dirtied, len); + + wbiter->second.first.add(nocache, len, 1); + insert_object(hoid); + if (beyond_limit()) + cond.Signal(); +} + +void WBThrottle::clear() +{ + Mutex::Locker l(lock); + for (ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i = + pending_wbs.begin(); + i != pending_wbs.end(); + ++i) { +#ifdef HAVE_POSIX_FADVISE + if (cct->_conf->filestore_fadvise && i->second.first.nocache) { + int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED); + ceph_assert(fa_r == 0); + } +#endif + + } + cur_ios = cur_size = 0; + logger->set(l_wbthrottle_ios_dirtied, 0); + logger->set(l_wbthrottle_bytes_dirtied, 0); + logger->set(l_wbthrottle_inodes_dirtied, 0); + pending_wbs.clear(); + lru.clear(); + rev_lru.clear(); + cond.Signal(); +} + +void WBThrottle::clear_object(const ghobject_t &hoid) +{ + Mutex::Locker l(lock); + while (clearing == hoid) + cond.Wait(lock); + ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i = + pending_wbs.find(hoid); + if (i == pending_wbs.end()) + return; + + cur_ios -= i->second.first.ios; + logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios); + cur_size -= i->second.first.size; + logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size); + logger->dec(l_wbthrottle_inodes_dirtied); + + pending_wbs.erase(i); + remove_object(hoid); + cond.Signal(); +} + +void WBThrottle::throttle() +{ + Mutex::Locker l(lock); + while (!stopping && need_flush()) + cond.Wait(lock); +} diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h new file mode 100644 index 00000000..ef809ea4 --- /dev/null +++ 
b/src/os/filestore/WBThrottle.h
@@ -0,0 +1,187 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013 Inktank Storage, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef WBTHROTTLE_H
#define WBTHROTTLE_H

#include "include/unordered_map.h"
#include <boost/tuple/tuple.hpp>
#include "common/Formatter.h"
#include "common/hobject.h"
#include "include/interval_set.h"
#include "FDCache.h"
#include "common/Thread.h"
#include "common/ceph_context.h"

class PerfCounters;
enum {
  l_wbthrottle_first = 999090,
  l_wbthrottle_bytes_dirtied,
  l_wbthrottle_bytes_wb,
  l_wbthrottle_ios_dirtied,
  l_wbthrottle_ios_wb,
  l_wbthrottle_inodes_dirtied,
  l_wbthrottle_inodes_wb,
  l_wbthrottle_last
};

/**
 * WBThrottle
 *
 * Tracks, throttles, and flushes outstanding IO
 */
class WBThrottle : Thread, public md_config_obs_t {
  /// Object currently being flushed by the flusher thread, if any;
  /// clear_object() waits while its target matches this.
  ghobject_t clearing;
  /* *_limits.first is the start_flusher limit and
   * *_limits.second is the hard limit
   */

  /// Limits on unflushed bytes
  pair<uint64_t, uint64_t> size_limits;

  /// Limits on unflushed ios
  pair<uint64_t, uint64_t> io_limits;

  /// Limits on unflushed objects
  pair<uint64_t, uint64_t> fd_limits;

  uint64_t cur_ios;  /// Currently unflushed IOs
  uint64_t cur_size; /// Currently unflushed bytes

  /**
   * PendingWB tracks the ios pending on an object.
   */
  class PendingWB {
  public:
    bool nocache;
    uint64_t size;
    uint64_t ios;
    PendingWB() : nocache(true), size(0), ios(0) {}
    void add(bool _nocache, uint64_t _size, uint64_t _ios) {
      if (!_nocache)
	nocache = false; // only nocache if all writes are nocache
      size += _size;
      ios += _ios;
    }
  };

  CephContext *cct;
  PerfCounters *logger;
  bool stopping;   ///< true once stop() is called; guarded by lock
  Mutex lock;
  Cond cond;


  /**
   * Flush objects in lru order
   *
   * lru holds dirty objects oldest-first; rev_lru maps each object to
   * its lru entry so removal/re-queue is O(1).
   */
  list<ghobject_t> lru;
  ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
  void remove_object(const ghobject_t &oid) {
    ceph_assert(lock.is_locked());
    ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
      rev_lru.find(oid);
    if (iter == rev_lru.end())
      return;

    lru.erase(iter->second);
    rev_lru.erase(iter);
  }
  /// Pop and return the oldest dirty object
  ghobject_t pop_object() {
    ceph_assert(!lru.empty());
    ghobject_t oid(lru.front());
    lru.pop_front();
    rev_lru.erase(oid);
    return oid;
  }
  /// Append oid to the back (newest end) of the lru
  void insert_object(const ghobject_t &oid) {
    ceph_assert(rev_lru.find(oid) == rev_lru.end());
    lru.push_back(oid);
    rev_lru.insert(make_pair(oid, --lru.end()));
  }

  /// Accumulated unflushed writeback per object, plus its fd
  ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;

  /// get next flush to perform
  bool get_next_should_flush(
    boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
    ); ///< @return false if we are shutting down
public:
  enum FS {
    BTRFS,
    XFS
  };

private:
  FS fs;

  void set_from_conf();
  /// True once any start_flusher threshold is reached (wakes flusher)
  bool beyond_limit() const {
    if (cur_ios < io_limits.first &&
	pending_wbs.size() < fd_limits.first &&
	cur_size < size_limits.first)
      return false;
    else
      return true;
  }
  /// True once any hard limit is reached (throttle() blocks writers)
  bool need_flush() const {
    if (cur_ios < io_limits.second &&
	pending_wbs.size() < fd_limits.second &&
	cur_size < size_limits.second)
      return false;
    else
      return true;
  }

public:
  explicit WBThrottle(CephContext *cct);
  ~WBThrottle() override;

  void start();
  void stop();
  /// Set fs as XFS or BTRFS
  void set_fs(FS new_fs) {
    Mutex::Locker l(lock);
    fs = new_fs;
    set_from_conf();
  }

  /// Queue wb on oid, fd taking throttle (does not block)
  void queue_wb(
    FDRef fd,              ///< [in] FDRef to oid
    const ghobject_t &oid, ///< [in] object
    uint64_t offset,       ///< [in] offset written
    uint64_t len,          ///< [in] length written
    bool nocache           ///< [in] try to clear out of cache after write
    );

  /// Clear all wb (probably due to sync)
  void clear();

  /// Clear object
  void clear_object(const ghobject_t &oid);

  /// Block until there is throttle available
  void throttle();

  /// md_config_obs_t
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
			  const std::set<std::string> &changed) override;

  /// Thread
  void *entry() override;
};

#endif
diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc
new file mode 100644
index 00000000..1081d146
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.cc
@@ -0,0 +1,149 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Inktank, Inc
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
+ * + */ + +#include "XfsFileStoreBackend.h" + +#include <errno.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sys/utsname.h> + +#include <xfs/xfs.h> + +#include "common/errno.h" +#include "common/linux_version.h" +#include "include/ceph_assert.h" +#include "include/compat.h" + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") " + +XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs): + GenericFileStoreBackend(fs), m_has_extsize(false) { } + +/* + * Set extsize attr on a file to val. Should be a free-standing + * function, but dout_prefix expanding to a call to get_basedir_path() + * protected member function won't let it. + */ +int XfsFileStoreBackend::set_extsize(int fd, unsigned int val) +{ + struct fsxattr fsx; + struct stat sb; + int ret; + + if (fstat(fd, &sb) < 0) { + ret = -errno; + dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl; + return ret; + } + if (!S_ISREG(sb.st_mode)) { + dout(0) << "set_extsize: invalid target file type" << dendl; + return -EINVAL; + } + + if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) { + ret = -errno; + dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl; + return ret; + } + + // already set? + if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val) + return 0; + + // xfs won't change extent size if any extents are allocated + if (fsx.fsx_nextents != 0) + return 0; + + fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE; + fsx.fsx_extsize = val; + + if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) { + ret = -errno; + dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl; + return ret; + } + + return 0; +} + +int XfsFileStoreBackend::detect_features() +{ + int ret; + + ret = GenericFileStoreBackend::detect_features(); + if (ret < 0) + return ret; + + // extsize? 
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600); + if (fd < 0) { + ret = -errno; + dout(0) << "detect_feature: failed to create test file for extsize attr: " + << cpp_strerror(ret) << dendl; + goto out; + } + if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) { + ret = -errno; + dout(0) << "detect_feature: failed to unlink test file for extsize attr: " + << cpp_strerror(ret) << dendl; + goto out_close; + } + + if (cct()->_conf->filestore_xfs_extsize) { + ret = set_extsize(fd, 1U << 15); // a few pages + if (ret) { + ret = 0; + dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl; + goto out_close; + } + + // make sure we have 3.5 or newer, which includes this fix + // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d + // for this set_extsize bug + // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874 + int ver = get_linux_version(); + if (ver == 0) { + dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl; + m_has_extsize = false; + } else if (ver < KERNEL_VERSION(3, 5, 0)) { + dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl; + m_has_extsize = false; + } else { + dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl; + m_has_extsize = true; + } + } else { + dout(0) << "detect_feature: extsize is disabled by conf" << dendl; + } + +out_close: + TEMP_FAILURE_RETRY(::close(fd)); +out: + return ret; +} + +int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint) +{ + if (!m_has_extsize) + return -EOPNOTSUPP; + + ceph_assert(hint < UINT_MAX); + return set_extsize(fd, hint); +} diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h new file mode 100644 index 00000000..e8b81f9a --- /dev/null +++ b/src/os/filestore/XfsFileStoreBackend.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_XFSFILESTOREBACKEND_H +#define CEPH_XFSFILESTOREBACKEND_H + +#include "GenericFileStoreBackend.h" + +#include "include/int_types.h" + +class XfsFileStoreBackend : public GenericFileStoreBackend { +private: + bool m_has_extsize; + int set_extsize(int fd, unsigned int val); +public: + explicit XfsFileStoreBackend(FileStore *fs); + ~XfsFileStoreBackend() override {} + const char *get_name() override { + return "xfs"; + } + int detect_features() override; + int set_alloc_hint(int fd, uint64_t hint) override; +}; + +#endif /* CEPH_XFSFILESTOREBACKEND_H */ diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc new file mode 100644 index 00000000..e85dbd52 --- /dev/null +++ b/src/os/filestore/ZFSFileStoreBackend.cc @@ -0,0 +1,258 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/int_types.h" +#include "include/types.h" + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> + +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/ceph_assert.h" + +#include <iostream> +#include <fstream> +#include <sstream> + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" + +#include "ZFSFileStoreBackend.h" + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") " + 
+ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) : + GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL), + m_filestore_zfs_snap(cct()->_conf->filestore_zfs_snap) +{ + int ret = zfs.init(); + if (ret < 0) { + dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl; + return; + } + + base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (!base_zh) { + dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl; + return; + } + + update_current_zh(); +} + +ZFSFileStoreBackend::~ZFSFileStoreBackend() +{ + if (base_zh) + zfs.close(base_zh); + if (current_zh) + zfs.close(current_zh); +} + +int ZFSFileStoreBackend::update_current_zh() +{ + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM); + if (zh) { + char *mnt; + if (zfs.is_mounted(zh, &mnt)) { + int ret = get_current_path() == mnt; + free(mnt); + if (ret) { + current_zh = zh; + return 0; + } + } else { + int ret = zfs.mount(zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh) + << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + zfs.close(zh); + } else { + dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (zh) { + if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) { + current_zh = zh; + return 0; + } + zfs.close(zh); + dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl; + } else { + dout(0) << "update_current_zh: current/ not exist" << dendl; + } + return -ENOENT; +} + +int ZFSFileStoreBackend::detect_features() +{ + if (!current_zh) + dout(0) << "detect_features: null zfs handle for current/" << dendl; + return 0; +} + +bool ZFSFileStoreBackend::can_checkpoint() +{ + return m_filestore_zfs_snap && 
current_zh != NULL; +} + +int ZFSFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "create_current: current/ exists but is not a directory" << dendl; + return -ENOTDIR; + } + return 0; + } else if (errno != ENOENT) { + ret = -errno; + dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ret = zfs.create(path, ZFS::TYPE_FILESYSTEM); + if (ret < 0 && errno != EEXIST) { + ret = -errno; + dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + + ret = update_current_zh(); + return ret; +} + +static int list_checkpoints_callback(ZFS::Handle *zh, void *data) +{ + list<string> *ls = static_cast<list<string> *>(data); + string str = ZFS::get_name(zh); + size_t pos = str.find('@'); + ceph_assert(pos != string::npos && pos + 1 != str.length()); + ls->push_back(str.substr(pos + 1)); + return 0; +} + +int ZFSFileStoreBackend::list_checkpoints(list<string>& ls) +{ + dout(10) << "list_checkpoints:" << dendl; + if (!current_zh) + return -EINVAL; + + list<string> snaps; + int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl; + return ret; + } + ls.swap(snaps); + return 0; +} + +int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // looks like zfsonlinux doesn't flush dirty data when taking snapshot + int ret = sync_filesystem(get_current_fd()); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl; + 
return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + ret = zfs.snapshot(path, false); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << dendl; + return ret; + } + if (cid) + *cid = 0; + return 0; +} + +int ZFSFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // umount current to avoid triggering online rollback deadlock + int ret; + if (zfs.is_mounted(current_zh, NULL)) { + ret = zfs.umount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl; + } + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + + ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT); + if (!snap_zh) { + dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + ret = zfs.rollback(current_zh, snap_zh, false); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl; + } + + if (!zfs.is_mounted(current_zh, NULL)) { + int ret = zfs.mount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + + zfs.close(snap_zh); + return ret; +} + +int ZFSFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + int ret = zfs.destroy_snaps(current_zh, name.c_str(), true); + if (ret < 0) { + ret = -errno; + dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl; + } + return ret; +} diff 
--git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h new file mode 100644 index 00000000..b1fa9887 --- /dev/null +++ b/src/os/filestore/ZFSFileStoreBackend.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_ZFSFILESTOREBACKEND_H +#define CEPH_ZFSFILESTOREBACKEND_H + +#ifdef HAVE_LIBZFS +#include "GenericFileStoreBackend.h" +#include "os/fs/ZFS.h" + +class ZFSFileStoreBackend : public GenericFileStoreBackend { +private: + ZFS zfs; + ZFS::Handle *base_zh; + ZFS::Handle *current_zh; + bool m_filestore_zfs_snap; + int update_current_zh(); +public: + explicit ZFSFileStoreBackend(FileStore *fs); + ~ZFSFileStoreBackend(); + const char *get_name() override { + return "zfs"; + } + int detect_features(); + bool can_checkpoint(); + int create_current(); + int list_checkpoints(list<string>& ls); + int create_checkpoint(const string& name, uint64_t *cid); + int rollback_to(const string& name); + int destroy_checkpoint(const string& name); +}; +#endif +#endif diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc new file mode 100644 index 00000000..e4dedd29 --- /dev/null +++ b/src/os/filestore/chain_xattr.cc @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "chain_xattr.h" +#include <errno.h> // for ERANGE, ENODATA, ENOMEM +#include <stdio.h> // for size_t, snprintf +#include <stdlib.h> // for free, malloc +#include <string.h> // for strcpy, strlen +#include "include/ceph_assert.h" // for assert +#include "include/buffer.h" + +#if defined(__linux__) +#include <linux/fs.h> +#endif + +#include "include/ceph_assert.h" + +/* + * chaining xattrs + * + * In order to support xattrs that are larger than the xattr size limit that some file systems + * impose, we use multiple xattrs to store the value of a single xattr. 
/*
 * chaining xattrs
 *
 * In order to support xattrs that are larger than the xattr size limit that some file systems
 * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys
 * are set as follows:
 * The first xattr in the chain, has a key that holds the original xattr name, with any '@' char
 * being esacped ("@@").
 * The chained keys will have the first xattr's key (with the escaping), and a suffix: "@<id>"
 * where <id> marks the num of xattr in the chain.
 */

// Build the on-disk key for chunk #i of logical attr `name` into
// raw_name: every '@' in `name` is doubled, and chunks after the first
// (i > 0) get an "@<i>" suffix.  Asserts if raw_len is too small.
void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
{
  int pos = 0;

  while (*name) {
    switch (*name) {
    case '@': /* escape it */
      pos += 2;
      ceph_assert (pos < raw_len - 1);
      *raw_name = '@';
      raw_name++;
      *raw_name = '@';
      break;
    default:
      pos++;
      ceph_assert(pos < raw_len - 1);
      *raw_name = *name;
      break;
    }
    name++;
    raw_name++;
  }

  if (!i) {
    // first chunk: key is just the escaped name
    *raw_name = '\0';
  } else {
    // later chunks: append "@<i>"
    int r = snprintf(raw_name, raw_len - pos, "@%d", i);
    ceph_assert(r < raw_len - pos);
  }
}

// Inverse of get_raw_xattr_name for listing: un-escape "@@" back to
// '@'.  Sets *is_first=false (and stops) when a bare "@<id>" suffix is
// found, i.e. this raw key is a continuation chunk, not a chain head.
// Returns the length of the translated name.
static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
{
  int pos = 0;

  *is_first = true;
  while (*raw_name) {
    switch (*raw_name) {
    case '@': /* escape it */
      raw_name++;
      if (!*raw_name)
	break;
      if (*raw_name != '@') {
	// single '@' followed by digits: chunk suffix
	*is_first = false;
	goto done;
      }

      /* fall through */
    default:
      *name = *raw_name;
      break;
    }
    pos++;
    ceph_assert(pos < name_len);
    name++;
    raw_name++;
  }
done:
  *name = '\0';
  return pos;
}


// getxattr

// Total length of a chained attr: sum chunk sizes until a chunk is
// missing or is shorter than a full block (meaning it was the last).
static int getxattr_len(const char *fn, const char *name)
{
  int i = 0, total = 0;
  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
  int r;

  do {
    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
    r = sys_getxattr(fn, raw_name, 0, 0);
    if (!i && r < 0)
      return r;  // chain head missing/unreadable: propagate the error
    if (r < 0)
      break;     // end of chain
    total += r;
    i++;
  } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
	   r == CHAIN_XATTR_SHORT_BLOCK_LEN);

  return total;
}

// Read a (possibly chained) attr into val.  size == 0 is a length
// probe.  Returns bytes read, or -ERANGE if the value did not fit.
int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
{
  int i = 0, pos = 0;
  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
  int ret = 0;
  int r;
  size_t chunk_size;

  if (!size)
    return getxattr_len(fn, name);

  do {
    chunk_size = size;
    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));

    r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
    if (i && r == -ENODATA) {
      // no chunk #i: chain ended exactly at a block boundary
      ret = pos;
      break;
    }
    if (r < 0) {
      ret = r;
      break;
    }

    if (r > 0) {
      pos += r;
      size -= r;
    }

    i++;
    // a full-block chunk means there may be another chunk after it
  } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
		    r == CHAIN_XATTR_SHORT_BLOCK_LEN));

  if (r >= 0) {
    ret = pos;
    /* is there another chunk? that can happen if the last read size span over
       exactly one block */
    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
	chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
      r = sys_getxattr(fn, raw_name, 0, 0);
      if (r > 0) { // there's another chunk.. the original buffer was too small
	ret = -ERANGE;
      }
    }
  }
  return ret;
}

// Read an attr into a bufferptr, growing the buffer (doubling from
// 1 KiB) until the value fits.  Returns bytes read, 0 for empty, or a
// negative error.
int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp)
{
  size_t size = 1024; // Initial
  while (1) {
    bufferptr buf(size);
    int r = chain_getxattr(
      fn,
      name,
      buf.c_str(),
      size);
    if (r > 0) {
      buf.set_length(r);
      if (bp)
	bp->swap(buf);
      return r;
    } else if (r == 0) {
      return 0;
    } else {
      if (r == -ERANGE) {
	size *= 2;  // too small: retry with a bigger buffer
      } else {
	return r;
      }
    }
  }
  ceph_abort_msg("unreachable");
  return 0;
}

// fd-based variant of getxattr_len
static int chain_fgetxattr_len(int fd, const char *name)
{
  int i = 0, total = 0;
  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
  int r;

  do {
    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
    r = sys_fgetxattr(fd, raw_name, 0, 0);
    if (!i && r < 0)
      return r;
    if (r < 0)
      break;
    total += r;
    i++;
  } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
	   r == CHAIN_XATTR_SHORT_BLOCK_LEN);

  return total;
}

// fd-based variant of chain_getxattr; identical chunk-walk logic.
int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
{
  int i = 0, pos = 0;
  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
  int ret = 0;
  int r;
  size_t chunk_size;

  if (!size)
    return chain_fgetxattr_len(fd, name);

  do {
    chunk_size = size;
    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));

    r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
    if (i && r == -ENODATA) {
      ret = pos;
      break;
    }
    if (r < 0) {
      ret = r;
      break;
    }

    if (r > 0) {
      pos += r;
      size -= r;
    }

    i++;
  } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
		    r == CHAIN_XATTR_SHORT_BLOCK_LEN));

  if (r >= 0) {
    ret = pos;
    /* is there another chunk? that can happen if the last read size span over
       exactly one block */
    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
	chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
      r = sys_fgetxattr(fd, raw_name, 0, 0);
      if (r > 0) { // there's another chunk.. the original buffer was too small
	ret = -ERANGE;
      }
    }
  }
  return ret;
}


// setxattr

// Chunk size used when striping a value of `size` bytes across xattrs.
int get_xattr_block_size(size_t size)
{
  if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
    // this may fit in the inode; stripe over short attrs so that XFS
    // won't kick it out.
    return CHAIN_XATTR_SHORT_BLOCK_LEN;
  return CHAIN_XATTR_MAX_BLOCK_LEN;
}
+ return CHAIN_XATTR_SHORT_BLOCK_LEN; + return CHAIN_XATTR_MAX_BLOCK_LEN; +} + +// removexattr + +int chain_removexattr(const char *fn, const char *name) +{ + int i = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_removexattr(fn, raw_name); + if (!i && r < 0) { + return r; + } + i++; + } while (r >= 0); + return 0; +} + +int chain_fremovexattr(int fd, const char *name) +{ + int i = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fremovexattr(fd, raw_name); + if (!i && r < 0) { + return r; + } + i++; + } while (r >= 0); + return 0; +} + + +// listxattr + +int chain_listxattr(const char *fn, char *names, size_t len) { + int r; + + if (!len) + return sys_listxattr(fn, names, len) * 2; + + r = sys_listxattr(fn, 0, 0); + if (r < 0) + return r; + + size_t total_len = r * 2; // should be enough + char *full_buf = (char *)malloc(total_len); + if (!full_buf) + return -ENOMEM; + + r = sys_listxattr(fn, full_buf, total_len); + if (r < 0) { + free(full_buf); + return r; + } + + char *p = full_buf; + const char *end = full_buf + r; + char *dest = names; + char *dest_end = names + len; + + while (p < end) { + char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int attr_len = strlen(p); + bool is_first; + int name_len = translate_raw_name(p, name, sizeof(name), &is_first); + if (is_first) { + if (dest + name_len > dest_end) { + r = -ERANGE; + goto done; + } + strcpy(dest, name); + dest += name_len + 1; + } + p += attr_len + 1; + } + r = dest - names; + +done: + free(full_buf); + return r; +} + +int chain_flistxattr(int fd, char *names, size_t len) { + int r; + char *p; + const char * end; + char *dest; + char *dest_end; + + if (!len) + return sys_flistxattr(fd, names, len) * 2; + + r = sys_flistxattr(fd, 0, 0); + if (r < 0) + return r; + + size_t total_len = r * 2; // should be enough + char 
*full_buf = (char *)malloc(total_len); + if (!full_buf) + return -ENOMEM; + + r = sys_flistxattr(fd, full_buf, total_len); + if (r < 0) + goto done; + + p = full_buf; + end = full_buf + r; + dest = names; + dest_end = names + len; + + while (p < end) { + char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int attr_len = strlen(p); + bool is_first; + int name_len = translate_raw_name(p, name, sizeof(name), &is_first); + if (is_first) { + if (dest + name_len > dest_end) { + r = -ERANGE; + goto done; + } + strcpy(dest, name); + dest += name_len + 1; + } + p += attr_len + 1; + } + r = dest - names; + +done: + free(full_buf); + return r; +} diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h new file mode 100644 index 00000000..a2d17fa6 --- /dev/null +++ b/src/os/filestore/chain_xattr.h @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CEPH_OSD_CHAIN_XATTR_H +#define __CEPH_OSD_CHAIN_XATTR_H + +#include "include/compat.h" +#include <errno.h> +#include <stdio.h> +#include "common/xattr.h" +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" + +#if defined(__linux__) +#include <linux/limits.h> +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2) +#elif defined(__APPLE__) +#include <sys/xattr.h> +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2) +#else +#define CHAIN_XATTR_MAX_NAME_LEN 128 +#endif + +#define CHAIN_XATTR_MAX_BLOCK_LEN 2048 + +/* + * XFS will only inline xattrs < 255 bytes, so for xattrs that are + * likely to fit in the inode, stripe over short xattrs. + */ +#define CHAIN_XATTR_SHORT_BLOCK_LEN 250 +#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000 + +// wrappers to hide annoying errno handling. + +static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size) +{ + int r = ::ceph_os_fgetxattr(fd, name, val, size); + return (r < 0 ? 
-errno : r); +} +static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size) +{ + int r = ::ceph_os_getxattr(fn, name, val, size); + return (r < 0 ? -errno : r); +} + +static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size) +{ + int r = ::ceph_os_setxattr(fn, name, val, size); + return (r < 0 ? -errno : r); +} +static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size) +{ + int r = ::ceph_os_fsetxattr(fd, name, val, size); + return (r < 0 ? -errno : r); +} + +static inline int sys_listxattr(const char *fn, char *names, size_t len) +{ + int r = ::ceph_os_listxattr(fn, names, len); + return (r < 0 ? -errno : r); +} +static inline int sys_flistxattr(int fd, char *names, size_t len) +{ + int r = ::ceph_os_flistxattr(fd, names, len); + return (r < 0 ? -errno : r); +} + +static inline int sys_removexattr(const char *fn, const char *name) +{ + int r = ::ceph_os_removexattr(fn, name); + return (r < 0 ? -errno : r); +} +static inline int sys_fremovexattr(int fd, const char *name) +{ + int r = ::ceph_os_fremovexattr(fd, name); + return (r < 0 ? -errno : r); +} + + +// wrappers to chain large values across multiple xattrs + +int chain_getxattr(const char *fn, const char *name, void *val, size_t size); +int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp); +int chain_fgetxattr(int fd, const char *name, void *val, size_t size); + +int get_xattr_block_size(size_t size); +void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len); + +template <bool skip_chain_cleanup=false, bool ensure_single_attr=false> +int chain_setxattr( + const char *fn, const char *name, const void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + size_t max_chunk_size = + ensure_single_attr ? 
size : get_xattr_block_size(size); + + static_assert( + !skip_chain_cleanup || ensure_single_attr, + "skip_chain_cleanup must imply ensure_single_attr"); + + do { + size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + size -= chunk_size; + + int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size); + if (r < 0) { + ret = r; + break; + } + pos += chunk_size; + ret = pos; + i++; + ceph_assert(size == 0 || !ensure_single_attr); + } while (size); + + if (ret >= 0 && !skip_chain_cleanup) { + int r; + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_removexattr(fn, raw_name); + if (r < 0 && r != -ENODATA) + ret = r; + i++; + } while (r != -ENODATA); + } + + return ret; +} + +template <bool skip_chain_cleanup=false, bool ensure_single_attr=false> +int chain_fsetxattr( + int fd, const char *name, const void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + size_t max_chunk_size = + ensure_single_attr ? size : get_xattr_block_size(size); + + static_assert( + !skip_chain_cleanup || ensure_single_attr, + "skip_chain_cleanup must imply ensure_single_attr"); + + do { + size_t chunk_size = (size < max_chunk_size ? 
size : max_chunk_size); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + size -= chunk_size; + + int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size); + if (r < 0) { + ret = r; + break; + } + pos += chunk_size; + ret = pos; + i++; + ceph_assert(size == 0 || !ensure_single_attr); + } while (size); + + if (ret >= 0 && !skip_chain_cleanup) { + int r; + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fremovexattr(fd, raw_name); + if (r < 0 && r != -ENODATA) + ret = r; + i++; + } while (r != -ENODATA); + } + + return ret; +} + +int chain_listxattr(const char *fn, char *names, size_t len); +int chain_flistxattr(int fd, char *names, size_t len); +int chain_removexattr(const char *fn, const char *name); +int chain_fremovexattr(int fd, const char *name); + +#endif |