summary refs log tree commit diff stats
path: root/src/os/filestore
diff options
context:
space:
mode:
Diffstat (limited to 'src/os/filestore')
-rw-r--r-- src/os/filestore/BtrfsFileStoreBackend.cc 575
-rw-r--r-- src/os/filestore/BtrfsFileStoreBackend.h 49
-rw-r--r-- src/os/filestore/CollectionIndex.h 207
-rw-r--r-- src/os/filestore/DBObjectMap.cc 1415
-rw-r--r-- src/os/filestore/DBObjectMap.h 585
-rw-r--r-- src/os/filestore/FDCache.h 112
-rw-r--r-- src/os/filestore/FileJournal.cc 2216
-rw-r--r-- src/os/filestore/FileJournal.h 556
-rw-r--r-- src/os/filestore/FileStore.cc 6425
-rw-r--r-- src/os/filestore/FileStore.h 938
-rw-r--r-- src/os/filestore/GenericFileStoreBackend.cc 468
-rw-r--r-- src/os/filestore/GenericFileStoreBackend.h 75
-rw-r--r-- src/os/filestore/HashIndex.cc 1195
-rw-r--r-- src/os/filestore/HashIndex.h 462
-rw-r--r-- src/os/filestore/IndexManager.cc 151
-rw-r--r-- src/os/filestore/IndexManager.h 99
-rw-r--r-- src/os/filestore/Journal.h 94
-rw-r--r-- src/os/filestore/JournalThrottle.cc 67
-rw-r--r-- src/os/filestore/JournalThrottle.h 101
-rw-r--r-- src/os/filestore/JournalingObjectStore.cc 271
-rw-r--r-- src/os/filestore/JournalingObjectStore.h 147
-rw-r--r-- src/os/filestore/LFNIndex.cc 1407
-rw-r--r-- src/os/filestore/LFNIndex.h 614
-rw-r--r-- src/os/filestore/SequencerPosition.h 59
-rw-r--r-- src/os/filestore/WBThrottle.cc 272
-rw-r--r-- src/os/filestore/WBThrottle.h 187
-rw-r--r-- src/os/filestore/XfsFileStoreBackend.cc 149
-rw-r--r-- src/os/filestore/XfsFileStoreBackend.h 36
-rw-r--r-- src/os/filestore/ZFSFileStoreBackend.cc 258
-rw-r--r-- src/os/filestore/ZFSFileStoreBackend.h 33
-rw-r--r-- src/os/filestore/chain_xattr.cc 413
-rw-r--r-- src/os/filestore/chain_xattr.h 182
32 files changed, 19818 insertions, 0 deletions
diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc
new file mode 100644
index 00000000..2ff2000d
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.cc
@@ -0,0 +1,575 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#ifndef __CYGWIN__
+#include "os/fs/btrfs_ioctl.h"
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "BtrfsFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+
+#if defined(__linux__)
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+/// Build the btrfs backend with every feature flag cleared. The flags are only
+/// raised later (detect_features() / create_current()); the two
+/// filestore_btrfs_* options are read from the config once, here.
+BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs)
+  : GenericFileStoreBackend(fs),
+    has_clone_range(false),
+    has_snap_create(false),
+    has_snap_destroy(false),
+    has_snap_create_v2(false),
+    has_wait_sync(false),
+    stable_commits(false),
+    m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range),
+    m_filestore_btrfs_snap(cct()->_conf->filestore_btrfs_snap)
+{
+}
+/**
+ * Probe which btrfs ioctls work on the base directory and record the results
+ * in the has_* flags and stable_commits.
+ *
+ * Runs GenericFileStoreBackend::detect_features() first and returns its error
+ * if that fails. After that every probe failure is only logged: the function
+ * returns 0 regardless of which btrfs features were found.
+ *
+ * Probes, in order: CLONE_RANGE (only when enabled by config), SNAP_CREATE and
+ * SNAP_DESTROY, START_SYNC and WAIT_SYNC, and SNAP_CREATE_V2 (only when
+ * WAIT_SYNC worked). Scratch names used under the base directory are
+ * "clone_range_test", "test_subvol", "sync_snap_test" and "async_snap_test".
+ *
+ * @return 0, or the negative error from the generic backend's detection.
+ */
+int BtrfsFileStoreBackend::detect_features()
+{
+ int r;
+
+ r = GenericFileStoreBackend::detect_features();
+ if (r < 0)
+ return r;
+
+ // clone_range?
+ if (m_filestore_btrfs_clone_range) {
+ int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600);
+ if (fd >= 0) {
+ if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) { // drop the name right away; the open fd is still used below
+ r = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ btrfs_ioctl_clone_range_args clone_args;
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.src_fd = -1; // deliberately invalid source fd
+ r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
+ if (r < 0 && errno == EBADF) { // EBADF is taken to mean the ioctl exists and rejected the bad src_fd
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
+ has_clone_range = true;
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ } else {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
+ }
+
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+
+ // create test source volume
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
+ if (r != 0) { // logged only; the probes below still run
+ r = -errno;
+ dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+ int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC);
+ if (srcfd < 0) { // logged only; srcfd (possibly negative) is still handed to the snapshot probes
+ r = -errno;
+ dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ // snap_create and snap_destroy?
+ vol_args.fd = srcfd;
+ strcpy(vol_args.name, "sync_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ int err = errno; // saved before any logging can disturb errno
+ if (r == 0 || errno == EEXIST) { // EEXIST also counts as supported
+ dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
+ has_snap_create = true;
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r == 0) {
+ dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
+ has_snap_destroy = true;
+ } else {
+ err = -errno; // note: negative here, unlike the positive value saved above
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+
+ if (err == -EPERM && getuid() != 0) {
+ dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
+ cerr << TEXT_YELLOW
+ << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
+ << TEXT_NORMAL << std::endl;
+ } else if (err == -EOPNOTSUPP) {
+ derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
+ }
+ }
+ } else {
+ dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap) { // checkpoints are enabled only when snapshots can also be destroyed
+ if (has_snap_destroy)
+ stable_commits = true;
+ else
+ dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
+ }
+
+ // start_sync?
+ __u64 transid = 0;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
+ }
+ if (r == 0 && transid > 0) { // success must also have produced a non-zero transid
+ dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
+
+ // do we have wait_sync too?
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (r == 0 || errno == ERANGE) { // ERANGE is also accepted as evidence the ioctl exists
+ dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
+ has_wait_sync = true;
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno; // NOTE(review): errno is re-read after the log statement above and may be stale
+ dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+
+ if (has_wait_sync) {
+ // async snap creation?
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = srcfd;
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+ strcpy(async_args.name, "async_snap_test");
+
+ // remove old one, first
+ struct stat st;
+ strcpy(vol_args.name, async_args.name);
+ if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
+ dout(0) << "detect_feature: removing old async_snap_test" << dendl;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
+ has_snap_create_v2 = true;
+
+ // clean up
+ strcpy(vol_args.name, "async_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ // clean up test subvol
+ if (srcfd >= 0)
+ TEMP_FAILURE_RETRY(::close(srcfd));
+
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap && !has_snap_create_v2) { // warn in the log and on stderr; not treated as an error
+ dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
+ << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
+ << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
+ << " performance.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0; // probe failures above are logged, never propagated
+}
+/// Report whether checkpoints can be used; this is simply the stable_commits flag.
+bool BtrfsFileStoreBackend::can_checkpoint()
+{
+ return stable_commits;
+}
+
+/**
+ * Make sure the directory at get_current_path() exists, creating it as a
+ * btrfs subvolume named "current" under the base directory if it does not.
+ *
+ * When it already exists it must be a directory. stable_commits is then turned
+ * on if it is on btrfs and has a different st_dev than the base directory,
+ * i.e. it looks like a subvolume. Otherwise a new subvolume is created,
+ * chmod'ed to 0755, and stable_commits is turned on.
+ *
+ * @return 0 on success, -EINVAL if the path exists but is not a directory,
+ *         otherwise the negative errno of the failing call.
+ */
+int BtrfsFileStoreBackend::create_current()
+{
+  struct stat st;
+  int ret = ::stat(get_current_path().c_str(), &st);
+  if (ret == 0) {
+    // current/ exists
+    if (!S_ISDIR(st.st_mode)) {
+      dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+      return -EINVAL;
+    }
+
+    struct stat basest;
+    struct statfs currentfs;
+    ret = ::fstat(get_basedir_fd(), &basest);
+    if (ret < 0) {
+      ret = -errno;
+      dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+    ret = ::statfs(get_current_path().c_str(), &currentfs);
+    if (ret < 0) {
+      ret = -errno;
+      // The failing call is statfs() on current/, not on the base directory.
+      dout(0) << "create_current: cannot statfs current " << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+    // A differing st_dev is what distinguishes a subvolume from a plain
+    // directory inside the same btrfs filesystem.
+    if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
+      dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
+      stable_commits = true;
+    }
+    return 0;
+  }
+
+  // stat failed: treat current/ as missing and create it as a subvolume.
+  struct btrfs_ioctl_vol_args volargs;
+  memset(&volargs, 0, sizeof(volargs));
+
+  volargs.fd = 0;
+  strcpy(volargs.name, "current");
+  if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &volargs) < 0) {
+    ret = -errno;
+    dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
+            << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
+  if (::chmod(get_current_path().c_str(), 0755) < 0) {
+    ret = -errno;
+    dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
+            << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  stable_commits = true;
+  return 0;
+}
+
+/**
+ * Collect the names of the btrfs subvolumes found directly under the base
+ * directory.
+ *
+ * An entry qualifies when it is a directory, statfs() reports btrfs, and its
+ * st_dev differs from the base directory's. On success @p ls is replaced by
+ * the result; on any error @p ls is left untouched.
+ *
+ * @param ls [out] receives the subvolume names
+ * @return 0 on success, otherwise the negative errno of the first failure
+ */
+int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+  struct stat base_st;
+  if (::fstat(get_basedir_fd(), &base_st) < 0) {
+    int r = -errno;
+    dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // get snap list
+  DIR *dirp = ::opendir(get_basedir_path().c_str());
+  if (dirp == NULL) {
+    int r = -errno;
+    dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
+            << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  list<string> found;
+  int first_err = 0;
+  char entry_path[PATH_MAX];
+  for (struct dirent *ent = ::readdir(dirp); ent != NULL; ent = ::readdir(dirp)) {
+    snprintf(entry_path, sizeof(entry_path), "%s/%s", get_basedir_path().c_str(), ent->d_name);
+
+    struct stat entry_st;
+    if (::stat(entry_path, &entry_st) < 0) {
+      first_err = -errno;
+      dout(0) << "list_checkpoints: stat '" << entry_path << "' failed: "
+              << cpp_strerror(first_err) << dendl;
+      break;
+    }
+
+    if (!S_ISDIR(entry_st.st_mode))
+      continue;
+
+    struct statfs entry_fs;
+    if (::statfs(entry_path, &entry_fs) < 0) {
+      first_err = -errno;
+      dout(0) << "list_checkpoints: statfs '" << entry_path << "' failed: "
+              << cpp_strerror(first_err) << dendl;
+      break;
+    }
+
+    const bool on_btrfs = (entry_fs.f_type == BTRFS_SUPER_MAGIC);
+    const bool other_dev = (base_st.st_dev != entry_st.st_dev);
+    if (on_btrfs && other_dev)
+      found.push_back(string(ent->d_name));
+  }
+
+  // Always close the directory; a close failure is only reported when the
+  // scan itself did not already fail.
+  if (::closedir(dirp) < 0) {
+    int r = -errno;
+    dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(r) << dendl;
+    if (first_err == 0)
+      first_err = r;
+  }
+
+  if (first_err != 0)
+    return first_err;
+
+  ls.swap(found);
+  return 0;
+}
+
+/**
+ * Snapshot the current subvolume under the base directory as @p name.
+ *
+ * Uses the asynchronous SNAP_CREATE_V2 ioctl when it was detected and the
+ * caller supplied @p transid; otherwise falls back to the plain SNAP_CREATE
+ * ioctl. Names longer than the ioctl's name buffer are truncated.
+ *
+ * @param name    snapshot name
+ * @param transid [out, optional] transaction id reported by the async ioctl,
+ *                or 0 when the plain ioctl was used
+ * @return 0 on success, negative errno on ioctl failure
+ */
+int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
+{
+  dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+
+  if (has_snap_create_v2 && transid) {
+    struct btrfs_ioctl_vol_args_v2 args_v2;
+    memset(&args_v2, 0, sizeof(args_v2));
+    args_v2.fd = get_current_fd();
+    args_v2.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+
+    const size_t cap = sizeof(args_v2.name);
+    strncpy(args_v2.name, name.c_str(), cap);
+    args_v2.name[cap - 1] = '\0';
+
+    int rc = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &args_v2);
+    if (rc < 0) {
+      rc = -errno;
+      dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(rc) << dendl;
+      return rc;
+    }
+    dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << args_v2.transid << dendl;
+    *transid = args_v2.transid;
+    return 0;
+  }
+
+  struct btrfs_ioctl_vol_args args;
+  memset(&args, 0, sizeof(args));
+  args.fd = get_current_fd();
+
+  const size_t cap = sizeof(args.name);
+  strncpy(args.name, name.c_str(), cap);
+  args.name[cap - 1] = '\0';
+
+  int rc = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &args);
+  if (rc < 0) {
+    rc = -errno;
+    dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(rc) << dendl;
+    return rc;
+  }
+  if (transid)
+    *transid = 0;
+  return 0;
+}
+
+/**
+ * Block until the btrfs transaction @p transid has been committed, using the
+ * WAIT_SYNC ioctl on the op fd.
+ *
+ * @param transid transaction id, as handed out by create_checkpoint()
+ * @return 0 on success, negative errno if the ioctl fails
+ */
+int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
+{
+  // wait for commit
+  dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
+  int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+  if (ret < 0) {
+    ret = -errno;
+    dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
+    // Return the saved value: errno may have been overwritten while logging.
+    return ret;
+  }
+  dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
+  return 0;
+}
+
+/**
+ * Replace the "current" subvolume with a fresh snapshot of checkpoint @p name.
+ *
+ * The old "current" is destroyed first. If that fails for any reason other
+ * than ENOENT it is renamed aside to "current.remove.me.<random>" instead.
+ * Then <basedir>/<name> is opened and snapshotted as "current".
+ *
+ * @param name checkpoint (snapshot) to roll back to
+ * @return 0 on success, negative errno of the failing rename/open/ioctl
+ */
+int BtrfsFileStoreBackend::rollback_to(const string& name)
+{
+  dout(10) << "rollback_to: to '" << name << "'" << dendl;
+  char s[PATH_MAX];
+  btrfs_ioctl_vol_args vol_args;
+
+  memset(&vol_args, 0, sizeof(vol_args));
+  vol_args.fd = 0;
+  strcpy(vol_args.name, "current");
+
+  int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+  if (ret && errno != ENOENT) {
+    // Capture errno before logging; the ioctl's return value is not an error code.
+    ret = -errno;
+    dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
+    snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
+    if (::rename(get_current_path().c_str(), s)) {
+      ret = -errno;
+      dout(0) << "rollback_to: error renaming old current subvol: "
+              << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+  }
+
+  snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
+
+  // roll back: vol_args.name is still "current", so this snapshots the
+  // checkpoint back into place under that name.
+  vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC);
+  if (vol_args.fd < 0) {
+    ret = -errno;
+    dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+  if (ret < 0) {
+    ret = -errno;
+    dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
+  }
+  TEMP_FAILURE_RETRY(::close(vol_args.fd));
+  return ret;
+}
+
+/**
+ * Destroy the snapshot @p name under the base directory with the SNAP_DESTROY
+ * ioctl. Names longer than the ioctl's name buffer are truncated, the same way
+ * create_checkpoint() truncates them.
+ *
+ * @param name snapshot to remove
+ * @return 0 on success, negative errno if the ioctl fails
+ */
+int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
+{
+  dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+  btrfs_ioctl_vol_args vol_args;
+  memset(&vol_args, 0, sizeof(vol_args));
+  vol_args.fd = 0;
+
+  // strncpy() does not terminate the buffer when the source fills it, so
+  // terminate explicitly before handing the name to the kernel.
+  size_t name_size = sizeof(vol_args.name);
+  strncpy(vol_args.name, name.c_str(), name_size);
+  vol_args.name[name_size-1] = '\0';
+
+  int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+  if (ret) {
+    ret = -errno;
+    dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+/**
+ * Issue the BTRFS_IOC_SYNC ioctl on the op fd to do a full btrfs commit.
+ *
+ * @return the ioctl's (non-negative) result on success, negative errno on failure
+ */
+int BtrfsFileStoreBackend::syncfs()
+{
+  dout(15) << "syncfs" << dendl;
+
+  const int rc = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
+  if (rc >= 0)
+    return rc;
+
+  const int err = -errno;
+  dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(err) << dendl;
+  return err;
+}
+/**
+ * Copy len bytes from fd `from` at srcoff to fd `to` at dstoff, using the
+ * BTRFS_IOC_CLONE_RANGE ioctl for the block-aligned part when possible.
+ *
+ * Falls back to _copy_range() for the whole extent when the ioctl was not
+ * detected, when srcoff and dstoff differ modulo the block size, when no
+ * aligned region fits inside the extent, or when the ioctl fails with EINVAL.
+ * Otherwise the aligned middle is cloned and any unaligned head and tail are
+ * filled in with _copy_range().
+ *
+ * @return negative errno on failure; otherwise the sum of the ioctl's return
+ *         value and those of the head/tail _copy_range() calls, or whatever
+ *         _copy_range() returns when it handles the whole extent.
+ */
+int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
+ size_t blk_size = get_blksize();
+ if (!has_clone_range ||
+ srcoff % blk_size != dstoff % blk_size) { // both offsets must sit at the same position within a block
+ dout(20) << "clone_range: using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ int err = 0;
+ int r = 0;
+
+ uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size); // first block boundary at or after srcoff
+ uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size); // shifts by the same amount, given the check above
+ if (srcoffclone >= srcoff + len) {
+ dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ uint64_t lenclone = len - (srcoffclone - srcoff); // bytes left once the unaligned head is excluded
+ if (!ALIGNED(lenclone, blk_size)) {
+ struct stat from_stat, to_stat;
+ err = ::fstat(from, &from_stat);
+ if (err) return -errno;
+ err = ::fstat(to , &to_stat);
+ if (err) return -errno;
+
+ if (srcoff + len != (uint64_t)from_stat.st_size ||
+ dstoff + len < (uint64_t)to_stat.st_size) { // unaligned tail is kept only if it ends at source EOF and reaches the destination's end
+ // Not to the end of the file, need to align length as well
+ lenclone = ALIGN_DOWN(lenclone, blk_size);
+ }
+ }
+ if (lenclone == 0) {
+ // too short
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
+ << " to " << dstoffclone << " = " << r << dendl; // r is still 0 here: nothing has been cloned yet
+ btrfs_ioctl_clone_range_args a;
+ a.src_fd = from;
+ a.src_offset = srcoffclone;
+ a.src_length = lenclone;
+ a.dest_offset = dstoffclone;
+ err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
+ if (err >= 0) {
+ r += err;
+ } else if (errno == EINVAL) {
+ // Still failed, might be compressed
+ dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return -errno;
+ }
+
+ // Take care of any bytes trimmed from the front
+ if (srcoffclone != srcoff) {
+ err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+
+ // Copy end
+ if (srcoffclone + lenclone != srcoff + len) { // something is left after the cloned region
+ err = _copy_range(from, to,
+ srcoffclone + lenclone,
+ (srcoff + len) - (srcoffclone + lenclone),
+ dstoffclone + lenclone);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+ dout(20) << "clone_range: finished " << srcoff << "~" << len
+ << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+#endif
diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h
new file mode 100644
index 00000000..0794be2d
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BTRFSFILESTOREBACKEDN_H
+#define CEPH_BTRFSFILESTOREBACKEDN_H
+
+#if defined(__linux__)
+#include "GenericFileStoreBackend.h"
+/**
+ * FileStore backend for btrfs, derived from GenericFileStoreBackend.
+ *
+ * Overrides feature detection, checkpoint (snapshot) handling, syncfs and
+ * clone_range. The has_* flags record which btrfs ioctls detect_features()
+ * found to be working.
+ */
+class BtrfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool has_clone_range; ///< clone range ioctl is supported
+ bool has_snap_create; ///< snap create ioctl is supported
+ bool has_snap_destroy; ///< snap destroy ioctl is supported
+ bool has_snap_create_v2; ///< snap create v2 ioctl (async!) is supported
+ bool has_wait_sync; ///< wait sync ioctl is supported
+ bool stable_commits; ///< value reported by can_checkpoint()
+ bool m_filestore_btrfs_clone_range; ///< copy of the filestore_btrfs_clone_range config option
+ bool m_filestore_btrfs_snap; ///< copy of the filestore_btrfs_snap config option
+public:
+ explicit BtrfsFileStoreBackend(FileStore *fs);
+ ~BtrfsFileStoreBackend() override {}
+ const char *get_name() override {
+ return "btrfs";
+ }
+ int detect_features() override;
+ bool can_checkpoint() override;
+ int create_current() override;
+ int list_checkpoints(list<string>& ls) override;
+ int create_checkpoint(const string& name, uint64_t *cid) override; // cid: optional out-param for the transaction id
+ int sync_checkpoint(uint64_t cid) override; // cid: transaction id to wait for
+ int rollback_to(const string& name) override;
+ int destroy_checkpoint(const string& name) override;
+ int syncfs() override;
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override;
+};
+#endif
+#endif
diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h
new file mode 100644
index 00000000..eb43e47d
--- /dev/null
+++ b/src/os/filestore/CollectionIndex.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OS_COLLECTIONINDEX_H
+#define OS_COLLECTIONINDEX_H
+
+#include <string>
+#include <vector>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/RWLock.h"
+
+/**
+ CollectionIndex provides an interface for manipulating indexed collections
+ */
+class CollectionIndex {
+public:
+  CephContext* cct;
+protected:
+  /**
+   * Object encapsulating a returned path.
+   *
+   * A path to an object (existent or non-existent) becomes invalid
+   * when a different object is created in the index. Path holds a
+   * raw pointer to the CollectionIndex that produced it (it does not
+   * manage that index's lifetime) plus a copy of the index's coll_t.
+   * @see IndexManager
+   */
+  class Path {
+  public:
+    /// Returned path
+    string full_path;
+    /// Parent Index; null when built with the debugging constructor
+    CollectionIndex* parent_ref;
+    /// coll_t for parent Index
+    coll_t parent_coll;
+
+    /// Normal Constructor
+    Path(
+      string path,              ///< [in] Path to return.
+      CollectionIndex* ref)     ///< [in] Parent index; dereferenced here, so must not be null.
+      : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {}
+
+    /// Debugging Constructor: there is no parent index, so get_index()
+    /// returns null instead of an uninitialized pointer.
+    Path(
+      string path,              ///< [in] Path to return.
+      const coll_t& coll)       ///< [in] collection
+      : full_path(path), parent_ref(nullptr), parent_coll(coll) {}
+
+    /// Getter for the stored path.
+    const char *path() const { return full_path.c_str(); }
+
+    /// Getter for collection
+    const coll_t& coll() const { return parent_coll; }
+
+    /// Getter for parent (null for paths made by the debugging constructor)
+    CollectionIndex* get_index() const {
+      return parent_ref;
+    }
+  };
+ public:
+
+  RWLock access_lock;
+  /// Type of returned paths
+  typedef std::shared_ptr<Path> IndexedPath;
+
+  /// Build an IndexedPath with no parent index, using Path's debugging constructor.
+  static IndexedPath get_testing_path(string path, coll_t collection) {
+    return std::make_shared<Path>(path, collection);
+  }
+
+  static const uint32_t FLAT_INDEX_TAG = 0;
+  static const uint32_t HASH_INDEX_TAG = 1;
+  static const uint32_t HASH_INDEX_TAG_2 = 2;
+  static const uint32_t HOBJECT_WITH_POOL = 3;
+  /**
+   * For tracking Filestore collection versions.
+   *
+   * @return Collection version represented by the Index implementation
+   */
+  virtual uint32_t collection_version() = 0;
+
+  /**
+   * Returns the collection managed by this CollectionIndex
+   */
+  virtual coll_t coll() const = 0;
+
+
+  /**
+   * Initializes the index.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int init() = 0;
+
+  /**
+   * Cleanup before replaying journal
+   *
+   * Index implementations may need to perform compound operations
+   * which may leave the collection unstable if interrupted. cleanup
+   * is called on mount to allow the CollectionIndex implementation
+   * to stabilize.
+   *
+   * @see HashIndex
+   * @return Error Code, 0 for success
+   */
+  virtual int cleanup() = 0;
+
+  /**
+   * Call when a file is created using a path returned from lookup.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int created(
+    const ghobject_t &oid, ///< [in] Created object.
+    const char *path       ///< [in] Path to created object.
+    ) = 0;
+
+  /**
+   * Removes oid from the collection
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int unlink(
+    const ghobject_t &oid ///< [in] Object to remove
+    ) = 0;
+
+  /**
+   * Gets the IndexedPath for oid.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int lookup(
+    const ghobject_t &oid, ///< [in] Object to lookup
+    IndexedPath *path,     ///< [out] Path to object
+    int *hardlink          ///< [out] Number of hard links of this object; *hardlink == 0 means the object does not exist.
+    ) = 0;
+
+  /**
+   * Moves objects matching @e match in the lsb @e bits
+   *
+   * dest and this must be the same subclass
+   *
+   * The default implementation aborts; subclasses that support
+   * splitting must override it.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int split(
+    uint32_t match,       ///< [in] value to match
+    uint32_t bits,        ///< [in] bits to check
+    CollectionIndex* dest ///< [in] destination index
+    ) { ceph_abort(); return 0; }
+
+  /**
+   * Merge counterpart of split.
+   *
+   * The default implementation aborts; subclasses that support
+   * merging must override it.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int merge(
+    uint32_t bits,        ///< [in] common (target) bits
+    CollectionIndex* dest ///< [in] destination index
+    ) { ceph_abort(); return 0; }
+
+
+  /// List contents of collection by hash
+  virtual int collection_list_partial(
+    const ghobject_t &start, ///< [in] object at which to start
+    const ghobject_t &end,   ///< [in] list only objects < end
+    int max_count,           ///< [in] return at most max_count objects
+    vector<ghobject_t> *ls,  ///< [out] Listed objects
+    ghobject_t *next         ///< [out] Next object to list
+    ) = 0;
+
+  /// Call prior to removing directory
+  virtual int prep_delete() { return 0; }
+
+  /// Stores cct and sets up access_lock; @p collection is not used here.
+  CollectionIndex(CephContext* cct, const coll_t& collection)
+    : cct(cct), access_lock("CollectionIndex::access_lock", true, false) {}
+
+  /**
+   * Pre-hash the collection, this collection should map to a PG folder.
+   *
+   * The default implementation aborts.
+   *
+   * @param pg_num - pg number of the pool this collection belongs to.
+   * @param expected_num_objs - expected number of objects in this collection.
+   * @return 0 on success, an error code otherwise.
+   */
+  virtual int pre_hash_collection(
+    uint32_t pg_num,           ///< [in] pg number of the pool this collection belongs to
+    uint64_t expected_num_objs ///< [in] expected number of objects this collection has
+    ) { ceph_abort(); return 0; }
+
+  /// Apply layout settings for @p target_level; the default implementation aborts.
+  virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; }
+
+  /// Read index-wide settings (should be called after construction)
+  virtual int read_settings() { return 0; }
+
+  /// Virtual destructor
+  virtual ~CollectionIndex() {}
+};
+
+#endif
diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc
new file mode 100644
index 00000000..5a057014
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.cc
@@ -0,0 +1,1415 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "DBObjectMap.h"
+#include <errno.h>
+
+#include "common/debug.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore "
+
+// Key-space prefixes. The per-header prefixes built in this file all have the
+// form USER_PREFIX + header_key(seq) + <one of the prefixes below>.
+const string DBObjectMap::USER_PREFIX = "_USER_";
+const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";
+const string DBObjectMap::SYS_PREFIX = "_SYS_";
+const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";
+// Key of the encoded _Header stored under a header's sys prefix.
+const string DBObjectMap::HEADER_KEY = "HEADER";
+// Key of the user-supplied header blob stored under a header's sys prefix.
+const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";
+// Key of the global State record. Same literal as HEADER_KEY, but it is
+// read and written under the bare SYS_PREFIX rather than a per-header prefix.
+const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";
+// Prefix of the ghobject key -> encoded _Header mapping.
+const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";
+
+// Legacy
+const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
+const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
+
+/// Append @e in to @e out, escaping the characters that have structural
+/// meaning in keys: '%' -> "%p", '.' -> "%e", '_' -> "%u".
+static void append_escaped(const string &in, string *out)
+{
+  for (const char c : in) {
+    switch (c) {
+    case '%':
+      out->append("%p");
+      break;
+    case '.':
+      out->append("%e");
+      break;
+    case '_':
+      out->append("%u");
+      break;
+    default:
+      out->push_back(c);
+      break;
+    }
+  }
+}
+
+// Consistency check over every entry under HOBJECT_TO_SEQ.
+//
+// For each object header, and each ancestor reached through header.parent:
+//  - when state.v == 2 or @e force is set, validate the header's "complete"
+//    table and, if @e repair is set, clear a bad table;
+//  - count how many children reference each parent and compare that with
+//    the num_children recorded in the parent's header.
+//
+// Problems are described on @e out.
+//
+// Returns the number of errors found, or -1 if no errors remain but at
+// least one complete table was repaired.
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
+{
+ int errors = 0, comp_errors = 0;
+ bool repaired = false;
+ // parent seq -> number of children observed while walking
+ map<uint64_t, uint64_t> parent_to_num_children;
+ // header seq -> num_children recorded in that header
+ map<uint64_t, uint64_t> parent_to_actual_num_children;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ _Header header;
+ bufferlist bl = iter->value();
+ // Walk from the object's header up through its ancestors; bl is
+ // replaced by the parent's encoded header at the bottom of the loop.
+ while (true) {
+ auto bliter = bl.cbegin();
+ header.decode(bliter);
+ if (header.seq != 0)
+ parent_to_actual_num_children[header.seq] = header.num_children;
+
+ if (state.v == 2 || force) {
+ // Check complete table
+ bool complete_error = false;
+ // End of the previous entry. The stored value's last byte is dropped
+ // (presumably a trailing terminator -- TODO confirm).
+ boost::optional<string> prev;
+ KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ // An entry whose key is not past the previous entry's end is bad.
+ if (prev && prev >= complete_iter->key()) {
+ out << "Bad complete for " << header.oid << std::endl;
+ complete_error = true;
+ break;
+ }
+ prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
+ }
+ if (complete_error) {
+ // Dump the whole table for diagnosis.
+ out << "Complete mapping for " << header.seq << " :" << std::endl;
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+ }
+ if (repair) {
+ repaired = true;
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ // NOTE(review): the result of submit_transaction is not checked here.
+ db->submit_transaction(t);
+ out << "Cleared complete mapping to repair" << std::endl;
+ } else {
+ errors++; // Only count when not repaired
+ comp_errors++; // Track errors here for version update
+ }
+ }
+ }
+
+ // Reached a root header: done with this object.
+ if (header.parent == 0)
+ break;
+
+ if (!parent_to_num_children.count(header.parent))
+ parent_to_num_children[header.parent] = 0;
+ parent_to_num_children[header.parent]++;
+ // The parent (and hence its ancestors) was already visited.
+ if (parent_to_actual_num_children.count(header.parent))
+ break;
+
+ // Load the parent's encoded header and continue the walk with it.
+ set<string> to_get;
+ map<string, bufferlist> got;
+ to_get.insert(HEADER_KEY);
+ db->get(sys_parent_prefix(header), to_get, &got);
+ if (got.empty()) {
+ out << "Missing: seq " << header.parent << std::endl;
+ errors++;
+ break;
+ } else {
+ bl = got.begin()->second;
+ }
+ }
+ }
+
+ // Compare observed and recorded child counts. Each visited entry is erased
+ // by the loop's increment expression, including on `continue`.
+ for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
+ i != parent_to_num_children.end();
+ parent_to_num_children.erase(i++)) {
+ if (!parent_to_actual_num_children.count(i->first))
+ continue;
+ if (parent_to_actual_num_children[i->first] != i->second) {
+ out << "Invalid: seq " << i->first << " recorded children: "
+ << parent_to_actual_num_children[i->first] << " found: "
+ << i->second << std::endl;
+ errors++;
+ }
+ parent_to_actual_num_children.erase(i->first);
+ }
+
+ // Only advance the version from 2 to 3 here
+ // Mark as legacy because there are still older structures
+ // we don't update. The value of legacy is only used
+ // for internal assertions.
+ if (comp_errors == 0 && state.v == 2 && repair) {
+ state.v = 3;
+ state.legacy = true;
+ set_state();
+ }
+
+ // -1 signals "clean, but only because something was repaired".
+ if (errors == 0 && repaired)
+ return -1;
+ return errors;
+}
+
+/// Build the HOBJECT_TO_SEQ key for @e oid:
+///   <name>.<key>.<nspace>.<snap>.<pool>.<hash>[.<generation>.<shard>]
+/// The first three fields are escaped with append_escaped(); the optional
+/// tail is emitted when either the generation or the shard id is set.
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
+{
+  // Escaped name, key and namespace, each followed by a '.' separator.
+  string key;
+  append_escaped(oid.hobj.oid.name, &key);
+  key.push_back('.');
+  append_escaped(oid.hobj.get_key(), &key);
+  key.push_back('.');
+  append_escaped(oid.hobj.nspace, &key);
+  key.push_back('.');
+
+  // The remaining fields are formatted into a scratch buffer.
+  char suffix[1000];
+  char *pos = suffix;
+  char *limit = suffix + sizeof(suffix);
+
+  // Snapshot: symbolic for head/snapdir, hex otherwise.
+  if (oid.hobj.snap == CEPH_NOSNAP) {
+    pos += snprintf(pos, limit - pos, "head");
+  } else if (oid.hobj.snap == CEPH_SNAPDIR) {
+    pos += snprintf(pos, limit - pos, "snapdir");
+  } else {
+    pos += snprintf(pos, limit - pos, "%llx", (long long unsigned)oid.hobj.snap);
+  }
+
+  // Pool: "none" for -1, hex otherwise.
+  if (oid.hobj.pool == -1) {
+    pos += snprintf(pos, limit - pos, ".none");
+  } else {
+    pos += snprintf(pos, limit - pos, ".%llx", (long long unsigned)oid.hobj.pool);
+  }
+
+  // Hash: upper-case hex, at least 8 digits.
+  pos += snprintf(pos, limit - pos, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash());
+
+  // Generation and shard appear together or not at all.
+  const bool has_gen_or_shard =
+    oid.generation != ghobject_t::NO_GEN ||
+    oid.shard_id != shard_id_t::NO_SHARD;
+  if (has_gen_or_shard) {
+    pos += snprintf(pos, limit - pos, ".%llx", (long long unsigned)oid.generation);
+    pos += snprintf(pos, limit - pos, ".%x", (int)oid.shard_id);
+  }
+
+  key.append(suffix);
+  return key;
+}
+
+// ok: pglog%u3%efs1...0.none.0017B237
+// bad: plana8923501-10...4c.3.ffffffffffffffff.2
+// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
+// returns 0 for false, 1 for true, negative for error
+/// Classify a v1 ghobject key.
+///
+/// @return 0 if the key ends with an 8-character field after the fifth '.',
+///         1 if another '.' follows that field (the buggy layout),
+///         -EINVAL if the key is malformed.
+int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct,
+					  const string &in)
+{
+  const char * const start = in.c_str();
+  const char *s = start;
+
+  // Step past five '.' separators; running out of input first is an error.
+  for (int dots = 5; ; ) {
+    while (*s && *s != '.')
+      ++s;
+    if (!*s) {
+      derr << "unexpected null at " << (int)(s-start) << dendl;
+      return -EINVAL;
+    }
+    ++s;
+    if (!*s || --dots == 0)
+      break;
+  }
+  if (!*s) {
+    derr << "unexpected null at " << (int)(s-start) << dendl;
+    return -EINVAL;
+  }
+
+  // we are now either at a hash value (32 bits, 8 chars) or a generation
+  // value (64 bits) '.' and shard id.  count the dots!
+  int len = 0;
+  for (; *s && *s != '.'; ++s)
+    ++len;
+
+  if (*s == '\0') {
+    // Last field of the key: it must be the hash.
+    if (len != 8) {
+      derr << "hash value is not 8 chars" << dendl;
+      return -EINVAL; // the hash value is always 8 chars.
+    }
+    return 0;
+  }
+  // The scan above stops only on '\0' or '.', so this check cannot fire;
+  // it is kept as a guard.
+  if (*s != '.') { // the shard follows.
+    derr << "missing final . and shard id at " << (int)(s-start) << dendl;
+    return -EINVAL;
+  }
+  return 1;
+}
+
+
+/// Key under HOBJECT_TO_SEQ for @e oid; currently just ghobject_key().
+string DBObjectMap::map_header_key(const ghobject_t &oid)
+{
+  string key = ghobject_key(oid);
+  return key;
+}
+
+/// Format @e seq as decimal, zero-padded to at least 2*sizeof(seq) digits.
+string DBObjectMap::header_key(uint64_t seq)
+{
+  char formatted[100];
+  const int min_digits = (int)(2*sizeof(seq));
+  snprintf(formatted, sizeof(formatted), "%.*" PRId64, min_digits, seq);
+  return string(formatted);
+}
+
+/// Prefix of the "complete" table belonging to @e header.
+string DBObjectMap::complete_prefix(Header header)
+{
+  string prefix(USER_PREFIX);
+  prefix += header_key(header->seq);
+  prefix += COMPLETE_PREFIX;
+  return prefix;
+}
+
+/// Prefix of the user key/value entries belonging to @e header.
+string DBObjectMap::user_prefix(Header header)
+{
+  string prefix(USER_PREFIX);
+  prefix += header_key(header->seq);
+  prefix += USER_PREFIX;
+  return prefix;
+}
+
+/// Prefix of the system entries (HEADER_KEY, USER_HEADER_KEY) of @e header.
+string DBObjectMap::sys_prefix(Header header)
+{
+  string prefix(USER_PREFIX);
+  prefix += header_key(header->seq);
+  prefix += SYS_PREFIX;
+  return prefix;
+}
+
+/// Prefix of the xattr entries belonging to @e header.
+string DBObjectMap::xattr_prefix(Header header)
+{
+  string prefix(USER_PREFIX);
+  prefix += header_key(header->seq);
+  prefix += XATTR_PREFIX;
+  return prefix;
+}
+
+/// System prefix of the *parent* of @e header (built from header.parent).
+string DBObjectMap::sys_parent_prefix(_Header header)
+{
+  string prefix(USER_PREFIX);
+  prefix += header_key(header.parent);
+  prefix += SYS_PREFIX;
+  return prefix;
+}
+
+/// Lazily set up the underlying iterators. Always clears `invalid`; the rest
+/// of the setup runs only the first time (guarded by `ready`).
+int DBObjectMap::DBObjectMapIteratorImpl::init()
+{
+  invalid = false;
+  if (ready)
+    return 0;
+
+  ceph_assert(!parent_iter);
+  if (header->parent) {
+    // Chain an iterator over the parent header.
+    Header parent_header = map->lookup_parent(header);
+    if (!parent_header) {
+      ceph_abort();
+      return -EINVAL;
+    }
+    parent_iter = std::make_shared<DBObjectMapIteratorImpl>(map, parent_header);
+  }
+
+  key_iter = map->db->get_iterator(map->user_prefix(header));
+  ceph_assert(key_iter);
+
+  complete_iter = map->db->get_iterator(map->complete_prefix(header));
+  ceph_assert(complete_iter);
+
+  // Start out positioned on this header's own keys.
+  cur_iter = key_iter;
+  ceph_assert(cur_iter);
+
+  ready = true;
+  return 0;
+}
+
+/// Return an iterator over the keys of @e oid, or an empty iterator if the
+/// object has no map header. The header lock is handed over to the iterator.
+ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
+  const ghobject_t &oid)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (header) {
+    DBObjectMapIterator iter = _get_iterator(header);
+    iter->hlock.swap(hl);
+    return iter;
+  }
+  return ObjectMapIterator(new EmptyIteratorImpl());
+}
+
+/// Position the parent iterator (if any) and the key iterator at their first
+/// entries, then let adjust() choose the current one.
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->seek_to_first();
+    if (r < 0) {
+      return r;
+    }
+  }
+  r = key_iter->seek_to_first();
+  if (r < 0) {
+    return r;
+  }
+  return adjust();
+}
+
+/// Move both underlying iterators to their last entry and then one step
+/// further, before handing over to adjust().
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->seek_to_last();
+    if (r < 0) {
+      return r;
+    }
+    if (parent_iter->valid()) {
+      r = parent_iter->next();
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+
+  r = key_iter->seek_to_last();
+  if (r < 0) {
+    return r;
+  }
+  if (key_iter->valid()) {
+    r = key_iter->next();
+    if (r < 0) {
+      return r;
+    }
+  }
+  return adjust();
+}
+
+/// Apply lower_bound(@e to) to the parent iterator (if any) and to the key
+/// iterator, then let adjust() choose the current one.
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->lower_bound(to);
+    if (r < 0) {
+      return r;
+    }
+  }
+  r = key_iter->lower_bound(to);
+  if (r < 0) {
+    return r;
+  }
+  return adjust();
+}
+
+/// Like lower_bound(), but if the resulting position is not on the parent
+/// iterator, advance with next_parent().
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to)
+{
+  const int ret = lower_bound(to);
+  if (ret < 0) {
+    return ret;
+  }
+  if (valid() && !on_parent()) {
+    return next_parent();
+  }
+  return ret;
+}
+
+/// Apply upper_bound(@e after) to the parent iterator (if any) and to the key
+/// iterator, then let adjust() choose the current one.
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->upper_bound(after);
+    if (r < 0) {
+      return r;
+    }
+  }
+  r = key_iter->upper_bound(after);
+  if (r < 0) {
+    return r;
+  }
+  return adjust();
+}
+
+/// True once init() has run and the iterator has not been marked invalid.
+/// Asserts that the current underlying iterator is valid in that case.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid()
+{
+  const bool usable = ready && !invalid;
+  ceph_assert(!usable || cur_iter->valid());
+  return usable;
+}
+
+/// True when the parent iterator is valid and should supply the current
+/// entry: either the key iterator is exhausted or its key sorts after the
+/// parent's.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
+{
+  if (!parent_iter || !parent_iter->valid()) {
+    return false;
+  }
+  return !key_iter->valid() || key_iter->key() > parent_iter->key();
+}
+
+/// Advance the current underlying iterator by one entry and re-run adjust().
+/// Requires a valid position.
+int DBObjectMap::DBObjectMapIteratorImpl::next()
+{
+  ceph_assert(cur_iter->valid());
+  ceph_assert(valid());
+
+  cur_iter->next();
+  const int ret = adjust();
+  return ret;
+}
+
+// Advance to the next entry that is supplied by the parent iterator.
+// Marks the iterator invalid if there is no parent iterator or it is
+// exhausted. Returns a negative value if next()/lower_bound() fails,
+// otherwise 0.
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
+{
+ r = next();
+ if (r < 0)
+ return r;
+ // While the current position is not on the parent, jump straight to the
+ // parent's current key.
+ while (parent_iter && parent_iter->valid() && !on_parent()) {
+ ceph_assert(valid());
+ r = lower_bound(parent_iter->key());
+ if (r < 0)
+ return r;
+ }
+
+ if (!parent_iter || !parent_iter->valid()) {
+ invalid = true;
+ }
+ return 0;
+}
+
+// Test whether @e to_test lies inside a region of the complete table.
+//
+// Each complete entry maps a begin key to an end string (the stored value
+// minus its last byte); an empty end is treated as unbounded. On a hit,
+// *begin and *end (when non-null) receive the region and the result is
+// true; otherwise the result is false.
+// NOTE(review): declared int but returns true/false.
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+ string *begin,
+ string *end)
+{
+ /* This is clumsy because one cannot call prev() on end(), nor can one
+ * test for == begin().
+ */
+ // Find the last entry whose key is <= to_test.
+ complete_iter->upper_bound(to_test);
+ if (complete_iter->valid()) {
+ complete_iter->prev();
+ if (!complete_iter->valid()) {
+ // No entry at or before to_test; restore the position and bail out.
+ complete_iter->upper_bound(to_test);
+ return false;
+ }
+ } else {
+ // Every key is <= to_test (or the table is empty): use the last entry.
+ complete_iter->seek_to_last();
+ if (!complete_iter->valid())
+ return false;
+ }
+
+ ceph_assert(complete_iter->key() <= to_test);
+ ceph_assert(complete_iter->value().length() >= 1);
+ string _end(complete_iter->value().c_str(),
+ complete_iter->value().length() - 1);
+ if (_end.empty() || _end > to_test) {
+ if (begin)
+ *begin = complete_iter->key();
+ if (end)
+ *end = _end;
+ return true;
+ } else {
+ // to_test is past this region; leave the iterator on the next entry.
+ complete_iter->next();
+ ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test);
+ return false;
+ }
+}
+
+/**
+ * Moves parent_iter to the next position both out of the complete_region and
+ * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and
+ * less than key_iter and key_iter otherwise.
+ *
+ * If neither iterator has an entry, the iterator is marked invalid.
+ *
+ * @return always 0
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::adjust()
+{
+ string begin, end;
+ while (parent_iter && parent_iter->valid()) {
+ if (in_complete_region(parent_iter->key(), &begin, &end)) {
+ if (end.size() == 0) {
+ // Unbounded region: skip everything left in the parent.
+ parent_iter->seek_to_last();
+ if (parent_iter->valid())
+ parent_iter->next();
+ } else
+ // Jump to the end of the region.
+ parent_iter->lower_bound(end);
+ } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) {
+ // Same key on both sides: step the parent past it so key_iter's entry is used.
+ parent_iter->next();
+ } else {
+ break;
+ }
+ }
+ if (valid_parent()) {
+ cur_iter = parent_iter;
+ } else if (key_iter->valid()) {
+ cur_iter = key_iter;
+ } else {
+ invalid = true;
+ }
+ ceph_assert(invalid || cur_iter->valid());
+ return 0;
+}
+
+
+/// Key at the current position (taken from whichever iterator is current).
+string DBObjectMap::DBObjectMapIteratorImpl::key()
+{
+  string current_key = cur_iter->key();
+  return current_key;
+}
+
+/// Value at the current position (taken from whichever iterator is current).
+bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
+{
+  bufferlist current_value = cur_iter->value();
+  return current_value;
+}
+
+/// Result code stored in `r` by the most recent seek/bound operation.
+int DBObjectMap::DBObjectMapIteratorImpl::status()
+{
+  return this->r;
+}
+
+/// Store the key/value pairs in @e set for @e oid, creating the map header
+/// if needed. Returns 0 without writing when check_spos() says to skip.
+int DBObjectMap::set_keys(const ghobject_t &oid,
+			  const map<string, bufferlist> &set,
+			  const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction t = db->get_transaction();
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_create_map_header(hl, oid, t);
+  if (!header) {
+    return -EINVAL;
+  }
+  if (check_spos(oid, header, spos)) {
+    return 0;
+  }
+
+  const string prefix = user_prefix(header);
+  t->set(prefix, set);
+  return db->submit_transaction(t);
+}
+
+/// Store @e bl as the user header of @e oid, creating the map header if
+/// needed. Returns 0 without writing when check_spos() says to skip.
+int DBObjectMap::set_header(const ghobject_t &oid,
+			    const bufferlist &bl,
+			    const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction t = db->get_transaction();
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_create_map_header(hl, oid, t);
+  if (!header) {
+    return -EINVAL;
+  }
+  if (check_spos(oid, header, spos)) {
+    return 0;
+  }
+
+  _set_header(header, bl, t);
+  return db->submit_transaction(t);
+}
+
+/// Queue a write of @e bl under USER_HEADER_KEY in @e header's sys prefix.
+void DBObjectMap::_set_header(Header header, const bufferlist &bl,
+			      KeyValueDB::Transaction t)
+{
+  map<string, bufferlist> entries;
+  entries.insert(make_pair(USER_HEADER_KEY, bl));
+  t->set(sys_prefix(header), entries);
+}
+
+/// Read the user header of @e oid into @e bl. An object without a map
+/// header is not an error: 0 is returned and @e bl is left untouched.
+int DBObjectMap::get_header(const ghobject_t &oid,
+			    bufferlist *bl)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (header) {
+    return _get_header(header, bl);
+  }
+  return 0;
+}
+
+// Fetch the user header for @e header, falling back to its ancestors.
+//
+// Looks up USER_HEADER_KEY under the header's sys prefix; if absent, moves
+// to the parent header and retries until a value is found or a header with
+// no parent is reached. On success @e bl receives the value (it is left
+// untouched if none exists anywhere in the chain).
+//
+// Returns 0, or the negative result of db->get().
+int DBObjectMap::_get_header(Header header,
+ bufferlist *bl)
+{
+ map<string, bufferlist> out;
+ while (true) {
+ out.clear();
+ set<string> to_get;
+ to_get.insert(USER_HEADER_KEY);
+ int r = db->get(sys_prefix(header), to_get, &out);
+ if (r == 0 && !out.empty())
+ break;
+ if (r < 0)
+ return r;
+ Header current(header);
+ if (!current->parent)
+ break;
+ // lookup_parent() aborts rather than returning an empty Header, so the
+ // next iteration can dereference the result.
+ header = lookup_parent(current);
+ }
+
+ if (!out.empty())
+ bl->swap(out.begin()->second);
+ return 0;
+}
+
+/// Remove the whole object map of @e oid: drop the oid -> header mapping,
+/// release this object's reference on the header and let _clear() delete
+/// whatever is no longer referenced.
+///
+/// @return -ENOENT if the object has no map header, 0 when check_spos()
+///         says to skip, otherwise the result of _clear() or of the submit.
+int DBObjectMap::clear(const ghobject_t &oid,
+		       const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction t = db->get_transaction();
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header) {
+    return -ENOENT;
+  }
+  if (check_spos(oid, header, spos)) {
+    return 0;
+  }
+
+  remove_map_header(hl, oid, header, t);
+  ceph_assert(header->num_children > 0);
+  header->num_children--;
+
+  const int ret = _clear(header, t);
+  if (ret < 0) {
+    return ret;
+  }
+  return db->submit_transaction(t);
+}
+
+// Release @e header after its num_children has been decremented by the
+// caller. If it still has children, the updated header is written back and
+// the walk stops. Otherwise its keys are removed, its parent's child count
+// is decremented and the same test is applied to the parent.
+//
+// Returns 0, or -EINVAL if a parent lookup yields no header.
+int DBObjectMap::_clear(Header header,
+ KeyValueDB::Transaction t)
+{
+ while (1) {
+ if (header->num_children) {
+ set_header(header, t);
+ break;
+ }
+ clear_header(header, t);
+ if (!header->parent)
+ break;
+ Header parent = lookup_parent(header);
+ if (!parent) {
+ return -EINVAL;
+ }
+ ceph_assert(parent->num_children > 0);
+ parent->num_children--;
+ // Continue the walk with the parent.
+ header.swap(parent);
+ }
+ return 0;
+}
+
+/// Resolve the effective user header for @e header (possibly inherited from
+/// an ancestor) and queue a write of it directly under @e header.
+int DBObjectMap::copy_up_header(Header header,
+				KeyValueDB::Transaction t)
+{
+  bufferlist user_header;
+  const int ret = _get_header(header, &user_header);
+  if (ret < 0) {
+    return ret;
+  }
+  _set_header(header, user_header, t);
+  return 0;
+}
+
+/// Remove the keys in @e to_clear from the object map of @e oid.
+///
+/// For a header without a parent this is a plain key removal. For a legacy
+/// header that still has a parent, all surviving keys and the user header
+/// are copied up into this header and the parent reference is dropped.
+///
+/// @return -ENOENT if the object has no map header, 0 when check_spos()
+///         says to skip, a negative error if iteration, header copy-up or
+///         parent cleanup fails (nothing is submitted in that case),
+///         otherwise the result of submitting the transaction.
+int DBObjectMap::rm_keys(const ghobject_t &oid,
+			 const set<string> &to_clear,
+			 const SequencerPosition *spos)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return -ENOENT;
+  KeyValueDB::Transaction t = db->get_transaction();
+  if (check_spos(oid, header, spos))
+    return 0;
+  t->rmkeys(user_prefix(header), to_clear);
+  if (!header->parent) {
+    return db->submit_transaction(t);
+  }
+
+  ceph_assert(state.legacy);
+
+  {
+    // We only get here for legacy (v2) stores
+    // Copy up all keys from parent excluding to_clear
+    // and remove parent
+    // This eliminates a v2 format use of complete for this oid only
+    map<string, bufferlist> to_write;
+    ObjectMapIterator iter = _get_iterator(header);
+    for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+      if (iter->status())
+	return iter->status();
+      if (!to_clear.count(iter->key()))
+	to_write[iter->key()] = iter->value();
+    }
+    t->set(user_prefix(header), to_write);
+  } // destruct iter which has parent in_use
+
+  // Do not detach from the parent unless the user header was copied up.
+  int r = copy_up_header(header, t);
+  if (r < 0)
+    return r;
+  Header parent = lookup_parent(header);
+  if (!parent)
+    return -EINVAL;
+  parent->num_children--;
+  r = _clear(parent, t);
+  if (r < 0)
+    return r;
+  header->parent = 0;
+  set_map_header(hl, oid, *header, t);
+  t->rmkeys_by_prefix(complete_prefix(header));
+  return db->submit_transaction(t);
+}
+
+// Drop all keys and the user header of @e oid while keeping its xattrs.
+//
+// The xattrs are read into memory, the existing header is released through
+// remove_map_header()/_clear(), and a fresh parentless header is created
+// with the saved xattrs written under it.
+//
+// Returns -ENOENT if the object has no map header, 0 when check_spos() says
+// to skip, -EINVAL if no xattr iterator could be obtained, an iterator or
+// _clear() error, otherwise the result of submitting the transaction.
+int DBObjectMap::clear_keys_header(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ // save old attrs
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ map<string, bufferlist> attrs;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ attrs.insert(make_pair(iter->key(), iter->value()));
+ if (iter->status())
+ return iter->status();
+
+ // remove current header
+ remove_map_header(hl, oid, header, t);
+ ceph_assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+
+ // create new header
+ Header newheader = generate_new_header(oid, Header());
+ set_map_header(hl, oid, *newheader, t);
+ if (!attrs.empty())
+ t->set(xattr_prefix(newheader), attrs);
+ return db->submit_transaction(t);
+}
+
+/// Read the user header and every key/value pair of @e oid.
+///
+/// @param oid     object to read
+/// @param _header [out] receives the user header, if one exists
+/// @param out     [out] receives the key/value pairs
+/// @return -ENOENT if the object has no map header, a negative error if
+///         reading the header or iterating fails, 0 otherwise.
+int DBObjectMap::get(const ghobject_t &oid,
+		     bufferlist *_header,
+		     map<string, bufferlist> *out)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return -ENOENT;
+  // Report a failed header read instead of returning keys with an empty header.
+  int r = _get_header(header, _header);
+  if (r < 0)
+    return r;
+  ObjectMapIterator iter = _get_iterator(header);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    if (iter->status())
+      return iter->status();
+    out->insert(make_pair(iter->key(), iter->value()));
+  }
+  return 0;
+}
+
+/// Collect every key of @e oid into @e keys.
+/// @return -ENOENT if the object has no map header, the iterator status if
+///         it is non-zero during the walk, 0 otherwise.
+int DBObjectMap::get_keys(const ghobject_t &oid,
+			  set<string> *keys)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header) {
+    return -ENOENT;
+  }
+
+  ObjectMapIterator iter = _get_iterator(header);
+  iter->seek_to_first();
+  while (iter->valid()) {
+    if (iter->status()) {
+      return iter->status();
+    }
+    keys->insert(iter->key());
+    iter->next();
+  }
+  return 0;
+}
+
+/// Look up each key of @e in_keys in the map of @e header. Keys that exist
+/// are added to @e out_keys and/or, with their value, to @e out_values
+/// (either output may be null).
+/// @return the iterator status if it is non-zero after a lookup, else 0.
+int DBObjectMap::scan(Header header,
+		      const set<string> &in_keys,
+		      set<string> *out_keys,
+		      map<string, bufferlist> *out_values)
+{
+  ObjectMapIterator db_iter = _get_iterator(header);
+  for (const string &wanted : in_keys) {
+    db_iter->lower_bound(wanted);
+    if (db_iter->status()) {
+      return db_iter->status();
+    }
+    // lower_bound lands on the first key >= wanted; only an exact match counts.
+    if (!db_iter->valid() || db_iter->key() != wanted) {
+      continue;
+    }
+    if (out_keys) {
+      out_keys->insert(wanted);
+    }
+    if (out_values) {
+      out_values->insert(make_pair(db_iter->key(), db_iter->value()));
+    }
+  }
+  return 0;
+}
+
+/// Fetch the values of the requested @e keys of @e oid into @e out.
+/// @return -ENOENT if the object has no map header, else the result of scan().
+int DBObjectMap::get_values(const ghobject_t &oid,
+			    const set<string> &keys,
+			    map<string, bufferlist> *out)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (header) {
+    return scan(header, keys, 0, out);
+  }
+  return -ENOENT;
+}
+
+/// Report in @e out which of the requested @e keys exist for @e oid.
+/// @return -ENOENT if the object has no map header, else the result of scan().
+int DBObjectMap::check_keys(const ghobject_t &oid,
+			    const set<string> &keys,
+			    set<string> *out)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (header) {
+    return scan(header, keys, out, 0);
+  }
+  return -ENOENT;
+}
+
+/// Fetch the xattrs named in @e to_get for @e oid into @e out.
+/// @return -ENOENT if the object has no map header, else the result of db->get().
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
+			    const set<string> &to_get,
+			    map<string, bufferlist> *out)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header) {
+    return -ENOENT;
+  }
+  const string prefix = xattr_prefix(header);
+  return db->get(prefix, to_get, out);
+}
+
+/// Collect the names of all xattrs of @e oid into @e out.
+/// @return -ENOENT if the object has no map header, -EINVAL if no iterator
+///         could be obtained, otherwise the iterator's final status.
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
+				set<string> *out)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header) {
+    return -ENOENT;
+  }
+  KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+  if (!iter) {
+    return -EINVAL;
+  }
+
+  iter->seek_to_first();
+  while (!iter->status() && iter->valid()) {
+    out->insert(iter->key());
+    iter->next();
+  }
+  return iter->status();
+}
+
+/// Store the xattrs in @e to_set for @e oid, creating the map header if
+/// needed. Returns 0 without writing when check_spos() says to skip.
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
+			    const map<string, bufferlist> &to_set,
+			    const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction t = db->get_transaction();
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_create_map_header(hl, oid, t);
+  if (!header) {
+    return -EINVAL;
+  }
+  if (check_spos(oid, header, spos)) {
+    return 0;
+  }
+
+  const string prefix = xattr_prefix(header);
+  t->set(prefix, to_set);
+  return db->submit_transaction(t);
+}
+
+/// Remove the xattrs named in @e to_remove from @e oid.
+/// @return -ENOENT if the object has no map header, 0 when check_spos()
+///         says to skip, otherwise the result of submitting the transaction.
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
+			       const set<string> &to_remove,
+			       const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction t = db->get_transaction();
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header) {
+    return -ENOENT;
+  }
+  if (check_spos(oid, header, spos)) {
+    return 0;
+  }
+
+  const string prefix = xattr_prefix(header);
+  t->rmkeys(prefix, to_remove);
+  return db->submit_transaction(t);
+}
+
+// ONLY USED FOR TESTING
+// Marks the store as legacy (state.legacy = true) so that the legacy-only
+// assertion in rm_keys() holds for the parent/child layout built here.
+//
+// The existing header of @e oid becomes a shared parent, and two new child
+// headers (one for @e oid, one for @e target) are created that reference
+// it. The parent's xattrs are copied to both children and removed from the
+// parent.
+int DBObjectMap::legacy_clone(const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos)
+{
+ state.legacy = true;
+
+ if (oid == target)
+ return 0;
+
+ // Take the two header locks in a fixed (sorted) order, then map them back
+ // to source/target.
+ MapHeaderLock _l1(this, std::min(oid, target));
+ MapHeaderLock _l2(this, std::max(oid, target));
+ MapHeaderLock *lsource, *ltarget;
+ if (oid > target) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ // Drop whatever the target currently maps to.
+ Header destination = lookup_map_header(*ltarget, target);
+ if (destination) {
+ if (check_spos(target, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, target, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ // No source map: only the removal of the old target (if any) is submitted.
+ Header parent = lookup_map_header(*lsource, oid);
+ if (!parent)
+ return db->submit_transaction(t);
+
+ Header source = generate_new_header(oid, parent);
+ Header destination = generate_new_header(target, parent);
+ if (spos)
+ destination->spos = *spos;
+
+ // The old header is now referenced by exactly the two new children.
+ parent->num_children = 2;
+ set_header(parent, t);
+ set_map_header(*lsource, oid, *source, t);
+ set_map_header(*ltarget, target, *destination, t);
+
+ // Move the xattrs from the parent to both children.
+ map<string, bufferlist> to_set;
+ KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
+ for (xattr_iter->seek_to_first();
+ xattr_iter->valid();
+ xattr_iter->next())
+ to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+ t->set(xattr_prefix(source), to_set);
+ t->set(xattr_prefix(destination), to_set);
+ t->rmkeys_by_prefix(xattr_prefix(parent));
+ return db->submit_transaction(t);
+}
+
+/// Copy the object map of @e oid (user header, xattrs and keys) to
+/// @e target, replacing whatever @e target had before.
+///
+/// The copy is a full one: the new header has no parent.
+///
+/// @return 0 if @e oid == @e target or when check_spos() says to skip;
+///         -EINVAL if no xattr iterator could be obtained; a negative error
+///         if clearing the old target, reading the header or iterating
+///         fails (nothing is submitted in that case); otherwise the result
+///         of submitting the transaction.
+int DBObjectMap::clone(const ghobject_t &oid,
+		       const ghobject_t &target,
+		       const SequencerPosition *spos)
+{
+  if (oid == target)
+    return 0;
+
+  // Take the two header locks in a fixed (sorted) order, then map them back
+  // to source/target.
+  MapHeaderLock _l1(this, std::min(oid, target));
+  MapHeaderLock _l2(this, std::max(oid, target));
+  MapHeaderLock *lsource, *ltarget;
+  if (oid > target) {
+    lsource = &_l2;
+    ltarget= &_l1;
+  } else {
+    lsource = &_l1;
+    ltarget= &_l2;
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    // Drop whatever the target currently maps to.
+    Header destination = lookup_map_header(*ltarget, target);
+    if (destination) {
+      if (check_spos(target, destination, spos))
+	return 0;
+      destination->num_children--;
+      remove_map_header(*ltarget, target, destination, t);
+      int r = _clear(destination, t);
+      if (r < 0)
+	return r;
+    }
+  }
+
+  // No source map: only the removal of the old target (if any) is submitted.
+  Header source = lookup_map_header(*lsource, oid);
+  if (!source)
+    return db->submit_transaction(t);
+
+  Header destination = generate_new_header(target, Header());
+  if (spos)
+    destination->spos = *spos;
+
+  set_map_header(*ltarget, target, *destination, t);
+
+  bufferlist bl;
+  int r = _get_header(source, &bl);
+  if (r < 0)
+    return r;
+  _set_header(destination, bl, t);
+
+  // Copy xattrs, with the same iterator checks as clear_keys_header().
+  map<string, bufferlist> to_set;
+  KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source));
+  if (!xattr_iter)
+    return -EINVAL;
+  for (xattr_iter->seek_to_first();
+       !xattr_iter->status() && xattr_iter->valid();
+       xattr_iter->next())
+    to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+  if (xattr_iter->status())
+    return xattr_iter->status();
+  t->set(xattr_prefix(destination), to_set);
+
+  // Copy every key visible through the source (including inherited ones).
+  map<string, bufferlist> to_write;
+  ObjectMapIterator iter = _get_iterator(source);
+  for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+    if (iter->status())
+      return iter->status();
+    to_write[iter->key()] = iter->value();
+  }
+  t->set(user_prefix(destination), to_write);
+
+  return db->submit_transaction(t);
+}
+
+// Rewrite every HOBJECT_TO_SEQ key that is_buggy_ghobject_key_v1() flags,
+// using ghobject_key() on the oid decoded from the stored header, then set
+// state.v to 2 and persist the state.
+//
+// Work is batched: each transaction rewrites at most 300 keys (only
+// rewritten keys count towards the limit).
+//
+// Returns 0 on success, or a negative error from key validation or from
+// submitting a batch.
+int DBObjectMap::upgrade_to_v2()
+{
+ dout(1) << __func__ << " start" << dendl;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ iter->seek_to_first();
+ while (iter->valid()) {
+ unsigned count = 0;
+ KeyValueDB::Transaction t = db->get_transaction();
+ set<string> remove;
+ map<string, bufferlist> add;
+ for (;
+ iter->valid() && count < 300;
+ iter->next()) {
+ dout(20) << __func__ << " key is " << iter->key() << dendl;
+ int r = is_buggy_ghobject_key_v1(cct, iter->key());
+ if (r < 0) {
+ derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
+ return r;
+ }
+ if (!r) {
+ dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
+ continue;
+ }
+
+ // decode header to get oid
+ _Header hdr;
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ hdr.decode(bliter);
+
+ string newkey(ghobject_key(hdr.oid));
+ dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
+ add[newkey] = iter->value();
+ remove.insert(iter->key());
+ ++count;
+ }
+
+ // Apply this batch: drop the old keys and add the re-keyed entries.
+ if (!remove.empty()) {
+ dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
+ t->rmkeys(HOBJECT_TO_SEQ, remove);
+ t->set(HOBJECT_TO_SEQ, add);
+ int r = db->submit_transaction(t);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ state.v = 2;
+
+ set_state();
+ return 0;
+}
+
+/// Persist the global State record synchronously, under header_lock.
+/// Asserts that the submit succeeds.
+void DBObjectMap::set_state()
+{
+  Mutex::Locker l(header_lock);
+  KeyValueDB::Transaction txn = db->get_transaction();
+  write_state(txn);
+  const int ret = db->submit_transaction_sync(txn);
+  ceph_assert(ret == 0);
+  dout(1) << __func__ << " done" << dendl;
+}
+
+/// Load the global State record, or initialise it for a new store when the
+/// record does not exist.
+/// @return 0 on success, or the negative result of db->get().
+int DBObjectMap::get_state()
+{
+  set<string> to_get;
+  to_get.insert(GLOBAL_STATE_KEY);
+
+  map<string, bufferlist> result;
+  const int ret = db->get(SYS_PREFIX, to_get, &result);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (result.empty()) {
+    // New store
+    state.v = State::CUR_VERSION;
+    state.seq = 1;
+    state.legacy = false;
+    return 0;
+  }
+
+  auto bliter = result.begin()->second.cbegin();
+  state.decode(bliter);
+  return 0;
+}
+
+/// Load the store state, upgrade a v1 store to v2 when allowed, and run a
+/// repairing consistency check.
+///
+/// @param do_upgrade permit the v1 -> v2 upgrade
+/// @return 0 on success; -ENOTSUP if the store is too old or needs an
+///         upgrade that was not permitted; -EINVAL if check() reports
+///         unrepaired errors; otherwise a negative error from get_state()
+///         or upgrade_to_v2().
+int DBObjectMap::init(bool do_upgrade)
+{
+  int ret = get_state();
+  if (ret < 0)
+    return ret;
+  if (state.v < 1) {
+    dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+	    << dendl;
+    return -ENOTSUP;
+  }
+  if (state.v < 2) { // Needs upgrade
+    if (!do_upgrade) {
+      dout(1) << "DBObjectMap requires an upgrade,"
+	      << " set filestore_update_to"
+	      << dendl;
+      return -ENOTSUP;
+    } else {
+      int r = upgrade_to_v2();
+      if (r < 0)
+	return r;
+    }
+  }
+  ostringstream ss;
+  int errors = check(ss, true);
+  if (errors) {
+    // A negative result means problems were found but repaired: log only.
+    derr << ss.str() << dendl;
+    if (errors > 0)
+      return -EINVAL;
+  }
+  dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
+  return 0;
+}
+
+// Synchronously persist the global state; when @e oid is given, also record
+// @e spos in that object's map header (if it has one) in the same
+// transaction. @e spos must be non-null whenever @e oid is.
+//
+// Returns the result of submit_transaction_sync().
+int DBObjectMap::sync(const ghobject_t *oid,
+ const SequencerPosition *spos) {
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (oid) {
+ ceph_assert(spos);
+ MapHeaderLock hl(this, *oid);
+ Header header = lookup_map_header(hl, *oid);
+ if (header) {
+ dout(10) << "oid: " << *oid << " setting spos to "
+ << *spos << dendl;
+ header->spos = *spos;
+ set_map_header(hl, *oid, *header, t);
+ }
+ /* It may appear that this and the identical portion of the else
+ * block can combined below, but in this block, the transaction
+ * must be submitted under *both* the MapHeaderLock and the full
+ * header_lock.
+ *
+ * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+ */
+ Mutex::Locker l(header_lock);
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ } else {
+ Mutex::Locker l(header_lock);
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ }
+}
+
+/// Queue a write of the encoded global State under SYS_PREFIX. Requires
+/// header_lock to be held by the caller.
+///
+/// With a transaction supplied, the write is only queued and 0 is returned.
+/// Without one, a private transaction is created and submitted, and the
+/// submit result is returned.
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+  ceph_assert(header_lock.is_locked_by_me());
+  dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+
+  KeyValueDB::Transaction t = _t ? _t : db->get_transaction();
+
+  bufferlist encoded;
+  state.encode(encoded);
+  map<string, bufferlist> to_write;
+  to_write[GLOBAL_STATE_KEY] = encoded;
+  t->set(SYS_PREFIX, to_write);
+
+  if (_t) {
+    return 0;
+  }
+  return db->submit_transaction(t);
+}
+
+
+// Find the map header of @e oid, first in the cache and then in the
+// database. The caller must hold the MapHeaderLock for @e oid.
+//
+// On success the header's seq is added to in_use and the returned Header
+// carries a RemoveOnDelete deleter. Returns an empty Header if the object
+// has no entry (or the read fails).
+// NOTE(review): in_use is modified here without taking header_lock;
+// presumably callers hold it (lookup_create_map_header does) -- confirm.
+DBObjectMap::Header DBObjectMap::_lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid)
+{
+ ceph_assert(l.get_locked() == oid);
+
+ // Raw allocation: ownership passes to a Header on both success paths and
+ // it is deleted explicitly on the miss path below.
+ _Header *header = new _Header();
+ {
+ // Note: this locker shadows the MapHeaderLock parameter `l`.
+ Mutex::Locker l(cache_lock);
+ if (caches.lookup(oid, header)) {
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return Header(header, RemoveOnDelete(this));
+ }
+ }
+
+ bufferlist out;
+ int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
+ if (r < 0 || out.length()==0) {
+ delete header;
+ return Header();
+ }
+
+ Header ret(header, RemoveOnDelete(this));
+ auto iter = out.cbegin();
+ ret->decode(iter);
+ {
+ // Remember the decoded header for later lookups.
+ Mutex::Locker l(cache_lock);
+ caches.add(oid, *ret);
+ }
+
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return ret;
+}
+
+/// Allocate a new header for @e oid with the next sequence number. If
+/// @e parent is given, the new header references it and inherits its spos.
+/// The seq is marked in_use and the advanced global state is written out.
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
+						       Header parent)
+{
+  Header fresh = Header(new _Header(), RemoveOnDelete(this));
+  fresh->seq = state.seq++;
+  if (parent) {
+    fresh->parent = parent->seq;
+    fresh->spos = parent->spos;
+  }
+  fresh->num_children = 1;
+  fresh->oid = oid;
+
+  ceph_assert(!in_use.count(fresh->seq));
+  in_use.insert(fresh->seq);
+
+  write_state();
+  return fresh;
+}
+
+// Load the parent header of @e input from the database.
+//
+// Takes header_lock and waits on header_cond until the parent's seq is no
+// longer in in_use, then marks it in use for the returned Header. A failed
+// or empty read aborts the process via ceph_abort().
+DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
+{
+ Mutex::Locker l(header_lock);
+ // Wait until nobody else is using the parent header.
+ while (in_use.count(input->parent))
+ header_cond.Wait(header_lock);
+ map<string, bufferlist> out;
+ set<string> keys;
+ keys.insert(HEADER_KEY);
+
+ dout(20) << "lookup_parent: parent " << input->parent
+ << " for seq " << input->seq << dendl;
+ int r = db->get(sys_parent_prefix(input), keys, &out);
+ if (r < 0) {
+ ceph_abort();
+ return Header();
+ }
+ if (out.empty()) {
+ ceph_abort();
+ return Header();
+ }
+
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ auto iter = out.begin()->second.cbegin();
+ header->decode(iter);
+ ceph_assert(header->seq == input->parent);
+ dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
+ << header->parent << dendl;
+ in_use.insert(header->seq);
+ return header;
+}
+
+/// Return the map header of @e oid, creating a new parentless one (and
+/// queueing its mapping in @e t) if none exists. Runs under header_lock.
+DBObjectMap::Header DBObjectMap::lookup_create_map_header(
+  const MapHeaderLock &hl,
+  const ghobject_t &oid,
+  KeyValueDB::Transaction t)
+{
+  Mutex::Locker l(header_lock);
+  Header header = _lookup_map_header(hl, oid);
+  if (header) {
+    return header;
+  }
+  header = _generate_new_header(oid, Header());
+  set_map_header(hl, oid, *header, t);
+  return header;
+}
+
+/// Queue removal of everything stored for @e header: user keys, system
+/// keys, xattrs, the complete table (legacy stores only) and the
+/// header_key(seq) entry under USER_PREFIX.
+void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
+{
+  dout(20) << "clear_header: clearing seq " << header->seq << dendl;
+
+  t->rmkeys_by_prefix(user_prefix(header));
+  t->rmkeys_by_prefix(sys_prefix(header));
+  if (state.legacy) {
+    t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0
+  }
+  t->rmkeys_by_prefix(xattr_prefix(header));
+
+  set<string> seq_keys;
+  seq_keys.insert(header_key(header->seq));
+  t->rmkeys(USER_PREFIX, seq_keys);
+}
+
+/**
+ * Queue a write of header's encoding under HEADER_KEY in its sys prefix.
+ */
+void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
+{
+  dout(20) << "set_header: setting seq " << header->seq << dendl;
+
+  map<string, bufferlist> to_write;
+  bufferlist &encoded = to_write[HEADER_KEY];
+  header->encode(encoded);
+  t->set(sys_prefix(header), to_write);
+}
+
+/**
+ * Queue removal of oid's HOBJECT_TO_SEQ entry in t and drop oid from the
+ * header cache.  The caller must hold the MapHeaderLock for oid (asserted).
+ */
+void DBObjectMap::remove_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid,
+  Header header,
+  KeyValueDB::Transaction t)
+{
+  ceph_assert(l.get_locked() == oid);
+  dout(20) << "remove_map_header: removing " << header->seq
+           << " oid " << oid << dendl;
+
+  set<string> doomed;
+  doomed.insert(map_header_key(oid));
+  t->rmkeys(HOBJECT_TO_SEQ, doomed);
+
+  {
+    // cache_lock only guards the cache update itself
+    Mutex::Locker cache_guard(cache_lock);
+    caches.clear(oid);
+  }
+}
+
+/**
+ * Queue a write of header as oid's HOBJECT_TO_SEQ entry in t and store the
+ * same value in the header cache.  The caller must hold the MapHeaderLock
+ * for oid (asserted).
+ */
+void DBObjectMap::set_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid, _Header header,
+  KeyValueDB::Transaction t)
+{
+  ceph_assert(l.get_locked() == oid);
+  dout(20) << "set_map_header: setting " << header.seq
+           << " oid " << oid << " parent seq "
+           << header.parent << dendl;
+
+  map<string, bufferlist> entry;
+  bufferlist &encoded = entry[map_header_key(oid)];
+  header.encode(encoded);
+  t->set(HOBJECT_TO_SEQ, entry);
+
+  {
+    // cache_lock only guards the cache update itself
+    Mutex::Locker cache_guard(cache_lock);
+    caches.add(oid, header);
+  }
+}
+
+/**
+ * Decide whether an operation at sequencer position spos must be skipped
+ * because header already reflects it.
+ *
+ * @param oid    object the operation targets (used for logging only)
+ * @param header header whose recorded spos is compared against
+ * @param spos   position of the operation; may be null
+ * @return true when spos is non-null and *spos <= header->spos (skip the
+ *         op); false when spos is null or *spos > header->spos (apply it)
+ */
+bool DBObjectMap::check_spos(const ghobject_t &oid,
+                             Header header,
+                             const SequencerPosition *spos)
+{
+  if (!spos || *spos > header->spos) {
+    if (spos) {
+      dout(10) << "oid: " << oid << " not skipping op, *spos "
+               << *spos << dendl;
+    } else {
+      dout(10) << "oid: " << oid << " not skipping op, *spos "
+               << "empty" << dendl;
+    }
+    dout(10) << " > header.spos " << header->spos << dendl;
+    return false;
+  }
+
+  // spos is guaranteed non-null here, so dereferencing it is safe.
+  dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
+           << " <= header.spos " << header->spos << dendl;
+  return true;
+}
+
+/**
+ * Append the oid of every header stored under HOBJECT_TO_SEQ to *out.
+ *
+ * @return always 0
+ */
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
+{
+  KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+  iter->seek_to_first();
+  while (iter->valid()) {
+    bufferlist encoded = iter->value();
+    auto p = encoded.cbegin();
+    _Header header;
+    header.decode(p);
+    out->push_back(header.oid);
+    iter->next();
+  }
+  return 0;
+}
+
+/**
+ * Append every leaf header under HOBJECT_TO_SEQ to *out, each followed by
+ * its chain of ancestors (walked while header.parent is non-zero).
+ *
+ * @return 0, or -ENOENT if some ancestor header could not be read; the scan
+ *         continues with the next leaf after a missing ancestor
+ */
+int DBObjectMap::list_object_headers(vector<_Header> *out)
+{
+  int error = 0;
+  KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    bufferlist bl = iter->value();
+    auto leaf_it = bl.cbegin();
+    _Header header;
+    header.decode(leaf_it);
+    out->push_back(header);
+
+    // Climb towards the root, re-decoding into the same _Header each step.
+    while (header.parent) {
+      set<string> to_get;
+      to_get.insert(HEADER_KEY);
+      map<string, bufferlist> got;
+      db->get(sys_parent_prefix(header), to_get, &got);
+      if (got.empty()) {
+        dout(0) << "Missing: seq " << header.parent << dendl;
+        error = -ENOENT;
+        break;
+      }
+      bl = got.begin()->second;
+      auto parent_it = bl.cbegin();
+      header.decode(parent_it);
+      out->push_back(header);
+    }
+  }
+  return error;
+}
+
+/// Print seq, parent, num_children and oid of h on one line.
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h)
+{
+  return out << "seq=" << h.seq << " parent=" << h.parent
+             << " num_children=" << h.num_children
+             << " ghobject=" << h.oid;
+}
+
+/**
+ * Move the omap of object from to object to.
+ *
+ * Any existing map header for to is removed and cleared first, unless
+ * check_spos reports that the operation at spos was already applied, in
+ * which case nothing is submitted and 0 is returned.  If from has no map
+ * header, only the removal of to (if any) is submitted.
+ *
+ * @return 0 when from == to or the op is skipped, otherwise the result of
+ *         db->submit_transaction()
+ */
+int DBObjectMap::rename(const ghobject_t &from,
+                        const ghobject_t &to,
+                        const SequencerPosition *spos)
+{
+  if (from == to)
+    return 0;
+
+  // The smaller oid is always locked first, whichever one is the source
+  // (presumably to keep a consistent lock order between concurrent renames).
+  MapHeaderLock _l1(this, std::min(from, to));
+  MapHeaderLock _l2(this, std::max(from, to));
+  const bool source_is_larger = from > to;
+  MapHeaderLock *lsource = source_is_larger ? &_l2 : &_l1;
+  MapHeaderLock *ltarget = source_is_larger ? &_l1 : &_l2;
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    Header destination = lookup_map_header(*ltarget, to);
+    if (destination) {
+      if (check_spos(to, destination, spos))
+        return 0;
+      destination->num_children--;
+      remove_map_header(*ltarget, to, destination, t);
+      _clear(destination, t);
+    }
+  }
+
+  Header hdr = lookup_map_header(*lsource, from);
+  if (hdr) {
+    remove_map_header(*lsource, from, hdr, t);
+    hdr->oid = to;
+    set_map_header(*ltarget, to, *hdr, t);
+  }
+
+  return db->submit_transaction(t);
+}
diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h
new file mode 100644
index 00000000..e288df83
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.h
@@ -0,0 +1,585 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef DBOBJECTMAP_DB_H
+#define DBOBJECTMAP_DB_H
+
+#include "include/buffer_fwd.h"
+#include <set>
+#include <map>
+#include <string>
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "osd/osd_types.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/simple_cache.hpp"
+#include <boost/optional/optional_io.hpp>
+
+#include "SequencerPosition.h"
+
+/**
+ * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
+ *
+ * Prefix space structure:
+ *
+ * @see complete_prefix
+ * @see user_prefix
+ * @see sys_prefix
+ *
+ * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
+ * corresponding omap header
+ * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
+ * @see State
+ * @see write_state
+ * @see init
+ * @see generate_new_header
+ * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
+ * : key->value for header->seq
+ * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
+ * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
+ * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
+ * : USER_HEADER_KEY - omap header for header->seq
+ * : HEADER_KEY - encoding of header for header->seq
+ *
+ * For each node (represented by a header), we
+ * store three mappings: the key mapping, the complete mapping, and the parent.
+ * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
+ * this mapping indicates that the key mapping contains all entries on [x,y).
+ * Note, max string is represented by "", so ""->"" indicates that the parent
+ * is unnecessary (@see rm_keys). When looking up a key not contained in the
+ * complete set, we have to check the parent if we don't find it in the
+ * key set. During rm_keys, we copy keys from the parent and update the
+ * complete set to reflect the change @see rm_keys.
+ */
+class DBObjectMap : public ObjectMap {
+public:
+
+ KeyValueDB *get_db() override { return db.get(); }
+
+ /**
+ * Serializes access to next_seq as well as the in_use set
+ */
+ Mutex header_lock;
+ // Waited on in lookup_parent until a parent seq leaves in_use; signalled
+ // by RemoveOnDelete.
+ Cond header_cond;
+ // Waited on by MapHeaderLock until an oid leaves map_header_in_use.
+ Cond map_header_cond;
+
+ /**
+ * Set of headers currently in use
+ */
+ set<uint64_t> in_use;
+ set<ghobject_t> map_header_in_use;
+
+ /**
+ * Takes the map_header_in_use entry in constructor, releases in
+ * destructor
+ */
+ class MapHeaderLock {
+ DBObjectMap *db;
+ boost::optional<ghobject_t> locked;
+
+ // Declared private and not defined in this class body: copying a lock
+ // holder is not supported.
+ MapHeaderLock(const MapHeaderLock &);
+ MapHeaderLock &operator=(const MapHeaderLock &);
+ public:
+ /// Construct without locking anything; get_locked() asserts until a
+ /// locked oid is swapped in.
+ explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
+ /// Block until no other MapHeaderLock holds oid, then claim it.
+ MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
+ Mutex::Locker l(db->header_lock);
+ while (db->map_header_in_use.count(*locked))
+ db->map_header_cond.Wait(db->header_lock);
+ db->map_header_in_use.insert(*locked);
+ }
+
+ /// The oid this object holds; asserts if nothing is held.
+ const ghobject_t &get_locked() const {
+ ceph_assert(locked);
+ return *locked;
+ }
+
+ /// Exchange the held oid (or lack of one) with o; both must belong to the
+ /// same DBObjectMap.
+ void swap(MapHeaderLock &o) {
+ ceph_assert(db == o.db);
+
+ // centos6's boost optional doesn't seem to have swap :(
+ boost::optional<ghobject_t> _locked = o.locked;
+ o.locked = locked;
+ locked = _locked;
+ }
+
+ ~MapHeaderLock() {
+ if (locked) {
+ // Signal and erase both happen under header_lock, so a woken waiter
+ // only re-checks map_header_in_use after the erase.
+ Mutex::Locker l(db->header_lock);
+ ceph_assert(db->map_header_in_use.count(*locked));
+ db->map_header_cond.Signal();
+ db->map_header_in_use.erase(*locked);
+ }
+ }
+ };
+
+ DBObjectMap(CephContext* cct, KeyValueDB *db)
+ : ObjectMap(cct, db), header_lock("DBOBjectMap"),
+ cache_lock("DBObjectMap::CacheLock"),
+ caches(cct->_conf->filestore_omap_header_cache_size)
+ {}
+
+ int set_keys(
+ const ghobject_t &oid,
+ const map<string, bufferlist> &set,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int set_header(
+ const ghobject_t &oid,
+ const bufferlist &bl,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int get_header(
+ const ghobject_t &oid,
+ bufferlist *bl
+ ) override;
+
+ int clear(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int clear_keys_header(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int rm_keys(
+ const ghobject_t &oid,
+ const set<string> &to_clear,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int get(
+ const ghobject_t &oid,
+ bufferlist *header,
+ map<string, bufferlist> *out
+ ) override;
+
+ int get_keys(
+ const ghobject_t &oid,
+ set<string> *keys
+ ) override;
+
+ int get_values(
+ const ghobject_t &oid,
+ const set<string> &keys,
+ map<string, bufferlist> *out
+ ) override;
+
+ int check_keys(
+ const ghobject_t &oid,
+ const set<string> &keys,
+ set<string> *out
+ ) override;
+
+ int get_xattrs(
+ const ghobject_t &oid,
+ const set<string> &to_get,
+ map<string, bufferlist> *out
+ ) override;
+
+ int get_all_xattrs(
+ const ghobject_t &oid,
+ set<string> *out
+ ) override;
+
+ int set_xattrs(
+ const ghobject_t &oid,
+ const map<string, bufferlist> &to_set,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int remove_xattrs(
+ const ghobject_t &oid,
+ const set<string> &to_remove,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ ) override;
+
+ /// Move the omap of from to to, replacing any omap to already has
+ int rename(
+ const ghobject_t &from,
+ const ghobject_t &to,
+ const SequencerPosition *spos=0
+ );
+
+ int legacy_clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ );
+
+ /// Read initial state from backing store
+ int get_state();
+ /// Write current state settings to DB
+ void set_state();
+ /// Read initial state and upgrade or initialize state
+ int init(bool upgrade = false);
+
+ /// Upgrade store to current version
+ int upgrade_to_v2();
+
+ /// Consistency check, debug, there must be no parallel writes
+ int check(std::ostream &out, bool repair = false, bool force = false) override;
+
+ /// Ensure that all previous operations are durable
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
+
+ void compact() override {
+ ceph_assert(db);
+ db->compact();
+ }
+
+ /// Util, get all objects, there must be no other concurrent access
+ int list_objects(vector<ghobject_t> *objs ///< [out] objects
+ );
+
+ struct _Header;
+ // Util, get all object headers, there must be no other concurrent access
+ int list_object_headers(vector<_Header> *out ///< [out] headers
+ );
+
+ ObjectMapIterator get_iterator(const ghobject_t &oid) override;
+
+ static const string USER_PREFIX;
+ static const string XATTR_PREFIX;
+ static const string SYS_PREFIX;
+ static const string COMPLETE_PREFIX;
+ static const string HEADER_KEY;
+ static const string USER_HEADER_KEY;
+ static const string GLOBAL_STATE_KEY;
+ static const string HOBJECT_TO_SEQ;
+
+ /// Legacy
+ static const string LEAF_PREFIX;
+ static const string REVERSE_LEAF_PREFIX;
+
+ /// persistent state for store @see generate_header
+ struct State {
+ static const __u8 CUR_VERSION = 3;
+ __u8 v;
+ // next sequence number to hand out (consumed by _generate_new_header)
+ uint64_t seq;
+ // legacy is false when complete regions never used
+ bool legacy;
+ State() : v(0), seq(1), legacy(false) {}
+ explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(v, bl);
+ encode(seq, bl);
+ encode(legacy, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // v is only present from struct_v 2 and legacy from struct_v 3; older
+ // encodings decode them as 0 and false respectively.
+ void decode(bufferlist::const_iterator &bl) {
+ DECODE_START(3, bl);
+ if (struct_v >= 2)
+ decode(v, bl);
+ else
+ v = 0;
+ decode(seq, bl);
+ if (struct_v >= 3)
+ decode(legacy, bl);
+ else
+ legacy = false;
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_unsigned("v", v);
+ f->dump_unsigned("seq", seq);
+ f->dump_bool("legacy", legacy);
+ }
+
+ static void generate_test_instances(list<State*> &o) {
+ o.push_back(new State(0));
+ o.push_back(new State(20));
+ }
+ } state;
+
+ /// One node of the omap tree: its own seq, its parent's seq (0 when it has
+ /// no parent), a child count, the owning object and a sequencer position.
+ struct _Header {
+ uint64_t seq;
+ uint64_t parent;
+ uint64_t num_children;
+
+ ghobject_t oid;
+
+ SequencerPosition spos;
+
+ void encode(bufferlist &bl) const {
+ // A default coll_t is still written between num_children and oid;
+ // decode() reads it back into a throwaway.
+ coll_t unused;
+ ENCODE_START(2, 1, bl);
+ encode(seq, bl);
+ encode(parent, bl);
+ encode(num_children, bl);
+ encode(unused, bl);
+ encode(oid, bl);
+ encode(spos, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // spos is only present from struct_v 2; for older encodings the current
+ // value of spos is left untouched.
+ void decode(bufferlist::const_iterator &bl) {
+ coll_t unused;
+ DECODE_START(2, bl);
+ decode(seq, bl);
+ decode(parent, bl);
+ decode(num_children, bl);
+ decode(unused, bl);
+ decode(oid, bl);
+ if (struct_v >= 2)
+ decode(spos, bl);
+ DECODE_FINISH(bl);
+ }
+
+ // Note: spos is not included in the dump.
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("parent", parent);
+ f->dump_unsigned("num_children", num_children);
+ f->dump_stream("oid") << oid;
+ }
+
+ static void generate_test_instances(list<_Header*> &o) {
+ o.push_back(new _Header);
+ o.push_back(new _Header);
+ o.back()->parent = 20;
+ o.back()->seq = 30;
+ }
+
+ // Returns the in-memory size of the struct (sizeof), not the length of
+ // its encoding.
+ size_t length() {
+ return sizeof(_Header);
+ }
+
+ _Header() : seq(0), parent(0), num_children(1) {}
+ };
+
+ /// String munging (public for testing)
+ static string ghobject_key(const ghobject_t &oid);
+ static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static int is_buggy_ghobject_key_v1(CephContext* cct,
+ const string &in);
+private:
+ /// Implicit lock on Header->seq
+ typedef std::shared_ptr<_Header> Header;
+ // cache_lock guards caches, an LRU of map headers keyed by object
+ Mutex cache_lock;
+ SimpleLRU<ghobject_t, _Header> caches;
+
+ string map_header_key(const ghobject_t &oid);
+ string header_key(uint64_t seq);
+ string complete_prefix(Header header);
+ string user_prefix(Header header);
+ string sys_prefix(Header header);
+ string xattr_prefix(Header header);
+ string sys_parent_prefix(_Header header);
+ string sys_parent_prefix(Header header) {
+ return sys_parent_prefix(*header);
+ }
+
+ /// Iterator over nothing: never valid, every seek succeeds, and
+ /// next()/key()/value() abort.
+ class EmptyIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ int seek_to_first() override { return 0; }
+ int seek_to_last() { return 0; }
+ int upper_bound(const string &after) override { return 0; }
+ int lower_bound(const string &to) override { return 0; }
+ bool valid() override { return false; }
+ int next() override { ceph_abort(); return 0; }
+ string key() override { ceph_abort(); return ""; }
+ bufferlist value() override { ceph_abort(); return bufferlist(); }
+ int status() override { return 0; }
+ };
+
+
+ /// Iterator
+ class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ DBObjectMap *map;
+
+ /// NOTE: implicit lock hlock->get_locked() when returned out of the class
+ MapHeaderLock hlock;
+ /// NOTE: implicit lock on header->seq AND for all ancestors
+ Header header;
+
+ /// parent_iter == NULL iff no parent
+ std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
+ KeyValueDB::Iterator key_iter;
+ KeyValueDB::Iterator complete_iter;
+
+ /// cur_iter points to currently valid iterator
+ std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
+ int r;
+
+ /// init() called, key_iter, complete_iter, parent_iter filled in
+ bool ready;
+ /// past end
+ bool invalid;
+
+ // hlock starts out holding nothing (single-argument MapHeaderLock ctor).
+ DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
+ map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
+ int seek_to_first() override;
+ int seek_to_last();
+ int upper_bound(const string &after) override;
+ int lower_bound(const string &to) override;
+ bool valid() override;
+ int next() override;
+ string key() override;
+ bufferlist value() override;
+ int status() override;
+
+ /// true when the current position is served by parent_iter
+ bool on_parent() {
+ return cur_iter == parent_iter;
+ }
+
+ /// skips to next valid parent entry
+ int next_parent();
+
+ /// first parent() >= to
+ int lower_bound_parent(const string &to);
+
+ /**
+ * Tests whether to_test is in complete region
+ *
+ * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
+ */
+ int in_complete_region(const string &to_test, ///< [in] key to test
+ string *begin, ///< [out] beginning of region
+ string *end ///< [out] end of region
+ ); ///< @returns true if to_test is in the complete region, else false
+
+ private:
+ int init();
+ bool valid_parent();
+ int adjust();
+ };
+
+ typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
+ DBObjectMapIterator _get_iterator(Header header) {
+ return std::make_shared<DBObjectMapIteratorImpl>(this, header);
+ }
+
+ /// sys
+
+ /// Removes node corresponding to header
+ void clear_header(Header header, KeyValueDB::Transaction t);
+
+ /// Set node containing input to new contents
+ void set_header(Header input, KeyValueDB::Transaction t);
+
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t);
+
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t);
+
+ /// Returns true if the op at spos must be skipped, i.e. spos is non-null
+ /// and *spos <= header->spos; false if spos is null or newer
+ bool check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos);
+
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Generate new header for c oid with new seq number
+ *
+ * Has the side effect of synchronously saving the new DBObjectMap state
+ */
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ /// Locking wrapper: takes header_lock around _generate_new_header
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
+ Mutex::Locker l(header_lock);
+ return _generate_new_header(oid, parent);
+ }
+
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid);
+ /// Locking wrapper: takes header_lock around _lookup_map_header
+ Header lookup_map_header(
+ const MapHeaderLock &l2,
+ const ghobject_t &oid) {
+ Mutex::Locker l(header_lock);
+ return _lookup_map_header(l2, oid);
+ }
+
+ /// Lookup header node for input
+ Header lookup_parent(Header input);
+
+
+ /// Helpers
+ int _get_header(Header header, bufferlist *bl);
+
+ /// Scan keys in header into out_keys and out_values (if nonnull)
+ int scan(Header header,
+ const set<string> &in_keys,
+ set<string> *out_keys,
+ map<string, bufferlist> *out_values);
+
+ /// Remove header and all related prefixes
+ int _clear(Header header,
+ KeyValueDB::Transaction t);
+
+ /* Scan complete region bumping *begin to the beginning of any
+ * containing region and adding all complete region keys between
+ * the updated begin and end to the complete_keys_to_remove set */
+ int merge_new_complete(DBObjectMapIterator &iter,
+ string *begin,
+ const string &end,
+ set<string> *complete_keys_to_remove);
+
+ /// Writes out State (mainly next_seq)
+ int write_state(KeyValueDB::Transaction _t =
+ KeyValueDB::Transaction());
+
+ /// Copies header entry from parent @see rm_keys
+ int copy_up_header(Header header,
+ KeyValueDB::Transaction t);
+
+ /// Sets header @see set_header
+ void _set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Removes header seq lock and possibly object lock
+ * once Header is out of scope
+ * @see lookup_parent
+ * @see generate_new_header
+ */
+ class RemoveOnDelete {
+ public:
+ DBObjectMap *db;
+ explicit RemoveOnDelete(DBObjectMap *db) :
+ db(db) {}
+ // Used as the shared_ptr deleter for Header: drops the seq from in_use,
+ // wakes header_cond waiters and frees the _Header.
+ // NOTE(review): this acquires header_lock itself, so the last Header
+ // reference presumably must not be released while header_lock is held --
+ // confirm against Mutex's recursion rules.
+ void operator() (_Header *header) {
+ Mutex::Locker l(db->header_lock);
+ ceph_assert(db->in_use.count(header->seq));
+ db->in_use.erase(header->seq);
+ db->header_cond.Signal();
+ delete header;
+ }
+ };
+ friend class RemoveOnDelete;
+};
+WRITE_CLASS_ENCODER(DBObjectMap::_Header)
+WRITE_CLASS_ENCODER(DBObjectMap::State)
+
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h);
+
+#endif
diff --git a/src/os/filestore/FDCache.h b/src/os/filestore/FDCache.h
new file mode 100644
index 00000000..ee8c4fb0
--- /dev/null
+++ b/src/os/filestore/FDCache.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FDCACHE_H
+#define CEPH_FDCACHE_H
+
+#include <memory>
+#include <errno.h>
+#include <cstdio>
+#include "common/config_obs.h"
+#include "common/hobject.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/shared_cache.hpp"
+#include "include/compat.h"
+#include "include/intarith.h"
+
+/**
+ * FD Cache
+ */
+/**
+ * FD Cache
+ *
+ * Sharded LRU cache of open file descriptors keyed by ghobject_t.  The shard
+ * for an object is picked from its hash.  The per-shard size follows the
+ * filestore_fd_cache_size option, which is observed for runtime changes.
+ *
+ * Non-copyable: the cache owns the shard array and is registered as a
+ * config observer by address.
+ */
+class FDCache : public md_config_obs_t {
+public:
+  /**
+   * FD
+   *
+   * Wrapper for an fd. Destructor closes the fd.
+   * Non-copyable: a copy would close the same descriptor a second time.
+   */
+  class FD {
+  public:
+    const int fd;
+    explicit FD(int _fd) : fd(_fd) {
+      ceph_assert(_fd >= 0);
+    }
+    FD(const FD &) = delete;
+    FD &operator=(const FD &) = delete;
+    /// Access the wrapped descriptor
+    int operator*() const {
+      return fd;
+    }
+    ~FD() {
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+    }
+  };
+
+private:
+  CephContext *cct;
+  const int registry_shards;
+  SharedLRU<ghobject_t, FD> *registry;
+
+  /// Index of the registry shard responsible for hoid
+  int shard_of(const ghobject_t &hoid) const {
+    return hoid.hobj.get_hash() % registry_shards;
+  }
+
+public:
+  /// Creates at least one shard, each sized to at least one entry, and
+  /// registers this object as a config observer.
+  explicit FDCache(CephContext *cct) : cct(cct),
+  registry_shards(std::max<int64_t>(cct->_conf->filestore_fd_cache_shards, 1)) {
+    ceph_assert(cct);
+    cct->_conf.add_observer(this);
+    registry = new SharedLRU<ghobject_t, FD>[registry_shards];
+    for (int i = 0; i < registry_shards; ++i) {
+      registry[i].set_cct(cct);
+      registry[i].set_size(
+        std::max<int64_t>((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
+    }
+  }
+  // Copying would lead to a double delete[] of registry.
+  FDCache(const FDCache &) = delete;
+  FDCache &operator=(const FDCache &) = delete;
+  ~FDCache() override {
+    cct->_conf.remove_observer(this);
+    delete[] registry;
+  }
+  typedef std::shared_ptr<FD> FDRef;
+
+  /// Look up the cached fd for hoid in its shard
+  FDRef lookup(const ghobject_t &hoid) {
+    return registry[shard_of(hoid)].lookup(hoid);
+  }
+
+  /// Wrap fd in an FD and add it to hoid's shard; *existed is filled in by
+  /// the shard's add()
+  FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
+    return registry[shard_of(hoid)].add(hoid, new FD(fd), existed);
+  }
+
+  /// clear cached fd for hoid, subsequent lookups will get an empty FD
+  void clear(const ghobject_t &hoid) {
+    registry[shard_of(hoid)].purge(hoid);
+  }
+
+  /// md_config_obs_t
+  const char** get_tracked_conf_keys() const override {
+    static const char* KEYS[] = {
+      "filestore_fd_cache_size",
+      NULL
+    };
+    return KEYS;
+  }
+  /// Re-size every shard when filestore_fd_cache_size changes
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override {
+    if (changed.count("filestore_fd_cache_size")) {
+      for (int i = 0; i < registry_shards; ++i)
+        registry[i].set_size(
+          std::max<int64_t>((conf->filestore_fd_cache_size / registry_shards), 1));
+    }
+  }
+
+};
+typedef FDCache::FDRef FDRef;
+
+#endif
diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc
new file mode 100644
index 00000000..f0351fe4
--- /dev/null
+++ b/src/os/filestore/FileJournal.cc
@@ -0,0 +1,2216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "FileJournal.h"
+#include "include/color.h"
+#include "common/perf_counters.h"
+#include "FileStore.h"
+
+#include "include/compat.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#include "common/blkdev.h"
+#if defined(__linux__)
+#include "common/linux_version.h"
+#endif
+
+#if defined(__FreeBSD__)
+#define O_DSYNC O_SYNC
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+// 1 MiB: smallest block device accepted as a journal, and the existing-file
+// size below which osd_journal_size must be configured.
+const static int64_t ONE_MEG(1 << 20);
+// Alignment (bytes) that header.alignment is checked against in open() when
+// direct I/O is enabled.
+const static int CEPH_DIRECTIO_ALIGNMENT(4096);
+
+
+/**
+ * (Re)open the journal at fn and work out its size and block size.
+ *
+ * Any fd that is already open is closed first.  Block devices are handled by
+ * _open_block_device(), regular files by _open_file(); any other file type
+ * is rejected with -EINVAL.  When built with libaio and aio is enabled, an
+ * io context is set up as well.  On success max_size is rounded down to a
+ * multiple of block_size.
+ *
+ * @param forwrite open read-write (adding O_DIRECT|O_DSYNC when directio is
+ *                 set) instead of read-only
+ * @param create   add O_CREAT, and let _open_file() size the file
+ * @return 0 on success, negative errno on failure; if the failure happens
+ *         after the open succeeded, fd is closed and reset to -1
+ */
+int FileJournal::_open(bool forwrite, bool create)
+{
+ int flags, ret;
+
+ if (forwrite) {
+ flags = O_RDWR;
+ if (directio)
+ flags |= O_DIRECT | O_DSYNC;
+ } else {
+ flags = O_RDONLY;
+ }
+ if (create)
+ flags |= O_CREAT;
+
+ // Drop a previously opened descriptor; a close error is only logged.
+ if (fd >= 0) {
+ if (TEMP_FAILURE_RETRY(::close(fd))) {
+ int err = errno;
+ derr << "FileJournal::_open: error closing old fd: "
+ << cpp_strerror(err) << dendl;
+ }
+ }
+ fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags|O_CLOEXEC, 0644));
+ if (fd < 0) {
+ int err = errno;
+ dout(2) << "FileJournal::_open unable to open journal "
+ << fn << ": " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ struct stat st;
+ ret = ::fstat(fd, &st);
+ if (ret) {
+ ret = errno;
+ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
+ ret = -ret;
+ goto out_fd;
+ }
+
+ if (S_ISBLK(st.st_mode)) {
+ ret = _open_block_device();
+ } else if (S_ISREG(st.st_mode)) {
+ // aio on a regular file is switched off unless force_aio is set
+ if (aio && !force_aio) {
+ derr << "FileJournal::_open: disabling aio for non-block journal. Use "
+ << "journal_force_aio to force use of aio anyway" << dendl;
+ aio = false;
+ }
+ ret = _open_file(st.st_size, st.st_blksize, create);
+ } else {
+ derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
+ << dendl;
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ goto out_fd;
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ aio_ctx = 0;
+ ret = io_setup(128, &aio_ctx);
+ if (ret < 0) {
+ switch (ret) {
+ // Contrary to naive expectations -EAGAIN means ...
+ case -EAGAIN:
+ derr << "FileJournal::_open: user's limit of aio events exceeded. "
+ << "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
+ break;
+ default:
+ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
+ break;
+ }
+ goto out_fd;
+ }
+ }
+#endif
+
+ /* We really want max_size to be a multiple of block_size. */
+ max_size -= max_size % block_size;
+
+ dout(1) << "_open " << fn << " fd " << fd
+ << ": " << max_size
+ << " bytes, block size " << block_size
+ << " bytes, directio = " << directio
+ << ", aio = " << aio
+ << dendl;
+ return 0;
+
+ // Common failure exit once the descriptor is open: close it and forget it.
+ out_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ fd = -1;
+ return ret;
+}
+
+/**
+ * Configure the journal for a block device that is already open on fd.
+ *
+ * The whole device is used: max_size becomes the device size and the
+ * configured osd journal size is ignored.  block_size comes from
+ * journal_block_size, and discard support is probed when journal_discard
+ * is set.
+ *
+ * @return 0 on success, -EIO if the device size cannot be read, -EINVAL if
+ *         the device is smaller than ONE_MEG
+ */
+int FileJournal::_open_block_device()
+{
+  BlkDev blkdev(fd);
+  int64_t bdev_sz = 0;
+  if (blkdev.get_size(&bdev_sz)) {
+    dout(0) << __func__ << ": failed to read block device size." << dendl;
+    return -EIO;
+  }
+
+  /* Check for bdev_sz too small */
+  if (bdev_sz < ONE_MEG) {
+    dout(0) << __func__ << ": your block device must be at least "
+            << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
+    return -EINVAL;
+  }
+
+  dout(10) << __func__ << ": ignoring osd journal size. "
+           << "We'll use the entire block device (size: " << bdev_sz << ")"
+           << dendl;
+  max_size = bdev_sz;
+  block_size = cct->_conf->journal_block_size;
+
+  if (cct->_conf->journal_discard) {
+    discard = blkdev.support_discard();
+    dout(10) << fn << " support discard: " << (int)discard << dendl;
+  }
+
+  return 0;
+}
+
+/**
+ * Configure the journal for a regular file that is already open on fd.
+ *
+ * When create is set and the file is smaller than the configured
+ * osd_journal_size (in MiB), the file is extended and preallocated to that
+ * size; otherwise the existing size is kept as max_size.  block_size comes
+ * from journal_block_size.  With create and journal_zero_on_create the whole
+ * file is then overwritten with zeros in chunks of up to 1 MiB.
+ *
+ * @param oldsize current size of the file in bytes
+ * @param blksize preferred I/O block size from stat (currently unused)
+ * @param create  whether the journal is being created
+ * @return 0 on success; -EINVAL when no size is configured and the file is
+ *         smaller than ONE_MEG; otherwise the negated error of the failing
+ *         ftruncate / fallocate / posix_memalign / pwrite call
+ */
+int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
+                            bool create)
+{
+  int ret;
+  int64_t conf_journal_sz(cct->_conf->osd_journal_size);
+  conf_journal_sz <<= 20;
+
+  if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
+    derr << "I'm sorry, I don't know how large of a journal to create."
+         << "Please specify a block device to use as the journal OR "
+         << "set osd_journal_size in your ceph.conf" << dendl;
+    return -EINVAL;
+  }
+
+  if (create && (oldsize < conf_journal_sz)) {
+    uint64_t newsize(conf_journal_sz);
+    dout(10) << __func__ << " _open extending to " << newsize << " bytes" << dendl;
+    ret = ::ftruncate(fd, newsize);
+    if (ret < 0) {
+      int err = errno;
+      derr << "FileJournal::_open_file : unable to extend journal to "
+           << newsize << " bytes: " << cpp_strerror(err) << dendl;
+      return -err;
+    }
+    // ceph_posix_fallocate reports failure as a positive error number
+    ret = ceph_posix_fallocate(fd, 0, newsize);
+    if (ret) {
+      derr << "FileJournal::_open_file : unable to preallocation journal to "
+           << newsize << " bytes: " << cpp_strerror(ret) << dendl;
+      return -ret;
+    }
+    max_size = newsize;
+  }
+  else {
+    max_size = oldsize;
+  }
+  block_size = cct->_conf->journal_block_size;
+
+  if (create && cct->_conf->journal_zero_on_create) {
+    derr << "FileJournal::_open_file : zeroing journal" << dendl;
+    const uint64_t write_size = 1 << 20;
+    char *buf;
+    // posix_memalign returns the error number directly instead of via errno
+    ret = ::posix_memalign((void **)&buf, block_size, write_size);
+    if (ret != 0) {
+      return -ret;
+    }
+    memset(static_cast<void*>(buf), 0, write_size);
+
+    // Full write_size chunks first, then one shorter write for any tail.
+    const uint64_t total = (uint64_t)max_size;
+    uint64_t off = 0;
+    while (off < total) {
+      const uint64_t left = total - off;
+      const uint64_t len = (left < write_size) ? left : write_size;
+      if (::pwrite(fd, static_cast<void*>(buf), len, off) < 0) {
+        // Capture errno before free(): free() is allowed to modify it, which
+        // would make us report the wrong (or no) error.
+        const int err = errno;
+        free(buf);
+        return -err;
+      }
+      off += len;
+    }
+    free(buf);
+  }
+
+
+  dout(10) << "_open journal is not a block device, NOT checking disk "
+           << "write cache on '" << fn << "'" << dendl;
+
+  return 0;
+}
+
+// This can not be used on an active journal
+/**
+ * Open the journal read-only, read its header and verify that the on-disk
+ * fsid matches the expected one.  The journal is closed again before
+ * returning.
+ *
+ * @return 0 if the header looks ok, -EINVAL on fsid mismatch, or the error
+ *         from _open() / read_header()
+ */
+int FileJournal::check()
+{
+  ceph_assert(fd == -1);
+  int ret = _open(false, false);
+  if (ret)
+    return ret;
+
+  ret = read_header(&header);
+  if (ret >= 0) {
+    if (header.fsid != fsid) {
+      derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+           << ", invalid (someone else's?) journal" << dendl;
+      ret = -EINVAL;
+    } else {
+      dout(1) << "check: header looks ok" << dendl;
+      ret = 0;
+    }
+  }
+
+  close();
+  return ret;
+}
+
+
+/**
+ * Create (or re-initialize) the journal at fn.
+ *
+ * Opens the journal for writing with create semantics, writes a fresh header
+ * (CRC flag enabled, fsid/max_size/block_size filled in), zeroes the first
+ * block after the header, and checks that the journal can hold
+ * osd_max_write_size MiB plus entry headers.  The descriptor is closed
+ * again before returning.
+ *
+ * @return 0 on success, negative errno on failure (-ENOSPC if the journal is
+ *         too small); fd is always -1 on return
+ */
+int FileJournal::create()
+{
+ // Locals are declared up front; presumably so the gotos below never jump
+ // over an initialization.
+ void *buf = 0;
+ int64_t needed_space;
+ int ret;
+ buffer::ptr bp;
+ dout(2) << "create " << fn << " fsid " << fsid << dendl;
+
+ ret = _open(true, true);
+ if (ret)
+ goto done;
+
+ // write empty header
+ header = header_t();
+ header.flags = header_t::FLAG_CRC; // enable crcs on any new journal.
+ header.fsid = fsid;
+ header.max_size = max_size;
+ header.block_size = block_size;
+ if (cct->_conf->journal_block_align || directio)
+ header.alignment = block_size;
+ else
+ header.alignment = 16; // at least stay word aligned on 64bit machines...
+
+ header.start = get_top();
+ header.start_seq = 0;
+
+ print_header(header);
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ bp = prepare_header();
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create : create write header error "
+ << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+
+ // zero first little bit, too.
+ // posix_memalign returns a positive error number on failure
+ ret = posix_memalign(&buf, block_size, block_size);
+ if (ret) {
+ ret = -ret;
+ derr << "FileJournal::create: failed to allocate " << block_size
+ << " bytes of memory: " << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+ memset(buf, 0, block_size);
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error zeroing first " << block_size
+ << " bytes " << cpp_strerror(ret) << dendl;
+ goto free_buf;
+ }
+
+ // Room for one maximum-size write (osd_max_write_size is in MiB), two
+ // entry headers and the area before get_top().
+ needed_space = cct->_conf->osd_max_write_size << 20;
+ needed_space += (2 * sizeof(entry_header_t)) + get_top();
+ if (header.max_size - header.start < needed_space) {
+ derr << "FileJournal::create: OSD journal is not large enough to hold "
+ << "osd_max_write_size bytes!" << dendl;
+ ret = -ENOSPC;
+ goto free_buf;
+ }
+
+ dout(2) << "create done" << dendl;
+ ret = 0;
+
+ // Cleanup labels fall through in order: free_buf -> close_fd -> done.
+free_buf:
+ free(buf);
+ buf = 0;
+close_fd:
+ // Note: a close() failure replaces whatever was in ret, including an
+ // earlier error.
+ if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
+ << dendl;
+ }
+done:
+ fd = -1;
+ return ret;
+}
+
+// This can not be used on an active journal
+/**
+ * Open the journal read-only, read its header and copy the on-disk fsid
+ * into the output argument.  The journal is closed again before returning.
+ *
+ * @param fsid [out] receives header.fsid when the header was read
+ * @return the result of _open() if that failed, otherwise the result of
+ *         read_header()
+ */
+int FileJournal::peek_fsid(uuid_d& fsid)
+{
+  ceph_assert(fd == -1);
+  int r = _open(false, false);
+  if (r)
+    return r;
+
+  r = read_header(&header);
+  if (r >= 0)
+    fsid = header.fsid;
+
+  close();
+  return r;
+}
+/**
+ * Open an existing journal and locate where reading should resume.
+ *
+ * Reads the on-disk header, checks it against this object's fsid, max_size,
+ * block_size and (in direct-io mode) alignment requirements, then walks the
+ * entries from header.start / header.start_seq with read_entry() until it
+ * finds sequence fs_op_seq + 1 or runs out of readable entries.
+ *
+ * @param fs_op_seq  the scan looks for the entry numbered fs_op_seq + 1
+ *                   (presumably the last op already applied by the caller;
+ *                   confirm against the call site)
+ * @return 0 on success; the error from _open() or read_header(); or -EINVAL
+ *         when a header check fails.  Error paths that go through the out
+ *         label close the journal again before returning.
+ */
+int FileJournal::open(uint64_t fs_op_seq)
+{
+ dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl;
+
+ uint64_t next_seq = fs_op_seq + 1;
+ uint64_t seq = -1; // placeholder; replaced by header.start_seq before the scan
+
+ int err = _open(false);
+ if (err)
+ return err;
+
+ // assume writeable, unless...
+ read_pos = 0;
+ write_pos = get_top();
+
+ // read header?
+ err = read_header(&header);
+ if (err < 0)
+ goto out;
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment]; // sized from the on-disk value, before the checks below
+ memset(zero_buf, 0, header.alignment);
+
+ dout(10) << "open header.fsid = " << header.fsid
+ //<< " vs expected fsid = " << fsid
+ << dendl;
+ if (header.fsid != fsid) {
+ derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.max_size > max_size) {
+ dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.block_size != block_size) {
+ dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.max_size % header.block_size) {
+ dout(2) << "open journal max size " << header.max_size
+ << " not a multiple of block size " << header.block_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.alignment != block_size && directio) {
+ dout(0) << "open journal alignment " << header.alignment << " does not match block size "
+ << block_size << " (required for direct_io journal mode)" << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if ((header.alignment % CEPH_DIRECTIO_ALIGNMENT) && directio) {
+ dout(0) << "open journal alignment " << header.alignment
+ << " is not multiple of minimum directio alignment "
+ << CEPH_DIRECTIO_ALIGNMENT << " (required for direct_io journal mode)"
+ << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+
+ // looks like a valid header.
+ write_pos = 0; // not writeable yet
+
+ journaled_seq = header.committed_up_to;
+
+ // find next entry
+ read_pos = header.start;
+ seq = header.start_seq;
+
+ while (1) {
+ bufferlist bl;
+ off64_t old_pos = read_pos; // start of the entry about to be read
+ if (!read_entry(bl, seq)) {
+ dout(10) << "open reached end of journal." << dendl;
+ break;
+ }
+ if (seq > next_seq) { // the entry read is already past the one we want
+ dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq
+ << ", ignoring journal contents"
+ << dendl;
+ read_pos = -1;
+ last_committed_seq = 0;
+ return 0; // returns without going through close()
+ }
+ if (seq == next_seq) {
+ dout(10) << "open reached seq " << seq << dendl;
+ read_pos = old_pos; // rewind to the start of the matching entry
+ break;
+ }
+ seq++; // next event should follow.
+ }
+
+ return 0;
+out:
+ close();
+ return err;
+}
+/**
+ * Close the descriptor passed in with ::close(), wrapped in
+ * VOID_TEMP_FAILURE_RETRY; the result is discarded.  The parameter shadows
+ * the fd member, which this const method does not modify.
+ */
+void FileJournal::_close(int fd) const
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+/**
+ * Stop the writer and close the journal file descriptor.
+ *
+ * After stop_writer() returns, asserts that the write queue is empty, that
+ * no header write is pending and that fd is open, then closes fd and resets
+ * it to -1.
+ */
+void FileJournal::close()
+{
+ dout(1) << "close " << fn << dendl;
+
+ // stop writer thread
+ stop_writer();
+
+ // close
+ ceph_assert(writeq_empty());
+ ceph_assert(!must_write_header);
+ ceph_assert(fd >= 0);
+ _close(fd);
+ fd = -1;
+}
+
+/**
+ * Dump the journal to @p out; forwards to _dump() with simple == false.
+ *
+ * @return the result of _dump()
+ */
+int FileJournal::dump(ostream& out)
+{
+ return _dump(out, false);
+}
+/**
+ * Dump the journal to @p out; forwards to _dump() with simple == true.
+ *
+ * @return the result of _dump()
+ */
+int FileJournal::simple_dump(ostream& out)
+{
+ return _dump(out, true);
+}
+
+/**
+ * Render the journal as JSON onto @p out.
+ *
+ * Builds a JSONFormatter (constructed with `true`, presumably enabling
+ * pretty-printing -- confirm), lets _fdump() populate it and flushes the
+ * formatter to the stream whether or not _fdump() succeeded.
+ *
+ * @param out     destination stream
+ * @param simple  passed through to _fdump()
+ * @return the result of _fdump()
+ */
+int FileJournal::_dump(ostream& out, bool simple)
+{
+  JSONFormatter formatter(true);
+  const int rc = _fdump(formatter, simple);
+  formatter.flush(out);
+  return rc;
+}
+
+/**
+ * Dump the journal header and every readable entry through a Formatter.
+ *
+ * The journal must be closed on entry (fd == -1, asserted); it is opened
+ * with _open(false, false) and closed again before returning.
+ *
+ * Output layout: an object "journal" holding an object "header" and an
+ * array "entries".  Each entry carries "offset" and "seq", plus either
+ * "bl.length" (simple mode) or a "transactions" array with each decoded
+ * transaction.
+ *
+ * @param f       formatter receiving the dump
+ * @param simple  when true, emit only the payload length of each entry
+ * @return 0 on success; the error from _open() or read_header(); -EINVAL
+ *         if the next position to read is 0, or if reading stops at a
+ *         sequence below header.committed_up_to (journal considered corrupt)
+ */
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+  dout(10) << "_fdump" << dendl;
+
+  ceph_assert(fd == -1);
+  int err = _open(false, false);
+  if (err)
+    return err;
+
+  err = read_header(&header);
+  if (err < 0) {
+    close();
+    return err;
+  }
+
+  off64_t next_pos = header.start;
+
+  f.open_object_section("journal");
+
+  f.open_object_section("header");
+  f.dump_unsigned("flags", header.flags);
+  ostringstream os;
+  os << header.fsid;
+  f.dump_string("fsid", os.str());
+  f.dump_unsigned("block_size", header.block_size);
+  f.dump_unsigned("alignment", header.alignment);
+  f.dump_int("max_size", header.max_size);
+  f.dump_int("start", header.start);
+  f.dump_unsigned("committed_up_to", header.committed_up_to);
+  f.dump_unsigned("start_seq", header.start_seq);
+  f.close_section();
+
+  f.open_array_section("entries");
+  uint64_t seq = header.start_seq;
+  while (1) {
+    bufferlist bl;
+    off64_t pos = next_pos;
+
+    if (!pos) {
+      dout(2) << "_dump -- not readable" << dendl;
+      err = -EINVAL;
+      break;
+    }
+    stringstream ss;
+    read_entry_result result = do_read_entry(
+      pos,
+      &next_pos,
+      &bl,
+      &seq,
+      &ss);
+    if (result != SUCCESS) {
+      // Stopping before the sequence the header claims was committed means
+      // entries are missing; otherwise this is the normal end of the journal.
+      const bool corrupt = seq < header.committed_up_to;
+      if (corrupt) {
+        dout(2) << "Unable to read past sequence " << seq
+                << " but header indicates the journal has committed up through "
+                << header.committed_up_to << ", journal is corrupt" << dendl;
+        err = -EINVAL;
+      }
+      dout(25) << ss.str() << dendl;
+      if (!corrupt) {
+        // only claim validity when corruption was not just reported
+        dout(25) << "No further valid entries found, journal is most likely valid"
+                 << dendl;
+      }
+      break;
+    }
+
+    f.open_object_section("entry");
+    f.dump_unsigned("offset", pos);
+    f.dump_unsigned("seq", seq);
+    if (simple) {
+      f.dump_unsigned("bl.length", bl.length());
+    } else {
+      f.open_array_section("transactions");
+      auto p = bl.cbegin();
+      int trans_num = 0;
+      // constructing a Transaction from p consumes it from the payload
+      while (!p.end()) {
+        ObjectStore::Transaction t(p);
+        f.open_object_section("transaction");
+        f.dump_unsigned("trans_num", trans_num);
+        t.dump(&f);
+        f.close_section();
+        trans_num++;
+      }
+      f.close_section();
+    }
+    f.close_section();
+  }
+
+  f.close_section();
+  f.close_section();
+  dout(10) << "dump finish" << dendl;
+
+  close();
+  return err;
+}
+
+/**
+ * Start the journal writer thread.
+ *
+ * Clears write_stop and aio_stop, creates write_thread ("journal_write")
+ * and, when built with libaio and aio is set, also creates
+ * write_finish_thread ("journal_wrt_fin").
+ */
+void FileJournal::start_writer()
+{
+ write_stop = false;
+ aio_stop = false;
+ write_thread.create("journal_write");
+#ifdef HAVE_LIBAIO
+ if (aio)
+ write_finish_thread.create("journal_wrt_fin");
+#endif
+}
+
+/**
+ * Stop the writer thread and, with libaio, the aio completion thread.
+ *
+ * Does nothing for the writer if write_stop is already set (writer already
+ * stopped or never started).  Otherwise flags the stop, wakes the writer,
+ * joins it and writes the header synchronously.
+ */
+void FileJournal::stop_writer()
+{
+  const bool writer_active = !write_stop;
+  if (writer_active) {
+    {
+      // both locks are held while flagging the stop and waking the thread
+      Mutex::Locker wl(write_lock);
+      Mutex::Locker ql(writeq_lock);
+      write_stop = true;
+      writeq_cond.Signal();
+      // The writer may instead be parked on commit_cond if the caller did
+      // not go through committed_thru() first; waking it is harmless.
+      commit_cond.Signal();
+    }
+    write_thread.join();
+
+    // persist the header now so that there is less to replay on remount
+    write_header_sync();
+  }
+
+#ifdef HAVE_LIBAIO
+  // The aio completion thread is stopped only *after* the writer thread
+  // has stopped and has submitted all of its io.
+  if (aio && !aio_stop) {
+    {
+      Mutex::Locker al(aio_lock);
+      aio_stop = true;
+      aio_cond.Signal();
+      write_finish_cond.Signal();
+    }
+    write_finish_thread.join();
+  }
+#endif
+}
+
+
+/**
+ * Log selected fields of @p header at debug level 10: block_size,
+ * alignment, max_size and start.  The last line logs the write_pos member,
+ * which is not part of the header argument.
+ */
+void FileJournal::print_header(const header_t &header) const
+{
+ dout(10) << "header: block_size " << header.block_size
+ << " alignment " << header.alignment
+ << " max_size " << header.max_size
+ << dendl;
+ dout(10) << "header: start " << header.start << dendl;
+ dout(10) << " write_pos " << write_pos << dendl;
+}
+
+/**
+ * Read and decode the journal header stored at offset 0 of the journal fd.
+ *
+ * Reads up to block_size bytes into a page-aligned buffer, zero-fills
+ * whatever a short read left untouched, and decodes the result into *hdr.
+ *
+ * @param hdr  receives the decoded header
+ * @return 0 on success, -errno if the read fails, -EINVAL if decoding
+ *         throws buffer::error
+ */
+int FileJournal::read_header(header_t *hdr) const
+{
+  dout(10) << "read_header" << dendl;
+  bufferlist bl;
+
+  buffer::ptr bp = buffer::create_small_page_aligned(block_size);
+  char* bpdata = bp.c_str();
+  // Retry on EINTR, like the pwrite()/close() calls on this fd elsewhere in
+  // this file, so an interrupted read is not reported as a header failure.
+  int r = TEMP_FAILURE_RETRY(::pread(fd, bpdata, bp.length(), 0));
+
+  if (r < 0) {
+    int err = errno;
+    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  // don't use bp.zero() here, because it also invalidates
+  // crc cache (which is not yet populated anyway)
+  if (bp.length() != (size_t)r) {
+    // r will be always less or equal than bp.length
+    bpdata += r;
+    memset(bpdata, 0, bp.length() - r);
+  }
+
+  bl.push_back(std::move(bp));
+
+  try {
+    auto p = bl.cbegin();
+    decode(*hdr, p);
+  }
+  catch (buffer::error& e) {
+    derr << "read_header error decoding journal header" << dendl;
+    return -EINVAL;
+  }
+
+
+  /*
+   * Unfortunately we weren't initializing the flags field for new
+   * journals!  Aie.  This is safe(ish) now that we have only one
+   * flag.  Probably around when we add the next flag we need to
+   * remove this or else this (eventually old) code will clobber newer
+   * code's flags.
+   *
+   * NOTE(review): the text above speaks of a single flag, yet the check
+   * below accepts any value up to 3 -- presumably a second flag bit has
+   * been added since; confirm against header_t.
+   */
+  if (hdr->flags > 3) {
+    derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
+    hdr->flags = 0;
+  }
+
+  print_header(*hdr);
+
+  return 0;
+}
+
+/**
+ * Build the on-disk image of the journal header.
+ *
+ * Refreshes header.committed_up_to from journaled_seq (under
+ * finisher_lock), encodes the header and copies it to the front of a
+ * page-aligned buffer of get_top() bytes, zero-filling the remainder.
+ *
+ * @return the buffer holding the encoded header followed by zeros
+ */
+bufferptr FileJournal::prepare_header()
+{
+  {
+    Mutex::Locker guard(finisher_lock);
+    header.committed_up_to = journaled_seq;
+  }
+
+  bufferlist encoded;
+  encode(header, encoded);
+
+  bufferptr out = buffer::create_small_page_aligned(get_top());
+  char* const dst = out.c_str();
+  const unsigned used = encoded.length();
+
+  // The tail is zeroed by hand instead of calling out.zero(), because
+  // zero() also invalidates the crc cache (which is not yet populated
+  // anyway).
+  memcpy(dst, encoded.c_str(), used);
+  memset(dst + used, 0, out.length() - used);
+  return out;
+}
+/**
+ * Force the journal header to be written now.
+ *
+ * Takes write_lock, sets must_write_header and calls do_write() with an
+ * empty bufferlist, so that only the header is written.
+ */
+void FileJournal::write_header_sync()
+{
+ Mutex::Locker locker(write_lock);
+ must_write_header = true;
+ bufferlist bl; // intentionally empty: no entry data accompanies the header
+ do_write(bl);
+ dout(20) << __func__ << " finish" << dendl;
+}
+
+/**
+ * Decide whether an entry of @p size bytes fits at journal offset @p pos.
+ *
+ * Free room is computed between @p pos and header.start, wrapping from
+ * header.max_size back to get_top().  One byte is held back so that
+ * pos == header.start only ever means EMPTY, never FULL.
+ *
+ * Side effects: may SloppySignal do_sync_cond when this write would take
+ * the free room below half of header.max_size, and sets must_write_header
+ * when pos + size runs past header.max_size.
+ *
+ * @param seq   not used by this function
+ * @param pos   offset at which the entry would be written
+ * @param size  on-disk size of the entry
+ * @return 0 if the entry fits; -ENOSPC if it does not, or if full_state
+ *         was already something other than FULL_NOTFULL
+ */
+int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
+{
+  // already full?
+  if (full_state != FULL_NOTFULL)
+    return -ENOSPC;
+
+  // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL.
+  off64_t room;
+  if (pos >= header.start)
+    room = (header.max_size - pos) + (header.start - get_top()) - 1;
+  else
+    room = header.start - pos - 1;
+  dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
+           << " top " << get_top() << dendl;
+
+  if (do_sync_cond) {
+    if (room >= (header.max_size >> 1) &&
+        room - size < (header.max_size >> 1)) {
+      dout(10) << " passing half full mark, triggering commit" << dendl;
+      do_sync_cond->SloppySignal();  // initiate a real commit so we can trim
+    }
+  }
+
+  if (room >= size) {
+    dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
+    if (pos + size > header.max_size)
+      must_write_header = true;
+    return 0;
+  }
+
+  // full: report the comparison that actually failed (size against room),
+  // not the write position
+  dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
+          << size << " > " << room
+          << " (max_size " << header.max_size << " start " << header.start << ")"
+          << dendl;
+
+  off64_t max = header.max_size - get_top();
+  if (size > max)
+    dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;
+
+  return -ENOSPC;
+}
+/**
+ * Gather queued write items into one buffer for a single journal write.
+ *
+ * Pops batches from the write queue and feeds each item to
+ * prepare_single_write() until the queue is empty, the journal has no room,
+ * or one of the journal_max_write_entries / journal_max_write_bytes limits
+ * is reached (a limit of 0 disables that check).  Items not consumed are
+ * pushed back with batch_unpop_write().
+ *
+ * @param bl          receives the prepared entries
+ * @param orig_ops    incremented per prepared item by prepare_single_write()
+ * @param orig_bytes  likewise, by each item's orig_len
+ * @return 0 when the batch is ready (possibly empty); -ENOSPC when
+ *         full_state was already not FULL_NOTFULL, or when the very first
+ *         item does not fit.  In the latter case, if wait_on_full is false,
+ *         full_state becomes FULL_FULL and every queued write is completed
+ *         and dropped.
+ */
+int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ // gather queued writes
+ off64_t queue_pos = write_pos;
+
+ int eleft = cct->_conf->journal_max_write_entries;
+ unsigned bmax = cct->_conf->journal_max_write_bytes;
+
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
+ while (!writeq_empty()) {
+ list<write_item> items;
+ batch_pop_write(items);
+ list<write_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ uint64_t bytes = it->bl.length(); // captured first: prepare_single_write() claims it->bl
+ int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
+ if (r == 0) { // prepare ok, delete it
+ items.erase(it++);
+#ifdef HAVE_LIBAIO
+ {
+ Mutex::Locker locker(aio_lock);
+ ceph_assert(aio_write_queue_ops > 0);
+ aio_write_queue_ops--;
+ ceph_assert(aio_write_queue_bytes >= bytes);
+ aio_write_queue_bytes -= bytes;
+ }
+#else
+ (void)bytes;
+#endif
+ }
+ if (r == -ENOSPC) {
+ // the journal maybe full, insert the left item to writeq
+ batch_unpop_write(items);
+ if (orig_ops)
+ goto out; // commit what we have
+
+ if (logger)
+ logger->inc(l_filestore_journal_full);
+
+ if (wait_on_full) {
+ dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
+ } else {
+ dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;
+
+ // throw out what we have so far
+ full_state = FULL_FULL;
+ while (!writeq_empty()) {
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ }
+
+ return -ENOSPC; // hrm, full on first op
+ }
+ if (eleft) {
+ if (--eleft == 0) {
+ dout(20) << "prepare_multi_write hit max events per write "
+ << cct->_conf->journal_max_write_entries << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ if (bmax) {
+ if (bl.length() >= bmax) {
+ dout(20) << "prepare_multi_write hit max write size "
+ << cct->_conf->journal_max_write_bytes << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
+ ceph_assert((write_pos + bl.length() == queue_pos) || // no wrap
+ (write_pos + bl.length() - header.max_size + get_top() == queue_pos)); // wrapped past max_size
+ return 0;
+}
+
+/*
+void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
+{
+ writing_seq.push_back(seq);
+ if (!waiting_for_notfull.empty()) {
+ // make sure previously unjournaled stuff waiting for UNFULL triggers
+ // _before_ newly journaled stuff does
+ dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
+ << " until after UNFULL" << dendl;
+ C_Gather *g = new C_Gather(writeq.front().fin);
+ writing_fin.push_back(g->new_sub());
+ waiting_for_notfull.push_back(g->new_sub());
+ } else {
+ writing_fin.push_back(writeq.front().fin);
+ dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
+ }
+}
+*/
+
+/**
+ * Hand every pending completion with a sequence <= @p seq to the finisher.
+ *
+ * The caller must hold finisher_lock (asserted).  Completions are taken
+ * with batch_pop_completions(), processed from the front while their seq is
+ * covered, and the remainder is given back with batch_unpop_completions().
+ * finisher_cond is signalled at the end in all cases.
+ *
+ * @param seq  highest sequence whose completion may be queued
+ */
+void FileJournal::queue_completions_thru(uint64_t seq)
+{
+  ceph_assert(finisher_lock.is_locked());
+
+  const utime_t now = ceph_clock_now();
+
+  list<completion_item> pending;
+  batch_pop_completions(pending);
+
+  // consume from the front for as long as the head is covered by seq
+  while (!pending.empty() && pending.front().seq <= seq) {
+    completion_item& ci = pending.front();
+
+    // latency is measured from the item's start time to now
+    utime_t lat = now;
+    lat -= ci.start;
+    dout(10) << "queue_completions_thru seq " << seq
+             << " queueing seq " << ci.seq
+             << " " << ci.finish
+             << " lat " << lat << dendl;
+    if (logger)
+      logger->tinc(l_filestore_journal_latency, lat);
+    if (ci.finish)
+      finisher->queue(ci.finish);
+    if (ci.tracked_op) {
+      ci.tracked_op->mark_event("journaled_completion_queued");
+      ci.tracked_op->journal_trace.event("queued completion");
+      ci.tracked_op->journal_trace.keyval("completed through", seq);
+    }
+    pending.pop_front();
+  }
+
+  // anything beyond seq goes back where it came from
+  batch_unpop_completions(pending);
+  finisher_cond.Signal();
+}
+
+
+/**
+ * Stamp one queued entry and append it to the pending write buffer.
+ *
+ * After check_for_full() confirms there is room, the entry's leading and
+ * trailing entry_header_t copies are patched in place with the sequence
+ * number, the journal position (stored in magic1) and the magic computed by
+ * entry_header_t::make_magic().  The entry's buffers are then moved onto
+ * @p bl, the (seq, position) pair is recorded in journalq and @p queue_pos
+ * is advanced, wrapping back past get_top() at header.max_size.
+ *
+ * @param next_write  item to prepare; its bl is emptied into @p bl
+ * @param bl          accumulated write buffer
+ * @param queue_pos   in: where this entry will land; out: where the next goes
+ * @param orig_ops    incremented by one
+ * @param orig_bytes  incremented by next_write.orig_len
+ * @return 0 on success, or the negative result of check_for_full()
+ */
+int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+  const uint64_t seq = next_write.seq;
+  bufferlist &entry = next_write.bl;
+  const off64_t size = entry.length();
+
+  const int full = check_for_full(seq, queue_pos, size);
+  if (full < 0)
+    return full;
+
+  const uint32_t orig_len = next_write.orig_len;
+  orig_bytes += orig_len;
+  orig_ops++;
+
+  // add to write buffer
+  dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
+           << " len " << orig_len << " -> " << size << dendl;
+
+  const unsigned seq_off = offsetof(entry_header_t, seq);
+  const unsigned magic1_off = offsetof(entry_header_t, magic1);
+  const unsigned magic2_off = offsetof(entry_header_t, magic2);
+
+  uint64_t seq_field = seq;
+  uint64_t magic1_field = queue_pos;
+  uint64_t magic2_field = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
+
+  // writes the three fields into an entry_header_t that starts at `base`
+  auto stamp = [&](bufferptr& bp, unsigned base) {
+    bp.copy_in(base + seq_off, sizeof(uint64_t), (char *)&seq_field);
+    bp.copy_in(base + magic1_off, sizeof(uint64_t), (char *)&magic1_field);
+    bp.copy_in(base + magic2_off, sizeof(uint64_t), (char *)&magic2_field);
+  };
+
+  // leading copy: starts at the beginning of the first buffer
+  bufferptr headerptr = entry.buffers().front();
+  stamp(headerptr, 0);
+
+  // trailing copy: occupies the last sizeof(entry_header_t) bytes of the
+  // final buffer
+  bufferptr footerptr = entry.buffers().back();
+  const unsigned post_offset = footerptr.length() - sizeof(entry_header_t);
+  stamp(footerptr, post_offset);
+
+  bl.claim_append(entry);
+  if (next_write.tracked_op) {
+    next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
+    next_write.tracked_op->journal_trace.event("prepare_single_write");
+  }
+
+  journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
+  writing_seq = seq;
+
+  queue_pos += size;
+  if (queue_pos >= header.max_size)
+    queue_pos = queue_pos + get_top() - header.max_size;
+
+  return 0;
+}
+
+/**
+ * Diagnostic used after a failed write: abort if a direct-io buffer is not
+ * properly aligned.
+ *
+ * Returns quietly when direct io is off or when @p bl satisfies
+ * is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT).
+ * Otherwise asserts on the length and on the position alignment first, so
+ * the failing property is identified, and then aborts unconditionally.
+ *
+ * @param pos  journal offset the buffer was meant for
+ * @param bl   buffer that was being written
+ */
+void FileJournal::check_align(off64_t pos, bufferlist& bl)
+{
+  if (!directio)
+    return;
+  if (bl.is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT))
+    return;
+
+  ceph_assert((bl.length() & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+  ceph_assert((pos & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+  ceph_abort_msg("bl was not aligned");
+}
+
+/**
+ * Write @p bl to the journal fd at offset @p pos.
+ *
+ * Seeks with lseek64() and writes with bufferlist::write_fd().  On success
+ * @p pos is advanced by bl.length() and, if that lands exactly on
+ * header.max_size, wrapped to get_top().  On failure @p pos is unchanged.
+ *
+ * @param pos  in/out journal offset
+ * @param bl   data to write
+ * @return 0 on success, -errno if the seek fails, or the non-zero result of
+ *         write_fd()
+ */
+int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
+{
+  if (::lseek64(fd, pos, SEEK_SET) < 0) {
+    const int seek_err = -errno;
+    derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(seek_err) << dendl;
+    return seek_err;
+  }
+
+  const int write_rc = bl.write_fd(fd);
+  if (write_rc != 0) {
+    derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(write_rc) << dendl;
+    return write_rc;
+  }
+
+  pos += bl.length();
+  if (pos == header.max_size)
+    pos = get_top();
+  return 0;
+}
+/**
+ * Synchronously write @p bl (and the header, when due) to the journal.
+ *
+ * Releases write_lock around the actual I/O and re-acquires it before
+ * returning, so the caller is expected to hold write_lock on entry.
+ * write_pos is advanced before the lock is dropped.  If the data would run
+ * past header.max_size it is split in two and the wrapped (second) part is
+ * written first.  Without direct io the data is flushed with fdatasync
+ * (fsync on Apple/FreeBSD).  Any I/O failure aborts the process.
+ * Afterwards journaled_seq is updated and completions are queued unless the
+ * journal is flagged full or completions are plugged.
+ *
+ * @param bl  prepared entries; may be empty when only the header is written
+ */
+void FileJournal::do_write(bufferlist& bl)
+{
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (cct->_conf->journal_write_header_frequency && // 0 disables periodic header writes
+ (((++journaled_since_start) %
+ cct->_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ dout(15) << "do_write writing " << write_pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ utime_t from = ceph_clock_now();
+
+ // entry
+ off64_t pos = write_pos;
+
+ // Adjust write_pos
+ write_pos += bl.length();
+ if (write_pos >= header.max_size)
+ write_pos = write_pos - header.max_size + get_top();
+
+ write_lock.Unlock(); // I/O below runs without write_lock; re-taken after the sync
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ ceph_assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
+ << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
+
+ //Save pos to write first piece second
+ off64_t first_pos = pos;
+ off64_t orig_pos; // pre-write offset, kept only for the error message
+ pos = get_top();
+ // header too?
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ bufferlist tmp;
+ tmp.push_back(hbp);
+ tmp.claim_append(second);
+ second.swap(tmp);
+ pos = 0; // we included the header
+ }
+ // Write the second portion first possible with the header, so
+ // do_read_entry() won't even get a valid entry_header_t if there
+ // is a crash between the two writes.
+ orig_pos = pos;
+ if (write_bl(pos, second)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ check_align(pos, second);
+ ceph_abort();
+ }
+ orig_pos = first_pos;
+ if (write_bl(first_pos, first)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ check_align(first_pos, first);
+ ceph_abort();
+ }
+ ceph_assert(first_pos == get_top()); // write_bl wraps an offset that lands on max_size
+ } else {
+ // header too?
+ if (hbp.length()) {
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
+ int err = errno;
+ derr << "FileJournal::do_write: pwrite(fd=" << fd
+ << ", hbp.length=" << hbp.length() << ") failed :"
+ << cpp_strerror(err) << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_bl(pos, bl)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << pos
+ << ") failed" << dendl;
+ check_align(pos, bl);
+ ceph_abort();
+ }
+ }
+
+ if (!directio) {
+ dout(20) << "do_write fsync" << dendl;
+
+ /*
+ * We'd really love to have a fsync_range or fdatasync_range and do a:
+ *
+ * if (split) {
+ * ::fsync_range(fd, header.max_size - split, split);
+ * ::fsync_range(fd, get_top(), bl.length() - split);
+ * else
+ * ::fsync_range(fd, write_pos, bl.length())
+ *
+ * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
+ * too hard given all the underlying infrastructure already exist.
+ *
+ * NOTE: using sync_file_range here would not be safe as it does not
+ * flush disk caches or commits any sort of metadata.
+ */
+ int ret = 0;
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = ::fsync(fd);
+#else
+ ret = ::fdatasync(fd);
+#endif
+ if (ret < 0) {
+ derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise)
+ posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); // return value is ignored
+#endif
+ }
+
+ utime_t lat = ceph_clock_now() - from;
+ dout(20) << "do_write latency " << lat << dendl;
+
+ write_lock.Lock();
+
+ ceph_assert(write_pos == pos); // the offset after the I/O must match the value precomputed above
+ ceph_assert(write_pos % header.alignment == 0);
+
+ {
+ Mutex::Locker locker(finisher_lock);
+ journaled_seq = writing_seq;
+
+ // kick finisher?
+ // only if we haven't filled up recently!
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+}
+/**
+ * Block until all journal completions have been delivered.
+ *
+ * First waits on finisher_cond (under finisher_lock) until
+ * completions_empty() holds, then waits for the finisher itself to drain
+ * with wait_for_empty().
+ */
+void FileJournal::flush()
+{
+ dout(10) << "waiting for completions to empty" << dendl;
+ {
+ Mutex::Locker l(finisher_lock);
+ while (!completions_empty())
+ finisher_cond.Wait(finisher_lock);
+ }
+ dout(10) << "flush waiting for finisher" << dendl;
+ finisher->wait_for_empty();
+ dout(10) << "flush done" << dendl;
+}
+
+/**
+ * Main loop of the journal writer thread.
+ *
+ * Each iteration: sleep on writeq_cond while there is nothing queued and no
+ * header write pending (leaving the loop once write_stop is set); with aio,
+ * throttle on the number of aios in flight; then, holding write_lock,
+ * gather a batch with prepare_multi_write(), write it with do_write() or
+ * do_aio_write(), and report it with complete_write().
+ *
+ * When the journal is full, the thread either waits on commit_cond or, if
+ * write_stop is set, completes and drops everything still queued.
+ */
+void FileJournal::write_thread_entry()
+{
+ dout(10) << "write_thread_entry start" << dendl;
+ while (1) {
+ {
+ Mutex::Locker locker(writeq_lock);
+ if (writeq.empty() && !must_write_header) {
+ if (write_stop)
+ break;
+ dout(20) << "write_thread_entry going to sleep" << dendl;
+ writeq_cond.Wait(writeq_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ Mutex::Locker locker(aio_lock);
+ // should we back off to limit aios in flight? try to do this
+ // adaptively so that we submit larger aios once we have lots of
+ // them in flight.
+ //
+ // NOTE: our condition here is based on aio_num (protected by
+ // aio_lock) and throttle_bytes (part of the write queue). when
+ // we sleep, we *only* wait for aio_num to change, and do not
+ // wake when more data is queued. this is not strictly correct,
+ // but should be fine given that we will have plenty of aios in
+ // flight if we hit this limit to ensure we keep the device
+ // saturated.
+ while (aio_num > 0) {
+ int exp = std::min<int>(aio_num * 2, 24); // exponent capped at 24, i.e. min_new <= 16 MiB
+ long unsigned min_new = 1ull << exp;
+ uint64_t cur = aio_write_queue_bytes;
+ dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
+ << " ... exp " << exp << " min_new " << min_new
+ << " ... pending " << cur << dendl;
+ if (cur >= min_new)
+ break;
+ dout(20) << "write_thread_entry deferring until more aios complete: "
+ << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
+ << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
+ aio_cond.Wait(aio_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ }
+ }
+#endif
+
+ Mutex::Locker locker(write_lock); // held for the rest of this iteration
+ uint64_t orig_ops = 0;
+ uint64_t orig_bytes = 0;
+
+ bufferlist bl;
+ int r = prepare_multi_write(bl, orig_ops, orig_bytes);
+ // Don't care about journal full if stopping, so drop queue and
+ // possibly let header get written and loop above to notice stop
+ if (r == -ENOSPC) {
+ if (write_stop) {
+ dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
+ while (!writeq_empty()) {
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ r = 0;
+ } else {
+ dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
+ commit_cond.Wait(write_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+ ceph_assert(r == 0);
+
+ if (logger) {
+ logger->inc(l_filestore_journal_wr);
+ logger->inc(l_filestore_journal_wr_bytes, bl.length());
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio)
+ do_aio_write(bl);
+ else
+ do_write(bl);
+#else
+ do_write(bl);
+#endif
+ complete_write(orig_ops, orig_bytes);
+ }
+
+ dout(10) << "write_thread_entry finish" << dendl;
+}
+
+#ifdef HAVE_LIBAIO
+void FileJournal::do_aio_write(bufferlist& bl)
+{
+ /*
+  * Submit bl (and the header, when due) to the journal through
+  * write_aio_bl().
+  *
+  * If the data would run past header.max_size it is split: the first
+  * fragment is submitted at the current position and the second at
+  * get_top(), or at offset 0 when the header is prepended to it.  Only the
+  * submission that carries the end of bl is given writing_seq; the others
+  * pass 0.  write_pos is updated from the offset left behind by
+  * write_aio_bl().  Any submission failure aborts the process.
+  */
+ if (cct->_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ cct->_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ // entry
+ off64_t pos = write_pos;
+
+ dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ ceph_assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;
+
+ if (write_aio_bl(pos, first, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ ceph_assert(pos == header.max_size); // write_aio_bl advanced pos by first.length()
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ bufferlist tmp;
+ tmp.push_back(hbp);
+ tmp.claim_append(second);
+ second.swap(tmp);
+ pos = 0; // we included the header
+ } else
+ pos = get_top(); // no header, start after that
+ if (write_aio_bl(pos, second, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ } else {
+ // header too?
+ if (hbp.length()) {
+ bufferlist hbl;
+ hbl.push_back(hbp);
+ loff_t pos = 0; // shadows the outer pos, which the header write must not move
+ if (write_aio_bl(pos, hbl, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_aio_bl(pos, bl, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ write_pos = pos;
+ if (write_pos == header.max_size)
+ write_pos = get_top();
+ ceph_assert(write_pos % header.alignment == 0);
+}
+
+/**
+ * write a buffer using aio
+ *
+ * The buffer is submitted in chunks of at most IOV_MAX-1 segments.  Each
+ * chunk is spliced out of bl into its own aio_info on aio_queue and
+ * submitted with io_submit(); -EAGAIN is retried with a doubling delay,
+ * any other failure aborts.  bl is empty on return and write_finish_cond
+ * is signalled once everything has been submitted.
+ *
+ * @param pos in/out offset; advanced by the length of every chunk submitted
+ * @param bl data to write; consumed
+ * @param seq seq to trigger when this aio completes. if 0, do not update any state
+ * on completion.  Only the last chunk carries seq; earlier chunks carry 0.
+ * @return 0 (failures abort rather than return)
+ */
+int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
+{
+ dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
+
+ while (bl.length() > 0) {
+ int max = std::min<int>(bl.get_num_buffers(), IOV_MAX-1);
+ iovec *iov = new iovec[max]; // stored in aio.iov below; presumably released by aio_info -- confirm
+ int n = 0;
+ unsigned len = 0;
+ for (auto p = std::cbegin(bl.buffers()); n < max; ++p, ++n) {
+ ceph_assert(p != std::cend(bl.buffers()));
+ iov[n].iov_base = const_cast<void*>(static_cast<const void*>(p->c_str()));
+ iov[n].iov_len = p->length();
+ len += p->length();
+ }
+
+ bufferlist tbl;
+ bl.splice(0, len, &tbl); // move bytes from bl -> tbl
+
+ // lock only aio_queue, current aio, aio_num, aio_bytes, which may be
+ // modified in check_aio_completion
+ aio_lock.Lock();
+ aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
+ aio_info& aio = aio_queue.back();
+ aio.iov = iov;
+
+ io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
+
+ dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
+ << " in " << n << dendl;
+
+ aio_num++;
+ aio_bytes += aio.len;
+
+ // need to save current aio len to update write_pos later because current
+ // aio could be erased from aio_queue once it is done
+ uint64_t cur_len = aio.len;
+ // unlock aio_lock because following io_submit might take time to return
+ aio_lock.Unlock();
+
+ iocb *piocb = &aio.iocb;
+
+ // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+ int attempts = 16;
+ int delay = 125;
+ do {
+ int r = io_submit(aio_ctx, 1, &piocb);
+ dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
+ if (r < 0) {
+ derr << "io_submit to " << aio.off << "~" << cur_len
+ << " got " << cpp_strerror(r) << dendl;
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ continue;
+ }
+ check_align(pos, tbl);
+ ceph_abort_msg("io_submit got unexpected error");
+ } else {
+ break;
+ }
+ } while (true);
+ pos += cur_len;
+ }
+ aio_lock.Lock();
+ write_finish_cond.Signal();
+ aio_lock.Unlock();
+ return 0;
+}
+#endif
+/**
+ * Main loop of the aio completion thread (an empty function without
+ * libaio).
+ *
+ * Sleeps on write_finish_cond while aio_queue is empty and leaves the loop
+ * once aio_stop is set.  Otherwise reaps up to 16 events per
+ * io_getevents() call, marks the matching aio_info entries done and lets
+ * check_aio_completion() retire them.  A short or failed aio, or any
+ * io_getevents() error other than -EINTR, aborts the process.
+ */
+void FileJournal::write_finish_thread_entry()
+{
+#ifdef HAVE_LIBAIO
+ dout(10) << __func__ << " enter" << dendl;
+ while (true) {
+ {
+ Mutex::Locker locker(aio_lock);
+ if (aio_queue.empty()) {
+ if (aio_stop)
+ break;
+ dout(20) << __func__ << " sleeping" << dendl;
+ write_finish_cond.Wait(aio_lock);
+ continue;
+ }
+ }
+
+ dout(20) << __func__ << " waiting for aio(s)" << dendl;
+ io_event event[16];
+ int r = io_getevents(aio_ctx, 1, 16, event, NULL); // blocks for at least one event; no timeout
+ if (r < 0) {
+ if (r == -EINTR) {
+ dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
+ continue;
+ }
+ derr << "io_getevents got " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ note_io_error_event(devname.c_str(), fn.c_str(), -EIO, 0, 0, 0);
+ }
+ ceph_abort_msg("got unexpected error from io_getevents");
+ }
+
+ {
+ Mutex::Locker locker(aio_lock);
+ for (int i=0; i<r; i++) {
+ aio_info *ai = (aio_info *)event[i].obj; // assumes iocb is the first member of aio_info -- TODO confirm
+ if (event[i].res != ai->len) {
+ derr << "aio to " << ai->off << "~" << ai->len
+ << " returned: " << (int)event[i].res << dendl;
+ ceph_abort_msg("unexpected aio error");
+ }
+ dout(10) << __func__ << " aio " << ai->off
+ << "~" << ai->len << " done" << dendl;
+ ai->done = true;
+ }
+ check_aio_completion();
+ }
+ }
+ dout(10) << __func__ << " exit" << dendl;
+#endif
+}
+
+#ifdef HAVE_LIBAIO
+/**
+ * Retire completed aios from the head of aio_queue and update state.
+ *
+ * The caller must hold aio_lock (asserted).  Entries are removed strictly
+ * in queue order, stopping at the first one that is not done, so
+ * journaled_seq never moves past an aio that is still outstanding.  If any
+ * retired entry carried a non-zero seq, journaled_seq takes the last such
+ * seq and completions are queued through it, unless the journal is flagged
+ * full or completions are plugged.  aio_cond is signalled when at least one
+ * entry was retired.
+ */
+void FileJournal::check_aio_completion()
+{
+  ceph_assert(aio_lock.is_locked());
+  dout(20) << "check_aio_completion" << dendl;
+
+  bool retired_any = false;
+  bool saw_seq = false;
+  uint64_t latest_seq = 0;
+
+  while (!aio_queue.empty() && aio_queue.front().done) {
+    aio_info& head = aio_queue.front();
+    dout(20) << "check_aio_completion completed seq " << head.seq << " "
+             << head.off << "~" << head.len << dendl;
+    if (head.seq) {
+      latest_seq = head.seq;
+      saw_seq = true;
+    }
+    aio_num--;
+    aio_bytes -= head.len;
+    aio_queue.pop_front();
+    retired_any = true;
+  }
+
+  if (saw_seq) {
+    // kick finisher?
+    // only if we haven't filled up recently!
+    Mutex::Locker locker(finisher_lock);
+    journaled_seq = latest_seq;
+    if (full_state != FULL_NOTFULL) {
+      dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
+               << ", full_commit_seq|full_restart_seq" << dendl;
+    } else if (plug_journal_completions) {
+      dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
+               << " due to completion plug" << dendl;
+    } else {
+      dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
+      queue_completions_thru(journaled_seq);
+    }
+  }
+
+  if (retired_any) {
+    // maybe write queue was waiting for aio count to drop?
+    aio_cond.Signal();
+  }
+}
+#endif
+/**
+ * Encode a list of transactions into one on-disk journal entry.
+ *
+ * The transactions are encoded back to back, followed by whatever *tbl
+ * already contained.  The resulting entry is laid out as
+ *   entry_header_t | pre_pad zeros | payload | post_pad zeros | entry_header_t
+ * where pre_pad is derived from the data alignment of the transaction with
+ * the largest data length (if one reaches journal_align_min_size) and
+ * post_pad rounds the total up to header.alignment.  With direct io the
+ * entry is rebuilt to CEPH_DIRECTIO_ALIGNMENT.
+ *
+ * @param tls  transactions to encode
+ * @param tbl  in: extra bytes appended to the payload; out: the finished
+ *             entry (previous contents are consumed)
+ * @return the payload length (h.len)
+ */
+int FileJournal::prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) {
+ dout(10) << "prepare_entry " << tls << dendl;
+ int data_len = cct->_conf->journal_align_min_size - 1; // threshold: only larger data sets data_align
+ int data_align = -1; // -1 indicates that we don't care about the alignment
+ bufferlist bl;
+ for (vector<ObjectStore::Transaction>::iterator p = tls.begin();
+ p != tls.end(); ++p) {
+ if ((int)(*p).get_data_length() > data_len) {
+ data_len = (*p).get_data_length();
+ data_align = ((*p).get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
+ }
+ encode(*p, bl);
+ }
+ if (tbl->length()) {
+ bl.claim_append(*tbl);
+ }
+ // add it this entry
+ entry_header_t h;
+ unsigned head_size = sizeof(entry_header_t);
+ off64_t base_size = 2*head_size + bl.length(); // header + payload + footer, before padding
+ memset(&h, 0, sizeof(h));
+ if (data_align >= 0)
+ h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
+ off64_t size = round_up_to(base_size + h.pre_pad, header.alignment);
+ unsigned post_pad = size - base_size - h.pre_pad;
+ h.len = bl.length();
+ h.post_pad = post_pad;
+ h.crc32c = bl.crc32c(0);
+ dout(10) << " len " << bl.length() << " -> " << size
+ << " (head " << head_size << " pre_pad " << h.pre_pad
+ << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
+ << " (bl alignment " << data_align << ")"
+ << dendl;
+ bufferlist ebl;
+ // header
+ ebl.append((const char*)&h, sizeof(h));
+ if (h.pre_pad) {
+ ebl.push_back(buffer::create_static(h.pre_pad, zero_buf)); // NOTE(review): assumes pre_pad <= size of zero_buf (header.alignment bytes) -- confirm
+ }
+ // payload
+ ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ if (h.post_pad) {
+ ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
+ }
+ // footer
+ ebl.append((const char*)&h, sizeof(h)); // same bytes as the leading header
+ if (directio)
+ ebl.rebuild_aligned(CEPH_DIRECTIO_ALIGNMENT);
+ tbl->claim(ebl);
+ return h.len;
+}
+
+/**
+ * Queue a prepared journal entry for writing.
+ *
+ * Updates the perf counters, registers the entry with the throttle, records
+ * tracing events on osd_op when present, then appends a completion_item and
+ * a write_item under writeq_lock, aio_lock (when built with libaio) and
+ * completions_lock, taken in that order.
+ *
+ * @param seq       sequence number of the entry
+ * @param e         framed entry; must be non-empty and smaller than the journal
+ * @param orig_len  length used for the queue_bytes counter
+ * @param oncommit  callback stored in the completion_item
+ * @param osd_op    optional tracked op used for event and trace reporting
+ */
+void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+                               Context *oncommit, TrackedOpRef osd_op)
+{
+  // dump on queue
+  dout(5) << "submit_entry seq " << seq
+          << " len " << e.length()
+          << " (" << oncommit << ")" << dendl;
+  ceph_assert(e.length() > 0);
+  ceph_assert(e.length() < header.max_size);
+
+  if (logger) {
+    logger->inc(l_filestore_journal_queue_bytes, orig_len);
+    logger->inc(l_filestore_journal_queue_ops, 1);
+  }
+
+  throttle.register_throttle_seq(seq, e.length());
+  if (logger) {
+    logger->inc(l_filestore_journal_ops, 1);
+    logger->inc(l_filestore_journal_bytes, e.length());
+  }
+
+  if (osd_op) {
+    osd_op->mark_event("commit_queued_for_journal_write");
+    if (osd_op->store_trace) {
+      osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace);
+      osd_op->journal_trace.event("submit_entry");
+      osd_op->journal_trace.keyval("seq", seq);
+    }
+  }
+
+  {
+    Mutex::Locker writeq_guard(writeq_lock);
+#ifdef HAVE_LIBAIO
+    Mutex::Locker aio_guard(aio_lock);
+#endif
+    Mutex::Locker completions_guard(completions_lock);
+
+#ifdef HAVE_LIBAIO
+    aio_write_queue_ops++;
+    aio_write_queue_bytes += e.length();
+    aio_cond.Signal();
+#endif
+
+    completions.push_back(
+      completion_item(seq, oncommit, ceph_clock_now(), osd_op));
+    // signal only on the empty -> non-empty transition
+    if (writeq.empty()) {
+      writeq_cond.Signal();
+    }
+    writeq.push_back(write_item(seq, e, orig_len, osd_op));
+    if (osd_op) {
+      osd_op->journal_trace.keyval("queue depth", writeq.size());
+    }
+  }
+}
+
+/// Report whether writeq currently holds no items (checked under writeq_lock).
+bool FileJournal::writeq_empty()
+{
+  Mutex::Locker guard(writeq_lock);
+  const bool nothing_queued = writeq.empty();
+  return nothing_queued;
+}
+
+/**
+ * Return a reference to the item at the head of writeq.
+ *
+ * The caller must hold write_lock. writeq_lock is held only while the
+ * reference is obtained, not while the caller uses it.
+ */
+FileJournal::write_item &FileJournal::peek_write()
+{
+  ceph_assert(write_lock.is_locked());
+  Mutex::Locker guard(writeq_lock);
+  write_item &head = writeq.front();
+  return head;
+}
+
+/**
+ * Drop the item at the head of writeq and subtract it from the
+ * queue perf counters. The caller must hold write_lock.
+ */
+void FileJournal::pop_write()
+{
+  ceph_assert(write_lock.is_locked());
+  Mutex::Locker guard(writeq_lock);
+  if (logger) {
+    const write_item &head = writeq.front();
+    logger->dec(l_filestore_journal_queue_bytes, head.orig_len);
+    logger->dec(l_filestore_journal_queue_ops, 1);
+  }
+  writeq.pop_front();
+}
+
+/**
+ * Swap the whole of writeq with items, then subtract every item now in
+ * items from the queue perf counters. The caller must hold write_lock.
+ */
+void FileJournal::batch_pop_write(list<write_item> &items)
+{
+  ceph_assert(write_lock.is_locked());
+  {
+    Mutex::Locker guard(writeq_lock);
+    writeq.swap(items);
+  }
+  if (!logger) {
+    return;
+  }
+  for (const auto &item : items) {
+    logger->dec(l_filestore_journal_queue_bytes, item.orig_len);
+    logger->dec(l_filestore_journal_queue_ops, 1);
+  }
+}
+
+/**
+ * Add items back to the queue perf counters and splice them onto the
+ * front of writeq, leaving items empty. The caller must hold write_lock.
+ */
+void FileJournal::batch_unpop_write(list<write_item> &items)
+{
+  ceph_assert(write_lock.is_locked());
+  if (logger) {
+    for (const auto &item : items) {
+      logger->inc(l_filestore_journal_queue_bytes, item.orig_len);
+      logger->inc(l_filestore_journal_queue_ops, 1);
+    }
+  }
+  Mutex::Locker guard(writeq_lock);
+  writeq.splice(writeq.begin(), items);
+}
+
+/**
+ * Advance the journal-full state machine at the start of a commit.
+ *
+ *   FULL_NOTFULL : nothing to do
+ *   FULL_FULL    : move to FULL_WAIT once seq has reached journaled_seq,
+ *                  otherwise stay in FULL_FULL
+ *   FULL_WAIT    : return to FULL_NOTFULL and set the completion plug
+ *
+ * NOTE(review): the FULL_FULL -> FULL_WAIT log line prints ">" although the
+ * test is ">="; the message text is left as is.
+ */
+void FileJournal::commit_start(uint64_t seq)
+{
+  dout(10) << "commit_start" << dendl;
+
+  // was full?
+  if (full_state == FULL_NOTFULL) {
+    return; // all good
+  } else if (full_state == FULL_WAIT) {
+    dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl;
+    full_state = FULL_NOTFULL;
+    plug_journal_completions = true;
+  } else if (full_state == FULL_FULL) {
+    const bool caught_up = (seq >= journaled_seq);
+    if (!caught_up) {
+      dout(1) << "FULL_FULL commit_start on seq "
+              << seq << " < journaled_seq " << journaled_seq
+              << ", remaining in FULL_FULL"
+              << dendl;
+    } else {
+      dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq "
+              << seq << " > journaled_seq " << journaled_seq
+              << ", moving to FULL_WAIT."
+              << dendl;
+      full_state = FULL_WAIT;
+    }
+  }
+}
+
+/**
+ * Send a discard command to the journal block device.
+ *
+ * The range is shrunk to whole blocks before it is issued: offset is rounded
+ * up to a block boundary and end is replaced by
+ * round_up_to(end - block_size, block_size). Nothing is sent when the shrunk
+ * range is empty. A failing discard is only logged.
+ *
+ * @param offset  first byte of the region that may be discarded
+ * @param end     end of the region, as passed by the caller
+ */
+void FileJournal::do_discard(int64_t offset, int64_t end)
+{
+  dout(10) << __func__ << " trim(" << offset << ", " << end << ")" << dendl;
+
+  offset = round_up_to(offset, block_size);
+  if (offset >= end)
+    return;
+  end = round_up_to(end - block_size, block_size);
+  ceph_assert(end >= offset);
+  if (offset < end) {
+    BlkDev blkdev(fd);
+    if (blkdev.discard(offset, end - offset) < 0) {
+      dout(1) << __func__ << " ioctl(BLKDISCARD) error: " << cpp_strerror(errno) << dendl;
+    }
+  }
+}
+
+/**
+ * Record that everything up to and including seq has been committed.
+ *
+ * Under write_lock this flushes the throttle, advances last_committed_seq,
+ * queues completions (lifting the completion plug once seq has reached
+ * header.start_seq), trims journalq and moves header.start forward,
+ * optionally discards the freed region, marks the header dirty, drops
+ * queued writes whose seq is already covered, and signals commit_cond.
+ *
+ * A seq below last_committed_seq trips the assertion; a seq equal to it
+ * returns after the throttle flush without further work.
+ */
+void FileJournal::committed_thru(uint64_t seq)
+{
+  Mutex::Locker write_guard(write_lock);
+
+  auto released = throttle.flush(seq);
+  if (logger) {
+    logger->dec(l_filestore_journal_ops, released.first);
+    logger->dec(l_filestore_journal_bytes, released.second);
+  }
+
+  if (seq < last_committed_seq) {
+    dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
+    ceph_assert(seq >= last_committed_seq);
+    return;
+  }
+  if (seq == last_committed_seq) {
+    dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
+    return;
+  }
+
+  dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
+  last_committed_seq = seq;
+
+  // completions!
+  {
+    Mutex::Locker finisher_guard(finisher_lock);
+    queue_completions_thru(seq);
+    if (plug_journal_completions && seq >= header.start_seq) {
+      dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
+      plug_journal_completions = false;
+      queue_completions_thru(journaled_seq);
+    }
+  }
+
+  // adjust start pointer: forget every journalled entry covered by seq
+  while (!journalq.empty() && journalq.front().first <= seq) {
+    journalq.pop_front();
+  }
+
+  const int64_t previous_start = header.start;
+  if (journalq.empty()) {
+    header.start = write_pos;
+    header.start_seq = seq + 1;
+  } else {
+    header.start = journalq.front().second;
+    header.start_seq = journalq.front().first;
+  }
+
+  if (discard) {
+    dout(10) << __func__ << " will trim (" << previous_start << ", " << header.start << ")" << dendl;
+    if (previous_start < header.start) {
+      do_discard(previous_start, header.start - 1);
+    } else {
+      // the freed region wraps around the end of the ring
+      do_discard(previous_start, header.max_size - 1);
+      do_discard(get_top(), header.start - 1);
+    }
+  }
+
+  must_write_header = true;
+  print_header(header);
+
+  // committed but unjournaled items
+  while (!writeq_empty() && peek_write().seq <= seq) {
+    dout(15) << " dropping committed but unwritten seq " << peek_write().seq
+             << " len " << peek_write().bl.length()
+             << dendl;
+    complete_write(1, peek_write().orig_len);
+    pop_write();
+  }
+
+  commit_cond.Signal();
+
+  dout(10) << "committed_thru done" << dendl;
+}
+
+
+// Log that `ops` operations totalling `bytes` bytes have finished.
+// As written here this only emits a debug line; it updates no state.
+void FileJournal::complete_write(uint64_t ops, uint64_t bytes)
+{
+ dout(5) << __func__ << " finished " << ops << " ops and "
+ << bytes << " bytes" << dendl;
+}
+
+/**
+ * Switch the journal from reading to writing.
+ *
+ * Applies the throttle parameters, reopens the journal via _open(true),
+ * positions write_pos at the last read position (or at get_top() when
+ * nothing was read), clears read_pos, marks the header dirty and starts
+ * the writer.
+ *
+ * @return 0 on success, or the negative error from set_throttle_params()
+ *         or _open()
+ */
+int FileJournal::make_writeable()
+{
+  dout(10) << __func__ << dendl;
+
+  int r = set_throttle_params();
+  if (r < 0) {
+    return r;
+  }
+  r = _open(true);
+  if (r < 0) {
+    return r;
+  }
+
+  write_pos = (read_pos > 0) ? read_pos : get_top();
+  read_pos = 0;
+
+  must_write_header = true;
+
+  start_writer();
+  return 0;
+}
+
+/**
+ * Push the current journal throttle configuration into `throttle`.
+ *
+ * The last numeric argument is the usable journal size,
+ * header.max_size - get_top().
+ *
+ * @return 0 when the parameters were accepted, -EINVAL otherwise (the
+ *         reason reported by the throttle is logged)
+ */
+int FileJournal::set_throttle_params()
+{
+  stringstream why;
+  const bool accepted = throttle.set_params(
+    cct->_conf->journal_throttle_low_threshhold,
+    cct->_conf->journal_throttle_high_threshhold,
+    cct->_conf->filestore_expected_throughput_bytes,
+    cct->_conf->journal_throttle_high_multiple,
+    cct->_conf->journal_throttle_max_multiple,
+    header.max_size - get_top(),
+    &why);
+
+  if (accepted) {
+    return 0;
+  }
+  derr << "tried to set invalid params: "
+       << why.str()
+       << dendl;
+  return -EINVAL;
+}
+
+/// Null-terminated list of the config option names this object observes.
+const char** FileJournal::get_tracked_conf_keys() const
+{
+  static const char *tracked_keys[] = {
+    "journal_throttle_low_threshhold",
+    "journal_throttle_high_threshhold",
+    "journal_throttle_high_multiple",
+    "journal_throttle_max_multiple",
+    "filestore_expected_throughput_bytes",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+/**
+ * Read olen bytes from the journal ring starting at pos, appending to *bl.
+ *
+ * A position at or beyond header.max_size is wrapped back to just after
+ * the journal header (get_top()). A read that would cross the end of the
+ * ring is split into one chunk per pass. A short or failed read is logged
+ * and aborts the process.
+ *
+ * @param pos      start position
+ * @param olen     number of bytes to read
+ * @param bl       [out] receives one buffer per chunk read
+ * @param out_pos  [out, optional] wrapped position following the data read
+ */
+void FileJournal::wrap_read_bl(
+  off64_t pos,
+  int64_t olen,
+  bufferlist* bl,
+  off64_t *out_pos
+  ) const
+{
+  for (int64_t remaining = olen; remaining > 0; ) {
+    while (pos >= header.max_size) {
+      pos = pos + get_top() - header.max_size;
+    }
+
+    // either the rest of the request, or whatever fits before the ring end
+    const int64_t chunk =
+      (pos + remaining > header.max_size) ? header.max_size - pos : remaining;
+
+    int64_t actual = ::lseek64(fd, pos, SEEK_SET);
+    ceph_assert(actual == pos);
+
+    bufferptr bp = buffer::create(chunk);
+    int r = safe_read_exact(fd, bp.c_str(), chunk);
+    if (r) {
+      derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << chunk << " returned "
+           << cpp_strerror(r) << dendl;
+      ceph_abort();
+    }
+    bl->push_back(std::move(bp));
+    pos += chunk;
+    remaining -= chunk;
+  }
+  if (pos >= header.max_size) {
+    pos = pos + get_top() - header.max_size;
+  }
+  if (out_pos) {
+    *out_pos = pos;
+  }
+}
+
+/**
+ * Read the journal entry at read_pos during replay.
+ *
+ * @param bl        [out] payload of the entry
+ * @param next_seq  [in/out] compared against the sequence found; set to
+ *                  that sequence when the entry is accepted
+ * @param corrupt   [out, optional] set to true when corruption is detected
+ *                  and journal_ignore_corruption is enabled
+ * @return true when an entry was accepted and read_pos advanced; false when
+ *         the journal is not readable, the entry's seq is below next_seq,
+ *         or no valid entry was found
+ *
+ * An unreadable entry whose seq is non-zero and below
+ * header.committed_up_to is treated as corruption; unless
+ * journal_ignore_corruption is set this aborts.
+ */
+bool FileJournal::read_entry(
+  bufferlist &bl,
+  uint64_t &next_seq,
+  bool *corrupt)
+{
+  if (corrupt) {
+    *corrupt = false;
+  }
+  uint64_t seq = next_seq;
+
+  if (!read_pos) {
+    dout(2) << "read_entry -- not readable" << dendl;
+    return false;
+  }
+
+  const off64_t pos = read_pos;
+  off64_t next_pos = pos;
+  stringstream ss;
+  const read_entry_result result = do_read_entry(pos, &next_pos, &bl, &seq, &ss);
+
+  if (result == SUCCESS) {
+    journalq.push_back(pair<uint64_t,off64_t>(seq, pos));
+    // bytes occupied by this entry, allowing for a wrap past the ring end
+    uint64_t amount_to_take;
+    if (next_pos > pos) {
+      amount_to_take = next_pos - pos;
+    } else {
+      amount_to_take = (header.max_size - pos) + (next_pos - get_top());
+    }
+    throttle.take(amount_to_take);
+    throttle.register_throttle_seq(next_seq, amount_to_take);
+    if (logger) {
+      logger->inc(l_filestore_journal_ops, 1);
+      logger->inc(l_filestore_journal_bytes, amount_to_take);
+    }
+    if (next_seq > seq) {
+      return false;
+    }
+    read_pos = next_pos;
+    next_seq = seq;
+    if (seq > journaled_seq) {
+      journaled_seq = seq;
+    }
+    return true;
+  }
+
+  derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
+
+  if (seq && seq < header.committed_up_to) {
+    derr << "Unable to read past sequence " << seq
+         << " but header indicates the journal has committed up through "
+         << header.committed_up_to << ", journal is corrupt" << dendl;
+    if (!cct->_conf->journal_ignore_corruption) {
+      ceph_abort();
+    }
+    if (corrupt) {
+      *corrupt = true;
+    }
+    return false;
+  }
+
+  dout(2) << "No further valid entries found, journal is most likely valid"
+          << dendl;
+  return false;
+}
+
+/**
+ * Read and validate one journal entry starting at init_pos.
+ *
+ * Validation steps, in order: header magic, footer byte-identical to the
+ * header, and (when the journal has FLAG_CRC or the entry carries a
+ * non-zero crc) the payload crc32c.
+ *
+ * @param init_pos  position of the entry header
+ * @param next_pos  [out, optional] position after this entry on SUCCESS;
+ *                  on bad header magic init_pos + 4KiB; on a footer or crc
+ *                  mismatch the position after the footer
+ * @param bl        [out, optional] payload; cleared and refilled once the
+ *                  header magic has been accepted
+ * @param seq       [out, optional] entry sequence number, SUCCESS only
+ * @param ss        [out, optional] receives a description of the failure
+ * @param _h        [out, optional] copy of the entry header, SUCCESS only
+ * @return SUCCESS or MAYBE_CORRUPT
+ */
+FileJournal::read_entry_result FileJournal::do_read_entry(
+  off64_t init_pos,
+  off64_t *next_pos,
+  bufferlist *bl,
+  uint64_t *seq,
+  ostream *ss,
+  entry_header_t *_h) const
+{
+  off64_t cur_pos = init_pos;
+  bufferlist scratch;
+  if (!bl) {
+    bl = &scratch;
+  }
+
+  // header
+  bufferlist hbl;
+  off64_t after_header;
+  wrap_read_bl(cur_pos, sizeof(entry_header_t), &hbl, &after_header);
+  entry_header_t *h = reinterpret_cast<entry_header_t *>(hbl.c_str());
+
+  if (!h->check_magic(cur_pos, header.get_fsid64())) {
+    dout(25) << "read_entry " << init_pos
+             << " : bad header magic, end of journal" << dendl;
+    if (ss) {
+      *ss << "bad header magic";
+    }
+    if (next_pos) {
+      *next_pos = init_pos + (4<<10); // check 4k ahead
+    }
+    return MAYBE_CORRUPT;
+  }
+  cur_pos = after_header;
+
+  // pad + body + pad
+  cur_pos += h->pre_pad;
+
+  bl->clear();
+  wrap_read_bl(cur_pos, h->len, bl, &cur_pos);
+
+  cur_pos += h->post_pad;
+
+  // footer: must be a byte-for-byte copy of the header
+  bufferlist fbl;
+  wrap_read_bl(cur_pos, sizeof(entry_header_t), &fbl, &cur_pos);
+  entry_header_t *f = reinterpret_cast<entry_header_t *>(fbl.c_str());
+  if (memcmp(f, h, sizeof(*f)) != 0) {
+    if (ss) {
+      *ss << "bad footer magic, partial entry";
+    }
+    if (next_pos) {
+      *next_pos = cur_pos;
+    }
+    return MAYBE_CORRUPT;
+  }
+
+  const bool verify_crc =
+    (header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal)
+    h->crc32c != 0;                        // newer entry in old journal
+  if (verify_crc) {
+    uint32_t actual_crc = bl->crc32c(0);
+    if (actual_crc != h->crc32c) {
+      if (ss) {
+        *ss << "header crc (" << h->crc32c
+            << ") doesn't match body crc (" << actual_crc << ")";
+      }
+      if (next_pos) {
+        *next_pos = cur_pos;
+      }
+      return MAYBE_CORRUPT;
+    }
+  }
+
+  // yay!
+  dout(2) << "read_entry " << init_pos << " : seq " << h->seq
+          << " " << h->len << " bytes"
+          << dendl;
+
+  // ok! hand results back through whichever out-params were supplied
+  if (seq) {
+    *seq = h->seq;
+  }
+  if (next_pos) {
+    *next_pos = cur_pos;
+  }
+  if (_h) {
+    *_h = *h;
+  }
+
+  ceph_assert(cur_pos % header.alignment == 0);
+  return SUCCESS;
+}
+
+// Forward `count` to throttle.get(). Whether and how long this blocks is
+// decided by JournalThrottle, which is not visible here.
+void FileJournal::reserve_throttle_and_backoff(uint64_t count)
+{
+ throttle.get(count);
+}
+
+/**
+ * Debug/testing helper: walk the journal from header.start until the entry
+ * with sequence wanted_seq is found.
+ *
+ * @param wanted_seq  sequence number to look for
+ * @param _pos        [out, optional] position of the matching entry
+ * @param h           [out] header of the last entry read, i.e. the match
+ *
+ * Aborts if any entry on the way fails to read cleanly; there is no
+ * "not found" return.
+ */
+void FileJournal::get_header(
+  uint64_t wanted_seq,
+  off64_t *_pos,
+  entry_header_t *h)
+{
+  off64_t pos = header.start;
+  off64_t next_pos = pos;
+  bufferlist bl;
+  uint64_t seq = 0;
+  dout(2) << __func__ << dendl;
+  for (;;) {
+    bl.clear();
+    pos = next_pos;
+    const read_entry_result result =
+      do_read_entry(pos, &next_pos, &bl, &seq, 0, h);
+    if (result == FAILURE || result == MAYBE_CORRUPT) {
+      ceph_abort();
+    }
+    if (seq != wanted_seq) {
+      continue;
+    }
+    if (_pos) {
+      *_pos = pos;
+    }
+    return;
+  }
+  ceph_abort(); // not reachable
+}
+
+/**
+ * Debug/testing helper: deliberately damage one byte of the journal.
+ *
+ * Reads the byte at corrupt_at through the journal's own fd, increments it,
+ * and writes it back at the same offset through wfd. An offset at or past
+ * header.max_size is wrapped once to just after the journal header.
+ *
+ * @param wfd         descriptor used for the write
+ * @param corrupt_at  journal offset of the byte to alter
+ */
+void FileJournal::corrupt(
+  int wfd,
+  off64_t corrupt_at)
+{
+  dout(2) << __func__ << dendl;
+  if (corrupt_at >= header.max_size) {
+    corrupt_at = corrupt_at + get_top() - header.max_size;
+  }
+
+  int64_t where = ::lseek64(fd, corrupt_at, SEEK_SET);
+  ceph_assert(where == corrupt_at);
+
+  char scratch[10];
+  int rc = safe_read_exact(fd, scratch, 1);
+  ceph_assert(rc == 0);
+
+  where = ::lseek64(wfd, corrupt_at, SEEK_SET);
+  ceph_assert(where == corrupt_at);
+
+  scratch[0]++;
+  rc = safe_write(wfd, scratch, 1);
+  ceph_assert(rc == 0);
+}
+
+/// Debug/testing helper: alter the first payload byte of the entry with
+/// sequence seq (the byte right after its header and pre-padding).
+void FileJournal::corrupt_payload(
+  int wfd,
+  uint64_t seq)
+{
+  dout(2) << __func__ << dendl;
+  off64_t entry_pos = 0;
+  entry_header_t h;
+  get_header(seq, &entry_pos, &h);
+  off64_t corrupt_at = entry_pos + sizeof(entry_header_t) + h.pre_pad;
+  corrupt(wfd, corrupt_at);
+}
+
+
+/// Debug/testing helper: alter the first byte of magic2 in the footer of
+/// the entry with sequence seq.
+void FileJournal::corrupt_footer_magic(
+  int wfd,
+  uint64_t seq)
+{
+  dout(2) << __func__ << dendl;
+  off64_t entry_pos = 0;
+  entry_header_t h;
+  get_header(seq, &entry_pos, &h);
+  // byte offset of magic2 within entry_header_t
+  const auto magic2_offset =
+    reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h);
+  off64_t corrupt_at =
+    entry_pos + sizeof(entry_header_t) + h.pre_pad +
+    h.len + h.post_pad +
+    magic2_offset;
+  corrupt(wfd, corrupt_at);
+}
+
+
+/// Debug/testing helper: alter the first byte of magic2 in the header of
+/// the entry with sequence seq.
+void FileJournal::corrupt_header_magic(
+  int wfd,
+  uint64_t seq)
+{
+  dout(2) << __func__ << dendl;
+  off64_t entry_pos = 0;
+  entry_header_t h;
+  get_header(seq, &entry_pos, &h);
+  // byte offset of magic2 within entry_header_t
+  const auto magic2_offset =
+    reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h);
+  off64_t corrupt_at = entry_pos + magic2_offset;
+  corrupt(wfd, corrupt_at);
+}
+
+/**
+ * Estimate the number of bytes between header.start and write_pos,
+ * treating the journal as a ring of max_size bytes.
+ *
+ * NOTE(review): in the wrapped case this does not subtract get_top(), unlike
+ * the equivalent computation in read_entry(), so the figure looks slightly
+ * high there. Presumably acceptable for an estimate; confirm.
+ */
+off64_t FileJournal::get_journal_size_estimate()
+{
+  const off64_t start = header.start;
+  const bool wrapped = write_pos < start;
+  off64_t size = wrapped ? (max_size - start) + write_pos
+                         : write_pos - start;
+  dout(20) << __func__ << " journal size=" << size << dendl;
+  return size;
+}
+
+/// Resolve the whole-disk device behind the journal fd and pass it, with ls,
+/// to get_raw_devices(). Does nothing when the lookup reports an error.
+void FileJournal::get_devices(set<string> *ls)
+{
+  BlkDev blkdev(fd);
+  string whole_disk;
+  const int rc = blkdev.wholedisk(&whole_disk);
+  if (rc != 0) {
+    return;
+  }
+  get_raw_devices(whole_disk, ls);
+}
+
+/**
+ * Publish the journal's partition path and whole-disk device node in *pm.
+ *
+ * Each key is set to "unknown" when the corresponding lookup fails. On a
+ * successful whole-disk lookup devname is updated as well.
+ */
+void FileJournal::collect_metadata(map<string,string> *pm)
+{
+  BlkDev blkdev(fd);
+  char partition_path[PATH_MAX];
+  char dev_node[PATH_MAX];
+
+  const bool partition_failed = blkdev.partition(partition_path, PATH_MAX);
+  (*pm)["backend_filestore_journal_partition_path"] =
+    partition_failed ? string("unknown") : string(partition_path);
+
+  const bool wholedisk_failed = blkdev.wholedisk(dev_node, PATH_MAX);
+  if (wholedisk_failed) {
+    (*pm)["backend_filestore_journal_dev_node"] = "unknown";
+    return;
+  }
+  (*pm)["backend_filestore_journal_dev_node"] = string(dev_node);
+  devname = dev_node;
+}
diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h
new file mode 100644
index 00000000..2313b4b8
--- /dev/null
+++ b/src/os/filestore/FileJournal.h
@@ -0,0 +1,556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
+
+#include <stdlib.h>
+#include <deque>
+using std::deque;
+
+#include "Journal.h"
+#include "common/config_fwd.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "JournalThrottle.h"
+#include "common/zipkin_trace.h"
+
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+/**
+ * Implements journaling on top of block device or file.
+ *
+ * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock)
+ */
+class FileJournal :
+ public Journal,
+ public md_config_obs_t {
+public:
+ /// Protected by finisher_lock
+ // One entry of the `completions` list: the callback supplied to
+ // submit_entry() for journal sequence `seq`, the time it was queued,
+ // and the optional tracked op that goes with it.
+ struct completion_item {
+ uint64_t seq; // journal sequence the callback belongs to
+ Context *finish; // callback passed as `oncommit`
+ utime_t start; // time the item was created
+ TrackedOpRef tracked_op;
+ completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref)
+ : seq(o), finish(c), start(s), tracked_op(opref) {}
+ completion_item() : seq(0), finish(0), start(0) {}
+ };
+ // One entry of `writeq`: a prepared journal entry waiting to be written.
+ struct write_item {
+ uint64_t seq; // journal sequence of the entry
+ bufferlist bl; // the framed entry bytes
+ uint32_t orig_len; // length used for the queue_bytes perf counter
+ TrackedOpRef tracked_op;
+ ZTracer::Trace trace;
+ // Takes over the contents of `b` via bufferlist::claim rather than
+ // copying them. NOTE(review): `ol` is an int while orig_len is uint32_t.
+ write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
+ seq(s), orig_len(ol), tracked_op(opref) {
+ bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ }
+ write_item() : seq(0), orig_len(0) {}
+ };
+
+ Mutex finisher_lock;
+ Cond finisher_cond;
+ uint64_t journaled_seq;
+ bool plug_journal_completions;
+
+ Mutex writeq_lock;
+ Cond writeq_cond;
+ list<write_item> writeq;
+ bool writeq_empty();
+ write_item &peek_write();
+ void pop_write();
+ void batch_pop_write(list<write_item> &items);
+ void batch_unpop_write(list<write_item> &items);
+
+ Mutex completions_lock;
+ list<completion_item> completions;
+ // True when no completion_item is queued (checked under completions_lock).
+ bool completions_empty() {
+ Mutex::Locker l(completions_lock);
+ return completions.empty();
+ }
+ // Swap the whole `completions` list with `items` under completions_lock.
+ void batch_pop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.swap(items);
+ }
+ // Splice `items` back onto the front of `completions`, leaving `items`
+ // empty.
+ void batch_unpop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.splice(completions.begin(), items);
+ }
+ // Return a copy of the first queued completion_item; asserts that the
+ // list is not empty.
+ completion_item completion_peek_front() {
+ Mutex::Locker l(completions_lock);
+ ceph_assert(!completions.empty());
+ return completions.front();
+ }
+ // Remove the first queued completion_item; asserts that the list is not
+ // empty.
+ void completion_pop_front() {
+ Mutex::Locker l(completions_lock);
+ ceph_assert(!completions.empty());
+ completions.pop_front();
+ }
+
+ int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) override;
+
+ void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) override;
+ /// End protected by finisher_lock
+
+ /*
+ * journal header
+ */
+ // Journal header, with a versioned encoding. encode() always writes
+ // version 4; decode() accepts versions 0 through 4.
+ struct header_t {
+ enum {
+ FLAG_CRC = (1<<0),
+ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
+ };
+
+ uint64_t flags;
+ uuid_d fsid;
+ __u32 block_size;
+ __u32 alignment;
+ int64_t max_size; // max size of journal ring buffer
+ int64_t start; // offset of first entry
+ uint64_t committed_up_to; // committed up to
+
+ /**
+ * start_seq
+ *
+ * entry at header.start has sequence >= start_seq
+ *
+ * Generally, the entry at header.start will have sequence
+ * start_seq if it exists. The only exception is immediately
+ * after journal creation since the first sequence number is
+ * not known.
+ *
+ * If the first read on open fails, we can assume corruption
+ * if start_seq > committed_up_to because the entry would have
+ * a sequence >= start_seq and therefore > committed_up_to.
+ */
+ uint64_t start_seq;
+
+ header_t() :
+ flags(0), block_size(0), alignment(0), max_size(0), start(0),
+ committed_up_to(0), start_seq(0) {}
+
+ // Reset the start offset to block_size; no other field is touched.
+ void clear() {
+ start = block_size;
+ }
+
+ // First 8 bytes of the fsid reinterpreted as a uint64_t.
+ // NOTE(review): relies on a pointer cast of fsid.bytes(); alignment and
+ // aliasing assumptions are not checked here.
+ uint64_t get_fsid64() const {
+ return *(uint64_t*)fsid.bytes();
+ }
+
+ // Layout: __u32 version (4), then a nested bufferlist holding the fields
+ // in the order below. decode() depends on this exact field order.
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ __u32 v = 4;
+ encode(v, bl);
+ bufferlist em;
+ {
+ encode(flags, em);
+ encode(fsid, em);
+ encode(block_size, em);
+ encode(alignment, em);
+ encode(max_size, em);
+ encode(start, em);
+ encode(committed_up_to, em);
+ encode(start_seq, em);
+ }
+ encode(em, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ __u32 v;
+ decode(v, bl);
+ if (v < 2) { // normally 0, but conceivably 1
+ // decode old header_t struct (pre v0.40).
+ bl.advance(4u); // skip __u32 flags (it was unused by any old code)
+ flags = 0;
+ // the old format stored a 64-bit fsid; it is duplicated into both
+ // halves of the 16-byte fsid
+ uint64_t tfsid;
+ decode(tfsid, bl);
+ *(uint64_t*)&fsid.bytes()[0] = tfsid;
+ *(uint64_t*)&fsid.bytes()[8] = tfsid;
+ decode(block_size, bl);
+ decode(alignment, bl);
+ decode(max_size, bl);
+ decode(start, bl);
+ committed_up_to = 0;
+ start_seq = 0;
+ return;
+ }
+ // v >= 2: fields live in a nested bufferlist
+ bufferlist em;
+ decode(em, bl);
+ auto t = em.cbegin();
+ decode(flags, t);
+ decode(fsid, t);
+ decode(block_size, t);
+ decode(alignment, t);
+ decode(max_size, t);
+ decode(start, t);
+
+ // committed_up_to is present from version 3 on
+ if (v > 2)
+ decode(committed_up_to, t);
+ else
+ committed_up_to = 0;
+
+ // start_seq is present from version 4 on
+ if (v > 3)
+ decode(start_seq, t);
+ else
+ start_seq = 0;
+ }
+ } header;
+
+  /**
+   * Fixed-size record written both before (header) and after (footer)
+   * every journal entry payload.
+   *
+   * magic1 is compared against the entry's own position in the journal and
+   * magic2 against fsid ^ seq ^ len; see check_magic().
+   */
+  struct entry_header_t {
+    uint64_t seq;     // fs op seq #
+    uint32_t crc32c;  // payload only.  not header, pre_pad, post_pad, or footer.
+    uint32_t len;     // payload length in bytes
+    uint32_t pre_pad, post_pad;  // zero padding before / after the payload
+    uint64_t magic1;
+    uint64_t magic2;
+
+    /// Value magic2 must hold for an entry with the given seq, len and fsid.
+    static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+      return (fsid ^ seq ^ len);
+    }
+    /// True when magic1 equals pos and magic2 equals make_magic() for this
+    /// entry's seq and len. Does not modify the header.
+    bool check_magic(off64_t pos, uint64_t fsid) const {
+      return
+        magic1 == (uint64_t)pos &&
+        magic2 == make_magic(seq, len, fsid);
+    }
+  } __attribute__((__packed__, aligned(4)));
+
+ // True when journalq holds no (seq, offset) pairs. Takes no lock itself.
+ bool journalq_empty() { return journalq.empty(); }
+
+private:
+ string fn;
+
+ char *zero_buf;
+ off64_t max_size;
+ size_t block_size;
+ bool directio, aio, force_aio;
+ bool must_write_header;
+ off64_t write_pos; // byte where the next entry to be written will go
+ off64_t read_pos; //
+ bool discard; //for block journal whether support discard
+
+#ifdef HAVE_LIBAIO
+ /// state associated with an in-flight aio request
+ /// Protected by aio_lock
+ ///
+ /// NOTE(review): `iov` is a raw pointer released with delete[] in the
+ /// destructor while the struct remains copyable; copying an instance whose
+ /// iov is non-null would lead to a double delete. Verify that callers only
+ /// copy it while iov is still NULL.
+ struct aio_info {
+ struct iocb iocb {};
+ bufferlist bl;
+ struct iovec *iov;
+ bool done;
+ uint64_t off, len; ///< these are for debug only
+ uint64_t seq; ///< seq number to complete on aio completion, if non-zero
+
+ // Takes over the contents of `b`; len is captured from b before the claim.
+ aio_info(bufferlist& b, uint64_t o, uint64_t s)
+ : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
+ bl.claim(b);
+ }
+ ~aio_info() {
+ delete[] iov;
+ }
+ };
+ Mutex aio_lock;
+ Cond aio_cond;
+ Cond write_finish_cond;
+ io_context_t aio_ctx;
+ list<aio_info> aio_queue;
+ int aio_num, aio_bytes;
+ uint64_t aio_write_queue_ops;
+ uint64_t aio_write_queue_bytes;
+ /// End protected by aio_lock
+#endif
+
+ uint64_t last_committed_seq;
+ uint64_t journaled_since_start;
+
+ string devname;
+
+ /*
+ * full states cycle at the beginning of each commit epoch, when commit_start()
+ * is called.
+ * FULL - we just filled up during this epoch.
+ * WAIT - we filled up last epoch; now we have to wait until everything during
+ * that epoch commits to the fs before we can start writing over it.
+ * NOTFULL - all good, journal away.
+ */
+ enum {
+ FULL_NOTFULL = 0,
+ FULL_FULL = 1,
+ FULL_WAIT = 2,
+ } full_state;
+
+ int fd;
+
+ // in journal
+ deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later.
+ uint64_t writing_seq;
+
+
+ // throttle
+ int set_throttle_params();
+ const char** get_tracked_conf_keys() const override;
+  /// Config observer hook: re-apply the throttle parameters once if any
+  /// tracked key is in `changed`. The result of set_throttle_params() is
+  /// not checked here.
+  void handle_conf_change(
+    const ConfigProxy& conf,
+    const std::set <std::string> &changed) override {
+    const char **key = get_tracked_conf_keys();
+    while (*key) {
+      if (changed.count(string(*key))) {
+        set_throttle_params();
+        return;
+      }
+      ++key;
+    }
+  }
+
+ void complete_write(uint64_t ops, uint64_t bytes);
+ JournalThrottle throttle;
+
+ // write thread
+ Mutex write_lock;
+ bool write_stop;
+ bool aio_stop;
+
+ Cond commit_cond;
+
+ int _open(bool wr, bool create=false);
+ int _open_block_device();
+ void _close(int fd) const;
+ int _open_file(int64_t oldsize, blksize_t blksize, bool create);
+ int _dump(ostream& out, bool simple);
+ void print_header(const header_t &hdr) const;
+ int read_header(header_t *hdr) const;
+ bufferptr prepare_header();
+ void start_writer();
+ void stop_writer();
+ void write_thread_entry();
+
+ void queue_completions_thru(uint64_t seq);
+
+ int check_for_full(uint64_t seq, off64_t pos, off64_t size);
+ int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee);
+ int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos,
+ uint64_t& orig_ops, uint64_t& orig_bytes);
+ void do_write(bufferlist& bl);
+
+ void write_finish_thread_entry();
+ void check_aio_completion();
+ void do_aio_write(bufferlist& bl);
+ int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq);
+
+
+ void check_align(off64_t pos, bufferlist& bl);
+ int write_bl(off64_t& pos, bufferlist& bl);
+
+ /// read len from journal starting at in_pos and wrapping up to len
+ void wrap_read_bl(
+ off64_t in_pos, ///< [in] start position
+ int64_t len, ///< [in] length to read
+ bufferlist* bl, ///< [out] result
+ off64_t *out_pos ///< [out] next position to read, will be wrapped
+ ) const;
+
+ void do_discard(int64_t offset, int64_t end);
+
+ // Thread whose body is FileJournal::write_thread_entry().
+ class Writer : public Thread {
+ FileJournal *journal;
+ public:
+ explicit Writer(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_thread_entry();
+ return 0;
+ }
+ } write_thread;
+
+ // Thread whose body is FileJournal::write_finish_thread_entry().
+ class WriteFinisher : public Thread {
+ FileJournal *journal;
+ public:
+ explicit WriteFinisher(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_finish_thread_entry();
+ return 0;
+ }
+ } write_finish_thread;
+
+ // Offset of the first byte usable for journal entries: the size of the
+ // in-memory header rounded up to a multiple of block_size. Wrapped read
+ // positions restart here.
+ off64_t get_top() const {
+ return round_up_to(sizeof(header), block_size);
+ }
+
+ ZTracer::Endpoint trace_endpoint;
+
+ public:
+ // Construct a closed journal (fd == -1, write_stop and aio_stop set) for
+ // the file or device named by `f`, and register as a config observer.
+ //
+ // dio - stored in `directio`
+ // ai - stored in `aio`; forced off when directio is not set, and, in
+ // builds without libaio, also forced off unless CEPH_DEV is set
+ // in the environment
+ // faio - stored in `force_aio`
+ FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond,
+ const char *f, bool dio=false, bool ai=true, bool faio=false) :
+ Journal(cct, fsid, fin, sync_cond),
+ finisher_lock("FileJournal::finisher_lock", false, true, false),
+ journaled_seq(0),
+ plug_journal_completions(false),
+ writeq_lock("FileJournal::writeq_lock", false, true, false),
+ completions_lock(
+ "FileJournal::completions_lock", false, true, false),
+ fn(f),
+ zero_buf(NULL),
+ max_size(0), block_size(0),
+ directio(dio), aio(ai), force_aio(faio),
+ must_write_header(false),
+ write_pos(0), read_pos(0),
+ discard(false),
+#ifdef HAVE_LIBAIO
+ aio_lock("FileJournal::aio_lock"),
+ aio_ctx(0),
+ aio_num(0), aio_bytes(0),
+ aio_write_queue_ops(0),
+ aio_write_queue_bytes(0),
+#endif
+ last_committed_seq(0),
+ journaled_since_start(0),
+ full_state(FULL_NOTFULL),
+ fd(-1),
+ writing_seq(0),
+ throttle(cct->_conf->filestore_caller_concurrency),
+ write_lock("FileJournal::write_lock", false, true, false),
+ write_stop(true),
+ aio_stop(true),
+ write_thread(this),
+ write_finish_thread(this),
+ trace_endpoint("0.0.0.0", 0, "FileJournal") {
+
+ // aio is only used together with direct I/O
+ if (aio && !directio) {
+ lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+ aio = false;
+ }
+#ifndef HAVE_LIBAIO
+ if (aio && ::getenv("CEPH_DEV") == NULL) {
+ lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+ aio = false;
+ }
+#endif
+
+ cct->_conf.add_observer(this);
+ }
+ // The journal must already be closed (fd == -1) when destroyed. Frees
+ // zero_buf and unregisters the config observer added by the constructor.
+ ~FileJournal() override {
+ ceph_assert(fd == -1);
+ delete[] zero_buf;
+ cct->_conf.remove_observer(this);
+ }
+
+ int check() override;
+ int create() override;
+ int open(uint64_t fs_op_seq) override;
+ void close() override;
+ int peek_fsid(uuid_d& fsid);
+
+ int dump(ostream& out) override;
+ int simple_dump(ostream& out);
+ int _fdump(Formatter &f, bool simple);
+
+ void flush() override;
+
+ void get_devices(set<string> *ls) override;
+ void collect_metadata(map<string,string> *pm) override;
+
+ void reserve_throttle_and_backoff(uint64_t count) override;
+
+ // The journal counts as writeable once read_pos is 0; make_writeable()
+ // resets read_pos to 0.
+ bool is_writeable() override {
+ return read_pos == 0;
+ }
+ int make_writeable() override;
+
+ // writes
+ void commit_start(uint64_t seq) override;
+ void committed_thru(uint64_t seq) override;
+ // True while the journal is in a full state (FULL_FULL or FULL_WAIT) and
+ // write_stop is not set.
+ bool should_commit_now() override {
+ return full_state != FULL_NOTFULL && !write_stop;
+ }
+
+ void write_header_sync();
+
+ // Set wait_on_full, a member not declared in this class (presumably
+ // inherited from Journal; verify).
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ off64_t get_journal_size_estimate();
+
+ // reads
+
+ /// Result code for read_entry
+ // SUCCESS - the entry passed every check in do_read_entry()
+ // FAILURE - treated as fatal by get_header(); the do_read_entry()
+ // body in FileJournal.cc never returns it
+ // MAYBE_CORRUPT - header magic, footer or crc check failed
+ enum read_entry_result {
+ SUCCESS,
+ FAILURE,
+ MAYBE_CORRUPT
+ };
+
+ /**
+ * do_read_entry
+ *
+ * Reads the entry starting at pos. If the entry appears
+ * clean, *bl will contain the payload, *seq will contain
+ * the sequence number, and *next_pos will reflect the next
+ * read position. If the entry is invalid *ss will contain
+ * debug text and *seq will be unchanged. *bl is left untouched
+ * only when the header magic is bad; once the header has been
+ * accepted *bl is cleared and refilled even if the footer or
+ * crc check then fails.
+ *
+ * If the entry suggests a corrupt log, *ss will contain debug
+ * text, *next_pos will contain the next index to check. If
+ * we find an entry in this way that returns SUCCESS, the journal
+ * is most likely corrupt.
+ */
+ read_entry_result do_read_entry(
+ off64_t pos, ///< [in] position to read
+ off64_t *next_pos, ///< [out] next position to read
+ bufferlist* bl, ///< [out] payload for successful read
+ uint64_t *seq, ///< [out] seq of successful read
+ ostream *ss, ///< [out] error output
+ entry_header_t *h = 0 ///< [out] header
+ ) const; ///< @return result code
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq,
+ bool *corrupt
+ );
+
+ // Journal interface overload: same as the three-argument read_entry()
+ // with no corruption out-parameter.
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq) override {
+ return read_entry(bl, last_seq, 0);
+ }
+
+ // Debug/Testing
+ void get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h);
+ void corrupt(
+ int wfd,
+ off64_t corrupt_at);
+ void corrupt_payload(
+ int wfd,
+ uint64_t seq);
+ void corrupt_footer_magic(
+ int wfd,
+ uint64_t seq);
+ void corrupt_header_magic(
+ int wfd,
+ uint64_t seq);
+};
+
+WRITE_CLASS_ENCODER(FileJournal::header_t)
+
+#endif
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
new file mode 100644
index 00000000..d387947e
--- /dev/null
+++ b/src/os/filestore/FileStore.cc
@@ -0,0 +1,6425 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "boost/tuple/tuple.hpp"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#include <linux/falloc.h>
+#endif
+
+#include <iostream>
+#include <map>
+
+#include "include/linux_fiemap.h"
+
+#include "common/xattr.h"
+#include "chain_xattr.h"
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+
+#include <fstream>
+#include <sstream>
+
+#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
+#include "XfsFileStoreBackend.h"
+#include "ZFSFileStoreBackend.h"
+#include "common/BackTrace.h"
+#include "include/types.h"
+#include "FileJournal.h"
+
+#include "osd/osd_types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/run_cmd.h"
+#include "common/safe_io.h"
+#include "common/perf_counters.h"
+#include "common/sync_filesystem.h"
+#include "common/fd.h"
+#include "HashIndex.h"
+#include "DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+#include "common/ceph_crypto.h"
+using ceph::crypto::SHA1;
+
+#include "include/ceph_assert.h"
+
+#include "common/config.h"
+#include "common/blkdev.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore(" << basedir << ") "
+
+#define COMMIT_SNAP_ITEM "snap_%llu"
+#define CLUSTER_SNAP_ITEM "clustersnap_%s"
+
+#define REPLAY_GUARD_XATTR "user.cephos.seq"
+#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+
+// XATTR_SPILL_OUT_NAME is an xattr that records whether an object's xattrs
+// spill over into DBObjectMap. If XATTR_SPILL_OUT_NAME exists among the file's
+// xattrs and its value is XATTR_NO_SPILL_OUT, no xattrs are in DBObjectMap.
+#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
+#define XATTR_NO_SPILL_OUT "0"
+#define XATTR_SPILL_OUT "1"
+#define __FUNC__ __func__ << "(" << __LINE__ << ")"
+
+// Feature set placed in a newly created superblock: the compat, ro_compat
+// and incompat feature sets all start out empty.
+static CompatSet get_fs_initial_compat_set() {
+  CompatSet::FeatureSet compat_features;
+  CompatSet::FeatureSet ro_compat_features;
+  CompatSet::FeatureSet incompat_features;
+  CompatSet initial(compat_features, ro_compat_features, incompat_features);
+  return initial;
+}
+
+// Features this FileStore supports. Starts from the initial set and adds
+// features that code may set later but that are not part of a new superblock.
+static CompatSet get_fs_supported_compat_set() {
+  CompatSet supported(get_fs_initial_compat_set());
+  supported.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  return supported;
+}
+
+// Reject objects whose maximum escaped name length exceeds
+// m_filestore_max_xattr_value_size.
+// Returns 0 when the key is acceptable, -ENAMETOOLONG otherwise.
+int FileStore::validate_hobject_key(const hobject_t &obj) const
+{
+  const unsigned escaped_len = LFNIndex::get_max_escaped_name_len(obj);
+  if (escaped_len > m_filestore_max_xattr_value_size)
+    return -ENAMETOOLONG;
+  return 0;
+}
+
+// Read the fsid stored in the journal at path into *fsid.
+// Returns whatever FileJournal::peek_fsid() returns.
+int FileStore::get_block_device_fsid(CephContext* cct, const string& path,
+                                     uuid_d *fsid)
+{
+  // The two trailing 'false' arguments keep the journal from trying aio or
+  // direct_io (and from logging annoying errors when those fail);
+  // performance is irrelevant for a one-off peek.
+  FileJournal probe(cct, *fsid, 0, 0, path.c_str(), false, false);
+  const int r = probe.peek_fsid(*fsid);
+  return r;
+}
+
+// Feed the current journal and apply latency averages from logger into the
+// commit and apply latency trackers, in that order.
+void FileStore::FSPerfTracker::update_from_perfcounters(
+  PerfCounters &logger)
+{
+  const auto journal_avg = logger.get_tavg_ns(l_filestore_journal_latency);
+  os_commit_latency_ns.consume_next(journal_avg);
+
+  const auto apply_avg = logger.get_tavg_ns(l_filestore_apply_latency);
+  os_apply_latency_ns.consume_next(apply_avg);
+}
+
+
+// Print an OpSequencer as "osr(<cid>)".
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s)
+{
+  out << "osr(" << s.cid << ")";
+  return out;
+}
+
+// Format the directory path of collection cid ("<basedir>/current/<cid>")
+// into s, writing at most len bytes. Returns snprintf()'s result.
+int FileStore::get_cdir(const coll_t& cid, char *s, int len)
+{
+  return snprintf(s, len, "%s/current/%s",
+                  basedir.c_str(), cid.to_str().c_str());
+}
+
+// Report an -EIO event for this store's device and base directory, then abort.
+void FileStore::handle_eio()
+{
+  // No attempt is made to map the error back to an offset, or to tell reads
+  // from writes: a file system and several other layers sit between us and
+  // the device, so offset and length are all reported as zero.
+  const char *dev = devname.c_str();
+  const char *dir = basedir.c_str();
+  note_io_error_event(dev, dir, -EIO, 0, 0, 0);
+  ceph_abort_msg("unexpected eio error");
+}
+
+// Fetch the index for collection cid into *index.
+// Returns the index manager's result; an -EIO result triggers handle_eio()
+// when m_filestore_fail_eio is set.
+int FileStore::get_index(const coll_t& cid, Index *index)
+{
+  const int r = index_manager.get_index(cid, basedir, index);
+  if (m_filestore_fail_eio && r == -EIO)
+    handle_eio();
+  return r;
+}
+
+// Initialize the index for collection cid at its directory path, using
+// target_version. Returns the index manager's result; an -EIO result
+// triggers handle_eio() when m_filestore_fail_eio is set.
+int FileStore::init_index(const coll_t& cid)
+{
+  char cdir[PATH_MAX];
+  get_cdir(cid, cdir, sizeof(cdir));
+
+  const int r = index_manager.init_index(cid, cdir, target_version);
+  if (m_filestore_fail_eio && r == -EIO)
+    handle_eio();
+  return r;
+}
+
+// Look up oid in index. If path is non-null it receives the lookup result.
+// Returns 0 if the object exists, -ENOENT if it does not, or the negative
+// error from the lookup (an -EIO triggers handle_eio() when
+// m_filestore_fail_eio is set).
+int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path)
+{
+  // Callers that do not need the resolved path may pass a null pointer.
+  IndexedPath scratch;
+  IndexedPath *out = path ? path : &scratch;
+
+  ceph_assert(index.index);
+
+  int exist;
+  const int r = (index.index)->lookup(oid, out, &exist);
+  if (r < 0) {
+    if (r == -EIO && m_filestore_fail_eio)
+      handle_eio();
+    return r;
+  }
+  return exist ? 0 : -ENOENT;
+}
+
+// Truncate the file backing oid in collection cid to length bytes.
+// Returns 0 on success or a negative errno value; an -EIO result triggers
+// handle_eio() when m_filestore_fail_eio is set.
+int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length)
+{
+  FDRef fd;
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0)
+    return r;
+
+  r = ::ftruncate(**fd, length);
+  if (r < 0) {
+    r = -errno;
+  } else if (m_filestore_sloppy_crc) {
+    // keep the sloppy-crc bookkeeping in step with the new length
+    int rc = backend->_crc_update_truncate(**fd, length);
+    ceph_assert(rc >= 0);
+  }
+
+  lfn_close(fd);
+  if (m_filestore_fail_eio && r == -EIO)
+    handle_eio();
+  return r;
+}
+
+// stat(2) the file backing oid in collection cid into *buf, holding the
+// index's access_lock for reading while the path is resolved and used.
+// Returns 0 on success or a negative errno value.
+int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf)
+{
+  IndexedPath path;
+  Index index;
+
+  int r = get_index(cid, &index);
+  if (r < 0)
+    return r;
+
+  ceph_assert(index.index);
+  RWLock::RLocker l((index.index)->access_lock);
+
+  r = lfn_find(oid, index, &path);
+  if (r < 0)
+    return r;
+
+  if (::stat(path->path(), buf) < 0)
+    return -errno;
+  return 0;
+}
+
+// Open the file backing object oid in collection cid and hand it back
+// through *outfd.
+//
+// cid    - collection holding the object
+// oid    - object to open
+// create - when true, O_CREAT is added to the open flags and a file that
+//          did not previously exist is registered with the index
+// outfd  - [out] receives the descriptor reference; must not be null
+// index  - may be null or empty, in which case the index is fetched via
+//          get_index() and this function takes and releases its
+//          access_lock for writing. When the caller passes a populated
+//          index no locking is done here (presumably the caller already
+//          holds the lock -- TODO confirm).
+//
+// Returns 0 on success or a negative errno value; an -EIO result triggers
+// handle_eio() when m_filestore_fail_eio is set.
+int FileStore::lfn_open(const coll_t& cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index)
+{
+ ceph_assert(outfd);
+ int r = 0;
+ bool need_lock = true;
+ int flags = O_RDWR;
+
+ if (create)
+ flags |= O_CREAT;
+ if (cct->_conf->filestore_odsync_write) {
+ flags |= O_DSYNC;
+ }
+
+ // fall back to a local Index when the caller did not supply one
+ Index index2;
+ if (!index) {
+ index = &index2;
+ }
+ if (!((*index).index)) {
+ r = get_index(cid, index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ return r;
+ }
+ } else {
+ // caller supplied a populated index: skip locking in this function
+ need_lock = false;
+ }
+
+ int fd, exist;
+ ceph_assert((*index).index);
+ if (need_lock) {
+ ((*index).index)->access_lock.get_write();
+ }
+ // the fd cache is only consulted when not replaying
+ if (!replaying) {
+ *outfd = fdcache.lookup(oid);
+ if (*outfd) {
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+ return 0;
+ }
+ }
+
+
+ IndexedPath path2;
+ IndexedPath *path = &path2;
+
+ r = (*index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ derr << "could not find " << oid << " in index: "
+ << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+
+ r = ::open((*path)->path(), flags|O_CLOEXEC, 0644);
+ if (r < 0) {
+ r = -errno;
+ dout(10) << "error opening file " << (*path)->path() << " with flags="
+ << flags << ": " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ fd = r;
+ // a file that did not exist before this call is registered with the index
+ // and marked as having no spilled-out xattrs
+ if (create && (!exist)) {
+ r = (*index)->created(oid, (*path)->path());
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error creating " << oid << " (" << (*path)->path()
+ << ") in index: " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ r = chain_fsetxattr<true, true>(
+ fd, XATTR_SPILL_OUT_NAME,
+ XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT));
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
+ << "):" << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ }
+
+ if (!replaying) {
+ bool existed;
+ *outfd = fdcache.add(oid, fd, &existed);
+ // the cache already had an entry for oid, so the fd opened here is closed
+ if (existed) {
+ TEMP_FAILURE_RETRY(::close(fd));
+ }
+ } else {
+ // while replaying the fd is wrapped directly, bypassing the cache
+ *outfd = std::make_shared<FDCache::FD>(fd);
+ }
+
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+
+ return 0;
+
+ // common error exit: drop the lock if it was taken here, then escalate EIO
+ fail:
+
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+// No-op: the body is empty, so the by-value FDRef argument simply goes out
+// of scope when the call returns.
+void FileStore::lfn_close(FDRef fd)
+{
+}
+
+// Hard-link object o of collection c to the name newoid in collection
+// newcid and record the new name in the destination index.
+//
+// Returns 0 on success, -ENOENT if o is not found in its index, -EEXIST if
+// newoid already exists in the destination index, or another negative
+// errno value. An -EIO from an index call triggers handle_eio() when
+// m_filestore_fail_eio is set.
+int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid)
+{
+ Index index_new, index_old;
+ IndexedPath path_new, path_old;
+ int exist;
+ int r;
+ bool index_same = false;
+ // The order in which the two indexes are fetched depends on how c compares
+ // to newcid; when both name the same collection a single index is shared.
+ if (c < newcid) {
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ } else if (c == newcid) {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ index_new = index_old;
+ index_same = true;
+ } else {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ }
+
+ ceph_assert(index_old.index);
+ ceph_assert(index_new.index);
+
+ if (!index_same) {
+
+ // distinct collections: read-lock the source index, write-lock the
+ // destination index
+ RWLock::RLocker l1((index_old.index)->access_lock);
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ RWLock::WLocker l2((index_new.index)->access_lock);
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+ dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ } else {
+ // same collection: one write lock covers both lookups and the link
+ RWLock::WLocker l1((index_old.index)->access_lock);
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+ dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ // make sure old fd for unlinked/overwritten file is gone
+ // NOTE(review): only this same-collection branch clears the fd cache for
+ // newoid; confirm the cross-collection branch does not need it too.
+ fdcache.clear(newoid);
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ }
+ return 0;
+}
+
+// Remove object o from collection cid while holding the index's access_lock
+// for writing.
+//
+// cid              - collection holding the object
+// o                - object to remove
+// spos             - sequencer position passed on to the object map calls
+// force_clear_omap - when true the object's omap is always cleared; when
+//                    false it is still cleared if the lookup reports a
+//                    count of 0 or 1
+//
+// The third out-value of index->lookup() is treated here as a hard-link
+// count (going by the variable name -- TODO confirm against the index API).
+//
+// Returns 0 on success or a negative errno value.
+int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o,
+ const SequencerPosition &spos,
+ bool force_clear_omap)
+{
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ceph_assert(index.index);
+ RWLock::WLocker l((index.index)->access_lock);
+
+ {
+ IndexedPath path;
+ int hardlink;
+ r = index->lookup(o, &path, &hardlink);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+
+ // a count of 0 or 1 forces the omap to be cleared even if the caller did
+ // not ask for it
+ if (!force_clear_omap) {
+ if (hardlink == 0 || hardlink == 1) {
+ force_clear_omap = true;
+ }
+ }
+ if (force_clear_omap) {
+ dout(20) << __FUNC__ << ": clearing omap on " << o
+ << " in cid " << cid << dendl;
+ r = object_map->clear(o, &spos);
+ // a missing omap entry (-ENOENT) is not treated as an error
+ if (r < 0 && r != -ENOENT) {
+ dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (cct->_conf->filestore_debug_inject_read_err) {
+ debug_obj_on_delete(o);
+ }
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ }
+ fdcache.clear(o);
+ } else {
+ /* Ensure that replay of this op doesn't result in the object_map
+ * going away.
+ */
+ if (!backend->can_checkpoint())
+ object_map->sync(&o, &spos);
+ }
+ // a count of 0 returns here, without calling index->unlink()
+ if (hardlink == 0) {
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ }
+ return 0;
+ }
+ }
+ r = index->unlink(o);
+ if (r < 0) {
+ dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+// Construct a FileStore rooted at base, with its journal at jdev.
+//
+// cct       - context; most tunables below are read from cct->_conf
+// base      - base directory of the store
+// jdev      - journal path
+// flags     - stored in generic_flags
+// name      - stored in internal_name and used to name the perf counters
+// do_update - forwarded to the index manager and stored in do_update
+//
+// The body creates the ondisk/apply Finisher pools, derives the current/,
+// commit_op_seq and omap paths, builds and registers the perf counters,
+// registers this object as a config observer, and sets the superblock's
+// compat features to the initial set.
+FileStore::FileStore(CephContext* cct, const std::string &base,
+ const std::string &jdev, osflagbits_t flags,
+ const char *name, bool do_update) :
+ JournalingObjectStore(cct, base),
+ internal_name(name),
+ basedir(base), journalpath(jdev),
+ generic_flags(flags),
+ blk_size(0),
+ fsid_fd(-1), op_fd(-1),
+ basedir_fd(-1), current_fd(-1),
+ backend(nullptr),
+ index_manager(cct, do_update),
+ lock("FileStore::lock"),
+ force_sync(false),
+ sync_entry_timeo_lock("FileStore::sync_entry_timeo_lock"),
+ timer(cct, sync_entry_timeo_lock),
+ stop(false), sync_thread(this),
+ coll_lock("FileStore::coll_lock"),
+ fdcache(cct),
+ wbthrottle(cct),
+ next_osr_id(0),
+ m_disable_wbthrottle(cct->_conf->filestore_odsync_write ||
+ !cct->_conf->filestore_wbthrottle_enable),
+ throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency),
+ throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency),
+ m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads),
+ m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads),
+ op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"),
+ op_wq(this, cct->_conf->filestore_op_thread_timeout,
+ cct->_conf->filestore_op_thread_suicide_timeout, &op_tp),
+ logger(nullptr),
+ trace_endpoint("0.0.0.0", 0, "FileStore"),
+ read_error_lock("FileStore::read_error_lock"),
+ m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout),
+ m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ),
+ m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing),
+ m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead),
+ m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold),
+ m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval),
+ m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval),
+ m_filestore_fail_eio(cct->_conf->filestore_fail_eio),
+ m_filestore_fadvise(cct->_conf->filestore_fadvise),
+ do_update(do_update),
+ m_journal_dio(cct->_conf->journal_dio),
+ m_journal_aio(cct->_conf->journal_aio),
+ m_journal_force_aio(cct->_conf->journal_force_aio),
+ m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap),
+ m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap),
+ m_filestore_do_dump(false),
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size),
+ m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size),
+ m_fs_type(0),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0),
+ m_filestore_max_xattr_value_size(0)
+{
+ m_filestore_kill_at = cct->_conf->filestore_kill_at;
+ // one named Finisher per configured ondisk / apply finisher thread; they
+ // are deleted again in the destructor
+ for (int i = 0; i < m_ondisk_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-ondisk-" << i;
+ Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore");
+ ondisk_finishers.push_back(f);
+ }
+ for (int i = 0; i < m_apply_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-apply-" << i;
+ Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore");
+ apply_finishers.push_back(f);
+ }
+
+ // derived paths: <basedir>/current and <basedir>/current/commit_op_seq
+ ostringstream oss;
+ oss << basedir << "/current";
+ current_fn = oss.str();
+
+ ostringstream sss;
+ sss << basedir << "/current/commit_op_seq";
+ current_op_seq_fn = sss.str();
+
+ // the omap directory is taken from filestore_omap_backend_path when that
+ // is set, otherwise it lives under <basedir>/current/omap
+ ostringstream omss;
+ if (cct->_conf->filestore_omap_backend_path != "") {
+ omap_dir = cct->_conf->filestore_omap_backend_path;
+ } else {
+ omss << basedir << "/current/omap";
+ omap_dir = omss.str();
+ }
+
+ // initialize logger
+ PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last);
+
+ plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue");
+ plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied");
+ plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue");
+ plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied");
+ plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency",
+ NULL, PerfCountersBuilder::PRIO_USEFUL);
+ plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs");
+ plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written");
+ plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
+ plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue");
+ plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store");
+ plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
+ plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue");
+ plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store");
+ plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency");
+ plb.add_u64(l_filestore_committing, "committing", "Is currently committing");
+
+ plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles");
+ plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits");
+ plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit");
+ plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full");
+ plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg",
+ "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL);
+ plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs");
+
+ logger = plb.create_perf_counters();
+
+ // the destructor undoes both registrations
+ cct->get_perfcounters_collection()->add(logger);
+ cct->_conf.add_observer(this);
+
+ superblock.compat_features = get_fs_initial_compat_set();
+}
+
+// Tear down what the constructor set up: delete the finishers, deregister
+// the config observer and perf counters, free the logger, and stop dumping
+// if a dump is active.
+FileStore::~FileStore()
+{
+  for (auto &finisher_ptr : ondisk_finishers) {
+    delete finisher_ptr;
+    finisher_ptr = nullptr;
+  }
+  for (auto &finisher_ptr : apply_finishers) {
+    delete finisher_ptr;
+    finisher_ptr = nullptr;
+  }
+
+  cct->_conf.remove_observer(this);
+  cct->get_perfcounters_collection()->remove(logger);
+
+  // detach the journal from the logger before the logger is freed
+  if (journal)
+    journal->logger = nullptr;
+  delete logger;
+  logger = nullptr;
+
+  if (m_filestore_do_dump)
+    dump_stop();
+}
+
+// Write the on-disk xattr name for 'name' ("user.ceph." + name) into buf,
+// using at most len bytes including the terminating NUL.
+static void get_attrname(const char *name, char *buf, int len)
+{
+  snprintf(buf, len,
+           "user.ceph.%s",
+           name);
+}
+
+// If *name starts with "user.ceph.", advance *name past that prefix and
+// return true; otherwise leave *name untouched and return false.
+bool parse_attrname(char **name)
+{
+  static const char prefix[] = "user.ceph.";
+  const size_t prefix_len = sizeof(prefix) - 1;  // 10, excluding the NUL
+
+  if (strncmp(*name, prefix, prefix_len) != 0)
+    return false;
+  *name += prefix_len;
+  return true;
+}
+
+// Fill *pm with descriptive key/value pairs about this store: the backend
+// name, the filesystem type in hex, and -- when
+// filestore_collect_device_partition_information is enabled -- the
+// partition path, whole-disk device node, VDO information and journal
+// metadata.
+//
+// Side effect: devname is updated when the whole-disk lookup succeeds.
+void FileStore::collect_metadata(map<string,string> *pm)
+{
+ char partition_path[PATH_MAX];
+ char dev_node[PATH_MAX];
+
+ (*pm)["filestore_backend"] = backend->get_name();
+ ostringstream ss;
+ ss << "0x" << std::hex << m_fs_type << std::dec;
+ (*pm)["filestore_f_type"] = ss.str();
+
+ if (cct->_conf->filestore_collect_device_partition_information) {
+ int rc = 0;
+ BlkDev blkdev(fsid_fd);
+ // a non-zero rc from either lookup is reported as "unknown"
+ if (rc = blkdev.partition(partition_path, PATH_MAX); rc) {
+ (*pm)["backend_filestore_partition_path"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_partition_path"] = string(partition_path);
+ }
+ if (rc = blkdev.wholedisk(dev_node, PATH_MAX); rc) {
+ (*pm)["backend_filestore_dev_node"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_dev_node"] = string(dev_node);
+ devname = dev_node;
+ }
+ // rc here still holds the result of the wholedisk() call above
+ if (rc == 0 && vdo_fd >= 0) {
+ (*pm)["vdo"] = "true";
+ (*pm)["vdo_physical_size"] =
+ stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks"));
+ }
+ if (journal) {
+ journal->collect_metadata(pm);
+ }
+ }
+}
+
+// Add the raw devices behind this store (and behind its journal, if one is
+// attached) to *ls. Returns 0 on success, or the non-zero result of the
+// whole-disk lookup if that fails.
+int FileStore::get_devices(set<string> *ls)
+{
+  string dev_node;
+  BlkDev blkdev(fsid_fd);
+
+  const int rc = blkdev.wholedisk(&dev_node);
+  if (rc != 0)
+    return rc;
+
+  get_raw_devices(dev_node, ls);
+  if (journal)
+    journal->get_devices(ls);
+  return 0;
+}
+
+// Report space usage of the store in *buf0.
+//
+// buf0   - [out] reset first, then filled from statfs(2) on basedir, the
+//          omap database size estimate, VDO utilization when available,
+//          and the journal's size estimate
+// alerts - [out, may be null] cleared; nothing is reported through it
+//
+// Returns 0 on success or a negative errno value from statfs(2). An -EIO
+// triggers handle_eio() when m_filestore_fail_eio is set.
+int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts)
+{
+ struct statfs buf;
+ buf0->reset();
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ if (::statfs(basedir.c_str(), &buf) < 0) {
+ int r = -errno;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ ceph_assert(r != -ENOENT);
+ return r;
+ }
+
+ // free bytes, computed from f_bavail and the block size
+ uint64_t bfree = buf.f_bavail * buf.f_bsize;
+
+ // assume all of leveldb/rocksdb is omap.
+ {
+ map<string,uint64_t> kv_usage;
+ buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage);
+ }
+
+ uint64_t thin_total, thin_avail;
+ if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) {
+ buf0->total = thin_total;
+ bfree = std::min(bfree, thin_avail);
+ buf0->allocated = thin_total - thin_avail;
+ buf0->data_stored = bfree;
+ } else {
+ buf0->total = buf.f_blocks * buf.f_bsize;
+ // NOTE(review): allocated and data_stored are both set to the free-byte
+ // figure here, not to a used-space figure -- confirm this is intended.
+ buf0->allocated = bfree;
+ buf0->data_stored = bfree;
+ }
+ buf0->available = bfree;
+
+ // FIXME: we don't know how to populate buf->internal_metadata; XFS doesn't
+ // tell us what its internal overhead is.
+
+ // Adjust for writes pending in the journal
+ if (journal) {
+ uint64_t estimate = journal->get_journal_size_estimate();
+ buf0->internally_reserved = estimate;
+ // clamp at zero so the unsigned subtraction cannot wrap
+ if (buf0->available > estimate)
+ buf0->available -= estimate;
+ else
+ buf0->available = 0;
+ }
+
+ return 0;
+}
+
+// Per-pool statistics are not provided by this store: always returns
+// -ENOTSUP and leaves *buf untouched.
+int FileStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
+{
+ return -ENOTSUP;
+}
+
+// Create the FileJournal object for journalpath and attach the perf-counter
+// logger to it. Does nothing when no journal path is configured.
+void FileStore::new_journal()
+{
+  if (!journalpath.length())
+    return;
+
+  dout(10) << "open_journal at " << journalpath << dendl;
+  journal = new FileJournal(cct, fsid, &finisher, &sync_cond,
+                            journalpath.c_str(),
+                            m_journal_dio, m_journal_aio,
+                            m_journal_force_aio);
+  if (journal)
+    journal->logger = logger;
+}
+
+// Dump the contents of the journal at journalpath to out.
+//
+// Returns -EINVAL when no journal path is configured, otherwise the result
+// of FileJournal::dump().
+int FileStore::dump_journal(ostream& out)
+{
+  if (!journalpath.length())
+    return -EINVAL;
+
+  // A temporary journal object on the stack (as get_block_device_fsid()
+  // does): it is destroyed on every exit path, including an exception from
+  // dump(), and it does not shadow the 'journal' member.
+  FileJournal dump_source(cct, fsid, &finisher, &sync_cond,
+                          journalpath.c_str(), m_journal_dio);
+  return dump_source.dump(out);
+}
+
+// Factory: allocate the backend matching filesystem magic f_type.
+//
+// The btrfs and XFS backends are only compiled in on Linux (XFS additionally
+// needs HAVE_LIBXFS) and the ZFS backend needs HAVE_LIBZFS; every other
+// magic value gets the generic backend. The returned object is allocated
+// with new.
+FileStoreBackend *FileStoreBackend::create(unsigned long f_type, FileStore *fs)
+{
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ return new BtrfsFileStoreBackend(fs);
+# ifdef HAVE_LIBXFS
+ case XFS_SUPER_MAGIC:
+ return new XfsFileStoreBackend(fs);
+# endif
+#endif
+#ifdef HAVE_LIBZFS
+ case ZFS_SUPER_MAGIC:
+ return new ZFSFileStoreBackend(fs);
+#endif
+ default:
+ return new GenericFileStoreBackend(fs);
+ }
+}
+
+// Record the filesystem type in m_fs_type, instantiate the matching backend
+// (asserting that none exists yet), tell the write-back throttle about
+// btrfs where applicable, and refresh the xattr limits from configuration.
+void FileStore::create_backend(unsigned long f_type)
+{
+ m_fs_type = f_type;
+
+ ceph_assert(!backend);
+ backend = FileStoreBackend::create(f_type, this);
+
+ dout(0) << "backend " << backend->get_name()
+ << " (magic 0x" << std::hex << f_type << std::dec << ")"
+ << dendl;
+
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ // the throttle is only switched to btrfs mode when it is enabled
+ if (!m_disable_wbthrottle){
+ wbthrottle.set_fs(WBThrottle::BTRFS);
+ }
+ break;
+
+ case XFS_SUPER_MAGIC:
+ // wbthrottle is constructed with fs(WBThrottle::XFS)
+ break;
+#endif
+ }
+
+ set_xattr_limits_via_conf();
+}
+
+/**
+ * Create (or re-validate) the on-disk layout of a FileStore under basedir.
+ *
+ * Steps, in order: open basedir; open and lock basedir/fsid; write or verify
+ * the fsid; write the version stamp and superblock; select the backend from
+ * the filesystem type; create current/; seed commit_op_seq (plus snap_1 when
+ * the backend can checkpoint); initialize the omap store and its osd_uuid
+ * file; create the journal; record the store type via write_meta().
+ *
+ * Every descriptor opened here is closed again and the backend is deleted
+ * before returning, on success and on failure alike.
+ *
+ * @return 0 on success, a negative errno value on failure
+ */
+int FileStore::mkfs()
+{
+  int ret = 0;
+  char fsid_fn[PATH_MAX];
+  char fsid_str[40];
+  uuid_d old_fsid;
+  uuid_d old_omap_fsid;
+
+  dout(1) << "mkfs in " << basedir << dendl;
+  basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+  if (basedir_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  // open+lock fsid
+  snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str());
+  fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+  if (fsid_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl;
+    goto close_basedir_fd;
+  }
+
+  if (lock_fsid() < 0) {
+    ret = -EBUSY;
+    goto close_fsid_fd;
+  }
+
+  if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) {
+    // no usable fsid on disk yet: write the provided one, or a random one
+    if (fsid.is_zero()) {
+      fsid.generate_random();
+      dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl;
+    } else {
+      dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl;
+    }
+
+    fsid.print(fsid_str);
+    strcat(fsid_str, "\n");
+    ret = ::ftruncate(fsid_fd, 0);
+    if (ret < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": failed to truncate fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str));
+    if (ret < 0) {
+      derr << __FUNC__ << ": failed to write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    if (::fsync(fsid_fd) < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": close failed: can't write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    dout(10) << __FUNC__ << ": fsid is " << fsid << dendl;
+  } else {
+    // an fsid already exists on disk: it must match any provided one
+    if (!fsid.is_zero() && fsid != old_fsid) {
+      derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl;
+      ret = -EINVAL;
+      goto close_fsid_fd;
+    }
+    fsid = old_fsid;
+    dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl;
+  }
+
+  // version stamp
+  ret = write_version_stamp();
+  if (ret < 0) {
+    derr << __FUNC__ << ": write_version_stamp() failed: "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  // superblock
+  superblock.omap_backend = cct->_conf->filestore_omap_backend;
+  ret = write_superblock();
+  if (ret < 0) {
+    derr << __FUNC__ << ": write_superblock() failed: "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  struct statfs basefs;
+  ret = ::fstatfs(basedir_fd, &basefs);
+  if (ret < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": cannot fstatfs basedir "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+#if defined(__linux__)
+  if (basefs.f_type == BTRFS_SUPER_MAGIC &&
+      !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+    derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+    // ret is still 0 from the successful fstatfs() above; without an
+    // explicit error code this refusal would be reported as success.
+    ret = -EPERM;
+    goto close_fsid_fd;
+  }
+#endif
+
+  create_backend(basefs.f_type);
+
+  ret = backend->create_current();
+  if (ret < 0) {
+    derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  // write initial op_seq
+  {
+    uint64_t initial_seq = 0;
+    int fd = read_op_seq(&initial_seq);
+    if (fd < 0) {
+      ret = fd;
+      derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    if (initial_seq == 0) {
+      ret = write_op_seq(fd, 1);
+      if (ret < 0) {
+	VOID_TEMP_FAILURE_RETRY(::close(fd));
+	derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": "
+	     << cpp_strerror(ret) << dendl;
+	goto close_fsid_fd;
+      }
+
+      if (backend->can_checkpoint()) {
+	// create snap_1 too
+	current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+	ceph_assert(current_fd >= 0);
+	char s[NAME_MAX];
+	snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
+	ret = backend->create_checkpoint(s, nullptr);
+	VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+	// an already-existing snap_1 is not an error
+	if (ret < 0 && ret != -EEXIST) {
+	  VOID_TEMP_FAILURE_RETRY(::close(fd));
+	  derr << __FUNC__ << ": failed to create snap_1: " << cpp_strerror(ret) << dendl;
+	  goto close_fsid_fd;
+	}
+      }
+    }
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+  ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir);
+  if (ret < 0) {
+    derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl;
+    goto close_fsid_fd;
+  }
+  // create fsid under omap
+  // open+lock fsid
+  int omap_fsid_fd;
+  char omap_fsid_fn[PATH_MAX];
+  snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str());
+  omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+  if (omap_fsid_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) {
+    ceph_assert(!fsid.is_zero());
+    fsid.print(fsid_str);
+    strcat(fsid_str, "\n");
+    ret = ::ftruncate(omap_fsid_fd, 0);
+    if (ret < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": failed to truncate fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_omap_fsid_fd;
+    }
+    ret = safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str));
+    if (ret < 0) {
+      derr << __FUNC__ << ": failed to write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_omap_fsid_fd;
+    }
+    dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl;
+    if (::fsync(omap_fsid_fd) < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": close failed: can't write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_omap_fsid_fd;
+    }
+    dout(10) << "mkfs omap fsid is " << fsid << dendl;
+  } else {
+    // the omap directory already carries an fsid: it must be ours
+    if (fsid != old_omap_fsid) {
+      derr << __FUNC__ << ": " << omap_fsid_fn
+	   << " has existed omap fsid " << old_omap_fsid
+	   << " != expected osd fsid " << fsid
+	   << dendl;
+      ret = -EINVAL;
+      goto close_omap_fsid_fd;
+    }
+    dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl;
+  }
+
+  dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl;
+
+  // journal?
+  ret = mkjournal();
+  if (ret)
+    goto close_omap_fsid_fd;
+
+  ret = write_meta("type", "filestore");
+  if (ret)
+    goto close_omap_fsid_fd;
+
+  dout(1) << "mkfs done in " << basedir << dendl;
+  ret = 0;
+
+  // Cleanup labels fall through: each releases one resource and then
+  // continues with those acquired before it.
+ close_omap_fsid_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+ close_fsid_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+  fsid_fd = -1;
+ close_basedir_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+  delete backend;
+  backend = nullptr;
+  return ret;
+}
+
+// Load the fsid from <basedir>/fsid, then make sure a journal exists at
+// journalpath: the journal is checked and only created when the check
+// fails. The temporary journal object is destroyed before returning.
+//
+// Returns 0 when no journal is configured, a negative errno value if the
+// fsid cannot be read, and otherwise the result of the check or creation.
+int FileStore::mkjournal()
+{
+  // read fsid
+  char fsid_path[PATH_MAX];
+  snprintf(fsid_path, sizeof(fsid_path), "%s/fsid", basedir.c_str());
+
+  const int fd = ::open(fsid_path, O_RDONLY|O_CLOEXEC, 0644);
+  if (fd < 0) {
+    const int err = errno;
+    derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  int ret = read_fsid(fd, &fsid);
+  if (ret < 0) {
+    derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl;
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  if (ret < 0)
+    return ret;
+
+  new_journal();
+  if (!journal)
+    return 0;
+
+  ret = journal->check();
+  if (ret < 0) {
+    // no usable journal there yet: create one
+    ret = journal->create();
+    if (ret == 0) {
+      dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl;
+    } else {
+      derr << __FUNC__ << ": error creating journal on " << journalpath
+	   << ": " << cpp_strerror(ret) << dendl;
+    }
+  }
+  delete journal;
+  journal = nullptr;
+  return ret;
+}
+
+// Read an fsid from fd into *uuid.
+//
+// Two on-disk forms are accepted: a legacy 8-byte binary value, which is
+// mirrored into both halves of the 16-byte uuid, and the textual form, of
+// which at most the first 36 characters are parsed.
+//
+// Returns 0 on success, the negative error from safe_read(), or -EINVAL if
+// the text does not parse as a uuid.
+int FileStore::read_fsid(int fd, uuid_d *uuid)
+{
+  char fsid_str[40];
+  memset(fsid_str, 0, sizeof(fsid_str));
+  int ret = safe_read(fd, fsid_str, sizeof(fsid_str));
+  if (ret < 0)
+    return ret;
+  if (ret == 8) {
+    // old 64-bit fsid... mirror it.
+    // memcpy instead of storing through uint64_t* casts: neither buffer is
+    // guaranteed to be 8-byte aligned, and accessing char storage through a
+    // uint64_t lvalue violates the aliasing rules.
+    char *raw = const_cast<char*>(uuid->bytes());
+    memcpy(raw, fsid_str, sizeof(uint64_t));
+    memcpy(raw + sizeof(uint64_t), fsid_str, sizeof(uint64_t));
+    return 0;
+  }
+
+  // NUL-terminate after the data read, but never beyond 36 characters
+  if (ret > 36)
+    fsid_str[36] = 0;
+  else
+    fsid_str[ret] = 0;
+  if (!uuid->parse(fsid_str))
+    return -EINVAL;
+  return 0;
+}
+
+ // Try to take an exclusive fcntl lock on the already-open fsid_fd.
+ //
+ // Returns 0 when the lock was acquired, or -errno when fcntl() fails.
+ int FileStore::lock_fsid()
+ {
+   // Write lock with l_start = 0 and l_len = 0, relative to SEEK_SET.
+   struct flock whole_file;
+   memset(&whole_file, 0, sizeof(whole_file));
+   whole_file.l_type = F_WRLCK;
+   whole_file.l_whence = SEEK_SET;
+   whole_file.l_start = 0;
+   whole_file.l_len = 0;
+
+   if (::fcntl(fsid_fd, F_SETLK, &whole_file) >= 0)
+     return 0;
+
+   int saved_errno = errno;
+   dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? "
+           << cpp_strerror(saved_errno) << dendl;
+   return -saved_errno;
+ }
+
+ // Report whether the store under basedir appears to be in use.
+ //
+ // Opens <basedir>/fsid and tries to lock it via lock_fsid(). A missing or
+ // unopenable fsid file counts as "not in use". fsid_fd is closed and reset
+ // to -1 before returning.
+ bool FileStore::test_mount_in_use()
+ {
+   dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl;
+
+   char fsid_path[PATH_MAX];
+   snprintf(fsid_path, sizeof(fsid_path), "%s/fsid", basedir.c_str());
+
+   // verify fs isn't in use
+   fsid_fd = ::open(fsid_path, O_RDWR|O_CLOEXEC, 0644);
+   if (fsid_fd < 0) {
+     return false; // no fsid, ok.
+   }
+
+   // failing to take the lock is what marks the store as in use
+   const bool lock_failed = (lock_fsid() < 0);
+   VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+   fsid_fd = -1;
+   return lock_failed;
+ }
+
+ // Ask the backend whether the data store is rotational.
+ //
+ // When no backend exists yet, a temporary one is created from the
+ // filesystem type of basedir and destroyed again afterwards. If basedir
+ // cannot be opened or fstatfs() fails, the answer defaults to true.
+ bool FileStore::is_rotational()
+ {
+   bool answer;
+   if (!backend) {
+     int dir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+     if (dir_fd < 0) {
+       return true;
+     }
+     struct statfs fs_info;
+     int rc = ::fstatfs(dir_fd, &fs_info);
+     ::close(dir_fd);
+     if (rc < 0)
+       return true;
+
+     // temporary backend, torn down right after the query
+     create_backend(fs_info.f_type);
+     answer = backend->is_rotational();
+     delete backend;
+     backend = nullptr;
+   } else {
+     answer = backend->is_rotational();
+   }
+   dout(10) << __func__ << " " << (int)answer << dendl;
+   return answer;
+ }
+
+ // Ask the backend whether the journal is rotational.
+ //
+ // When no backend exists yet, a temporary one is created from the
+ // filesystem type of journalpath and destroyed again afterwards. If
+ // journalpath cannot be opened or fstatfs() fails, the answer defaults to
+ // true.
+ bool FileStore::is_journal_rotational()
+ {
+   bool answer;
+   if (!backend) {
+     int journal_fd = ::open(journalpath.c_str(), O_RDONLY|O_CLOEXEC);
+     if (journal_fd < 0) {
+       return true;
+     }
+     struct statfs fs_info;
+     int rc = ::fstatfs(journal_fd, &fs_info);
+     ::close(journal_fd);
+     if (rc < 0)
+       return true;
+
+     // temporary backend, torn down right after the query
+     create_backend(fs_info.f_type);
+     answer = backend->is_journal_rotational();
+     delete backend;
+     backend = nullptr;
+   } else {
+     answer = backend->is_journal_rotational();
+   }
+   dout(10) << __func__ << " " << (int)answer << dendl;
+   return answer;
+ }
+
+ // Probe the filesystem behind basedir_fd and check that it is usable.
+ //
+ // Steps: record the filesystem block size in blk_size; refuse btrfs unless
+ // the "btrfs" experimental feature is enabled (Linux only); create the
+ // backend for the detected filesystem type and run its detect_features();
+ // ask get_vdo_stats_handle() for a handle on the whole-disk device behind
+ // fsid_fd; finally verify extended attributes by round-tripping a random
+ // int on the scratch file <basedir>/xattr_test and probing larger values.
+ //
+ // Returns 0 on success, -errno / a backend error on failure, -EPERM for
+ // disabled btrfs, and -ENOTSUP when the xattr round trip fails.
+ int FileStore::_detect_fs()
+ {
+   struct statfs st;
+   int r = ::fstatfs(basedir_fd, &st);
+   if (r < 0)
+     return -errno;
+
+   blk_size = st.f_bsize;
+
+ #if defined(__linux__)
+   if (st.f_type == BTRFS_SUPER_MAGIC &&
+       !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+     derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+     return -EPERM;
+   }
+ #endif
+
+   create_backend(st.f_type);
+
+   r = backend->detect_features();
+   if (r < 0) {
+     derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl;
+     return r;
+   }
+
+   // vdo
+   {
+     char dev_node[PATH_MAX];
+     if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) {
+       vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name);
+       if (vdo_fd >= 0) {
+         dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node
+                 << dendl;
+       }
+     }
+   }
+
+   // test xattrs
+   char fn[PATH_MAX];
+   int x = rand();
+   int y = x+1;  // deliberately different from x until the read-back succeeds
+   snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str());
+   int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0700);
+   if (tmpfd < 0) {
+     int ret = -errno;
+     derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
+     return ret;
+   }
+
+   int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x));
+   if (ret >= 0)
+     ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y));
+   if ((ret < 0) || (x != y)) {
+     derr << "Extended attributes don't appear to work. ";
+     // Only a negative ret is an error code. A successful get leaves a
+     // non-negative value in ret, which must not be reported as an error
+     // when only the value comparison failed.
+     if (ret < 0)
+       *_dout << "Got error " + cpp_strerror(ret) + ". ";
+     *_dout << "If you are using ext3 or ext4, be sure to mount the underlying "
+            << "file system with the 'user_xattr' option." << dendl;
+     ::unlink(fn);
+     VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+     return -ENOTSUP;
+   }
+
+   // Probe for size limits: store several 1000-byte values and note whether
+   // the last one is rejected with ENOSPC. Other failures are ignored.
+   char buf[1000];
+   memset(buf, 0, sizeof(buf)); // shut up valgrind
+   chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf));
+   chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf));
+   chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf));
+   chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
+   ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
+   if (ret == -ENOSPC) {
+     dout(0) << "limited size xattrs" << dendl;
+   }
+   chain_fremovexattr(tmpfd, "user.test");
+   chain_fremovexattr(tmpfd, "user.test2");
+   chain_fremovexattr(tmpfd, "user.test3");
+   chain_fremovexattr(tmpfd, "user.test4");
+   chain_fremovexattr(tmpfd, "user.test5");
+
+   ::unlink(fn);
+   VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+
+   return 0;
+ }
+
+ // Validate the journal configuration against the backend's capabilities.
+ //
+ // Fails with -EINVAL when more than one journal mode is enabled. Warns
+ // (without failing) when the backend cannot checkpoint and there is no
+ // writeahead journal, and when there is no journal at all.
+ int FileStore::_sanity_check_fs()
+ {
+   // sanity check(s)
+
+   const int modes_enabled = (int)m_filestore_journal_writeahead +
+                             (int)m_filestore_journal_parallel +
+                             (int)m_filestore_journal_trailing;
+   if (modes_enabled > 1) {
+     dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl;
+     cerr << TEXT_RED
+          << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n"
+          << " is enabled in ceph.conf. You must choose a single journal mode."
+          << TEXT_NORMAL << std::endl;
+     return -EINVAL;
+   }
+
+   if (!backend->can_checkpoint() &&
+       (!journal || !m_filestore_journal_writeahead)) {
+     dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
+     cerr << TEXT_RED
+          << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n"
+          << " For non-btrfs volumes, a writeahead journal is required to\n"
+          << " maintain on-disk consistency in the event of a crash. Your conf\n"
+          << " should include something like:\n"
+          << " osd journal = /path/to/journal_device_or_file\n"
+          << " filestore journal writeahead = true\n"
+          << TEXT_NORMAL;
+   }
+
+   if (!journal) {
+     dout(0) << "mount WARNING: no journal" << dendl;
+     cerr << TEXT_YELLOW
+          << " ** WARNING: No osd journal is configured: write latency may be high.\n"
+          << " If you will not be using an osd journal, write latency may be\n"
+          << " relatively high. It can be reduced somewhat by lowering\n"
+          << " filestore_max_sync_interval, but lower values mean lower write\n"
+          << " throughput, especially with spinning disks.\n"
+          << TEXT_NORMAL;
+   }
+
+   return 0;
+ }
+
+ // Encode the in-memory superblock and write it to <basedir>/superblock
+ // (mode 0600). Returns whatever safe_write_file() returns.
+ int FileStore::write_superblock()
+ {
+   bufferlist encoded;
+   encode(superblock, encoded);
+   const int rc = safe_write_file(basedir.c_str(), "superblock",
+                                  encoded.c_str(), encoded.length(), 0600);
+   return rc;
+ }
+
+ // Load <basedir>/superblock into the in-memory superblock.
+ //
+ // If the file does not exist, the current in-memory superblock is written
+ // out instead and that result is returned. Other read errors are returned
+ // unchanged; 0 is returned after a successful decode.
+ int FileStore::read_superblock()
+ {
+   bufferptr raw(PATH_MAX);
+   int nread = safe_read_file(basedir.c_str(), "superblock",
+                              raw.c_str(), raw.length());
+   if (nread == -ENOENT) {
+     // If the file doesn't exist write initial CompatSet
+     return write_superblock();
+   }
+   if (nread < 0) {
+     return nread;
+   }
+
+   bufferlist contents;
+   contents.push_back(std::move(raw));
+   auto cursor = contents.cbegin();
+   decode(superblock, cursor);
+   return 0;
+ }
+
+ // Rewrite the on-disk version stamp; thin forwarder to
+ // write_version_stamp(), whose result is returned as-is.
+ int FileStore::update_version_stamp()
+ {
+   const int rc = write_version_stamp();
+   return rc;
+ }
+
+ // Read <basedir>/store_version into *version and compare it with
+ // target_version.
+ //
+ // Returns 1 when they match, 0 when they differ, or the negative error
+ // from safe_read_file() when the file cannot be read.
+ int FileStore::version_stamp_is_valid(uint32_t *version)
+ {
+   bufferptr raw(PATH_MAX);
+   int nread = safe_read_file(basedir.c_str(), "store_version",
+                              raw.c_str(), raw.length());
+   if (nread < 0)
+     return nread;
+
+   bufferlist contents;
+   contents.push_back(std::move(raw));
+   auto cursor = contents.cbegin();
+   decode(*version, cursor);
+   dout(10) << __FUNC__ << ": was " << *version << " vs target "
+            << target_version << dendl;
+   return (*version == target_version) ? 1 : 0;
+ }
+
+ // Write "3" to /proc/sys/vm/drop_caches.
+ //
+ // On failure the error is logged and, when `os` is non-null, also written
+ // to *os. Returns 0 on success or -errno from the failing open()/write().
+ int FileStore::flush_cache(ostream *os)
+ {
+   string drop_caches_file = "/proc/sys/vm/drop_caches";
+   int ret = 0;
+
+   int drop_caches_fd = ::open(drop_caches_file.c_str(), O_WRONLY|O_CLOEXEC);
+   if (drop_caches_fd < 0) {
+     ret = -errno;
+     derr << __FUNC__ << ": failed to open " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+     if (os) {
+       *os << "FileStore flush_cache: failed to open " << drop_caches_file << ": " << cpp_strerror(ret);
+     }
+     return ret;
+   }
+
+   char payload[2] = "3";
+   size_t payload_len = strlen(payload);
+   if (::write(drop_caches_fd, payload, payload_len) < 0) {
+     ret = -errno;
+     derr << __FUNC__ << ": failed to write to " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+     if (os) {
+       *os << "FileStore flush_cache: failed to write to " << drop_caches_file << ": " << cpp_strerror(ret);
+     }
+   }
+
+   // success and write failure share the same close-and-return tail
+   ::close(drop_caches_fd);
+   return ret;
+ }
+
+ // Encode target_version and write it to <basedir>/store_version
+ // (mode 0600). Returns whatever safe_write_file() returns.
+ int FileStore::write_version_stamp()
+ {
+   dout(1) << __FUNC__ << ": " << target_version << dendl;
+
+   bufferlist encoded;
+   encode(target_version, encoded);
+   const int rc = safe_write_file(basedir.c_str(), "store_version",
+                                  encoded.c_str(), encoded.length(), 0600);
+   return rc;
+ }
+
+ // Bring the on-disk version stamp up to target_version.
+ //
+ // Returns 0 when the stamp already matches or was rewritten successfully.
+ // Returns -EINVAL when the store_version file is missing or the store is
+ // older than version 3, and any other negative error from reading or
+ // rewriting the stamp.
+ int FileStore::upgrade()
+ {
+   dout(1) << __FUNC__ << dendl;
+   uint32_t version;
+   int r = version_stamp_is_valid(&version);
+
+   if (r == -ENOENT) {
+     derr << "The store_version file doesn't exist." << dendl;
+     return -EINVAL;
+   }
+   if (r < 0)
+     return r;
+   if (r == 1)
+     return 0;  // already at target_version
+
+   if (version < 3) {
+     derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl;
+     return -EINVAL;
+   }
+
+   // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to
+   // open up DBObjectMap with the do_upgrade flag, which we already did.
+   // Do not report success if the new stamp could not be written: the store
+   // would still carry the stale version on disk.
+   r = update_version_stamp();
+   if (r < 0) {
+     derr << __FUNC__ << ": failed to update version stamp: "
+          << cpp_strerror(r) << dendl;
+     return r;
+   }
+   return 0;
+ }
+
+ // Open (creating if needed) the op_seq file named by current_op_seq_fn
+ // and parse its contents into *seq.
+ //
+ // On success the still-open file descriptor is returned and the caller
+ // owns it. On failure a negative error code is returned and nothing is
+ // left open. EIO is escalated when m_filestore_fail_eio is set.
+ int FileStore::read_op_seq(uint64_t *seq)
+ {
+   int seq_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR|O_CLOEXEC, 0644);
+   if (seq_fd < 0) {
+     int open_err = -errno;
+     if (open_err == -EIO && m_filestore_fail_eio) {
+       handle_eio();
+     }
+     return open_err;
+   }
+
+   // zero-filled and read one byte short, so the text stays NUL-terminated
+   char text[40];
+   memset(text, 0, sizeof(text));
+   int nread = safe_read(seq_fd, text, sizeof(text) - 1);
+   if (nread < 0) {
+     derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(nread) << dendl;
+     VOID_TEMP_FAILURE_RETRY(::close(seq_fd));
+     ceph_assert(!m_filestore_fail_eio || nread != -EIO);
+     return nread;
+   }
+
+   *seq = atoll(text);
+   return seq_fd;
+ }
+
+ // Write `seq` as decimal text plus a newline at offset 0 of `fd`.
+ //
+ // Returns the pwrite() result on success or -errno on failure; an EIO
+ // failure trips an assertion when m_filestore_fail_eio is set.
+ int FileStore::write_op_seq(int fd, uint64_t seq)
+ {
+   char text[30];
+   snprintf(text, sizeof(text), "%" PRId64 "\n", seq);
+
+   int written = TEMP_FAILURE_RETRY(::pwrite(fd, text, strlen(text), 0));
+   if (written >= 0)
+     return written;
+
+   written = -errno;
+   ceph_assert(!m_filestore_fail_eio || written != -EIO);
+   return written;
+ }
+
+ // Mount the store rooted at basedir.
+ //
+ // Steps, in order: apply throttle parameters; open, read and lock
+ // <basedir>/fsid; validate the version stamp and the superblock's feature
+ // set; open basedir and probe the filesystem (_detect_fs); when the backend
+ // can checkpoint, roll back to a checkpoint; open current/ and the op_seq
+ // file; maintain the current/nosnap marker; cross-check the omap osd_uuid
+ // against the fsid; open the omap store (unless SKIP_MOUNT_OMAP); set up
+ // the journal and choose a journal mode; clean up collection indexes;
+ // start wbthrottle and the sync thread; replay the journal (unless
+ // SKIP_JOURNAL_REPLAY); start the op thread pool, finishers and timer; and
+ // finally run upgrade() when filestore_update_to asks for it.
+ //
+ // Returns 0 on success or a negative error code. On failure, resources
+ // acquired so far are released through the cleanup labels at the bottom of
+ // the function; each label falls through to the ones below it.
+ int FileStore::mount()
+ {
+   int ret;
+   char buf[PATH_MAX];
+   uint64_t initial_op_seq;
+   uuid_d omap_fsid;
+   set<string> cluster_snaps;
+   CompatSet supported_compat_set = get_fs_supported_compat_set();
+
+   dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
+
+   ret = set_throttle_params();
+   if (ret != 0)
+     goto done;
+
+   // make sure global base dir exists
+   if (::access(basedir.c_str(), R_OK | W_OK)) {
+     ret = -errno;
+     derr << __FUNC__ << ": unable to access basedir '" << basedir << "': "
+          << cpp_strerror(ret) << dendl;
+     goto done;
+   }
+
+   // get fsid
+   snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
+   fsid_fd = ::open(buf, O_RDWR|O_CLOEXEC, 0644);
+   if (fsid_fd < 0) {
+     ret = -errno;
+     derr << __FUNC__ << ": error opening '" << buf << "': "
+          << cpp_strerror(ret) << dendl;
+     goto done;
+   }
+
+   ret = read_fsid(fsid_fd, &fsid);
+   if (ret < 0) {
+     derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret)
+          << dendl;
+     goto close_fsid_fd;
+   }
+
+   if (lock_fsid() < 0) {
+     derr << __FUNC__ << ": lock_fsid failed" << dendl;
+     ret = -EBUSY;
+     goto close_fsid_fd;
+   }
+
+   dout(10) << "mount fsid is " << fsid << dendl;
+
+
+   // A stale stamp is accepted only when an update was requested (do_update)
+   // or the stamp is below filestore_update_to.
+   uint32_t version_stamp;
+   ret = version_stamp_is_valid(&version_stamp);
+   if (ret < 0) {
+     derr << __FUNC__ << ": error in version_stamp_is_valid: "
+          << cpp_strerror(ret) << dendl;
+     goto close_fsid_fd;
+   } else if (ret == 0) {
+     if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) {
+       derr << __FUNC__ << ": stale version stamp detected: "
+            << version_stamp
+            << ". Proceeding, do_update "
+            << "is set, performing disk format upgrade."
+            << dendl;
+       do_update = true;
+     } else {
+       ret = -EINVAL;
+       derr << __FUNC__ << ": stale version stamp " << version_stamp
+            << ". Please run the FileStore update script before starting the "
+            << "OSD, or set filestore_update_to to " << target_version
+            << " (currently " << cct->_conf->filestore_update_to << ")"
+            << dendl;
+       goto close_fsid_fd;
+     }
+   }
+
+   ret = read_superblock();
+   if (ret < 0) {
+     goto close_fsid_fd;
+   }
+
+   // Check if this FileStore supports all the necessary features to mount
+   if (supported_compat_set.compare(superblock.compat_features) == -1) {
+     derr << __FUNC__ << ": Incompatible features set "
+          << superblock.compat_features << dendl;
+     ret = -EINVAL;
+     goto close_fsid_fd;
+   }
+
+   // open some dir handles
+   basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+   if (basedir_fd < 0) {
+     ret = -errno;
+     derr << __FUNC__ << ": failed to open " << basedir << ": "
+          << cpp_strerror(ret) << dendl;
+     basedir_fd = -1;
+     goto close_fsid_fd;
+   }
+
+   // test for btrfs, xattrs, etc.
+   ret = _detect_fs();
+   if (ret < 0) {
+     derr << __FUNC__ << ": error in _detect_fs: "
+          << cpp_strerror(ret) << dendl;
+     goto close_basedir_fd;
+   }
+
+   // Sort the backend's checkpoints into commit snaps (snaps) and cluster
+   // snaps (cluster_snaps).
+   {
+     list<string> ls;
+     ret = backend->list_checkpoints(ls);
+     if (ret < 0) {
+       derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl;
+       goto close_basedir_fd;
+     }
+
+     long long unsigned c, prev = 0;
+     char clustersnap[NAME_MAX];
+     for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
+       if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
+         ceph_assert(c > prev);
+         prev = c;
+         snaps.push_back(c);
+       } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
+         cluster_snaps.insert(*it);
+     }
+   }
+
+   if (m_osd_rollback_to_cluster_snap.length() &&
+       cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
+     derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
+     ret = -ENOENT;
+     goto close_basedir_fd;
+   }
+
+   char nosnapfn[200];
+   snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
+
+   if (backend->can_checkpoint()) {
+     if (snaps.empty()) {
+       dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
+     } else {
+       char s[NAME_MAX];
+       uint64_t curr_seq = 0;
+
+       if (m_osd_rollback_to_cluster_snap.length()) {
+         derr << TEXT_RED
+              << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
+              << TEXT_NORMAL
+              << dendl;
+         ceph_assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
+         snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
+       } else {
+         {
+           // only the sequence number is wanted here; drop the fd right away
+           int fd = read_op_seq(&curr_seq);
+           if (fd >= 0) {
+             VOID_TEMP_FAILURE_RETRY(::close(fd));
+           }
+         }
+         if (curr_seq)
+           dout(10) << " current/ seq was " << curr_seq << dendl;
+         else
+           dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
+
+         uint64_t cp = snaps.back();
+         dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
+
+         // if current/ is marked as non-snapshotted, refuse to roll
+         // back (without clear direction) to avoid throwing out new
+         // data.
+         struct stat st;
+         if (::stat(nosnapfn, &st) == 0) {
+           if (!m_osd_use_stale_snap) {
+             derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
+             derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
+             derr << "config option for --osd-use-stale-snap startup argument." << dendl;
+             ret = -ENOTSUP;
+             goto close_basedir_fd;
+           }
+           derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
+                << ", newest snap is " << cp << dendl;
+           cerr << TEXT_YELLOW
+                << " ** WARNING: forcing the use of stale snapshot data **"
+                << TEXT_NORMAL << std::endl;
+         }
+
+         dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl;
+         snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+       }
+
+       // drop current?
+       ret = backend->rollback_to(s);
+       if (ret) {
+         derr << __FUNC__ << ": error rolling back to " << s << ": "
+              << cpp_strerror(ret) << dendl;
+         goto close_basedir_fd;
+       }
+     }
+   }
+   initial_op_seq = 0;
+
+   current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+   if (current_fd < 0) {
+     ret = -errno;
+     derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
+     goto close_basedir_fd;
+   }
+
+   ceph_assert(current_fd >= 0);
+
+   // From here on op_fd is either a valid descriptor or a negative error;
+   // the close_current_fd cleanup below relies on that.
+   op_fd = read_op_seq(&initial_op_seq);
+   if (op_fd < 0) {
+     ret = op_fd;
+     derr << __FUNC__ << ": read_op_seq failed" << dendl;
+     goto close_current_fd;
+   }
+
+   dout(5) << "mount op_seq is " << initial_op_seq << dendl;
+   if (initial_op_seq == 0) {
+     derr << "mount initial op seq is 0; something is wrong" << dendl;
+     ret = -EINVAL;
+     goto close_current_fd;
+   }
+
+   if (!backend->can_checkpoint()) {
+     // mark current/ as non-snapshotted so that we don't rollback away
+     // from it.
+     int r = ::creat(nosnapfn, 0644);
+     if (r < 0) {
+       ret = -errno;
+       derr << __FUNC__ << ": failed to create current/nosnap" << dendl;
+       goto close_current_fd;
+     }
+     VOID_TEMP_FAILURE_RETRY(::close(r));
+   } else {
+     // clear nosnap marker, if present.
+     ::unlink(nosnapfn);
+   }
+
+   // check fsid with omap
+   // get omap fsid
+   char omap_fsid_buf[PATH_MAX];
+   struct ::stat omap_fsid_stat;
+   snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str());
+   // if osd_uuid does not exist, assume this omap matches the corresponding osd
+   if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0){
+     dout(10) << __FUNC__ << ": osd_uuid not found under omap, "
+              << "assume as matched."
+              << dendl;
+   } else {
+     int omap_fsid_fd;
+     // if osd_uuid exists, compare osd_uuid with fsid
+     omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY|O_CLOEXEC, 0644);
+     if (omap_fsid_fd < 0) {
+       ret = -errno;
+       derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': "
+            << cpp_strerror(ret)
+            << dendl;
+       goto close_current_fd;
+     }
+     ret = read_fsid(omap_fsid_fd, &omap_fsid);
+     VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+     if (ret < 0) {
+       derr << __FUNC__ << ": error reading omap_fsid_fd"
+            << ", omap_fsid = " << omap_fsid
+            << cpp_strerror(ret)
+            << dendl;
+       goto close_current_fd;
+     }
+     if (fsid != omap_fsid) {
+       derr << __FUNC__ << ": " << omap_fsid_buf
+            << " has existed omap fsid " << omap_fsid
+            << " != expected osd fsid " << fsid
+            << dendl;
+       ret = -EINVAL;
+       goto close_current_fd;
+     }
+   }
+
+   dout(0) << "start omap initiation" << dendl;
+   if (!(generic_flags & SKIP_MOUNT_OMAP)) {
+     KeyValueDB * omap_store = KeyValueDB::create(cct,
+                                                  superblock.omap_backend,
+                                                  omap_dir);
+     if (!omap_store)
+     {
+       derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl;
+       ret = -1;
+       goto close_current_fd;
+     }
+
+     if (superblock.omap_backend == "rocksdb")
+       ret = omap_store->init(cct->_conf->filestore_rocksdb_options);
+     else
+       ret = omap_store->init();
+
+     if (ret < 0) {
+       derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl;
+       // omap_store is still a plain local that nothing else references;
+       // free it here, as the create_and_open() failure path below does.
+       delete omap_store;
+       omap_store = nullptr;
+       goto close_current_fd;
+     }
+
+     stringstream err;
+     if (omap_store->create_and_open(err)) {
+       delete omap_store;
+       omap_store = nullptr;
+       derr << __FUNC__ << ": Error initializing " << superblock.omap_backend
+            << " : " << err.str() << dendl;
+       ret = -1;
+       goto close_current_fd;
+     }
+
+     DBObjectMap *dbomap = new DBObjectMap(cct, omap_store);
+     ret = dbomap->init(do_update);
+     if (ret < 0) {
+       delete dbomap;
+       dbomap = nullptr;
+       derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl;
+       goto close_current_fd;
+     }
+     stringstream err2;
+
+     if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
+       derr << err2.str() << dendl;
+       delete dbomap;
+       dbomap = nullptr;
+       ret = -EINVAL;
+       goto close_current_fd;
+     }
+     object_map.reset(dbomap);
+   }
+
+   // journal
+   new_journal();
+
+   // select journal mode?
+   if (journal) {
+     if (!m_filestore_journal_writeahead &&
+         !m_filestore_journal_parallel &&
+         !m_filestore_journal_trailing) {
+       // no mode configured: writeahead without checkpoints, else parallel
+       if (!backend->can_checkpoint()) {
+         m_filestore_journal_writeahead = true;
+         dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
+       } else {
+         m_filestore_journal_parallel = true;
+         dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
+       }
+     } else {
+       if (m_filestore_journal_writeahead)
+         dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
+       if (m_filestore_journal_parallel)
+         dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl;
+       if (m_filestore_journal_trailing)
+         dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl;
+     }
+     if (m_filestore_journal_writeahead)
+       journal->set_wait_on_full(true);
+   } else {
+     dout(0) << __FUNC__ << ": no journal" << dendl;
+   }
+
+   ret = _sanity_check_fs();
+   if (ret) {
+     derr << __FUNC__ << ": _sanity_check_fs failed with error "
+          << ret << dendl;
+     goto close_current_fd;
+   }
+
+   // Cleanup possibly invalid collections
+   {
+     vector<coll_t> collections;
+     ret = list_collections(collections, true);
+     if (ret < 0) {
+       derr << "Error " << ret << " while listing collections" << dendl;
+       goto close_current_fd;
+     }
+     for (vector<coll_t>::iterator i = collections.begin();
+          i != collections.end();
+          ++i) {
+       Index index;
+       ret = get_index(*i, &index);
+       if (ret < 0) {
+         derr << "Unable to mount index " << *i
+              << " with error: " << ret << dendl;
+         goto close_current_fd;
+       }
+       ceph_assert(index.index);
+       RWLock::WLocker l((index.index)->access_lock);
+
+       index->cleanup();
+     }
+   }
+   if (!m_disable_wbthrottle) {
+     wbthrottle.start();
+   } else {
+     dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl;
+     if (cct->_conf->filestore_odsync_write) {
+       dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl;
+     }
+   }
+   sync_thread.create("filestore_sync");
+
+   // From here on failures must go through stop_sync so the sync thread
+   // (and wbthrottle) are shut down again.
+   if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
+     ret = journal_replay(initial_op_seq);
+     if (ret < 0) {
+       derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
+       if (ret == -ENOTTY) {
+         derr << "maybe journal is not pointing to a block device and its size "
+              << "wasn't configured?" << dendl;
+       }
+
+       goto stop_sync;
+     }
+   }
+
+   {
+     stringstream err2;
+     if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) {
+       derr << err2.str() << dendl;
+       ret = -EINVAL;
+       goto stop_sync;
+     }
+   }
+
+   init_temp_collections();
+
+   journal_start();
+
+   op_tp.start();
+   for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+     (*it)->start();
+   }
+   for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+     (*it)->start();
+   }
+
+   timer.init();
+
+   // upgrade?
+   if (cct->_conf->filestore_update_to >= (int)get_target_version()) {
+     int err = upgrade();
+     if (err < 0) {
+       derr << "error converting store" << dendl;
+       // everything is running at this point, so use the full teardown
+       umount();
+       return err;
+     }
+   }
+
+   // all okay.
+   return 0;
+
+ stop_sync:
+   // stop sync thread
+   lock.Lock();
+   stop = true;
+   sync_cond.Signal();
+   lock.Unlock();
+   sync_thread.join();
+   if (!m_disable_wbthrottle) {
+     wbthrottle.stop();
+   }
+ close_current_fd:
+   // Every jump to this label happens after op_fd was assigned from
+   // read_op_seq(); close it when it was opened so a failed mount does not
+   // leak the descriptor.
+   if (op_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+     op_fd = -1;
+   }
+   VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+   current_fd = -1;
+ close_basedir_fd:
+   VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+   basedir_fd = -1;
+ close_fsid_fd:
+   VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+   fsid_fd = -1;
+ done:
+   ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+   delete backend;
+   backend = nullptr;
+   object_map.reset();
+   return ret;
+ }
+
+ // Reconcile temp collections with the regular collections on disk.
+ //
+ // Every non-temp collection gets an OpSequencer in coll_map. Every
+ // non-temp, non-meta collection must have its temp companion: it is
+ // created when missing. Temp collections left without a matching owner
+ // are removed recursively. Any failure trips an assertion.
+ void FileStore::init_temp_collections()
+ {
+   dout(10) << __FUNC__ << dendl;
+   vector<coll_t> ls;
+   int r = list_collections(ls, true);
+   ceph_assert(r >= 0);
+
+   dout(20) << " ls " << ls << dendl;
+
+   SequencerPosition spos;
+
+   // first pass: collect the temp collections that exist right now
+   set<coll_t> temps;
+   for (auto& c : ls) {
+     if (c.is_temp())
+       temps.insert(c);
+   }
+   dout(20) << " temps " << temps << dendl;
+
+   // second pass: register sequencers and pair each collection with its temp
+   for (auto& c : ls) {
+     if (c.is_temp())
+       continue;
+     coll_map[c] = new OpSequencer(cct, ++next_osr_id, c);
+     if (c.is_meta())
+       continue;
+     coll_t temp = c.get_temp();
+     if (temps.count(temp)) {
+       // accounted for; whatever stays in `temps` afterwards is a stray
+       temps.erase(temp);
+     } else {
+       dout(10) << __FUNC__ << ": creating " << temp << dendl;
+       r = _create_collection(temp, 0, spos);
+       ceph_assert(r == 0);
+     }
+   }
+
+   for (const auto& stray : temps) {
+     dout(10) << __FUNC__ << ": removing stray " << stray << dendl;
+     r = _collection_remove_recursive(stray, spos);
+     ceph_assert(r == 0);
+   }
+ }
+
+ // Shut the store down: flush and sync, drop the collection map, stop the
+ // sync thread, wbthrottle, op thread pool, journal and finishers, close
+ // all store-level descriptors, release the backend and object map, and
+ // shut down the timer. Always returns 0.
+ int FileStore::umount()
+ {
+   dout(5) << __FUNC__ << ": " << basedir << dendl;
+
+   flush();
+   sync();
+   do_force_sync();
+
+   {
+     Mutex::Locker l(coll_lock);
+     coll_map.clear();
+   }
+
+   // tell the sync thread to stop, then wait for it
+   lock.Lock();
+   stop = true;
+   sync_cond.Signal();
+   lock.Unlock();
+   sync_thread.join();
+   if (!m_disable_wbthrottle) {
+     wbthrottle.stop();
+   }
+   op_tp.stop();
+
+   journal_stop();
+   if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
+     journal_write_close();
+   }
+
+   for (Finisher *f : ondisk_finishers) {
+     f->stop();
+   }
+   for (Finisher *f : apply_finishers) {
+     f->stop();
+   }
+
+   // close whichever descriptors are open and reset them to -1
+   if (vdo_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
+     vdo_fd = -1;
+   }
+   if (fsid_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+     fsid_fd = -1;
+   }
+   if (op_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+     op_fd = -1;
+   }
+   if (current_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+     current_fd = -1;
+   }
+   if (basedir_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+     basedir_fd = -1;
+   }
+
+   force_sync = false;
+
+   delete backend;
+   backend = nullptr;
+
+   object_map.reset();
+
+   {
+     Mutex::Locker l(sync_entry_timeo_lock);
+     timer.shutdown();
+   }
+
+   // nothing
+   return 0;
+ }
+
+
+/// -----------------------------
+
+// keep OpSequencer handles alive for all time so that a sequence
+// that removes a collection and creates a new one will not allow
+// two sequencers for the same collection to be alive at once.
+
+ // Look up the handle registered for collection `c` in coll_map, under
+ // coll_lock. Returns a default-constructed CollectionHandle when there is
+ // no entry.
+ ObjectStore::CollectionHandle FileStore::open_collection(const coll_t& c)
+ {
+   Mutex::Locker l(coll_lock);
+   auto entry = coll_map.find(c);
+   if (entry != coll_map.end()) {
+     return entry->second;
+   }
+   return CollectionHandle();
+ }
+
+ // Return the handle for collection `c`, registering a new OpSequencer in
+ // coll_map (under coll_lock) when none exists yet.
+ ObjectStore::CollectionHandle FileStore::create_new_collection(const coll_t& c)
+ {
+   Mutex::Locker l(coll_lock);
+   auto entry = coll_map.find(c);
+   if (entry != coll_map.end()) {
+     // already registered: hand out the existing sequencer
+     return entry->second;
+   }
+   auto *fresh = new OpSequencer(cct, ++next_osr_id, c);
+   coll_map[c] = fresh;
+   return fresh;
+ }
+
+
+/// -----------------------------
+
+ // Allocate an Op describing the transactions in `tls`.
+ //
+ // The byte and op totals are summed before the transactions are moved
+ // into the Op, so `tls` is left in a moved-from state. The caller owns
+ // the returned Op.
+ FileStore::Op *FileStore::build_op(vector<Transaction>& tls,
+                                    Context *onreadable,
+                                    Context *onreadable_sync,
+                                    TrackedOpRef osd_op)
+ {
+   uint64_t total_bytes = 0;
+   uint64_t total_ops = 0;
+   for (auto& t : tls) {
+     total_bytes += t.get_num_bytes();
+     total_ops += t.get_num_ops();
+   }
+
+   Op *op = new Op;
+   op->start = ceph_clock_now();
+   op->tls = std::move(tls);
+   op->onreadable = onreadable;
+   op->onreadable_sync = onreadable_sync;
+   op->ops = total_ops;
+   op->bytes = total_bytes;
+   op->osd_op = osd_op;
+   return op;
+ }
+
+
+
+ // Hand op `o` to sequencer `osr` and schedule the sequencer on the op
+ // work queue, updating the ops/bytes counters on the way.
+ void FileStore::queue_op(OpSequencer *osr, Op *o)
+ {
+   // The op goes onto its sequencer first and only then is the sequencer
+   // queued for the threadpool; whichever thread picks the sequencer up,
+   // ops on it are still taken in the order they were queued.
+   osr->queue(o);
+   o->trace.event("queued");
+
+   logger->inc(l_filestore_ops);
+   logger->inc(l_filestore_bytes, o->bytes);
+
+   dout(5) << __FUNC__ << ": " << o << " seq " << o->op
+           << " " << *osr
+           << " " << o->bytes << " bytes"
+           << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
+           << dendl;
+
+   op_wq.queue(osr);
+ }
+
+ // Take one op slot and o->bytes bytes from the op-queue throttles, then
+ // publish the new throttle levels through the perf counters.
+ void FileStore::op_queue_reserve_throttle(Op *o)
+ {
+   throttle_ops.get();
+   throttle_bytes.get(o->bytes);
+
+   // publish the throttle levels as they stand after the reservation
+   logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
+   logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
+ }
+
+ // Give back one op slot and o->bytes bytes to the op-queue throttles,
+ // then publish the new throttle levels through the perf counters.
+ void FileStore::op_queue_release_throttle(Op *o)
+ {
+   throttle_ops.put();
+   throttle_bytes.put(o->bytes);
+
+   // publish the throttle levels as they stand after the release
+   logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
+   logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
+ }
+
+ // Apply the op at the head of `osr`'s queue.
+ //
+ // Takes osr->apply_lock and leaves it held on return; _finish_op()
+ // releases it. The op is only peeked here, not dequeued.
+ void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
+ {
+   if (!m_disable_wbthrottle) {
+     wbthrottle.throttle();
+   }
+
+   // inject a stall?
+   if (cct->_conf->filestore_inject_stall) {
+     int stall_secs = cct->_conf->filestore_inject_stall;
+     dout(5) << __FUNC__ << ": filestore_inject_stall " << stall_secs << ", sleeping" << dendl;
+     sleep(stall_secs);
+     // one-shot: clear the option once the stall has been served
+     cct->_conf.set_val("filestore_inject_stall", "0");
+     dout(5) << __FUNC__ << ": done stalling" << dendl;
+   }
+
+   osr->apply_lock.Lock();
+   Op *op = osr->peek_queue();
+   op->trace.event("op_apply_start");
+   apply_manager.op_apply_start(op->op);
+   dout(5) << __FUNC__ << ": " << op << " seq " << op->op << " " << *osr << " start" << dendl;
+
+   op->trace.event("_do_transactions start");
+   int result = _do_transactions(op->tls, op->op, &handle, osr->osr_name);
+   op->trace.event("op_apply_finish");
+   apply_manager.op_apply_finish(op->op);
+   dout(10) << __FUNC__ << ": " << op << " seq " << op->op << " r = " << result
+            << ", finisher " << op->onreadable << " " << op->onreadable_sync << dendl;
+ }
+
+ // Complete the op that _do_op() applied on `osr`.
+ //
+ // Dequeues the op, releases osr->apply_lock and the op-queue throttle,
+ // records the apply latency, runs onreadable_sync inline, queues
+ // onreadable and any contexts handed back by dequeue() on the apply
+ // finisher selected by the sequencer id, and frees the op.
+ void FileStore::_finish_op(OpSequencer *osr)
+ {
+   list<Context*> to_queue;
+   Op *op = osr->dequeue(&to_queue);
+
+   op->tls.clear();
+
+   // latency = now - time the op was built
+   utime_t lat = ceph_clock_now();
+   lat -= op->start;
+
+   dout(10) << __FUNC__ << ": " << op << " seq " << op->op << " " << *osr << " lat " << lat << dendl;
+   osr->apply_lock.Unlock(); // locked in _do_op
+   op->trace.event("_finish_op");
+
+   // called with tp lock held
+   op_queue_release_throttle(op);
+
+   logger->tinc(l_filestore_apply_latency, lat);
+
+   if (op->onreadable_sync) {
+     op->onreadable_sync->complete(0);
+   }
+   if (op->onreadable) {
+     apply_finishers[osr->id % m_apply_finisher_num]->queue(op->onreadable);
+   }
+   if (!to_queue.empty()) {
+     apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
+   }
+
+   delete op;
+   op = nullptr;
+ }
+
+ // Completion context that forwards to FileStore::_journaled_ahead() with
+ // the sequencer, op and ondisk context captured at construction. The
+ // result code passed to finish() is not used.
+ struct C_JournaledAhead : public Context {
+   FileStore *fs;
+   FileStore::OpSequencer *osr;
+   FileStore::Op *o;
+   Context *ondisk;
+
+   C_JournaledAhead(FileStore *store,
+                    FileStore::OpSequencer *sequencer,
+                    FileStore::Op *op,
+                    Context *on_disk)
+     : fs(store),
+       osr(sequencer),
+       o(op),
+       ondisk(on_disk)
+   {
+   }
+
+   void finish(int r) override
+   {
+     fs->_journaled_ahead(osr, o, ondisk);
+   }
+ };
+
+// Submit a batch of transactions on the sequencer behind `ch`.
+//
+// The completion contexts carried by `tls` are gathered with
+// Transaction::collect_contexts() into onreadable / ondisk / onreadable_sync.
+// One of three paths is then taken, chosen from the journal state:
+//  - journal present, writeable and not in trailing mode: the op is built,
+//    journaled (parallel or writeahead mode) and handed to queue_op();
+//    returns 0.
+//  - no journal: the op is handed to queue_op() and `ondisk` (if any) is
+//    registered as a waiter with apply_manager; returns 0.
+//  - otherwise (trailing mode, or a journal that is not writeable): the
+//    transactions are applied inline through do_transactions() and journaled
+//    afterwards; returns the result of do_transactions().
+//
+// `osd_op` is passed through to build_op() / _op_journal_transactions() and,
+// when it carries a pg_trace, is used to initialise the trace.  `handle`, if
+// non-null, has its thread-pool timeout suspended around the throttle calls.
+int FileStore::queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+ TrackedOpRef osd_op,
+ ThreadPool::TPHandle *handle)
+{
+ Context *onreadable;
+ Context *ondisk;
+ Context *onreadable_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &onreadable, &ondisk, &onreadable_sync);
+
+ // Blackhole mode: the transaction is dropped; its callbacks are deleted
+ // without being completed and success is reported to the caller.
+ if (cct->_conf->objectstore_blackhole) {
+ dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction"
+ << dendl;
+ delete ondisk;
+ ondisk = nullptr;
+ delete onreadable;
+ onreadable = nullptr;
+ delete onreadable_sync;
+ onreadable_sync = nullptr;
+ return 0;
+ }
+
+ // start time for the l_filestore_queue_transaction_latency_avg counter
+ utime_t start = ceph_clock_now();
+
+ OpSequencer *osr = static_cast<OpSequencer*>(ch.get());
+ dout(5) << __FUNC__ << ": osr " << osr << " " << *osr << dendl;
+
+ // trace stays default-constructed unless the op carries a pg_trace
+ ZTracer::Trace trace;
+ if (osd_op && osd_op->pg_trace) {
+ osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace);
+ trace = osd_op->store_trace;
+ }
+
+ // Path 1: writeable journal in parallel or writeahead mode.
+ if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+
+ //prepare and encode transactions data out of lock
+ bufferlist tbl;
+ int orig_len = journal->prepare_entry(o->tls, &tbl);
+
+ // NOTE(review): presumably the throttle calls below can block, which is
+ // why the thread-pool timeout is suspended around them -- confirm.
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ op_queue_reserve_throttle(o);
+ journal->reserve_throttle_and_backoff(tbl.length());
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ // everything up to op_submit_finish() runs between the submit_manager
+ // start/finish pair for this op sequence number
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+ trace.keyval("opnum", op_num);
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ if (m_filestore_journal_parallel) {
+ dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl;
+
+ trace.keyval("journal mode", "parallel");
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
+
+ // queue inside submit_manager op submission lock
+ queue_op(osr, o);
+ trace.event("op queued");
+ } else if (m_filestore_journal_writeahead) {
+ dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl;
+
+ osr->queue_journal(o);
+
+ trace.keyval("journal mode", "writeahead");
+ trace.event("journal started");
+ // The op is not queued here: C_JournaledAhead::finish() calls
+ // _journaled_ahead(), which calls queue_op() and queues ondisk.
+ _op_journal_transactions(tbl, orig_len, o->op,
+ new C_JournaledAhead(this, osr, o, ondisk),
+ osd_op);
+ } else {
+ // neither parallel nor writeahead is set: no supported mode
+ ceph_abort();
+ }
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return 0;
+ }
+
+ // Path 2: no journal at all.
+ if (!journal) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+ dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl;
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ op_queue_reserve_throttle(o);
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ queue_op(osr, o);
+ trace.keyval("opnum", op_num);
+ trace.keyval("journal mode", "none");
+ trace.event("op queued");
+
+ // without a journal the ondisk callback is registered with apply_manager
+ // against this op number instead of being passed to the journal
+ if (ondisk)
+ apply_manager.add_waiter(op_num, ondisk);
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return 0;
+ }
+
+ // Path 3: a journal exists but path 1 was not taken (trailing mode, or the
+ // journal is not writeable).  The transactions are applied inline first.
+ ceph_assert(journal);
+ //prepare and encode transactions data out of lock
+ bufferlist tbl;
+ // orig_len stays -1 when the journal is not writeable
+ int orig_len = -1;
+ if (journal->is_writeable()) {
+ orig_len = journal->prepare_entry(tls, &tbl);
+ }
+ uint64_t op = submit_manager.op_submit_start();
+ dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl;
+
+ if (m_filestore_do_dump)
+ dump_transactions(tls, op, osr);
+
+ trace.event("op_apply_start");
+ trace.keyval("opnum", op);
+ trace.keyval("journal mode", "trailing");
+ apply_manager.op_apply_start(op);
+ trace.event("do_transactions");
+ int r = do_transactions(tls, op);
+
+ if (r >= 0) {
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
+ } else {
+ // apply failed: nothing is journaled, so the ondisk callback is deleted
+ // without being completed
+ delete ondisk;
+ ondisk = nullptr;
+ }
+
+ // start on_readable finisher after we queue journal item, as on_readable callback
+ // is allowed to delete the Transaction
+ if (onreadable_sync) {
+ onreadable_sync->complete(r);
+ }
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
+
+ submit_manager.op_submit_finish(op);
+ trace.event("op_apply_finish");
+ apply_manager.op_apply_finish(op);
+
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return r;
+}
+
+// Continuation for a writeahead-journaled op (invoked from
+// C_JournaledAhead::finish()): hand the op to queue_op(), collect whatever
+// osr->dequeue_journal() yields, and pass `ondisk` plus those contexts to the
+// ondisk finisher selected by the sequencer id.
+void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
+{
+  dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
+  o->trace.event("writeahead journal finished");
+
+  // The journal completes entries in order, so ops reach queue_op() in order.
+  queue_op(osr, o);
+
+  list<Context*> journal_contexts;
+  osr->dequeue_journal(&journal_contexts);
+
+  const bool have_journal_contexts = !journal_contexts.empty();
+  if (!ondisk && !have_journal_contexts)
+    return;  // nothing to hand to a finisher
+
+  // Completions run on a finisher rather than inline so that onreadable_sync
+  // completions never get stuck behind an ondisk completion.
+  auto &finisher = ondisk_finishers[osr->id % m_ondisk_finisher_num];
+  if (ondisk) {
+    dout(10) << " queueing ondisk " << ondisk << dendl;
+    finisher->queue(ondisk);
+  }
+  if (have_journal_contexts) {
+    finisher->queue(journal_contexts);
+  }
+}
+
+// Apply each transaction of `tls` in order under sequence number op_seq.
+// The position of the transaction within the vector is passed on as its
+// trans_num.  After each transaction the thread-pool timeout of `handle`
+// (if any) is reset.  Always returns 0.
+int FileStore::_do_transactions(
+  vector<Transaction> &tls,
+  uint64_t op_seq,
+  ThreadPool::TPHandle *handle,
+  const char *osr_name)
+{
+  int index = 0;
+  for (auto &txn : tls) {
+    _do_transaction(txn, op_seq, index, handle, osr_name);
+    if (handle) {
+      handle->reset_tp_timeout();
+    }
+    ++index;
+  }
+  return 0;
+}
+
+// Record `spos` in the GLOBAL_REPLAY_GUARD_XATTR xattr of the directory of
+// collection `cid`.
+//
+// Does nothing when the backend can checkpoint.  Otherwise the object map
+// and the filesystem are synced first, the encoded position is written as
+// an xattr on the collection directory, and the directory is fsync'ed.  Any
+// failure along the way aborts the process.
+void FileStore::_set_global_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ // sync all previous operations on this sequencer
+ int ret = object_map->sync();
+ if (ret < 0) {
+ derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+ ret = sync_filesystem(basedir_fd);
+ if (ret < 0) {
+ derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+
+ // open the collection directory; the guard xattr is stored on it
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ // capture errno before any further call can change it
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+
+ // NOTE(review): _inject_failure() is presumably a fault-injection hook for
+ // testing; its definition is not visible here -- confirm.
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v;
+ encode(spos, v);
+ int r = chain_fsetxattr<true, true>(
+ fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
+ << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+// Compare `spos` with the position stored in the GLOBAL_REPLAY_GUARD_XATTR
+// xattr of collection `cid`.
+//
+// Returns 1 (replay allowed) when the collection directory cannot be opened,
+// when the xattr cannot be read, or when spos >= the stored position.
+// Returns -1 when spos is before the stored position.
+int FileStore::_check_global_replay_guard(const coll_t& cid,
+                                          const SequencerPosition& spos)
+{
+  char coll_path[PATH_MAX];
+  get_cdir(cid, coll_path, sizeof(coll_path));
+
+  int coll_fd = ::open(coll_path, O_RDONLY|O_CLOEXEC);
+  if (coll_fd < 0) {
+    // a collection that does not exist carries no guard, so replay is fine
+    dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+    return 1;
+  }
+
+  char raw[100];
+  int raw_len = chain_fgetxattr(coll_fd, GLOBAL_REPLAY_GUARD_XATTR, raw, sizeof(raw));
+  if (raw_len < 0) {
+    dout(20) << __FUNC__ << ": no xattr" << dendl;
+    if (m_filestore_fail_eio && raw_len == -EIO) {
+      handle_eio();
+    }
+    VOID_TEMP_FAILURE_RETRY(::close(coll_fd));
+    return 1;  // no guard recorded
+  }
+
+  bufferlist encoded;
+  encoded.append(raw, raw_len);
+
+  SequencerPosition guard_pos;
+  auto it = encoded.cbegin();
+  decode(guard_pos, it);
+
+  VOID_TEMP_FAILURE_RETRY(::close(coll_fd));
+
+  if (spos >= guard_pos) {
+    return 1;
+  }
+  return -1;
+}
+
+
+// Collection flavour of _set_replay_guard(): open the directory of `cid`
+// and set the replay guard on that descriptor with no object id.  Aborts
+// if the directory cannot be opened.
+void FileStore::_set_replay_guard(const coll_t& cid,
+                                  const SequencerPosition &spos,
+                                  bool in_progress=false)
+{
+  char coll_path[PATH_MAX];
+  get_cdir(cid, coll_path, sizeof(coll_path));
+
+  const int coll_fd = ::open(coll_path, O_RDONLY|O_CLOEXEC);
+  if (coll_fd < 0) {
+    const int open_errno = errno;  // grab errno before logging can disturb it
+    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(open_errno) << dendl;
+    ceph_abort_msg("_set_replay_guard failed");
+  }
+
+  _set_replay_guard(coll_fd, spos, nullptr, in_progress);
+  VOID_TEMP_FAILURE_RETRY(::close(coll_fd));
+}
+
+
+// Write the replay guard xattr (REPLAY_GUARD_XATTR) on the open file or
+// directory `fd`.
+//
+// The xattr holds the encoded `spos` followed by the `in_progress` flag.
+// Does nothing when the backend can checkpoint.  `fd` is fsync'ed before the
+// xattr is written and again afterwards; unless in_progress is set, the
+// object map is synced for `hoid` in between.  A failed fsync or fsetxattr
+// aborts the process.
+void FileStore::_set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *hoid,
+ bool in_progress)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl;
+
+ // NOTE(review): _inject_failure() is presumably a fault-injection hook for
+ // testing; its definition is not visible here -- confirm.
+ _inject_failure();
+
+ // first make sure the previous operation commits
+ int r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ if (!in_progress) {
+ // sync object_map too. even if this object has no header or keys now,
+ // it may have had them in the past and then removed them, so always
+ // sync.
+ object_map->sync(hoid, &spos);
+ }
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v(40);
+ encode(spos, v);
+ encode(in_progress, v);
+ r = chain_fsetxattr<true, true>(
+ fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+// Collection flavour of _close_replay_guard(): open the directory of `cid`
+// and close the replay guard on that descriptor.  Aborts if the directory
+// cannot be opened.
+void FileStore::_close_replay_guard(const coll_t& cid,
+                                    const SequencerPosition &spos)
+{
+  char coll_path[PATH_MAX];
+  get_cdir(cid, coll_path, sizeof(coll_path));
+
+  const int coll_fd = ::open(coll_path, O_RDONLY|O_CLOEXEC);
+  if (coll_fd < 0) {
+    const int open_errno = errno;  // grab errno before logging can disturb it
+    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(open_errno) << dendl;
+    ceph_abort_msg("_close_replay_guard failed");
+  }
+
+  _close_replay_guard(coll_fd, spos);
+  VOID_TEMP_FAILURE_RETRY(::close(coll_fd));
+}
+
+// Rewrite the replay guard xattr (REPLAY_GUARD_XATTR) on `fd` with `spos`
+// and in_progress=false, marking the guarded operation as finished.
+//
+// Does nothing when the backend can checkpoint.  The object map is synced
+// for `hoid` before the xattr is written, and `fd` is fsync'ed afterwards.
+// A failed fsetxattr or fsync aborts the process.
+void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos,
+ const ghobject_t *hoid)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << __FUNC__ << ": " << spos << dendl;
+
+ // NOTE(review): _inject_failure() is presumably a fault-injection hook for
+ // testing; its definition is not visible here -- confirm.
+ _inject_failure();
+
+ // sync object_map too. even if this object has no header or keys now,
+ // it may have had them in the past and then removed them, so always
+ // sync.
+ object_map->sync(hoid, &spos);
+
+ // then record that we are done with this operation
+ bufferlist v(40);
+ encode(spos, v);
+ bool in_progress = false;
+ encode(in_progress, v);
+ int r = chain_fsetxattr<true, true>(
+ fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+// Decide whether an op at `spos` on object `oid` in collection `cid` may be
+// applied.
+//
+// Returns 1 outright when not replaying or when the backend can checkpoint.
+// Otherwise the collection's global guard is consulted first (a negative
+// verdict is returned as is), then the guard on the object file itself.  An
+// object that cannot be opened has no guard, which also yields 1.
+int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid,
+                                   const SequencerPosition& spos)
+{
+  const bool guards_apply = replaying && !backend->can_checkpoint();
+  if (!guards_apply) {
+    return 1;
+  }
+
+  const int global_verdict = _check_global_replay_guard(cid, spos);
+  if (global_verdict < 0) {
+    return global_verdict;
+  }
+
+  FDRef fd;
+  if (lfn_open(cid, oid, false, &fd) < 0) {
+    dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl;
+    return 1;  // no file means no guard, so the op can be replayed
+  }
+
+  const int object_verdict = _check_replay_guard(**fd, spos);
+  lfn_close(fd);
+  return object_verdict;
+}
+
+// Decide whether an op at `spos` on collection `cid` may be applied, based
+// on the replay guard stored on the collection directory.
+//
+// Returns 1 when not replaying, when the backend can checkpoint, or when the
+// directory cannot be opened; otherwise the verdict of the fd-based check.
+int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos)
+{
+  const bool guards_apply = replaying && !backend->can_checkpoint();
+  if (!guards_apply) {
+    return 1;
+  }
+
+  char coll_path[PATH_MAX];
+  get_cdir(cid, coll_path, sizeof(coll_path));
+
+  const int coll_fd = ::open(coll_path, O_RDONLY|O_CLOEXEC);
+  if (coll_fd < 0) {
+    dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+    return 1;  // no collection means no guard, so the op can be replayed
+  }
+
+  const int verdict = _check_replay_guard(coll_fd, spos);
+  VOID_TEMP_FAILURE_RETRY(::close(coll_fd));
+  return verdict;
+}
+
+// Compare `spos` with the replay guard stored in REPLAY_GUARD_XATTR on `fd`.
+//
+// Return value:
+//    1  apply the op: not replaying, backend can checkpoint, no guard xattr,
+//       or the guard position is before spos
+//    0  guard position equals spos and the guard is marked in_progress
+//   -1  skip the op: guard position is after spos, or equals spos with
+//       in_progress unset
+int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
+{
+  const bool guards_apply = replaying && !backend->can_checkpoint();
+  if (!guards_apply) {
+    return 1;
+  }
+
+  char raw[100];
+  int raw_len = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, raw, sizeof(raw));
+  if (raw_len < 0) {
+    dout(20) << __FUNC__ << ": no xattr" << dendl;
+    if (m_filestore_fail_eio && raw_len == -EIO) {
+      handle_eio();
+    }
+    return 1;  // no guard recorded
+  }
+
+  bufferlist encoded;
+  encoded.append(raw, raw_len);
+
+  SequencerPosition opos;
+  auto p = encoded.cbegin();
+  decode(opos, p);
+
+  // guards written by older versions carry no in_progress flag
+  bool in_progress = false;
+  if (!p.end()) {
+    decode(in_progress, p);
+  }
+
+  if (opos > spos) {
+    dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos
+             << ", now or in future, SKIPPING REPLAY" << dendl;
+    return -1;
+  }
+
+  if (opos == spos) {
+    if (!in_progress) {
+      dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+               << ", in_progress=false, SKIPPING REPLAY" << dendl;
+      return -1;
+    }
+    dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+             << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
+    return 0;
+  }
+
+  dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos
+           << ", in past, will replay" << dendl;
+  return 1;
+}
+
+// Apply every op encoded in transaction `t`, in order.
+//
+// op_seq and trans_num, together with a per-op counter that starts at 0,
+// form the SequencerPosition (spos) handed to the replay-guard checks and to
+// the mutating helpers.  Most object and collection ops are only executed
+// when _check_replay_guard() returns > 0 for their target.  For object ops
+// the collection is swapped for its temp collection whenever
+// _need_temp_object_collection() says so.  `handle`, if non-null, has its
+// thread-pool timeout reset before each op.  osr_name is only passed to
+// tracepoints.
+//
+// There is no return value: a negative result from an op is either
+// tolerated (see the checks after the switch) or ends in ceph_abort_msg()
+// after the transaction has been dumped to the log.
+void FileStore::_do_transaction(
+  Transaction& t, uint64_t op_seq, int trans_num,
+  ThreadPool::TPHandle *handle,
+  const char *osr_name)
+{
+  dout(10) << __FUNC__ << ": on " << &t << dendl;
+
+  Transaction::iterator i = t.begin();
+
+  SequencerPosition spos(op_seq, trans_num, 0);
+  while (i.have_op()) {
+    if (handle)
+      handle->reset_tp_timeout();
+
+    Transaction::Op *op = i.decode_op();
+    int r = 0;
+
+    _inject_failure();
+
+    switch (op->op) {
+    case Transaction::OP_NOP:
+      break;
+    case Transaction::OP_TOUCH:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, touch_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _touch(cid, oid);
+        tracepoint(objectstore, touch_exit, r);
+      }
+      break;
+
+    case Transaction::OP_WRITE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+        uint32_t fadvise_flags = i.get_fadvise_flags();
+        bufferlist bl;
+        i.decode_bl(bl);
+        tracepoint(objectstore, write_enter, osr_name, off, len);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _write(cid, oid, off, len, bl, fadvise_flags);
+        tracepoint(objectstore, write_exit, r);
+      }
+      break;
+
+    case Transaction::OP_ZERO:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+        tracepoint(objectstore, zero_enter, osr_name, off, len);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _zero(cid, oid, off, len);
+        tracepoint(objectstore, zero_exit, r);
+      }
+      break;
+
+    case Transaction::OP_TRIMCACHE:
+      {
+        // deprecated, no-op
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        tracepoint(objectstore, truncate_enter, osr_name, off);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _truncate(cid, oid, off);
+        tracepoint(objectstore, truncate_exit, r);
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, remove_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _remove(cid, oid, spos);
+        tracepoint(objectstore, remove_exit, r);
+      }
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        string name = i.decode_string();
+        bufferlist bl;
+        i.decode_bl(bl);
+        tracepoint(objectstore, setattr_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0) {
+          map<string, bufferptr> to_set;
+          to_set[name] = bufferptr(bl.c_str(), bl.length());
+          r = _setattrs(cid, oid, to_set, spos);
+          if (r == -ENOSPC)
+            dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid
+                    << " name " << name << " size " << bl.length() << dendl;
+        }
+        tracepoint(objectstore, setattr_exit, r);
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        map<string, bufferptr> aset;
+        i.decode_attrset(aset);
+        tracepoint(objectstore, setattrs_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _setattrs(cid, oid, aset, spos);
+        tracepoint(objectstore, setattrs_exit, r);
+        if (r == -ENOSPC)
+          dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl;
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        string name = i.decode_string();
+        tracepoint(objectstore, rmattr_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _rmattr(cid, oid, name.c_str(), spos);
+        tracepoint(objectstore, rmattr_exit, r);
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, rmattrs_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _rmattrs(cid, oid, spos);
+        tracepoint(objectstore, rmattrs_exit, r);
+      }
+      break;
+
+    case Transaction::OP_CLONE:
+      {
+        // no replay-guard check here; spos is handed to _clone() instead
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        tracepoint(objectstore, clone_enter, osr_name);
+        r = _clone(cid, oid, noid, spos);
+        tracepoint(objectstore, clone_exit, r);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+        tracepoint(objectstore, clone_range_enter, osr_name, len);
+        // source and destination offsets are the same for this op
+        r = _clone_range(cid, oid, ncid, noid, off, len, off, spos);
+        tracepoint(objectstore, clone_range_exit, r);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+          _cid : _cid.get_temp();
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+        tracepoint(objectstore, clone_range2_enter, osr_name, len);
+        r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos);
+        tracepoint(objectstore, clone_range2_exit, r);
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        tracepoint(objectstore, mkcoll_enter, osr_name);
+        if (_check_replay_guard(cid, spos) > 0)
+          r = _create_collection(cid, op->split_bits, spos);
+        tracepoint(objectstore, mkcoll_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_SET_BITS:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        int bits = op->split_bits;
+        r = _collection_set_bits(cid, bits);
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        uint32_t type = op->hint_type;
+        bufferlist hint;
+        i.decode_bl(hint);
+        auto hiter = hint.cbegin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          decode(pg_num, hiter);
+          decode(num_objs, hiter);
+          if (_check_replay_guard(cid, spos) > 0) {
+            r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+          }
+        } else {
+          // Ignore the hint
+          dout(10) << "Unrecognized collection hint type: " << type << dendl;
+        }
+      }
+      break;
+
+    case Transaction::OP_RMCOLL:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        tracepoint(objectstore, rmcoll_enter, osr_name);
+        if (_check_replay_guard(cid, spos) > 0)
+          r = _destroy_collection(cid);
+        tracepoint(objectstore, rmcoll_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      {
+        const coll_t &ocid = i.get_cid(op->cid);
+        const coll_t &ncid = i.get_cid(op->dest_cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+
+        ceph_assert(oid.hobj.pool >= -1);
+
+        // always followed by OP_COLL_REMOVE
+        Transaction::Op *op2 = i.decode_op();
+        const coll_t &ocid2 = i.get_cid(op2->cid);
+        const ghobject_t &oid2 = i.get_oid(op2->oid);
+        ceph_assert(op2->op == Transaction::OP_COLL_REMOVE);
+        ceph_assert(ocid2 == ocid);
+        ceph_assert(oid2 == oid);
+
+        tracepoint(objectstore, coll_add_enter);
+        r = _collection_add(ncid, ocid, oid, spos);
+        tracepoint(objectstore, coll_add_exit, r);
+        // the consumed OP_COLL_REMOVE counts as an op of its own
+        spos.op++;
+        if (r < 0)
+          break;
+        tracepoint(objectstore, coll_remove_enter, osr_name);
+        if (_check_replay_guard(ocid, oid, spos) > 0)
+          r = _remove(ocid, oid, spos);
+        tracepoint(objectstore, coll_remove_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      {
+        // WARNING: this is deprecated and buggy; only here to replay old journals.
+        const coll_t &ocid = i.get_cid(op->cid);
+        const coll_t &ncid = i.get_cid(op->dest_cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        tracepoint(objectstore, coll_move_enter);
+        r = _collection_add(ocid, ncid, oid, spos);
+        if (r == 0 &&
+            (_check_replay_guard(ocid, oid, spos) > 0))
+          r = _remove(ocid, oid, spos);
+        tracepoint(objectstore, coll_move_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+        const coll_t &_oldcid = i.get_cid(op->cid);
+        const ghobject_t &oldoid = i.get_oid(op->oid);
+        const coll_t &_newcid = i.get_cid(op->dest_cid);
+        const ghobject_t &newoid = i.get_oid(op->dest_oid);
+        const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ?
+          _oldcid : _oldcid.get_temp();
+        // The destination resolves against the destination collection: when
+        // no temp collection is needed it is _newcid itself, not the source
+        // collection.
+        const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ?
+          _newcid : _newcid.get_temp();
+        tracepoint(objectstore, coll_move_rename_enter);
+        r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+        tracepoint(objectstore, coll_move_rename_exit, r);
+      }
+      break;
+
+    case Transaction::OP_TRY_RENAME:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oldoid = i.get_oid(op->oid);
+        const ghobject_t &newoid = i.get_oid(op->dest_oid);
+        const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ?
+          _cid : _cid.get_temp();
+        const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, coll_try_rename_enter);
+        r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true);
+        tracepoint(objectstore, coll_try_rename_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+    case Transaction::OP_COLL_RMATTR:
+      ceph_abort_msg("collection attr methods no longer implemented");
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      {
+        r = -EOPNOTSUPP;
+      }
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, omap_clear_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _omap_clear(cid, oid, spos);
+        tracepoint(objectstore, omap_clear_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        map<string, bufferlist> aset;
+        i.decode_attrset(aset);
+        tracepoint(objectstore, omap_setkeys_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _omap_setkeys(cid, oid, aset, spos);
+        tracepoint(objectstore, omap_setkeys_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        set<string> keys;
+        i.decode_keyset(keys);
+        tracepoint(objectstore, omap_rmkeys_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _omap_rmkeys(cid, oid, keys, spos);
+        tracepoint(objectstore, omap_rmkeys_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+        tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _omap_rmkeyrange(cid, oid, first, last, spos);
+        tracepoint(objectstore, omap_rmkeyrange_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        bufferlist bl;
+        i.decode_bl(bl);
+        tracepoint(objectstore, omap_setheader_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _omap_setheader(cid, oid, bl, spos);
+        tracepoint(objectstore, omap_setheader_exit, r);
+      }
+      break;
+    case Transaction::OP_SPLIT_COLLECTION:
+      {
+        ceph_abort_msg("not legacy journal; upgrade to firefly first");
+      }
+      break;
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+        coll_t dest = i.get_cid(op->dest_cid);
+        tracepoint(objectstore, split_coll2_enter, osr_name);
+        r = _split_collection(cid, bits, rem, dest, spos);
+        tracepoint(objectstore, split_coll2_exit, r);
+      }
+      break;
+
+    case Transaction::OP_MERGE_COLLECTION:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        coll_t dest = i.get_cid(op->dest_cid);
+        tracepoint(objectstore, merge_coll_enter, osr_name);
+        r = _merge_collection(cid, bits, dest, spos);
+        tracepoint(objectstore, merge_coll_exit, r);
+      }
+      break;
+
+    case Transaction::OP_SETALLOCHINT:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t expected_object_size = op->expected_object_size;
+        uint64_t expected_write_size = op->expected_write_size;
+        tracepoint(objectstore, setallochint_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _set_alloc_hint(cid, oid, expected_object_size,
+                              expected_write_size);
+        tracepoint(objectstore, setallochint_exit, r);
+      }
+      break;
+
+    default:
+      derr << "bad op " << op->op << dendl;
+      ceph_abort();
+    }
+
+    // Error policy: decide whether a negative result may be ignored.  If
+    // not, log the transaction and abort.
+    if (r < 0) {
+      bool ok = false;
+
+      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+                            op->op == Transaction::OP_CLONE ||
+                            op->op == Transaction::OP_CLONERANGE2 ||
+                            op->op == Transaction::OP_COLL_ADD ||
+                            op->op == Transaction::OP_SETATTR ||
+                            op->op == Transaction::OP_SETATTRS ||
+                            op->op == Transaction::OP_RMATTR ||
+                            op->op == Transaction::OP_OMAP_SETKEYS ||
+                            op->op == Transaction::OP_OMAP_RMKEYS ||
+                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+                            op->op == Transaction::OP_OMAP_SETHEADER))
+        // -ENOENT is normally okay
+        // ...including on a replayed OP_RMCOLL with checkpoint mode
+        ok = true;
+      if (r == -ENODATA)
+        ok = true;
+
+      if (op->op == Transaction::OP_SETALLOCHINT)
+        // Either EOPNOTSUPP or EINVAL most probably.  EINVAL in most
+        // cases means invalid hint size (e.g. too big, not a multiple
+        // of block size, etc) or, at least on xfs, an attempt to set
+        // or change it when the file is not empty.  However,
+        // OP_SETALLOCHINT is advisory, so ignore all errors.
+        ok = true;
+
+      // During replay without checkpoints an op may be applied a second
+      // time, so a few more error codes are tolerated.
+      if (replaying && !backend->can_checkpoint()) {
+        if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) {
+          dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+          ok = true;
+        }
+        if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) {
+          dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+          ok = true;
+        }
+        if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) {
+          dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+          ok = true;
+        }
+        if (r == -ERANGE) {
+          dout(10) << "tolerating ERANGE on replay" << dendl;
+          ok = true;
+        }
+        if (r == -ENOENT) {
+          dout(10) << "tolerating ENOENT on replay" << dendl;
+          ok = true;
+        }
+      }
+
+      if (!ok) {
+        const char *msg = "unexpected error code";
+
+        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+                             op->op == Transaction::OP_CLONE ||
+                             op->op == Transaction::OP_CLONERANGE2)) {
+          msg = "ENOENT on clone suggests osd bug";
+        } else if (r == -ENOSPC) {
+          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+          // by partially applying transactions.
+          msg = "ENOSPC from disk filesystem, misconfigured cluster";
+        } else if (r == -ENOTEMPTY) {
+          msg = "ENOTEMPTY suggests garbage data in osd data dir";
+        } else if (r == -EPERM) {
+          msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption";
+        }
+
+        derr  << " error " << cpp_strerror(r) << " not handled on operation " << op
+              << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl;
+        dout(0) << msg << dendl;
+        dout(0) << " transaction dump:\n";
+        JSONFormatter f(true);
+        f.open_object_section("transaction");
+        t.dump(&f);
+        f.close_section();
+        f.flush(*_dout);
+        *_dout << dendl;
+
+        if (r == -EMFILE) {
+          dump_open_fds(cct);
+        }
+
+        ceph_abort_msg("unexpected error");
+      }
+    }
+
+    spos.op++;
+  }
+
+  _inject_failure();
+}
+
+ /*********************************************/
+
+
+
+// --------------------
+// objects
+
+// Report whether `oid` exists in the collection behind `ch`: true exactly
+// when FileStore::stat() returns 0 for it.  Waits for the sequencer's
+// wait_for_apply(oid) before looking.
+bool FileStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+  tracepoint(objectstore, exists_enter, ch->cid.c_str());
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(oid);
+
+  struct stat unused_st;
+  const int stat_result = stat(ch, oid, &unused_st);
+  bool found = (stat_result == 0);
+
+  tracepoint(objectstore, exists_exit, found);
+  return found;
+}
+
+// Stat object `oid` in the collection behind `ch`, filling `*st`.
+//
+// The temp collection is used when _need_temp_object_collection() says so.
+// Returns the result of lfn_stat().  When filestore_debug_inject_read_err is
+// set and debug_mdata_eio(oid) is true, -EIO is returned instead.  An -EIO
+// from lfn_stat() trips an assertion when m_filestore_fail_eio is set and
+// allow_eio is false.
+int FileStore::stat(
+  CollectionHandle& ch, const ghobject_t& oid, struct stat *st, bool allow_eio)
+{
+  tracepoint(objectstore, stat_enter, ch->cid.c_str());
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(oid);
+
+  const bool use_temp = _need_temp_object_collection(ch->cid, oid);
+  const coll_t& cid = use_temp ? ch->cid.get_temp() : ch->cid;
+
+  int r = lfn_stat(cid, oid, st);
+  ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+
+  if (r >= 0) {
+    dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+             << " = " << r
+             << " (size " << st->st_size << ")" << dendl;
+  } else {
+    dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+             << " = " << r << dendl;
+  }
+
+  // debug hook: pretend the metadata read failed
+  const bool inject_eio =
+    cct->_conf->filestore_debug_inject_read_err && debug_mdata_eio(oid);
+  if (inject_eio) {
+    return -EIO;
+  }
+
+  tracepoint(objectstore, stat_exit, r);
+  return r;
+}
+
+// Per-collection pool options are not supported by FileStore: both
+// arguments are ignored and -EOPNOTSUPP is returned unconditionally.
+int FileStore::set_collection_opts(
+  CollectionHandle& ch,
+  const pool_opts_t& opts)
+{
+  const int not_supported = -EOPNOTSUPP;
+  return not_supported;
+}
+
+// Read up to `len` bytes at `offset` from object `oid` into `bl`.
+//
+// offset == 0 together with len == 0 means "read the whole object": len is
+// replaced by the file size from fstat().  `bl` is cleared and then holds
+// exactly the bytes read.  op_flags may carry CEPH_OSD_OP_FLAG_FADVISE_*
+// hints, which are forwarded to posix_fadvise() where available.
+//
+// Returns the number of bytes read, or a negative error from lfn_open() or
+// safe_pread().  The debug options filestore_debug_inject_read_err and
+// filestore_debug_random_read_err can turn a successful read into -EIO.
+int FileStore::read(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t len,
+  bufferlist& bl,
+  uint32_t op_flags)
+{
+  tracepoint(objectstore, read_enter, ch->cid.c_str(), offset, len);
+
+  const bool use_temp = _need_temp_object_collection(ch->cid, oid);
+  const coll_t& cid = use_temp ? ch->cid.get_temp() : ch->cid;
+
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(oid);
+
+  FDRef fd;
+  const int open_result = lfn_open(cid, oid, false, &fd);
+  if (open_result < 0) {
+    dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: "
+             << cpp_strerror(open_result) << dendl;
+    return open_result;
+  }
+
+  const bool whole_object = (offset == 0 && len == 0);
+  if (whole_object) {
+    struct stat st;
+    memset(&st, 0, sizeof(st));
+    int r = ::fstat(**fd, &st);
+    ceph_assert(r == 0);
+    len = st.st_size;
+  }
+
+#ifdef HAVE_POSIX_FADVISE
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM) {
+    posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM);
+  }
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) {
+    posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL);
+  }
+#endif
+
+  // allocate room for the whole request up front, trim to what was read
+  bufferptr chunk(len);
+  const int got = safe_pread(**fd, chunk.c_str(), len, offset);
+  if (got < 0) {
+    dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl;
+    lfn_close(fd);
+    return got;
+  }
+  chunk.set_length(got);
+  bl.clear();
+  bl.push_back(std::move(chunk));
+
+#ifdef HAVE_POSIX_FADVISE
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) {
+    posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
+  }
+  // undo a RANDOM/SEQUENTIAL hint given before the read
+  if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)) {
+    posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
+  }
+#endif
+
+  if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+    ostringstream ss;
+    const int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+    if (errors != 0) {
+      dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+              << got << " ... BAD CRC:\n" << ss.str() << dendl;
+      ceph_abort_msg("bad crc on read");
+    }
+  }
+
+  lfn_close(fd);
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+           << got << "/" << len << dendl;
+
+  // debug hook: deterministic EIO injection for selected objects
+  if (cct->_conf->filestore_debug_inject_read_err &&
+      debug_data_eio(oid)) {
+    return -EIO;
+  }
+
+  // debug hook: random EIO injection
+  if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
+      cct->_conf->filestore_debug_random_read_err &&
+      (rand() % (int)(cct->_conf->filestore_debug_random_read_err *
+                      100.0)) == 0) {
+    dout(0) << __func__ << ": inject random EIO" << dendl;
+    return -EIO;
+  }
+
+  tracepoint(objectstore, read_exit, got);
+  return got;
+}
+/*
+ * _do_fiemap - record the mapped extents of [offset, offset+len) of fd in *m
+ *
+ * Asks the backend for an extent list and stores each run as
+ * (*m)[logical start] = length, merging runs that are logically adjacent.
+ * The query is then re-issued from the end of the last stored run until
+ * that run carries FIEMAP_EXTENT_LAST or the requested length is used up.
+ *
+ * Returns the negative value from backend->do_fiemap() on failure,
+ * otherwise the value of the last do_fiemap() call.
+ */
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+ uint64_t i;
+ struct fiemap_extent *extent = nullptr;
+ struct fiemap *fiemap = nullptr; // set through &fiemap by the backend; released with free() below
+ int r = 0;
+
+more:
+ r = backend->do_fiemap(fd, offset, len, &fiemap);
+ if (r < 0)
+ return r; // NOTE(review): fiemap is not freed here; presumably the backend hands nothing back on error - confirm
+
+ if (fiemap->fm_mapped_extents == 0) { // nothing mapped in what is left of the range
+ free(fiemap);
+ return r;
+ }
+
+ extent = &fiemap->fm_extents[0];
+
+ /* start where we were asked to start */
+ if (extent->fe_logical < offset) {
+ extent->fe_length -= offset - extent->fe_logical;
+ extent->fe_logical = offset;
+ }
+
+ i = 0;
+
+ struct fiemap_extent *last = nullptr; // last extent stored in *m; set on every pass of the loop below
+ while (i < fiemap->fm_mapped_extents) {
+ struct fiemap_extent *next = extent + 1;
+
+ dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents
+ << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+
+ /* try to merge extents */
+ while ((i < fiemap->fm_mapped_extents - 1) &&
+ (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+ next->fe_length += extent->fe_length; // fold the current run into its successor ...
+ next->fe_logical = extent->fe_logical; // ... which now starts where the current run started
+ extent = next;
+ next = extent + 1;
+ i++;
+ }
+
+ if (extent->fe_logical + extent->fe_length > offset + len) // clamp to the end of the requested range
+ extent->fe_length = offset + len - extent->fe_logical;
+ (*m)[extent->fe_logical] = extent->fe_length;
+ i++;
+ last = extent++;
+ }
+ uint64_t xoffset = last->fe_logical + last->fe_length - offset; // bytes of the request covered up to the end of the last stored run
+ offset = last->fe_logical + last->fe_length; // next query starts right after the last stored run
+ len -= xoffset;
+ const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0);
+ free(fiemap);
+ if (!is_last) {
+ goto more;
+ }
+
+ return r;
+}
+
+/*
+ * _do_seek_hole_data - map the data regions of [offset, offset+len) of fd
+ *
+ * Where lseek() SEEK_DATA/SEEK_HOLE are available, walks the range and
+ * stores every data region as (*m)[start] = length, clamped to the end of
+ * the range.  Elsewhere the whole range is reported as one data region.
+ *
+ * Returns 0 on success or -errno from a failed lseek().
+ */
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+                                  map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+  off_t hole_pos, data_pos;
+  int r = 0;
+
+  // If lseek fails with errno setting to be ENXIO, this means the current
+  // file offset is beyond the end of the file.
+  const off_t range_end = (off_t)(offset + len);
+  off_t start = offset;
+  while (start < range_end) {
+    data_pos = lseek(fd, start, SEEK_DATA);
+    if (data_pos < 0) {
+      if (errno == ENXIO)
+        break;
+      r = -errno;
+      dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+      return r;
+    } else if (data_pos >= range_end) {
+      // The next data starts at or past the end of the range.  ">=" (not
+      // ">") so that data beginning exactly at the end is not recorded as
+      // a zero-length extent.
+      break;
+    }
+
+    hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+    if (hole_pos < 0) {
+      if (errno == ENXIO)
+        break;
+      r = -errno;
+      dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if (hole_pos >= range_end) {
+      // data runs to (or past) the end of the range: clamp and finish
+      (*m)[data_pos] = offset + len - data_pos;
+      break;
+    }
+    (*m)[data_pos] = hole_pos - data_pos;
+    start = hole_pos;
+  }
+
+  return r;
+#else
+  (*m)[offset] = len;
+  return 0;
+#endif
+}
+
+/*
+ * fiemap - bufferlist flavour of the extent query
+ *
+ * Builds the extent map through the map-returning overload and, when that
+ * succeeds, encodes the map into bl.  The overload's result is passed
+ * through unchanged.
+ */
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+                      uint64_t offset, size_t len,
+                      bufferlist& bl)
+{
+  map<uint64_t, uint64_t> extents;
+  const int ret = fiemap(ch, oid, offset, len, extents);
+  if (ret < 0)
+    return ret;
+
+  encode(extents, bl);
+  return ret;
+}
+
+/*
+ * fiemap - report which parts of [offset, offset+len) of an object hold data
+ *
+ * destmap is cleared and filled with offset -> length entries.  When the
+ * backend supports neither seek-data/hole nor fiemap, or len does not
+ * exceed m_filestore_fiemap_threshold, the whole range is reported as a
+ * single extent without touching the file.
+ *
+ * Returns 0 or a negative error code; -EIO goes through handle_eio() when
+ * m_filestore_fail_eio is set.
+ */
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+                      uint64_t offset, size_t len,
+                      map<uint64_t, uint64_t>& destmap)
+{
+  tracepoint(objectstore, fiemap_enter, ch->cid.c_str(), offset, len);
+  const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+  destmap.clear();
+
+  if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) ||
+      len <= (size_t)m_filestore_fiemap_threshold) {
+    destmap[offset] = len;
+    return 0;
+  }
+
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+  // do not inspect the file until queued operations on it have been applied
+  auto osr = static_cast<OpSequencer*>(ch.get());
+  osr->wait_for_apply(oid);
+
+  FDRef fd;
+
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    // message used to read "read couldn't open", copied from another function
+    dout(10) << __FUNC__ << ": couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
+    goto done;
+  }
+
+  if (backend->has_seek_data_hole()) {
+    dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+    r = _do_seek_hole_data(**fd, offset, len, &destmap);
+  } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+    r = _do_fiemap(**fd, offset, len, &destmap);
+  }
+
+  lfn_close(fd);
+
+done:
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl;
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  tracepoint(objectstore, fiemap_exit, r);
+  return r;
+}
+
+/*
+ * _remove - unlink an object from a collection
+ *
+ * Thin logging wrapper around lfn_unlink(); its result is returned as is.
+ */
+int FileStore::_remove(const coll_t& cid, const ghobject_t& oid,
+                       const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+  const int ret = lfn_unlink(cid, oid, spos);
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << ret << dendl;
+  return ret;
+}
+
+/*
+ * _truncate - set the size of an object
+ *
+ * Thin logging wrapper around lfn_truncate(); its result is returned as is.
+ */
+int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl;
+  const int ret = lfn_truncate(cid, oid, size);
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << ret << dendl;
+  return ret;
+}
+
+
+/*
+ * _touch - open an object with lfn_open()'s third argument set to true and
+ * close it again straight away
+ *
+ * Returns the lfn_open() result.  The completion line is only logged when
+ * the open succeeded.
+ */
+int FileStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+  FDRef handle;
+  const int ret = lfn_open(cid, oid, true, &handle);
+  if (ret < 0)
+    return ret;
+
+  lfn_close(handle);
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << ret << dendl;
+  return ret;
+}
+/*
+ * _write - write the contents of bl into an object at offset
+ *
+ * Opens the object (lfn_open with its third argument true, presumably
+ * create-if-missing), writes bl with bl.write_fd() and then either applies
+ * the DONTNEED hint directly (while replaying, or with the writeback
+ * throttle disabled) or queues the range on wbthrottle.
+ *
+ * len is used for logging, the sloppy-crc update and the throttle entry;
+ * the amount written is whatever bl holds.
+ *
+ * Returns bl.length() on success or a negative error code.
+ */
+int FileStore::_write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int r;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ dout(0) << __FUNC__ << ": couldn't open " << cid << "/"
+ << oid << ": "
+ << cpp_strerror(r) << dendl;
+ goto out;
+ }
+
+ // write
+ r = bl.write_fd(**fd, offset);
+ if (r < 0) {
+ derr << __FUNC__ << ": write_fd on " << cid << "/" << oid
+ << " error: " << cpp_strerror(r) << dendl;
+ lfn_close(fd);
+ goto out;
+ }
+ r = bl.length(); // success is reported as the number of bytes in bl
+
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ ceph_assert(rc >= 0);
+ }
+
+ if (replaying || m_disable_wbthrottle) { // no throttle bookkeeping: apply the cache hint right here
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) {
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+ } else {
+ wbthrottle.queue_wb(fd, oid, offset, len,
+ fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ }
+
+ lfn_close(fd);
+
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
+ return r;
+}
+/*
+ * _zero - make [offset, offset+len) of an object read back as zeros
+ *
+ * When filestore_punch_hole is set and the build has fallocate() with
+ * FALLOC_FL_KEEP_SIZE, a hole is punched and the file is extended with
+ * ftruncate() if the range ends beyond the current size.  Success, or any
+ * error other than -EOPNOTSUPP, ends the call there.  Otherwise the range
+ * is filled by writing a zero buffer through _write().
+ *
+ * Returns 0 after a successful hole punch, the _write() result after the
+ * fallback, or a negative error code.
+ */
+int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int ret = 0;
+
+ if (cct->_conf->filestore_punch_hole) {
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ // first try to punch a hole.
+ FDRef fd;
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0) {
+ goto out;
+ }
+
+ struct stat st; // size before the punch, needed for the extension check below
+ ret = ::fstat(**fd, &st);
+ if (ret < 0) {
+ ret = -errno;
+ lfn_close(fd);
+ goto out;
+ }
+
+ // first try fallocate
+ ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ offset, len);
+ if (ret < 0) {
+ ret = -errno;
+ } else {
+ // ensure we extend file size, if needed
+ if (len > 0 && offset + len > (uint64_t)st.st_size) {
+ ret = ::ftruncate(**fd, offset + len);
+ if (ret < 0) {
+ ret = -errno;
+ lfn_close(fd);
+ goto out;
+ }
+ }
+ }
+ lfn_close(fd);
+
+ if (ret >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_zero(**fd, offset, len); // NOTE(review): fd is used after lfn_close(fd) above - confirm lfn_close leaves the descriptor usable
+ ceph_assert(rc >= 0);
+ }
+
+ if (ret == 0)
+ goto out; // yay!
+ if (ret != -EOPNOTSUPP)
+ goto out; // some other error
+# endif
+# endif
+#endif
+ }
+
+ // lame, kernel is old and doesn't support it.
+ // write zeros.. yuck!
+ dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl;
+ {
+ bufferlist bl;
+ bl.append_zero(len);
+ ret = _write(cid, oid, offset, len, bl); // ret becomes whatever _write() reports, not necessarily 0
+ }
+
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ out:
+# endif
+# endif
+#endif
+ dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+ return ret;
+}
+/*
+ * _clone - copy an object (data, object_map entry and xattrs) to newoid in
+ * the same collection
+ *
+ * Returns 0 without doing anything when _check_replay_guard() reports a
+ * negative value for newoid.  Otherwise: opens oldoid, takes the index
+ * write lock, opens newoid (third lfn_open argument true), truncates it to
+ * zero, clones the full data range, clones the object_map entry (-ENOENT
+ * tolerated), copies the xattrs and the spill-out marker, and finally
+ * records a replay guard on the new object.
+ *
+ * Returns the result of the last step performed (negative on failure);
+ * -EIO goes through handle_eio() when m_filestore_fail_eio is set.
+ */
+int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
+
+ if (_check_replay_guard(cid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n; // o: source (old) object, n: destination (new) object
+ {
+ Index index;
+ r = lfn_open(cid, oldoid, false, &o, &index);
+ if (r < 0) {
+ goto out2;
+ }
+ ceph_assert(index.index);
+ RWLock::WLocker l((index.index)->access_lock); // held until the end of this brace scope
+
+ r = lfn_open(cid, newoid, true, &n, &index);
+ if (r < 0) {
+ goto out;
+ }
+ r = ::ftruncate(**n, 0); // discard any previous contents of the destination
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+ struct stat st;
+ r = ::fstat(**o, &st);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+
+ r = _do_clone_range(**o, **n, 0, st.st_size, 0); // whole source file, same offsets
+ if (r < 0) {
+ goto out3;
+ }
+
+ dout(20) << "objectmap clone" << dendl;
+ r = object_map->clone(oldoid, newoid, &spos);
+ if (r < 0 && r != -ENOENT) // -ENOENT is tolerated here
+ goto out3;
+ }
+
+ {
+ char buf[2];
+ map<string, bufferptr> aset;
+ r = _fgetattrs(**o, aset);
+ if (r < 0)
+ goto out3;
+
+ r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { // source is marked "no spill out": carry that over
+ r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT));
+ } else { // marker missing, unreadable or different: mark the copy as spilled out
+ r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT));
+ }
+ if (r < 0)
+ goto out3;
+
+ r = _fsetattrs(**n, aset);
+ if (r < 0)
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3: // both objects open
+ lfn_close(n);
+ out: // only the source open
+ lfn_close(o);
+ out2: // nothing open
+ dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+/*
+ * _do_clone_range - copy len bytes from fd 'from' at srcoff to fd 'to' at
+ * dstoff
+ *
+ * The copy itself is delegated to backend->clone_range(), whose result is
+ * returned unchanged.
+ */
+int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+  const int ret = backend->clone_range(from, to, srcoff, len, dstoff);
+  return ret;
+}
+
+/*
+ * _do_sparse_copy_range - copy only the data regions of a byte range
+ *
+ * Maps the data regions of [srcoff, srcoff+len) of 'from' (seek-data/hole
+ * if the backend has it, else fiemap), copies each region to the matching
+ * position in 'to' with _do_copy_range(), and finally extends 'to' with
+ * ftruncate() if it is still shorter than dstoff+len, so that a trailing
+ * hole is preserved.
+ *
+ * Returns the number of bytes copied (the sum of the region lengths), 0
+ * for an empty range, or a negative error code.
+ */
+int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+  int r = 0;
+  map<uint64_t, uint64_t> exomap;
+  // fiemap doesn't allow zero length
+  if (len == 0)
+    return 0;
+
+  if (backend->has_seek_data_hole()) {
+    dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+    r = _do_seek_hole_data(from, srcoff, len, &exomap);
+  } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl " << from << " " << srcoff << "~" << len << dendl;
+    r = _do_fiemap(from, srcoff, len, &exomap);
+  }
+
+
+  int64_t written = 0;
+  if (r < 0)
+    goto out;
+
+  for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
+    // translate the source offset of this region into the destination
+    uint64_t it_off = miter->first - srcoff + dstoff;
+    r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
+    if (r < 0) {
+      derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second
+           << " to " << it_off << ", " << cpp_strerror(r) << dendl;
+      break;
+    }
+    written += miter->second;
+  }
+
+  if (r >= 0) {
+    if (m_filestore_sloppy_crc) {
+      // per-region crc updates were skipped above; do the whole range once
+      int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+      ceph_assert(rc >= 0);
+    }
+    struct stat st;
+    r = ::fstat(to, &st);
+    if (r < 0) {
+      r = -errno;
+      derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl;
+      goto out;
+    }
+    // Compare as 64-bit values: the previous (int) cast truncated
+    // dstoff + len, so ends beyond INT_MAX were compared wrongly.
+    if ((uint64_t)st.st_size < dstoff + len) {
+      r = ::ftruncate(to, dstoff + len);
+      if (r < 0) {
+        r = -errno;
+        derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl;
+        goto out;
+      }
+    }
+    r = written;
+  }
+
+ out:
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+
+/*
+ * _do_copy_range - copy len bytes from fd 'from' at srcoff to fd 'to' at
+ * dstoff
+ *
+ * Uses splice() through a pipe when the build and the backend support it,
+ * otherwise a read()/safe_write() loop after seeking both descriptors.
+ * Data moves in chunks of at most buflen bytes.
+ *
+ * A source that ends early is reported as -ERANGE, except while replaying,
+ * where it is tolerated and len is returned.  Unless skip_sloppycrc is
+ * set, the sloppy-crc state is updated on success.
+ *
+ * Returns a non-negative value on success or a negative error code.
+ */
+int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
+{
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+  int r = 0;
+  loff_t pos = srcoff;
+  loff_t end = srcoff + len;
+  // const so that buf below is an ordinary array, not a VLA extension
+  const int buflen = 4096 * 16; //limit by pipe max size.see fcntl
+
+#ifdef CEPH_HAVE_SPLICE
+  if (backend->has_splice()) {
+    int pipefd[2];
+    if (pipe_cloexec(pipefd) < 0) {
+      int e = errno;
+      derr << " pipe " << " got " << cpp_strerror(e) << dendl;
+      return -e;
+    }
+
+    loff_t dstpos = dstoff;
+    while (pos < end) {
+      int l = std::min<int>(end-pos, buflen);
+      r = safe_splice(from, &pos, pipefd[1], nullptr, l, SPLICE_F_NONBLOCK);
+      dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl;
+      if (r < 0) {
+        derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len
+             << ", " << cpp_strerror(r) << dendl;
+        break;
+      }
+      if (r == 0) {
+        // hrm, bad source range, wtf.
+        r = -ERANGE;
+        derr << __FUNC__ << ": got short read result at " << pos
+             << " of fd " << from << " len " << len << dendl;
+        break;
+      }
+
+      // keep the amount moved into the pipe so the log lines below report
+      // the requested length rather than the result of the second splice
+      const int in_pipe = r;
+      r = safe_splice(pipefd[0], nullptr, to, &dstpos, in_pipe, 0);
+      dout(10) << " safe_splice write to " << to << " len " << in_pipe
+               << " got " << r << dendl;
+      if (r < 0) {
+        derr << __FUNC__ << ": write error at " << pos << "~"
+             << in_pipe << ", " << cpp_strerror(r) << dendl;
+        break;
+      }
+    }
+    close(pipefd[0]);
+    close(pipefd[1]);
+  } else
+#endif
+  {
+    int64_t actual;
+
+    actual = ::lseek64(from, srcoff, SEEK_SET);
+    if (actual != (int64_t)srcoff) {
+      if (actual < 0)
+        r = -errno;
+      else
+        r = -EINVAL;
+      derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    actual = ::lseek64(to, dstoff, SEEK_SET);
+    if (actual != (int64_t)dstoff) {
+      if (actual < 0)
+        r = -errno;
+      else
+        r = -EINVAL;
+      derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    char buf[buflen];
+    while (pos < end) {
+      int l = std::min<int>(end-pos, buflen);
+      r = ::read(from, buf, l);
+      dout(25) << "  read from " << pos << "~" << l << " got " << r << dendl;
+      if (r < 0) {
+        if (errno == EINTR) {
+          continue;
+        } else {
+          r = -errno;
+          derr << __FUNC__ << ": read error at " << pos << "~" << len
+               << ", " << cpp_strerror(r) << dendl;
+          break;
+        }
+      }
+      if (r == 0) {
+        // hrm, bad source range, wtf.
+        r = -ERANGE;
+        derr << __FUNC__ << ": got short read result at " << pos
+             << " of fd " << from << " len " << len << dendl;
+        break;
+      }
+      int op = 0;
+      while (op < r) {
+        int r2 = safe_write(to, buf+op, r-op);
+        dout(25) << " write to " << to << " len " << (r-op)
+                 << " got " << r2 << dendl;
+        if (r2 < 0) {
+          // log the attempted length before r is replaced by the error
+          derr << __FUNC__ << ": write error at " << pos << "~"
+               << r-op << ", " << cpp_strerror(r2) << dendl;
+          r = r2;
+          break;
+        }
+        op += (r-op);
+      }
+      if (r < 0)
+        break;
+      pos += r;
+    }
+  }
+
+  if (r < 0 && replaying) {
+    ceph_assert(r == -ERANGE);
+    derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl;
+    r = len;
+  }
+  ceph_assert(replaying || pos == end);
+  if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
+    int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+    ceph_assert(rc >= 0);
+  }
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+
+/*
+ * _clone_range - copy a byte range from one object to another, possibly
+ * across collections
+ *
+ * Returns 0 without doing anything when _check_replay_guard() reports a
+ * negative value for the destination.  Otherwise both objects are opened
+ * (the destination with lfn_open's third argument true), the range is
+ * copied with _do_clone_range(), and on success a replay guard is recorded
+ * on the destination.  The result of the last step performed is returned.
+ */
+int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+                            uint64_t srcoff, uint64_t len, uint64_t dstoff,
+                            const SequencerPosition& spos)
+{
+  dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl;
+
+  if (_check_replay_guard(newcid, newoid, spos) < 0)
+    return 0;
+
+  int r;
+  FDRef src, dst;
+  r = lfn_open(oldcid, oldoid, false, &src);
+  if (r < 0)
+    goto finished;
+
+  r = lfn_open(newcid, newoid, true, &dst);
+  if (r < 0)
+    goto close_src;
+
+  r = _do_clone_range(**src, **dst, srcoff, len, dstoff);
+  if (r >= 0) {
+    // clone is non-idempotent; record our work.
+    _set_replay_guard(**dst, spos, &newoid);
+  }
+
+  lfn_close(dst);
+ close_src:
+  lfn_close(src);
+ finished:
+  dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " "
+           << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+
+/*
+ * SyncEntryTimeout - Context that fires when a commit takes too long
+ *
+ * finish() logs the configured timeout together with a backtrace and then
+ * aborts the process.
+ */
+class SyncEntryTimeout : public Context {
+public:
+  CephContext* cct;
+  explicit SyncEntryTimeout(CephContext* cct, int commit_timeo)
+    : cct(cct), m_commit_timeo(commit_timeo)
+  {
+  }
+
+  // r is ignored; this never returns.
+  void finish(int r) override {
+    // automatic storage: no new/delete pair to keep balanced
+    BackTrace bt(1);
+    generic_dout(-1) << "FileStore: sync_entry timed out after "
+                     << m_commit_timeo << " seconds.\n";
+    bt.print(*_dout);
+    *_dout << dendl;
+    ceph_abort();
+  }
+private:
+  int m_commit_timeo;  // timeout in seconds, only used for the log message
+};
+/*
+ * sync_entry - the commit loop
+ *
+ * Until stop is set: waits on sync_cond for up to
+ * m_filestore_max_sync_interval (skipped when force_sync is already set)
+ * and, for wakeups that were not forced, tops the wait up to
+ * m_filestore_min_sync_interval.  It then pauses op_tp and, if
+ * apply_manager.commit_start() agrees, commits: either through a backend
+ * checkpoint, or through object_map->sync() + backend->syncfs() followed
+ * by a synced op_seq write.  A SyncEntryTimeout is scheduled on timer for
+ * the duration of the commit.  Afterwards the contexts taken from
+ * sync_waiters are completed, and the commit is repeated straight away if
+ * more waiters arrived or the journal asks for it.
+ *
+ * lock is held while waiting and while handling waiters, and released
+ * around the commit itself.  stop is reset to false before returning.
+ */
+void FileStore::sync_entry()
+{
+ lock.Lock();
+ while (!stop) {
+ utime_t max_interval;
+ max_interval.set_from_double(m_filestore_max_sync_interval);
+ utime_t min_interval;
+ min_interval.set_from_double(m_filestore_min_sync_interval);
+
+ utime_t startwait = ceph_clock_now();
+ if (!force_sync) {
+ dout(20) << __FUNC__ << ": waiting for max_interval " << max_interval << dendl;
+ sync_cond.WaitInterval(lock, max_interval);
+ } else {
+ dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl;
+ }
+
+ if (force_sync) {
+ dout(20) << __FUNC__ << ": force_sync set" << dendl;
+ force_sync = false;
+ } else if (stop) {
+ dout(20) << __FUNC__ << ": stop set" << dendl;
+ break;
+ } else {
+ // wait for at least the min interval
+ utime_t woke = ceph_clock_now();
+ woke -= startwait; // woke now holds the time spent waiting so far
+ dout(20) << __FUNC__ << ": woke after " << woke << dendl;
+ if (woke < min_interval) {
+ utime_t t = min_interval;
+ t -= woke;
+ dout(20) << __FUNC__ << ": waiting for another " << t
+ << " to reach min interval " << min_interval << dendl;
+ sync_cond.WaitInterval(lock, t);
+ }
+ }
+
+ list<Context*> fin;
+ again:
+ fin.swap(sync_waiters); // take the current waiters while lock is still held
+ lock.Unlock();
+
+ op_tp.pause(); // every path below calls op_tp.unpause() again
+ if (apply_manager.commit_start()) {
+ utime_t start = ceph_clock_now();
+ uint64_t cp = apply_manager.get_committing_seq();
+
+ sync_entry_timeo_lock.Lock();
+ SyncEntryTimeout *sync_entry_timeo =
+ new SyncEntryTimeout(cct, m_filestore_commit_timeout);
+ if (!timer.add_event_after(m_filestore_commit_timeout,
+ sync_entry_timeo)) {
+ sync_entry_timeo = nullptr; // not scheduled, so there is nothing to cancel later
+ }
+ sync_entry_timeo_lock.Unlock();
+
+ logger->set(l_filestore_committing, 1);
+
+ dout(15) << __FUNC__ << ": committing " << cp << dendl;
+ stringstream errstream;
+ if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) {
+ derr << errstream.str() << dendl;
+ ceph_abort();
+ }
+
+ if (backend->can_checkpoint()) {
+ int err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during write_op_seq");
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ uint64_t cid = 0;
+ err = backend->create_checkpoint(s, &cid);
+ if (err < 0) {
+ int err = errno; // NOTE(review): shadows the returned err and reads errno instead - confirm errno is still meaningful here
+ derr << "snap create '" << s << "' got error " << err << dendl;
+ ceph_assert(err == 0); // NOTE(review): passes if errno happens to be 0 even though create_checkpoint failed
+ }
+
+ snaps.push_back(cp);
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ if (cid > 0) {
+ dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
+ err = backend->sync_checkpoint(cid);
+ if (err < 0) {
+ derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("wait_sync got error");
+ }
+ dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl;
+ }
+ } else {
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ int err = object_map->sync();
+ if (err < 0) {
+ derr << "object_map sync got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("object_map sync returned error");
+ }
+
+ err = backend->syncfs();
+ if (err < 0) {
+ derr << "syncfs got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("syncfs returned error");
+ }
+
+ err = write_op_seq(op_fd, cp); // op_seq is written only after the two syncs above succeeded
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during write_op_seq");
+ }
+ err = ::fsync(op_fd);
+ if (err < 0) {
+ derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during fsync of op_seq");
+ }
+ }
+
+ utime_t done = ceph_clock_now();
+ utime_t lat = done - start; // duration of the commit itself
+ utime_t dur = done - startwait; // whole cycle, including the wait
+ dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl;
+ utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat);
+ if (max_pause_lat < dur - lat) {
+ logger->tinc(l_filestore_sync_pause_max_lat, dur - lat);
+ }
+
+ logger->inc(l_filestore_commitcycle);
+ logger->tinc(l_filestore_commitcycle_latency, lat);
+ logger->tinc(l_filestore_commitcycle_interval, dur);
+
+ apply_manager.commit_finish();
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear();
+ }
+
+ logger->set(l_filestore_committing, 0);
+
+ // remove old snaps?
+ if (backend->can_checkpoint()) {
+ char s[NAME_MAX];
+ while (snaps.size() > 2) { // keep the two newest entries
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
+ snaps.pop_front();
+ dout(10) << "removing snap '" << s << "'" << dendl;
+ int r = backend->destroy_checkpoint(s);
+ if (r) {
+ int err = errno; // NOTE(review): logs errno rather than r - confirm which one carries the error
+ derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
+ }
+ }
+ }
+
+ dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl;
+
+ if (sync_entry_timeo) {
+ Mutex::Locker lock(sync_entry_timeo_lock); // local name shadows the member lock inside this scope
+ timer.cancel_event(sync_entry_timeo);
+ }
+ } else {
+ op_tp.unpause();
+ }
+
+ lock.Lock();
+ finish_contexts(cct, fin, 0);
+ fin.clear();
+ if (!sync_waiters.empty()) {
+ dout(10) << __FUNC__ << ": more waiters, committing again" << dendl;
+ goto again;
+ }
+ if (!stop && journal && journal->should_commit_now()) {
+ dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl;
+ goto again;
+ }
+ }
+ stop = false;
+ lock.Unlock();
+}
+
+/*
+ * do_force_sync - request a commit without waiting for it
+ *
+ * Sets force_sync and signals sync_cond, both under lock.
+ */
+void FileStore::do_force_sync()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  Mutex::Locker guard(lock);
+  force_sync = true;
+  sync_cond.Signal();
+}
+
+/*
+ * start_sync - request a commit and register onsafe with it
+ *
+ * onsafe is appended to sync_waiters, force_sync is set and sync_cond is
+ * signalled, all under lock.  Does not wait for the commit.
+ */
+void FileStore::start_sync(Context *onsafe)
+{
+  Mutex::Locker guard(lock);
+  sync_waiters.push_back(onsafe);
+  // flag and signal are both issued under lock, so their relative order is
+  // not observable by the waiter
+  force_sync = true;
+  sync_cond.Signal();
+  dout(10) << __FUNC__ << dendl;
+}
+
+/*
+ * sync - request a commit and block until it has completed
+ *
+ * Registers a C_SafeCond through start_sync() and waits on its condition
+ * until the done flag is set.
+ */
+void FileStore::sync()
+{
+  Mutex l("FileStore::sync");
+  Cond c;
+  // initialized here so the wait loop never reads an indeterminate value,
+  // whatever the C_SafeCond constructor does with it
+  bool done = false;
+  C_SafeCond *fin = new C_SafeCond(&l, &c, &done);
+
+  start_sync(fin);
+
+  l.Lock();
+  while (!done) {
+    dout(10) << "sync waiting" << dendl;
+    c.Wait(l);
+  }
+  l.Unlock();
+  dout(10) << "sync done" << dendl;
+}
+
+/*
+ * _flush_op_queue - drain op_wq, then wait for every apply finisher to
+ * run empty
+ */
+void FileStore::_flush_op_queue()
+{
+  dout(10) << __FUNC__ << ": draining op tp" << dendl;
+  op_wq.drain();
+
+  dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl;
+  for (Finisher *finisher : apply_finishers)
+    finisher->wait_for_empty();
+}
+
+/*
+ * flush - make every queued write readable
+ *
+ * With filestore_blackhole set this blocks forever.  Otherwise, in
+ * writeahead journal mode the journal is flushed and the ondisk finishers
+ * are drained first; in every mode the op queue is flushed last.
+ */
+void FileStore::flush()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  if (cct->_conf->filestore_blackhole) {
+    // wait forever
+    Mutex lock("FileStore::flush::lock");
+    Cond cond;
+    lock.Lock();
+    for (;;)
+      cond.Wait(lock);
+    ceph_abort();
+  }
+
+  if (m_filestore_journal_writeahead) {
+    if (journal)
+      journal->flush();
+
+    dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl;
+    for (Finisher *finisher : ondisk_finishers)
+      finisher->wait_for_empty();
+  }
+
+  _flush_op_queue();
+  dout(10) << __FUNC__ << ": complete" << dendl;
+}
+
+/*
+ * sync_and_flush - make every queued write readable AND committed to disk
+ *
+ * Writeahead mode: flush the journal (if there is one), then the op queue.
+ * Any other mode (m_filestore_journal_parallel included): flush the op
+ * queue, then run a full sync().
+ */
+void FileStore::sync_and_flush()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  const bool writeahead = m_filestore_journal_writeahead;
+  if (writeahead && journal)
+    journal->flush();
+
+  _flush_op_queue();
+
+  if (!writeahead)
+    sync();
+
+  dout(10) << __FUNC__ << ": done" << dendl;
+}
+
+/*
+ * flush_journal - run sync_and_flush() followed by one more sync()
+ *
+ * Always returns 0.
+ */
+int FileStore::flush_journal()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  sync_and_flush();
+  sync();
+
+  return 0;
+}
+
+/*
+ * snapshot - create a named backend checkpoint
+ *
+ * Everything queued is flushed and committed first.  Fails with
+ * -EOPNOTSUPP when the backend cannot checkpoint; otherwise the result of
+ * backend->create_checkpoint() is returned.
+ */
+int FileStore::snapshot(const string& name)
+{
+  dout(10) << __FUNC__ << ": " << name << dendl;
+  sync_and_flush();
+
+  if (!backend->can_checkpoint()) {
+    dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl;
+    return -EOPNOTSUPP;
+  }
+
+  // checkpoint name = CLUSTER_SNAP_ITEM format applied to the given name
+  char snap_name[NAME_MAX];
+  snprintf(snap_name, sizeof(snap_name), CLUSTER_SNAP_ITEM, name.c_str());
+
+  const int ret = backend->create_checkpoint(snap_name, nullptr);
+  if (ret != 0) {
+    derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(ret) << dendl;
+  }
+
+  return ret;
+}
+
+// -------------------------------
+// attributes
+
+/*
+ * _fgetattr - read one xattr of fd into bp
+ *
+ * A stack buffer of CHAIN_XATTR_MAX_BLOCK_LEN bytes is tried first; if the
+ * value does not fit (-ERANGE) its size is queried and it is read straight
+ * into a buffer of that size.
+ *
+ * Returns the value length or a negative error code.  Asserts instead of
+ * returning -EIO when m_filestore_fail_eio is set.
+ */
+int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
+{
+  char stack_val[CHAIN_XATTR_MAX_BLOCK_LEN];
+  int ret = chain_fgetxattr(fd, name, stack_val, sizeof(stack_val));
+  if (ret == -ERANGE) {
+    // too large for the stack buffer: ask for the size, then fetch for real
+    ret = chain_fgetxattr(fd, name, 0, 0);
+    if (ret > 0) {
+      bp = buffer::create(ret);
+      ret = chain_fgetxattr(fd, name, bp.c_str(), ret);
+    }
+  } else if (ret >= 0) {
+    bp = buffer::create(ret);
+    memcpy(bp.c_str(), stack_val, ret);
+  }
+  ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+  return ret;
+}
+/*
+ * _fgetattrs - read the xattrs of fd into aset
+ *
+ * Lists the attribute names (a 100-byte stack buffer first, a heap buffer
+ * of the reported size on -ERANGE), then walks the NUL-separated list.
+ * For every name accepted by parse_attrname() whose parsed form is not
+ * empty, the value is fetched with _fgetattr() under the raw name and
+ * stored in aset under the parsed name.
+ *
+ * Returns 0 on success or the first negative error code.  Asserts instead
+ * of returning -EIO when m_filestore_fail_eio is set.
+ */
+int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
+{
+ // get attr list
+ char names1[100];
+ int len = chain_flistxattr(fd, names1, sizeof(names1)-1); // one byte is kept back for the terminator written below
+ char *names2 = 0; // heap copy of the list, only used when names1 is too small
+ char *name = 0; // cursor into whichever buffer holds the list
+ if (len == -ERANGE) {
+ len = chain_flistxattr(fd, 0, 0); // size query
+ if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ }
+ dout(10) << " -ERANGE, len is " << len << dendl;
+ names2 = new char[len+1];
+ len = chain_flistxattr(fd, names2, len);
+ dout(10) << " -ERANGE, got " << len << dendl;
+ if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ delete[] names2;
+ return len;
+ }
+ name = names2;
+ } else if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ } else {
+ name = names1;
+ }
+ name[len] = 0; // terminate the list so strlen() below cannot run past it
+
+ char *end = name + len;
+ while (name < end) {
+ char *attrname = name; // raw name as listed; parse_attrname() may move name
+ if (parse_attrname(&name)) {
+ if (*name) {
+ dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl;
+ int r = _fgetattr(fd, attrname, aset[name]); // read under the raw name, store under the parsed one
+ if (r < 0) {
+ delete[] names2; // null when the stack buffer was used
+ return r;
+ }
+ }
+ }
+ name += strlen(name) + 1; // step over this entry and its terminator
+ }
+
+ delete[] names2;
+ return 0;
+}
+
+/*
+ * _fsetattrs - write every attribute in aset to fd as an xattr
+ *
+ * Each key is turned into an xattr name with get_attrname(); an empty
+ * value is written as a zero-length xattr.
+ *
+ * Returns 0, or the first negative result from chain_fsetxattr().  The
+ * loop stops at that first failure, so later attributes are not attempted.
+ */
+int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
+{
+  for (auto& attr : aset) {
+    char xattr_name[CHAIN_XATTR_MAX_NAME_LEN];
+    get_attrname(attr.first.c_str(), xattr_name, CHAIN_XATTR_MAX_NAME_LEN);
+
+    // never hand a pointer from an empty bufferptr to the xattr call
+    const char *val = attr.second.length() ? attr.second.c_str() : "";
+
+    // ??? Why do we skip setting all the other attrs if one fails?
+    const int ret = chain_fsetxattr(fd, xattr_name, val, attr.second.length());
+    if (ret < 0) {
+      derr << __FUNC__ << ": chain_setxattr returned " << ret << dendl;
+      return ret;
+    }
+  }
+  return 0;
+}
+
+// debug EIO injection
+// Add oid to data_error_set (under read_error_lock).
+void FileStore::inject_data_error(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+  data_error_set.insert(oid);
+}
+// Add oid to mdata_error_set (under read_error_lock).
+void FileStore::inject_mdata_error(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+  mdata_error_set.insert(oid);
+}
+
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __FUNC__ << ": clear error on " << oid << dendl;
+ data_error_set.erase(oid);
+ mdata_error_set.erase(oid);
+}
+// True when oid is in data_error_set; a hit is logged.
+bool FileStore::debug_data_eio(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  const bool marked = data_error_set.count(oid) != 0;
+  if (marked) {
+    dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+  }
+  return marked;
+}
+// True when oid is in mdata_error_set; a hit is logged.
+bool FileStore::debug_mdata_eio(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  const bool marked = mdata_error_set.count(oid) != 0;
+  if (marked) {
+    dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+  }
+  return marked;
+}
+
+
+// objects
+/*
+ * getattr - fetch one named attribute of an object into bp
+ *
+ * The attribute is read as an xattr of the object's file first; if that
+ * gives -ENODATA it is looked up in object_map instead.
+ *
+ * Returns 0 on success (not the value length) or a negative error code.
+ * -EIO is returned when metadata error injection is active for oid, and
+ * goes through handle_eio() when m_filestore_fail_eio is set.
+ */
+int FileStore::getattr(CollectionHandle& ch, const ghobject_t& oid, const char *name, bufferptr &bp)
+{
+ tracepoint(objectstore, getattr_enter, ch->cid.c_str());
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid); // queued operations on oid must be applied before reading
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+ char n[CHAIN_XATTR_MAX_NAME_LEN]; // xattr name derived from name by get_attrname()
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = _fgetattr(**fd, n, bp);
+ lfn_close(fd);
+ if (r == -ENODATA) { // no such xattr on the file: try object_map
+ map<string, bufferlist> got;
+ set<string> to_get;
+ to_get.insert(string(name));
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, to_get, &got);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl;
+ goto out;
+ }
+ if (got.empty()) {
+ dout(10) << __FUNC__ << ": got.size() is 0" << dendl;
+ return -ENODATA; // NOTE(review): leaves without the logging, tracepoint and injection check under out:
+ }
+ bp = bufferptr(got.begin()->second.c_str(),
+ got.begin()->second.length());
+ r = bp.length();
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattr_exit, r);
+ return r < 0 ? r : 0; // non-negative lengths collapse to 0
+ }
+}
+/*
+ * getattrs - collect every attribute of an object into aset
+ *
+ * The xattrs stored on the object's file are read with _fgetattrs().
+ * Unless the file's XATTR_SPILL_OUT_NAME xattr equals XATTR_NO_SPILL_OUT,
+ * the attributes kept in object_map are added as well; aset.insert() is
+ * used, so a key already present from the file is left as it is.
+ *
+ * Returns 0 on success or a negative error code.  -EIO is returned when
+ * metadata error injection is active for oid, and goes through
+ * handle_eio() when m_filestore_fail_eio is set.
+ */
+int FileStore::getattrs(CollectionHandle& ch, const ghobject_t& oid, map<string,bufferptr>& aset)
+{
+ tracepoint(objectstore, getattrs_enter, ch->cid.c_str());
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ set<string> omap_attrs; // attribute names known to object_map
+ map<string, bufferlist> omap_aset; // their values
+ Index index;
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid); // queued operations on oid must be applied before reading
+
+ FDRef fd;
+ bool spill_out = true; // assume attributes may live in object_map unless the marker says otherwise
+ char buf[2];
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = false;
+
+ r = _fgetattrs(**fd, aset);
+ lfn_close(fd);
+ fd = FDRef(); // defensive
+ if (r < 0) {
+ goto out;
+ }
+
+ if (!spill_out) { // everything is on the file; skip object_map
+ dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+ goto out;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT) // no object_map entry is not an error here
+ r = 0;
+ }
+ ceph_assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ i != omap_aset.end();
+ ++i) {
+ string key(i->first);
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattrs_exit, r);
+ return r;
+ }
+}
+
+int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos)
+{ /* Set every attribute in `aset` on `oid`, placing each one either in the
+    file's xattrs ("inline") or in object_map. An attribute goes to
+    object_map when the inline read reported -E2BIG, when its value is longer
+    than m_filestore_max_inline_xattr_size, or when it is not yet inline and
+    the inline set already holds m_filestore_max_inline_xattrs entries.
+    Returns r, negative on the failures that jump to out / out_close. */
+ map<string, bufferlist> omap_set; // attrs destined for object_map
+ set<string> omap_remove; // attrs stored inline by this call; removed from object_map below
+ map<string, bufferptr> inline_set; // inline attrs as read from the file, updated while deciding
+ map<string, bufferptr> inline_to_set; // inline attrs this call actually writes
+ FDRef fd;
+ int spill_out = -1; // -1 until the marker is read; then 0 (marked no-spill-out) or 1
+ bool incomplete_inline = false;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = 0;
+ else
+ spill_out = 1; // also taken when the marker could not be read
+
+ r = _fgetattrs(**fd, inline_set);
+ incomplete_inline = (r == -E2BIG); // other _fgetattrs errors are not acted on here
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid
+ << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "")
+ << dendl;
+
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+
+ if (incomplete_inline) {
+ chain_fremovexattr(**fd, n); // ignore any error
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
+ if (inline_set.count(p->first)) { // value outgrew the inline limit: drop the old inline copy
+ inline_set.erase(p->first);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0)
+ goto out_close;
+ }
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) { // no inline slot left for a new name
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+ omap_remove.insert(p->first);
+ inline_set.insert(*p); // no-op when the name is already present; keeps the count accurate
+
+ inline_to_set.insert(*p);
+ }
+
+ if (spill_out != 1 && !omap_set.empty()) {
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, // return value is not checked
+ sizeof(XATTR_SPILL_OUT));
+ }
+
+ r = _fsetattrs(**fd, inline_to_set);
+ if (r < 0)
+ goto out_close;
+
+ if (spill_out && !omap_remove.empty()) { // spill_out is 0 or 1 at this point
+ r = object_map->remove_xattrs(oid, omap_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove_xattrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
+ }
+ }
+
+ if (!omap_set.empty()) {
+ r = object_map->set_xattrs(oid, omap_set, &spos);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos)
+{ /* Remove the single attribute `name` from `oid`. The file xattr is tried
+    first; only when that reports -ENODATA and the object is not marked
+    XATTR_NO_SPILL_OUT is the attribute removed from object_map instead.
+    Returns r from the last step performed (may be a negative errno,
+    including -ENODATA / -ENOENT, which are passed through unchanged). */
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+ FDRef fd;
+ bool spill_out = true; // stays true when the marker cannot be read
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r == -ENODATA && spill_out) { // not inline: fall back to object_map
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ set<string> to_remove;
+ to_remove.insert(string(name));
+ r = object_map->remove_xattrs(oid, to_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ return r;
+}
+
+int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{ /* Remove all attributes of `oid`: every name returned by _fgetattrs() is
+    removed from the file, then (unless the object is marked
+    XATTR_NO_SPILL_OUT) all attrs recorded in object_map are removed and the
+    object is re-marked XATTR_NO_SPILL_OUT. Returns r, negative on failure. */
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ map<string,bufferptr> aset; // inline attrs found on the file
+ FDRef fd;
+ set<string> omap_attrs; // attr names found in object_map
+ Index index;
+ bool spill_out = true;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ r = _fgetattrs(**fd, aset);
+ if (r >= 0) { // a failed read skips inline removal but does not abort the function
+ for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl;
+ goto out_close;
+ }
+ }
+ }
+
+ if (!spill_out) {
+ dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+ goto out_close; // r still holds the result of the inline phase
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl;
+ goto out_close;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, // nothing left in object_map; return value is not checked
+ sizeof(XATTR_NO_SPILL_OUT));
+ }
+
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+
+
+// Remove every object listed in collection `cid`, then destroy the collection.
+// A collection whose stat fails with -ENOENT counts as already removed (0).
+// Any other failure is returned as-is.
+int FileStore::_collection_remove_recursive(const coll_t &cid,
+                                            const SequencerPosition &spos)
+{
+  struct stat sb;
+  int ret = collection_stat(cid, &sb);
+  if (ret == -ENOENT)
+    return 0;
+  if (ret < 0)
+    return ret;
+
+  // walk the collection in batches of up to 300 objects; `cursor` is both
+  // the start of the next batch and the "next" value handed back by the list
+  vector<ghobject_t> batch;
+  ghobject_t cursor;
+  for (; !cursor.is_max(); batch.clear()) {
+    ret = collection_list(cid, cursor, ghobject_t::get_max(),
+                          300, &batch, &cursor);
+    if (ret < 0)
+      return ret;
+    for (ghobject_t& obj : batch) {
+      ceph_assert(_check_replay_guard(cid, obj, spos));
+      ret = _remove(cid, obj, spos);
+      if (ret < 0)
+        return ret;
+    }
+  }
+  return _destroy_collection(cid);
+}
+
+// --------------------------
+// collections
+
+// Convenience overload: list collections with include_temp == false.
+int FileStore::list_collections(vector<coll_t>& ls)
+{
+  const bool include_temp = false;
+  return list_collections(ls, include_temp);
+}
+
+// Append to `ls` every collection found as a sub-directory of
+// "<basedir>/current". The "omap" directory, "." and "..", and names that
+// coll_t::parse() rejects are skipped; temp collections are included only
+// when `include_temp` is true.
+//
+// Returns 0 on success or a negative errno when opendir(), stat() or
+// readdir() fails. Entries appended before a failure stay in `ls`.
+int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
+{
+  tracepoint(objectstore, list_collections_enter);
+  dout(10) << __FUNC__ << dendl;
+
+  char fn[PATH_MAX];
+  snprintf(fn, sizeof(fn), "%s/current", basedir.c_str());
+
+  int r = 0;
+  DIR *dir = ::opendir(fn);
+  if (!dir) {
+    r = -errno;
+    derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl;
+    if (r == -EIO && m_filestore_fail_eio) handle_eio();
+    return r;
+  }
+
+  for (;;) {
+    // readdir() returns nullptr both at end-of-directory and on error; the
+    // only way to tell them apart is to clear errno first and inspect it
+    // afterwards.
+    errno = 0;
+    struct dirent *de = ::readdir(dir);
+    if (!de) {
+      if (errno != 0) {
+        r = -errno;
+        derr << "trying readdir " << fn << ": " << cpp_strerror(-r) << dendl;
+      }
+      break;
+    }
+
+    if (de->d_type == DT_UNKNOWN) {
+      // d_type not supported (non-ext[234], btrfs), must stat
+      struct stat sb;
+      char filename[PATH_MAX];
+      if (int n = snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name);
+          n >= static_cast<int>(sizeof(filename))) {
+        derr << __func__ << " path length overrun: " << n << dendl;
+        ceph_abort();
+      }
+
+      r = ::stat(filename, &sb);
+      if (r < 0) {
+        r = -errno;
+        derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl;
+        if (r == -EIO && m_filestore_fail_eio) handle_eio();
+        break;
+      }
+      if (!S_ISDIR(sb.st_mode)) {
+        continue;
+      }
+    } else if (de->d_type != DT_DIR) {
+      continue;
+    }
+    if (strcmp(de->d_name, "omap") == 0) {
+      continue;
+    }
+    // skip "." and ".."
+    if (de->d_name[0] == '.' &&
+        (de->d_name[1] == '\0' ||
+         (de->d_name[1] == '.' &&
+          de->d_name[2] == '\0')))
+      continue;
+    coll_t cid;
+    if (!cid.parse(de->d_name)) {
+      derr << "ignoring invalid collection '" << de->d_name << "'" << dendl;
+      continue;
+    }
+    if (!cid.is_temp() || include_temp)
+      ls.push_back(cid);
+  }
+
+  ::closedir(dir);
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  tracepoint(objectstore, list_collections_exit, r);
+  return r;
+}
+
+// stat(2) the directory that backs collection `c`, filling `*st`.
+// Returns 0 on success or -errno on failure.
+int FileStore::collection_stat(const coll_t& c, struct stat *st)
+{
+  tracepoint(objectstore, collection_stat_enter, c.c_str());
+
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  int ret = ::stat(path, st);
+  if (ret < 0) {
+    ret = -errno;
+  }
+  dout(10) << __FUNC__ << ": " << path << " = " << ret << dendl;
+
+  if (ret == -EIO && m_filestore_fail_eio) {
+    handle_eio();
+  }
+  tracepoint(objectstore, collection_stat_exit, ret);
+  return ret;
+}
+
+// True exactly when collection_stat() succeeds for `c`.
+bool FileStore::collection_exists(const coll_t& c)
+{
+  tracepoint(objectstore, collection_exists_enter, c.c_str());
+  struct stat unused;
+  const int rc = collection_stat(c, &unused);
+  bool ret = (rc == 0);
+  tracepoint(objectstore, collection_exists_exit, ret);
+  return ret;
+}
+
+// Report through `*empty` whether the index of collection `cid` lists no
+// objects. Returns 0 on success, otherwise the negative error from
+// get_index() or collection_list_partial() (in which case `*empty` is not
+// written).
+int FileStore::collection_empty(const coll_t& cid, bool *empty)
+{
+  tracepoint(objectstore, collection_empty_enter, cid.c_str());
+  dout(15) << __FUNC__ << ": " << cid << dendl;
+
+  Index index;
+  int ret = get_index(cid, &index);
+  if (ret < 0) {
+    derr << __FUNC__ << ": get_index returned: " << cpp_strerror(ret)
+         << dendl;
+    return ret;
+  }
+
+  ceph_assert(index.index);
+  RWLock::RLocker read_guard((index.index)->access_lock);
+
+  // asking for at most one entry is enough to tell empty from non-empty
+  vector<ghobject_t> first_entry;
+  ret = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(),
+                                       1, &first_entry, nullptr);
+  if (ret < 0) {
+    derr << __FUNC__ << ": collection_list_partial returned: "
+         << cpp_strerror(ret) << dendl;
+    if (ret == -EIO && m_filestore_fail_eio) {
+      handle_eio();
+    }
+    return ret;
+  }
+
+  *empty = first_entry.empty();
+  tracepoint(objectstore, collection_empty_exit, *empty);
+  return 0;
+}
+
+// Store `bits` as a 32-bit value in the "bits" attribute of the directory
+// backing collection `c`. Returns -errno when the directory cannot be
+// opened, otherwise the result of chain_fsetxattr().
+int FileStore::_collection_set_bits(const coll_t& c, int bits)
+{
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(10) << __FUNC__ << ": " << path << " " << bits << dendl;
+
+  int ret;
+  int32_t value = bits;
+  const int dirfd = ::open(path, O_RDONLY|O_CLOEXEC);
+  if (dirfd < 0) {
+    ret = -errno;
+  } else {
+    char attr[PATH_MAX];
+    get_attrname("bits", attr, PATH_MAX);
+    ret = chain_fsetxattr(dirfd, attr, (char*)&value, sizeof(value));
+    VOID_TEMP_FAILURE_RETRY(::close(dirfd));
+  }
+
+  dout(10) << __FUNC__ << ": " << path << " " << bits << " = " << ret << dendl;
+  return ret;
+}
+
+// Read the "bits" attribute from the directory backing the collection of
+// `ch`. Returns the stored value, or a negative error when the directory
+// cannot be opened (-errno) or the attribute cannot be read.
+int FileStore::collection_bits(CollectionHandle& ch)
+{
+  char path[PATH_MAX];
+  get_cdir(ch->cid, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  int32_t bits;
+  const int dirfd = ::open(path, O_RDONLY|O_CLOEXEC);
+  if (dirfd < 0) {
+    bits = -errno;
+  } else {
+    char attr[PATH_MAX];
+    get_attrname("bits", attr, PATH_MAX);
+    const int ret = chain_fgetxattr(dirfd, attr, (char*)&bits, sizeof(bits));
+    VOID_TEMP_FAILURE_RETRY(::close(dirfd));
+    if (ret < 0) {
+      // a read failure is reported through the same return slot
+      bits = ret;
+    }
+  }
+
+  dout(10) << __FUNC__ << ": " << path << " = " << bits << dendl;
+  return bits;
+}
+
+int FileStore::collection_list(const coll_t& c,
+ const ghobject_t& orig_start,
+ const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next)
+{ /* List objects of collection `c` in [orig_start, end) into `*ls`, up to
+    `max` per index query, and store the continuation point in `*next` (when
+    non-null). For a collection that is neither temp nor meta, a start below
+    the separator first lists the parallel temp collection via a recursive
+    call and only continues into `c` once that listing reports "max" as its
+    next value. Returns 0 on success or a negative error. */
+ ghobject_t start = orig_start;
+ if (start.is_max())
+ return 0; // nothing can follow the max object
+
+ ghobject_t temp_next;
+ if (!next)
+ next = &temp_next; // caller does not want the cursor; use a local sink
+ // figure out the pool id. we need this in order to generate a
+ // meaningful 'next' value.
+ int64_t pool = -1;
+ shard_id_t shard;
+ {
+ spg_t pgid;
+ if (c.is_temp(&pgid)) {
+ pool = -2 - pgid.pool(); // temp collections use a pool id derived as -2 - pool
+ shard = pgid.shard;
+ } else if (c.is_pg(&pgid)) {
+ pool = pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_meta()) {
+ pool = -1;
+ shard = shard_id_t::NO_SHARD;
+ } else {
+ // hrm, the caller is test code! we should kill it off. for now,
+ // tolerate it.
+ pool = 0;
+ shard = shard_id_t::NO_SHARD;
+ }
+ dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard
+ << " pgid " << pgid << dendl;
+ }
+ ghobject_t sep; // separator: starts below it go through the temp collection first
+ sep.hobj.pool = -1;
+ sep.set_shard(shard);
+ if (!c.is_temp() && !c.is_meta()) {
+ if (start < sep) {
+ dout(10) << __FUNC__ << ": first checking temp pool" << dendl;
+ coll_t temp = c.get_temp();
+ int r = collection_list(temp, start, end, max, ls, next);
+ if (r < 0)
+ return r;
+ if (*next != ghobject_t::get_max())
+ return r; // temp listing is not exhausted yet; caller resumes from *next
+ start = sep;
+ dout(10) << __FUNC__ << ": fall through to non-temp collection, start "
+ << start << dendl;
+ } else {
+ dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl;
+ }
+ }
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ r = index->collection_list_partial(start, end, max, ls, next);
+
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ dout(20) << "objects: " << *ls << dendl;
+
+ // HashIndex doesn't know the pool when constructing a 'next' value
+ if (!next->is_max()) {
+ next->hobj.pool = pool;
+ next->set_shard(shard);
+ dout(20) << " next " << *next << dendl;
+ }
+
+ return 0;
+}
+
+// Fetch the omap header and all key/value pairs of `hoid` through
+// object_map->get(). The object must be found by lfn_find() in its
+// collection index first. Returns 0 on success (a -ENOENT from object_map is
+// treated as success) or a negative error.
+int FileStore::omap_get(CollectionHandle& ch, const ghobject_t &hoid,
+                        bufferlist *header,
+                        map<string, bufferlist> *out)
+{
+  tracepoint(objectstore, omap_get_enter, ch->cid.c_str());
+  const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(hoid);
+
+  Index index;
+  int ret = get_index(c, &index);
+  if (ret < 0) {
+    return ret;
+  }
+  {
+    // hold the index read lock only for the lookup itself
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = object_map->get(hoid, header, out);
+  const bool hard_failure = (ret < 0) && (ret != -ENOENT);
+  if (hard_failure) {
+    if (ret == -EIO && m_filestore_fail_eio) {
+      handle_eio();
+    }
+    return ret;
+  }
+  tracepoint(objectstore, omap_get_exit, 0);
+  return 0;
+}
+
+// Fetch only the omap header of `hoid` into `*bl` through
+// object_map->get_header(). Returns 0 on success (a -ENOENT from object_map
+// is treated as success) or a negative error. An -EIO failure trips an
+// assertion unless `allow_eio` is set or m_filestore_fail_eio is off.
+int FileStore::omap_get_header(
+  CollectionHandle& ch,
+  const ghobject_t &hoid,
+  bufferlist *bl,
+  bool allow_eio)
+{
+  tracepoint(objectstore, omap_get_header_enter, ch->cid.c_str());
+  const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(hoid);
+
+  Index index;
+  int ret = get_index(c, &index);
+  if (ret < 0) {
+    return ret;
+  }
+  {
+    // hold the index read lock only for the lookup itself
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = object_map->get_header(hoid, bl);
+  const bool hard_failure = (ret < 0) && (ret != -ENOENT);
+  if (hard_failure) {
+    ceph_assert(allow_eio || !m_filestore_fail_eio || ret != -EIO);
+    return ret;
+  }
+  tracepoint(objectstore, omap_get_header_exit, 0);
+  return 0;
+}
+
+// Collect the omap keys of `hoid` into `*keys` through
+// object_map->get_keys(). Returns 0 on success (a -ENOENT from object_map is
+// treated as success) or a negative error.
+int FileStore::omap_get_keys(CollectionHandle& ch, const ghobject_t &hoid, set<string> *keys)
+{
+  tracepoint(objectstore, omap_get_keys_enter, ch->cid.c_str());
+  const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(hoid);
+
+  Index index;
+  int ret = get_index(c, &index);
+  if (ret < 0) {
+    return ret;
+  }
+  {
+    // hold the index read lock only for the lookup itself
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = object_map->get_keys(hoid, keys);
+  const bool hard_failure = (ret < 0) && (ret != -ENOENT);
+  if (hard_failure) {
+    if (ret == -EIO && m_filestore_fail_eio) {
+      handle_eio();
+    }
+    return ret;
+  }
+  tracepoint(objectstore, omap_get_keys_exit, 0);
+  return 0;
+}
+
+int FileStore::omap_get_values(CollectionHandle& ch, const ghobject_t &hoid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{ /* Look up the omap values for `keys` on `hoid` through
+    object_map->get_values(), storing them in `*out`. Returns 0 on success
+    (a -ENOENT from object_map is treated as success) or a negative error;
+    the final debug line names the step that failed via `where`. */
+ tracepoint(objectstore, omap_get_values_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ const char *where = "()"; // tag appended to the exit log; overwritten by the failing step
+ int r = get_index(c, &index);
+ if (r < 0) {
+ where = " (get_index)";
+ goto out;
+ }
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock); // read lock held only for the lfn_find() lookup
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ where = " (lfn_find)";
+ goto out;
+ }
+ }
+ r = object_map->get_values(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ where = " (get_values)";
+ goto out;
+ }
+ r = 0; // success, including the -ENOENT case
+ out:
+ tracepoint(objectstore, omap_get_values_exit, r);
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r
+ << where << dendl;
+ return r;
+}
+
+// Ask object_map->check_keys() about `keys` on `hoid`, with the result set
+// written to `*out`. Returns 0 on success (a -ENOENT from object_map is
+// treated as success) or a negative error.
+int FileStore::omap_check_keys(CollectionHandle& ch, const ghobject_t &hoid,
+                               const set<string> &keys,
+                               set<string> *out)
+{
+  tracepoint(objectstore, omap_check_keys_enter, ch->cid.c_str());
+  const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(hoid);
+
+  Index index;
+  int ret = get_index(c, &index);
+  if (ret < 0) {
+    return ret;
+  }
+  {
+    // hold the index read lock only for the lookup itself
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = object_map->check_keys(hoid, keys, out);
+  const bool hard_failure = (ret < 0) && (ret != -ENOENT);
+  if (hard_failure) {
+    if (ret == -EIO && m_filestore_fail_eio) {
+      handle_eio();
+    }
+    return ret;
+  }
+  tracepoint(objectstore, omap_check_keys_exit, 0);
+  return 0;
+}
+
+// Handle-based overload: call wait_for_apply(oid) on the handle's sequencer,
+// then delegate to the coll_t overload.
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(
+  CollectionHandle& ch,
+  const ghobject_t &oid)
+{
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(oid);
+
+  const coll_t& cid = ch->cid;
+  return get_omap_iterator(cid, oid);
+}
+
+// Return an omap iterator for `hoid`, or a default-constructed iterator when
+// the collection index cannot be obtained or lfn_find() fails for the object.
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c,
+                                                          const ghobject_t &hoid)
+{
+  tracepoint(objectstore, get_omap_iterator, _c.c_str());
+  const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  Index index;
+  int ret = get_index(c, &index);
+  if (ret < 0) {
+    dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+             << "(get_index failed with " << cpp_strerror(ret) << ")" << dendl;
+    return ObjectMap::ObjectMapIterator();
+  }
+
+  {
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+    if (ret < 0) {
+      dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+               << "(lfn_find failed with " << cpp_strerror(ret) << ")" << dendl;
+      return ObjectMap::ObjectMapIterator();
+    }
+  }
+
+  return object_map->get_iterator(hoid);
+}
+
+// Apply an "expected number of objects" hint to collection `c` by calling
+// pre_hash_collection() on its index. The hint is only applied when the
+// collection is empty or we are replaying; otherwise it is logged and
+// skipped with a return value of 0. Returns a negative error on failure.
+int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+                                                  uint64_t expected_num_objs,
+                                                  const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": collection: " << c << " pg number: "
+           << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+  bool is_empty;
+  int rc = collection_empty(c, &is_empty);
+  if (rc < 0) {
+    return rc;
+  }
+  if (!(is_empty || replaying)) {
+    dout(0) << "Failed to give an expected number of objects hint to collection : "
+            << c << ", only empty collection can take such type of hint. " << dendl;
+    return 0;
+  }
+
+  Index index;
+  rc = get_index(c, &index);
+  if (rc < 0) {
+    return rc;
+  }
+
+  // Pre-hash the collection
+  rc = index->pre_hash_collection(pg_num, expected_num_objs);
+  dout(10) << "pre_hash_collection " << c << " = " << rc << dendl;
+  if (rc < 0) {
+    return rc;
+  }
+
+  _set_replay_guard(c, spos);
+  return 0;
+}
+
+// Create the directory for collection `c`, initialise its index and record
+// `bits` on it. For a collection that is neither meta nor temp, the parallel
+// temp collection is created as well (with bits 0). An already-existing
+// directory is tolerated while replaying. Returns 0 or a negative error.
+int FileStore::_create_collection(
+  const coll_t& c,
+  int bits,
+  const SequencerPosition &spos)
+{
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  int ret = ::mkdir(path, 0755);
+  if (ret < 0) {
+    ret = -errno;
+    if (ret == -EEXIST && replaying) {
+      ret = 0;
+    }
+  }
+  dout(10) << __FUNC__ << ": " << path << " = " << ret << dendl;
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = init_index(c);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = _collection_set_bits(c, bits);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // create parallel temp collection, too
+  const bool needs_temp_twin = !(c.is_meta() || c.is_temp());
+  if (needs_temp_twin) {
+    coll_t temp_twin = c.get_temp();
+    ret = _create_collection(temp_twin, 0, spos);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  _set_replay_guard(c, spos);
+  return 0;
+}
+
+int FileStore::_destroy_collection(const coll_t& c)
+{ /* Remove collection `c`: call prep_delete() on its index under the index
+    write lock, then rmdir() its directory. For a collection that is neither
+    meta nor temp the parallel temp collection is destroyed too -- this
+    happens even when an earlier step failed, because every failure jumps to
+    `out`. Returns 0 or a negative error; a failure destroying the temp
+    collection replaces any earlier error in the return value. */
+ int r = 0;
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ {
+ Index from;
+ r = get_index(c, &from);
+ if (r < 0)
+ goto out;
+ ceph_assert(from.index);
+ RWLock::WLocker l((from.index)->access_lock); // write lock is released at the end of this scope, before rmdir()
+
+ r = from->prep_delete();
+ if (r < 0)
+ goto out;
+ }
+ r = ::rmdir(fn);
+ if (r < 0) {
+ r = -errno;
+ goto out;
+ }
+
+ out:
+ // destroy parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ int r2 = _destroy_collection(temp);
+ if (r2 < 0) {
+ r = r2; // the temp collection's error takes precedence over r
+ goto out_final;
+ }
+ }
+
+ out_final:
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o,
+ const SequencerPosition& spos)
+{ /* Link object `o` from collection `oldcid` into collection `c` under the
+    same name, bracketed by a replay guard set on the source fd. Returns 0
+    when either replay-guard check is negative or when the source cannot be
+    opened (asserted to happen only while replaying); otherwise returns the
+    lfn_link() result, with -EEXIST mapped to 0 while replaying on a backend
+    that cannot checkpoint. */
+ dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
+
+ int dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ int srccmp = _check_replay_guard(oldcid, o, spos);
+ if (srccmp < 0)
+ return 0;
+
+ // open guard on object so we don't replay any previous operations on the
+ // new name that will modify the source inode.
+ FDRef fd;
+ int r = lfn_open(oldcid, o, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ ceph_assert(replaying);
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << o << " (dne, continue replay) " << dendl;
+ return 0;
+ }
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, o, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ _inject_failure(); // test hook: may terminate the process at this point
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos);
+ }
+ lfn_close(fd);
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos,
+ bool allow_enoent)
+{ /* Move object `oldcid/oldoid` to `c/o`: link the file under the new name,
+    rename its omap content, unlink the old name, and close the replay guard
+    on the new name. When the destination collection is missing during
+    replay, or the destination guard check is negative, only the source is
+    removed (out_rm_src). A missing source aborts unless we are replaying or
+    `allow_enoent` is set. Returns 0 or a negative error. */
+ dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
+ int r = 0;
+ int dstcmp, srccmp;
+
+ if (replaying) {
+ /* If the destination collection doesn't exist during replay,
+ * we need to delete the src object and continue on
+ */
+ if (!collection_exists(c))
+ goto out_rm_src;
+ }
+
+ dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ goto out_rm_src;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ srccmp = _check_replay_guard(oldcid, oldoid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ {
+ // open guard on object so we don't replay any previous operations on the
+ // new name that will modify the source inode.
+ FDRef fd;
+ r = lfn_open(oldcid, oldoid, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ if (replaying) {
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
+ } else if (allow_enoent) {
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, ignoring enoent)"
+ << dendl;
+ } else {
+ ceph_abort_msg("ERROR: source must exist");
+ }
+
+ if (!replaying) {
+ return 0; // only reachable with allow_enoent set
+ }
+ if (allow_enoent && dstcmp > 0) { // if dstcmp == 0, try_rename was started.
+ return 0;
+ }
+
+ r = 0; // don't know if object_map was cloned
+ } else {
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, oldoid, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ lfn_close(fd);
+ fd = FDRef(); // reset so fd can be reused for the destination below
+
+ _inject_failure();
+ }
+
+ if (r == 0) {
+ // the name changed; link the omap content
+ r = object_map->rename(oldoid, o, &spos);
+ if (r == -ENOENT)
+ r = 0;
+ }
+
+ _inject_failure();
+
+ if (r == 0)
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+
+ if (r == 0)
+ r = lfn_open(c, o, 0, &fd);
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos, &o);
+ lfn_close(fd);
+ }
+ }
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+
+ out_rm_src:
+ // remove source
+ if (_check_replay_guard(oldcid, oldoid, spos) > 0) {
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+ }
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+}
+
+// Failure-injection hook: while m_filestore_kill_at is non-zero, each call
+// decrements it; the call that brings it to zero flushes the log and exits
+// the process with status 1.
+void FileStore::_inject_failure()
+{
+  if (!m_filestore_kill_at) {
+    return;
+  }
+
+  int remaining = --m_filestore_kill_at;
+  dout(5) << __FUNC__ << ": " << (remaining+1) << " -> " << remaining << dendl;
+  if (remaining != 0) {
+    return;
+  }
+
+  derr << __FUNC__ << ": KILLING" << dendl;
+  cct->_log->flush();
+  _exit(1);
+}
+
+// Clear the omap keys and header of `hoid` through
+// object_map->clear_keys_header(). The object must be found by lfn_find()
+// first. Returns 0 on success (a -ENOENT from object_map is treated as
+// success) or a negative error.
+int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid,
+                           const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+
+  Index index;
+  int ret = get_index(cid, &index);
+  if (ret < 0) {
+    return ret;
+  }
+  {
+    // hold the index read lock only for the lookup itself
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = object_map->clear_keys_header(hoid, &spos);
+  const bool hard_failure = (ret < 0) && (ret != -ENOENT);
+  return hard_failure ? ret : 0;
+}
+
+// Store the key/value pairs in `aset` as omap entries of `hoid` through
+// object_map->set_keys(). Unless `hoid` is a pgmeta object, the object must
+// first be found by lfn_find() in the index of `cid`. Returns the
+// set_keys() result, or the negative error of the existence check.
+int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid,
+                             const map<string, bufferlist> &aset,
+                             const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+  Index index;
+  int ret;
+
+  //treat pgmeta as a logical object, skip to check exist
+  if (!hoid.is_pgmeta()) {
+    ret = get_index(cid, &index);
+    if (ret < 0) {
+      dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+    {
+      ceph_assert(index.index);
+      RWLock::RLocker read_guard((index.index)->access_lock);
+      ret = lfn_find(hoid, index);
+      if (ret < 0) {
+        dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(ret) << dendl;
+        return ret;
+      }
+    }
+  }
+
+  // per-key logging is only produced when level 20 is being gathered
+  if (g_conf()->subsys.should_gather<ceph_subsys_filestore, 20>()) {
+    for (auto& entry : aset) {
+      dout(20) << __FUNC__ << ": set " << entry.first << dendl;
+    }
+  }
+
+  ret = object_map->set_keys(hoid, aset, &spos);
+  dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << ret << dendl;
+  return ret;
+}
+
+// Remove the omap entries named in `keys` from `hoid` through
+// object_map->rm_keys(). Unless `hoid` is a pgmeta object, the object must
+// first be found by lfn_find() in the index of `cid`. Returns 0 on success
+// (a -ENOENT from object_map is treated as success) or a negative error.
+int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid,
+                            const set<string> &keys,
+                            const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+  Index index;
+  int ret;
+
+  //treat pgmeta as a logical object, skip to check exist
+  if (!hoid.is_pgmeta()) {
+    ret = get_index(cid, &index);
+    if (ret < 0) {
+      return ret;
+    }
+    {
+      ceph_assert(index.index);
+      RWLock::RLocker read_guard((index.index)->access_lock);
+      ret = lfn_find(hoid, index);
+    }
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  ret = object_map->rm_keys(hoid, keys, &spos);
+  const bool hard_failure = (ret < 0) && (ret != -ENOENT);
+  return hard_failure ? ret : 0;
+}
+
+// Remove every omap key k of `hoid` with first <= k < last. The keys are
+// collected with an omap iterator and then handed to _omap_rmkeys().
+// Returns -ENOENT when no iterator can be obtained, otherwise the
+// _omap_rmkeys() result.
+int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid,
+                                const string& first, const string& last,
+                                const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
+
+  set<string> doomed;
+  {
+    // the iterator is released at the end of this scope, before the removal
+    ObjectMap::ObjectMapIterator it = get_omap_iterator(cid, hoid);
+    if (!it) {
+      return -ENOENT;
+    }
+    it->lower_bound(first);
+    while (it->valid() && it->key() < last) {
+      doomed.insert(it->key());
+      it->next();
+    }
+  }
+  return _omap_rmkeys(cid, hoid, doomed, spos);
+}
+
+// Replace the omap header of `hoid` with `bl` through
+// object_map->set_header(). The object must be found by lfn_find() first.
+// Returns the set_header() result, or the negative error of the lookup.
+int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid,
+                               const bufferlist &bl,
+                               const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+
+  Index index;
+  int ret = get_index(cid, &index);
+  if (ret < 0) {
+    return ret;
+  }
+  {
+    // hold the index read lock only for the lookup itself
+    ceph_assert(index.index);
+    RWLock::RLocker read_guard((index.index)->access_lock);
+    ret = lfn_find(hoid, index);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  return object_map->set_header(hoid, bl, &spos);
+}
+
+// Merge collection `cid` into `dest`: set `bits` on dest, merge the main
+// index and the temp index of `cid` into the corresponding indexes of
+// `dest`, destroy `cid`, and close the replay guards on `dest`.
+//
+// A missing source or destination collection is tolerated (returns 0) and
+// asserted to happen only during replay. Returns 0 when either replay-guard
+// check is negative. Otherwise returns the first error hit while merging the
+// main or the temp collection (0 when both succeed); the temp merge is still
+// attempted after a main-merge failure, but it can no longer hide that
+// failure. When filestore_debug_verify_split is set and the merge succeeded,
+// every object listed in `dest` is asserted to match (bits, dest's ps).
+int FileStore::_merge_collection(const coll_t& cid,
+                                 uint32_t bits,
+                                 coll_t dest,
+                                 const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << " " << dest
+           << " bits " << bits << dendl;
+  int r = 0;
+
+  if (!collection_exists(cid)) {
+    dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+    ceph_assert(replaying);
+    return 0;
+  }
+  if (!collection_exists(dest)) {
+    dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+    ceph_assert(replaying);
+    return 0;
+  }
+
+  // set bits
+  if (_check_replay_guard(cid, spos) > 0)
+    _collection_set_bits(dest, bits);
+
+  spg_t pgid;
+  bool is_pg = dest.is_pg(&pgid);
+  ceph_assert(is_pg);
+
+  int dstcmp = _check_replay_guard(dest, spos);
+  if (dstcmp < 0)
+    return 0;
+
+  int srccmp = _check_replay_guard(cid, spos);
+  if (srccmp < 0)
+    return 0;
+
+  _set_global_replay_guard(cid, spos);
+  _set_replay_guard(cid, spos, true);
+  _set_replay_guard(dest, spos, true);
+
+  // main collection
+  {
+    Index from;
+    r = get_index(cid, &from);
+
+    Index to;
+    if (!r)
+      r = get_index(dest, &to);
+
+    if (!r) {
+      ceph_assert(from.index);
+      RWLock::WLocker l1((from.index)->access_lock);
+
+      ceph_assert(to.index);
+      RWLock::WLocker l2((to.index)->access_lock);
+
+      r = from->merge(bits, to.index);
+    }
+  }
+
+  // temp too
+  {
+    // use a separate result so a failure of the main merge above is not
+    // overwritten by a successful temp merge
+    Index from;
+    int temp_r = get_index(cid.get_temp(), &from);
+
+    Index to;
+    if (!temp_r)
+      temp_r = get_index(dest.get_temp(), &to);
+
+    if (!temp_r) {
+      ceph_assert(from.index);
+      RWLock::WLocker l1((from.index)->access_lock);
+
+      ceph_assert(to.index);
+      RWLock::WLocker l2((to.index)->access_lock);
+
+      temp_r = from->merge(bits, to.index);
+    }
+
+    if (!r)
+      r = temp_r;
+  }
+
+  // remove source
+  _destroy_collection(cid);
+
+  _close_replay_guard(dest, spos);
+  _close_replay_guard(dest.get_temp(), spos);
+  // no need to close guards on cid... it's removed.
+
+  if (!r && cct->_conf->filestore_debug_verify_split) {
+    vector<ghobject_t> objects;
+    ghobject_t next;
+    while (1) {
+      collection_list(
+        dest,
+        next, ghobject_t::get_max(),
+        get_ideal_list_max(),
+        &objects,
+        &next);
+      if (objects.empty())
+        break;
+      for (vector<ghobject_t>::iterator i = objects.begin();
+           i != objects.end();
+           ++i) {
+        if (!i->match(bits, pgid.pgid.ps())) {
+          // the objects were listed from dest and are checked against dest's pg
+          dout(20) << __FUNC__ << ": " << *i << " does not belong in "
+                   << dest << dendl;
+          ceph_assert(i->match(bits, pgid.pgid.ps()));
+        }
+      }
+      objects.clear();
+    }
+  }
+
+  dout(15) << __FUNC__ << ": " << cid << " " << dest << " bits " << bits
+           << " = " << r << dendl;
+  return r;
+}
+
+/**
+ * Split collection cid, moving the matching objects into dest.
+ *
+ * Under in-progress replay guards on both collections, asks the source
+ * index to split (rem, bits) into the destination index, then records the
+ * new bit count on cid. With filestore_debug_verify_split set, both
+ * collections are listed afterwards and every object is asserted to be on
+ * the correct side of the match.
+ *
+ * @param cid   source collection
+ * @param bits  bit count used for the split and stored on cid
+ * @param rem   remainder value passed to the split and to the verify match
+ * @param dest  destination collection
+ * @param spos  sequencer position used for the replay guards
+ * @return 0 when a collection is missing (replay only) or a replay guard
+ *         reports the operation as already applied; otherwise the result
+ *         of the index lookup / split
+ */
+int FileStore::_split_collection(const coll_t& cid,
+                                 uint32_t bits,
+                                 uint32_t rem,
+                                 coll_t dest,
+                                 const SequencerPosition &spos)
+{
+  int r;
+  {
+    dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl;
+    // A missing source or destination is only tolerated while replaying.
+    if (!collection_exists(cid)) {
+      dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+      ceph_assert(replaying);
+      return 0;
+    }
+    if (!collection_exists(dest)) {
+      dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+      ceph_assert(replaying);
+      return 0;
+    }
+
+    // A negative guard result means this position was already applied.
+    int dstcmp = _check_replay_guard(dest, spos);
+    if (dstcmp < 0)
+      return 0;
+
+    int srccmp = _check_replay_guard(cid, spos);
+    if (srccmp < 0)
+      return 0;
+
+    _set_global_replay_guard(cid, spos);
+    _set_replay_guard(cid, spos, true);
+    _set_replay_guard(dest, spos, true);
+
+    Index from;
+    r = get_index(cid, &from);
+
+    Index to;
+    if (!r)
+      r = get_index(dest, &to);
+
+    if (!r) {
+      ceph_assert(from.index);
+      RWLock::WLocker l1((from.index)->access_lock);
+
+      ceph_assert(to.index);
+      RWLock::WLocker l2((to.index)->access_lock);
+
+      r = from->split(rem, bits, to.index);
+    }
+
+    _close_replay_guard(cid, spos);
+    _close_replay_guard(dest, spos);
+  }
+  // The bit count is recorded whether or not the split succeeded.
+  _collection_set_bits(cid, bits);
+  if (!r && cct->_conf->filestore_debug_verify_split) {
+    vector<ghobject_t> objects;
+    ghobject_t next;
+    // Nothing left in the source may match (bits, rem).
+    while (1) {
+      collection_list(
+        cid,
+        next, ghobject_t::get_max(),
+        get_ideal_list_max(),
+        &objects,
+        &next);
+      if (objects.empty())
+        break;
+      for (const ghobject_t& obj : objects) {
+        dout(20) << __FUNC__ << ": " << obj << " still in source "
+                 << cid << dendl;
+        ceph_assert(!obj.match(bits, rem));
+      }
+      objects.clear();
+    }
+    // Everything in the destination must match (bits, rem).
+    next = ghobject_t();
+    while (1) {
+      collection_list(
+        dest,
+        next, ghobject_t::get_max(),
+        get_ideal_list_max(),
+        &objects,
+        &next);
+      if (objects.empty())
+        break;
+      for (const ghobject_t& obj : objects) {
+        // name the destination collection rather than repeating the object
+        dout(20) << __FUNC__ << ": " << obj << " now in dest "
+                 << dest << dendl;
+        ceph_assert(obj.match(bits, rem));
+      }
+      objects.clear();
+    }
+  }
+  return r;
+}
+
+/**
+ * Pass an allocation hint for an object to the backend.
+ *
+ * Nothing is done (and 0 is returned) when either expected size is 0.
+ * Otherwise the object is opened without creating it and the backend is
+ * given min(expected_write_size, m_filestore_max_alloc_hint_size).
+ *
+ * @return 0 when skipped, a negative value from lfn_open on failure,
+ *         otherwise the backend's set_alloc_hint result
+ */
+int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+                               uint64_t expected_object_size,
+                               uint64_t expected_write_size)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl;
+
+  FDRef fd;
+  int ret = 0;
+
+  const bool sizes_given = expected_object_size != 0 && expected_write_size != 0;
+  if (sizes_given) {
+    ret = lfn_open(cid, oid, false, &fd);
+    if (ret >= 0) {
+      // TODO: a more elaborate hint calculation
+      uint64_t hint = std::min<uint64_t>(expected_write_size, m_filestore_max_alloc_hint_size);
+
+      ret = backend->set_alloc_hint(**fd, hint);
+      dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl;
+
+      lfn_close(fd);
+    }
+  }
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl;
+  // with fail_eio enabled an EIO result is treated as fatal
+  ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+  return ret;
+}
+
+/**
+ * Configuration options this store wants change notifications for.
+ *
+ * @return null-terminated array of option names with static storage
+ */
+const char** FileStore::get_tracked_conf_keys() const
+{
+  static const char* tracked_keys[] = {
+    // xattr limits
+    "filestore_max_inline_xattr_size",
+    "filestore_max_inline_xattr_size_xfs",
+    "filestore_max_inline_xattr_size_btrfs",
+    "filestore_max_inline_xattr_size_other",
+    "filestore_max_inline_xattrs",
+    "filestore_max_inline_xattrs_xfs",
+    "filestore_max_inline_xattrs_btrfs",
+    "filestore_max_inline_xattrs_other",
+    "filestore_max_xattr_value_size",
+    "filestore_max_xattr_value_size_xfs",
+    "filestore_max_xattr_value_size_btrfs",
+    "filestore_max_xattr_value_size_other",
+    // sync intervals
+    "filestore_min_sync_interval",
+    "filestore_max_sync_interval",
+    // queue throttling
+    "filestore_queue_max_ops",
+    "filestore_queue_max_bytes",
+    "filestore_expected_throughput_bytes",
+    "filestore_expected_throughput_ops",
+    "filestore_queue_low_threshhold",
+    "filestore_queue_high_threshhold",
+    "filestore_queue_high_delay_multiple",
+    "filestore_queue_max_delay_multiple",
+    // miscellaneous
+    "filestore_commit_timeout",
+    "filestore_dump_file",
+    "filestore_kill_at",
+    "filestore_fail_eio",
+    "filestore_fadvise",
+    "filestore_sloppy_crc",
+    "filestore_sloppy_crc_block_size",
+    "filestore_max_alloc_hint_size",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+/**
+ * Apply configuration changes to the running store.
+ *
+ * xattr limits (only once a backend exists), throttle parameters and the
+ * cached scalar settings are refreshed under `lock`; the commit timeout is
+ * refreshed under `sync_entry_timeo_lock`; the dump file option starts or
+ * stops transaction dumping.
+ *
+ * @param conf     configuration to read the new values from
+ * @param changed  names of the options that changed
+ */
+void FileStore::handle_conf_change(const ConfigProxy& conf,
+                                   const std::set <std::string> &changed)
+{
+  // true as soon as one of the listed option names is in `changed`
+  auto any_changed = [&changed](std::initializer_list<const char*> names) {
+    for (const char *name : names) {
+      if (changed.count(name))
+        return true;
+    }
+    return false;
+  };
+
+  if (any_changed({"filestore_max_inline_xattr_size",
+                   "filestore_max_inline_xattr_size_xfs",
+                   "filestore_max_inline_xattr_size_btrfs",
+                   "filestore_max_inline_xattr_size_other",
+                   "filestore_max_inline_xattrs",
+                   "filestore_max_inline_xattrs_xfs",
+                   "filestore_max_inline_xattrs_btrfs",
+                   "filestore_max_inline_xattrs_other",
+                   "filestore_max_xattr_value_size",
+                   "filestore_max_xattr_value_size_xfs",
+                   "filestore_max_xattr_value_size_btrfs",
+                   "filestore_max_xattr_value_size_other"})) {
+    if (backend) {
+      Mutex::Locker l(lock);
+      set_xattr_limits_via_conf();
+    }
+  }
+
+  if (any_changed({"filestore_queue_max_bytes",
+                   "filestore_queue_max_ops",
+                   "filestore_expected_throughput_bytes",
+                   "filestore_expected_throughput_ops",
+                   "filestore_queue_low_threshhold",
+                   "filestore_queue_high_threshhold",
+                   "filestore_queue_high_delay_multiple",
+                   "filestore_queue_max_delay_multiple"})) {
+    Mutex::Locker l(lock);
+    set_throttle_params();
+  }
+
+  // Any change in this group refreshes all of the cached scalars at once.
+  if (any_changed({"filestore_min_sync_interval",
+                   "filestore_max_sync_interval",
+                   "filestore_kill_at",
+                   "filestore_fail_eio",
+                   "filestore_sloppy_crc",
+                   "filestore_sloppy_crc_block_size",
+                   "filestore_max_alloc_hint_size",
+                   "filestore_fadvise"})) {
+    Mutex::Locker l(lock);
+    m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
+    m_filestore_max_sync_interval = conf->filestore_max_sync_interval;
+    m_filestore_kill_at = conf->filestore_kill_at;
+    m_filestore_fail_eio = conf->filestore_fail_eio;
+    m_filestore_fadvise = conf->filestore_fadvise;
+    m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+    m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
+    m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
+  }
+
+  if (changed.count("filestore_commit_timeout")) {
+    Mutex::Locker l(sync_entry_timeo_lock);
+    m_filestore_commit_timeout = conf->filestore_commit_timeout;
+  }
+
+  if (changed.count("filestore_dump_file")) {
+    // an empty name or "-" turns dumping off
+    const bool want_dump = conf->filestore_dump_file.length() &&
+      conf->filestore_dump_file != "-";
+    if (want_dump) {
+      dump_start(conf->filestore_dump_file);
+    } else {
+      dump_stop();
+    }
+  }
+}
+
+/**
+ * Push the configured queue limits into the byte and op throttles.
+ *
+ * For each delay multiple the generic option is used when it is non-zero,
+ * otherwise the per-dimension (_bytes / _ops) option. Both throttles are
+ * always updated, and the resulting maxima are published to the perf
+ * counters.
+ *
+ * @return 0 when both throttles accepted their parameters, -EINVAL otherwise
+ */
+int FileStore::set_throttle_params()
+{
+  const auto& conf = cct->_conf;
+  stringstream ss;
+
+  const auto high_multiple_bytes = conf->filestore_queue_high_delay_multiple ?
+    conf->filestore_queue_high_delay_multiple :
+    conf->filestore_queue_high_delay_multiple_bytes;
+  const auto max_multiple_bytes = conf->filestore_queue_max_delay_multiple ?
+    conf->filestore_queue_max_delay_multiple :
+    conf->filestore_queue_max_delay_multiple_bytes;
+  const bool bytes_ok = throttle_bytes.set_params(
+    conf->filestore_queue_low_threshhold,
+    conf->filestore_queue_high_threshhold,
+    conf->filestore_expected_throughput_bytes,
+    high_multiple_bytes,
+    max_multiple_bytes,
+    conf->filestore_queue_max_bytes,
+    &ss);
+
+  const auto high_multiple_ops = conf->filestore_queue_high_delay_multiple ?
+    conf->filestore_queue_high_delay_multiple :
+    conf->filestore_queue_high_delay_multiple_ops;
+  const auto max_multiple_ops = conf->filestore_queue_max_delay_multiple ?
+    conf->filestore_queue_max_delay_multiple :
+    conf->filestore_queue_max_delay_multiple_ops;
+  // evaluated unconditionally, even when the byte throttle rejected its input
+  const bool ops_ok = throttle_ops.set_params(
+    conf->filestore_queue_low_threshhold,
+    conf->filestore_queue_high_threshhold,
+    conf->filestore_expected_throughput_ops,
+    high_multiple_ops,
+    max_multiple_ops,
+    conf->filestore_queue_max_ops,
+    &ss);
+
+  logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max());
+  logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max());
+
+  if (bytes_ok && ops_ok)
+    return 0;
+
+  derr << "tried to set invalid params: "
+       << ss.str()
+       << dendl;
+  return -EINVAL;
+}
+
+/**
+ * Start dumping transactions to the given file.
+ *
+ * Any dump already in progress is stopped first. Dumping is only flagged
+ * active when the file was actually opened; an open failure is logged and
+ * leaves dumping disabled.
+ *
+ * @param file  path of the dump file to open
+ */
+void FileStore::dump_start(const std::string& file)
+{
+  dout(10) << __FUNC__ << ": " << file << dendl;
+  if (m_filestore_do_dump) {
+    dump_stop();
+  }
+  m_filestore_dump.open(file.c_str());
+  if (!m_filestore_dump.is_open()) {
+    // do not pretend to dump into a stream that could not be opened
+    derr << __FUNC__ << ": failed to open dump file " << file << dendl;
+    return;
+  }
+  m_filestore_dump_fmt.reset();
+  m_filestore_dump_fmt.open_array_section("dump");
+  m_filestore_do_dump = true;
+}
+
+/**
+ * Stop dumping transactions; if the dump file is open, close the
+ * formatter section, write out what is buffered and close the file.
+ */
+void FileStore::dump_stop()
+{
+  dout(10) << __FUNC__ << dendl;
+  m_filestore_do_dump = false;
+  if (!m_filestore_dump.is_open())
+    return;
+
+  m_filestore_dump_fmt.close_section();
+  m_filestore_dump_fmt.flush(m_filestore_dump);
+  m_filestore_dump.flush();
+  m_filestore_dump.close();
+}
+
+/**
+ * Write a "transactions" array describing each transaction in ls to the
+ * dump stream and flush it.
+ *
+ * @param ls   transactions to dump
+ * @param seq  value emitted as "seq" for every transaction
+ * @param osr  sequencer whose cid is emitted as "osr"
+ */
+void FileStore::dump_transactions(vector<ObjectStore::Transaction>& ls, uint64_t seq, OpSequencer *osr)
+{
+  m_filestore_dump_fmt.open_array_section("transactions");
+  unsigned trans_num = 0;
+  for (auto& txn : ls) {
+    m_filestore_dump_fmt.open_object_section("transaction");
+    m_filestore_dump_fmt.dump_stream("osr") << osr->cid;
+    m_filestore_dump_fmt.dump_unsigned("seq", seq);
+    m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num);
+    txn.dump(&m_filestore_dump_fmt);
+    m_filestore_dump_fmt.close_section();
+    ++trans_num;
+  }
+  m_filestore_dump_fmt.close_section();
+  m_filestore_dump_fmt.flush(m_filestore_dump);
+  m_filestore_dump.flush();
+}
+
+/// Forward a statistics request to the database behind the object map;
+/// the database writes its statistics into formatter f.
+void FileStore::get_db_statistics(Formatter* f)
+{
+ object_map->db->get_statistics(f);
+}
+
+/**
+ * Recompute the inline-xattr limits from the configuration.
+ *
+ * Filesystem-specific defaults are chosen from m_fs_type (XFS and btrfs on
+ * Linux, "other" for everything else); each generic option overrides its
+ * default when it is non-zero. A warning is logged when the resulting
+ * maximum value size is below osd_max_object_name_len.
+ */
+void FileStore::set_xattr_limits_via_conf()
+{
+  const auto& conf = cct->_conf;
+
+  uint32_t fs_xattr_size;
+  uint32_t fs_xattrs;
+  uint32_t fs_xattr_max_value_size;
+
+  switch (m_fs_type) {
+#if defined(__linux__)
+  case XFS_SUPER_MAGIC:
+    fs_xattr_size = conf->filestore_max_inline_xattr_size_xfs;
+    fs_xattrs = conf->filestore_max_inline_xattrs_xfs;
+    fs_xattr_max_value_size = conf->filestore_max_xattr_value_size_xfs;
+    break;
+  case BTRFS_SUPER_MAGIC:
+    fs_xattr_size = conf->filestore_max_inline_xattr_size_btrfs;
+    fs_xattrs = conf->filestore_max_inline_xattrs_btrfs;
+    fs_xattr_max_value_size = conf->filestore_max_xattr_value_size_btrfs;
+    break;
+#endif
+  default:
+    fs_xattr_size = conf->filestore_max_inline_xattr_size_other;
+    fs_xattrs = conf->filestore_max_inline_xattrs_other;
+    fs_xattr_max_value_size = conf->filestore_max_xattr_value_size_other;
+    break;
+  }
+
+  // A non-zero generic option wins over the filesystem-specific default.
+  m_filestore_max_inline_xattr_size = conf->filestore_max_inline_xattr_size ?
+    conf->filestore_max_inline_xattr_size : fs_xattr_size;
+  m_filestore_max_inline_xattrs = conf->filestore_max_inline_xattrs ?
+    conf->filestore_max_inline_xattrs : fs_xattrs;
+  m_filestore_max_xattr_value_size = conf->filestore_max_xattr_value_size ?
+    conf->filestore_max_xattr_value_size : fs_xattr_max_value_size;
+
+  const bool value_size_ok =
+    !(m_filestore_max_xattr_value_size < conf->osd_max_object_name_len);
+  if (value_size_ok)
+    return;
+
+  derr << "WARNING: max attr value size ("
+       << m_filestore_max_xattr_value_size
+       << ") is smaller than osd_max_object_name_len ("
+       << conf->osd_max_object_name_len
+       << "). Your backend filesystem appears to not support attrs large "
+       << "enough to handle the configured max rados name size. You may get "
+       << "unexpected ENAMETOOLONG errors on rados operations or buggy "
+       << "behavior"
+       << dendl;
+}
+
+/**
+ * Estimate the space lost to filesystem allocation granularity.
+ *
+ * Assumes each object wastes, on average, half a filesystem block.
+ *
+ * @param num_objects  number of objects to account for
+ * @return num_objects * blk_size / 2
+ */
+uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects)
+{
+  return num_objects * blk_size / 2;
+}
+
+/**
+ * Ask the index of collection cid to apply its layout settings.
+ *
+ * @param cid           collection whose index is looked up
+ * @param target_level  value forwarded to the index
+ * @return the negative get_index() error, or the index's own result
+ */
+int FileStore::apply_layout_settings(const coll_t &cid, int target_level)
+{
+  dout(20) << __FUNC__ << ": " << cid << " target level: "
+           << target_level << dendl;
+  Index index;
+  const int r = get_index(cid, &index);
+  if (r >= 0)
+    return index->apply_layout_settings(target_level);
+
+  dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r)
+           << dendl;
+  return r;
+}
+
+
+// -- FSSuperblock --
+
+/// Serialize the superblock into bl: the compat feature set followed by
+/// the omap backend name, wrapped by ENCODE_START(2, 1, ...) /
+/// ENCODE_FINISH.
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(2, 1, bl);
+ compat_features.encode(bl);
+ encode(omap_backend, bl);
+ ENCODE_FINISH(bl);
+}
+
+/// Deserialize the superblock from bl. The omap backend name is only read
+/// when the encoded struct version is at least 2; for older encodings it
+/// is set to "leveldb".
+void FSSuperblock::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ compat_features.decode(bl);
+ if (struct_v >= 2)
+ decode(omap_backend, bl);
+ else
+ omap_backend = "leveldb";
+ DECODE_FINISH(bl);
+}
+
+/// Dump the superblock to formatter f as an object section named "compat"
+/// holding the compat features and the "omap_backend" string.
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->dump_string("omap_backend", omap_backend);
+ f->close_section();
+}
+
+/**
+ * Append three heap-allocated sample superblocks to o: a default one, one
+ * with the SHARDS incompat feature, and one that additionally names
+ * "rocksdb" as omap backend.
+ */
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+  FSSuperblock sample;
+
+  // 1) default-constructed
+  o.push_back(new FSSuperblock(sample));
+
+  // 2) incompat feature set containing SHARDS
+  CompatSet::FeatureSet compat;
+  CompatSet::FeatureSet ro_compat;
+  CompatSet::FeatureSet incompat;
+  incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  sample.compat_features = CompatSet(compat, ro_compat, incompat);
+  o.push_back(new FSSuperblock(sample));
+
+  // 3) same features, explicit omap backend
+  sample.omap_backend = "rocksdb";
+  o.push_back(new FSSuperblock(sample));
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore.osr(" << this << ") "
+
+/**
+ * Record every object touched by op o in the `applying` table, keyed by
+ * the object's hash. Registering the same op twice is a no-op.
+ */
+void FileStore::OpSequencer::_register_apply(Op *o)
+{
+  if (o->registered_apply) {
+    dout(20) << __func__ << " " << o << " already registered" << dendl;
+    return;
+  }
+  o->registered_apply = true;
+  for (auto& txn : o->tls) {
+    for (auto& entry : txn.get_object_index()) {
+      // the table stores a pointer to the key held by the transaction
+      const ghobject_t *obj = &entry.first;
+      const uint32_t hash = obj->hobj.get_hash();
+      applying.emplace(hash, obj);
+      dout(20) << __func__ << " " << o << " " << *obj << " ("
+               << obj << ")" << dendl;
+    }
+  }
+}
+
+/**
+ * Remove the `applying` entries that _register_apply() added for op o.
+ *
+ * Entries are identified by pointer identity with the object key held in
+ * the op's transactions; every object must be found (asserted).
+ */
+void FileStore::OpSequencer::_unregister_apply(Op *o)
+{
+  ceph_assert(o->registered_apply);
+  for (auto& t : o->tls) {
+    for (auto& i : t.get_object_index()) {
+      const uint32_t key = i.first.hobj.get_hash();
+      // equal_range() covers every entry with this hash; find() alone is
+      // not guaranteed to return the first one of the group.
+      const auto range = applying.equal_range(key);
+      bool removed = false;
+      for (auto p = range.first; p != range.second; ++p) {
+        if (p->second == &i.first) {
+          dout(20) << __func__ << " " << o << " " << i.first << " ("
+                   << &i.first << ")" << dendl;
+          applying.erase(p);
+          removed = true;
+          break;
+        }
+      }
+      ceph_assert(removed);
+    }
+  }
+}
+
+/**
+ * Block until no registered in-flight op references object oid.
+ *
+ * Takes qlock and waits on `cond` for as long as the `applying` table
+ * holds an entry equal to oid; the table is re-scanned after each wakeup.
+ */
+void FileStore::OpSequencer::wait_for_apply(const ghobject_t& oid)
+{
+  Mutex::Locker l(qlock);
+  const uint32_t key = oid.hobj.get_hash();
+  for (;;) {
+    // search all items in hash slot for a matching object
+    const ghobject_t *blocker = nullptr;
+    const auto range = applying.equal_range(key);
+    for (auto p = range.first; p != range.second; ++p) {
+      if (*p->second == oid) {
+        blocker = p->second;
+        break;
+      }
+    }
+    if (!blocker)
+      break;
+    dout(20) << __func__ << " " << oid << " waiting on " << blocker
+             << dendl;
+    // the table may have changed while we slept, so start over
+    cond.Wait(qlock);
+  }
+  dout(20) << __func__ << " " << oid << " done" << dendl;
+}
diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h
new file mode 100644
index 00000000..e09b9e04
--- /dev/null
+++ b/src/os/filestore/FileStore.h
@@ -0,0 +1,938 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILESTORE_H
+#define CEPH_FILESTORE_H
+
+#include "include/types.h"
+
+#include <map>
+#include <deque>
+#include <atomic>
+#include <fstream>
+
+
+#include <boost/scoped_ptr.hpp>
+
+#include "include/unordered_map.h"
+
+#include "include/ceph_assert.h"
+
+#include "os/ObjectStore.h"
+#include "JournalingObjectStore.h"
+
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/perf_counters.h"
+#include "common/zipkin_trace.h"
+
+#include "common/Mutex.h"
+#include "HashIndex.h"
+#include "IndexManager.h"
+#include "os/ObjectMap.h"
+#include "SequencerPosition.h"
+#include "FDCache.h"
+#include "WBThrottle.h"
+
+#include "include/uuid.h"
+
+#if defined(__linux__)
+# ifndef BTRFS_SUPER_MAGIC
+#define BTRFS_SUPER_MAGIC 0x9123683EUL
+# endif
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342UL
+# endif
+# ifndef ZFS_SUPER_MAGIC
+#define ZFS_SUPER_MAGIC 0x2fc12fc1UL
+# endif
+#endif
+
+
+class FileStoreBackend;
+
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+// Identifiers of the FileStore perf counters, numbered consecutively from
+// 84000. They are used as keys into the store's PerfCounters `logger`.
+// NOTE(review): l_filestore_first / l_filestore_last presumably delimit
+// the range registered with the perf counter builder -- confirm.
+enum {
+ l_filestore_first = 84000,
+ l_filestore_journal_queue_ops,
+ l_filestore_journal_queue_bytes,
+ l_filestore_journal_ops,
+ l_filestore_journal_bytes,
+ l_filestore_journal_latency,
+ l_filestore_journal_wr,
+ l_filestore_journal_wr_bytes,
+ l_filestore_journal_full,
+ l_filestore_committing,
+ l_filestore_commitcycle,
+ l_filestore_commitcycle_interval,
+ l_filestore_commitcycle_latency,
+ l_filestore_op_queue_max_ops,
+ l_filestore_op_queue_ops,
+ l_filestore_ops,
+ l_filestore_op_queue_max_bytes,
+ l_filestore_op_queue_bytes,
+ l_filestore_bytes,
+ l_filestore_apply_latency,
+ l_filestore_queue_transaction_latency_avg,
+ l_filestore_sync_pause_max_lat,
+ l_filestore_last,
+};
+
+/// Encodable superblock record: a set of compat features plus the name of
+/// the omap backend.
+class FSSuperblock {
+public:
+ CompatSet compat_features; ///< compat / ro-compat / incompat feature sets
+ string omap_backend; ///< omap backend name, e.g. "leveldb" or "rocksdb"
+
+ FSSuperblock() { }
+
+ /// append the encoded superblock to bl
+ void encode(bufferlist &bl) const;
+ /// read the superblock back from bl
+ void decode(bufferlist::const_iterator &bl);
+ /// write a structured description to f
+ void dump(Formatter *f) const;
+ /// append sample instances (allocated with new) to o
+ static void generate_test_instances(list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+/// Print a superblock as "sb(<compat features>): <omap backend>".
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{
+  out << "sb(" << sb.compat_features << "): ";
+  out << sb.omap_backend;
+  return out;
+}
+
+class FileStore : public JournalingObjectStore,
+ public md_config_obs_t
+{
+ static const uint32_t target_version = 4;
+public:
+ /// Returns the compile-time constant target_version.
+ uint32_t get_target_version() {
+ return target_version;
+ }
+
+ static int get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid);
+  /// Running averages of the commit and apply latencies (nanoseconds).
+  struct FSPerfTracker {
+    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+    /// Snapshot of the current averages of both trackers.
+    objectstore_perf_stat_t get_cur_stats() const {
+      objectstore_perf_stat_t snapshot;
+      snapshot.os_commit_latency_ns = os_commit_latency_ns.current_avg();
+      snapshot.os_apply_latency_ns = os_apply_latency_ns.current_avg();
+      return snapshot;
+    }
+
+    /// Refresh the trackers from the given perf counters.
+    void update_from_perfcounters(PerfCounters &logger);
+  } perf_tracker;
+ /// Refresh perf_tracker from the perf counters in `logger`, then return
+ /// its current latency averages.
+ objectstore_perf_stat_t get_cur_stats() override {
+ perf_tracker.update_from_perfcounters(*logger);
+ return perf_tracker.get_cur_stats();
+ }
+ /// Returns the store's perf counter object (`logger`) without
+ /// transferring ownership.
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+
+private:
+ string internal_name; ///< internal name, used to name the perfcounter instance
+ string basedir, journalpath;
+ osflagbits_t generic_flags;
+ std::string current_fn;
+ std::string current_op_seq_fn;
+ std::string omap_dir;
+ uuid_d fsid;
+
+ size_t blk_size; ///< fs block size
+
+ int fsid_fd, op_fd, basedir_fd, current_fd;
+
+ FileStoreBackend *backend;
+
+ void create_backend(unsigned long f_type);
+
+ string devname;
+
+ int vdo_fd = -1;
+ string vdo_name;
+
+ deque<uint64_t> snaps;
+
+ // Indexed Collections
+ IndexManager index_manager;
+ int get_index(const coll_t& c, Index *index);
+ int init_index(const coll_t& c);
+
+  /// True when cid is a PG collection and the object's pool id is -1 or
+  /// lower:
+  /// - normal temp case: cid is pg, object is temp (pool < -1)
+  /// - hammer temp case: cid is pg (or already temp), object pool is -1
+  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
+    if (!cid.is_pg())
+      return false;
+    return oid.hobj.pool <= -1;
+  }
+ void init_temp_collections();
+
+ void handle_eio();
+
+ // ObjectMap
+ boost::scoped_ptr<ObjectMap> object_map;
+
+ // helper fns
+ int get_cdir(const coll_t& cid, char *s, int len);
+
+ /// read a uuid from fd
+ int read_fsid(int fd, uuid_d *uuid);
+
+ /// lock fsid_fd
+ int lock_fsid();
+
+ // sync thread
+ Mutex lock;
+ bool force_sync;
+ Cond sync_cond;
+
+ Mutex sync_entry_timeo_lock;
+ SafeTimer timer;
+
+ list<Context*> sync_waiters;
+ bool stop;
+ void sync_entry();
+  /// Thread whose body runs FileStore::sync_entry() on the owning store.
+  struct SyncThread : public Thread {
+    FileStore *fs;
+
+    explicit SyncThread(FileStore *f) : fs(f) {}
+
+    /// Thread entry point; the result is always a null pointer.
+    void *entry() override {
+      fs->sync_entry();
+      return nullptr;
+    }
+  } sync_thread;
+
+ // -- op workqueue --
+  /// One queued unit of work: a batch of transactions plus its
+  /// bookkeeping.
+  struct Op {
+    utime_t start;
+    uint64_t op;                  ///< sequence number (compared by OpSequencer)
+    vector<Transaction> tls;      ///< transactions carried by this op
+    Context *onreadable;
+    Context *onreadable_sync;
+    uint64_t ops;
+    uint64_t bytes;
+    TrackedOpRef osd_op;
+    ZTracer::Trace trace;
+    bool registered_apply = false; ///< set once _register_apply() has run
+  };
+  /**
+   * Per-collection operation sequencer.
+   *
+   * Keeps two queues under qlock: jq holds the sequence numbers of ops
+   * handed to the journal, q holds the ops queued for apply. flush() and
+   * flush_commit() use these queues to find out when previously queued
+   * work has drained; `applying` indexes the objects touched by
+   * registered ops so wait_for_apply() can block on them.
+   */
+  class OpSequencer : public CollectionImpl {
+    CephContext *cct;
+    Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+    list<Op*> q;
+    list<uint64_t> jq;
+    list<pair<uint64_t, Context*> > flush_commit_waiters;
+    Cond cond;
+    string osr_name_str;
+    /// hash of pointers to ghobject_t's for in-flight writes
+    unordered_multimap<uint32_t,const ghobject_t*> applying;
+  public:
+    Mutex apply_lock; // for apply mutual exclusion
+    int id;
+    const char *osr_name;
+
+    /**
+     * Highest sequence number still present in either queue.
+     *
+     * Caller must hold qlock.
+     *
+     * @param seq  [out] max uncompleted seq, 0 when both queues are empty
+     * @returns true if both queues are empty
+     */
+    bool _get_max_uncompleted(
+      uint64_t *seq ///< [out] max uncompleted seq
+      ) {
+      ceph_assert(qlock.is_locked());
+      ceph_assert(seq);
+      *seq = 0;
+      if (q.empty() && jq.empty())
+        return true;
+
+      if (!q.empty())
+        *seq = q.back()->op;
+      if (!jq.empty() && jq.back() > *seq)
+        *seq = jq.back();
+
+      return false;
+    }
+
+    /**
+     * Lowest sequence number still present in either queue.
+     *
+     * Caller must hold qlock.
+     *
+     * @param seq  [out] min uncompleted seq, 0 when both queues are empty
+     * @returns true if both queues are empty
+     */
+    bool _get_min_uncompleted(
+      uint64_t *seq ///< [out] min uncompleted seq
+      ) {
+      ceph_assert(qlock.is_locked());
+      ceph_assert(seq);
+      *seq = 0;
+      if (q.empty() && jq.empty())
+        return true;
+
+      if (!q.empty())
+        *seq = q.front()->op;
+      // Use the journal queue's head when it is the only candidate or the
+      // smaller one. Comparing it against the initial 0 alone would never
+      // select it while q is empty.
+      if (!jq.empty() && (q.empty() || jq.front() < *seq))
+        *seq = jq.front();
+
+      return false;
+    }
+
+    /// Move every flush_commit waiter whose seq is below the minimum
+    /// uncompleted seq to to_queue (all of them when both queues are
+    /// empty). Caller must hold qlock.
+    void _wake_flush_waiters(list<Context*> *to_queue) {
+      uint64_t seq;
+      if (_get_min_uncompleted(&seq))
+        seq = -1; // wraps to the largest value: nothing is outstanding
+
+      auto i = flush_commit_waiters.begin();
+      while (i != flush_commit_waiters.end() && i->first < seq) {
+        to_queue->push_back(i->second);
+        i = flush_commit_waiters.erase(i);
+      }
+    }
+
+    /// Record op o as handed to the journal and register its objects.
+    void queue_journal(Op *o) {
+      Mutex::Locker l(qlock);
+      jq.push_back(o->op);
+      _register_apply(o);
+    }
+    /// Drop the oldest journal entry, wake waiters on `cond` and collect
+    /// the flush_commit waiters that became ready.
+    void dequeue_journal(list<Context*> *to_queue) {
+      Mutex::Locker l(qlock);
+      jq.pop_front();
+      cond.Signal();
+      _wake_flush_waiters(to_queue);
+    }
+    /// Queue op o for apply and register its objects.
+    void queue(Op *o) {
+      Mutex::Locker l(qlock);
+      q.push_back(o);
+      _register_apply(o);
+      o->trace.keyval("queue depth", q.size());
+    }
+    void _register_apply(Op *o);
+    void _unregister_apply(Op *o);
+    void wait_for_apply(const ghobject_t& oid);
+    /// Oldest op queued for apply; apply_lock must be held.
+    Op *peek_queue() {
+      Mutex::Locker l(qlock);
+      ceph_assert(apply_lock.is_locked());
+      return q.front();
+    }
+
+    /// Remove and return the oldest op queued for apply, unregister its
+    /// objects and collect the flush_commit waiters that became ready.
+    /// apply_lock must be held.
+    Op *dequeue(list<Context*> *to_queue) {
+      ceph_assert(to_queue);
+      ceph_assert(apply_lock.is_locked());
+      Mutex::Locker l(qlock);
+      Op *o = q.front();
+      q.pop_front();
+      cond.Signal();
+      _unregister_apply(o);
+      _wake_flush_waiters(to_queue);
+      return o;
+    }
+
+    /// Wait until everything queued at the time of the call has left both
+    /// queues. Never returns while filestore_blackhole is set.
+    void flush() override {
+      Mutex::Locker l(qlock);
+
+      while (cct->_conf->filestore_blackhole)
+        cond.Wait(qlock); // wait forever
+
+
+      // get max for journal _or_ op queues
+      uint64_t seq = 0;
+      if (!q.empty())
+        seq = q.back()->op;
+      if (!jq.empty() && jq.back() > seq)
+        seq = jq.back();
+
+      if (seq) {
+        // everything prior to our watermark to drain through either/both queues
+        while ((!q.empty() && q.front()->op <= seq) ||
+               (!jq.empty() && jq.front() <= seq))
+          cond.Wait(qlock);
+      }
+    }
+    /// Returns true when nothing is outstanding; otherwise registers c as
+    /// a waiter on the current maximum uncompleted seq and returns false.
+    bool flush_commit(Context *c) override {
+      Mutex::Locker l(qlock);
+      uint64_t seq = 0;
+      if (_get_max_uncompleted(&seq)) {
+        return true;
+      } else {
+        flush_commit_waiters.push_back(make_pair(seq, c));
+        return false;
+      }
+    }
+
+    OpSequencer(CephContext* cct, int i, coll_t cid)
+      : CollectionImpl(cid),
+        cct(cct),
+        qlock("FileStore::OpSequencer::qlock", false, false),
+        osr_name_str(stringify(cid)),
+        apply_lock("FileStore::OpSequencer::apply_lock", false, false),
+        id(i),
+        osr_name(osr_name_str.c_str()) {}
+    /// The apply queue must be empty on destruction.
+    ~OpSequencer() override {
+      ceph_assert(q.empty());
+    }
+  };
+ typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+ Mutex coll_lock;
+ map<coll_t,OpSequencerRef> coll_map;
+
+ friend ostream& operator<<(ostream& out, const OpSequencer& s);
+
+ FDCache fdcache;
+ WBThrottle wbthrottle;
+
+ std::atomic<int64_t> next_osr_id = { 0 };
+ bool m_disable_wbthrottle;
+ deque<OpSequencer*> op_queue;
+ BackoffThrottle throttle_ops, throttle_bytes;
+ const int m_ondisk_finisher_num;
+ const int m_apply_finisher_num;
+ vector<Finisher*> ondisk_finishers;
+ vector<Finisher*> apply_finishers;
+
+ ThreadPool op_tp;
+  /// Work queue that feeds OpSequencers from FileStore::op_queue to the
+  /// thread pool.
+  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
+    FileStore *store;
+
+    OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
+      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}
+
+    /// Append a sequencer to the store's queue; always succeeds.
+    bool _enqueue(OpSequencer *osr) override {
+      store->op_queue.push_back(osr);
+      return true;
+    }
+    /// Removing a specific sequencer is not supported.
+    void _dequeue(OpSequencer *o) override {
+      ceph_abort();
+    }
+    bool _empty() override {
+      return store->op_queue.empty();
+    }
+    /// Pop the oldest queued sequencer, or nullptr when there is none.
+    OpSequencer *_dequeue() override {
+      auto& pending = store->op_queue;
+      if (pending.empty())
+        return nullptr;
+      OpSequencer *next = pending.front();
+      pending.pop_front();
+      return next;
+    }
+    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
+      store->_do_op(osr, handle);
+    }
+    void _process_finish(OpSequencer *osr) override {
+      store->_finish_op(osr);
+    }
+    /// Expects the queue to be empty already.
+    void _clear() override {
+      ceph_assert(store->op_queue.empty());
+    }
+  } op_wq;
+
+ void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
+ void _finish_op(OpSequencer *o);
+ Op *build_op(vector<Transaction>& tls,
+ Context *onreadable, Context *onreadable_sync,
+ TrackedOpRef osd_op);
+ void queue_op(OpSequencer *osr, Op *o);
+ void op_queue_reserve_throttle(Op *o);
+ void op_queue_release_throttle(Op *o);
+ void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
+ friend struct C_JournaledAhead;
+
+ void new_journal();
+
+ PerfCounters *logger;
+
+ ZTracer::Endpoint trace_endpoint;
+
+public:
+ int lfn_find(const ghobject_t& oid, const Index& index,
+ IndexedPath *path = nullptr);
+ int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
+ int lfn_open(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index = nullptr);
+
+ void lfn_close(FDRef fd);
+ int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+ int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
+ bool force_clear_omap=false);
+
+public:
+ FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
+ osflagbits_t flags = 0,
+ const char *internal_name = "filestore", bool update_to=false);
+ ~FileStore() override;
+
+ /// Returns the object store type name, always "filestore".
+ string get_type() override {
+ return "filestore";
+ }
+
+ int _detect_fs();
+ int _sanity_check_fs();
+
+ bool test_mount_in_use() override;
+ int read_op_seq(uint64_t *seq);
+ int write_op_seq(int, uint64_t seq);
+ int mount() override;
+ int umount() override;
+
+ int validate_hobject_key(const hobject_t &obj) const override;
+
+ /// Returns the longest attribute name this store accepts (fixed at 100).
+ unsigned get_max_attr_name_length() override {
+ // xattr limit is 128; leave room for our prefixes (user.ceph._),
+ // some margin, and cap at 100
+ return 100;
+ }
+ int mkfs() override;
+ int mkjournal() override;
+ // Journal policy: this store wants and allows a journal but reports
+ // that it does not need one.
+ bool wants_journal() override {
+ return true;
+ }
+ bool allows_journal() override {
+ return true;
+ }
+ bool needs_journal() override {
+ return false;
+ }
+
+ /// Always false for this store.
+ /// NOTE(review): presumably tells callers that onreadable callbacks are
+ /// not delivered synchronously -- confirm against ObjectStore.
+ bool is_sync_onreadable() const override {
+ return false;
+ }
+
+ bool is_rotational() override;
+ bool is_journal_rotational() override;
+
+ /// Dump the store's perf counters to f inside an object section named
+ /// "perf_counters".
+ void dump_perf_counters(Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+
+ int flush_cache(ostream *os = NULL) override;
+ int write_version_stamp();
+ int version_stamp_is_valid(uint32_t *version);
+ int update_version_stamp();
+ int upgrade() override;
+
+ /// Always true: this store supports the legacy (nibblewise) sort order.
+ bool can_sort_nibblewise() override {
+ return true; // i support legacy sort order
+ }
+
+ void collect_metadata(map<string,string> *pm) override;
+ int get_devices(set<string> *ls) override;
+
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
+
+ int _do_transactions(
+ vector<Transaction> &tls, uint64_t op_seq,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name);
+ /// Apply tls at op_seq by forwarding to _do_transactions() with no
+ /// thread-pool handle and "replay" as the sequencer name.
+ int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override {
+ return _do_transactions(tls, op_seq, nullptr, "replay");
+ }
+ void _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle, const char *osr_name);
+
+ CollectionHandle open_collection(const coll_t& c) override;
+ CollectionHandle create_new_collection(const coll_t& c) override;
+ /// No-op in this store: both arguments are ignored.
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ int queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = nullptr) override;
+
+ /**
+ * set replay guard xattr on given file
+ *
+ * This will ensure that we will not replay this (or any previous) operation
+ * against this particular inode/object.
+ *
+ * @param fd open file descriptor for the file/object
+ * @param spos sequencer position of the last operation we should not replay
+ */
+ void _set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *oid=0,
+ bool in_progress=false);
+ void _set_replay_guard(const coll_t& cid,
+ const SequencerPosition& spos,
+ bool in_progress);
+ void _set_global_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos);
+
+ /// close a replay guard opened with in_progress=true
+ void _close_replay_guard(int fd, const SequencerPosition& spos,
+ const ghobject_t *oid=0);
+ void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+ /**
+ * check replay guard xattr on given file
+ *
+ * Check the current position against any marker on the file that
+ * indicates which operations have already been applied. If the
+ * current or a newer operation has been marked as applied, we
+ * should not replay the current operation again.
+ *
+ * If we are not replaying the journal, we always return true. It
+ * is only on replay that we might return false, indicating that the
+ * operation should not be performed (again).
+ *
+ * @param fd open fd on the file/object in question
+ * @param spos sequencerposition for an operation we could apply/replay
+ * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
+ */
+ int _check_replay_guard(int fd, const SequencerPosition& spos);
+ int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+ int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
+ int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+ // ------------------
+ // objects
+ int pick_object_revision_lt(ghobject_t& oid) { // stub: oid is neither read nor modified
+ return 0; // always reports 0
+ }
+ using ObjectStore::exists;
+ bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+ using ObjectStore::stat;
+ int stat(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override;
+ using ObjectStore::set_collection_opts;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ using ObjectStore::read;
+ int read(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0) override;
+ int _do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m);
+ int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m);
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
+
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags = 0);
+ int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos);
+ int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos);
+ int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
+ int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);
+
+ int _fgetattr(int fd, const char *name, bufferptr& bp);
+ int _fgetattrs(int fd, map<string,bufferptr>& aset);
+ int _fsetattrs(int fd, map<string, bufferptr> &aset);
+
+ void do_force_sync();
+ void start_sync(Context *onsafe);
+ void sync();
+ void _flush_op_queue();
+ void flush();
+ void sync_and_flush();
+
+ int flush_journal() override;
+ int dump_journal(ostream& out) override;
+
+ void set_fsid(uuid_d u) override { // setter for the fsid member
+ fsid = u; // plain assignment; nothing else is updated here
+ }
+ uuid_d get_fsid() override { return fsid; } // returns a copy of the fsid member
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override;
+
+ // DEBUG read error injection, an object is removed from both on delete()
+ Mutex read_error_lock;
+ set<ghobject_t> data_error_set; // read() will return -EIO
+ set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid) override;
+ void inject_mdata_error(const ghobject_t &oid) override;
+
+ void compact() override { // forwards the compaction request to object_map
+ ceph_assert(object_map); // object_map must be set before compact() is called
+ object_map->compact();
+ }
+
+ bool has_builtin_csum() const override { // capability query
+ return false; // this store always answers false
+ }
+
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
+
+ int snapshot(const string& name) override;
+
+ // attrs
+ using ObjectStore::getattr;
+ using ObjectStore::getattrs;
+ int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr &bp) override;
+ int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override;
+
+ int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos);
+ int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos);
+ int _rmattrs(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos);
+
+ int _collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos);
+
+ int _collection_set_bits(const coll_t& cid, int bits);
+
+ // collections
+ using ObjectStore::collection_list;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(CollectionHandle& c, // handle-based overload; wraps the coll_t overload declared below
+ const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *next) override {
+ c->flush(); // flush() is called on the handle before listing
+ return collection_list(c->cid, start, end, max, ls, next); // remaining arguments are passed through unchanged
+ }
+ int collection_list(const coll_t& cid,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *next);
+ int list_collections(vector<coll_t>& ls) override;
+ int list_collections(vector<coll_t>& ls, bool include_temp);
+ int collection_stat(const coll_t& c, struct stat *st);
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override { // handle-based overload; wraps the coll_t overload
+ c->flush(); // flush() is called on the handle before the emptiness check
+ return collection_empty(c->cid, empty); // result and *empty come from the coll_t overload
+ }
+ int collection_empty(const coll_t& cid, bool *empty);
+
+ // omap (see ObjectStore.h for documentation)
+ using ObjectStore::omap_get;
+ int omap_get(CollectionHandle& c, const ghobject_t &oid, bufferlist *header,
+ map<string, bufferlist> *out) override;
+ using ObjectStore::omap_get_header;
+ int omap_get_header(
+ CollectionHandle& c,
+ const ghobject_t &oid,
+ bufferlist *out,
+ bool allow_eio = false) override;
+ using ObjectStore::omap_get_keys;
+ int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, set<string> *keys) override;
+ using ObjectStore::omap_get_values;
+ int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys,
+ map<string, bufferlist> *out) override;
+ using ObjectStore::omap_check_keys;
+ int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys,
+ set<string> *out) override;
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override;
+ ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid);
+
+ int _create_collection(const coll_t& c, int bits,
+ const SequencerPosition &spos);
+ int _destroy_collection(const coll_t& c);
+ /**
+ * Give an expected number of objects hint to the collection.
+ *
+ * @param c - collection id.
+ * @param pg_num - pg number of the pool this collection belongs to
+ * @param expected_num_objs - expected number of objects in this collection
+ * @param spos - sequence position
+ *
+ * @return 0 on success, an error code otherwise
+ */
+ int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos);
+ int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
+ const SequencerPosition& spos);
+ int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos,
+ bool ignore_enoent = false);
+
+ int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+ void dump_start(const std::string& file);
+ void dump_stop();
+ void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);
+
+ virtual int apply_layout_settings(const coll_t &cid, int target_level);
+
+ void get_db_statistics(Formatter* f) override;
+
+private:
+ void _inject_failure();
+
+ // omap
+ int _omap_clear(const coll_t& cid, const ghobject_t &oid,
+ const SequencerPosition &spos);
+ int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+ const map<string, bufferlist> &aset,
+ const SequencerPosition &spos);
+ int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys,
+ const SequencerPosition &spos);
+ int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const string& first, const string& last,
+ const SequencerPosition &spos);
+ int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl,
+ const SequencerPosition &spos);
+ int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
+ const SequencerPosition &spos);
+ int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
+ const SequencerPosition &spos);
+
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+ int set_throttle_params();
+ float m_filestore_commit_timeout;
+ bool m_filestore_journal_parallel;
+ bool m_filestore_journal_trailing;
+ bool m_filestore_journal_writeahead;
+ int m_filestore_fiemap_threshold;
+ double m_filestore_max_sync_interval;
+ double m_filestore_min_sync_interval;
+ bool m_filestore_fail_eio;
+ bool m_filestore_fadvise;
+ int do_update;
+ bool m_journal_dio, m_journal_aio, m_journal_force_aio;
+ std::string m_osd_rollback_to_cluster_snap;
+ bool m_osd_use_stale_snap;
+ bool m_filestore_do_dump;
+ std::ofstream m_filestore_dump;
+ JSONFormatter m_filestore_dump_fmt;
+ std::atomic<int64_t> m_filestore_kill_at = { 0 };
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ uint64_t m_filestore_max_alloc_hint_size;
+ unsigned long m_fs_type;
+
+ //Determined xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+ uint32_t m_filestore_max_xattr_value_size;
+
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+ * Write superblock to persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
+
+ friend class FileStoreBackend;
+ friend class TestFileStore;
+};
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);
+
+struct fiemap;
+
+class FileStoreBackend { // abstract interface implemented per filesystem type; holds a pointer to its FileStore
+private:
+ FileStore *filestore; // set once by the constructor; not owned (never deleted in this class)
+protected:
+ int get_basedir_fd() { // the accessors below expose FileStore members to derived backends
+ return filestore->basedir_fd;
+ }
+ int get_current_fd() {
+ return filestore->current_fd;
+ }
+ int get_op_fd() {
+ return filestore->op_fd;
+ }
+ size_t get_blksize() {
+ return filestore->blk_size;
+ }
+ const string& get_basedir_path() {
+ return filestore->basedir;
+ }
+ const string& get_journal_path() {
+ return filestore->journalpath;
+ }
+ const string& get_current_path() {
+ return filestore->current_fn;
+ }
+ int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { // copy helper for derived backends
+ if (has_fiemap() || has_seek_data_hole()) { // either feature selects the sparse copy path
+ return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return filestore->_do_copy_range(from, to, srcoff, len, dstoff); // otherwise the plain copy path
+ }
+ }
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
+
+public:
+ explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
+ virtual ~FileStoreBackend() {}
+
+ CephContext* cct() const { // the owning FileStore's CephContext
+ return filestore->cct;
+ }
+
+ static FileStoreBackend *create(unsigned long f_type, FileStore *fs); // factory; defined outside this header
+
+ virtual const char *get_name() = 0;
+ virtual int detect_features() = 0;
+ virtual int create_current() = 0;
+ virtual bool can_checkpoint() = 0;
+ virtual int list_checkpoints(list<string>& ls) = 0;
+ virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
+ virtual int sync_checkpoint(uint64_t id) = 0;
+ virtual int rollback_to(const string& name) = 0;
+ virtual int destroy_checkpoint(const string& name) = 0;
+ virtual int syncfs() = 0;
+ virtual bool has_fiemap() = 0; // consulted by _copy_range() above
+ virtual bool has_seek_data_hole() = 0; // consulted by _copy_range() above
+ virtual bool is_rotational() = 0;
+ virtual bool is_journal_rotational() = 0;
+ virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
+ virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+ virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
+ virtual bool has_splice() const = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) = 0;
+};
+
+#endif
diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc
new file mode 100644
index 00000000..a75d501f
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.cc
@@ -0,0 +1,468 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "GenericFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+#include "common/blkdev.h"
+
+#include "common/SloppyCRCMap.h"
+#include "os/filestore/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
+ FileStoreBackend(fs),
+ ioctl_fiemap(false),
+ seek_data_hole(false),
+ use_splice(false),
+ m_filestore_fiemap(cct()->_conf->filestore_fiemap),
+ m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
+ m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
+ m_filestore_splice(cct()->_conf->filestore_splice)
+{
+ // rotational? The basedir and journal probes are independent: a path that
+ // cannot be opened keeps its current flag value and must not skip the other probe.
+ {
+ // NOTE: the below won't work on btrfs; we'll assume rotational.
+ string fn = get_basedir_path();
+ int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd >= 0) {
+ BlkDev blkdev(fd);
+ m_rotational = blkdev.is_rotational();
+ dout(20) << __func__ << " basedir " << fn
+ << " rotational " << (int)m_rotational << dendl;
+ ::close(fd);
+ }
+ }
+ // journal rotational? (runs even when the basedir could not be opened above)
+ // same open / BlkDev / close sequence, applied to the journal path
+ {
+ // NOTE: the below won't work on btrfs; we'll assume rotational.
+ string fn = get_journal_path();
+ int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd >= 0) {
+ BlkDev blkdev(fd);
+ m_journal_rotational = blkdev.is_rotational();
+ dout(20) << __func__ << " journal filename " << fn.c_str()
+ << " journal rotational " << (int)m_journal_rotational << dendl;
+ ::close(fd);
+ }
+ }
+}
+
+/// Probe the filesystem under basedir for optional features, using a scratch file:
+/// FIEMAP, SEEK_DATA/SEEK_HOLE and splice set ioctl_fiemap / seek_data_hole / use_splice; syncfs is only logged.
+/// @return 0 on success, a negative errno if the scratch file cannot be created, positioned or written
+int GenericFileStoreBackend::detect_features()
+{
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
+
+ int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ fd = -errno;
+ derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
+ return fd;
+ }
+
+ // ext4 has a bug in older kernels where fiemap will return an empty
+ // result in some cases. this is a file layout that triggers the bug
+ // on 2.6.34-rc5.
+ int v[] = {
+ 0x0000000000016000, 0x0000000000007000,
+ 0x000000000004a000, 0x0000000000007000,
+ 0x0000000000060000, 0x0000000000001000,
+ 0x0000000000061000, 0x0000000000008000,
+ 0x0000000000069000, 0x0000000000007000,
+ 0x00000000000a3000, 0x000000000000c000,
+ 0x000000000024e000, 0x000000000000c000,
+ 0x000000000028b000, 0x0000000000009000,
+ 0x00000000002b1000, 0x0000000000003000,
+ 0, 0
+ };
+ for (int i=0; v[i]; i++) {
+ int off = v[i++];
+ int len = v[i];
+ // write a large extent
+ char buf[len];
+ memset(buf, 1, sizeof(buf));
+ int r = ::lseek(fd, off, SEEK_SET);
+ if (r < 0) {
+ r = -errno;
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ r = write(fd, buf, sizeof(buf));
+ if (r < 0) {
+ r = -errno; // write(2) only returns -1; the actual error code is in errno
+ derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ }
+
+ // fiemap an extent inside that
+ if (!m_filestore_fiemap) {
+ dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ struct fiemap *fiemap;
+ int r = do_fiemap(fd, 2430421, 59284, &fiemap);
+ if (r < 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ if (fiemap->fm_mapped_extents == 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
+ ioctl_fiemap = true;
+ }
+ free(fiemap);
+ }
+ }
+
+ // SEEK_DATA/SEEK_HOLE detection
+ if (!m_filestore_seek_data_hole) {
+ dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
+ seek_data_hole = false;
+ } else {
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
+ // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
+ // Fall back to use fiemap.
+ off_t hole_pos = lseek(fd, 0, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == EINVAL) {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
+ seek_data_hole = false;
+ } else {
+ int err = -errno; // capture now: logging and close() below may overwrite errno
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(err) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return err;
+ }
+ } else {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
+ seek_data_hole = true;
+ }
+#endif
+ }
+
+ //splice detection
+#ifdef CEPH_HAVE_SPLICE
+ if (!m_filestore_splice) {
+ dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
+ use_splice = false;
+ } else {
+ int pipefd[2];
+ loff_t off_in = 0;
+ if (pipe_cloexec(pipefd) < 0) {
+ int e = errno;
+ dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl;
+ } else {
+ lseek(fd, 0, SEEK_SET);
+ int r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
+ if (!(r < 0 && errno == EINVAL)) {
+ use_splice = true;
+ dout(0) << "detect_features: splice is supported" << dendl;
+ } else
+ dout(0) << "detect_features: splice is NOT supported" << dendl;
+ close(pipefd[0]);
+ close(pipefd[1]);
+ }
+ }
+#endif
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+ bool have_syncfs = false;
+#ifdef HAVE_SYS_SYNCFS
+ if (::syncfs(get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
+ }
+#elif defined(SYS_syncfs)
+ if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#elif defined(__NR_syncfs)
+ if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#endif
+ if (!have_syncfs) {
+ dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
+ } else {
+ dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
+ dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int GenericFileStoreBackend::create_current()
+{
+ // Make sure current/ exists: accept an existing directory, reject any other
+ // kind of file with -EINVAL, and try mkdir() whenever stat() fails.
+ const string& path = get_current_path();
+ struct stat sb;
+ if (::stat(path.c_str(), &sb) == 0) {
+ if (S_ISDIR(sb.st_mode))
+ return 0;
+ dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
+ return -EINVAL;
+ }
+ if (::mkdir(path.c_str(), 0755) < 0) {
+ const int err = -errno;
+ dout(0) << "_create_current: mkdir " << path << " failed: "<< cpp_strerror(err) << dendl;
+ return err;
+ }
+ return 0;
+}
+
+int GenericFileStoreBackend::syncfs()
+{
+ // Two strategies, chosen by m_filestore_fsync_flushes_journal_data:
+ // a single fsync on the op fd, or a sync of the whole filesystem.
+ if (!m_filestore_fsync_flushes_journal_data) {
+ dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
+ return sync_filesystem(get_current_fd());
+ }
+
+ dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
+ // make the file system's journal commit.
+ // this works with ext3, but NOT ext4
+ if (::fsync(get_op_fd()) < 0)
+ return -errno;
+ return 0;
+}
+
+int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) // returns 0 with a heap-allocated map in *pfiemap, or a negative errno with *pfiemap set to NULL
+{
+ struct fiemap *fiemap = NULL;
+ struct fiemap *_realloc_fiemap = NULL;
+ int size;
+ int ret;
+
+ fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); // zeroed header only, no extent slots yet
+ if (!fiemap)
+ return -ENOMEM; // note: *pfiemap is left untouched on this path
+ /*
+ * There is a bug in xfs's fiemap. Given (offset=3990, len=4096),
+ * the result is (logical=4096, len=4096), which loses [3990, 4096).
+ * Commit "xfs: fix rounding error of fiemap length parameter
+ * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fixes this bug.
+ * Here, we align the offset down to CEPH_PAGE_SIZE to avoid this bug.
+ */
+ fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
+ fiemap->fm_length = len + start % CEPH_PAGE_SIZE; // grow the length by what the start was moved back
+ fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { // first pass: fm_extent_count is still 0 from calloc
+ ret = -errno;
+ goto done_err;
+ }
+#endif
+ size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); // bytes needed for the extent count reported by the first pass
+
+ _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
+ if (!_realloc_fiemap) {
+ ret = -ENOMEM;
+ goto done_err; // the original allocation is still valid here and is freed at done_err
+ } else {
+ fiemap = _realloc_fiemap;
+ }
+
+ memset(fiemap->fm_extents, 0, size);
+
+ fiemap->fm_extent_count = fiemap->fm_mapped_extents; // offer exactly as many slots as were reported
+ fiemap->fm_mapped_extents = 0;
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { // second pass: fills the extent array
+ ret = -errno;
+ goto done_err;
+ }
+ *pfiemap = fiemap; // the caller is responsible for releasing it (detect_features uses free())
+#endif
+ return 0;
+
+done_err:
+ *pfiemap = NULL;
+ free(fiemap);
+ return ret;
+}
+
+
+// Load fd's SLOPPY_CRC_XATTR into *cm: 0 if the xattr is absent (*cm untouched) or decoded, else a negative errno.
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+ char buf[100];
+ bufferptr bp;
+ int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+ if (l == -ENODATA) return 0; // xattr absent: leave *cm untouched
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), buf, l);
+ } else if (l == -ERANGE) { // value does not fit in buf: query its size, then read it into bp
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+ }
+ }
+ int r = l < 0 ? l : 0; // propagate xattr read errors instead of decoding an unfilled buffer
+ if (r == 0) {
+ bufferlist bl;
+ bl.append(std::move(bp));
+ auto p = bl.cbegin();
+ try {
+ decode(*cm, p);
+ } catch (buffer::error &e) {
+ r = -EIO; // undecodable (including empty) xattr contents
+ }
+ }
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+ // Encode *cm and store it in fd's SLOPPY_CRC_XATTR; the xattr write result is returned.
+ bufferlist encoded;
+ encode(*cm, encoded);
+ const int rc = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, encoded.c_str(), encoded.length());
+ if (rc < 0) derr << __func__ << " got " << cpp_strerror(rc) << dendl;
+ return rc;
+}
+
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+ // Load fd's CRC map, record the write described by (off, len, bl), and save the map back.
+ SloppyCRCMap crcs(get_crc_block_size());
+ const int load_rc = _crc_load_or_init(fd, &crcs);
+ if (load_rc < 0)
+ return load_rc;
+ ostringstream trace;
+ crcs.write(off, len, bl, &trace);
+ dout(30) << __func__ << "\n" << trace.str() << dendl;
+ return _crc_save(fd, &crcs);
+}
+
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+ // Load fd's CRC map, apply the truncate at off, and save the map back.
+ SloppyCRCMap crcs(get_crc_block_size());
+ const int load_rc = _crc_load_or_init(fd, &crcs);
+ if (load_rc < 0)
+ return load_rc;
+ crcs.truncate(off);
+ return _crc_save(fd, &crcs);
+}
+
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+ // Load fd's CRC map, apply the zeroing described by (off, len), and save the map back.
+ SloppyCRCMap crcs(get_crc_block_size());
+ const int load_rc = _crc_load_or_init(fd, &crcs);
+ if (load_rc < 0)
+ return load_rc;
+ crcs.zero(off, len);
+ return _crc_save(fd, &crcs);
+}
+
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff)
+{
+ // Load both files' CRC maps, apply the clone to the destination map, and save only that one.
+ SloppyCRCMap from_map(get_crc_block_size());
+ SloppyCRCMap to_map(get_crc_block_size());
+ int rc = _crc_load_or_init(srcfd, &from_map);
+ if (rc < 0)
+ return rc;
+ rc = _crc_load_or_init(destfd, &to_map);
+ if (rc < 0)
+ return rc;
+ ostringstream trace;
+ to_map.clone_range(srcoff, len, dstoff, from_map, &trace);
+ dout(30) << __func__ << "\n" << trace.str() << dendl;
+ return _crc_save(destfd, &to_map);
+}
+
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out)
+{
+ // Load fd's CRC map and hand the (off, len, bl) read to SloppyCRCMap::read();
+ // a load failure is returned as-is without consulting the map.
+ SloppyCRCMap crcs(get_crc_block_size());
+ const int load_rc = _crc_load_or_init(fd, &crcs);
+ return load_rc < 0 ? load_rc : crcs.read(off, len, bl, out);
+}
diff --git a/src/os/filestore/GenericFileStoreBackend.h b/src/os/filestore/GenericFileStoreBackend.h
new file mode 100644
index 00000000..207c3d0d
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_GENERICFILESTOREBACKEDN_H
+#define CEPH_GENERICFILESTOREBACKEDN_H
+
+#include "FileStore.h"
+
+class SloppyCRCMap;
+
+class GenericFileStoreBackend : public FileStoreBackend { // backend reporting the name "generic"; checkpoint operations are unsupported
+private:
+ bool ioctl_fiemap; // detected capability, reported by has_fiemap()
+ bool seek_data_hole; // detected capability, reported by has_seek_data_hole()
+ bool use_splice; // detected capability, reported by has_splice()
+ bool m_filestore_fiemap;
+ bool m_filestore_seek_data_hole;
+ bool m_filestore_fsync_flushes_journal_data;
+ bool m_filestore_splice;
+ bool m_rotational = true; // defaults to rotational
+ bool m_journal_rotational = true; // defaults to rotational
+public:
+ explicit GenericFileStoreBackend(FileStore *fs);
+ ~GenericFileStoreBackend() override {}
+
+ const char *get_name() override {
+ return "generic";
+ }
+ int detect_features() override;
+ int create_current() override;
+ bool can_checkpoint() override { return false; } // checkpoints are not available in this backend
+ bool is_rotational() override {
+ return m_rotational;
+ }
+ bool is_journal_rotational() override {
+ return m_journal_rotational;
+ }
+ int list_checkpoints(list<string>& ls) override { return 0; } // reports success and leaves ls untouched
+ int create_checkpoint(const string& name, uint64_t *cid) override { return -EOPNOTSUPP; }
+ int sync_checkpoint(uint64_t id) override { return -EOPNOTSUPP; }
+ int rollback_to(const string& name) override { return -EOPNOTSUPP; }
+ int destroy_checkpoint(const string& name) override { return -EOPNOTSUPP; }
+ int syncfs() override;
+ bool has_fiemap() override { return ioctl_fiemap; }
+ bool has_seek_data_hole() override { return seek_data_hole; }
+ int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) override;
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override { // implemented via the base-class copy helper
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+ int set_alloc_hint(int fd, uint64_t hint) override { return -EOPNOTSUPP; } // allocation hints are not supported here
+ bool has_splice() const override { return use_splice; }
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm); // helper used by the _crc_* hooks below
+ int _crc_save(int fd, SloppyCRCMap *cm); // helper used by the _crc_update_* hooks below
+public:
+ int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) override;
+ int _crc_update_truncate(int fd, loff_t off) override;
+ int _crc_update_zero(int fd, loff_t off, size_t len) override;
+ int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) override;
+ int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) override;
+};
+#endif
diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc
new file mode 100644
index 00000000..ab56b43c
--- /dev/null
+++ b/src/os/filestore/HashIndex.cc
@@ -0,0 +1,1195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include <errno.h>
+
+#include "HashIndex.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+
+const string HashIndex::SUBDIR_ATTR = "contents";
+const string HashIndex::SETTINGS_ATTR = "settings";
+const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
+
+/// Convert a single uppercase hexadecimal digit to its numeric value.
+///
+/// Only '0'-'9' and 'A'-'F' are accepted; any other character (including
+/// lowercase 'a'-'f') aborts via ceph_abort().
+int hex_to_int(char c)
+{
+  const bool is_decimal = (c >= '0' && c <= '9');
+  if (is_decimal) {
+    return c - '0';
+  }
+  const bool is_upper_alpha = (c >= 'A' && c <= 'F');
+  if (is_upper_alpha) {
+    return 10 + (c - 'A');
+  }
+  ceph_abort();
+}
+
+/// Convert a value in [0, 15] to its uppercase hexadecimal digit.
+///
+/// The value is asserted to be within range on both sides; a negative
+/// value would otherwise pass the upper-bound check and produce a
+/// character that is not a hex digit.
+char int_to_hex(int v)
+{
+  ceph_assert(v >= 0 && v < 16);
+  if (v < 10)
+    return '0' + v;
+  return 'A' + v - 10;
+}
+
+/// Mirror the four low-order bits of a nibble: bit 0 swaps with bit 3 and
+/// bit 1 swaps with bit 2.  The input is asserted to be below 16.
+int reverse_nibble_bits(int in)
+{
+  ceph_assert(in < 16);
+  int mirrored = 0;
+  for (int bit = 0; bit < 4; ++bit) {
+    if (in & (1 << bit))
+      mirrored |= 1 << (3 - bit);
+  }
+  return mirrored;
+}
+
+/// Mirror the bits of the nibble represented by a hex digit and return the
+/// hex digit of the mirrored value.
+char reverse_hexdigit_bits(char c)
+{
+  const int value = hex_to_int(c);
+  const int mirrored = reverse_nibble_bits(value);
+  return int_to_hex(mirrored);
+}
+
+/// Apply reverse_hexdigit_bits() to every character of a hex string and
+/// return the transformed copy.
+string reverse_hexdigit_bits_string(string s)
+{
+  for (char &digit : s)
+    digit = reverse_hexdigit_bits(digit);
+  return s;
+}
+
+/// Strict-weak ordering of two single-character hex strings by the
+/// bit-mirrored value of their digit.  Both arguments must have length 1.
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+  ceph_assert(l.length() == 1 && r.length() == 1);
+  const int left_value = hex_to_int(l[0]);
+  const int right_value = hex_to_int(r[0]);
+  ceph_assert(left_value < 16);
+  ceph_assert(right_value < 16);
+  const int left_key = reverse_nibble_bits(left_value);
+  const int right_key = reverse_nibble_bits(right_value);
+  return left_key < right_key;
+}
+
+/// Order two hex strings by comparing them lexicographically after the bits
+/// of every digit have been mirrored.
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+  return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+}
+/**
+ * Finish whatever split, merge, or collection split was left in progress.
+ *
+ * Reads the IN_PROGRESS_OP_TAG attribute from the root directory.  If the
+ * attribute cannot be read, no operation is considered pending and 0 is
+ * returned.  Otherwise the recorded operation is decoded and completed.
+ *
+ * @return 0 on success, -EINVAL when the recorded op type is not one of
+ * split / merge / col_split, or a negative error from a helper.
+ */
+int HashIndex::cleanup() {
+ bufferlist bl;
+ int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0) {
+ // No in progress operations!
+ return 0;
+ }
+ auto i = bl.cbegin();
+ InProgressOp in_progress(i);
+ subdir_info_s info;
+ r = get_info(in_progress.path, &info);
+ if (r == -ENOENT) { // nothing found at the recorded path: only clear the tag
+ return end_split_or_merge(in_progress.path);
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (in_progress.is_split())
+ return complete_split(in_progress.path, info);
+ else if (in_progress.is_merge())
+ return complete_merge(in_progress.path, info);
+ else if (in_progress.is_col_split()) {
+ for (vector<string>::iterator i = in_progress.path.begin(); // shadows the decode iterator above
+ i != in_progress.path.end();
+ ++i) {
+ vector<string> path(in_progress.path.begin(), i); // proper prefix only; the full recorded path is never rebuilt here
+ int r = reset_attr(path);
+ if (r < 0)
+ return r;
+ }
+ return 0; // NOTE(review): this branch returns without calling end_split_or_merge() - confirm the tag is cleared elsewhere
+ }
+ else
+ return -EINVAL;
+}
+
+/// Rebuild the subdir info of @p path from the directory's current contents.
+///
+/// Returns 0 without touching anything when the path does not exist.
+/// Otherwise the listed objects and subdirectories are counted and the
+/// result is stored with set_info().  A negative error from the first
+/// failing helper is returned as-is.
+int HashIndex::reset_attr(
+  const vector<string> &path)
+{
+  int present = 0;
+  int r = path_exists(path, &present);
+  if (r < 0) {
+    return r;
+  }
+  if (present == 0) {
+    return 0;
+  }
+
+  map<string, ghobject_t> contained_objects;
+  vector<string> contained_subdirs;
+  r = list_objects(path, 0, 0, &contained_objects);
+  if (r < 0) {
+    return r;
+  }
+  r = list_subdirs(path, &contained_subdirs);
+  if (r < 0) {
+    return r;
+  }
+
+  subdir_info_s refreshed;
+  refreshed.hash_level = path.size();
+  refreshed.objs = contained_objects.size();
+  refreshed.subdirs = contained_subdirs.size();
+  return set_info(path, refreshed);
+}
+
+/**
+ * Move everything under @p path in @p from whose hash matches
+ * (@p inbits, @p match) into the same path in @p to, recursing into
+ * subdirectories whose hash prefix is too short to decide.
+ *
+ * @param from      source index
+ * @param to        destination index
+ * @param path      directory (as hash-prefix components) being processed
+ * @param inbits    number of hash bits that take part in the comparison
+ * @param match     value those bits must have for an entry to be moved
+ * @param mkdirred  in/out: number of leading components of @p path that
+ *                  have already been ensured to exist in @p to
+ * @return 0 on success, negative error code otherwise
+ */
+int HashIndex::col_split_level(
+  HashIndex &from,
+  HashIndex &to,
+  const vector<string> &path,
+  uint32_t inbits,
+  uint32_t match,
+  unsigned *mkdirred)
+{
+  /* For each subdir, move, recurse, or ignore based on comparing the low order
+   * bits of the hash represented by the subdir path with inbits, match passed
+   * in.
+   */
+  vector<string> subdirs;
+  int r = from.list_subdirs(path, &subdirs);
+  if (r < 0)
+    return r;
+  map<string, ghobject_t> objects;
+  r = from.list_objects(path, 0, 0, &objects);
+  if (r < 0)
+    return r;
+
+  set<string> to_move;
+  for (vector<string>::iterator i = subdirs.begin();
+       i != subdirs.end();
+       ++i) {
+    uint32_t bits = 0;
+    uint32_t hash = 0;
+    vector<string> sub_path(path.begin(), path.end());
+    sub_path.push_back(*i);
+    path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+    if (bits < inbits) {
+      // The prefix is shorter than inbits: descend only if it can still match.
+      if (hobject_t::match_hash(hash, bits, match)) {
+        r = col_split_level(
+          from,
+          to,
+          sub_path,
+          inbits,
+          match,
+          mkdirred);
+        if (r < 0)
+          return r;
+        // The recursion may have created directories below this level;
+        // clamp the counter back to the depth of the current path.
+        if (*mkdirred > path.size())
+          *mkdirred = path.size();
+      } // else, skip, doesn't need to be moved or recursed into
+    } else {
+      if (hobject_t::match_hash(hash, inbits, match)) {
+        to_move.insert(*i);
+      } // else, skip, doesn't need to be moved
+    }
+  }
+
+  /* Then, do the same for each object */
+  map<string, ghobject_t> objs_to_move;
+  for (map<string, ghobject_t>::iterator i = objects.begin();
+       i != objects.end();
+       ++i) {
+    if (i->second.match(inbits, match)) {
+      objs_to_move.insert(*i);
+    }
+  }
+
+  if (objs_to_move.empty() && to_move.empty())
+    return 0;
+
+  // Make parent directories as needed
+  while (*mkdirred < path.size()) {
+    ++*mkdirred;
+    int exists = 0;
+    vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+    r = to.path_exists(creating_path, &exists);
+    if (r < 0)
+      return r;
+    if (exists)
+      continue;
+    subdir_info_s info;
+    info.objs = 0;
+    info.subdirs = 0;
+    info.hash_level = creating_path.size();
+    if (*mkdirred < path.size() - 1)
+      info.subdirs = 1;
+    r = to.start_col_split(creating_path);
+    if (r < 0)
+      return r;
+    r = to.create_path(creating_path);
+    if (r < 0)
+      return r;
+    r = to.set_info(creating_path, info);
+    if (r < 0)
+      return r;
+    r = to.end_split_or_merge(creating_path);
+    if (r < 0)
+      return r;
+  }
+
+  subdir_info_s from_info;
+  subdir_info_s to_info;
+  r = from.get_info(path, &from_info);
+  if (r < 0)
+    return r;
+  r = to.get_info(path, &to_info);
+  if (r < 0)
+    return r;
+
+  // Tag both indexes before moving anything.  A failure to write the tag
+  // must stop the move: without it an interrupted move is not recorded.
+  r = from.start_col_split(path);
+  if (r < 0)
+    return r;
+  r = to.start_col_split(path);
+  if (r < 0)
+    return r;
+
+  // Do subdir moves
+  for (set<string>::iterator i = to_move.begin();
+       i != to_move.end();
+       ++i) {
+    from_info.subdirs--;
+    to_info.subdirs++;
+    r = move_subdir(from, to, path, *i);
+    if (r < 0)
+      return r;
+  }
+
+  for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
+       i != objs_to_move.end();
+       ++i) {
+    from_info.objs--;
+    to_info.objs++;
+    r = move_object(from, to, path, *i);
+    if (r < 0)
+      return r;
+  }
+
+
+  r = to.set_info(path, to_info);
+  if (r < 0)
+    return r;
+  r = from.set_info(path, from_info);
+  if (r < 0)
+    return r;
+  // Clear the tags, reporting a failure instead of silently dropping it.
+  r = from.end_split_or_merge(path);
+  if (r < 0)
+    return r;
+  r = to.end_split_or_merge(path);
+  if (r < 0)
+    return r;
+  return 0;
+}
+
+/**
+ * Merge the contents of this index into @p dest.
+ *
+ * Both indexes are first pre-split down to the directory level shared by
+ * @p bits hash bits (one level per nibble), then the directory trees are
+ * merged.
+ *
+ * @param bits  number of hash bits the two collections have in common
+ * @param dest  destination index; must be a HashIndex of the same
+ *              collection version
+ * @return 0 on success, or the negative error of the first failing step
+ */
+int HashIndex::_merge(
+  uint32_t bits,
+  CollectionIndex* dest) {
+  dout(20) << __func__ << " bits " << bits << dendl;
+  ceph_assert(collection_version() == dest->collection_version());
+  HashIndex *target = static_cast<HashIndex*>(dest);
+
+  vector<string> emptypath;
+
+  // pre-split to common/target level so that any shared prefix DIR_?
+  // directories already exist at the destination.  Since each
+  // directory is a nibble (4 bits),
+  unsigned shared = bits / 4;
+  dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
+  if (shared) {
+    // A failed pre-split must not be followed by a merge.
+    int r = split_dirs(emptypath, shared);
+    if (r < 0)
+      return r;
+    r = target->split_dirs(emptypath, shared);
+    if (r < 0)
+      return r;
+  }
+
+  // now merge the contents, propagating any failure to the caller
+  return _merge_dirs(*this, *target, emptypath);
+}
+
+/**
+ * Recursively merge directory @p path of @p from into the same directory
+ * of @p to.
+ *
+ * Subdirectories that exist only in @p from are moved as a whole.
+ * Subdirectories present on both sides are merged recursively and then
+ * removed from @p from.  Finally the objects stored directly in @p path
+ * are moved.
+ *
+ * @return 0 on success, or the negative error of the first failing step
+ */
+int HashIndex::_merge_dirs(
+  HashIndex& from,
+  HashIndex& to,
+  const vector<string>& path)
+{
+  dout(20) << __func__ << " path " << path << dendl;
+  int r;
+
+  vector<string> src_subs, dst_subs;
+  r = from.list_subdirs(path, &src_subs);
+  if (r < 0) {
+    lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                  << " r " << r << " from "
+                                                  << "from.list_subdirs"
+                                                  << dendl;
+    return r;
+  }
+  r = to.list_subdirs(path, &dst_subs);
+  if (r < 0) {
+    lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                  << " r " << r << " from "
+                                                  << "to.list_subdirs"
+                                                  << dendl;
+    return r;
+  }
+
+  for (auto& i : src_subs) {
+    if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
+      // move it
+      r = move_subdir(from, to, path, i);
+      if (r < 0) {
+        lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                      << " r " << r << " from "
+                                                      << "move_subdir(...,"
+                                                      << path << "," << i << ")"
+                                                      << dendl;
+        return r;
+      }
+    } else {
+      // common, recurse!
+      vector<string> nested = path;
+      nested.push_back(i);
+      r = _merge_dirs(from, to, nested);
+      if (r < 0) {
+        lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                      << " r " << r << " from "
+                                                      << "rec _merge_dirs"
+                                                      << dendl;
+        return r;
+      }
+
+      // now remove it from the source index; go through `from` explicitly,
+      // like every other source-side call here, rather than relying on
+      // the caller passing *this as `from`
+      r = from.remove_path(nested);
+      if (r < 0) {
+        lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                      << " r " << r << " from "
+                                                      << "remove_path "
+                                                      << nested
+                                                      << dendl;
+        return r;
+      }
+    }
+  }
+
+  // objects
+  map<string, ghobject_t> objects;
+  r = from.list_objects(path, 0, 0, &objects);
+  if (r < 0) {
+    lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                  << " r " << r << " from "
+                                                  << "from.list_objects"
+                                                  << dendl;
+    return r;
+  }
+
+  for (auto& i : objects) {
+    r = move_object(from, to, path, i);
+    if (r < 0) {
+      lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+                                                    << " r " << r << " from "
+                                                    << "move_object(...,"
+                                                    << path << "," << i << ")"
+                                                    << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+
+/// Split this collection: starting at the root directory, move every entry
+/// whose hash matches (@p bits, @p match) into @p dest.
+///
+/// @p dest must be a HashIndex with the same collection version.
+int HashIndex::_split(
+  uint32_t match,
+  uint32_t bits,
+  CollectionIndex* dest) {
+  ceph_assert(collection_version() == dest->collection_version());
+  HashIndex &destination = *static_cast<HashIndex*>(dest);
+  unsigned dirs_created = 0;
+  return col_split_level(*this, destination, vector<string>(), bits, match,
+                         &dirs_created);
+}
+
+/**
+ * Split @p path if must_split() says so for @p target_level, then apply
+ * the same procedure to every subdirectory.
+ *
+ * @param path          directory to examine
+ * @param target_level  level passed through to must_split()
+ * @return negative error code on failure, otherwise the non-negative
+ *         result of the last step
+ */
+int HashIndex::split_dirs(const vector<string> &path, int target_level) {
+  dout(20) << __func__ << " " << path << " target level: "
+           << target_level << dendl;
+  subdir_info_s info;
+  int r = get_info(path, &info);
+  if (r < 0) {
+    dout(10) << "error looking up info for " << path << ": "
+             << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if (must_split(info, target_level)) {
+    dout(1) << __func__ << " " << path << " has " << info.objs
+            << " objects, " << info.hash_level
+            << " level, starting split in pg " << coll() << "." << dendl;
+    r = initiate_split(path, info);
+    if (r < 0) {
+      dout(10) << "error initiating split on " << path << ": "
+               << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    r = complete_split(path, info);
+    if (r < 0) {
+      dout(10) << "error completing split on " << path << ": "
+               << cpp_strerror(r) << dendl;
+      return r;
+    }
+    // Report completion only once the split has actually succeeded.
+    dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+            << dendl;
+  }
+
+  vector<string> subdirs;
+  r = list_subdirs(path, &subdirs);
+  if (r < 0) {
+    dout(10) << "error listing subdirs of " << path << ": "
+             << cpp_strerror(r) << dendl;
+    return r;
+  }
+  for (vector<string>::const_iterator it = subdirs.begin();
+       it != subdirs.end(); ++it) {
+    vector<string> subdir_path(path);
+    subdir_path.push_back(*it);
+    r = split_dirs(subdir_path, target_level);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return r;
+}
+
+/// Persist the current settings and then split directories, starting at
+/// the root, according to @p target_level.
+///
+/// Returns the error from write_settings() if it fails, otherwise the
+/// result of split_dirs().
+int HashIndex::apply_layout_settings(int target_level) {
+  dout(10) << __func__ << " split multiple = " << split_multiplier
+           << " merge threshold = " << merge_threshold
+           << " split rand factor = " << cct->_conf->filestore_split_rand_factor
+           << " target level = " << target_level
+           << dendl;
+  const int written = write_settings();
+  if (written < 0) {
+    return written;
+  }
+  const vector<string> root;
+  return split_dirs(root, target_level);
+}
+
+/// Initialize the index: store a default-constructed subdir info on the
+/// root directory, then write the settings attribute.
+int HashIndex::_init() {
+  const vector<string> root;
+  subdir_info_s root_info;
+  const int stored = set_info(root, root_info);
+  if (stored < 0) {
+    return stored;
+  }
+  return write_settings();
+}
+
+/// Pick a split_rand_factor (a rand() value reduced modulo the configured
+/// filestore_split_rand_factor, or 0 when that option is not positive) and
+/// store the encoded settings in the SETTINGS_ATTR of the root directory.
+int HashIndex::write_settings() {
+  settings.split_rand_factor =
+    (cct->_conf->filestore_split_rand_factor > 0)
+      ? rand() % cct->_conf->filestore_split_rand_factor
+      : 0;
+  bufferlist encoded;
+  settings.encode(encoded);
+  const vector<string> root;
+  return add_attr_path(root, SETTINGS_ATTR, encoded);
+}
+
+/// Load the settings from the SETTINGS_ATTR of the root directory.
+///
+/// A missing attribute (-ENODATA) is not an error: the current settings
+/// are kept and 0 is returned.  Any other read error is logged and
+/// returned.
+int HashIndex::read_settings() {
+  const vector<string> root;
+  bufferlist encoded;
+  const int r = get_attr_path(root, SETTINGS_ATTR, encoded);
+  if (r == -ENODATA) {
+    return 0;
+  }
+  if (r < 0) {
+    derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  auto cursor = encoded.cbegin();
+  settings.decode(cursor);
+  dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
+  return 0;
+}
+
+/* LFNIndex virtual method implementations */
+/// Account for a newly created object in @p path and split the directory
+/// when must_split() reports that it has grown too large.
+///
+/// @p oid and @p mangled_name are not used by this implementation.
+/// Returns a negative error code on failure.
+int HashIndex::_created(const vector<string> &path,
+                        const ghobject_t &oid,
+                        const string &mangled_name) {
+  subdir_info_s info;
+  int r = get_info(path, &info);
+  if (r < 0)
+    return r;
+  info.objs++;
+  r = set_info(path, info);
+  if (r < 0)
+    return r;
+
+  if (!must_split(info))
+    return 0;
+
+  dout(1) << __func__ << " " << path << " has " << info.objs
+          << " objects, starting split in pg " << coll() << "." << dendl;
+  r = initiate_split(path, info);
+  if (r < 0)
+    return r;
+  r = complete_split(path, info);
+  if (r < 0)
+    return r;
+  // Report completion only once the split has actually succeeded.
+  dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+          << dendl;
+  return r;
+}
+
+/// Remove object @p oid from @p path, decrement the directory's object
+/// count, and merge the directory into its parent when must_merge() says so.
+///
+/// @p mangled_name is not used by this implementation.
+/// Returns 0 on success or a negative error code.
+int HashIndex::_remove(const vector<string> &path,
+                       const ghobject_t &oid,
+                       const string &mangled_name) {
+  int r = remove_object(path, oid);
+  if (r < 0) {
+    return r;
+  }
+
+  subdir_info_s dir_info;
+  r = get_info(path, &dir_info);
+  if (r < 0) {
+    return r;
+  }
+  dir_info.objs--;
+  r = set_info(path, dir_info);
+  if (r < 0) {
+    return r;
+  }
+
+  if (!must_merge(dir_info)) {
+    return 0;
+  }
+  r = initiate_merge(path, dir_info);
+  if (r < 0) {
+    return r;
+  }
+  return complete_merge(path, dir_info);
+}
+
+/// Find the directory that should hold @p oid and look the object up there.
+///
+/// Starting from the contents of @p path, hash components of @p oid are
+/// appended one at a time until a directory that does not exist is reached
+/// (the last component is then dropped) or the components run out.  The
+/// resulting directory is handed to get_mangled_name().
+///
+/// Returns -ENOENT when even the starting (empty) path does not exist, a
+/// negative error from path_exists(), or the result of get_mangled_name().
+int HashIndex::_lookup(const ghobject_t &oid,
+                       vector<string> *path,
+                       string *mangled_name,
+                       int *hardlink) {
+  vector<string> components;
+  get_path_components(oid, &components);
+  vector<string>::iterator cursor = components.begin();
+  int exists;
+  for (;;) {
+    const int r = path_exists(*path, &exists);
+    if (r < 0)
+      return r;
+    if (!exists) {
+      // Went one level too deep: step back to the deepest existing directory.
+      if (path->empty())
+        return -ENOENT;
+      path->pop_back();
+      break;
+    }
+    if (cursor == components.end())
+      break;
+    path->push_back(*cursor);
+    ++cursor;
+  }
+  return get_mangled_name(*path, oid, mangled_name, hardlink);
+}
+
+/// List objects from @p start up to @p end, beginning at the root directory.
+///
+/// @p next may be null; a local placeholder is used as the cursor in that
+/// case.  The cursor is initialised to @p start before listing.
+int HashIndex::_collection_list_partial(const ghobject_t &start,
+                                        const ghobject_t &end,
+                                        int max_count,
+                                        vector<ghobject_t> *ls,
+                                        ghobject_t *next) {
+  ghobject_t placeholder;
+  ghobject_t *cursor = next ? next : &placeholder;
+  *cursor = start;
+  dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+  const vector<string> root;
+  return list_by_hash(root, end, max_count, cursor, ls);
+}
+
+/// Prepare the collection for deletion by recursively removing the
+/// directory tree below the root.
+int HashIndex::prep_delete() {
+  const vector<string> root;
+  return recursive_remove(root);
+}
+
+/// Pre-create the directory tree for a collection expected to hold
+/// @p expected_num_objs objects, then initialise the subdir info of every
+/// directory starting from the root.
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+  vector<string> root;
+  subdir_info_s root_info;
+  // Read the root info.  Only the success of the read is checked here;
+  // the object and subdirectory counts it returns are not inspected.
+  int ret = get_info(root, &root_info);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Create the folders first ...
+  ret = pre_split_folder(pg_num, expected_num_objs);
+  if (ret < 0) {
+    return ret;
+  }
+  // ... then write the folder info, walking down from the root.
+  return init_split_folder(root, 0);
+}
+
+/**
+ * Pre-create the hash directories for a collection that is expected to
+ * hold @p expected_num_objs objects.
+ *
+ * Nothing is created (0 is returned) when merging is enabled
+ * (merge_threshold > 0), when no object count is expected, when the
+ * per-folder capacity is zero, or when the expected objects fit without
+ * splitting.
+ *
+ * @param pg_num             number of PGs in the pool
+ * @param expected_num_objs  expected number of objects in this collection
+ * @return 0 on success, -EINVAL when coll() has no pg prefix, or a
+ *         negative error from directory creation
+ */
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+  // If folder merging is enabled (by setting the threshold positive),
+  // no need to split
+  if (merge_threshold > 0)
+    return 0;
+  const coll_t c = coll();
+  // Do not split if the expected number of objects in this collection is zero (by default)
+  if (expected_num_objs == 0)
+    return 0;
+
+  // Calculate the number of leaf folders (which actually store files)
+  // need to be created
+  const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
+  // A zero merge threshold together with a zero random factor yields a
+  // capacity of zero; skip pre-splitting rather than divide by it.
+  if (objs_per_folder == 0)
+    return 0;
+  uint64_t leavies = expected_num_objs / objs_per_folder ;
+  // No need to split
+  if (leavies == 0 || expected_num_objs == objs_per_folder)
+    return 0;
+
+  spg_t spgid;
+  if (!c.is_pg_prefix(&spgid))
+    return -EINVAL;
+  const ps_t ps = spgid.pgid.ps();
+
+  // the most significant bits of pg_num
+  const int pg_num_bits = calc_num_bits(pg_num - 1);
+  ps_t tmp_id = ps;
+  // calculate the number of levels we only create one sub folder
+  int num = pg_num_bits / 4;
+  // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+  // so that splitting starts at level 3
+  if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
+    --num;
+  }
+
+  int ret;
+  // Start with creation that only has one subfolder
+  vector<string> paths;
+  int dump_num = num;
+  while (num-- > 0) {
+    ps_t v = tmp_id & 0x0000000f;
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    tmp_id = tmp_id >> 4;
+  }
+
+  // Starting from here, we can split by creating multiple subfolders
+  const int left_bits = pg_num_bits - dump_num * 4;
+  // this variable denotes how many bits (for this level) that can be
+  // used for sub folder splitting
+  int split_bits = 4 - left_bits;
+  // the below logic is inspired by rados.h#ceph_stable_mod,
+  // it basically determines how many sub-folders should we
+  // create for splitting
+  ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
+  if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
+    ++split_bits;
+  }
+  const uint32_t subs = (1 << split_bits);
+  // Calculate how many levels we create starting from here
+  int level = 0;
+  int level_limit = MAX_HASH_LEVEL - dump_num - 1;
+  uint64_t actual_leaves = subs;
+  while (actual_leaves < leavies && level < level_limit) {
+    ++level;
+    actual_leaves <<= 4;
+  }
+  for (uint32_t i = 0; i < subs; ++i) {
+    ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
+    int v = tmp_id | (i << ((4 - split_bits) % 4));
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    ret = recursive_create_path(paths, level);
+    if (ret < 0)
+      return ret;
+    paths.pop_back();
+  }
+  return 0;
+}
+
+/// Write the subdir info of @p path (its subdirectory count and
+/// @p hash_level), fsync the directory, and repeat for every subdirectory
+/// with the level increased by one.
+///
+/// @p path is used as a scratch stack while descending.
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+  // Find out how many children the current directory has.
+  vector<string> children;
+  int ret = list_subdirs(path, &children);
+  if (ret < 0) {
+    return ret;
+  }
+
+  subdir_info_s info;
+  info.subdirs = children.size();
+  info.hash_level = hash_level;
+  ret = set_info(path, info);
+  if (ret < 0) {
+    return ret;
+  }
+  ret = fsync_dir(path);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Descend into each child in turn.
+  for (const string &child : children) {
+    path.push_back(child);
+    ret = init_split_folder(path, hash_level + 1);
+    if (ret < 0) {
+      return ret;
+    }
+    path.pop_back();
+  }
+  return 0;
+}
+
+/// Create all sixteen hex-named subdirectories below @p path, and repeat
+/// below each of them until @p level further levels have been created.
+///
+/// A directory that already exists (-EEXIST) is not treated as an error.
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+  if (level == 0) {
+    return 0;
+  }
+  int nibble = 0;
+  while (nibble < 16) {
+    path.push_back(to_hex(nibble));
+    int ret = create_path(path);
+    if (ret < 0 && ret != -EEXIST) {
+      return ret;
+    }
+    ret = recursive_create_path(path, level - 1);
+    if (ret < 0) {
+      return ret;
+    }
+    path.pop_back();
+    ++nibble;
+  }
+  return 0;
+}
+
+/// Remove every directory below @p path; @p path itself is kept because it
+/// is treated as the top of the removal.
+int HashIndex::recursive_remove(const vector<string> &path) {
+  const bool is_top = true;
+  return _recursive_remove(path, is_top);
+}
+
+/// Remove the directory tree rooted at @p path.
+///
+/// Fails with -ENOTEMPTY as soon as a directory that still lists objects is
+/// encountered.  Subdirectories are removed first; @p path itself is
+/// removed only when @p top is false.
+int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
+  dout(20) << __func__ << " path=" << path << dendl;
+  vector<string> children;
+  int r = list_subdirs(path, &children);
+  if (r < 0) {
+    return r;
+  }
+  map<string, ghobject_t> objects;
+  r = list_objects(path, 0, 0, &objects);
+  if (r < 0) {
+    return r;
+  }
+  if (!objects.empty()) {
+    return -ENOTEMPTY;
+  }
+
+  for (const string &child : children) {
+    vector<string> child_path(path);
+    child_path.push_back(child);
+    r = _recursive_remove(child_path, false);
+    if (r < 0) {
+      return r;
+    }
+  }
+  return top ? 0 : remove_path(path);
+}
+
+/// Record on the root directory that a collection split of @p path is in
+/// progress, then fsync the root directory.
+int HashIndex::start_col_split(const vector<string> &path) {
+  const vector<string> root;
+  InProgressOp marker(InProgressOp::COL_SPLIT, path);
+  bufferlist encoded;
+  marker.encode(encoded);
+  const int r = add_attr_path(root, IN_PROGRESS_OP_TAG, encoded);
+  if (r < 0) {
+    return r;
+  }
+  return fsync_dir(root);
+}
+
+/// Record on the root directory that a split of @p path is in progress,
+/// then fsync the root directory.
+int HashIndex::start_split(const vector<string> &path) {
+  const vector<string> root;
+  InProgressOp marker(InProgressOp::SPLIT, path);
+  bufferlist encoded;
+  marker.encode(encoded);
+  const int r = add_attr_path(root, IN_PROGRESS_OP_TAG, encoded);
+  if (r < 0) {
+    return r;
+  }
+  return fsync_dir(root);
+}
+
+/// Record on the root directory that a merge of @p path is in progress,
+/// then fsync the root directory.
+int HashIndex::start_merge(const vector<string> &path) {
+  const vector<string> root;
+  InProgressOp marker(InProgressOp::MERGE, path);
+  bufferlist encoded;
+  marker.encode(encoded);
+  const int r = add_attr_path(root, IN_PROGRESS_OP_TAG, encoded);
+  if (r < 0) {
+    return r;
+  }
+  return fsync_dir(root);
+}
+
+/// Clear the in-progress tag from the root directory.  @p path is not used:
+/// the tag always lives on the root.
+int HashIndex::end_split_or_merge(const vector<string> &path) {
+  const vector<string> root;
+  return remove_attr_path(root, IN_PROGRESS_OP_TAG);
+}
+
+/// Read and decode the SUBDIR_ATTR of @p path into @p info.
+///
+/// Returns the negative error from the attribute read, or 0 on success.
+/// Asserts that the decoded hash level equals the depth of @p path.
+int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
+  bufferlist encoded;
+  const int r = get_attr_path(path, SUBDIR_ATTR, encoded);
+  if (r < 0) {
+    return r;
+  }
+  auto cursor = encoded.cbegin();
+  info->decode(cursor);
+  ceph_assert(path.size() == (unsigned)info->hash_level);
+  return 0;
+}
+
+/// Encode @p info and store it in the SUBDIR_ATTR of @p path.
+///
+/// Asserts that the hash level in @p info equals the depth of @p path.
+int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
+  ceph_assert(path.size() == (unsigned)info.hash_level);
+  bufferlist encoded;
+  info.encode(encoded);
+  return add_attr_path(path, SUBDIR_ATTR, encoded);
+}
+
+/// A directory is merged into its parent when it is not the root, merging
+/// is enabled (merge_threshold > 0), it holds fewer objects than the
+/// threshold, and it has no subdirectories.
+bool HashIndex::must_merge(const subdir_info_s &info) {
+  if (!(info.hash_level > 0))
+    return false;
+  if (!(merge_threshold > 0))
+    return false;
+  if (!(info.objs < (unsigned)merge_threshold))
+    return false;
+  return info.subdirs == 0;
+}
+
+/// Decide whether the directory described by @p info has to be split.
+///
+/// A directory already at MAX_HASH_LEVEL is never split.  Otherwise it is
+/// split when either
+///  - @p target_level is set (greater than 0) and the directory is
+///    shallower than it, regardless of how many objects it holds (used by
+///    ceph-objectstore-tool to split directories offline), or
+///  - it holds more objects than
+///    16 * (abs(merge_threshold) * split_multiplier + split_rand_factor).
+bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
+  if (!(info.hash_level < (unsigned)MAX_HASH_LEVEL))
+    return false;
+  if (target_level > 0 && info.hash_level < (unsigned)target_level)
+    return true;
+  return info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16);
+}
+
+/// Begin a merge of @p path by writing the in-progress tag.  @p info is
+/// not used.
+int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
+  const int r = start_merge(path);
+  return r;
+}
+/**
+ * Fold directory @p path into its parent and clear the in-progress tag.
+ *
+ * If @p path still exists, its objects are moved to the parent, the
+ * parent's info is rebuilt and @p path is removed.  If the parent itself
+ * then qualifies for merging, the procedure continues one level up.
+ *
+ * @param path  directory being merged away; its last component is dropped
+ *              to obtain the parent
+ * @param info  not used by this function
+ * @return 0 on success or a negative error code
+ */
+int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
+ vector<string> dst = path;
+ dst.pop_back(); // dst is now the parent directory of path
+ subdir_info_s dstinfo;
+ int r, exists;
+ r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ r = get_info(dst, &dstinfo);
+ if (r < 0)
+ return r;
+ if (exists) {
+ r = move_objects(path, dst);
+ if (r < 0)
+ return r;
+ r = reset_attr(dst);
+ if (r < 0)
+ return r;
+ r = remove_path(path);
+ if (r < 0)
+ return r;
+ }
+ if (must_merge(dstinfo)) { // NOTE(review): dstinfo was read before the move and reset above - confirm the stale counts are intended
+ r = initiate_merge(dst, dstinfo);
+ if (r < 0)
+ return r;
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return complete_merge(dst, dstinfo);
+ }
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+/// Begin a split of @p path by writing the in-progress tag.  @p info is
+/// not used.
+int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
+  const int r = start_split(path);
+  return r;
+}
+/**
+ * Distribute the objects stored in @p path into child directories one
+ * level deeper, then clear the in-progress tag.
+ *
+ * The steps are ordered so that the function can be run again on a split
+ * that was interrupted part-way: a child directory that already carries
+ * readable info is taken as fully populated, and -EEXIST from linking an
+ * object is tolerated.
+ *
+ * @param path  directory being split
+ * @param info  info of @p path; only its hash_level is read here
+ * @return 0 on success or a negative error code
+ */
+int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
+ int level = info.hash_level;
+ map<string, ghobject_t> objects;
+ vector<string> dst = path;
+ int r;
+ dst.push_back(""); // placeholder for the child name, filled in per group below
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ vector<string> subdirs_vec;
+ r = list_subdirs(path, &subdirs_vec);
+ if (r < 0)
+ return r;
+ set<string> subdirs;
+ subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
+ map<string, map<string, ghobject_t> > mapped; // child directory name -> objects that belong in it
+ map<string, ghobject_t> moved;
+ int num_moved = 0;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ vector<string> new_path;
+ get_path_components(i->second, &new_path);
+ mapped[new_path[level]][i->first] = i->second; // group by the path component at this level
+ }
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
+ i != mapped.end();
+ ) {
+ dst[level] = i->first;
+ /* If the info already exists, it must be correct,
+ * we may be picking up a partially finished split */
+ subdir_info_s temp;
+ // subdir has already been fully copied
+ if (subdirs.count(i->first) && !get_info(dst, &temp)) {
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ }
+ ++i;
+ continue;
+ }
+
+ subdir_info_s info_new;
+ info_new.objs = i->second.size();
+ info_new.subdirs = 0;
+ info_new.hash_level = level + 1;
+ if (must_merge(info_new) && !subdirs.count(i->first)) { // a new child this small would qualify for merging straight away: leave its objects in place
+ mapped.erase(i++);
+ continue;
+ }
+
+ // Subdir doesn't yet exist
+ if (!subdirs.count(i->first)) {
+ info.subdirs += 1;
+ r = create_path(dst);
+ if (r < 0)
+ return r;
+ } // else subdir has been created but only partially copied
+
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ r = link_object(path, dst, j->second, j->first);
+ // May be a partially finished split
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+ }
+
+ r = fsync_dir(dst); // sync the child before its info is written
+ if (r < 0)
+ return r;
+
+ // Presence of info must imply that all objects have been copied
+ r = set_info(dst, info_new);
+ if (r < 0)
+ return r;
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ ++i;
+ }
+ r = remove_objects(path, moved, &objects);
+ if (r < 0)
+ return r;
+ info.objs = objects.size();
+ r = reset_attr(path); // the stored info of path is rebuilt from the directory contents
+ if (r < 0)
+ return r;
+ r = fsync_dir(path);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+/// Append the MAX_HASH_LEVEL path components of @p oid to @p path.
+///
+/// The nibblewise key of the object is printed as zero-padded uppercase
+/// hex, and each digit, in printed order, becomes one single-character
+/// component.
+void HashIndex::get_path_components(const ghobject_t &oid,
+                                    vector<string> *path) {
+  char hex[MAX_HASH_LEVEL + 1];
+  snprintf(hex, sizeof(hex), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
+
+  for (const char *digit = hex; digit != hex + MAX_HASH_LEVEL; ++digit) {
+    path->push_back(string(1, *digit));
+  }
+}
+
+/// Format @p hash as MAX_HASH_LEVEL zero-padded uppercase hex digits and
+/// return them in reverse order (last printed digit first).
+string HashIndex::get_hash_str(uint32_t hash) {
+  char hex[MAX_HASH_LEVEL + 1];
+  snprintf(hex, sizeof(hex), "%.*X", MAX_HASH_LEVEL, hash);
+  string reversed;
+  for (int pos = MAX_HASH_LEVEL; pos-- > 0; ) {
+    reversed.push_back(hex[pos]);
+  }
+  return reversed;
+}
+
+/// Return the hash string (see get_hash_str()) of @p oid, which must not
+/// be the "max" object.
+string HashIndex::get_path_str(const ghobject_t &oid) {
+  ceph_assert(!oid.is_max());
+  const uint32_t hash = oid.hobj.get_hash();
+  return get_hash_str(hash);
+}
+
+/// Convert a hash prefix string into a 32-bit hash.
+///
+/// The prefix is right-padded with '0' to eight hex digits, parsed as a
+/// hexadecimal number, and the order of its eight nibbles is then
+/// reversed.  If the prefix cannot be parsed as hex, the value parsed is
+/// taken to be 0 instead of being left indeterminate.
+uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
+  while (prefix.size() < sizeof(uint32_t) * 2) {
+    prefix.push_back('0');
+  }
+  // Initialised so that a failed sscanf() cannot leak an indeterminate value.
+  uint32_t hash = 0;
+  sscanf(prefix.c_str(), "%x", &hash);
+  // nibble reverse
+  hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
+  hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
+  hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
+  return hash;
+}
+/**
+ * Collect what has to be visited in directory @p path when listing in
+ * bitwise hash order.
+ *
+ * @param path           directory to inspect
+ * @param next_object    optional lower bound; objects ordered before it
+ *                       and subdirectories whose prefix sorts before it
+ *                       are left out
+ * @param hash_prefixes  out: hash prefixes of the retained objects and of
+ *                       the retained subdirectories
+ * @param objects        out: (hash prefix, object) pairs of the retained
+ *                       objects
+ * @return 0 on success or a negative error from the directory listing
+ */
+int HashIndex::get_path_contents_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t *next_object,
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
+ map<string, ghobject_t> rev_objects;
+ int r;
+ r = list_objects(path, 0, 0, &rev_objects);
+ if (r < 0)
+ return r;
+ // bitwise sort
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+ i != rev_objects.end();
+ ++i) {
+ if (next_object && i->second < *next_object)
+ continue;
+ string hash_prefix = get_path_str(i->second);
+ hash_prefixes->insert(hash_prefix);
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+ }
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ // sort subdirs bitwise (by reversing hex digit nibbles)
+ std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+ // Local to this function, we will convert the prefix strings
+ // (previously simply the reversed hex digits) to also have each
+ // digit's nibbles reversed. This will make the strings sort
+ // bitwise.
+ string cur_prefix;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ cur_prefix.append(reverse_hexdigit_bits_string(*i));
+ }
+ string next_object_string;
+ if (next_object)
+ next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+ if (next_object) {
+ if (next_object->is_max()) // no subdirectory is retained when the bound is the max object
+ continue;
+ if (candidate < next_object_string.substr(0, candidate.size())) // compare against the bound truncated to the candidate's length
+ continue;
+ }
+ // re-reverse the hex digit nibbles for the caller
+ hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
+ }
+ return 0;
+}
+
+/// List objects below @p path in hash order.  This is a thin wrapper that
+/// checks @p out is non-null and delegates to list_by_hash_bitwise().
+int HashIndex::list_by_hash(const vector<string> &path,
+                            const ghobject_t &end,
+                            int max_count,
+                            ghobject_t *next,
+                            vector<ghobject_t> *out)
+{
+  ceph_assert(out);
+  const int r = list_by_hash_bitwise(path, end, max_count, next, out);
+  return r;
+}
+/**
+ * Append the objects stored under @p path to @p out in bitwise hash
+ * order, descending into subdirectories as needed.
+ *
+ * Listing stops when @p out holds @p max_count entries (only if
+ * @p max_count is positive) or when an object not below @p end is met.
+ *
+ * @param path       directory to list
+ * @param end        exclusive upper bound
+ * @param max_count  maximum size of @p out; values <= 0 impose no limit
+ * @param next       in: optional lower bound; out: the object listing
+ *                   stopped at, or ghobject_t::get_max() when everything
+ *                   under @p path has been visited
+ * @param out        receives the objects
+ * @return 0 on success or a negative error code
+ */
+int HashIndex::list_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t& end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ vector<string> next_path = path;
+ next_path.push_back(""); // placeholder for the subdirectory name, set before each recursion
+ set<string, CmpHexdigitStringBitwise> hash_prefixes;
+ set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+ int r = get_path_contents_by_hash_bitwise(path,
+ next,
+ &hash_prefixes,
+ &objects);
+ if (r < 0)
+ return r;
+ for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
+ i != hash_prefixes.end();
+ ++i) {
+ dout(20) << __func__ << " prefix " << *i << dendl;
+ set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
+ if (j == objects.end() || j->first != *i) { // no object carries this prefix, so it is handled as a subdirectory
+ *(next_path.rbegin()) = *(i->rbegin()); // the last character of the prefix is used as the subdirectory name
+ ghobject_t next_recurse;
+ if (next)
+ next_recurse = *next;
+ r = list_by_hash_bitwise(next_path,
+ end,
+ max_count,
+ &next_recurse,
+ out);
+
+ if (r < 0)
+ return r;
+ if (!next_recurse.is_max()) { // the recursion stopped early: hand its cursor to the caller
+ if (next)
+ *next = next_recurse;
+ return 0;
+ }
+ } else {
+ while (j != objects.end() && j->first == *i) {
+ if (max_count > 0 && out->size() == (unsigned)max_count) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (j->second >= end) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (!next || j->second >= *next) {
+ dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+ out->push_back(j->second);
+ }
+ ++j;
+ }
+ }
+ }
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+}
+
+
diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h
new file mode 100644
index 00000000..7e34d155
--- /dev/null
+++ b/src/os/filestore/HashIndex.h
@@ -0,0 +1,462 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_HASHINDEX_H
+#define CEPH_HASHINDEX_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "LFNIndex.h"
+
+extern string reverse_hexdigit_bits_string(string l);
+
+/**
+ * Implements collection prehashing.
+ *
+ * @verbatim
+ * (root) - 0 - 0
+ * - 1
+ * - E
+ * - 1
+ * - 2 - D - 0
+ * .
+ * .
+ * .
+ * - F - 0
+ * @endverbatim
+ *
+ * A file is located at the longest existing directory from the root
+ * given by the hex characters in the hash beginning with the least
+ * significant.
+ *
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * would be located in (root)/2/D/0/
+ *
+ * Subdirectories are created when the number of objects in a
+ * directory exceeds 16 * (abs(merge_threshold) * split_multiplier +
+ * split_rand_factor). The number of objects in a directory is encoded
+ * as subdir_info_s in an xattr on the directory.
+ */
+class HashIndex : public LFNIndex {
+private:
+ /// Attribute name for storing subdir info @see subdir_info_s
+ static const string SUBDIR_ATTR;
+ /// Attribute name for storing index-wide settings
+ static const string SETTINGS_ATTR;
+ /// Attribute name for storing in progress op tag
+ static const string IN_PROGRESS_OP_TAG;
+ /// Size (bits) in object hash
+ static const int PATH_HASH_LEN = 32;
+ /// Max length of hashed path (one path component per 4-bit hex digit)
+ static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);
+
+ /**
+ * Merges occur when the number of objects drops below
+ * merge_threshold and splits occur when the number of objects
+ * exceeds:
+ *
+ * 16 * (abs(merge_threshold) * split_multiplier + split_rand_factor)
+ *
+ * Please note that if merge_threshold is less than zero, merging is
+ * never done.
+ */
+ int merge_threshold;
+ int split_multiplier;
+
+ /// Encodes current subdir state for determining when to split/merge.
+ struct subdir_info_s {
+ uint64_t objs; ///< Objects in subdir.
+ uint32_t subdirs; ///< Subdirs in subdir.
+ uint32_t hash_level; ///< Hashlevel of subdir.
+
+ subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}
+
+ void encode(bufferlist &bl) const
+ {
+ using ceph::encode;
+ __u8 v = 1; // encoding version; decode() asserts on it
+ encode(v, bl);
+ encode(objs, bl);
+ encode(subdirs, bl);
+ encode(hash_level, bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl)
+ {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(objs, bl);
+ decode(subdirs, bl);
+ decode(hash_level, bl);
+ }
+ };
+ /// Index-wide settings @see SETTINGS_ATTR
+ struct settings_s {
+ uint32_t split_rand_factor; ///< random factor added to split threshold (only on root of collection)
+ settings_s() : split_rand_factor(0) {}
+ void encode(bufferlist &bl) const
+ {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(split_rand_factor, bl);
+ }
+ void decode(bufferlist::const_iterator &bl)
+ {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl); // NOTE: unlike subdir_info_s, the version byte is read but not checked
+ decode(split_rand_factor, bl);
+ }
+ } settings;
+
+ /// Encodes in progress split or merge
+ struct InProgressOp {
+ static const int SPLIT = 0;
+ static const int MERGE = 1;
+ static const int COL_SPLIT = 2;
+ int op; ///< one of SPLIT, MERGE, COL_SPLIT
+ vector<string> path; ///< path the operation applies to
+
+ InProgressOp(int op, const vector<string> &path)
+ : op(op), path(path) {}
+
+ explicit InProgressOp(bufferlist::const_iterator &bl) {
+ decode(bl);
+ }
+
+ bool is_split() const { return op == SPLIT; }
+ bool is_col_split() const { return op == COL_SPLIT; }
+ bool is_merge() const { return op == MERGE; }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ __u8 v = 1; // encoding version; decode() asserts on it
+ encode(v, bl);
+ encode(op, bl);
+ encode(path, bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(op, bl);
+ decode(path, bl);
+ }
+ };
+
+
+public:
+ /// Constructor.
+ HashIndex(
+ CephContext* cct,
+ coll_t collection, ///< [in] Collection
+ const char *base_path, ///< [in] Path to the index root.
+ int merge_at, ///< [in] Merge threshold.
+ int split_multiple, ///< [in] Split threshold.
+ uint32_t index_version,///< [in] Index version
+ double retry_probability=0) ///< [in] retry probability
+ : LFNIndex(cct, collection, base_path, index_version, retry_probability),
+ merge_threshold(merge_at),
+ split_multiplier(split_multiple)
+ {}
+
+ int read_settings() override;
+
+ /// @see CollectionIndex
+ uint32_t collection_version() override { return index_version; }
+
+ /// @see CollectionIndex
+ int cleanup() override;
+
+ /// @see CollectionIndex
+ int prep_delete() override;
+
+ /// @see CollectionIndex
+ int _split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override;
+
+ /// @see CollectionIndex
+ int _merge(
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override;
+
+ int _merge_dirs(
+ HashIndex& from,
+ HashIndex& to,
+ const vector<string>& path);
+
+ /// @see CollectionIndex
+ int apply_layout_settings(int target_level) override;
+
+protected:
+ int _init() override;
+
+ int _created(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name
+ ) override;
+ int _remove(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name
+ ) override;
+ int _lookup(
+ const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink
+ ) override;
+
+ /**
+ * Pre-hash the collection to create folders according to the expected number
+ * of objects in this collection.
+ */
+ int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) override;
+
+ int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) override;
+private:
+ /// Internal recursively remove path and its subdirs
+ int _recursive_remove(
+ const vector<string> &path, ///< [in] path to remove
+ bool top ///< [in] internal tracking of first caller
+ ); /// @return Error Code, 0 on success
+ /// Recursively remove path and its subdirs
+ int recursive_remove(
+ const vector<string> &path ///< [in] path to remove
+ ); /// @return Error Code, 0 on success
+ /// Tag root directory at beginning of col_split
+ int start_col_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of split
+ int start_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of merge
+ int start_merge(
+ const vector<string> &path ///< [in] path to merge
+ ); ///< @return Error Code, 0 on success
+ /// Remove tag at end of split or merge
+ int end_split_or_merge(
+ const vector<string> &path ///< [in] path to split or merged
+ ); ///< @return Error Code, 0 on success
+ /// Gets info from the xattr on the subdir represented by path
+ int get_info(
+ const vector<string> &path, ///< [in] Path from which to read attribute.
+ subdir_info_s *info ///< [out] Attribute value
+ ); /// @return Error Code, 0 on success
+
+ /// Sets info to the xattr on the subdir represented by path
+ int set_info(
+ const vector<string> &path, ///< [in] Path on which to set attribute.
+ const subdir_info_s &info ///< [in] Value to set
+ ); /// @return Error Code, 0 on success
+
+ /// Encapsulates logic for when to merge.
+ bool must_merge(
+ const subdir_info_s &info ///< [in] Info to check
+ ); /// @return True if info must be merged, False otherwise
+
+ /// Encapsulates logic for when to split.
+ bool must_split(
+ const subdir_info_s &info, ///< [in] Info to check
+ int target_level = 0
+ ); /// @return True if info must be split, False otherwise
+
+ /// Initiates merge
+ int initiate_merge(
+ const vector<string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes merge
+ int complete_merge(
+ const vector<string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Resets attr to match actual subdir contents
+ int reset_attr(
+ const vector<string> &path ///< [in] path to cleanup
+ );
+
+ /// Initiate Split
+ int initiate_split(
+ const vector<string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes Split
+ int complete_split(
+ const vector<string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Determine path components from hoid hash
+ void get_path_components(
+ const ghobject_t &oid, ///< [in] Object for which to get path components
+ vector<string> *path ///< [out] Path components for hoid.
+ );
+
+ /// Pre-hash and split folders to avoid runtime splitting
+ /// according to the given expected object number.
+ int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+ /// Initialize the folder (dir info) with the given hash
+ /// level and number of its subdirs.
+ int init_split_folder(vector<string> &path, uint32_t hash_level);
+
+ /// do collection split for path
+ static int col_split_level(
+ HashIndex &from, ///< [in] from index
+ HashIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ uint32_t bits, ///< [in] num bits to match
+ uint32_t match, ///< [in] bits to match
+ unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred
+ );
+
+
+ /**
+ * Get string representation of ghobject_t/hash
+ *
+ * e.g: 0x01234567 -> "76543210"
+ */
+ static string get_path_str(
+ const ghobject_t &oid ///< [in] Object to get hash string for
+ ); ///< @return Hash string for hoid.
+
+ /// Get string from hash, @see get_path_str
+ static string get_hash_str(
+ uint32_t hash ///< [in] Hash to convert to a string.
+ ); ///< @return String representation of hash
+
+ /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00
+ static uint32_t hash_prefix_to_hash(
+ string prefix ///< [in] string to convert
+ ); ///< @return Hash
+
+ /// Get hash prefix and its bit count (4 bits per path component) from path
+ static void path_to_hobject_hash_prefix(
+ const vector<string> &path,///< [in] path to convert
+ uint32_t *bits, ///< [out] bits
+ uint32_t *hash ///< [out] hash
+ ) {
+ string hash_str;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ hash_str.push_back(*i->begin()); // first character of each path component
+ }
+ uint32_t rev_hash = hash_prefix_to_hash(hash_str);
+ if (hash)
+ *hash = rev_hash;
+ if (bits)
+ *bits = path.size() * 4;
+ }
+
+ /// Calculate the number of bits needed to represent n (0 when n == 0).
+ static int calc_num_bits(uint64_t n) {
+ int ret = 0;
+ while (n > 0) {
+ n = n >> 1;
+ ret++;
+ }
+ return ret;
+ }
+
+ /// Convert a number in [0, 16) to a one-character hex string (upper case).
+ static string to_hex(int n) {
+ ceph_assert(n >= 0 && n < 16);
+ char c = (n <= 9 ? ('0' + n) : ('A' + n - 10));
+ string str;
+ str.append(1, c);
+ return str;
+ }
+ /// Orders (prefix, object) pairs by prefix string, then by cmp() on the objects
+ struct CmpPairBitwise {
+ bool operator()(const pair<string, ghobject_t>& l,
+ const pair<string, ghobject_t>& r) const
+ {
+ if (l.first < r.first)
+ return true;
+ if (l.first > r.first)
+ return false;
+ if (cmp(l.second, r.second) < 0)
+ return true;
+ return false;
+ }
+ };
+ /// Orders strings by comparing their reverse_hexdigit_bits_string() images
+ struct CmpHexdigitStringBitwise {
+ bool operator()(const string& l, const string& r) const {
+ return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+ }
+ };
+
+ /// Get path contents by hash
+ int get_path_contents_by_hash_bitwise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t *next_object, /// [in] list > *next_object
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
+ );
+
+ /// List objects in collection in ghobject_t order
+ int list_by_hash(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+ /// List objects in collection in ghobject_t order (bitwise hash ordering)
+ int list_by_hash_bitwise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+
+ /// Create the given levels of sub directories from the given root.
+ /// The contents of *path* are not changed after calling this function.
+ int recursive_create_path(vector<string>& path, int level);
+
+ /// split each dir below the given path
+ int split_dirs(const vector<string> &path, int target_level = 0);
+
+ int write_settings();
+};
+
+#endif
diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc
new file mode 100644
index 00000000..73095026
--- /dev/null
+++ b/src/os/filestore/IndexManager.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/unordered_map.h"
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include <errno.h>
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+
+#include "IndexManager.h"
+#include "HashIndex.h"
+#include "CollectionIndex.h"
+
+#include "chain_xattr.h"
+
+// Stores version, encoded into a bufferlist, in the
+// "user.cephos.collection_version" xattr of path and returns the
+// result of chain_setxattr().
+static int set_version(const char *path, uint32_t version) {
+  bufferlist encoded;
+  encode(version, encoded);
+  const int r = chain_setxattr<true, true>(
+    path,
+    "user.cephos.collection_version",
+    encoded.c_str(),
+    encoded.length());
+  return r;
+}
+
+// Reads the collection version from the "user.cephos.collection_version"
+// xattr of path into *version.
+// -ENOENT from the xattr read is returned unchanged; any other read
+// error is reported as success with *version set to 0.
+static int get_version(const char *path, uint32_t *version) {
+  bufferptr raw(PATH_MAX);
+  const int len = chain_getxattr(path, "user.cephos.collection_version",
+                                 raw.c_str(), raw.length());
+  if (len == -ENOENT)
+    return len;
+  if (len < 0) {
+    *version = 0;
+    return 0;
+  }
+
+  // Trim the buffer to the length returned by the read, then decode it.
+  raw.set_length(len);
+  bufferlist payload;
+  payload.push_back(raw);
+  auto it = payload.cbegin();
+  decode(*version, it);
+  return 0;
+}
+
+// Deletes every cached CollectionIndex and empties the cache.
+IndexManager::~IndexManager() {
+  for (auto &entry : col_indices) {
+    delete entry.second;
+    entry.second = NULL;
+  }
+  col_indices.clear();
+}
+
+
+// Stamps the collection directory at path with version, then
+// initializes a temporary HashIndex over it and reads its settings.
+// Runs under the manager's write lock.  Returns the first negative
+// result encountered, otherwise the result of read_settings().
+int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
+  RWLock::WLocker l(lock);
+
+  int r = set_version(path, version);
+  if (r < 0)
+    return r;
+
+  HashIndex index(
+    cct, c, path,
+    cct->_conf->filestore_merge_threshold,
+    cct->_conf->filestore_split_multiple,
+    version,
+    cct->_conf->filestore_index_retry_probability);
+  r = index.init();
+  return r < 0 ? r : index.read_settings();
+}
+
+// Creates a HashIndex for collection c rooted at path and stores it in
+// *index.  When the manager was constructed with upgrade set, the
+// on-disk collection version is read first and must be one of the known
+// index tags (anything else aborts); otherwise HOBJECT_WITH_POOL is
+// used without checking.  Returns a negative error from get_version(),
+// or the result of read_settings() on the new index.
+int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) {
+  if (!upgrade) {
+    // No need to check
+    *index = new HashIndex(cct, c, path,
+                           cct->_conf->filestore_merge_threshold,
+                           cct->_conf->filestore_split_multiple,
+                           CollectionIndex::HOBJECT_WITH_POOL,
+                           cct->_conf->filestore_index_retry_probability);
+    return (*index)->read_settings();
+  }
+
+  // Need to check the collection generation
+  uint32_t version = 0;
+  const int r = get_version(path, &version);
+  if (r < 0)
+    return r;
+
+  switch (version) {
+  case CollectionIndex::FLAT_INDEX_TAG:
+  case CollectionIndex::HASH_INDEX_TAG:
+  case CollectionIndex::HASH_INDEX_TAG_2:
+  case CollectionIndex::HOBJECT_WITH_POOL:
+    // Must be a HashIndex
+    break;
+  default:
+    ceph_abort();
+  }
+
+  *index = new HashIndex(cct, c, path,
+                         cct->_conf->filestore_merge_threshold,
+                         cct->_conf->filestore_split_multiple,
+                         version);
+  return (*index)->read_settings();
+}
+
+// Fast path for get_index(): looks c up in the cache under the read
+// lock.  Returns true and fills *index on a hit, false on a miss.
+bool IndexManager::get_index_optimistic(coll_t c, Index *index) {
+  RWLock::RLocker l(lock);
+  auto found = col_indices.find(c);
+  const bool hit = (found != col_indices.end());
+  if (hit)
+    index->index = found->second;
+  return hit;
+}
+
+// Returns, through *index, the CollectionIndex for collection c.
+//
+// The cache is first probed under the read lock.  On a miss the write
+// lock is taken, the cache is re-checked (another caller may have built
+// the index in between), and otherwise a new index is built for
+// "<baseDir>/current/<c>" and cached.
+//
+// Returns 0 on success, -ENAMETOOLONG if the collection path does not
+// fit in PATH_MAX, or the negative error from build_index().
+int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) {
+  if (get_index_optimistic(c, index))
+    return 0;
+
+  RWLock::WLocker l(lock);
+  ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+  if (it != col_indices.end()) {
+    index->index = it->second;
+    return 0;
+  }
+
+  char path[PATH_MAX];
+  const int len = snprintf(path, sizeof(path), "%s/current/%s",
+                           baseDir.c_str(), c.to_str().c_str());
+  // snprintf reports the length it wanted to write; refuse to operate
+  // on a silently truncated (and therefore wrong) path.
+  if (len < 0 || static_cast<size_t>(len) >= sizeof(path))
+    return -ENAMETOOLONG;
+
+  CollectionIndex* colIndex = NULL;
+  const int r = build_index(c, path, &colIndex);
+  if (r < 0) {
+    // build_index() may have allocated the index before failing in
+    // read_settings(); it is not cached, so release it here.
+    delete colIndex;
+    return r;
+  }
+  col_indices[c] = colIndex;
+  index->index = colIndex;
+  return 0;
+}
diff --git a/src/os/filestore/IndexManager.h b/src/os/filestore/IndexManager.h
new file mode 100644
index 00000000..19cd2926
--- /dev/null
+++ b/src/os/filestore/IndexManager.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef OS_INDEXMANAGER_H
+#define OS_INDEXMANAGER_H
+
+#include "include/unordered_map.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "CollectionIndex.h"
+#include "HashIndex.h"
+
+
+/**
+ * Public type for Index: a thin handle wrapping a CollectionIndex
+ * pointer.  The struct itself never deletes the pointer.
+ */
+struct Index {
+ CollectionIndex *index; ///< wrapped index; NULL when default-constructed
+
+ Index() : index(NULL) {}
+ explicit Index(CollectionIndex* index) : index(index) {}
+
+ CollectionIndex *operator->() { return index; } // no NULL check
+ CollectionIndex &operator*() { return *index; } // no NULL check
+};
+
+
+/**
+ * Encapsulates mutual exclusion for CollectionIndexes.
+ *
+ * Allowing a modification (removal or addition of an object) to occur
+ * while a read is occurring (lookup of an object's path and use of
+ * that path) may result in the path becoming invalid. Thus, during
+ * the lifetime of a CollectionIndex object and any paths returned
+ * by it, no other concurrent accesses may be allowed.
+ * This is enforced by using CollectionIndex::access_lock
+ */
+class IndexManager {
+ CephContext* cct;
+ RWLock lock; ///< Lock for Index Manager
+ bool upgrade; ///< when set, build_index() reads the on-disk collection version
+ ceph::unordered_map<coll_t, CollectionIndex* > col_indices; ///< indices built so far; deleted by the destructor
+
+ /**
+ * Index factory
+ *
+ * Encapsulates logic for handling legacy FileStore
+ * layouts
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] path Path to collection
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int build_index(coll_t c, const char *path, CollectionIndex **index);
+ bool get_index_optimistic(coll_t c, Index *index); ///< cache lookup only; @return true if c was found
+public:
+ /// Constructor
+ explicit IndexManager(CephContext* cct,
+ bool upgrade) : cct(cct),
+ lock("IndexManager lock"),
+ upgrade(upgrade) {}
+
+ ~IndexManager();
+
+ /**
+ * Reserve and return index for c
+ *
+ * Returns the cached index for c if there is one; otherwise builds
+ * it for the collection directory under baseDir and caches it.
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] baseDir base directory of collections
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int get_index(coll_t c, const string& baseDir, Index *index);
+
+ /**
+ * Initialize index for collection c at path
+ *
+ * @param [in] c Collection for which to init Index
+ * @param [in] path Path to collection
+ * @param [in] filestore_version version of containing FileStore
+ * @return error code
+ */
+ int init_index(coll_t c, const char *path, uint32_t filestore_version);
+};
+
+#endif
diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h
new file mode 100644
index 00000000..cfb667d8
--- /dev/null
+++ b/src/os/filestore/Journal.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_JOURNAL_H
+#define CEPH_JOURNAL_H
+
+#include <errno.h>
+
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "common/Finisher.h"
+#include "common/TrackedOp.h"
+#include "os/ObjectStore.h"
+#include "common/zipkin_trace.h"
+
+class PerfCounters;
+
+class Journal { // abstract journal interface: most operations are pure virtual
+protected:
+ uuid_d fsid;
+ Finisher *finisher;
+public:
+ CephContext* cct;
+ PerfCounters *logger; ///< NULL after construction
+protected:
+ Cond *do_sync_cond; ///< optional; defaults to 0 in the constructor
+ bool wait_on_full; ///< false after construction; changed via set_wait_on_full()
+
+public:
+ Journal(CephContext* cct, uuid_d f, Finisher *fin, Cond *c=0) :
+ fsid(f), finisher(fin), cct(cct), logger(NULL),
+ do_sync_cond(c),
+ wait_on_full(false) { }
+ virtual ~Journal() { }
+
+ virtual int check() = 0; ///< check if journal appears valid
+ virtual int create() = 0; ///< create a fresh journal
+ virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal
+ virtual void close() = 0; ///< close an open journal
+
+ virtual void flush() = 0;
+
+ virtual void get_devices(set<string> *ls) {} ///< default implementation does nothing
+ virtual void collect_metadata(map<string,string> *pm) {} ///< default implementation does nothing
+ /**
+ * reserve_throttle_and_backoff
+ *
+ * Implementation may throttle or backoff based on ops
+ * reserved here but not yet released using committed_thru.
+ */
+ virtual void reserve_throttle_and_backoff(uint64_t count) = 0;
+
+ virtual int dump(ostream& out) { return -EOPNOTSUPP; } ///< default implementation reports "not supported"
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ // writes
+ virtual bool is_writeable() = 0;
+ virtual int make_writeable() = 0;
+ virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) = 0;
+ virtual void commit_start(uint64_t seq) = 0;
+ virtual void committed_thru(uint64_t seq) = 0;
+
+ /// Read next journal entry - asserts on invalid journal
+ virtual bool read_entry(
+ bufferlist &bl, ///< [out] payload on successful read
+ uint64_t &seq ///< [in,out] sequence number on last successful read
+ ) = 0; ///< @return true on successful read, false on journal end
+
+ virtual bool should_commit_now() = 0;
+
+ virtual int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) = 0;
+
+ virtual off64_t get_journal_size_estimate() { return 0; } ///< default implementation returns 0
+
+ // reads/recovery
+
+};
+
+#endif
diff --git a/src/os/filestore/JournalThrottle.cc b/src/os/filestore/JournalThrottle.cc
new file mode 100644
index 00000000..8475bbbf
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "JournalThrottle.h"
+#include "include/ceph_assert.h"
+
+// Passes all parameters through, unchanged and in order, to the
+// wrapped throttle and returns its result.
+bool JournalThrottle::set_params(
+  double _low_threshhold,
+  double _high_threshhold,
+  double _expected_throughput,
+  double _high_multiple,
+  double _max_multiple,
+  uint64_t _throttle_max,
+  std::ostream *errstream)
+{
+  const bool accepted = throttle.set_params(_low_threshhold,
+                                            _high_threshhold,
+                                            _expected_throughput,
+                                            _high_multiple,
+                                            _max_multiple,
+                                            _throttle_max,
+                                            errstream);
+  return accepted;
+}
+
+// Requests c from the wrapped throttle and returns the duration it
+// reports.
+std::chrono::duration<double> JournalThrottle::get(uint64_t c)
+{
+  const std::chrono::duration<double> waited = throttle.get(c);
+  return waited;
+}
+
+// Forwards to the wrapped throttle's take() and returns its result.
+uint64_t JournalThrottle::take(uint64_t c)
+{
+  const uint64_t taken = throttle.take(c);
+  return taken;
+}
+
+// Appends (seq, c) to the queue of journaled ops, under the lock, so
+// that flush() can later release c for this sequence number.
+void JournalThrottle::register_throttle_seq(uint64_t seq, uint64_t c)
+{
+  locker guard(lock);
+  journaled_ops.emplace_back(seq, c);
+}
+
+// Pops every queued op whose sequence number is <= mono_id, returns
+// the summed throttle to the wrapped throttle, and reports
+// pair<ops released, bytes released>.  The put() happens after the
+// queue lock has been dropped.
+std::pair<uint64_t, uint64_t> JournalThrottle::flush(uint64_t mono_id)
+{
+  uint64_t released_bytes = 0;
+  uint64_t released_ops = 0;
+  {
+    locker guard(lock);
+    for (; !journaled_ops.empty() && journaled_ops.front().first <= mono_id;
+         journaled_ops.pop_front()) {
+      released_bytes += journaled_ops.front().second;
+      ++released_ops;
+    }
+  }
+  throttle.put(released_bytes);
+  return std::make_pair(released_ops, released_bytes);
+}
+
+// Returns the wrapped throttle's current value.
+uint64_t JournalThrottle::get_current()
+{
+  const uint64_t current = throttle.get_current();
+  return current;
+}
+
+// Returns the wrapped throttle's maximum.
+uint64_t JournalThrottle::get_max()
+{
+  const uint64_t maximum = throttle.get_max();
+  return maximum;
+}
diff --git a/src/os/filestore/JournalThrottle.h b/src/os/filestore/JournalThrottle.h
new file mode 100644
index 00000000..75485d6d
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_THROTTLE_H
+#define CEPH_JOURNAL_THROTTLE_H
+
+#include "common/Throttle.h"
+
+#include <list>
+#include <deque>
+#include <condition_variable>
+#include <thread>
+#include <vector>
+#include <chrono>
+#include <iostream>
+
+/**
+ * JournalThrottle
+ *
+ * Throttle designed to implement dynamic throttling as the journal fills
+ * up. The goal is to not delay ops at all when the journal is relatively
+ * empty, delay ops somewhat as the journal begins to fill (with the delay
+ * getting linearly longer as the journal fills up to a high water mark),
+ * and to delay much more aggressively (though still linearly with usage)
+ * until we hit the max value.
+ *
+ * The implementation simply wraps BackoffThrottle with a queue of
+ * journaled but not synced ops.
+ *
+ * The usage pattern is as follows:
+ * 1) Call get(bytes) before taking the op_queue_throttle, and
+ *    register_throttle_seq(seq, bytes) to tie that amount to the op
+ * 2) Once the journal is flushed, call flush(max_op_id_flushed)
+ */
+class JournalThrottle {
+ BackoffThrottle throttle;
+ /// guards journaled_ops
+ std::mutex lock;
+ /// deque<seq, count>: throttle registered per sequence number, oldest first
+ std::deque<std::pair<uint64_t, uint64_t> > journaled_ops;
+ using locker = std::unique_lock<std::mutex>;
+
+public:
+ /**
+ * set_params
+ *
+ * Sets params. If the params are invalid, returns false
+ * and populates errstream (if non-null) with a user-comprehensible
+ * explanation.
+ */
+ bool set_params(
+ double low_threshhold,
+ double high_threshhold,
+ double expected_throughput,
+ double high_multiple,
+ double max_multiple,
+ uint64_t throttle_max,
+ std::ostream *errstream);
+
+ /**
+ * Gets the specified amount of throttle, waiting as necessary
+ *
+ * @param c [in] amount to take
+ * @return duration waited
+ */
+ std::chrono::duration<double> get(uint64_t c);
+
+ /**
+ * take
+ *
+ * Takes specified throttle without waiting
+ */
+ uint64_t take(uint64_t c);
+
+ /**
+ * register_throttle_seq
+ *
+ * Registers a sequence number with an amount of throttle to
+ * release upon flush()
+ *
+ * @param seq [in] seq
+ * @param c [in] amount of throttle to release for seq
+ */
+ void register_throttle_seq(uint64_t seq, uint64_t c);
+
+
+ /**
+ * Releases throttle held by ids <= mono_id
+ *
+ * @param mono_id [in] id up to which to flush
+ * @returns pair<ops_flushed, bytes_flushed>
+ */
+ std::pair<uint64_t, uint64_t> flush(uint64_t mono_id);
+
+ uint64_t get_current(); ///< forwards to the wrapped throttle
+ uint64_t get_max(); ///< forwards to the wrapped throttle
+
+ JournalThrottle(
+ unsigned expected_concurrency ///< [in] determines size of conds
+ ) : throttle(g_ceph_context, "filestore_journal", expected_concurrency) {}
+};
+
+#endif
diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc
new file mode 100644
index 00000000..714d0935
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "JournalingObjectStore.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+
+
+void JournalingObjectStore::journal_start() // logs the call and starts the finisher
+{
+ dout(10) << "journal_start" << dendl;
+ finisher.start(); // journal_stop() is the matching shutdown
+}
+
+void JournalingObjectStore::journal_stop() // logs the call, then drains and stops the finisher
+{
+ dout(10) << "journal_stop" << dendl;
+ finisher.wait_for_empty(); // presumably lets already-queued work finish before stop() — confirm in Finisher
+ finisher.stop();
+}
+
+/*
+ * A journal_replay() makes the journal writeable; this closes that out:
+ * the journal (if any) is closed and deleted and its pointer cleared,
+ * then the apply manager is reset.
+ */
+void JournalingObjectStore::journal_write_close()
+{
+ if (journal) {
+ journal->close();
+ delete journal;
+ journal = 0;
+ }
+ apply_manager.reset();
+}
+
+int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
+{
+ dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl;
+ /*
+ * Replays journal entries newer than fs_op_seq and then makes the
+ * journal writeable.
+ *
+ * Returns the number of entries replayed (0 when there is no journal),
+ * or a negative error from journal->open() / make_writeable().
+ * When the journal_replay_from config value is non-zero it overrides
+ * fs_op_seq.
+ */
+ if (cct->_conf->journal_replay_from) {
+ dout(0) << "journal_replay forcing replay from "
+ << cct->_conf->journal_replay_from
+ << " instead of " << fs_op_seq << dendl;
+ // the previous op is the last one committed
+ fs_op_seq = cct->_conf->journal_replay_from - 1;
+ }
+
+ uint64_t op_seq = fs_op_seq; // last sequence number known to be applied
+ apply_manager.init_seq(fs_op_seq);
+ // without a journal there is nothing to replay: just seed the submit sequence
+ if (!journal) {
+ submit_manager.set_op_seq(op_seq);
+ return 0;
+ }
+
+ int err = journal->open(op_seq);
+ if (err < 0) {
+ dout(3) << "journal_replay open failed with "
+ << cpp_strerror(err) << dendl;
+ delete journal; // the journal is dropped on open failure
+ journal = 0;
+ return err;
+ }
+
+ replaying = true; // cleared again once the read loop below ends
+
+ int count = 0; // number of entries applied
+ while (1) {
+ bufferlist bl;
+ uint64_t seq = op_seq + 1;
+ if (!journal->read_entry(bl, seq)) {
+ dout(3) << "journal_replay: end of journal, done." << dendl;
+ break;
+ }
+
+ if (seq <= op_seq) {
+ dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl;
+ continue;
+ }
+ ceph_assert(op_seq == seq-1); // entries must be contiguous
+
+ dout(3) << "journal_replay: applying op seq " << seq << dendl;
+ auto p = bl.cbegin();
+ vector<ObjectStore::Transaction> tls;
+ while (!p.end()) { // an entry may hold several encoded transactions
+ tls.emplace_back(Transaction(p));
+ }
+
+ apply_manager.op_apply_start(seq);
+ int r = do_transactions(tls, seq); // NOTE(review): r is only logged below, never acted on
+ apply_manager.op_apply_finish(seq);
+
+ op_seq = seq;
+ count++;
+
+ dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl;
+ }
+
+ if (count)
+ dout(3) << "journal_replay: total = " << count << dendl;
+
+ replaying = false;
+ // new submissions continue after the last replayed op
+ submit_manager.set_op_seq(op_seq);
+
+ // done reading, make writeable.
+ err = journal->make_writeable();
+ if (err < 0)
+ return err;
+ // nothing was replayed: report fs_op_seq to the journal as already committed
+ if (!count)
+ journal->committed_thru(fs_op_seq);
+
+ return count;
+}
+
+
+// ------------------------------------
+
+uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op) // registers op as an open apply; returns op
+{
+ Mutex::Locker l(apply_lock);
+ while (blocked) { // set by commit_start(), cleared by commit_start()/commit_started()
+ dout(10) << "op_apply_start blocked, waiting" << dendl;
+ blocked_cond.Wait(apply_lock);
+ }
+ dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> "
+ << (open_ops+1) << dendl;
+ ceph_assert(!blocked);
+ ceph_assert(op > committed_seq); // an op being applied must be newer than the last committed one
+ open_ops++;
+ return op;
+}
+
+void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op) // closes an open apply and records op in max_applied_seq
+{
+ Mutex::Locker l(apply_lock);
+ dout(10) << "op_apply_finish " << op << " open_ops " << open_ops << " -> "
+ << (open_ops-1) << ", max_applied_seq " << max_applied_seq << " -> "
+ << std::max(op, max_applied_seq) << dendl;
+ --open_ops;
+ ceph_assert(open_ops >= 0); // every finish must pair with an earlier op_apply_start()
+
+ // signal a blocked commit_start
+ if (blocked) {
+ blocked_cond.Signal();
+ }
+
+ // there can be multiple applies in flight; track the max value we
+ // note. note that we can't _read_ this value and learn anything
+ // meaningful unless/until we've quiesced all in-flight applies.
+ if (op > max_applied_seq)
+ max_applied_seq = op;
+}
+
+uint64_t JournalingObjectStore::SubmitManager::op_submit_start() // allocates and returns the next op sequence number
+{
+ lock.Lock(); // stays locked on return; released by op_submit_finish()
+ uint64_t op = ++op_seq;
+ dout(10) << "op_submit_start " << op << dendl;
+ return op;
+}
+
+void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op) // completes the submission begun by op_submit_start()
+{
+ dout(10) << "op_submit_finish " << op << dendl;
+ if (op != op_submitted + 1) { // ops must finish in exactly the order they were started
+ dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+ << ", OUT OF ORDER" << dendl;
+ ceph_abort_msg("out of order op_submit_finish");
+ }
+ op_submitted = op;
+ lock.Unlock(); // pairs with the Lock() taken in op_submit_start()
+}
+
+
+// ------------------------------------------
+
+void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c) // parks c until commit_finish() covers op
+{
+ Mutex::Locker l(com_lock);
+ ceph_assert(c);
+ commit_waiters[op].push_back(c); // queued to the finisher by commit_finish()
+}
+
+bool JournalingObjectStore::ApplyManager::commit_start()
+{
+ bool ret = false;
+ /*
+ * Blocks new applies, waits for the open ones to drain, and picks
+ * max_applied_seq as the sequence to commit.
+ *
+ * Returns false (and unblocks applies again) when nothing was applied
+ * since the last commit; returns true with applies still blocked
+ * otherwise, in which case commit_started() unblocks them.
+ */
+ {
+ Mutex::Locker l(apply_lock);
+ dout(10) << "commit_start max_applied_seq " << max_applied_seq
+ << ", open_ops " << open_ops << dendl;
+ blocked = true; // makes op_apply_start() wait from here on
+ while (open_ops > 0) {
+ dout(10) << "commit_start waiting for " << open_ops
+ << " open ops to drain" << dendl;
+ blocked_cond.Wait(apply_lock); // woken by op_apply_finish()
+ }
+ ceph_assert(open_ops == 0);
+ dout(10) << "commit_start blocked, all open_ops have completed" << dendl;
+ {
+ Mutex::Locker l(com_lock); // taken while apply_lock is still held
+ if (max_applied_seq == committed_seq) {
+ dout(10) << "commit_start nothing to do" << dendl;
+ blocked = false;
+ ceph_assert(commit_waiters.empty());
+ goto out; // leaves both lock scopes; ret is still false
+ }
+
+ committing_seq = max_applied_seq;
+
+ dout(10) << "commit_start committing " << committing_seq
+ << ", still blocked" << dendl;
+ }
+ }
+ ret = true;
+ // both locks have been released by this point
+ if (journal)
+ journal->commit_start(committing_seq); // tell the journal too
+ out:
+ return ret;
+}
+
+void JournalingObjectStore::ApplyManager::commit_started() // lifts the block set by commit_start()
+{
+ Mutex::Locker l(apply_lock);
+ // allow new ops. (underlying fs should now be committing all prior ops)
+ dout(10) << "commit_started committing " << committing_seq << ", unblocking"
+ << dendl;
+ blocked = false;
+ blocked_cond.Signal(); // wakes a waiter in op_apply_start()
+}
+
+void JournalingObjectStore::ApplyManager::commit_finish() // records committing_seq as committed and releases its waiters
+{
+ Mutex::Locker l(com_lock);
+ dout(10) << "commit_finish thru " << committing_seq << dendl;
+
+ if (journal)
+ journal->committed_thru(committing_seq);
+
+ committed_seq = committing_seq;
+ // hand every waiter registered for an op <= committing_seq to the finisher
+ map<version_t, vector<Context*> >::iterator p = commit_waiters.begin();
+ while (p != commit_waiters.end() &&
+ p->first <= committing_seq) {
+ finisher.queue(p->second);
+ commit_waiters.erase(p++); // post-increment keeps p valid across the erase
+ }
+}
+
+// Hands the encoded transactions for op to the journal when one is
+// set and writeable; otherwise a non-null onjournal is registered with
+// the apply manager as a commit waiter for op.
+void JournalingObjectStore::_op_journal_transactions(
+  bufferlist& tbl, uint32_t orig_len, uint64_t op,
+  Context *onjournal, TrackedOpRef osd_op)
+{
+  if (osd_op.get()) {
+    dout(10) << "op_journal_transactions " << op << " reqid_t "
+             << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
+  } else {
+    dout(10) << "op_journal_transactions " << op << dendl;
+  }
+
+  const bool journal_usable = journal && journal->is_writeable();
+  if (journal_usable) {
+    journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
+    return;
+  }
+  if (onjournal)
+    apply_manager.add_waiter(op, onjournal);
+}
diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h
new file mode 100644
index 00000000..a289d0e8
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_JOURNALINGOBJECTSTORE_H
+#define CEPH_JOURNALINGOBJECTSTORE_H
+
+#include "os/ObjectStore.h"
+#include "Journal.h"
+#include "FileJournal.h"
+#include "common/RWLock.h"
+#include "osd/OpRequest.h"
+
+// Abstract ObjectStore subclass (do_transactions() is pure virtual) that
+// holds a Journal pointer and a Finisher, plus two helper objects declared
+// below: SubmitManager (op sequence counters) and ApplyManager
+// (apply/commit sequence bookkeeping).
+class JournalingObjectStore : public ObjectStore {
+protected:
+ Journal *journal;
+ Finisher finisher;
+
+
+ // Owns the op_seq / op_submitted counters and the mutex taken by
+ // set_op_seq(). op_submit_start()/op_submit_finish() are defined elsewhere.
+ class SubmitManager {
+ CephContext* cct;
+ Mutex lock;
+ uint64_t op_seq;
+ uint64_t op_submitted;
+ public:
+ SubmitManager(CephContext* cct) :
+ cct(cct), lock("JOS::SubmitManager::lock", false, true, false),
+ op_seq(0), op_submitted(0)
+ {}
+ uint64_t op_submit_start();
+ void op_submit_finish(uint64_t op);
+ // Sets both counters to seq while holding `lock`.
+ void set_op_seq(uint64_t seq) {
+ Mutex::Locker l(lock);
+ op_submitted = op_seq = seq;
+ }
+ // NOTE(review): reads op_seq without taking `lock`, unlike set_op_seq()
+ // -- confirm callers tolerate an unsynchronized read.
+ uint64_t get_op_seq() {
+ return op_seq;
+ }
+ } submit_manager;
+
+ // Tracks applied / committing / committed sequence numbers and the
+ // contexts waiting on a commit. Holds references to the enclosing store's
+ // journal pointer and finisher rather than copies.
+ class ApplyManager {
+ CephContext* cct;
+ Journal *&journal;
+ Finisher &finisher;
+
+ Mutex apply_lock;
+ bool blocked;
+ Cond blocked_cond;
+ int open_ops;
+ uint64_t max_applied_seq;
+
+ Mutex com_lock;
+ // waiter lists keyed by sequence number
+ map<version_t, vector<Context*> > commit_waiters;
+ uint64_t committing_seq, committed_seq;
+
+ public:
+ ApplyManager(CephContext* cct, Journal *&j, Finisher &f) :
+ cct(cct), journal(j), finisher(f),
+ apply_lock("JOS::ApplyManager::apply_lock", false, true, false),
+ blocked(false),
+ open_ops(0),
+ max_applied_seq(0),
+ com_lock("JOS::ApplyManager::com_lock", false, true, false),
+ committing_seq(0), committed_seq(0) {}
+ // Asserts that no ops are open and nothing is blocked, then zeroes the
+ // three sequence counters. No lock is taken here.
+ void reset() {
+ ceph_assert(open_ops == 0);
+ ceph_assert(blocked == false);
+ max_applied_seq = 0;
+ committing_seq = 0;
+ committed_seq = 0;
+ }
+ void add_waiter(uint64_t, Context*);
+ uint64_t op_apply_start(uint64_t op);
+ void op_apply_finish(uint64_t op);
+ bool commit_start();
+ void commit_started();
+ void commit_finish();
+ // True while committing_seq differs from committed_seq (read under
+ // com_lock).
+ bool is_committing() {
+ Mutex::Locker l(com_lock);
+ return committing_seq != committed_seq;
+ }
+ uint64_t get_committed_seq() {
+ Mutex::Locker l(com_lock);
+ return committed_seq;
+ }
+ uint64_t get_committing_seq() {
+ Mutex::Locker l(com_lock);
+ return committing_seq;
+ }
+ // Seeds all three sequence counters with fs_op_seq: the commit counters
+ // under com_lock, then max_applied_seq under apply_lock.
+ void init_seq(uint64_t fs_op_seq) {
+ {
+ Mutex::Locker l(com_lock);
+ committed_seq = fs_op_seq;
+ committing_seq = fs_op_seq;
+ }
+ {
+ Mutex::Locker l(apply_lock);
+ max_applied_seq = fs_op_seq;
+ }
+ }
+ } apply_manager;
+
+ bool replaying;
+
+protected:
+ void journal_start();
+ void journal_stop();
+ void journal_write_close();
+ int journal_replay(uint64_t fs_op_seq);
+
+ void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op);
+
+ // Implemented by the concrete store.
+ virtual int do_transactions(vector<ObjectStore::Transaction>& tls, uint64_t op_seq) = 0;
+
+public:
+ // Both accessors forward to apply_manager.
+ bool is_committing() {
+ return apply_manager.is_committing();
+ }
+ uint64_t get_committed_seq() {
+ return apply_manager.get_committed_seq();
+ }
+
+public:
+ // journal starts out NULL. apply_manager receives references to `journal`
+ // and `finisher`, both of which are declared before it in this class.
+ JournalingObjectStore(CephContext* cct, const std::string& path)
+ : ObjectStore(cct, path),
+ journal(NULL),
+ finisher(cct, "JournalObjectStore", "fn_jrn_objstore"),
+ submit_manager(cct),
+ apply_manager(cct, journal, finisher),
+ replaying(false) {}
+
+ ~JournalingObjectStore() override {
+ }
+};
+
+#endif
diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc
new file mode 100644
index 00000000..2451ae8c
--- /dev/null
+++ b/src/os/filestore/LFNIndex.cc
@@ -0,0 +1,1407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <errno.h>
+#include <string.h>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "chain_xattr.h"
+
+#include "LFNIndex.h"
+using ceph::crypto::SHA1;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") "
+
+
+// Definitions of LFNIndex's static constants: attribute name strings, the
+// prefix that marks subdirectory entries (see lfn_is_subdir), and the
+// filename cookie string.
+const string LFNIndex::LFN_ATTR = "user.cephos.lfn";
+const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash.";
+const string LFNIndex::SUBDIR_PREFIX = "DIR_";
+const string LFNIndex::FILENAME_COOKIE = "long";
+// Derived from the other filename length constants minus the cookie length;
+// FILENAME_COOKIE is defined just above, in the same translation unit.
+const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
+ FILENAME_COOKIE.size() -
+ FILENAME_EXTRA;
+// Failure-injection hook. Does nothing unless error injection is enabled.
+// Once the call counter has passed the value recorded at the previous
+// injection, rolls rand() against error_injection_probability and throws
+// RetryException on a hit; otherwise just counts the call.
+void LFNIndex::maybe_inject_failure()
+{
+  if (!error_injection_enabled)
+    return;
+
+  // rand() is only consulted after current_failure exceeds last_failure
+  if (current_failure > last_failure) {
+    const double roll =
+      static_cast<double>(rand() % 10000) / static_cast<double>(10000);
+    if (roll < error_injection_probability) {
+      last_failure = current_failure;
+      current_failure = 0;
+      throw RetryException();
+    }
+  }
+  ++current_failure;
+}
+
+// Scope guard that closes the wrapped file descriptor in its destructor.
+// Useful together with RetryException (thrown by maybe_inject_failure
+// above), since the descriptor is closed on every exit path.
+//
+// Copying is disabled: a copy would leave two guards closing the same
+// descriptor number, and the second close could hit an unrelated file.
+struct FDCloser {
+  int fd;
+  explicit FDCloser(int f) : fd(f) {}
+  FDCloser(const FDCloser&) = delete;
+  FDCloser& operator=(const FDCloser&) = delete;
+  ~FDCloser() {
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+};
+
+
+/* Public methods */
+
+// Returns the length of the "current"-format escaped name generated for obj
+// after its shard id, generation and snap have all been set to 0.
+uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj)
+{
+  ghobject_t probe(obj);
+  probe.hobj.snap = 0;
+  probe.generation = 0;
+  probe.shard_id = shard_id_t(0);
+  const string escaped = lfn_generate_object_name_current(probe);
+  return escaped.size();
+}
+
+// Public entry point; forwards to _init() and returns its result.
+int LFNIndex::init()
+{
+ return _init();
+}
+
+// Records oid, located at `path`, in the index: splits path into directory
+// components and a short name, calls lfn_created() to store the long-name
+// attribute, then calls _created().
+// The body is an argument to WRAP_RETRY (defined elsewhere); `r`, `failed`
+// and the `out` label are not declared here and presumably come from that
+// macro, which also supplies the return statement.
+int LFNIndex::created(const ghobject_t &oid, const char *path)
+{
+ WRAP_RETRY(
+ vector<string> path_comp;
+ string short_name;
+ r = decompose_full_path(path, &path_comp, 0, &short_name);
+ if (r < 0)
+ goto out;
+ r = lfn_created(path_comp, oid, short_name);
+ if (r < 0) {
+ if (failed) {
+ /* This is hacky, but the only way we get ENOENT from lfn_created here is
+ * if we did a failure injection in _created below AND actually started the
+ * split or merge. In that case, lfn_created already succeeded, and
+ * WRAP_RETRY already cleaned it up and we are actually done. In a real
+ * failure, the filestore itself would have ended up calling this with
+ * the new path, not the old one, so we'd find it.
+ */
+ r = 0;
+ }
+ goto out;
+ }
+ r = _created(path_comp, oid, short_name);
+ if (r < 0)
+ goto out;
+ );
+}
+
+// Removes oid from the index: resolves its directory path and short name
+// with _lookup(), then hands both to _remove().
+// The body is an argument to WRAP_RETRY (defined elsewhere); `r` and the
+// `out` label are presumably supplied by that macro.
+int LFNIndex::unlink(const ghobject_t &oid)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, NULL);
+ if (r < 0) {
+ goto out;
+ }
+ r = _remove(path, oid, short_name);
+ if (r < 0) {
+ goto out;
+ }
+ );
+}
+
+// Resolves oid through _lookup() and stores a new shared Path, built from
+// the full on-disk path and this index, in *out_path. `hardlink` is passed
+// straight through to _lookup().
+// The body is an argument to WRAP_RETRY (defined elsewhere); `r` and the
+// `out` label are presumably supplied by that macro.
+int LFNIndex::lookup(const ghobject_t &oid,
+ IndexedPath *out_path,
+ int *hardlink)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, hardlink);
+ if (r < 0)
+ goto out;
+ string full_path = get_full_path(path, short_name);
+ *out_path = std::make_shared<Path>(full_path, this);
+ r = 0;
+ );
+}
+
+// Public entry point; forwards both arguments to _pre_hash_collection() and
+// returns its result.
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ return _pre_hash_collection(pg_num, expected_num_objs);
+}
+
+
+// Public entry point; forwards all arguments unchanged to
+// _collection_list_partial() and returns its result.
+int LFNIndex::collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
+{
+ return _collection_list_partial(start, end, max_count, ls, next);
+}
+
+/* Derived class utility methods */
+
+// Opens the directory named by `path` read-only and fsyncs it.
+// Returns 0 on success, or -errno when the directory cannot be opened.
+// A failing fsync is logged and then aborts the process.
+int LFNIndex::fsync_dir(const vector<string> &path)
+{
+  maybe_inject_failure();
+  const string dir_path = get_full_path_subdir(path);
+  const int dir_fd = ::open(dir_path.c_str(), O_RDONLY|O_CLOEXEC);
+  if (dir_fd < 0)
+    return -errno;
+  FDCloser closer(dir_fd);  // closes dir_fd on every exit, including throws
+
+  maybe_inject_failure();
+  const int rc = ::fsync(dir_fd);
+  maybe_inject_failure();
+  if (rc >= 0)
+    return 0;
+
+  derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+  ceph_abort();
+  return 0;
+}
+
+// Hard-links the file `from_short_name` in directory `from` to the path
+// that lfn_get_name() yields for oid in directory `to`.
+// Returns 0 on success, the lfn_get_name() error, or -errno from link().
+int LFNIndex::link_object(const vector<string> &from,
+                          const vector<string> &to,
+                          const ghobject_t &oid,
+                          const string &from_short_name)
+{
+  const string src = get_full_path(from, from_short_name);
+  string dst;
+
+  maybe_inject_failure();
+  const int lookup_rc = lfn_get_name(to, oid, 0, &dst, 0);
+  if (lookup_rc < 0)
+    return lookup_rc;
+
+  maybe_inject_failure();
+  const int link_rc = ::link(src.c_str(), dst.c_str());
+  maybe_inject_failure();
+  return (link_rc < 0) ? -errno : 0;
+}
+
+// Unlinks the files named in to_remove from directory `dir` and keeps the
+// hashed-name chains of the surviving files (tracked in *remaining) free of
+// gaps. For every hashed chain it gathers the indices being removed
+// ("holes") and the indices still in use, then fills each hole with the
+// highest-indexed survivor via rename(), updating *remaining to match.
+// Returns 0 on success or -errno from the first failing unlink()/rename().
+int LFNIndex::remove_objects(const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
+ // index-0 short names of the chains that have already been processed
+ set<string> clean_chains;
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
+ to_clean != to_remove.end();
+ ++to_clean) {
+ // unhashed names are not part of a chain: just unlink them
+ if (!lfn_is_hashed_filename(to_clean->first)) {
+ maybe_inject_failure();
+ int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
+ continue;
+ set<int> holes;
+ map<int, pair<string, ghobject_t> > chain;
+ // walk chain indices until a short name is in neither map
+ for (int i = 0; ; ++i) {
+ string short_name = lfn_get_short_name(to_clean->second, i);
+ if (remaining->count(short_name)) {
+ chain[i] = *(remaining->find(short_name));
+ } else if (to_remove.count(short_name)) {
+ holes.insert(i);
+ } else {
+ break;
+ }
+ }
+
+ // holes are visited in ascending order, survivors from the highest index
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
+ for (set<int>::iterator i = holes.begin();
+ i != holes.end();
+ ++i) {
+ // no survivor sits above this hole: simply unlink the hole's file
+ if (candidate == chain.rend() || *i > candidate->first) {
+ string remove_path_name =
+ get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
+ maybe_inject_failure();
+ int r = ::unlink(remove_path_name.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ // rename the survivor over the hole's name and re-key it in *remaining
+ string from = get_full_path(dir, candidate->second.first);
+ string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
+ maybe_inject_failure();
+ int r = ::rename(from.c_str(), to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ remaining->erase(candidate->second.first);
+ remaining->insert(pair<string, ghobject_t>(
+ lfn_get_short_name(candidate->second.second, *i),
+ candidate->second.second));
+ ++candidate;
+ }
+ if (!holes.empty())
+ clean_chains.insert(lfn_get_short_name(to_clean->second, 0));
+ }
+ return 0;
+}
+
+// Moves every object listed in directory `from` into directory `to`.
+// Pass 1 links each file under its name in `to` (an existing link is
+// tolerated) and records its long name; `to` is then fsynced. Pass 2 unlinks
+// the originals, after which `from` is fsynced.
+// Returns 0 on success or the first negative error encountered.
+int LFNIndex::move_objects(const vector<string> &from,
+                           const vector<string> &to)
+{
+  map<string, ghobject_t> to_move;
+  int r = list_objects(from, 0, NULL, &to_move);
+  if (r < 0)
+    return r;
+
+  for (const auto &entry : to_move) {
+    const string &short_name = entry.first;
+    const ghobject_t &obj = entry.second;
+
+    string from_path = get_full_path(from, short_name);
+    string to_path, to_name;
+    r = lfn_get_name(to, obj, &to_name, &to_path, 0);
+    if (r < 0)
+      return r;
+
+    maybe_inject_failure();
+    r = ::link(from_path.c_str(), to_path.c_str());
+    if (r < 0 && errno != EEXIST)
+      return -errno;
+
+    maybe_inject_failure();
+    r = lfn_created(to, obj, to_name);
+    maybe_inject_failure();
+    if (r < 0)
+      return r;
+  }
+
+  r = fsync_dir(to);
+  if (r < 0)
+    return r;
+
+  for (const auto &entry : to_move) {
+    maybe_inject_failure();
+    r = ::unlink(get_full_path(from, entry.first).c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+  }
+  return fsync_dir(from);
+}
+
+// Removes oid from directory `from`: looks up its mangled name, returns
+// -ENOENT when the lookup reports that no file exists, and otherwise
+// delegates to lfn_unlink().
+int LFNIndex::remove_object(const vector<string> &from,
+                            const ghobject_t &oid)
+{
+  string mangled;
+  int exist;
+
+  maybe_inject_failure();
+  const int r = get_mangled_name(from, oid, &mangled, &exist);
+  maybe_inject_failure();
+  if (r < 0)
+    return r;
+  if (!exist)
+    return -ENOENT;
+  return lfn_unlink(from, oid, mangled);
+}
+
+// Thin wrapper around lfn_get_name() that requests the mangled name and the
+// hardlink count but no output path (out_path is passed as 0).
+int LFNIndex::get_mangled_name(const vector<string> &from,
+ const ghobject_t &oid,
+ string *mangled_name, int *hardlink)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, hardlink);
+}
+
+// Renames subdirectory `dir` of `path` from index `from` to the same
+// relative location in index `dest`. Returns 0, or -errno from rename().
+int LFNIndex::move_subdir(
+  LFNIndex &from,
+  LFNIndex &dest,
+  const vector<string> &path,
+  string dir
+  )
+{
+  vector<string> sub_path(path);
+  sub_path.push_back(dir);
+
+  const string src = from.get_full_path_subdir(sub_path);
+  const string dst = dest.get_full_path_subdir(sub_path);
+  if (::rename(src.c_str(), dst.c_str()) < 0)
+    return -errno;
+  return 0;
+}
+
+// Moves one object (short name obj.first, id obj.second) that lives in
+// directory `path` of index `from` to the same directory of index `dest`.
+// If dest does not already hold the object, the file is hard-linked there.
+// The long name is then recorded in dest and dest's directory fsynced
+// before the object is removed from `from` and that directory fsynced too.
+//
+// Returns 0 on success or a negative errno value. A failing link() is
+// reported as -errno; the raw -1 from link() would read as -EPERM to
+// callers regardless of the real cause.
+int LFNIndex::move_object(
+  LFNIndex &from,
+  LFNIndex &dest,
+  const vector<string> &path,
+  const pair<string, ghobject_t> &obj
+  )
+{
+  string from_path(from.get_full_path(path, obj.first));
+  string to_path;
+  string to_name;
+  int exists;
+  int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
+  if (r < 0)
+    return r;
+  if (!exists) {
+    r = ::link(from_path.c_str(), to_path.c_str());
+    if (r < 0)
+      return -errno;
+  }
+  r = dest.lfn_created(path, obj.second, to_name);
+  if (r < 0)
+    return r;
+  r = dest.fsync_dir(path);
+  if (r < 0)
+    return r;
+  r = from.remove_object(path, obj.second);
+  if (r < 0)
+    return r;
+  return from.fsync_dir(path);
+}
+
+
+// Reads the "user.ceph._" attribute of dir/file, decodes it as an
+// object_info_t and stores the object id it carries in *o.
+// Returns 0 on success or the negative value from chain_getxattr_buf().
+static int get_hobject_from_oinfo(const char *dir, const char *file,
+                                  ghobject_t *o)
+{
+  char xattr_path[PATH_MAX];
+  snprintf(xattr_path, sizeof(xattr_path), "%s/%s", dir, file);
+
+  // Hack, user.ceph._ is the attribute used to store the object info
+  bufferptr raw;
+  const int r = chain_getxattr_buf(xattr_path, "user.ceph._", &raw);
+  if (r < 0)
+    return r;
+
+  bufferlist encoded;
+  if (r > 0)
+    encoded.push_back(raw);
+  object_info_t oi(encoded);
+  *o = ghobject_t(oi.soid);
+  return 0;
+}
+
+
+// Lists the object files in directory `to_list` into *out, keyed by on-disk
+// short name. When handle is non-null and *handle is non-zero, the scan
+// resumes with seekdir(*handle). When max_objs > 0, the scan stops once that
+// many objects have been listed. Returns 0 on success, -errno when the
+// directory cannot be opened, or the error from lfn_translate().
+int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
+ long *handle, map<string, ghobject_t> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ if (!dir) {
+ return -errno;
+ }
+
+ if (handle && *handle) {
+ seekdir(dir, *handle);
+ }
+
+ struct dirent *de = nullptr;
+ int r = 0;
+ int listed = 0;
+ // NOTE(review): `end` is cleared for every entry readdir() returns and is
+ // never set again, so it stays true only when no entry was read at all --
+ // confirm this is the intended condition for updating *handle below.
+ bool end = true;
+ while ((de = ::readdir(dir))) {
+ end = false;
+ // NOTE(review): this break happens after readdir() has already consumed
+ // an entry; confirm the telldir() position below does not skip it when
+ // the caller resumes.
+ if (max_objs > 0 && listed >= max_objs) {
+ break;
+ }
+ // skip every name that begins with '.'
+ if (de->d_name[0] == '.')
+ continue;
+ string short_name(de->d_name);
+ ghobject_t obj;
+ if (lfn_is_object(short_name)) {
+ r = lfn_translate(to_list, short_name, &obj);
+ // names that do not translate (-EINVAL) are skipped, not reported
+ if (r == -EINVAL) {
+ continue;
+ } else if (r < 0) {
+ goto cleanup;
+ } else {
+ string long_name = lfn_generate_object_name(obj);
+ if (!lfn_must_hash(long_name)) {
+ ceph_assert(long_name == short_name);
+ }
+ // for HASH_INDEX_TAG indexes the object id is re-read from the
+ // file's object-info attribute; the helper's return value is ignored
+ if (index_version == HASH_INDEX_TAG)
+ get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
+
+ out->insert(pair<string, ghobject_t>(short_name, obj));
+ ++listed;
+ }
+ }
+ }
+
+ if (handle && !end) {
+ *handle = telldir(dir);
+ }
+
+ r = 0;
+ cleanup:
+ ::closedir(dir);
+ return r;
+}
+
+// Appends to *out the demangled name of every entry in directory `to_list`
+// that lfn_is_subdir() recognises. Returns 0, or -errno when the directory
+// cannot be opened.
+int LFNIndex::list_subdirs(const vector<string> &to_list,
+                           vector<string> *out)
+{
+  const string dir_path = get_full_path_subdir(to_list);
+  DIR *dir = ::opendir(dir_path.c_str());
+  if (!dir)
+    return -errno;
+
+  for (struct dirent *de = ::readdir(dir); de; de = ::readdir(dir)) {
+    const string entry_name(de->d_name);
+    string demangled_name;
+    if (lfn_is_subdir(entry_name, &demangled_name))
+      out->push_back(demangled_name);
+  }
+
+  ::closedir(dir);
+  return 0;
+}
+
+// Creates the directory named by `to_create` with mode 0777.
+// Returns 0 on success or -errno from mkdir().
+int LFNIndex::create_path(const vector<string> &to_create)
+{
+  maybe_inject_failure();
+  const int rc = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
+  maybe_inject_failure();
+  return (rc < 0) ? -errno : 0;
+}
+
+// Removes the directory named by `to_remove`.
+// Returns 0 on success or -errno from rmdir().
+int LFNIndex::remove_path(const vector<string> &to_remove)
+{
+  maybe_inject_failure();
+  const int rc = ::rmdir(get_full_path_subdir(to_remove).c_str());
+  maybe_inject_failure();
+  return (rc < 0) ? -errno : 0;
+}
+
+// Stats the directory named by `to_check` and stores 1 in *exists when the
+// stat succeeds, or 0 when it fails with ENOENT. Any other stat() failure is
+// returned as -errno and leaves *exists untouched.
+int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
+{
+  const string full_path = get_full_path_subdir(to_check);
+  struct stat st;
+  if (::stat(full_path.c_str(), &st) == 0) {
+    *exists = 1;
+    return 0;
+  }
+
+  const int err = errno;
+  if (err != ENOENT)
+    return -err;
+  *exists = 0;
+  return 0;
+}
+
+// Stores attr_value under the mangled form of attr_name on the directory
+// named by `path`. Returns whatever chain_setxattr() returns.
+int LFNIndex::add_attr_path(const vector<string> &path,
+                            const string &attr_name,
+                            bufferlist &attr_value)
+{
+  const string target = get_full_path_subdir(path);
+  maybe_inject_failure();
+  const string xattr_name = mangle_attr_name(attr_name);
+  void *payload = reinterpret_cast<void *>(attr_value.c_str());
+  const auto payload_len = attr_value.length();
+  return chain_setxattr<false, true>(
+    target.c_str(), xattr_name.c_str(), payload, payload_len);
+}
+
+// Reads the mangled form of attr_name from the directory named by `path`
+// and, when the read returns a positive value, appends the buffer to
+// attr_value. Returns the chain_getxattr_buf() result unchanged.
+int LFNIndex::get_attr_path(const vector<string> &path,
+                            const string &attr_name,
+                            bufferlist &attr_value)
+{
+  const string target = get_full_path_subdir(path);
+  bufferptr value;
+  const int r = chain_getxattr_buf(
+    target.c_str(),
+    mangle_attr_name(attr_name).c_str(),
+    &value);
+  if (r <= 0)
+    return r;
+  attr_value.push_back(value);
+  return r;
+}
+
+// Removes the mangled form of attr_name from the directory named by `path`.
+// Returns whatever chain_removexattr() returns.
+int LFNIndex::remove_attr_path(const vector<string> &path,
+                               const string &attr_name)
+{
+  const string target = get_full_path_subdir(path);
+  const string xattr_name = mangle_attr_name(attr_name);
+  maybe_inject_failure();
+  return chain_removexattr(target.c_str(), xattr_name.c_str());
+}
+
+// Builds the legacy "keyless" file name for oid: the escaped object name
+// followed by "_<snap>" ("head", "snapdir" or the snap id in hex) and
+// "_<hash>" (upper-case hex, zero-padded to twice the hash's byte size).
+//
+// Escaping: a leading "DIR_" becomes "\d", a '.' in first position becomes
+// "\.", every '\' becomes "\\" and every '/' becomes "\s". As before, the
+// name is read only up to its first NUL byte.
+//
+// The name is assembled in a std::string rather than a fixed
+// FILENAME_MAX_LEN stack buffer. The old loop checked only `t < end` before
+// writing two-byte escapes, and the later snprintf calls were handed
+// `end - t`, which becomes negative once t has passed end. A long or
+// heavily escaped name could therefore write past the buffer.
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
+{
+  ceph_assert(oid.generation == ghobject_t::NO_GEN);
+
+  const string &name = oid.hobj.oid.name;
+  string escaped;
+  escaped.reserve(name.size() + 32);
+
+  // Escape subdir prefix
+  size_t pos = 0;
+  if (name.compare(0, 4, "DIR_") == 0) {
+    escaped.append("\\d");
+    pos = 4;
+  }
+  for ( ; pos < name.size() && name[pos] != '\0'; ++pos) {
+    const char c = name[pos];
+    if (c == '\\') {
+      escaped.append("\\\\");
+    } else if (c == '.' && pos == 0) { // only escape leading .
+      escaped.append("\\.");
+    } else if (c == '/') {
+      escaped.append("\\s");
+    } else {
+      escaped.push_back(c);
+    }
+  }
+
+  // each suffix is short and bounded, so a small scratch buffer suffices
+  char suffix[64];
+  if (oid.hobj.snap == CEPH_NOSNAP)
+    snprintf(suffix, sizeof(suffix), "_head");
+  else if (oid.hobj.snap == CEPH_SNAPDIR)
+    snprintf(suffix, sizeof(suffix), "_snapdir");
+  else
+    snprintf(suffix, sizeof(suffix), "_%llx", (long long unsigned)oid.hobj.snap);
+  escaped.append(suffix);
+
+  snprintf(suffix, sizeof(suffix), "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+  escaped.append(suffix);
+
+  return escaped;
+}
+
+// Appends [begin, end) to *out with the reserved characters escaped:
+// '\' -> "\\", '/' -> "\s", '_' -> "\u" and NUL -> "\n". Every other
+// character is copied through unchanged.
+static void append_escaped(string::const_iterator begin,
+                           string::const_iterator end,
+                           string *out)
+{
+  for (string::const_iterator it = begin; it != end; ++it) {
+    switch (*it) {
+    case '\\':
+      out->append("\\\\");
+      break;
+    case '/':
+      out->append("\\s");
+      break;
+    case '_':
+      out->append("\\u");
+      break;
+    case '\0':
+      out->append("\\n");
+      break;
+    default:
+      out->push_back(*it);
+      break;
+    }
+  }
+}
+
+// Builds the current-format long name for oid:
+//   <name>_<key>_<snap>_<hash>_<nspace>_<pool>[_<generation>_<shard>]
+// name, key and nspace go through append_escaped(). A leading "DIR_" or '.'
+// in the name is written as "\d" / "\." first. snap is "head", "snapdir" or
+// hex; pool is "none" for -1, else hex. The generation/shard suffix is
+// added only when either differs from its NO_GEN / NO_SHARD value.
+string LFNIndex::lfn_generate_object_name_current(const ghobject_t &oid)
+{
+  const string &name = oid.hobj.oid.name;
+  const string &key = oid.hobj.get_key();
+  string full_name;
+
+  string::const_iterator rest = name.begin();
+  if (name.compare(0, 4, "DIR_") == 0) {
+    full_name.append("\\d");
+    rest += 4;
+  } else if (name[0] == '.') {
+    full_name.append("\\.");
+    ++rest;
+  }
+  append_escaped(rest, name.end(), &full_name);
+  full_name.append("_");
+  append_escaped(key.begin(), key.end(), &full_name);
+  full_name.append("_");
+
+  // scratch buffer reused for each numeric field
+  char buf[PATH_MAX];
+  const char *end = buf + sizeof(buf);
+  char *t = buf;
+  if (oid.hobj.snap == CEPH_NOSNAP) {
+    t += snprintf(t, end - t, "head");
+  } else if (oid.hobj.snap == CEPH_SNAPDIR) {
+    t += snprintf(t, end - t, "snapdir");
+  } else {
+    t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+  }
+  t += snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+  full_name.append(buf, t);
+  full_name.append("_");
+
+  append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
+  full_name.append("_");
+
+  t = buf;
+  if (oid.hobj.pool == -1) {
+    t += snprintf(t, end - t, "none");
+  } else {
+    t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
+  }
+  full_name.append(buf, t);
+
+  const bool has_gen_or_shard =
+    oid.generation != ghobject_t::NO_GEN ||
+    oid.shard_id != shard_id_t::NO_SHARD;
+  if (has_gen_or_shard) {
+    full_name.append("_");
+    t = buf;
+    t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation);
+    full_name.append(buf, t);
+
+    full_name.append("_");
+    t = buf;
+    t += snprintf(t, end - t, "%x", (int)oid.shard_id);
+    full_name.append(buf, t);
+  }
+
+  return full_name;
+}
+
+// Builds the pool-less long name "<name>_<key>_<snap>_<hash>" for oid.
+// Indexes tagged HASH_INDEX_TAG are routed to the keyless generator
+// instead. name and key are escaped exactly as in the current format.
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
+{
+  if (index_version == HASH_INDEX_TAG)
+    return lfn_generate_object_name_keyless(oid);
+
+  ceph_assert(oid.generation == ghobject_t::NO_GEN);
+
+  const string &name = oid.hobj.oid.name;
+  const string &key = oid.hobj.get_key();
+  string full_name;
+
+  string::const_iterator rest = name.begin();
+  if (name.compare(0, 4, "DIR_") == 0) {
+    full_name.append("\\d");
+    rest += 4;
+  } else if (name[0] == '.') {
+    full_name.append("\\.");
+    ++rest;
+  }
+  append_escaped(rest, name.end(), &full_name);
+  full_name.append("_");
+  append_escaped(key.begin(), key.end(), &full_name);
+  full_name.append("_");
+
+  // "<snap>_<hash>" is formatted into one NUL-terminated scratch buffer
+  char snap_with_hash[PATH_MAX];
+  char *end = snap_with_hash + sizeof(snap_with_hash);
+  char *t = snap_with_hash;
+  if (oid.hobj.snap == CEPH_NOSNAP) {
+    t += snprintf(t, end - t, "head");
+  } else if (oid.hobj.snap == CEPH_SNAPDIR) {
+    t += snprintf(t, end - t, "snapdir");
+  } else {
+    t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+  }
+  snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+  full_name.append(snap_with_hash);
+  return full_name;
+}
+
+// Resolves oid to its on-disk file name inside directory `path`.
+// Each of mangled_name, out_path and hardlink may be null. When given they
+// receive the short (on-disk) name, the full path, and the file's link
+// count (0 when the file does not exist).
+// Returns 0 on success or a negative error value.
+int LFNIndex::lfn_get_name(const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name, string *out_path,
+ int *hardlink)
+{
+ string full_name = lfn_generate_object_name(oid);
+ int r;
+
+ // Names that need no hashing are used verbatim as the file name.
+ if (!lfn_must_hash(full_name)) {
+ if (mangled_name)
+ *mangled_name = full_name;
+ if (out_path)
+ *out_path = get_full_path(path, full_name);
+ if (hardlink) {
+ struct stat buf;
+ string full_path = get_full_path(path, full_name);
+ maybe_inject_failure();
+ r = ::stat(full_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = buf.st_nlink;
+ }
+ }
+ return 0;
+ }
+
+ // Hashed names: probe the candidates lfn_get_short_name(oid, 0), (oid, 1),
+ // ... until one carries full_name in an attribute or is free to use.
+ int i = 0;
+ string candidate;
+ string candidate_path;
+ for ( ; ; ++i) {
+ candidate = lfn_get_short_name(oid, i);
+ candidate_path = get_full_path(path, candidate);
+ bufferptr bp;
+ r = chain_getxattr_buf(
+ candidate_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r < 0) {
+ // NOTE(review): the failure is classified through errno rather than r;
+ // this assumes errno still holds the xattr error after
+ // chain_getxattr_buf() returns -- confirm.
+ if (errno != ENODATA && errno != ENOENT)
+ return -errno;
+ if (errno == ENODATA) {
+ // Left over from incomplete transaction, it'll be replayed
+ maybe_inject_failure();
+ r = ::unlink(candidate_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ // this candidate slot is unused: report it with a link count of 0
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = 0;
+ return 0;
+ }
+ ceph_assert(r > 0);
+ string lfn(bp.c_str(), bp.length());
+ // the main attribute names this object: candidate found
+ if (lfn == full_name) {
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink) {
+ struct stat st;
+ r = ::stat(candidate_path.c_str(), &st);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = st.st_nlink;
+ }
+ }
+ return 0;
+ }
+ // otherwise consult the alternate long-name attribute, if any
+ bp = bufferptr();
+ r = chain_getxattr_buf(
+ candidate_path.c_str(),
+ get_alt_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ // only consider alt name if nlink > 1
+ struct stat st;
+ int rc = ::stat(candidate_path.c_str(), &st);
+ if (rc < 0)
+ return -errno;
+ if (st.st_nlink <= 1) {
+ // left over from incomplete unlink, remove
+ maybe_inject_failure();
+ dout(20) << __func__ << " found extra alt attr for " << candidate_path
+ << ", long name " << string(bp.c_str(), bp.length()) << dendl;
+ rc = chain_removexattr(candidate_path.c_str(),
+ get_alt_lfn_attr().c_str());
+ maybe_inject_failure();
+ if (rc < 0)
+ return rc;
+ // the loop then moves on to the next candidate index
+ continue;
+ }
+ string lfn(bp.c_str(), bp.length());
+ if (lfn == full_name) {
+ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = st.st_nlink;
+ return 0;
+ }
+ }
+ }
+ ceph_abort(); // Unreachable
+ return 0;
+}
+
+// Records oid's long name on the file `mangled_name` in directory `path`.
+// Does nothing (returns 0) for names that are not hashed. If the main
+// long-name attribute already holds a different name, that old value is
+// first copied to the alternate attribute. Returns the result of the final
+// chain_setxattr(), or the error from copying the old value.
+int LFNIndex::lfn_created(const vector<string> &path,
+                          const ghobject_t &oid,
+                          const string &mangled_name)
+{
+  if (!lfn_is_hashed_filename(mangled_name))
+    return 0;
+
+  const string full_path = get_full_path(path, mangled_name);
+  const string full_name = lfn_generate_object_name(oid);
+  maybe_inject_failure();
+
+  // if the main attr exists and is different, move it to the alt attr.
+  bufferptr current;
+  int r = chain_getxattr_buf(
+    full_path.c_str(),
+    get_lfn_attr().c_str(),
+    &current);
+  if (r > 0) {
+    const string old_name(current.c_str(), current.length());
+    if (old_name != full_name) {
+      dout(20) << __func__ << " " << mangled_name
+               << " moving old name to alt attr "
+               << old_name
+               << ", new name is " << full_name << dendl;
+      r = chain_setxattr<false, true>(
+        full_path.c_str(), get_alt_lfn_attr().c_str(),
+        current.c_str(), current.length());
+      if (r < 0)
+        return r;
+    }
+  }
+
+  return chain_setxattr<false, true>(
+    full_path.c_str(), get_lfn_attr().c_str(),
+    full_name.c_str(), full_name.size());
+}
+
+// Removes the file `mangled_name` that stores oid in directory `path`.
+//
+// Unhashed names are simply unlinked. For hashed names the collision chain
+// lfn_get_short_name(oid, 0..n) has to stay gap-free: if the removed entry
+// is the last one present it is unlinked, otherwise the last entry of the
+// chain is renamed over it. If the removed inode still has links
+// afterwards, the directory is fsynced and the alternate long-name
+// attribute is dropped from it; both of those steps are best effort and
+// their results are not checked.
+//
+// Returns 0 on success or -errno. A failing fstat() is reported as -errno;
+// the raw -1 would read as -EPERM to callers.
+int LFNIndex::lfn_unlink(const vector<string> &path,
+                         const ghobject_t &oid,
+                         const string &mangled_name)
+{
+  if (!lfn_is_hashed_filename(mangled_name)) {
+    string full_path = get_full_path(path, mangled_name);
+    maybe_inject_failure();
+    int r = ::unlink(full_path.c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+    return 0;
+  }
+
+  // find the chain index whose short name is mangled_name
+  int i = 0;
+  for ( ; ; ++i) {
+    string candidate = lfn_get_short_name(oid, i);
+    if (candidate == mangled_name)
+      break;
+  }
+  int removed_index = i;
+  ++i;
+  // advance i to the first chain index that has no file on disk
+  for ( ; ; ++i) {
+    struct stat buf;
+    string to_check = lfn_get_short_name(oid, i);
+    string to_check_path = get_full_path(path, to_check);
+    int r = ::stat(to_check_path.c_str(), &buf);
+    if (r < 0) {
+      if (errno == ENOENT) {
+        break;
+      } else {
+        return -errno;
+      }
+    }
+  }
+  string full_path = get_full_path(path, mangled_name);
+  // keep the inode open so its link count can be checked after removal
+  int fd = ::open(full_path.c_str(), O_RDONLY|O_CLOEXEC);
+  if (fd < 0)
+    return -errno;
+  FDCloser f(fd);
+  if (i == removed_index + 1) {
+    // removed entry is the tail of the chain
+    maybe_inject_failure();
+    int r = ::unlink(full_path.c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+  } else {
+    // move the chain's tail into the slot being vacated
+    string& rename_to = full_path;
+    string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+    maybe_inject_failure();
+    int r = ::rename(rename_from.c_str(), rename_to.c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+  }
+  struct stat st;
+  if (::fstat(fd, &st) < 0)
+    return -errno;
+  if (st.st_nlink > 0) {
+    // remove alt attr
+    dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+    fsync_dir(path);
+    chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+  }
+  return 0;
+}
+
+// Converts the on-disk name `short_name` in directory `path` back into an
+// object id. Unhashed names are parsed directly. For hashed names the long
+// name is read from the file's attributes: the alternate attribute is used
+// when it is present and short_name_matches() accepts it, otherwise the
+// main attribute. Returns the parse result, the attribute read error, or
+// -EINVAL when the main attribute is empty.
+int LFNIndex::lfn_translate(const vector<string> &path,
+                            const string &short_name,
+                            ghobject_t *out)
+{
+  if (!lfn_is_hashed_filename(short_name))
+    return lfn_parse_object_name(short_name, out);
+
+  const string full_path = get_full_path(path, short_name);
+
+  // First, check alt attr
+  bufferptr alt_bp;
+  int r = chain_getxattr_buf(
+    full_path.c_str(),
+    get_alt_lfn_attr().c_str(),
+    &alt_bp);
+  if (r > 0) {
+    // There is an alt attr, does it match?
+    const string alt_name(alt_bp.c_str(), alt_bp.length());
+    if (short_name_matches(short_name.c_str(), alt_name.c_str()))
+      return lfn_parse_object_name(alt_name, out);
+  }
+
+  // Get lfn_attr
+  bufferptr main_bp;
+  r = chain_getxattr_buf(
+    full_path.c_str(),
+    get_lfn_attr().c_str(),
+    &main_bp);
+  if (r < 0)
+    return r;
+  if (r == 0)
+    return -EINVAL;
+
+  const string long_name(main_bp.c_str(), main_bp.length());
+  return lfn_parse_object_name(long_name, out);
+}
+
+// A directory entry counts as an object when its name is a hashed filename
+// or when it is not recognised as a subdirectory name.
+bool LFNIndex::lfn_is_object(const string &short_name)
+{
+  if (lfn_is_hashed_filename(short_name))
+    return true;
+  return !lfn_is_subdir(short_name, 0);
+}
+
+// Returns true when `name` starts with SUBDIR_PREFIX. On a match, and when
+// `demangled` is non-null, it receives demangle_path_component(name).
+bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
+{
+  if (name.compare(0, SUBDIR_PREFIX.size(), SUBDIR_PREFIX) != 0)
+    return false;
+  if (demangled)
+    *demangled = demangle_path_component(name);
+  return true;
+}
+
+// Parses a legacy "keyless" file name of the form
+//   <escaped name>_<snap>_<hash>
+// into o (object name, snap and hash). Returns 1 on success and 0 when s
+// does not have that shape. An unknown escape sequence inside the name
+// still aborts, as before.
+//
+// Changes from the previous version:
+//  - the unescaped name is built in a std::string. The old variable-length
+//    array was sized to the escaped input, yet "\d" expands two input bytes
+//    into the four bytes "DIR_", so a name starting with "DIR_" overflowed
+//    it;
+//  - empty input and input whose only '_' (if any) is the first character
+//    are rejected up front instead of reading before the start of s;
+//  - a hash field that is not valid hex is rejected instead of leaving the
+//    hash uninitialised.
+static int parse_object(const char *s, ghobject_t& o)
+{
+  const size_t len = strlen(s);
+  if (len == 0)
+    return 0;
+
+  // the last '_' introduces the hash
+  const char *hash = s + len - 1;
+  while (*hash != '_' &&
+         hash > s)
+    hash--;
+  if (hash == s)
+    return 0;
+
+  // the '_' before it introduces the snap field
+  const char *bar = hash - 1;
+  while (*bar != '_' &&
+         bar > s)
+    bar--;
+  if (*bar != '_')
+    return 0;
+
+  uint32_t hobject_hash_input = 0;
+  if (sscanf(hash, "_%X", &hobject_hash_input) != 1)
+    return 0;
+
+  string name;
+  name.reserve(bar - s);
+  for (const char *i = s; i < bar; ++i) {
+    if (*i != '\\') {
+      name.push_back(*i);
+      continue;
+    }
+    ++i;
+    switch (*i) {
+    case '\\': name.push_back('\\'); break;
+    case '.': name.push_back('.'); break;
+    case 's': name.push_back('/'); break;
+    case 'd': name.append("DIR_"); break;
+    default: ceph_abort();
+    }
+  }
+  o.hobj.oid.name = name;
+
+  if (strncmp(bar+1, "head", 4) == 0)
+    o.hobj.snap = CEPH_NOSNAP;
+  else if (strncmp(bar+1, "snapdir", 7) == 0)
+    o.hobj.snap = CEPH_SNAPDIR;
+  else
+    o.hobj.snap = strtoull(bar+1, NULL, 16);
+
+  o.hobj.set_hash(hobject_hash_input);
+
+  return 1;
+}
+
+// Parses a legacy keyless long name into *out via parse_object() and fills
+// in the pool: the pool of this collection's pg when coll() has a pg
+// prefix, otherwise -1. The pool is written even when parsing fails.
+// Returns 0 on success and -EINVAL when parse_object() rejects the name.
+int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
+{
+  const int parsed = parse_object(long_name.c_str(), *out);
+
+  spg_t pg;
+  const int64_t pool =
+    coll().is_pg_prefix(&pg) ? (int64_t)pg.pgid.pool() : -1;
+  out->hobj.pool = pool;
+
+  if (parsed == 0)
+    return -EINVAL;
+
+  // NOTE(review): the regenerated name is never used; the call is kept
+  // because it is present in the original -- confirm whether it is needed.
+  const string regenerated = lfn_generate_object_name(*out);
+  (void)regenerated;
+  return 0;
+}
+
+// Appends [begin, end) to *out while undoing append_escaped():
+// "\\" -> '\', "\s" -> '/', "\n" -> NUL and "\u" -> '_'.
+// Returns false on an unknown escape sequence or on a backslash that is the
+// last character of the range. The trailing-backslash case used to
+// dereference `end`. *out may hold a partial result when false is returned.
+static bool append_unescaped(string::const_iterator begin,
+                             string::const_iterator end,
+                             string *out)
+{
+  for (string::const_iterator i = begin; i != end; ++i) {
+    if (*i != '\\') {
+      out->push_back(*i);
+      continue;
+    }
+    ++i;
+    if (i == end)
+      return false;  // dangling backslash, nothing left to decode
+    switch (*i) {
+    case '\\':
+      out->push_back('\\');
+      break;
+    case 's':
+      out->push_back('/');
+      break;
+    case 'n':
+      out->push_back('\0');
+      break;
+    case 'u':
+      out->push_back('_');
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parses a pool-less long name "<name>_<key>_<snap>_<hash>" into *out.
+// name and key are unescaped with append_unescaped(). A leading "\d" or
+// "\." in the name is turned back into "DIR_" or ".". snap is "head",
+// "snapdir" or a hex snap id. The pool is this collection's pg pool when
+// coll() has a pg prefix, otherwise -1; the namespace is left empty.
+//
+// Returns 0 on success and -EINVAL when the name does not have exactly
+// four '_'-separated fields, contains a bad escape, or -- new in this
+// version -- has a hash field that is not valid hex (previously the hash
+// was used uninitialised). An empty long_name is rejected before any
+// character is examined.
+int LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
+                                             ghobject_t *out)
+{
+  string name;
+  string key;
+  uint32_t hash = 0;
+  snapid_t snap;
+
+  if (long_name.empty())
+    return -EINVAL;
+
+  string::const_iterator current = long_name.begin();
+  if (*current == '\\') {
+    ++current;
+    if (current == long_name.end()) {
+      return -EINVAL;
+    } else if (*current == 'd') {
+      name.append("DIR_");
+      ++current;
+    } else if (*current == '.') {
+      name.append(".");
+      ++current;
+    } else {
+      // an ordinary escape: back up and let append_unescaped() handle it
+      --current;
+    }
+  }
+
+  // field 1: object name
+  string::const_iterator end = current;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &name))
+    return -EINVAL;
+
+  // field 2: key
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &key))
+    return -EINVAL;
+
+  // field 3: snap
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  string snap_str(current, end);
+
+  // field 4: hash, which must run to the end of the name
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end != long_name.end())
+    return -EINVAL;
+  string hash_str(current, end);
+
+  if (snap_str == "head")
+    snap = CEPH_NOSNAP;
+  else if (snap_str == "snapdir")
+    snap = CEPH_SNAPDIR;
+  else
+    snap = strtoull(snap_str.c_str(), NULL, 16);
+  if (sscanf(hash_str.c_str(), "%X", &hash) != 1)
+    return -EINVAL;
+
+  int64_t pool = -1;
+  spg_t pg;
+  if (coll().is_pg_prefix(&pg))
+    pool = (int64_t)pg.pgid.pool();
+  (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
+  return 0;
+}
+
+
+/**
+ * Parse a long object name into *out.
+ *
+ * HASH_INDEX_TAG and HASH_INDEX_TAG_2 indexes are delegated to the keyless
+ * and poolless parsers.  Otherwise the expected form is
+ *   "<name>_<key>_<snap>_<hash>_<ns>_<pool>[_<generation>_<shard>]"
+ * where name, key and ns are unescaped with append_unescaped(), snap is
+ * "head", "snapdir" or a hex snap id, hash/generation/shard are hex and
+ * pool is "none" or hex.
+ *
+ * @return 0 on success, -EINVAL if long_name does not have this form.
+ */
+int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
+{
+  string name;
+  string key;
+  string ns;
+  snapid_t snap;
+  uint64_t pool;
+  gen_t generation = ghobject_t::NO_GEN;
+  shard_id_t shard_id = shard_id_t::NO_SHARD;
+
+  if (index_version == HASH_INDEX_TAG)
+    return lfn_parse_object_name_keyless(long_name, out);
+  if (index_version == HASH_INDEX_TAG_2)
+    return lfn_parse_object_name_poolless(long_name, out);
+
+  // An empty name has no fields; bail out before dereferencing begin().
+  if (long_name.empty())
+    return -EINVAL;
+
+  string::const_iterator current = long_name.begin();
+  if (*current == '\\') {
+    ++current;
+    if (current == long_name.end()) {
+      return -EINVAL;
+    } else if (*current == 'd') {
+      // "\d" is the escaped form of a leading "DIR_"
+      name.append("DIR_");
+      ++current;
+    } else if (*current == '.') {
+      // "\." is the escaped form of a leading "."
+      name.append(".");
+      ++current;
+    } else {
+      // ordinary escape sequence: leave it for append_unescaped()
+      --current;
+    }
+  }
+
+  // <name>
+  string::const_iterator end = current;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &name))
+    return -EINVAL;
+
+  // <key>
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &key))
+    return -EINVAL;
+
+  // <snap>
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  string snap_str(current, end);
+
+  // <hash>
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  string hash_str(current, end);
+
+  // <ns>
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &ns))
+    return -EINVAL;
+
+  // <pool> may be the last field
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  string pstring(current, end);
+
+  // Optional generation/shard_id
+  string genstring, shardstring;
+  if (end != long_name.end()) {
+    current = ++end;
+    for ( ; end != long_name.end() && *end != '_'; ++end) ;
+    if (end == long_name.end())
+      return -EINVAL;
+    genstring = string(current, end);
+
+    generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+    // <shard> must be the final field
+    current = ++end;
+    for ( ; end != long_name.end() && *end != '_'; ++end) ;
+    if (end != long_name.end())
+      return -EINVAL;
+    shardstring = string(current, end);
+
+    shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
+  }
+
+  if (snap_str == "head")
+    snap = CEPH_NOSNAP;
+  else if (snap_str == "snapdir")
+    snap = CEPH_SNAPDIR;
+  else
+    snap = strtoull(snap_str.c_str(), NULL, 16);
+
+  // %X stores through an unsigned int*.  Reject a hash field that does not
+  // parse instead of building an object from an uninitialized value.
+  unsigned int parsed_hash = 0;
+  if (sscanf(hash_str.c_str(), "%X", &parsed_hash) != 1)
+    return -EINVAL;
+  uint32_t hash = parsed_hash;
+
+  if (pstring == "none")
+    pool = (uint64_t)-1;
+  else
+    pool = strtoull(pstring.c_str(), NULL, 16);
+
+  (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
+  return 0;
+}
+
+// A filename is treated as hashed when it is at least FILENAME_SHORT_LEN
+// characters long and ends with FILENAME_COOKIE.
+bool LFNIndex::lfn_is_hashed_filename(const string &name)
+{
+  if (name.size() < (unsigned)FILENAME_SHORT_LEN)
+    return false;
+
+  const size_t cookie_len = FILENAME_COOKIE.size();
+  const size_t cookie_pos = name.size() - cookie_len;
+  return name.compare(cookie_pos, cookie_len, FILENAME_COOKIE) == 0;
+}
+
+// A long name must be hashed once it reaches FILENAME_SHORT_LEN characters.
+bool LFNIndex::lfn_must_hash(const string &long_name)
+{
+  const int name_len = (int)long_name.size();
+  return name_len >= FILENAME_SHORT_LEN;
+}
+
+// Write the first len bytes of buf into str as lowercase hex, two characters
+// per byte, followed by a terminating NUL.  str must have room for
+// 2 * len + 1 characters.
+static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+  static const char hex_digits[] = "0123456789abcdef";
+  char *cursor = str;
+  for (int n = 0; n < len; ++n) {
+    *cursor++ = hex_digits[buf[n] >> 4];
+    *cursor++ = hex_digits[buf[n] & 0x0f];
+  }
+  *cursor = '\0';
+}
+
+// Store the first FILENAME_HASH_LEN hex characters of the SHA1 digest of
+// filename in hash, NUL-terminated.
+//
+// Returns 0 on success, -EINVAL if buf_len cannot hold FILENAME_HASH_LEN
+// characters plus the terminator.
+int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len)
+{
+  if (buf_len <= FILENAME_HASH_LEN)
+    return -EINVAL;
+
+  unsigned char digest[FILENAME_LFN_DIGEST_SIZE];
+  char digest_hex[FILENAME_LFN_DIGEST_SIZE * 2];
+
+  SHA1 sha;
+  sha.Update((const unsigned char *)filename, strlen(filename));
+  sha.Final(digest);
+
+  // only as many digest bytes as are needed for FILENAME_HASH_LEN hex chars
+  const int bytes_needed = (FILENAME_HASH_LEN + 1) / 2;
+  buf_to_hex(digest, bytes_needed, digest_hex);
+
+  strncpy(hash, digest_hex, FILENAME_HASH_LEN);
+  hash[FILENAME_HASH_LEN] = '\0';
+  return 0;
+}
+
+/**
+ * Build the short (hashed) filename for old_filename into filename.
+ *
+ * If old_filename is at most FILENAME_PREFIX_LEN characters long it is
+ * copied unchanged.  Otherwise the result is a prefix of old_filename
+ * followed by "_<hash>_<i>_<cookie>", with the prefix shortened as needed
+ * so that the whole name is at most FILENAME_SHORT_LEN characters.
+ *
+ * @param old_filename  name to convert
+ * @param i             index of the hashed name
+ * @param filename      output buffer
+ * @param len           size of filename; must be >= FILENAME_SHORT_LEN + 4
+ */
+void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
+{
+  char hash[FILENAME_HASH_LEN + 1];
+
+  ceph_assert(len >= FILENAME_SHORT_LEN + 4);
+
+  strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
+  filename[FILENAME_PREFIX_LEN] = '\0';
+  if ((int)strlen(filename) < FILENAME_PREFIX_LEN)
+    return;
+  if (old_filename[FILENAME_PREFIX_LEN] == '\0')
+    return;
+
+  int r = hash_filename(old_filename, hash, sizeof(hash));
+  ceph_assert(r == 0);
+
+  // Format the suffix into a bounded scratch buffer first.  Formatting it
+  // straight into filename at the untrimmed offset would write before it is
+  // known whether prefix + suffix fits in the caller's buffer.
+  char suffix[FILENAME_SHORT_LEN + 4];
+  int suffix_len = snprintf(suffix, sizeof(suffix), "_%s_%d_%s",
+                            hash, i, FILENAME_COOKIE.c_str());
+  ceph_assert(suffix_len > 0 && suffix_len < (int)sizeof(suffix));
+
+  // Keep as much of the prefix as still leaves room for the suffix.
+  int ofs = FILENAME_PREFIX_LEN;
+  if (ofs + suffix_len > FILENAME_SHORT_LEN)
+    ofs = FILENAME_SHORT_LEN - suffix_len;
+  if (ofs < 0)
+    ofs = 0;
+
+  ceph_assert(ofs + suffix_len < len);
+  memcpy(filename + ofs, suffix, suffix_len + 1); // includes the NUL
+}
+
+// Check whether short_name is the hashed filename that build_filename()
+// produces for cand_long_name.
+//
+// The trailing "_<index>_<cookie>" part of short_name is located by walking
+// back from the end over the last two '_' characters.  The index parsed from
+// it is then used to regenerate the hashed name from cand_long_name, and the
+// two names are compared.
+//
+// Returns false if the suffix cannot be parsed or the cookie does not match.
+bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name)
+{
+ const char *end = short_name;
+ while (*end) ++end;
+ const char *suffix = end;
+ if (suffix > short_name) --suffix; // last char
+ while (suffix > short_name && *suffix != '_') --suffix; // back to first _
+ if (suffix > short_name) --suffix; // one behind that
+ while (suffix > short_name && *suffix != '_') --suffix; // back to second _
+
+ int index = -1;
+ char buf[FILENAME_SHORT_LEN + 4];
+ // the unbounded %s below is only safe because the suffix fits in buf
+ ceph_assert((end - suffix) < (int)sizeof(buf));
+ int r = sscanf(suffix, "_%d_%s", &index, buf);
+ if (r < 2)
+ return false;
+ if (strcmp(buf, FILENAME_COOKIE.c_str()) != 0)
+ return false;
+ // buf is reused to hold the regenerated short name
+ build_filename(cand_long_name, index, buf, sizeof(buf));
+ return strcmp(short_name, buf) == 0;
+}
+
+// Return the i-th hashed filename for oid.  The generated long name of oid
+// must be one that requires hashing.
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
+{
+  const string full_name = lfn_generate_object_name(oid);
+  ceph_assert(lfn_must_hash(full_name));
+
+  char short_buf[FILENAME_SHORT_LEN + 4];
+  build_filename(full_name.c_str(), i, short_buf, sizeof(short_buf));
+  return string(short_buf);
+}
+
+// Return the index base path given to the constructor.
+const string &LFNIndex::get_base_path()
+{
+ return base_path;
+}
+
+// Return the full path of the subdirectory rel: the base path followed by
+// "/<mangled component>" for each component of rel.
+string LFNIndex::get_full_path_subdir(const vector<string> &rel)
+{
+  string full_path = get_base_path();
+  for (const string &component : rel) {
+    full_path.append("/");
+    full_path.append(mangle_path_component(component));
+  }
+  return full_path;
+}
+
+// Return the full path of the file called name inside subdirectory rel.
+string LFNIndex::get_full_path(const vector<string> &rel, const string &name)
+{
+  string full_path = get_full_path_subdir(rel);
+  full_path += "/";
+  full_path += name;
+  return full_path;
+}
+
+// Return the on-disk directory name for component: SUBDIR_PREFIX + component.
+string LFNIndex::mangle_path_component(const string &component)
+{
+  string mangled(SUBDIR_PREFIX);
+  mangled += component;
+  return mangled;
+}
+
+// Inverse of mangle_path_component(): drop the first SUBDIR_PREFIX.size()
+// characters of component.  The prefix itself is not checked.
+string LFNIndex::demangle_path_component(const string &component)
+{
+  const size_t prefix_len = SUBDIR_PREFIX.size();
+  return component.substr(prefix_len);
+}
+
+/**
+ * Split a full path into its subdirectory components and filename.
+ *
+ * The first get_base_path().size() characters of in are skipped without
+ * being compared against the base path.  Every '/'-separated component
+ * except the last is demangled and appended to *out; the last component is
+ * stored in *shortname.  If oid is non-NULL the filename is translated into
+ * *oid with lfn_translate().
+ *
+ * @return 0 on success, -EINVAL if in has no filename component after the
+ *         base path, or the negative error code from lfn_translate().
+ */
+int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
+                                  ghobject_t *oid, string *shortname)
+{
+  const char *beginning = in + get_base_path().size();
+  const char *end = beginning;
+  while (1) {
+    // Each round steps over one separator and then the first character of
+    // the component.  Check for the terminator before each step so that a
+    // path equal to the base path, or one ending in '/', cannot walk the
+    // scan past the end of the string.
+    if (*end == '\0')
+      return -EINVAL;
+    end++;
+    beginning = end;
+    if (*end == '\0')
+      return -EINVAL;
+    end++;
+    for ( ; *end != '\0' && *end != '/'; ++end) ;
+    if (*end != '\0') {
+      out->push_back(demangle_path_component(string(beginning, end - beginning)));
+      continue;
+    } else {
+      break;
+    }
+  }
+  *shortname = string(beginning, end - beginning);
+  if (oid) {
+    int r = lfn_translate(*out, *shortname, oid);
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+// Return the stored attribute name for attr: PHASH_ATTR_PREFIX + attr.
+string LFNIndex::mangle_attr_name(const string &attr)
+{
+  string mangled(PHASH_ATTR_PREFIX);
+  mangled += attr;
+  return mangled;
+}
diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h
new file mode 100644
index 00000000..149ed10f
--- /dev/null
+++ b/src/os/filestore/LFNIndex.h
@@ -0,0 +1,614 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef OS_LFNINDEX_H
+#define OS_LFNINDEX_H
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <exception>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/ceph_crypto.h"
+
+#include "CollectionIndex.h"
+
+/**
+ * LFNIndex also encapsulates logic for manipulating
+ * subdirectories of a collection as well as the long filename
+ * logic.
+ *
+ * The protected methods provide machinery for derived classes to
+ * manipulate subdirectories and objects.
+ *
+ * The virtual methods are to be overridden to provide the actual
+ * hashed layout.
+ *
+ * User must call created when an object is created.
+ *
+ * Synchronization: Calling code must ensure that there are no object
+ * creations or deletions during the lifetime of a Path object (except
+ * of an object at that path).
+ *
+ * Unless otherwise noted, methods which return an int return 0 on success
+ * and a negative error code on failure.
+ */
+/*
+ * WRAP_RETRY(x): body of an int-returning LFNIndex method that retries x
+ * when it throws RetryException.
+ *
+ * The expansion declares `int r`, calls init_inject_failure(), and runs x in
+ * a loop.  x is expected to set r and `goto out` (see split() and merge());
+ * at `out` the macro calls complete_inject_failure() and returns r.  If x
+ * throws RetryException, cleanup() is called (and asserted to return 0)
+ * before x is run again.  Any other exception aborts.
+ *
+ * Comments cannot be placed inside the macro body: a `//` comment on a
+ * backslash-continued line would swallow the following lines.
+ */
+#define WRAP_RETRY(x) { \
+ bool failed = false; \
+ int r = 0; \
+ init_inject_failure(); \
+ while (1) { \
+ try { \
+ if (failed) { \
+ r = cleanup(); \
+ ceph_assert(r == 0); \
+ } \
+ { x } \
+ out: \
+ complete_inject_failure(); \
+ return r; \
+ } catch (RetryException&) { \
+ failed = true; \
+ } catch (...) { \
+ ceph_abort(); \
+ } \
+ } \
+ return -1; \
+ } \
+
+
+
+class LFNIndex : public CollectionIndex {
+ /// Hash digest output size.
+ static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
+ /// Length of filename hash.
+ static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE;
+ /// Max filename size.
+ static const int FILENAME_MAX_LEN = 4096;
+ /// Length of hashed filename.
+ static const int FILENAME_SHORT_LEN = 255;
+ /// Length of hashed filename prefix.
+ static const int FILENAME_PREFIX_LEN;
+ /// Length of hashed filename cookie.
+ static const int FILENAME_EXTRA = 4;
+ /// Lfn cookie value.
+ static const string FILENAME_COOKIE;
+ /// Name of LFN attribute for storing full name.
+ static const string LFN_ATTR;
+ /// Prefix for subdir index attributes.
+ static const string PHASH_ATTR_PREFIX;
+ /// Prefix for index subdirectories.
+ static const string SUBDIR_PREFIX;
+
+ /// Path to Index base.
+ const string base_path;
+
+protected:
+ const uint32_t index_version;
+
+ /// true if retry injection is enabled
+ struct RetryException : public exception {};
+ bool error_injection_enabled;
+ bool error_injection_on;
+ double error_injection_probability;
+ uint64_t last_failure;
+ uint64_t current_failure;
+ /// Arm failure injection for one wrapped operation: when injection is
+ /// configured on, enable it and reset both failure counters.
+ void init_inject_failure() {
+ if (error_injection_on) {
+ error_injection_enabled = true;
+ last_failure = current_failure = 0;
+ }
+ }
+ void maybe_inject_failure();
+ /// Disarm failure injection once the wrapped operation has finished.
+ void complete_inject_failure() {
+ error_injection_enabled = false;
+ }
+
+private:
+ string lfn_attribute, lfn_alt_attribute;
+ coll_t collection;
+
+public:
+ /// Constructor
+ LFNIndex(
+   CephContext* cct,
+   coll_t collection,
+   const char *base_path, ///< [in] path to Index root
+   uint32_t index_version,
+   double _error_injection_probability=0)
+   : CollectionIndex(cct, collection),
+     base_path(base_path),
+     index_version(index_version),
+     error_injection_enabled(false),
+     error_injection_on(_error_injection_probability != 0),
+     error_injection_probability(_error_injection_probability),
+     last_failure(0), current_failure(0),
+     collection(collection) {
+   // HASH_INDEX_TAG indexes use the bare attribute name and no alternate
+   // attribute; every other version appends its number to the name.
+   if (index_version == HASH_INDEX_TAG) {
+     lfn_attribute = LFN_ATTR;
+     return;
+   }
+   char version_str[100];
+   snprintf(version_str, sizeof(version_str), "%d", index_version);
+   const string versioned_attr = LFN_ATTR + string(version_str);
+   lfn_attribute = versioned_attr;
+   lfn_alt_attribute = versioned_attr + "-alt";
+ }
+
+ coll_t coll() const override { return collection; }
+
+ /// Virtual destructor
+ ~LFNIndex() override {}
+
+ /// @see CollectionIndex
+ int init() override;
+
+ /// @see CollectionIndex
+ int cleanup() override = 0;
+
+ /// @see CollectionIndex
+ int created(
+ const ghobject_t &oid,
+ const char *path
+ ) override;
+
+ /// @see CollectionIndex
+ int unlink(
+ const ghobject_t &oid
+ ) override;
+
+ /// @see CollectionIndex
+ int lookup(
+ const ghobject_t &oid,
+ IndexedPath *path,
+ int *hardlink
+ ) override;
+
+ /// @see CollectionIndex
+ int pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) override;
+
+ /// @see CollectionIndex
+ int collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) override;
+
+ virtual int _split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ CollectionIndex* dest //< [in] destination index
+ ) = 0;
+ virtual int _merge(
+ uint32_t bits, //< [in] bits for target
+ CollectionIndex* dest //< [in] destination index
+ ) = 0;
+
+ /// @see CollectionIndex
+ // Runs _split() under WRAP_RETRY: the macro supplies `r` and the `out`
+ // label, and re-runs the body after cleanup() on RetryException.
+ int split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override {
+ WRAP_RETRY(
+ r = _split(match, bits, dest);
+ goto out;
+ );
+ }
+
+ /// @see CollectionIndex
+ // Runs _merge() under WRAP_RETRY: the macro supplies `r` and the `out`
+ // label, and re-runs the body after cleanup() on RetryException.
+ int merge(
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override {
+ WRAP_RETRY(
+ r = _merge(bits, dest);
+ goto out;
+ );
+ }
+
+ /**
+ * Returns the length of the longest escaped name which could result
+ * from any clone, shard, or rollback object of this object
+ */
+ static uint64_t get_max_escaped_name_len(const hobject_t &obj);
+
+protected:
+ virtual int _init() = 0;
+
+ /// Will be called upon object creation
+ virtual int _created(
+ const vector<string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object created.
+ const string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Will be called to remove an object
+ virtual int _remove(
+ const vector<string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Return the path and mangled_name for oid.
+ virtual int _lookup(
+ const ghobject_t &oid,///< [in] Object for lookup.
+ vector<string> *path, ///< [out] Path to the object.
+ string *mangled_name, ///< [out] Mangled filename.
+ int *exists ///< [out] True if the object exists.
+ ) = 0;
+
+ /// Pre-hash the collection with the given pg number and
+ /// expected number of objects in the collection.
+ virtual int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) = 0;
+
+ /// @see CollectionIndex
+ virtual int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) = 0;
+
+protected:
+
+ /* Non-virtual utility methods */
+
+ /// Sync a subdirectory
+ int fsync_dir(
+ const vector<string> &path ///< [in] Path to sync
+ ); ///< @return Error Code, 0 on success
+
+ /// Link an object from from into to
+ int link_object(
+ const vector<string> &from, ///< [in] Source subdirectory.
+ const vector<string> &to, ///< [in] Dest subdirectory.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const string &from_short_name ///< [in] Mangled filename of oid.
+ ); ///< @return Error Code, 0 on success
+
+ /**
+ * Efficiently remove objects from a subdirectory
+ *
+ * remove_object invalidates mangled names in the directory requiring
+ * the mangled name of each additional object to be looked up a second
+ * time. remove_objects removes the need for additional lookups
+ *
+ * @param [in] dir Directory from which to remove.
+ * @param [in] map of objects to remove to mangle names
+ * @param [in,out] map of filenames to objects
+ * @return Error Code, 0 on success.
+ */
+ int remove_objects(
+ const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining
+ );
+
+
+ /**
+ * Moves contents of from into to.
+ *
+ * Invalidates mangled names in to. If interrupted, all objects will be
+ * present in to before objects are removed from from. Ignores EEXIST
+ * while linking into to.
+ * @return Error Code, 0 on success
+ */
+ int move_objects(
+ const vector<string> &from, ///< [in] Source subdirectory.
+ const vector<string> &to ///< [in] Dest subdirectory.
+ );
+
+ /**
+ * Remove an object from from.
+ *
+ * Invalidates mangled names in from.
+ * @return Error Code, 0 on success
+ */
+ int remove_object(
+ const vector<string> &from, ///< [in] Directory from which to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
+ );
+
+ /**
+ * Gets the filename corresponding to oid in from.
+ *
+ * The filename may differ between subdirectories. Furthermore,
+ * file creations or removals in from may invalidate the name.
+ * @return Error code on failure, 0 on success
+ */
+ int get_mangled_name(
+ const vector<string> &from, ///< [in] Subdirectory
+ const ghobject_t &oid, ///< [in] Object
+ string *mangled_name, ///< [out] Filename
+ int *hardlink ///< [out] hardlink for this file; hardlink=0 means the file does not exist
+ );
+
+ /// do move subdir from from to dest
+ static int move_subdir(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path containing dir
+ string dir ///< [in] dir to move
+ );
+
+ /// do move object from from to dest
+ static int move_object(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ const pair<string, ghobject_t> &obj ///< [in] obj to move
+ );
+
+ /**
+ * Lists objects in to_list.
+ *
+ * @param [in] to_list Directory to list.
+ * @param [in] max_objects Max number to list.
+ * @param [in,out] handle Cookie for continuing the listing.
+ * Initialize to zero to start at the beginning of the directory.
+ * @param [out] out Mapping of listed object filenames to objects.
+ * @return Error code on failure, 0 on success
+ */
+ int list_objects(
+ const vector<string> &to_list,
+ int max_objects,
+ long *handle,
+ map<string, ghobject_t> *out
+ );
+
+ /// Lists subdirectories.
+ int list_subdirs(
+ const vector<string> &to_list, ///< [in] Directory to list.
+ vector<string> *out ///< [out] Subdirectories listed.
+ );
+
+ /// Create subdirectory.
+ int create_path(
+ const vector<string> &to_create ///< [in] Subdirectory to create.
+ );
+
+ /// Remove subdirectory.
+ int remove_path(
+ const vector<string> &to_remove ///< [in] Subdirectory to remove.
+ );
+
+ /// Check whether to_check exists.
+ int path_exists(
+ const vector<string> &to_check, ///< [in] Subdirectory to check.
+ int *exists ///< [out] 1 if it exists, 0 else
+ );
+
+ /// Save attr_value to attr_name attribute on path.
+ int add_attr_path(
+ const vector<string> &path, ///< [in] Path to modify.
+ const string &attr_name, ///< [in] Name of attribute.
+ bufferlist &attr_value ///< [in] Value to save.
+ );
+
+ /// Read into attr_value attribute attr_name on path.
+ int get_attr_path(
+ const vector<string> &path, ///< [in] Path to read.
+ const string &attr_name, ///< [in] Attribute to read.
+ bufferlist &attr_value ///< [out] Attribute value read.
+ );
+
+ /// Remove attr from path
+ int remove_attr_path(
+ const vector<string> &path, ///< [in] path from which to remove attr
+ const string &attr_name ///< [in] attr to remove
+ ); ///< @return Error code, 0 on success
+
+private:
+ /* lfn translation functions */
+
+ /**
+ * Gets the version specific lfn attribute tag
+ */
+ const string &get_lfn_attr() const {
+ return lfn_attribute;
+ }
+ /// Gets the alternate lfn attribute tag.  The constructor only sets it
+ /// for index versions other than HASH_INDEX_TAG; otherwise it is empty.
+ const string &get_alt_lfn_attr() const {
+ return lfn_alt_attribute;
+ }
+
+ /**
+ * Gets the filename corresponding to oid in path.
+ *
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
+ * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if
+ * not needed
+ * @return Error Code, 0 on success.
+ */
+ int lfn_get_name(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name,
+ string *full_path,
+ int *hardlink
+ );
+
+ /// Adjusts path contents when oid is created at name mangled_name.
+ int lfn_created(
+ const vector<string> &path, ///< [in] Path to adjust.
+ const ghobject_t &oid, ///< [in] Object created.
+ const string &mangled_name ///< [in] Filename of created object.
+ );
+
+ /// Removes oid from path while adjusting path contents
+ int lfn_unlink(
+ const vector<string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const string &mangled_name ///< [in] Filename of object to remove.
+ );
+
+ /// Translate a file into a ghobject_t.
+ int lfn_translate(
+ const vector<string> &path, ///< [in] Path containing the file.
+ const string &short_name, ///< [in] Filename to translate.
+ ghobject_t *out ///< [out] Object found.
+ ); ///< @return Negative error code on error, 0 if not an object, 1 else
+
+ /* manglers/demanglers */
+ /// Filters object filenames
+ bool lfn_is_object(
+ const string &short_name ///< [in] Filename to check
+ ); ///< True if short_name is an object, false otherwise
+
+ /// Filters subdir filenames
+ bool lfn_is_subdir(
+ const string &short_name, ///< [in] Filename to check.
+ string *demangled_name ///< [out] Demangled subdir name.
+ ); ///< @return True if short_name is a subdir, false otherwise
+
+ /// Generate object name
+ string lfn_generate_object_name_keyless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ string lfn_generate_object_name_poolless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ static string lfn_generate_object_name_current(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ string lfn_generate_object_name(
+   const ghobject_t &oid ///< [in] Object for which to generate.
+   ) {
+   // Older index versions keep their own name formats; everything else
+   // uses the current one.
+   if (index_version == HASH_INDEX_TAG) {
+     return lfn_generate_object_name_keyless(oid);
+   }
+   if (index_version == HASH_INDEX_TAG_2) {
+     return lfn_generate_object_name_poolless(oid);
+   }
+   return lfn_generate_object_name_current(oid);
+ } ///< @return Generated object name.
+
+ /// Parse object name
+ int lfn_parse_object_name_keyless(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return 0 on success, negative error code otherwise.
+
+ /// Parse object name
+ int lfn_parse_object_name_poolless(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return 0 on success, -EINVAL if long_name cannot be parsed.
+
+ /// Parse object name
+ int lfn_parse_object_name(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return 0 on success, negative error code otherwise.
+
+ /// Checks whether short_name is a hashed filename.
+ bool lfn_is_hashed_filename(
+ const string &short_name ///< [in] Name to check.
+ ); ///< @return True if short_name is hashed, False otherwise.
+
+ /// Checks whether long_name must be hashed.
+ bool lfn_must_hash(
+ const string &long_name ///< [in] Name to check.
+ ); ///< @return True if long_name must be hashed, False otherwise.
+
+ /// Generate hashed name.
+ string lfn_get_short_name(
+ const ghobject_t &oid, ///< [in] Object for which to generate.
+ int i ///< [in] Index of hashed name to generate.
+ ); ///< @return Hashed filename.
+
+ /* other common methods */
+ /// Gets the base path
+ const string &get_base_path(); ///< @return Index base_path
+
+ /// Get full path the subdir
+ string get_full_path_subdir(
+ const vector<string> &rel ///< [in] The subdir.
+ ); ///< @return Full path to rel.
+
+ /// Get full path to object
+ string get_full_path(
+ const vector<string> &rel, ///< [in] Path to object.
+ const string &name ///< [in] Filename of object.
+ ); ///< @return Fullpath to object at name in rel.
+
+ /// Get mangled path component
+ string mangle_path_component(
+ const string &component ///< [in] Component to mangle
+ ); /// @return Mangled component
+
+ /// Demangle component
+ string demangle_path_component(
+ const string &component ///< [in] Subdir name to demangle
+ ); ///< @return Demangled path component.
+
+ /// Decompose full path into object name and filename.
+ int decompose_full_path(
+ const char *in, ///< [in] Full path to object.
+ vector<string> *out, ///< [out] Path to object at in.
+ ghobject_t *oid, ///< [out] Object at in.
+ string *shortname ///< [out] Filename of object at in.
+ ); ///< @return Error Code, 0 on success.
+
+ /// Mangle attribute name
+ string mangle_attr_name(
+ const string &attr ///< [in] Attribute to mangle.
+ ); ///< @return Mangled attribute name.
+
+ /// checks whether long_name could hash to short_name
+ bool short_name_matches(
+ const char *short_name, ///< [in] name to check against
+ const char *cand_long_name ///< [in] candidate long name
+ );
+
+ /// Builds hashed filename
+ void build_filename(
+ const char *old_filename, ///< [in] Filename to convert.
+ int i, ///< [in] Index of hash.
+ char *filename, ///< [out] Resulting filename.
+ int len ///< [in] Size of buffer for filename
+ ); ///< Returns nothing; the result is written to filename.
+
+ /// Get hash of filename
+ int hash_filename(
+ const char *filename, ///< [in] Filename to hash.
+ char *hash, ///< [out] Hash of filename.
+ int len ///< [in] Size of hash buffer.
+ ); ///< @return Error Code, 0 on success.
+
+ friend class TestWrapLFNIndex;
+};
+typedef LFNIndex::IndexedPath IndexedPath;
+
+#endif
diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h
new file mode 100644
index 00000000..164112ee
--- /dev/null
+++ b/src/os/filestore/SequencerPosition.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OS_SEQUENCERPOSITION_H
+#define __CEPH_OS_SEQUENCERPOSITION_H
+
+#include "include/types.h"
+#include "include/cmp.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+#include <ostream>
+
+/**
+ * transaction and op offset
+ */
+struct SequencerPosition {
+ uint64_t seq; ///< seq
+ uint32_t trans; ///< transaction in that seq (0-based)
+ uint32_t op; ///< op in that transaction (0-based)
+
+ // NOTE(review): t and o are taken as int32_t but stored in uint32_t
+ // members, so a negative argument wraps to a large unsigned value.
+ SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {}
+
+ /// Encode seq, trans and op, in that order, as version 1.
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(seq, bl);
+ encode(trans, bl);
+ encode(op, bl);
+ ENCODE_FINISH(bl);
+ }
+ /// Decode the fields in the order encode() writes them.
+ void decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(seq, p);
+ decode(trans, p);
+ decode(op, p);
+ DECODE_FINISH(p);
+ }
+ /// Dump the three fields as unsigned values named "seq", "trans", "op".
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("trans", trans);
+ f->dump_unsigned("op", op);
+ }
+ /// Append heap-allocated sample instances to o.
+ static void generate_test_instances(list<SequencerPosition*>& o) {
+ o.push_back(new SequencerPosition);
+ o.push_back(new SequencerPosition(1, 2, 3));
+ o.push_back(new SequencerPosition(4, 5, 6));
+ }
+};
+WRITE_CLASS_ENCODER(SequencerPosition)
+
+/// Print a position as "<seq>.<trans>.<op>".
+inline ostream& operator<<(ostream& out, const SequencerPosition& t) {
+  out << t.seq;
+  out << "." << t.trans;
+  out << "." << t.op;
+  return out;
+}
+
+WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op)
+WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op)
+
+
+#endif
diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc
new file mode 100644
index 00000000..ba2ed131
--- /dev/null
+++ b/src/os/filestore/WBThrottle.cc
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+
+#include "os/filestore/WBThrottle.h"
+#include "common/perf_counters.h"
+#include "common/errno.h"
+
+/**
+ * Construct a stopped throttle for cct.
+ *
+ * Loads the limits from the configuration (fs starts out as XFS), creates
+ * and registers the "WBThrottle" perf counters with every counter set to 0,
+ * and registers this object as a configuration observer.  The flusher
+ * thread is not started here; see start().
+ */
+WBThrottle::WBThrottle(CephContext *cct) :
+  cur_ios(0), cur_size(0),
+  cct(cct),
+  logger(NULL),
+  stopping(true),
+  lock("WBThrottle::lock", false, true, false),
+  fs(XFS)
+{
+  // Check the context before set_from_conf() dereferences it below.
+  ceph_assert(cct);
+  {
+    Mutex::Locker l(lock);
+    set_from_conf();
+  }
+  PerfCountersBuilder b(
+    cct, string("WBThrottle"),
+    l_wbthrottle_first, l_wbthrottle_last);
+  b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(UNIT_BYTES));
+  b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(UNIT_BYTES));
+  b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
+  b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
+  b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
+  b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
+  logger = b.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+  // first/last are range sentinels; only the counters between them exist
+  for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
+    logger->set(i, 0);
+
+  cct->_conf.add_observer(this);
+}
+
+// Unregister and free the perf counters, then stop observing configuration
+// changes.  The flusher thread is not joined here; see stop().
+WBThrottle::~WBThrottle() {
+ ceph_assert(cct);
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ cct->_conf.remove_observer(this);
+}
+
+// Clear the stopping flag under the lock and create the "wb_throttle"
+// thread.
+void WBThrottle::start()
+{
+ {
+ Mutex::Locker l(lock);
+ stopping = false;
+ }
+ create("wb_throttle");
+}
+
+// Set the stopping flag, wake a waiter on cond so it can observe it, and
+// join the thread.  The lock is released before join().
+void WBThrottle::stop()
+{
+ {
+ Mutex::Locker l(lock);
+ stopping = true;
+ cond.Signal();
+ }
+
+ join();
+}
+
+// Return the NULL-terminated list of configuration keys this observer
+// tracks: the start_flusher and hard_limit options for bytes, ios and
+// inodes, for both btrfs and xfs.  handle_conf_change() walks the same list.
+const char** WBThrottle::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_wbthrottle_btrfs_bytes_start_flusher",
+ "filestore_wbthrottle_btrfs_bytes_hard_limit",
+ "filestore_wbthrottle_btrfs_ios_start_flusher",
+ "filestore_wbthrottle_btrfs_ios_hard_limit",
+ "filestore_wbthrottle_btrfs_inodes_start_flusher",
+ "filestore_wbthrottle_btrfs_inodes_hard_limit",
+ "filestore_wbthrottle_xfs_bytes_start_flusher",
+ "filestore_wbthrottle_xfs_bytes_hard_limit",
+ "filestore_wbthrottle_xfs_ios_start_flusher",
+ "filestore_wbthrottle_xfs_ios_hard_limit",
+ "filestore_wbthrottle_xfs_inodes_start_flusher",
+ "filestore_wbthrottle_xfs_inodes_hard_limit",
+ NULL
+ };
+ return KEYS;
+}
+
+// Reload the size/io/fd limits from the configuration options that match
+// the current fs (BTRFS or XFS) and wake a waiter on cond so it re-checks
+// them.  Aborts on any other fs value.  The caller must hold lock.
+void WBThrottle::set_from_conf()
+{
+  ceph_assert(lock.is_locked());
+
+  const bool use_btrfs = (fs == BTRFS);
+  if (!use_btrfs && !(fs == XFS)) {
+    ceph_abort_msg("invalid value for fs");
+  }
+
+  // .first is the start_flusher limit, .second the hard limit
+  size_limits.first = use_btrfs ?
+    cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher :
+    cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher;
+  size_limits.second = use_btrfs ?
+    cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit :
+    cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit;
+
+  io_limits.first = use_btrfs ?
+    cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher :
+    cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher;
+  io_limits.second = use_btrfs ?
+    cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit :
+    cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit;
+
+  fd_limits.first = use_btrfs ?
+    cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher :
+    cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher;
+  fd_limits.second = use_btrfs ?
+    cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit :
+    cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit;
+
+  cond.Signal();
+}
+
+// Configuration observer callback: reload the limits once if any tracked
+// key is in changed.
+void WBThrottle::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  Mutex::Locker l(lock);
+  const char** key = get_tracked_conf_keys();
+  // advance to the first tracked key that changed, or to the terminator
+  while (*key && !changed.count(*key))
+    ++key;
+  if (*key)
+    set_from_conf();
+}
+
+/**
+ * Block until there is a pending writeback that should be flushed, then
+ * remove it from pending_wbs and return it in *next.
+ *
+ * Waits on cond while the throttle is not stopping and either the limits
+ * are not exceeded or nothing is pending.  The caller must hold lock.
+ *
+ * @param next  [out] object, its fd and its pending-writeback totals
+ * @return true if *next was filled in, false if the throttle is stopping
+ */
+bool WBThrottle::get_next_should_flush(
+  boost::tuple<ghobject_t, FDRef, PendingWB> *next)
+{
+  ceph_assert(lock.is_locked());
+  ceph_assert(next);
+  while (!stopping && (!beyond_limit() || pending_wbs.empty()))
+    cond.Wait(lock);
+  if (stopping)
+    return false;
+  ceph_assert(!pending_wbs.empty());
+  ghobject_t obj(pop_object());
+
+  ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+    pending_wbs.find(obj);
+  // The popped object must have a pending entry; fail loudly instead of
+  // dereferencing and erasing end() if the two structures ever disagree.
+  ceph_assert(i != pending_wbs.end());
+  *next = boost::make_tuple(obj, i->second.second, i->second.first);
+  pending_wbs.erase(i);
+  return true;
+}
+
+
+// Thread body: repeatedly take the next writeback that should be flushed,
+// account for it, and sync its fd.  Runs until get_next_should_flush()
+// reports that the throttle is stopping.
+void *WBThrottle::entry()
+{
+ Mutex::Locker l(lock);
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
+ while (get_next_should_flush(&wb)) {
+ // publish the object being flushed; clear_object() waits on it
+ clearing = wb.get<0>();
+ cur_ios -= wb.get<2>().ios;
+ logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
+ logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
+ cur_size -= wb.get<2>().size;
+ logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
+ logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+ logger->inc(l_wbthrottle_inodes_wb);
+ // the sync itself runs without the lock held
+ lock.Unlock();
+#if defined(HAVE_FDATASYNC)
+ int r = ::fdatasync(**wb.get<1>());
+#else
+ int r = ::fsync(**wb.get<1>());
+#endif
+ if (r < 0) {
+ lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ // only when filestore_fadvise is set and every write was nocache
+ if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) {
+ int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
+ ceph_assert(fa_r == 0);
+ }
+#endif
+ lock.Lock();
+ clearing = ghobject_t();
+ cond.Signal();
+ // reset wb so the FDRef from this round is released
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
+ }
+ return 0;
+}
+
+// Record one write of len bytes to hoid as pending writeback.
+//
+// A new pending entry holding fd is created for an object seen for the
+// first time; an already-pending object is removed with remove_object() and
+// re-added with insert_object() below.  The io and byte totals are bumped,
+// and cond is signalled when beyond_limit() is true afterwards.
+// offset is not used in this function.
+void WBThrottle::queue_wb(
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
+ bool nocache)
+{
+ Mutex::Locker l(lock);
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ pending_wbs.find(hoid);
+ if (wbiter == pending_wbs.end()) {
+ wbiter = pending_wbs.insert(
+ make_pair(hoid,
+ make_pair(
+ PendingWB(),
+ fd))).first;
+ logger->inc(l_wbthrottle_inodes_dirtied);
+ } else {
+ // presumably drops the old LRU position so insert_object() below
+ // re-queues it -- TODO confirm against remove_object()/insert_object()
+ remove_object(hoid);
+ }
+
+ cur_ios++;
+ logger->inc(l_wbthrottle_ios_dirtied);
+ cur_size += len;
+ logger->inc(l_wbthrottle_bytes_dirtied, len);
+
+ wbiter->second.first.add(nocache, len, 1);
+ insert_object(hoid);
+ if (beyond_limit())
+ cond.Signal();
+}
+
+// Drop every pending writeback without syncing it: reset the totals and the
+// dirtied counters, empty pending_wbs and both lru structures, and signal
+// cond.  With HAVE_POSIX_FADVISE, nocache entries get POSIX_FADV_DONTNEED
+// first when filestore_fadvise is set.
+void WBThrottle::clear()
+{
+  typedef ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator
+    pending_iter;
+
+  Mutex::Locker l(lock);
+  for (pending_iter wb = pending_wbs.begin(); wb != pending_wbs.end(); ++wb) {
+#ifdef HAVE_POSIX_FADVISE
+    if (cct->_conf->filestore_fadvise && wb->second.first.nocache) {
+      int fa_r = posix_fadvise(**wb->second.second, 0, 0, POSIX_FADV_DONTNEED);
+      ceph_assert(fa_r == 0);
+    }
+#endif
+
+  }
+
+  cur_ios = cur_size = 0;
+  logger->set(l_wbthrottle_ios_dirtied, 0);
+  logger->set(l_wbthrottle_bytes_dirtied, 0);
+  logger->set(l_wbthrottle_inodes_dirtied, 0);
+
+  pending_wbs.clear();
+  lru.clear();
+  rev_lru.clear();
+
+  cond.Signal();
+}
+
+// Forget the pending writeback for hoid, if there is one: subtract its
+// totals, remove it from pending_wbs and via remove_object(), and signal
+// cond.  First waits until hoid is no longer the object being flushed.
+void WBThrottle::clear_object(const ghobject_t &hoid)
+{
+  typedef ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator
+    pending_iter;
+
+  Mutex::Locker l(lock);
+  while (clearing == hoid)
+    cond.Wait(lock);
+
+  pending_iter found = pending_wbs.find(hoid);
+  if (found != pending_wbs.end()) {
+    const PendingWB &pending = found->second.first;
+
+    cur_ios -= pending.ios;
+    logger->dec(l_wbthrottle_ios_dirtied, pending.ios);
+    cur_size -= pending.size;
+    logger->dec(l_wbthrottle_bytes_dirtied, pending.size);
+    logger->dec(l_wbthrottle_inodes_dirtied);
+
+    // pending refers into the map entry; it is not used past this point
+    pending_wbs.erase(found);
+    remove_object(hoid);
+    cond.Signal();
+  }
+}
+
+// Block the caller while need_flush() is true, unless the throttle is
+// stopping.
+void WBThrottle::throttle()
+{
+ Mutex::Locker l(lock);
+ while (!stopping && need_flush())
+ cond.Wait(lock);
+}
diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h
new file mode 100644
index 00000000..ef809ea4
--- /dev/null
+++ b/src/os/filestore/WBThrottle.h
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WBTHROTTLE_H
+#define WBTHROTTLE_H
+
+#include "include/unordered_map.h"
+#include <boost/tuple/tuple.hpp>
+#include "common/Formatter.h"
+#include "common/hobject.h"
+#include "include/interval_set.h"
+#include "FDCache.h"
+#include "common/Thread.h"
+#include "common/ceph_context.h"
+
+class PerfCounters;
+enum {
+ l_wbthrottle_first = 999090,
+ l_wbthrottle_bytes_dirtied,
+ l_wbthrottle_bytes_wb,
+ l_wbthrottle_ios_dirtied,
+ l_wbthrottle_ios_wb,
+ l_wbthrottle_inodes_dirtied,
+ l_wbthrottle_inodes_wb,
+ l_wbthrottle_last
+};
+
+/**
+ * WBThrottle
+ *
+ * Tracks, throttles, and flushes outstanding IO
+ *
+ * Per-object pending writeback state lives in pending_wbs; lru/rev_lru
+ * keep the objects in insertion order so they can be popped oldest first.
+ * Each of the three limit pairs holds a soft (start_flusher) and a hard
+ * threshold; beyond_limit() tests the soft ones, need_flush() the hard ones.
+ */
+class WBThrottle : Thread, public md_config_obs_t {
+ /// clear_object() waits on cond while this equals the object it was asked
+ /// to clear.  NOTE(review): the code that assigns it is not in this header.
+ ghobject_t clearing;
+ /* *_limits.first is the start_flusher limit and
+ * *_limits.second is the hard limit
+ */
+
+ /// Limits on unflushed bytes
+ pair<uint64_t, uint64_t> size_limits;
+
+ /// Limits on unflushed ios
+ pair<uint64_t, uint64_t> io_limits;
+
+ /// Limits on unflushed objects
+ pair<uint64_t, uint64_t> fd_limits;
+
+ uint64_t cur_ios; /// Currently unflushed IOs
+ uint64_t cur_size; /// Currently unflushed bytes
+
+ /**
+ * PendingWB tracks the ios pending on an object.
+ *
+ * nocache starts out true and is cleared by the first add() whose
+ * _nocache argument is false; size and ios accumulate across add() calls.
+ */
+ class PendingWB {
+ public:
+ bool nocache;
+ uint64_t size;
+ uint64_t ios;
+ PendingWB() : nocache(true), size(0), ios(0) {}
+ /// Account for _ios more ios totalling _size bytes on this object.
+ void add(bool _nocache, uint64_t _size, uint64_t _ios) {
+ if (!_nocache)
+ nocache = false; // only nocache if all writes are nocache
+ size += _size;
+ ios += _ios;
+ }
+ };
+
+ CephContext *cct;
+ PerfCounters *logger;
+ /// When set, throttle() stops waiting even if need_flush() is still true.
+ bool stopping;
+ /// Taken by set_fs() and asserted held by remove_object().
+ Mutex lock;
+ Cond cond;
+
+
+ /**
+ * Flush objects in lru order
+ *
+ * lru holds the object ids, oldest at the front; rev_lru maps each id
+ * to its position in lru so it can be erased without a scan.
+ */
+ list<ghobject_t> lru;
+ ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+ /// Drop oid from lru/rev_lru if present; asserts that lock is held.
+ void remove_object(const ghobject_t &oid) {
+ ceph_assert(lock.is_locked());
+ ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
+ if (iter == rev_lru.end())
+ return;
+
+ lru.erase(iter->second);
+ rev_lru.erase(iter);
+ }
+ /// Remove and return the object at the front of lru; lru must not be empty.
+ ghobject_t pop_object() {
+ ceph_assert(!lru.empty());
+ ghobject_t oid(lru.front());
+ lru.pop_front();
+ rev_lru.erase(oid);
+ return oid;
+ }
+ /// Append oid to the back of lru; asserts that oid is not already tracked.
+ void insert_object(const ghobject_t &oid) {
+ ceph_assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
+ }
+
+ /// Per-object pending writeback totals together with the FDRef queued for it.
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
+
+ /// get next flush to perform
+ bool get_next_should_flush(
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ ); ///< @return false if we are shutting down
+public:
+ /// Filesystem selector passed to set_fs().
+ enum FS {
+ BTRFS,
+ XFS
+ };
+
+private:
+ FS fs;
+
+ /// Called by set_fs() after fs changes.  NOTE(review): defined elsewhere;
+ /// presumably reloads the *_limits pairs from the config.
+ void set_from_conf();
+ /// True once any of ios, objects or bytes reaches its start_flusher limit.
+ bool beyond_limit() const {
+ if (cur_ios < io_limits.first &&
+ pending_wbs.size() < fd_limits.first &&
+ cur_size < size_limits.first)
+ return false;
+ else
+ return true;
+ }
+ /// True once any of ios, objects or bytes reaches its hard limit.
+ bool need_flush() const {
+ if (cur_ios < io_limits.second &&
+ pending_wbs.size() < fd_limits.second &&
+ cur_size < size_limits.second)
+ return false;
+ else
+ return true;
+ }
+
+public:
+ explicit WBThrottle(CephContext *cct);
+ ~WBThrottle() override;
+
+ void start();
+ void stop();
+ /// Set fs as XFS or BTRFS
+ void set_fs(FS new_fs) {
+ Mutex::Locker l(lock);
+ fs = new_fs;
+ set_from_conf();
+ }
+
+ /// Queue wb on oid, fd taking throttle (does not block)
+ void queue_wb(
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
+ uint64_t offset, ///< [in] offset written
+ uint64_t len, ///< [in] length written
+ bool nocache ///< [in] try to clear out of cache after write
+ );
+
+ /// Clear all wb (probably due to sync)
+ void clear();
+
+ /// Clear object
+ void clear_object(const ghobject_t &oid);
+
+ /// Block until there is throttle available
+ void throttle();
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+
+ /// Thread
+ void *entry() override;
+};
+
+#endif
diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc
new file mode 100644
index 00000000..1081d146
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XfsFileStoreBackend.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/utsname.h>
+
+#include <xfs/xfs.h>
+
+#include "common/errno.h"
+#include "common/linux_version.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") "
+
+/// Build the XFS backend on top of the generic one; extsize support
+/// starts out disabled.
+XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs)
+  : GenericFileStoreBackend(fs),
+    m_has_extsize(false)
+{
+}
+
+/*
+ * Apply an extent size hint of val to the regular file behind fd.
+ *
+ * This would naturally be a free function, but dout_prefix expands to
+ * a call to the protected get_basedir_path() member, so it has to be
+ * a member itself.
+ *
+ * Returns 0 on success or when nothing had to be (or could be)
+ * changed, -EINVAL if fd is not a regular file, or the negated errno
+ * of the failing fstat()/ioctl().
+ */
+int XfsFileStoreBackend::set_extsize(int fd, unsigned int val)
+{
+  struct stat sb;
+  if (fstat(fd, &sb) < 0) {
+    int ret = -errno;
+    dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  if (!S_ISREG(sb.st_mode)) {
+    dout(0) << "set_extsize: invalid target file type" << dendl;
+    return -EINVAL;
+  }
+
+  struct fsxattr fsx;
+  if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+    int ret = -errno;
+    dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  // Nothing to do if the hint is already in place; and xfs won't change
+  // the extent size once any extents are allocated.
+  const bool already_set =
+    (fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val;
+  if (already_set || fsx.fsx_nextents != 0)
+    return 0;
+
+  fsx.fsx_extsize = val;
+  fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+  if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+    int ret = -errno;
+    dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+/*
+ * Run the generic feature detection, then decide whether the extsize
+ * hint may be used (m_has_extsize).
+ *
+ * Returns the generic detection's error, or the negated errno if the
+ * probe file cannot be created or unlinked; otherwise 0.  A failure to
+ * set extsize on the probe file is not an error: it only leaves
+ * m_has_extsize false.
+ */
+int XfsFileStoreBackend::detect_features()
+{
+ int ret;
+
+ ret = GenericFileStoreBackend::detect_features();
+ if (ret < 0)
+ return ret;
+
+ // extsize?
+ // create a scratch file in the base dir to probe on
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600);
+ if (fd < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to create test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out;
+ }
+ // unlink right away; the still-open fd is what the probe below uses
+ if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out_close;
+ }
+
+ if (cct()->_conf->filestore_xfs_extsize) {
+ ret = set_extsize(fd, 1U << 15); // a few pages
+ if (ret) {
+ // not fatal: report success but leave m_has_extsize at false
+ ret = 0;
+ dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl;
+ goto out_close;
+ }
+
+ // make sure we have 3.5 or newer, which includes this fix
+ // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
+ // for this set_extsize bug
+ // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
+ int ver = get_linux_version();
+ if (ver == 0) {
+ // a version of 0 is treated as "could not be determined"
+ dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
+ m_has_extsize = false;
+ } else if (ver < KERNEL_VERSION(3, 5, 0)) {
+ dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
+ m_has_extsize = false;
+ } else {
+ dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
+ m_has_extsize = true;
+ }
+ } else {
+ dout(0) << "detect_feature: extsize is disabled by conf" << dendl;
+ }
+
+// common exit: close the probe fd, then return ret
+out_close:
+ TEMP_FAILURE_RETRY(::close(fd));
+out:
+ return ret;
+}
+
+/*
+ * Forward an allocation hint to set_extsize().
+ *
+ * Returns -EOPNOTSUPP when extsize support is not enabled; asserts
+ * that hint is below UINT_MAX before passing it on.
+ */
+int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint)
+{
+  if (m_has_extsize) {
+    ceph_assert(hint < UINT_MAX);
+    return set_extsize(fd, hint);
+  }
+  return -EOPNOTSUPP;
+}
diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h
new file mode 100644
index 00000000..e8b81f9a
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XFSFILESTOREBACKEND_H
+#define CEPH_XFSFILESTOREBACKEND_H
+
+#include "GenericFileStoreBackend.h"
+
+#include "include/int_types.h"
+
+/// FileStore backend for XFS: the generic backend plus an optional
+/// extent-size allocation hint.
+class XfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ /// Whether set_alloc_hint() may use the extsize ioctl; set by detect_features().
+ bool m_has_extsize;
+ /// Set the extsize attribute of the regular file fd to val.
+ int set_extsize(int fd, unsigned int val);
+public:
+ explicit XfsFileStoreBackend(FileStore *fs);
+ ~XfsFileStoreBackend() override {}
+ const char *get_name() override {
+ return "xfs";
+ }
+ /// Generic detection followed by the extsize probe.
+ int detect_features() override;
+ /// Returns -EOPNOTSUPP unless extsize support was detected.
+ int set_alloc_hint(int fd, uint64_t hint) override;
+};
+
+#endif /* CEPH_XFSFILESTOREBACKEND_H */
diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc
new file mode 100644
index 00000000..e85dbd52
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+
+#include "ZFSFileStoreBackend.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") "
+
+/*
+ * Initialise libzfs, look up the dataset handle for the base directory
+ * and then try to resolve the handle for current/.  Failures are only
+ * logged; the handles that could not be obtained stay NULL.
+ */
+ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs)
+  : GenericFileStoreBackend(fs),
+    base_zh(NULL),
+    current_zh(NULL),
+    m_filestore_zfs_snap(cct()->_conf->filestore_zfs_snap)
+{
+  if (zfs.init() < 0) {
+    dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl;
+    return;
+  }
+
+  base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM);
+  if (base_zh) {
+    update_current_zh();
+    return;
+  }
+
+  dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl;
+}
+
+/// Release whichever zfs handles were obtained: base first, then current.
+ZFSFileStoreBackend::~ZFSFileStoreBackend()
+{
+  ZFS::Handle *handles[] = { base_zh, current_zh };
+  for (size_t idx = 0; idx < sizeof(handles) / sizeof(handles[0]); ++idx) {
+    if (handles[idx])
+      zfs.close(handles[idx]);
+  }
+}
+
+/*
+ * Find the zfs dataset that backs current/ and store its handle in
+ * current_zh.
+ *
+ * First "<base dataset>/current" is opened.  If it is mounted exactly
+ * at get_current_path() its handle is kept.  If it is not mounted it
+ * gets mounted.  In the remaining cases the handle for the current/
+ * path is looked up and accepted only when it names a different
+ * dataset than the base directory.
+ *
+ * Returns 0 when current_zh was set, -ENOENT when no suitable dataset
+ * was found, or the negated errno of a failed mount.
+ */
+int ZFSFileStoreBackend::update_current_zh()
+{
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+  ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM);
+  if (!zh) {
+    dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl;
+    return -ENOENT;
+  }
+
+  char *mnt;
+  if (zfs.is_mounted(zh, &mnt)) {
+    const bool mounted_at_current = (get_current_path() == mnt);
+    free(mnt);
+    if (mounted_at_current) {
+      current_zh = zh;  // ownership of zh moves to the member
+      return 0;
+    }
+  } else {
+    int ret = zfs.mount(zh, NULL, 0);
+    if (ret < 0) {
+      ret = -errno;  // capture before anything else can touch errno
+      dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh)
+              << "' got " << cpp_strerror(ret) << dendl;
+      zfs.close(zh);  // do not leak the handle on the error path
+      return ret;
+    }
+  }
+  zfs.close(zh);
+
+  zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM);
+  if (!zh) {
+    dout(0) << "update_current_zh: current/ not exist" << dendl;
+    return -ENOENT;
+  }
+
+  // current/ must live on its own dataset, distinct from the base one
+  if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) {
+    current_zh = zh;
+    return 0;
+  }
+  zfs.close(zh);
+  dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl;
+  return -ENOENT;
+}
+
+/// Nothing to probe for zfs: only logs when there is no handle for
+/// current/ and always returns 0.
+int ZFSFileStoreBackend::detect_features()
+{
+  if (current_zh == NULL) {
+    dout(0) << "detect_features: null zfs handle for current/" << dendl;
+  }
+  return 0;
+}
+
+/// Checkpoints need both the filestore_zfs_snap option and a handle
+/// for current/.
+bool ZFSFileStoreBackend::can_checkpoint()
+{
+  if (!m_filestore_zfs_snap)
+    return false;
+  return current_zh != NULL;
+}
+
+/*
+ * Make sure current/ exists.
+ *
+ * An existing directory is accepted as is (0); an existing
+ * non-directory yields -ENOTDIR.  If current/ is missing, a
+ * "<base dataset>/current" filesystem is created (an EEXIST failure is
+ * tolerated) and the result of update_current_zh() is returned.
+ */
+int ZFSFileStoreBackend::create_current()
+{
+  struct stat st;
+  if (::stat(get_current_path().c_str(), &st) == 0) {
+    // current/ exists
+    if (S_ISDIR(st.st_mode))
+      return 0;
+    dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+    return -ENOTDIR;
+  }
+  if (errno != ENOENT) {
+    int err = -errno;
+    dout(0) << "create_current: cannot stat current/ " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+  int ret = zfs.create(path, ZFS::TYPE_FILESYSTEM);
+  if (ret < 0 && errno != EEXIST) {
+    ret = -errno;
+    dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  return update_current_zh();
+}
+
+/*
+ * Snapshot iteration callback: append the part of the snapshot's name
+ * after '@' to the list<string> passed through data.  Asserts that
+ * the name contains an '@' followed by at least one character.
+ */
+static int list_checkpoints_callback(ZFS::Handle *zh, void *data)
+{
+  string str = ZFS::get_name(zh);
+  size_t pos = str.find('@');
+  ceph_assert(pos != string::npos && pos + 1 != str.length());
+
+  list<string> *out = static_cast<list<string> *>(data);
+  out->push_back(str.substr(pos + 1));
+  return 0;
+}
+
+/*
+ * Replace the contents of ls with the names of the snapshots of
+ * current/, in the order produced by iter_snapshots_sorted().
+ *
+ * Returns -EINVAL without a handle for current/, the negated errno if
+ * the iteration fails (ls is left untouched then), 0 otherwise.
+ */
+int ZFSFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+  dout(10) << "list_checkpoints:" << dendl;
+  if (current_zh == NULL)
+    return -EINVAL;
+
+  // collect into a scratch list so ls only changes on success
+  list<string> found;
+  if (zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &found) < 0) {
+    int err = -errno;
+    dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  ls.swap(found);
+  return 0;
+}
+
+/*
+ * Snapshot current/ as "<current dataset>@<name>".
+ *
+ * The filesystem is synced first.  *cid, when given, is set to 0.
+ * Returns -EINVAL without a handle for current/, the negated errno of
+ * a failed sync or snapshot, 0 otherwise.
+ */
+int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid)
+{
+  dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+  if (current_zh == NULL)
+    return -EINVAL;
+
+  // looks like zfsonlinux doesn't flush dirty data when taking snapshot
+  if (sync_filesystem(get_current_fd()) < 0) {
+    int err = -errno;
+    dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+  if (zfs.snapshot(path, false) < 0) {
+    int err = -errno;
+    dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  if (cid != NULL)
+    *cid = 0;
+  return 0;
+}
+
+/*
+ * Roll current/ back to the snapshot "<current dataset>@<name>".
+ *
+ * current/ is unmounted first (a failure to do so is only logged),
+ * the rollback is performed, and current/ is mounted again if it is
+ * not mounted afterwards.
+ *
+ * Returns -EINVAL without a handle for current/, -ENOENT if the
+ * snapshot cannot be opened, the negated errno of a failed remount,
+ * and otherwise the rollback result (0 or a negated errno).
+ */
+int ZFSFileStoreBackend::rollback_to(const string& name)
+{
+  dout(10) << "rollback_to: '" << name << "'" << dendl;
+  if (!current_zh)
+    return -EINVAL;
+
+  // umount current to avoid triggering online rollback deadlock
+  int ret;
+  if (zfs.is_mounted(current_zh, NULL)) {
+    ret = zfs.umount(current_zh, NULL, 0);
+    if (ret < 0) {
+      ret = -errno;
+      dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl;
+    }
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+
+  ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT);
+  if (!snap_zh) {
+    dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl;
+    return -ENOENT;
+  }
+
+  ret = zfs.rollback(current_zh, snap_zh, false);
+  if (ret < 0) {
+    ret = -errno;
+    dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl;
+  }
+
+  if (!zfs.is_mounted(current_zh, NULL)) {
+    // separate variable: the rollback result in ret must survive a
+    // successful remount
+    int mount_ret = zfs.mount(current_zh, NULL, 0);
+    if (mount_ret < 0) {
+      mount_ret = -errno;
+      dout(0) << "rollback_to: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(mount_ret) << dendl;
+      zfs.close(snap_zh);  // do not leak the snapshot handle on this path
+      return mount_ret;
+    }
+  }
+
+  zfs.close(snap_zh);
+  return ret;
+}
+
+/*
+ * Destroy the snapshot called name on current/.
+ *
+ * Returns -EINVAL without a handle for current/, the negated errno on
+ * failure, or the non-negative result of destroy_snaps().
+ */
+int ZFSFileStoreBackend::destroy_checkpoint(const string& name)
+{
+  dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+  if (current_zh == NULL)
+    return -EINVAL;
+
+  int ret = zfs.destroy_snaps(current_zh, name.c_str(), true);
+  if (ret >= 0)
+    return ret;
+
+  ret = -errno;
+  dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl;
+  return ret;
+}
diff --git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h
new file mode 100644
index 00000000..b1fa9887
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFSFILESTOREBACKEND_H
+#define CEPH_ZFSFILESTOREBACKEND_H
+
+#ifdef HAVE_LIBZFS
+#include "GenericFileStoreBackend.h"
+#include "os/fs/ZFS.h"
+
+/// FileStore backend for zfs: the generic backend plus checkpoints
+/// implemented as zfs snapshots of the current/ dataset.
+class ZFSFileStoreBackend : public GenericFileStoreBackend {
+private:
+ ZFS zfs;
+ /// Handle of the dataset holding the base directory (NULL if lookup failed).
+ ZFS::Handle *base_zh;
+ /// Handle of the dataset backing current/ (NULL until update_current_zh() succeeds).
+ ZFS::Handle *current_zh;
+ /// Copy of the filestore_zfs_snap option taken at construction.
+ bool m_filestore_zfs_snap;
+ /// Resolve current_zh; returns 0 on success or a negative errno.
+ int update_current_zh();
+public:
+ explicit ZFSFileStoreBackend(FileStore *fs);
+ ~ZFSFileStoreBackend();
+ const char *get_name() override {
+ return "zfs";
+ }
+ /// Only logs when current_zh is NULL; always returns 0.
+ int detect_features();
+ /// True when snapshots are enabled by config and current_zh is set.
+ bool can_checkpoint();
+ /// Ensure current/ exists, creating its dataset if it is missing.
+ int create_current();
+ /// Fill ls with the snapshot names of current/.
+ int list_checkpoints(list<string>& ls);
+ /// Snapshot current/ under the given name; *cid is set to 0.
+ int create_checkpoint(const string& name, uint64_t *cid);
+ /// Roll current/ back to the named snapshot.
+ int rollback_to(const string& name);
+ /// Destroy the named snapshot of current/.
+ int destroy_checkpoint(const string& name);
+};
+#endif
+#endif
diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc
new file mode 100644
index 00000000..e4dedd29
--- /dev/null
+++ b/src/os/filestore/chain_xattr.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "chain_xattr.h"
+#include <errno.h> // for ERANGE, ENODATA, ENOMEM
+#include <stdio.h> // for size_t, snprintf
+#include <stdlib.h> // for free, malloc
+#include <string.h> // for strcpy, strlen
+#include "include/ceph_assert.h" // for assert
+#include "include/buffer.h"
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/ceph_assert.h"
+
+/*
+ * chaining xattrs
+ *
+ * In order to support xattrs that are larger than the xattr size limit that some file systems
+ * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys
+ * are set as follows:
+ * The first xattr in the chain, has a key that holds the original xattr name, with any '@' char
+ * being escaped ("@@").
+ * The chained keys will have the first xattr's key (with the escaping), and a suffix: "@<id>"
+ * where <id> marks the num of xattr in the chain.
+ */
+
+/*
+ * Build the on-disk key for chunk i of the xattr called name.
+ *
+ * Every '@' in name is doubled.  Chunk 0 uses the escaped name as is;
+ * later chunks get an "@<i>" suffix.  Asserts if the result does not
+ * fit into raw_name (raw_len bytes).
+ */
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
+{
+  int pos = 0;          // escaped characters produced so far
+  char *out = raw_name; // next byte to write
+
+  for (const char *in = name; *in != '\0'; ++in) {
+    if (*in == '@') {
+      /* escape it */
+      pos += 2;
+      ceph_assert (pos < raw_len - 1);
+      *out++ = '@';
+      *out++ = '@';
+    } else {
+      pos += 1;
+      ceph_assert(pos < raw_len - 1);
+      *out++ = *in;
+    }
+  }
+
+  if (i == 0) {
+    *out = '\0';
+    return;
+  }
+
+  int r = snprintf(out, raw_len - pos, "@%d", i);
+  ceph_assert(r < raw_len - pos);
+}
+
+/*
+ * Convert an on-disk xattr key back into the logical xattr name.
+ *
+ * "@@" is unescaped to a single '@'.  A single '@' followed by any
+ * other character starts the "@<id>" suffix of a continuation chunk:
+ * translation stops there and *is_first is set to false.  A single
+ * '@' as the very last character also ends the translation, with
+ * *is_first left true.
+ *
+ * name receives the NUL-terminated result; asserts if it would not
+ * fit into name_len bytes.  Returns the length of the result.
+ */
+static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
+{
+  int pos = 0;
+
+  *is_first = true;
+  while (*raw_name) {
+    if (*raw_name == '@') {
+      raw_name++;
+      if (!*raw_name) {
+        // Dangling '@' at the end of the key: stop here.  Continuing
+        // would step past the terminating NUL and count a byte of
+        // name that was never written.
+        break;
+      }
+      if (*raw_name != '@') {
+        // "@<id>": this key is a continuation chunk
+        *is_first = false;
+        break;
+      }
+      // "@@": fall through and copy one '@'
+    }
+
+    *name = *raw_name;
+    pos++;
+    ceph_assert(pos < name_len);
+    name++;
+    raw_name++;
+  }
+
+  *name = '\0';
+  return pos;
+}
+
+
+// getxattr
+
+/*
+ * Total stored length of the chained xattr name on file fn.
+ *
+ * Sums the sizes of chunk 0, 1, ... and stops after the first chunk
+ * that is missing or whose size is not exactly one of the two block
+ * lengths.  An error on chunk 0 is returned as is.
+ */
+static int getxattr_len(const char *fn, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int total = 0;
+
+  for (int i = 0; ; ++i) {
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    const int r = sys_getxattr(fn, raw_name, 0, 0);
+    if (r < 0) {
+      if (i == 0)
+        return r;
+      break;
+    }
+    total += r;
+
+    // only a completely full block can be followed by another chunk
+    if (r != CHAIN_XATTR_MAX_BLOCK_LEN && r != CHAIN_XATTR_SHORT_BLOCK_LEN)
+      break;
+  }
+
+  return total;
+}
+
+/*
+ * Read the chained xattr name of file fn into val (size bytes).
+ *
+ * With size == 0 the total stored length is returned instead.
+ * Otherwise returns the number of bytes read, a negative errno from
+ * the underlying getxattr, or -ERANGE when the buffer was filled by a
+ * full block and a further chunk exists.
+ */
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ // size query only
+ if (!size)
+ return getxattr_len(fn, name);
+
+ do {
+ // offer all remaining space to the next chunk
+ chunk_size = size;
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ // a missing chunk after the first one simply ends the chain
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ // keep going only while space remains and the chunk was a full block
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ // r >= 0 means the loop ended on a successful read, not on a break
+ if (r >= 0) {
+ ret = pos;
+ /* is there another chunk? that can happen if the last read size span over
+ exactly one block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ // zero-size probe of the next chunk
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+/*
+ * Read the chained xattr name of file fn into a freshly allocated
+ * buffer, starting at 1024 bytes and doubling on every -ERANGE.
+ *
+ * On a positive result the buffer is trimmed to the value's length
+ * and, if bp is non-null, swapped into *bp.  Returns the value length,
+ * 0 for an empty value (*bp untouched), or a negative errno.
+ */
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp)
+{
+  for (size_t size = 1024; ; size *= 2) {
+    bufferptr buf(size);
+    int r = chain_getxattr(fn, name, buf.c_str(), size);
+    if (r == -ERANGE)
+      continue;  // too small: retry with twice the size
+    if (r <= 0)
+      return r;  // empty value or hard error
+
+    buf.set_length(r);
+    if (bp)
+      bp->swap(buf);
+    return r;
+  }
+  ceph_abort_msg("unreachable");
+  return 0;
+}
+
+/*
+ * Total stored length of the chained xattr name on descriptor fd.
+ *
+ * Sums the sizes of chunk 0, 1, ... and stops after the first chunk
+ * that is missing or whose size is not exactly one of the two block
+ * lengths.  An error on chunk 0 is returned as is.
+ */
+static int chain_fgetxattr_len(int fd, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int total = 0;
+
+  for (int i = 0; ; ++i) {
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    const int r = sys_fgetxattr(fd, raw_name, 0, 0);
+    if (r < 0) {
+      if (i == 0)
+        return r;
+      break;
+    }
+    total += r;
+
+    // only a completely full block can be followed by another chunk
+    if (r != CHAIN_XATTR_MAX_BLOCK_LEN && r != CHAIN_XATTR_SHORT_BLOCK_LEN)
+      break;
+  }
+
+  return total;
+}
+
+/*
+ * Read the chained xattr name of descriptor fd into val (size bytes).
+ *
+ * With size == 0 the total stored length is returned instead.
+ * Otherwise returns the number of bytes read, a negative errno from
+ * the underlying fgetxattr, or -ERANGE when the buffer was filled by a
+ * full block and a further chunk exists.
+ */
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ // size query only
+ if (!size)
+ return chain_fgetxattr_len(fd, name);
+
+ do {
+ // offer all remaining space to the next chunk
+ chunk_size = size;
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ // a missing chunk after the first one simply ends the chain
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ // keep going only while space remains and the chunk was a full block
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ // r >= 0 means the loop ended on a successful read, not on a break
+ if (r >= 0) {
+ ret = pos;
+ /* is there another chunk? that can happen if the last read size span over
+ exactly one block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ // zero-size probe of the next chunk
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+
+// setxattr
+
+/*
+ * Chunk size to use when storing a value of the given total size.
+ *
+ * Values up to the short-length threshold may fit in the inode, so
+ * they are striped over short attrs so that XFS won't kick them out;
+ * anything larger uses the maximum block length.
+ */
+int get_xattr_block_size(size_t size)
+{
+  return size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD
+    ? CHAIN_XATTR_SHORT_BLOCK_LEN
+    : CHAIN_XATTR_MAX_BLOCK_LEN;
+}
+
+// removexattr
+
+/*
+ * Remove every chunk of the chained xattr name from file fn.
+ *
+ * Chunks are removed in order until a removal fails.  A failure on
+ * chunk 0 is returned; a failure on any later chunk ends the walk
+ * and yields 0.
+ */
+int chain_removexattr(const char *fn, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+
+  for (int i = 0; ; ++i) {
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    const int r = sys_removexattr(fn, raw_name);
+    if (r >= 0)
+      continue;
+    return i == 0 ? r : 0;
+  }
+}
+
+/*
+ * Remove every chunk of the chained xattr name from descriptor fd.
+ *
+ * Chunks are removed in order until a removal fails.  A failure on
+ * chunk 0 is returned; a failure on any later chunk ends the walk
+ * and yields 0.
+ */
+int chain_fremovexattr(int fd, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+
+  for (int i = 0; ; ++i) {
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    const int r = sys_fremovexattr(fd, raw_name);
+    if (r >= 0)
+      continue;
+    return i == 0 ? r : 0;
+  }
+}
+
+
+// listxattr
+
+/*
+ * List the logical xattr names of file fn, i.e. with escaping undone
+ * and continuation chunks ("name@<id>") left out.
+ *
+ * names receives the NUL-separated names; len is its size in bytes.
+ * With len == 0 nothing is written and twice the raw list size is
+ * returned as a buffer size estimate.
+ *
+ * Returns the number of bytes written, -ERANGE if names is too small,
+ * -ENOMEM, or the negative errno of the underlying listxattr.
+ */
+int chain_listxattr(const char *fn, char *names, size_t len) {
+  int r;
+
+  if (!len) {
+    // size query: scale only a real size, never a negative errno
+    r = sys_listxattr(fn, names, len);
+    return r < 0 ? r : r * 2;
+  }
+
+  r = sys_listxattr(fn, 0, 0);
+  if (r <= 0)
+    return r;  // error, or no attributes at all (also avoids malloc(0))
+
+  size_t total_len = r * 2; // should be enough
+  char *full_buf = static_cast<char *>(malloc(total_len));
+  if (!full_buf)
+    return -ENOMEM;
+
+  r = sys_listxattr(fn, full_buf, total_len);
+  if (r < 0) {
+    free(full_buf);
+    return r;
+  }
+
+  char *p = full_buf;
+  const char *end = full_buf + r;
+  char *dest = names;
+  char *dest_end = names + len;
+
+  while (p < end) {
+    char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+    int attr_len = strlen(p);
+    bool is_first;
+    int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+    if (is_first) {
+      // strcpy() writes name_len bytes plus the terminating NUL, so
+      // all name_len + 1 bytes have to fit
+      if (name_len + 1 > dest_end - dest) {
+        r = -ERANGE;
+        goto done;
+      }
+      strcpy(dest, name);
+      dest += name_len + 1;
+    }
+    p += attr_len + 1;
+  }
+  r = dest - names;
+
+done:
+  free(full_buf);
+  return r;
+}
+
+/*
+ * List the logical xattr names of descriptor fd, i.e. with escaping
+ * undone and continuation chunks ("name@<id>") left out.
+ *
+ * names receives the NUL-separated names; len is its size in bytes.
+ * With len == 0 nothing is written and twice the raw list size is
+ * returned as a buffer size estimate.
+ *
+ * Returns the number of bytes written, -ERANGE if names is too small,
+ * -ENOMEM, or the negative errno of the underlying flistxattr.
+ */
+int chain_flistxattr(int fd, char *names, size_t len) {
+  int r;
+  char *p;
+  const char * end;
+  char *dest;
+  char *dest_end;
+
+  if (!len) {
+    // size query: scale only a real size, never a negative errno
+    r = sys_flistxattr(fd, names, len);
+    return r < 0 ? r : r * 2;
+  }
+
+  r = sys_flistxattr(fd, 0, 0);
+  if (r <= 0)
+    return r;  // error, or no attributes at all (also avoids malloc(0))
+
+  size_t total_len = r * 2; // should be enough
+  char *full_buf = static_cast<char *>(malloc(total_len));
+  if (!full_buf)
+    return -ENOMEM;
+
+  r = sys_flistxattr(fd, full_buf, total_len);
+  if (r < 0)
+    goto done;
+
+  p = full_buf;
+  end = full_buf + r;
+  dest = names;
+  dest_end = names + len;
+
+  while (p < end) {
+    char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+    int attr_len = strlen(p);
+    bool is_first;
+    int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+    if (is_first) {
+      // strcpy() writes name_len bytes plus the terminating NUL, so
+      // all name_len + 1 bytes have to fit
+      if (name_len + 1 > dest_end - dest) {
+        r = -ERANGE;
+        goto done;
+      }
+      strcpy(dest, name);
+      dest += name_len + 1;
+    }
+    p += attr_len + 1;
+  }
+  r = dest - names;
+
+done:
+  free(full_buf);
+  return r;
+}
diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h
new file mode 100644
index 00000000..a2d17fa6
--- /dev/null
+++ b/src/os/filestore/chain_xattr.h
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OSD_CHAIN_XATTR_H
+#define __CEPH_OSD_CHAIN_XATTR_H
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdio.h>
+#include "common/xattr.h"
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+
+#if defined(__linux__)
+#include <linux/limits.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
+#elif defined(__APPLE__)
+#include <sys/xattr.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
+#else
+#define CHAIN_XATTR_MAX_NAME_LEN 128
+#endif
+
+#define CHAIN_XATTR_MAX_BLOCK_LEN 2048
+
+/*
+ * XFS will only inline xattrs < 255 bytes, so for xattrs that are
+ * likely to fit in the inode, stripe over short xattrs.
+ */
+#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
+#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
+
+// wrappers to hide annoying errno handling.
+
+/// fgetxattr on fd; returns the result, or -errno on failure.
+static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+  const int rc = ::ceph_os_fgetxattr(fd, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+/// getxattr on path fn; returns the result, or -errno on failure.
+static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+  const int rc = ::ceph_os_getxattr(fn, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+/// setxattr on path fn; returns the result, or -errno on failure.
+static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size)
+{
+  const int rc = ::ceph_os_setxattr(fn, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+/// fsetxattr on fd; returns the result, or -errno on failure.
+static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size)
+{
+  const int rc = ::ceph_os_fsetxattr(fd, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+/// listxattr on path fn; returns the result, or -errno on failure.
+static inline int sys_listxattr(const char *fn, char *names, size_t len)
+{
+  const int rc = ::ceph_os_listxattr(fn, names, len);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+/// flistxattr on fd; returns the result, or -errno on failure.
+static inline int sys_flistxattr(int fd, char *names, size_t len)
+{
+  const int rc = ::ceph_os_flistxattr(fd, names, len);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+/// removexattr on path fn; returns the result, or -errno on failure.
+static inline int sys_removexattr(const char *fn, const char *name)
+{
+  const int rc = ::ceph_os_removexattr(fn, name);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+/// fremovexattr on fd; returns the result, or -errno on failure.
+static inline int sys_fremovexattr(int fd, const char *name)
+{
+  const int rc = ::ceph_os_fremovexattr(fd, name);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+
+// wrappers to chain large values across multiple xattrs
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp);
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
+
+int get_xattr_block_size(size_t size);
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len);
+
+/*
+ * Store val (size bytes) as the chained xattr name on file fn.
+ *
+ * The value is split into chunks of get_xattr_block_size(size) bytes,
+ * or written as one single xattr when ensure_single_attr is set.
+ * Unless skip_chain_cleanup is set, chunks left over from an earlier,
+ * longer value are removed afterwards.
+ *
+ * Returns the number of bytes written or a negative errno.  An error
+ * other than -ENODATA during cleanup is reported and ends the cleanup.
+ */
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_setxattr(
+  const char *fn, const char *name, const void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  size_t max_chunk_size =
+    ensure_single_attr ? size : get_xattr_block_size(size);
+
+  static_assert(
+    !skip_chain_cleanup || ensure_single_attr,
+    "skip_chain_cleanup must imply ensure_single_attr");
+
+  do {
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    size -= chunk_size;
+
+    int r = sys_setxattr(fn, raw_name,
+                         static_cast<const char *>(val) + pos, chunk_size);
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+    pos += chunk_size;
+    ret = pos;
+    i++;
+    ceph_assert(size == 0 || !ensure_single_attr);
+  } while (size);
+
+  if (ret >= 0 && !skip_chain_cleanup) {
+    // Remove stale trailing chunks.  Stop at the first failure: -ENODATA
+    // is the normal end of the chain, anything else is reported.  Looping
+    // until -ENODATA would never end if the error persists.
+    int r;
+    do {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      r = sys_removexattr(fn, raw_name);
+      if (r < 0 && r != -ENODATA)
+        ret = r;
+      i++;
+    } while (r >= 0);
+  }
+
+  return ret;
+}
+
+/*
+ * Store val (size bytes) as the chained xattr name on descriptor fd.
+ *
+ * The value is split into chunks of get_xattr_block_size(size) bytes,
+ * or written as one single xattr when ensure_single_attr is set.
+ * Unless skip_chain_cleanup is set, chunks left over from an earlier,
+ * longer value are removed afterwards.
+ *
+ * Returns the number of bytes written or a negative errno.  An error
+ * other than -ENODATA during cleanup is reported and ends the cleanup.
+ */
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_fsetxattr(
+  int fd, const char *name, const void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  size_t max_chunk_size =
+    ensure_single_attr ? size : get_xattr_block_size(size);
+
+  static_assert(
+    !skip_chain_cleanup || ensure_single_attr,
+    "skip_chain_cleanup must imply ensure_single_attr");
+
+  do {
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    size -= chunk_size;
+
+    int r = sys_fsetxattr(fd, raw_name,
+                          static_cast<const char *>(val) + pos, chunk_size);
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+    pos += chunk_size;
+    ret = pos;
+    i++;
+    ceph_assert(size == 0 || !ensure_single_attr);
+  } while (size);
+
+  if (ret >= 0 && !skip_chain_cleanup) {
+    // Remove stale trailing chunks.  Stop at the first failure: -ENODATA
+    // is the normal end of the chain, anything else is reported.  Looping
+    // until -ENODATA would never end if the error persists.
+    int r;
+    do {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      r = sys_fremovexattr(fd, raw_name);
+      if (r < 0 && r != -ENODATA)
+        ret = r;
+      i++;
+    } while (r >= 0);
+  }
+
+  return ret;
+}
+
+int chain_listxattr(const char *fn, char *names, size_t len);
+int chain_flistxattr(int fd, char *names, size_t len);
+int chain_removexattr(const char *fn, const char *name);
+int chain_fremovexattr(int fd, const char *name);
+
+#endif