summaryrefslogtreecommitdiffstats
path: root/src/os
diff options
context:
space:
mode:
Diffstat (limited to 'src/os')
-rw-r--r--src/os/CMakeLists.txt153
-rw-r--r--src/os/FuseStore.cc1281
-rw-r--r--src/os/FuseStore.h54
-rw-r--r--src/os/ObjectMap.h172
-rw-r--r--src/os/ObjectStore.cc160
-rw-r--r--src/os/ObjectStore.h1925
-rw-r--r--src/os/Transaction.cc516
-rw-r--r--src/os/bluestore/Allocator.cc203
-rw-r--r--src/os/bluestore/Allocator.h72
-rwxr-xr-xsrc/os/bluestore/AvlAllocator.cc430
-rwxr-xr-xsrc/os/bluestore/AvlAllocator.h257
-rwxr-xr-xsrc/os/bluestore/BitmapAllocator.cc114
-rwxr-xr-xsrc/os/bluestore/BitmapAllocator.h52
-rw-r--r--src/os/bluestore/BitmapFreelistManager.cc600
-rw-r--r--src/os/bluestore/BitmapFreelistManager.h88
-rw-r--r--src/os/bluestore/BlockDevice.cc157
-rw-r--r--src/os/bluestore/BlockDevice.h245
-rw-r--r--src/os/bluestore/BlueFS.cc3665
-rw-r--r--src/os/bluestore/BlueFS.h682
-rw-r--r--src/os/bluestore/BlueRocksEnv.cc583
-rw-r--r--src/os/bluestore/BlueRocksEnv.h164
-rw-r--r--src/os/bluestore/BlueStore.cc15265
-rw-r--r--src/os/bluestore/BlueStore.h3602
-rw-r--r--src/os/bluestore/FreelistManager.cc25
-rw-r--r--src/os/bluestore/FreelistManager.h54
-rw-r--r--src/os/bluestore/HybridAllocator.cc222
-rw-r--r--src/os/bluestore/HybridAllocator.h48
-rw-r--r--src/os/bluestore/KernelDevice.cc1185
-rw-r--r--src/os/bluestore/KernelDevice.h150
-rw-r--r--src/os/bluestore/NVMEDevice.cc952
-rw-r--r--src/os/bluestore/NVMEDevice.h83
-rw-r--r--src/os/bluestore/PMEMDevice.cc270
-rw-r--r--src/os/bluestore/PMEMDevice.h73
-rw-r--r--src/os/bluestore/StupidAllocator.cc364
-rw-r--r--src/os/bluestore/StupidAllocator.h65
-rw-r--r--src/os/bluestore/aio.cc124
-rw-r--r--src/os/bluestore/bluefs_types.cc213
-rw-r--r--src/os/bluestore/bluefs_types.h265
-rw-r--r--src/os/bluestore/bluestore_tool.cc864
-rw-r--r--src/os/bluestore/bluestore_types.cc1138
-rw-r--r--src/os/bluestore/bluestore_types.h1044
-rw-r--r--src/os/bluestore/ceph_aio.h144
-rwxr-xr-xsrc/os/bluestore/fastbmap_allocator_impl.cc717
-rwxr-xr-xsrc/os/bluestore/fastbmap_allocator_impl.h833
-rw-r--r--src/os/filestore/BtrfsFileStoreBackend.cc575
-rw-r--r--src/os/filestore/BtrfsFileStoreBackend.h49
-rw-r--r--src/os/filestore/CollectionIndex.h207
-rw-r--r--src/os/filestore/DBObjectMap.cc1415
-rw-r--r--src/os/filestore/DBObjectMap.h585
-rw-r--r--src/os/filestore/FDCache.h112
-rw-r--r--src/os/filestore/FileJournal.cc2216
-rw-r--r--src/os/filestore/FileJournal.h556
-rw-r--r--src/os/filestore/FileStore.cc6425
-rw-r--r--src/os/filestore/FileStore.h938
-rw-r--r--src/os/filestore/GenericFileStoreBackend.cc468
-rw-r--r--src/os/filestore/GenericFileStoreBackend.h75
-rw-r--r--src/os/filestore/HashIndex.cc1195
-rw-r--r--src/os/filestore/HashIndex.h462
-rw-r--r--src/os/filestore/IndexManager.cc151
-rw-r--r--src/os/filestore/IndexManager.h99
-rw-r--r--src/os/filestore/Journal.h94
-rw-r--r--src/os/filestore/JournalThrottle.cc67
-rw-r--r--src/os/filestore/JournalThrottle.h101
-rw-r--r--src/os/filestore/JournalingObjectStore.cc271
-rw-r--r--src/os/filestore/JournalingObjectStore.h147
-rw-r--r--src/os/filestore/LFNIndex.cc1407
-rw-r--r--src/os/filestore/LFNIndex.h614
-rw-r--r--src/os/filestore/SequencerPosition.h59
-rw-r--r--src/os/filestore/WBThrottle.cc272
-rw-r--r--src/os/filestore/WBThrottle.h187
-rw-r--r--src/os/filestore/XfsFileStoreBackend.cc149
-rw-r--r--src/os/filestore/XfsFileStoreBackend.h36
-rw-r--r--src/os/filestore/ZFSFileStoreBackend.cc258
-rw-r--r--src/os/filestore/ZFSFileStoreBackend.h33
-rw-r--r--src/os/filestore/chain_xattr.cc413
-rw-r--r--src/os/filestore/chain_xattr.h182
-rw-r--r--src/os/fs/FS.cc186
-rw-r--r--src/os/fs/FS.h51
-rw-r--r--src/os/fs/XFS.cc55
-rw-r--r--src/os/fs/XFS.h31
-rw-r--r--src/os/fs/ZFS.cc83
-rw-r--r--src/os/fs/ZFS.h39
-rw-r--r--src/os/fs/btrfs_ioctl.h201
-rw-r--r--src/os/kstore/KStore.cc3436
-rw-r--r--src/os/kstore/KStore.h692
-rw-r--r--src/os/kstore/kstore_types.cc102
-rw-r--r--src/os/kstore/kstore_types.h68
-rw-r--r--src/os/kv.h76
-rw-r--r--src/os/memstore/MemStore.cc1801
-rw-r--r--src/os/memstore/MemStore.h414
-rw-r--r--src/os/memstore/PageSet.h232
91 files changed, 66558 insertions, 0 deletions
diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt
new file mode 100644
index 00000000..8fc8d353
--- /dev/null
+++ b/src/os/CMakeLists.txt
@@ -0,0 +1,153 @@
+set(libos_srcs
+ ObjectStore.cc
+ Transaction.cc
+ filestore/chain_xattr.cc
+ filestore/BtrfsFileStoreBackend.cc
+ filestore/DBObjectMap.cc
+ filestore/FileJournal.cc
+ filestore/FileStore.cc
+ filestore/JournalThrottle.cc
+ filestore/GenericFileStoreBackend.cc
+ filestore/JournalingObjectStore.cc
+ filestore/HashIndex.cc
+ filestore/IndexManager.cc
+ filestore/LFNIndex.cc
+ filestore/WBThrottle.cc
+ memstore/MemStore.cc
+ kstore/KStore.cc
+ kstore/kstore_types.cc
+ fs/FS.cc)
+
+if(WITH_BLUESTORE)
+ list(APPEND libos_srcs
+ bluestore/Allocator.cc
+ bluestore/BitmapFreelistManager.cc
+ bluestore/BlockDevice.cc
+ bluestore/BlueFS.cc
+ bluestore/bluefs_types.cc
+ bluestore/BlueRocksEnv.cc
+ bluestore/BlueStore.cc
+ bluestore/bluestore_types.cc
+ bluestore/fastbmap_allocator_impl.cc
+ bluestore/FreelistManager.cc
+ bluestore/StupidAllocator.cc
+ bluestore/BitmapAllocator.cc
+ bluestore/AvlAllocator.cc
+ bluestore/HybridAllocator.cc
+ )
+endif(WITH_BLUESTORE)
+
+if(HAVE_LIBAIO OR HAVE_POSIXAIO)
+ list(APPEND libos_srcs
+ bluestore/KernelDevice.cc
+ bluestore/aio.cc)
+endif()
+
+if(WITH_FUSE)
+ list(APPEND libos_srcs
+ FuseStore.cc)
+endif(WITH_FUSE)
+
+if(WITH_PMEM)
+ list(APPEND libos_srcs
+ bluestore/PMEMDevice.cc)
+endif(WITH_PMEM)
+
+if(HAVE_LIBXFS)
+ list(APPEND libos_srcs
+ filestore/XfsFileStoreBackend.cc
+ fs/XFS.cc)
+endif()
+
+if(HAVE_LIBZFS)
+ add_library(os_zfs_objs OBJECT
+ filestore/ZFSFileStoreBackend.cc
+ fs/ZFS.cc)
+ target_include_directories(os_zfs_objs SYSTEM PRIVATE
+ ${ZFS_INCLUDE_DIRS})
+ list(APPEND libos_srcs $<TARGET_OBJECTS:os_zfs_objs>)
+endif()
+
+if(WITH_SPDK)
+ list(APPEND libos_srcs
+ bluestore/NVMEDevice.cc)
+endif()
+
+add_library(os STATIC ${libos_srcs})
+
+target_link_libraries(os heap_profiler kv)
+
+if(WITH_BLUEFS)
+ add_library(bluefs SHARED
+ bluestore/BlueRocksEnv.cc)
+ target_include_directories(bluefs SYSTEM PUBLIC
+ $<TARGET_PROPERTY:RocksDB::RocksDB,INTERFACE_INCLUDE_DIRECTORIES>)
+ target_link_libraries(bluefs global)
+ install(TARGETS bluefs DESTINATION lib)
+endif(WITH_BLUEFS)
+
+if(HAVE_LIBAIO)
+ target_link_libraries(os ${AIO_LIBRARIES})
+endif(HAVE_LIBAIO)
+
+if(WITH_FUSE)
+ target_link_libraries(os FUSE::FUSE)
+endif()
+
+if(HAVE_LIBZFS)
+ target_link_libraries(os ${ZFS_LIBRARIES})
+endif()
+
+if(WITH_SPDK)
+ target_link_libraries(os
+ ${SPDK_LIBRARIES})
+endif()
+
+if(WITH_LTTNG)
+ add_dependencies(os objectstore-tp)
+endif()
+
+target_link_libraries(os kv)
+
+add_dependencies(os compressor_plugins)
+add_dependencies(os crypto_plugins)
+
+
+if(WITH_BLUESTORE)
+ add_executable(ceph-bluestore-tool
+ bluestore/bluestore_tool.cc)
+ target_link_libraries(ceph-bluestore-tool
+ os global)
+ install(TARGETS ceph-bluestore-tool
+ DESTINATION bin)
+endif()
+
+if(WITH_PMEM)
+ include(ExternalProject)
+ ExternalProject_Add(nvml_ext
+ DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/src/
+ GIT_REPOSITORY "https://github.com/ceph/nvml.git"
+ GIT_TAG "dd622819dd4ee97d3920f913c70be"
+ SOURCE_DIR ${CMAKE_BINARY_DIR}/src/nvml
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND $(MAKE)
+ BUILD_IN_SOURCE 1
+ INSTALL_COMMAND "true")
+
+ ExternalProject_Add_Step(nvml_ext forcebuild
+ DEPENDEES configure
+ DEPENDERS build
+ COMMAND "true"
+ ALWAYS 1)
+ add_library(pmem STATIC IMPORTED GLOBAL)
+ add_dependencies(pmem nvml_ext)
+ set_target_properties(pmem PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_BINARY_DIR}/src/nvml/src/nondebug/libpmem.a"
+ INTERFACE_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+ target_link_libraries(os pmem)
+ target_include_directories(os SYSTEM PRIVATE "${CMAKE_BINARY_DIR}/src/nvml/src/include")
+endif(WITH_PMEM)
+
+if(WITH_LTTNG AND WITH_EVENTTRACE)
+ add_dependencies(os eventtrace_tp)
+endif()
diff --git a/src/os/FuseStore.cc b/src/os/FuseStore.cc
new file mode 100644
index 00000000..7ae21f3a
--- /dev/null
+++ b/src/os/FuseStore.cc
@@ -0,0 +1,1281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "FuseStore.h"
+#include "os/ObjectStore.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+
+#define FUSE_USE_VERSION 30
+#include <fuse.h>
+#include <fuse_lowlevel.h>
+#include "include/ceph_fuse.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h> /* Definition of AT_* constants */
+#include <sys/stat.h>
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+#define dout_context store->cct
+#define dout_subsys ceph_subsys_fuse
+#include "common/debug.h"
+#undef dout_prefix
+#define dout_prefix *_dout << "fuse "
+
+// some fuse-y bits of state
+struct fs_info {
+ struct fuse_args args;
+ struct fuse *f;
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+ struct fuse_chan *ch;
+#endif
+ char *mountpoint;
+};
+
+int FuseStore::open_file(string p, struct fuse_file_info *fi,
+ std::function<int(bufferlist *bl)> f)
+{
+ if (open_files.count(p)) {
+ OpenFile *o = open_files[p];
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ ++o->ref;
+ return 0;
+ }
+ bufferlist bl;
+ int r = f(&bl);
+ if (r < 0) {
+ return r;
+ }
+ OpenFile *o = new OpenFile;
+ o->path = p;
+ o->bl.claim(bl);
+ open_files[p] = o;
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ ++o->ref;
+ return 0;
+}
+
+FuseStore::FuseStore(ObjectStore *s, string p)
+ : store(s),
+ mount_point(p),
+ fuse_thread(this)
+{
+ info = new fs_info();
+}
+
+FuseStore::~FuseStore()
+{
+ delete info;
+}
+
+/*
+ * / - root directory
+ * $cid/
+ * $cid/type - objectstore type
+ * $cid/bitwise_hash_start = lowest hash value
+ * $cid/bitwise_hash_end = highest hash value
+ * $cid/bitwise_hash_bits - how many bits are significant
+ * $cid/pgmeta/ - pgmeta object
+ * $cid/all/ - all objects
+ * $cid/all/$obj/
+ * $cid/all/$obj/bitwise_hash
+ * $cid/all/$obj/data
+ * $cid/all/$obj/omap/$key
+ * $cid/all/$obj/attr/$name
+ * $cid/by_bitwise_hash/$hash/$bits/$obj - all objects with this (bitwise) hash (prefix)
+ */
+enum {
+ FN_ROOT = 1,
+ FN_TYPE,
+ FN_COLLECTION,
+ FN_HASH_START,
+ FN_HASH_END,
+ FN_HASH_BITS,
+ FN_OBJECT,
+ FN_OBJECT_HASH,
+ FN_OBJECT_DATA,
+ FN_OBJECT_OMAP_HEADER,
+ FN_OBJECT_OMAP,
+ FN_OBJECT_OMAP_VAL,
+ FN_OBJECT_ATTR,
+ FN_OBJECT_ATTR_VAL,
+ FN_ALL,
+ FN_HASH_DIR,
+ FN_HASH_VAL,
+};
+
+static int parse_fn(CephContext* cct, const char *path, coll_t *cid,
+ ghobject_t *oid, string *key,
+ uint32_t *hash, uint32_t *hash_bits)
+{
+ list<string> v;
+ for (const char *p = path; *p; ++p) {
+ if (*p == '/')
+ continue;
+ const char *e;
+ for (e = p + 1; *e && *e != '/'; e++) ;
+ string c(p, e-p);
+ v.push_back(c);
+ p = e;
+ if (!*p)
+ break;
+ }
+ ldout(cct, 10) << __func__ << " path " << path << " -> " << v << dendl;
+
+ if (v.empty())
+ return FN_ROOT;
+
+ if (v.front() == "type")
+ return FN_TYPE;
+
+ if (!cid->parse(v.front())) {
+ return -ENOENT;
+ }
+ if (v.size() == 1)
+ return FN_COLLECTION;
+ v.pop_front();
+
+ if (v.front() == "bitwise_hash_start")
+ return FN_HASH_START;
+ if (v.front() == "bitwise_hash_end")
+ return FN_HASH_END;
+ if (v.front() == "bitwise_hash_bits")
+ return FN_HASH_BITS;
+ if (v.front() == "pgmeta") {
+ spg_t pgid;
+ if (cid->is_pg(&pgid)) {
+ *oid = pgid.make_pgmeta_oid();
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT;
+ goto do_object;
+ }
+ return -ENOENT;
+ }
+ if (v.front() == "all") {
+ v.pop_front();
+ if (v.empty())
+ return FN_ALL;
+ goto do_dir;
+ }
+ if (v.front() == "by_bitwise_hash") {
+ v.pop_front();
+ if (v.empty())
+ return FN_HASH_DIR;
+ unsigned long hv, hm;
+ int r = sscanf(v.front().c_str(), "%lx", &hv);
+ if (r != 1)
+ return -ENOENT;
+ int shift = 32 - v.front().length() * 4;
+ v.pop_front();
+ if (v.empty())
+ return FN_HASH_DIR;
+ r = sscanf(v.front().c_str(), "%ld", &hm);
+ if (r != 1)
+ return -ENOENT;
+ if (hm < 1 || hm > 32)
+ return -ENOENT;
+ v.pop_front();
+ *hash = hv << shift;//hobject_t::_reverse_bits(hv << shift);
+ *hash_bits = hm;
+ if (v.empty())
+ return FN_HASH_VAL;
+ goto do_dir;
+ }
+ return -ENOENT;
+
+ do_dir:
+ {
+ string o = v.front();
+ if (!oid->parse(o)) {
+ return -ENOENT;
+ }
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT;
+ }
+
+ do_object:
+ if (v.front() == "data")
+ return FN_OBJECT_DATA;
+ if (v.front() == "omap_header")
+ return FN_OBJECT_OMAP_HEADER;
+ if (v.front() == "omap") {
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_OMAP;
+ *key = v.front();
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_OMAP_VAL;
+ return -ENOENT;
+ }
+ if (v.front() == "attr") {
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_ATTR;
+ *key = v.front();
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_ATTR_VAL;
+ return -ENOENT;
+ }
+ if (v.front() == "bitwise_hash")
+ return FN_OBJECT_HASH;
+ return -ENOENT;
+}
+
+
+static int os_getattr(const char *path, struct stat *stbuf
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , struct fuse_file_info *fi
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (t < 0)
+ return t;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ stbuf->st_size = 0;
+ stbuf->st_uid = 0;
+ stbuf->st_gid = 0;
+ stbuf->st_mode = S_IFREG | 0700;
+
+ auto ch = fs->store->open_collection(cid);
+
+ switch (t) {
+ case FN_OBJECT_OMAP:
+ case FN_OBJECT_ATTR:
+ case FN_OBJECT:
+ case FN_OBJECT_DATA:
+ case FN_OBJECT_OMAP_HEADER:
+ case FN_OBJECT_OMAP_VAL:
+ {
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!ch) {
+ return -ENOENT;
+ }
+ int bits = fs->store->collection_bits(ch);
+ if (bits >= 0 && !oid.match(bits, pgid.ps())) {
+ // sorry, not part of this PG
+ return -ENOENT;
+ }
+ }
+ }
+ break;
+ }
+
+ switch (t) {
+ case FN_OBJECT_OMAP:
+ case FN_OBJECT_ATTR:
+ case FN_OBJECT:
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ // fall-thru
+ case FN_ALL:
+ case FN_HASH_DIR:
+ case FN_HASH_VAL:
+ case FN_COLLECTION:
+ if (!fs->store->collection_exists(cid))
+ return -ENOENT;
+ // fall-thru
+ case FN_ROOT:
+ stbuf->st_mode = S_IFDIR | 0700;
+ return 0;
+
+ case FN_TYPE:
+ stbuf->st_size = fs->store->get_type().length() + 1;
+ break;
+
+ case FN_OBJECT_HASH:
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ stbuf->st_size = 9;
+ return 0;
+
+ case FN_HASH_END:
+ if (!ch)
+ return -ENOENT;
+ if (fs->store->collection_bits(ch) < 0)
+ return -ENOENT;
+ // fall-thru
+ case FN_HASH_START:
+ stbuf->st_size = 9;
+ return 0;
+
+ case FN_HASH_BITS:
+ {
+ if (!ch)
+ return -ENOENT;
+ int bits = fs->store->collection_bits(ch);
+ if (bits < 0)
+ return -ENOENT;
+ char buf[12];
+ snprintf(buf, sizeof(buf), "%d\n", bits);
+ stbuf->st_size = strlen(buf);
+ }
+ return 0;
+
+ case FN_OBJECT_DATA:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ int r = fs->store->stat(ch, oid, stbuf);
+ if (r < 0)
+ return r;
+ }
+ break;
+
+ case FN_OBJECT_OMAP_HEADER:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ bufferlist bl;
+ fs->store->omap_get_header(ch, oid, &bl);
+ stbuf->st_size = bl.length();
+ }
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ set<string> k;
+ k.insert(key);
+ map<string,bufferlist> v;
+ fs->store->omap_get_values(ch, oid, k, &v);
+ if (!v.count(key)) {
+ return -ENOENT;
+ }
+ stbuf->st_size = v[key].length();
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ bufferptr v;
+ int r = fs->store->getattr(ch, oid, key.c_str(), v);
+ if (r == -ENODATA)
+ r = -ENOENT;
+ if (r < 0)
+ return r;
+ stbuf->st_size = v.length();
+ }
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int os_readdir(const char *path,
+ void *buf,
+ fuse_fill_dir_t filler,
+ off_t offset,
+ struct fuse_file_info *fi
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , enum fuse_readdir_flags
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+ << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (t < 0)
+ return t;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ auto ch = fs->store->open_collection(cid);
+
+ // we can't shift 32 bits or else off_t will go negative
+ const int hash_shift = 31;
+
+ switch (t) {
+ case FN_ROOT:
+ {
+ filler_compat(filler, buf, "type", NULL, 0);
+ vector<coll_t> cls;
+ fs->store->list_collections(cls);
+ for (auto c : cls) {
+ int r = filler_compat(filler, buf, stringify(c).c_str(), NULL, 0);
+ if (r > 0)
+ break;
+ }
+ }
+ break;
+
+ case FN_COLLECTION:
+ {
+ if (!ch) {
+ return -ENOENT;
+ }
+ filler_compat(filler, buf, "bitwise_hash_start", NULL, 0);
+ if (fs->store->collection_bits(ch) >= 0) {
+ filler_compat(filler, buf, "bitwise_hash_end", NULL, 0);
+ filler_compat(filler, buf, "bitwise_hash_bits", NULL, 0);
+ }
+ filler_compat(filler, buf, "all", NULL, 0);
+ filler_compat(filler, buf, "by_bitwise_hash", NULL, 0);
+ spg_t pgid;
+ if (cid.is_pg(&pgid) &&
+ fs->store->exists(ch, pgid.make_pgmeta_oid())) {
+ filler_compat(filler, buf, "pgmeta", NULL, 0);
+ }
+ }
+ break;
+
+ case FN_OBJECT:
+ {
+ filler_compat(filler, buf, "bitwise_hash", NULL, 0);
+ filler_compat(filler, buf, "data", NULL, 0);
+ filler_compat(filler, buf, "omap", NULL, 0);
+ filler_compat(filler, buf, "attr", NULL, 0);
+ filler_compat(filler, buf, "omap_header", NULL, 0);
+ }
+ break;
+
+ case FN_HASH_VAL:
+ case FN_ALL:
+ {
+ uint32_t bitwise_hash = (offset >> hash_shift) & 0xffffffff;
+ uint32_t hashoff = offset - (bitwise_hash << hash_shift);
+ int skip = hashoff;
+ ghobject_t next = cid.get_min_hobj();
+ if (offset) {
+ // obey the offset
+ next.hobj.set_hash(hobject_t::_reverse_bits(bitwise_hash));
+ } else if (t == FN_HASH_VAL) {
+ next.hobj.set_hash(hobject_t::_reverse_bits(hash_value));
+ }
+ ghobject_t last;
+ if (t == FN_HASH_VAL) {
+ last = next;
+ uint64_t rev_end = (hash_value | (0xffffffff >> hash_bits)) + 1;
+ if (rev_end >= 0x100000000)
+ last = ghobject_t::get_max();
+ else
+ last.hobj.set_hash(hobject_t::_reverse_bits(rev_end));
+ } else {
+ last = ghobject_t::get_max();
+ }
+ ldout(fs->store->cct, 10) << __func__ << std::hex
+ << " offset " << offset << " hash "
+ << hobject_t::_reverse_bits(hash_value)
+ << std::dec
+ << "/" << hash_bits
+ << " first " << next << " last " << last
+ << dendl;
+ while (true) {
+ vector<ghobject_t> ls;
+ int r = fs->store->collection_list(
+ ch, next, last, 1000, &ls, &next);
+ if (r < 0)
+ return r;
+ for (auto p : ls) {
+ if (skip) {
+ --skip;
+ continue;
+ }
+ uint32_t cur_bitwise_hash = p.hobj.get_bitwise_key_u32();
+ if (cur_bitwise_hash != bitwise_hash) {
+ bitwise_hash = cur_bitwise_hash;
+ hashoff = 0;
+ }
+ ++hashoff;
+ uint64_t cur_off = ((uint64_t)bitwise_hash << hash_shift) |
+ (uint64_t)hashoff;
+ string s = stringify(p);
+ r = filler_compat(filler, buf, s.c_str(), NULL, cur_off);
+ if (r)
+ break;
+ }
+ if (r)
+ break;
+ if (next == ghobject_t::get_max() || next == last)
+ break;
+ }
+ }
+ break;
+
+ case FN_OBJECT_OMAP:
+ {
+ set<string> keys;
+ fs->store->omap_get_keys(ch, oid, &keys);
+ unsigned skip = offset;
+ for (auto k : keys) {
+ if (skip) {
+ --skip;
+ continue;
+ }
+ ++offset;
+ int r = filler_compat(filler, buf, k.c_str(), NULL, offset);
+ if (r)
+ break;
+ }
+ }
+ break;
+
+ case FN_OBJECT_ATTR:
+ {
+ map<string,bufferptr> aset;
+ fs->store->getattrs(ch, oid, aset);
+ unsigned skip = offset;
+ for (auto a : aset) {
+ if (skip) {
+ --skip;
+ continue;
+ }
+ ++offset;
+ int r = filler_compat(filler, buf, a.first.c_str(), NULL, offset);
+ if (r)
+ break;
+ }
+ }
+ break;
+ }
+ return 0;
+}
+
+static int os_open(const char *path, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (t < 0)
+ return t;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ auto ch = fs->store->open_collection(cid);
+
+ bufferlist *pbl = 0;
+ switch (t) {
+ case FN_TYPE:
+ pbl = new bufferlist;
+ pbl->append(fs->store->get_type());
+ pbl->append("\n");
+ break;
+
+ case FN_HASH_START:
+ {
+ pbl = new bufferlist;
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ unsigned long h;
+ h = hobject_t::_reverse_bits(pgid.ps());
+ char buf[10];
+ snprintf(buf, sizeof(buf), "%08lx\n", h);
+ pbl->append(buf);
+ } else {
+ pbl->append("00000000\n");
+ }
+ }
+ break;
+
+ case FN_HASH_END:
+ {
+ if (!ch) {
+ return -ENOENT;
+ }
+ spg_t pgid;
+ unsigned long h;
+ if (cid.is_pg(&pgid)) {
+ int hash_bits = fs->store->collection_bits(ch);
+ if (hash_bits >= 0) {
+ uint64_t rev_start = hobject_t::_reverse_bits(pgid.ps());
+ uint64_t rev_end = (rev_start | (0xffffffff >> hash_bits));
+ h = rev_end;
+ } else {
+ return -ENOENT;
+ }
+ } else {
+ h = 0xffffffff;
+ }
+ char buf[10];
+ snprintf(buf, sizeof(buf), "%08lx\n", h);
+ pbl = new bufferlist;
+ pbl->append(buf);
+ }
+ break;
+
+ case FN_HASH_BITS:
+ {
+ if (!ch) {
+ return -ENOENT;
+ }
+ int r = fs->store->collection_bits(ch);
+ if (r < 0)
+ return r;
+ char buf[12];
+ snprintf(buf, sizeof(buf), "%d\n", r);
+ pbl = new bufferlist;
+ pbl->append(buf);
+ }
+ break;
+
+ case FN_OBJECT_HASH:
+ {
+ pbl = new bufferlist;
+ char buf[10];
+ snprintf(buf, sizeof(buf), "%08x\n",
+ (unsigned)oid.hobj.get_bitwise_key_u32());
+ pbl->append(buf);
+ }
+ break;
+
+ case FN_OBJECT_DATA:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ return fs->store->read(ch, oid, 0, 0, *pbl);
+ });
+ if (r < 0) {
+ return r;
+ }
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ bufferptr bp;
+ int r = fs->store->getattr(ch, oid, key.c_str(), bp);
+ if (r < 0)
+ return r;
+ pbl->append(bp);
+ return 0;
+ });
+ if (r < 0)
+ return r;
+ }
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ set<string> k;
+ k.insert(key);
+ map<string,bufferlist> v;
+ int r = fs->store->omap_get_values(ch, oid, k, &v);
+ if (r < 0)
+ return r;
+ *pbl = v[key];
+ return 0;
+ });
+ if (r < 0)
+ return r;
+ }
+ break;
+
+ case FN_OBJECT_OMAP_HEADER:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ return fs->store->omap_get_header(ch, oid, pbl);
+ });
+ if (r < 0)
+ return r;
+ }
+ break;
+ }
+
+ if (pbl) {
+ FuseStore::OpenFile *o = new FuseStore::OpenFile;
+ o->bl.claim(*pbl);
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ }
+ return 0;
+}
+
+static int os_mkdir(const char *path, mode_t mode)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ ObjectStore::CollectionHandle ch;
+
+ ObjectStore::Transaction t;
+ switch (f) {
+ case FN_OBJECT:
+ {
+ ch = fs->store->open_collection(cid);
+ if (!ch) {
+ return -ENOENT;
+ }
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ int bits = fs->store->collection_bits(ch);
+ if (bits >= 0 && !oid.match(bits, pgid.ps())) {
+ // sorry, not part of this PG
+ return -EINVAL;
+ }
+ }
+ t.touch(cid, oid);
+ ch = fs->store->open_collection(cid);
+ }
+ break;
+
+ case FN_COLLECTION:
+ if (cid.is_pg()) {
+ // use the mode for the bit count. e.g., mkdir --mode=0003
+ // mnt/0.7_head will create 0.7 with bits = 3.
+ mode &= 0777;
+ if (mode >= 32)
+ return -EINVAL;
+ } else {
+ mode = 0;
+ }
+ t.create_collection(cid, mode);
+ ch = fs->store->create_new_collection(cid);
+ break;
+
+ default:
+ return -EPERM;
+ }
+
+ if (!t.empty()) {
+ fs->store->queue_transaction(ch, std::move(t));
+ }
+
+ return 0;
+}
+
+static int os_chmod(const char *path, mode_t mode
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , struct fuse_file_info *fi
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ return 0;
+}
+
+static int os_create(const char *path, mode_t mode, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+
+ ObjectStore::Transaction t;
+ bufferlist *pbl = 0;
+ switch (f) {
+ case FN_OBJECT_DATA:
+ {
+ pbl = new bufferlist;
+ fs->store->read(ch, oid, 0, 0, *pbl);
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ {
+ pbl = new bufferlist;
+ bufferptr bp;
+ int r = fs->store->getattr(ch, oid, key.c_str(), bp);
+ if (r == -ENODATA) {
+ bufferlist empty;
+ t.setattr(cid, oid, key.c_str(), empty);
+ }
+ pbl->append(bp);
+ }
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ pbl = new bufferlist;
+ set<string> k;
+ k.insert(key);
+ map<string,bufferlist> v;
+ fs->store->omap_get_values(ch, oid, k, &v);
+ if (v.count(key) == 0) {
+ map<string,bufferlist> aset;
+ aset[key] = bufferlist();
+ t.omap_setkeys(cid, oid, aset);
+ } else {
+ *pbl = v[key];
+ }
+ }
+ break;
+ }
+
+ if (!t.empty()) {
+ fs->store->queue_transaction(ch, std::move(t));
+ }
+
+ if (pbl) {
+ FuseStore::OpenFile *o = new FuseStore::OpenFile;
+ o->bl.claim(*pbl);
+ o->dirty = true;
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ }
+ return 0;
+}
+
+static int os_release(const char *path, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (--o->ref == 0) {
+ ldout(fs->store->cct, 10) << __func__ << " closing last " << o->path << dendl;
+ fs->open_files.erase(o->path);
+ delete o;
+ }
+ return 0;
+}
+
+static int os_read(const char *path, char *buf, size_t size, off_t offset,
+ struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+ << " size " << size << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (!o)
+ return 0;
+ if (offset >= o->bl.length())
+ return 0;
+ if (offset + size > o->bl.length())
+ size = o->bl.length() - offset;
+ bufferlist r;
+ r.substr_of(o->bl, offset, size);
+ memcpy(buf, r.c_str(), r.length());
+ return r.length();
+}
+
+static int os_write(const char *path, const char *buf, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+ << " size " << size << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (!o)
+ return 0;
+
+ bufferlist final;
+ if (offset) {
+ if (offset > o->bl.length()) {
+ final.substr_of(o->bl, 0, offset);
+ } else {
+ final.claim_append(o->bl);
+ size_t zlen = offset - final.length();
+ final.append_zero(zlen);
+ }
+ }
+ final.append(buf, size);
+ if (offset + size < o->bl.length()) {
+ bufferlist rest;
+ rest.substr_of(o->bl, offset + size, o->bl.length() - offset - size);
+ final.claim_append(rest);
+ }
+ o->bl = final;
+ o->dirty = true;
+ return size;
+}
+
+int os_flush(const char *path, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (!o)
+ return 0;
+ if (!o->dirty)
+ return 0;
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+
+ ObjectStore::Transaction t;
+
+ switch (f) {
+ case FN_OBJECT_DATA:
+ t.write(cid, oid, 0, o->bl.length(), o->bl);
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ t.setattr(cid, oid, key.c_str(), o->bl);
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ map<string,bufferlist> aset;
+ aset[key] = o->bl;
+ t.omap_setkeys(cid, oid, aset);
+ break;
+ }
+
+ case FN_OBJECT_OMAP_HEADER:
+ t.omap_setheader(cid, oid, o->bl);
+ break;
+
+ default:
+ return 0;
+ }
+
+ fs->store->queue_transaction(ch, std::move(t));
+
+ return 0;
+}
+
+static int os_unlink(const char *path)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+ ObjectStore::Transaction t;
+
+ switch (f) {
+ case FN_OBJECT_OMAP_VAL:
+ {
+ set<string> keys;
+ keys.insert(key);
+ t.omap_rmkeys(cid, oid, keys);
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ t.rmattr(cid, oid, key.c_str());
+ break;
+
+ case FN_OBJECT_OMAP_HEADER:
+ {
+ bufferlist empty;
+ t.omap_setheader(cid, oid, empty);
+ }
+ break;
+
+ case FN_OBJECT:
+ t.remove(cid, oid);
+ break;
+
+ case FN_COLLECTION:
+ {
+ bool empty;
+ int r = fs->store->collection_empty(ch, &empty);
+ if (r < 0)
+ return r;
+ if (!empty)
+ return -ENOTEMPTY;
+ t.remove_collection(cid);
+ }
+ break;
+
+ case FN_OBJECT_DATA:
+ t.truncate(cid, oid, 0);
+ break;
+
+ default:
+ return -EPERM;
+ }
+
+ fs->store->queue_transaction(ch, std::move(t));
+
+ return 0;
+}
+
+static int os_truncate(const char *path, off_t size
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , struct fuse_file_info *fi
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " size " << size << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ if (f == FN_OBJECT_OMAP_VAL ||
+ f == FN_OBJECT_ATTR_VAL ||
+ f == FN_OBJECT_OMAP_HEADER) {
+ if (size)
+ return -EPERM;
+ return 0;
+ }
+ if (f != FN_OBJECT_DATA)
+ return -EPERM;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ if (fs->open_files.count(path)) {
+ FuseStore::OpenFile *o = fs->open_files[path];
+ if (o->bl.length() > size) {
+ bufferlist t;
+ t.substr_of(o->bl, 0, size);
+ o->bl.swap(t);
+ }
+ }
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+ ObjectStore::Transaction t;
+ t.truncate(cid, oid, size);
+ fs->store->queue_transaction(ch, std::move(t));
+ return 0;
+}
+
+static int os_statfs(const char *path, struct statvfs *stbuf)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ struct store_statfs_t s;
+ int r = fs->store->statfs(&s);
+ if (r < 0)
+ return r;
+ stbuf->f_bsize = 4096; // LIES!
+ stbuf->f_blocks = s.total / 4096;
+ stbuf->f_bavail = s.available / 4096;
+ stbuf->f_bfree = stbuf->f_bavail;
+
+ ldout(fs->store->cct, 10) << __func__ << " " << path << ": "
+ << stbuf->f_bavail << "/" << stbuf->f_blocks << dendl;
+ return 0;
+}
+
+static struct fuse_operations fs_oper = {
+ getattr: os_getattr,
+ readlink: 0,
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+ getdir: 0,
+#endif
+ mknod: 0,
+ mkdir: os_mkdir,
+ unlink: os_unlink,
+ rmdir: os_unlink,
+ symlink: 0,
+ rename: 0,
+ link: 0,
+ chmod: os_chmod,
+ chown: 0,
+ truncate: os_truncate,
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+ utime: 0,
+#endif
+ open: os_open,
+ read: os_read,
+ write: os_write,
+ statfs: os_statfs,
+ flush: os_flush,
+ release: os_release,
+ fsync: 0,
+ setxattr: 0,
+ getxattr: 0,
+ listxattr: 0,
+ removexattr: 0,
+ opendir: 0,
+ readdir: os_readdir,
+ releasedir: 0,
+ fsyncdir: 0,
+ init: 0,
+ destroy: 0,
+ access: 0,
+ create: os_create,
+};
+
+int FuseStore::main()
+{
+ const char *v[] = {
+ "foo",
+ mount_point.c_str(),
+ "-f",
+ "-d", // debug
+ };
+ int c = 3;
+ auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug");
+ if (fuse_debug)
+ ++c;
+ return fuse_main(c, (char**)v, &fs_oper, (void*)this);
+}
+
+int FuseStore::start()
+{
+ dout(10) << __func__ << dendl;
+
+ memset(&info->args, 0, sizeof(info->args));
+ const char *v[] = {
+ "foo",
+ mount_point.c_str(),
+ "-f", // foreground
+ "-d", // debug
+ };
+ int c = 3;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ int rc;
+ struct fuse_cmdline_opts opts = {};
+#endif
+ auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug");
+ if (fuse_debug)
+ ++c;
+ fuse_args a = FUSE_ARGS_INIT(c, (char**)v);
+ info->args = a;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ if (fuse_parse_cmdline(&info->args, &opts) == -1) {
+#else
+ if (fuse_parse_cmdline(&info->args, &info->mountpoint, NULL, NULL) == -1) {
+#endif
+ derr << __func__ << " failed to parse args" << dendl;
+ return -EINVAL;
+ }
+
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ info->mountpoint = opts.mountpoint;
+ info->f = fuse_new(&info->args, &fs_oper, sizeof(fs_oper), (void*)this);
+ if (!info->f) {
+ derr << __func__ << " fuse_new failed" << dendl;
+ return -EIO;
+ }
+
+ rc = fuse_mount(info->f, info->mountpoint);
+ if (rc != 0) {
+ derr << __func__ << " fuse_mount failed" << dendl;
+ return -EIO;
+ }
+#else
+ info->ch = fuse_mount(info->mountpoint, &info->args);
+ if (!info->ch) {
+ derr << __func__ << " fuse_mount failed" << dendl;
+ return -EIO;
+ }
+
+ info->f = fuse_new(info->ch, &info->args, &fs_oper, sizeof(fs_oper),
+ (void*)this);
+ if (!info->f) {
+ fuse_unmount(info->mountpoint, info->ch);
+ derr << __func__ << " fuse_new failed" << dendl;
+ return -EIO;
+ }
+#endif
+
+ fuse_thread.create("fusestore");
+ dout(10) << __func__ << " done" << dendl;
+ return 0;
+}
+
+int FuseStore::loop()
+{
+ dout(10) << __func__ << " enter" << dendl;
+ int r = fuse_loop(info->f);
+ if (r)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ dout(10) << __func__ << " exit" << dendl;
+ return r;
+}
+
+int FuseStore::stop()
+{
+ dout(10) << __func__ << " enter" << dendl;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ fuse_unmount(info->f);
+#else
+ fuse_unmount(info->mountpoint, info->ch);
+#endif
+ fuse_thread.join();
+ fuse_destroy(info->f);
+ dout(10) << __func__ << " exit" << dendl;
+ return 0;
+}
diff --git a/src/os/FuseStore.h b/src/os/FuseStore.h
new file mode 100644
index 00000000..db39ca5e
--- /dev/null
+++ b/src/os/FuseStore.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_FUSESTORE_H
+#define CEPH_OS_FUSESTORE_H
+
+#include <string>
+#include <map>
+#include <mutex>
+#include <functional>
+
+#include "common/Thread.h"
+#include "include/buffer.h"
+
+class ObjectStore;
+
+class FuseStore {
+public:
+ ObjectStore *store;
+ std::string mount_point;
+ struct fs_info *info;
+ std::mutex lock;
+
+ struct OpenFile {
+ std::string path;
+ bufferlist bl;
+ bool dirty = false;
+ int ref = 0;
+ };
+ std::map<std::string,OpenFile*> open_files;
+
+ int open_file(std::string p, struct fuse_file_info *fi,
+ std::function<int(bufferlist *bl)> f);
+
+ class FuseThread : public Thread {
+ FuseStore *fs;
+ public:
+ explicit FuseThread(FuseStore *f) : fs(f) {}
+ void *entry() override {
+ fs->loop();
+ return NULL;
+ }
+ } fuse_thread;
+
+ FuseStore(ObjectStore *s, std::string p);
+ ~FuseStore();
+
+ int main();
+ int start();
+ int loop();
+ int stop();
+};
+
+#endif
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
new file mode 100644
index 00000000..f903333a
--- /dev/null
+++ b/src/os/ObjectMap.h
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OS_KEYVALUESTORE_H
+#define OS_KEYVALUESTORE_H
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "kv/KeyValueDB.h"
+#include "common/hobject.h"
+
+class SequencerPosition;
+
+/**
+ * Encapsulates the FileStore key value store
+ *
+ * Implementations of this interface will be used to implement TMAP
+ */
+class ObjectMap {
+public:
+ CephContext* cct;
+ boost::scoped_ptr<KeyValueDB> db;
+ /// Set keys and values from specified map
+ virtual int set_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ const map<string, bufferlist> &set, ///< [in] key to value map to set
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Set header
+ virtual int set_header(
+ const ghobject_t &oid, ///< [in] object containing map
+ const bufferlist &bl, ///< [in] header to set
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Retrieve header
+ virtual int get_header(
+ const ghobject_t &oid, ///< [in] object containing map
+ bufferlist *bl ///< [out] header to set
+ ) = 0;
+
+ /// Clear all map keys and values from oid
+ virtual int clear(
+ const ghobject_t &oid, ///< [in] object containing map
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Clear all map keys and values in to_clear from oid
+ virtual int rm_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ const set<string> &to_clear, ///< [in] Keys to clear
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Clear all omap keys and the header
+ virtual int clear_keys_header(
+ const ghobject_t &oid, ///< [in] oid to clear
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Get all keys and values
+ virtual int get(
+ const ghobject_t &oid, ///< [in] object containing map
+ bufferlist *header, ///< [out] Returned Header
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ ) = 0;
+
+ /// Get values for supplied keys
+ virtual int get_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ set<string> *keys ///< [out] Keys defined on oid
+ ) = 0;
+
+ /// Get values for supplied keys
+ virtual int get_values(
+ const ghobject_t &oid, ///< [in] object containing map
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ ) = 0;
+
+ /// Check key existence
+ virtual int check_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ ) = 0;
+
+ /// Get xattrs
+ virtual int get_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ const set<string> &to_get, ///< [in] keys to get
+ map<string, bufferlist> *out ///< [out] subset of attrs/vals defined
+ ) = 0;
+
+ /// Get all xattrs
+ virtual int get_all_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ set<string> *out ///< [out] attrs and values
+ ) = 0;
+
+ /// set xattrs in to_set
+ virtual int set_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ const map<string, bufferlist> &to_set,///< [in] attrs/values to set
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// remove xattrs in to_remove
+ virtual int remove_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ const set<string> &to_remove, ///< [in] attrs to remove
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+
+ /// Clone keys from oid map to target map
+ virtual int clone(
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) { return 0; }
+
+ /// Rename map because of name change
+ virtual int rename(
+ const ghobject_t &from, ///< [in] object containing map
+ const ghobject_t &to, ///< [in] new name
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) { return 0; }
+
+ /// For testing clone keys from oid map to target map using faster but more complex method
+ virtual int legacy_clone(
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) { return 0; }
+
+ /// Ensure all previous writes are durable
+ virtual int sync(
+ const ghobject_t *oid=0, ///< [in] object
+ const SequencerPosition *spos=0 ///< [in] Sequencer
+ ) { return 0; }
+
+ virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; }
+
+ virtual void compact() {}
+
+ typedef KeyValueDB::SimplestIteratorImpl ObjectMapIteratorImpl;
+ typedef std::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
+ virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
+ return ObjectMapIterator();
+ }
+
+ virtual KeyValueDB *get_db() { return nullptr; }
+
+ ObjectMap(CephContext* cct, KeyValueDB *db) : cct(cct), db(db) {}
+ virtual ~ObjectMap() {}
+};
+
+#endif
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
new file mode 100644
index 00000000..f9c6073d
--- /dev/null
+++ b/src/os/ObjectStore.cc
@@ -0,0 +1,160 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <ctype.h>
+#include <sstream>
+#include "ObjectStore.h"
+#include "common/Formatter.h"
+#include "common/safe_io.h"
+
+#include "filestore/FileStore.h"
+#include "memstore/MemStore.h"
+#if defined(WITH_BLUESTORE)
+#include "bluestore/BlueStore.h"
+#endif
+#include "kstore/KStore.h"
+
+void decode_str_str_map_to_bl(bufferlist::const_iterator& p,
+ bufferlist *out)
+{
+ auto start = p;
+ __u32 n;
+ decode(n, p);
+ unsigned len = 4;
+ while (n--) {
+ __u32 l;
+ decode(l, p);
+ p.advance(l);
+ len += 4 + l;
+ decode(l, p);
+ p.advance(l);
+ len += 4 + l;
+ }
+ start.copy(len, *out);
+}
+
+void decode_str_set_to_bl(bufferlist::const_iterator& p,
+ bufferlist *out)
+{
+ auto start = p;
+ __u32 n;
+ decode(n, p);
+ unsigned len = 4;
+ while (n--) {
+ __u32 l;
+ decode(l, p);
+ p.advance(l);
+ len += 4 + l;
+ }
+ start.copy(len, *out);
+}
+
+ObjectStore *ObjectStore::create(CephContext *cct,
+ const string& type,
+ const string& data,
+ const string& journal,
+ osflagbits_t flags)
+{
+ if (type == "filestore") {
+ return new FileStore(cct, data, journal, flags);
+ }
+ if (type == "memstore") {
+ return new MemStore(cct, data);
+ }
+#if defined(WITH_BLUESTORE)
+ if (type == "bluestore") {
+ return new BlueStore(cct, data);
+ }
+ if (type == "random") {
+ if (rand() % 2) {
+ return new FileStore(cct, data, journal, flags);
+ } else {
+ return new BlueStore(cct, data);
+ }
+ }
+#else
+ if (type == "random") {
+ return new FileStore(cct, data, journal, flags);
+ }
+#endif
+ if (type == "kstore" &&
+ cct->check_experimental_feature_enabled("kstore")) {
+ return new KStore(cct, data);
+ }
+ return NULL;
+}
+
+int ObjectStore::probe_block_device_fsid(
+ CephContext *cct,
+ const string& path,
+ uuid_d *fsid)
+{
+ int r;
+
+#if defined(WITH_BLUESTORE)
+ // first try bluestore -- it has a crc on its header and will fail
+ // reliably.
+ r = BlueStore::get_block_device_fsid(cct, path, fsid);
+ if (r == 0) {
+ lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
+ << *fsid << dendl;
+ return r;
+ }
+#endif
+
+ // okay, try FileStore (journal).
+ r = FileStore::get_block_device_fsid(cct, path, fsid);
+ if (r == 0) {
+ lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
+ << *fsid << dendl;
+ return r;
+ }
+
+ return -EINVAL;
+}
+
+int ObjectStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ string v = value;
+ v += "\n";
+ int r = safe_write_file(path.c_str(), key.c_str(),
+ v.c_str(), v.length(), 0600);
+ if (r < 0)
+ return r;
+ return 0;
+}
+
+int ObjectStore::read_meta(const std::string& key,
+ std::string *value)
+{
+ char buf[4096];
+ int r = safe_read_file(path.c_str(), key.c_str(),
+ buf, sizeof(buf));
+ if (r <= 0)
+ return r;
+ // drop trailing newlines
+ while (r && isspace(buf[r-1])) {
+ --r;
+ }
+ *value = string(buf, r);
+ return 0;
+}
+
+
+
+
+ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx) {
+
+ return out << "Transaction(" << &tx << ")";
+}
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
new file mode 100644
index 00000000..1c120689
--- /dev/null
+++ b/src/os/ObjectStore.h
@@ -0,0 +1,1925 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_OBJECTSTORE_H
+#define CEPH_OBJECTSTORE_H
+
+#include "include/Context.h"
+#include "include/buffer.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "osd/osd_types.h"
+#include "common/TrackedOp.h"
+#include "common/WorkQueue.h"
+#include "ObjectMap.h"
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <vector>
+#include <map>
+
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun)
+#include <sys/statvfs.h>
+#else
+#include <sys/vfs.h> /* or <sys/statfs.h> */
+#endif
+
+#define OPS_PER_PTR 32
+
+class CephContext;
+
+using std::vector;
+using std::string;
+using std::map;
+
+namespace ceph {
+ class Formatter;
+}
+
+/*
+ * low-level interface to the local OSD file system
+ */
+
+class Logger;
+class ContextQueue;
+
+static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl) {
+ encode(*attrset, bl);
+}
+
+// this isn't the best place for these, but...
+void decode_str_str_map_to_bl(bufferlist::const_iterator& p, bufferlist *out);
+void decode_str_set_to_bl(bufferlist::const_iterator& p, bufferlist *out);
+
+// Flag bits
+typedef uint32_t osflagbits_t;
+const int SKIP_JOURNAL_REPLAY = 1 << 0;
+const int SKIP_MOUNT_OMAP = 1 << 1;
+
+class ObjectStore {
+protected:
+ string path;
+
+public:
+ CephContext* cct;
+ /**
+ * create - create an ObjectStore instance.
+ *
+ * This is invoked once at initialization time.
+ *
+ * @param type type of store. This is a string from the configuration file.
+ * @param data path (or other descriptor) for data
+ * @param journal path (or other descriptor) for journal (optional)
+ * @param flags which filestores should check if applicable
+ */
+ static ObjectStore *create(CephContext *cct,
+ const string& type,
+ const string& data,
+ const string& journal,
+ osflagbits_t flags = 0);
+
+ /**
+ * probe a block device to learn the uuid of the owning OSD
+ *
+ * @param cct cct
+ * @param path path to device
+ * @param fsid [out] osd uuid
+ */
+ static int probe_block_device_fsid(
+ CephContext *cct,
+ const string& path,
+ uuid_d *fsid);
+
+ /**
+ * Fetch Object Store statistics.
+ *
+ * Currently only latency of write and apply times are measured.
+ *
+ * This appears to be called with nothing locked.
+ */
+ virtual objectstore_perf_stat_t get_cur_stats() = 0;
+
+ /**
+ * Fetch Object Store performance counters.
+ *
+ *
+ * This appears to be called with nothing locked.
+ */
+ virtual const PerfCounters* get_perf_counters() const = 0;
+
+ /**
+ * a collection also orders transactions
+ *
+ * Any transactions queued under a given collection will be applied in
+ * sequence. Transactions queued under different collections may run
+ * in parallel.
+ *
+ * ObjectStore users my get collection handles with open_collection() (or,
+ * for bootstrapping a new collection, create_new_collection()).
+ */
+ struct CollectionImpl : public RefCountedObject {
+ const coll_t cid;
+
+ CollectionImpl(const coll_t& c)
+ : RefCountedObject(NULL, 0),
+ cid(c) {}
+
+ /// wait for any queued transactions to apply
+ // block until any previous transactions are visible. specifically,
+ // collection_list and collection_empty need to reflect prior operations.
+ virtual void flush() = 0;
+
+ /**
+ * Async flush_commit
+ *
+ * There are two cases:
+ * 1) collection is currently idle: the method returns true. c is
+ * not touched.
+ * 2) collection is not idle: the method returns false and c is
+ * called asynchronously with a value of 0 once all transactions
+ * queued on this collection prior to the call have been applied
+ * and committed.
+ */
+ virtual bool flush_commit(Context *c) = 0;
+
+ const coll_t &get_cid() {
+ return cid;
+ }
+ };
+ typedef boost::intrusive_ptr<CollectionImpl> CollectionHandle;
+
+
+ /*********************************
+ *
+ * Object Contents and semantics
+ *
+ * All ObjectStore objects are identified as a named object
+ * (ghobject_t and hobject_t) in a named collection (coll_t).
+ * ObjectStore operations support the creation, mutation, deletion
+ * and enumeration of objects within a collection. Enumeration is
+ * in sorted key order (where keys are sorted by hash). Object names
+ * are globally unique.
+ *
+ * Each object has four distinct parts: byte data, xattrs, omap_header
+ * and omap entries.
+ *
+ * The data portion of an object is conceptually equivalent to a
+ * file in a file system. Random and Partial access for both read
+ * and write operations is required. The ability to have a sparse
+ * implementation of the data portion of an object is beneficial for
+ * some workloads, but not required. There is a system-wide limit on
+ * the maximum size of an object, which is typically around 100 MB.
+ *
+ * Xattrs are equivalent to the extended attributes of file
+ * systems. Xattrs are a set of key/value pairs. Sub-value access
+ * is not required. It is possible to enumerate the set of xattrs in
+ * key order. At the implementation level, xattrs are used
+ * exclusively internal to Ceph and the implementer can expect the
+ * total size of all of the xattrs on an object to be relatively
+ * small, i.e., less than 64KB. Much of Ceph assumes that accessing
+ * xattrs on temporally adjacent object accesses (recent past or
+ * near future) is inexpensive.
+ *
+ * omap_header is a single blob of data. It can be read or written
+ * in total.
+ *
+ * Omap entries are conceptually the same as xattrs
+ * but in a different address space. In other words, you can have
+ * the same key as an xattr and an omap entry and they have distinct
+ * values. Enumeration of xattrs doesn't include omap entries and
+ * vice versa. The size and access characteristics of omap entries
+ * are very different from xattrs. In particular, the value portion
+ * of an omap entry can be quite large (MBs). More importantly, the
+ * interface must support efficient range queries on omap entries even
+ * when there are a large numbers of entries.
+ *
+ *********************************/
+
+ /*******************************
+ *
+ * Collections
+ *
+ * A collection is simply a grouping of objects. Collections have
+ * names (coll_t) and can be enumerated in order. Like an
+ * individual object, a collection also has a set of xattrs.
+ *
+ *
+ */
+
+
+ /*********************************
+ * transaction
+ *
+ * A Transaction represents a sequence of primitive mutation
+ * operations.
+ *
+ * Three events in the life of a Transaction result in
+ * callbacks. Any Transaction can contain any number of callback
+ * objects (Context) for any combination of the three classes of
+ * callbacks:
+ *
+ * on_applied_sync, on_applied, and on_commit.
+ *
+ * The "on_applied" and "on_applied_sync" callbacks are invoked when
+ * the modifications requested by the Transaction are visible to
+ * subsequent ObjectStore operations, i.e., the results are
+ * readable. The only conceptual difference between on_applied and
+ * on_applied_sync is the specific thread and locking environment in
+ * which the callbacks operate. "on_applied_sync" is called
+ * directly by an ObjectStore execution thread. It is expected to
+ * execute quickly and must not acquire any locks of the calling
+ * environment. Conversely, "on_applied" is called from the separate
+ * Finisher thread, meaning that it can contend for calling
+ * environment locks. NB, on_applied and on_applied_sync are
+ * sometimes called on_readable and on_readable_sync.
+ *
+ * The "on_commit" callback is also called from the Finisher thread
+ * and indicates that all of the mutations have been durably
+ * committed to stable storage (i.e., are now software/hardware
+ * crashproof).
+ *
+ * At the implementation level, each mutation primitive (and its
+ * associated data) can be serialized to a single buffer. That
+ * serialization, however, does not copy any data, but (using the
+ * bufferlist library) will reference the original buffers. This
+ * implies that the buffer that contains the data being submitted
+ * must remain stable until the on_commit callback completes. In
+ * practice, bufferlist handles all of this for you and this
+ * subtlety is only relevant if you are referencing an existing
+ * buffer via buffer::raw_static.
+ *
+ * Some implementations of ObjectStore choose to implement their own
+ * form of journaling that uses the serialized form of a
+ * Transaction. This requires that the encode/decode logic properly
+ * version itself and handle version upgrades that might change the
+ * format of the encoded Transaction. This has already happened a
+ * couple of times and the Transaction object contains some helper
+ * variables that aid in this legacy decoding:
+ *
+ * sobject_encoding detects an older/simpler version of oid
+ * present in pre-bobtail versions of ceph. use_pool_override
+ * also detects a situation where the pool of an oid can be
+ * overridden for legacy operations/buffers. For non-legacy
+ * implementations of ObjectStore, neither of these fields are
+ * relevant.
+ *
+ *
+ * TRANSACTION ISOLATION
+ *
+ * Except as noted above, isolation is the responsibility of the
+ * caller. In other words, if any storage element (storage element
+ * == any of the four portions of an object as described above) is
+ * altered by a transaction (including deletion), the caller
+ * promises not to attempt to read that element while the
+ * transaction is pending (here pending means from the time of
+ * issuance until the "on_applied_sync" callback has been
+ * received). Violations of isolation need not be detected by
+ * ObjectStore and there is no corresponding error mechanism for
+ * reporting an isolation violation (crashing would be the
+ * appropriate way to report an isolation violation if detected).
+ *
+ * Enumeration operations may violate transaction isolation as
+ * described above when a storage element is being created or
+ * deleted as part of a transaction. In this case, ObjectStore is
+ * allowed to consider the enumeration operation to either precede
+ * or follow the violating transaction element. In other words, the
+ * presence/absence of the mutated element in the enumeration is
+ * entirely at the discretion of ObjectStore. The arbitrary ordering
+ * applies independently to each transaction element. For example,
+ * if a transaction contains two mutating elements "create A" and
+ * "delete B". And an enumeration operation is performed while this
+ * transaction is pending. It is permissible for ObjectStore to
+ * report any of the four possible combinations of the existence of
+ * A and B.
+ *
+ */
+ class Transaction {
+ public:
+ enum {
+ OP_NOP = 0,
+ OP_TOUCH = 9, // cid, oid
+ OP_WRITE = 10, // cid, oid, offset, len, bl
+ OP_ZERO = 11, // cid, oid, offset, len
+ OP_TRUNCATE = 12, // cid, oid, len
+ OP_REMOVE = 13, // cid, oid
+ OP_SETATTR = 14, // cid, oid, attrname, bl
+ OP_SETATTRS = 15, // cid, oid, attrset
+ OP_RMATTR = 16, // cid, oid, attrname
+ OP_CLONE = 17, // cid, oid, newoid
+ OP_CLONERANGE = 18, // cid, oid, newoid, offset, len
+ OP_CLONERANGE2 = 30, // cid, oid, newoid, srcoff, len, dstoff
+
+ OP_TRIMCACHE = 19, // cid, oid, offset, len **DEPRECATED**
+
+ OP_MKCOLL = 20, // cid
+ OP_RMCOLL = 21, // cid
+ OP_COLL_ADD = 22, // cid, oldcid, oid
+ OP_COLL_REMOVE = 23, // cid, oid
+ OP_COLL_SETATTR = 24, // cid, attrname, bl
+ OP_COLL_RMATTR = 25, // cid, attrname
+ OP_COLL_SETATTRS = 26, // cid, attrset
+ OP_COLL_MOVE = 8, // newcid, oldcid, oid
+
+ OP_RMATTRS = 28, // cid, oid
+ OP_COLL_RENAME = 29, // cid, newcid
+
+ OP_OMAP_CLEAR = 31, // cid
+ OP_OMAP_SETKEYS = 32, // cid, attrset
+ OP_OMAP_RMKEYS = 33, // cid, keyset
+ OP_OMAP_SETHEADER = 34, // cid, header
+ OP_SPLIT_COLLECTION = 35, // cid, bits, destination
+ OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination
+ doesn't create the destination */
+ OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey
+ OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid
+
+ OP_SETALLOCHINT = 39, // cid, oid, object_size, write_size
+ OP_COLL_HINT = 40, // cid, type, bl
+
+ OP_TRY_RENAME = 41, // oldcid, oldoid, newoid
+
+ OP_COLL_SET_BITS = 42, // cid, bits
+
+ OP_MERGE_COLLECTION = 43, // cid, destination
+ };
+
+ // Transaction hint type
+ enum {
+ COLL_HINT_EXPECTED_NUM_OBJECTS = 1,
+ };
+
+ struct Op {
+ ceph_le32 op;
+ ceph_le32 cid;
+ ceph_le32 oid;
+ ceph_le64 off;
+ ceph_le64 len;
+ ceph_le32 dest_cid;
+ ceph_le32 dest_oid; //OP_CLONE, OP_CLONERANGE
+ ceph_le64 dest_off; //OP_CLONERANGE
+ union {
+ struct {
+ ceph_le32 hint_type; //OP_COLL_HINT
+ };
+ struct {
+ ceph_le32 alloc_hint_flags; //OP_SETALLOCHINT
+ };
+ };
+ ceph_le64 expected_object_size; //OP_SETALLOCHINT
+ ceph_le64 expected_write_size; //OP_SETALLOCHINT
+ ceph_le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS,
+ //OP_MKCOLL
+ ceph_le32 split_rem; //OP_SPLIT_COLLECTION2
+ } __attribute__ ((packed)) ;
+
+ struct TransactionData {
+ ceph_le64 ops;
+ ceph_le32 largest_data_len;
+ ceph_le32 largest_data_off;
+ ceph_le32 largest_data_off_in_data_bl;
+ ceph_le32 fadvise_flags;
+
+ TransactionData() noexcept :
+ ops(init_le64(0)),
+ largest_data_len(init_le32(0)),
+ largest_data_off(init_le32(0)),
+ largest_data_off_in_data_bl(init_le32(0)),
+ fadvise_flags(init_le32(0)) { }
+
+ // override default move operations to reset default values
+ TransactionData(TransactionData&& other) noexcept :
+ ops(other.ops),
+ largest_data_len(other.largest_data_len),
+ largest_data_off(other.largest_data_off),
+ largest_data_off_in_data_bl(other.largest_data_off_in_data_bl),
+ fadvise_flags(other.fadvise_flags) {
+ other.ops = 0;
+ other.largest_data_len = 0;
+ other.largest_data_off = 0;
+ other.largest_data_off_in_data_bl = 0;
+ other.fadvise_flags = 0;
+ }
+ TransactionData& operator=(TransactionData&& other) noexcept {
+ ops = other.ops;
+ largest_data_len = other.largest_data_len;
+ largest_data_off = other.largest_data_off;
+ largest_data_off_in_data_bl = other.largest_data_off_in_data_bl;
+ fadvise_flags = other.fadvise_flags;
+ other.ops = 0;
+ other.largest_data_len = 0;
+ other.largest_data_off = 0;
+ other.largest_data_off_in_data_bl = 0;
+ other.fadvise_flags = 0;
+ return *this;
+ }
+
+ TransactionData(const TransactionData& other) = default;
+ TransactionData& operator=(const TransactionData& other) = default;
+
+ void encode(bufferlist& bl) const {
+ bl.append((char*)this, sizeof(TransactionData));
+ }
+ void decode(bufferlist::const_iterator &bl) {
+ bl.copy(sizeof(TransactionData), (char*)this);
+ }
+ } __attribute__ ((packed)) ;
+
+ private:
+ TransactionData data;
+
+ map<coll_t, __le32> coll_index;
+ map<ghobject_t, __le32> object_index;
+
+ __le32 coll_id {0};
+ __le32 object_id {0};
+
+ bufferlist data_bl;
+ bufferlist op_bl;
+
+ list<Context *> on_applied;
+ list<Context *> on_commit;
+ list<Context *> on_applied_sync;
+
+ public:
+ Transaction() = default;
+
+ explicit Transaction(bufferlist::const_iterator &dp) {
+ decode(dp);
+ }
+ explicit Transaction(bufferlist &nbl) {
+ auto dp = nbl.cbegin();
+ decode(dp);
+ }
+
+ // override default move operations to reset default values
+ Transaction(Transaction&& other) noexcept :
+ data(std::move(other.data)),
+ coll_index(std::move(other.coll_index)),
+ object_index(std::move(other.object_index)),
+ coll_id(other.coll_id),
+ object_id(other.object_id),
+ data_bl(std::move(other.data_bl)),
+ op_bl(std::move(other.op_bl)),
+ on_applied(std::move(other.on_applied)),
+ on_commit(std::move(other.on_commit)),
+ on_applied_sync(std::move(other.on_applied_sync)) {
+ other.coll_id = 0;
+ other.object_id = 0;
+ }
+
+ Transaction& operator=(Transaction&& other) noexcept {
+ data = std::move(other.data);
+ coll_index = std::move(other.coll_index);
+ object_index = std::move(other.object_index);
+ coll_id = other.coll_id;
+ object_id = other.object_id;
+ data_bl = std::move(other.data_bl);
+ op_bl = std::move(other.op_bl);
+ on_applied = std::move(other.on_applied);
+ on_commit = std::move(other.on_commit);
+ on_applied_sync = std::move(other.on_applied_sync);
+ other.coll_id = 0;
+ other.object_id = 0;
+ return *this;
+ }
+
+ Transaction(const Transaction& other) = default;
+ Transaction& operator=(const Transaction& other) = default;
+
+ // expose object_index for FileStore::Op's benefit
+ const map<ghobject_t, __le32>& get_object_index() const {
+ return object_index;
+ }
+
+ /* Operations on callback contexts */
+ void register_on_applied(Context *c) {
+ if (!c) return;
+ on_applied.push_back(c);
+ }
+ void register_on_commit(Context *c) {
+ if (!c) return;
+ on_commit.push_back(c);
+ }
+ void register_on_applied_sync(Context *c) {
+ if (!c) return;
+ on_applied_sync.push_back(c);
+ }
+ void register_on_complete(Context *c) {
+ if (!c) return;
+ RunOnDeleteRef _complete (std::make_shared<RunOnDelete>(c));
+ register_on_applied(new ContainerContext<RunOnDeleteRef>(_complete));
+ register_on_commit(new ContainerContext<RunOnDeleteRef>(_complete));
+ }
+ bool has_contexts() const {
+ return
+ !on_commit.empty() ||
+ !on_applied.empty() ||
+ !on_applied_sync.empty();
+ }
+
+ static void collect_contexts(
+ vector<Transaction>& t,
+ Context **out_on_applied,
+ Context **out_on_commit,
+ Context **out_on_applied_sync) {
+ ceph_assert(out_on_applied);
+ ceph_assert(out_on_commit);
+ ceph_assert(out_on_applied_sync);
+ list<Context *> on_applied, on_commit, on_applied_sync;
+ for (auto& i : t) {
+ on_applied.splice(on_applied.end(), i.on_applied);
+ on_commit.splice(on_commit.end(), i.on_commit);
+ on_applied_sync.splice(on_applied_sync.end(), i.on_applied_sync);
+ }
+ *out_on_applied = C_Contexts::list_to_context(on_applied);
+ *out_on_commit = C_Contexts::list_to_context(on_commit);
+ *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync);
+ }
+ static void collect_contexts(
+ vector<Transaction>& t,
+ list<Context*> *out_on_applied,
+ list<Context*> *out_on_commit,
+ list<Context*> *out_on_applied_sync) {
+ ceph_assert(out_on_applied);
+ ceph_assert(out_on_commit);
+ ceph_assert(out_on_applied_sync);
+ for (auto& i : t) {
+ out_on_applied->splice(out_on_applied->end(), i.on_applied);
+ out_on_commit->splice(out_on_commit->end(), i.on_commit);
+ out_on_applied_sync->splice(out_on_applied_sync->end(),
+ i.on_applied_sync);
+ }
+ }
+
+ Context *get_on_applied() {
+ return C_Contexts::list_to_context(on_applied);
+ }
+ Context *get_on_commit() {
+ return C_Contexts::list_to_context(on_commit);
+ }
+ Context *get_on_applied_sync() {
+ return C_Contexts::list_to_context(on_applied_sync);
+ }
+
+ void set_fadvise_flags(uint32_t flags) {
+ data.fadvise_flags = flags;
+ }
+ void set_fadvise_flag(uint32_t flag) {
+ data.fadvise_flags = data.fadvise_flags | flag;
+ }
+ uint32_t get_fadvise_flags() { return data.fadvise_flags; }
+
+ void swap(Transaction& other) noexcept {
+ std::swap(data, other.data);
+ std::swap(on_applied, other.on_applied);
+ std::swap(on_commit, other.on_commit);
+ std::swap(on_applied_sync, other.on_applied_sync);
+
+ std::swap(coll_index, other.coll_index);
+ std::swap(object_index, other.object_index);
+ std::swap(coll_id, other.coll_id);
+ std::swap(object_id, other.object_id);
+ op_bl.swap(other.op_bl);
+ data_bl.swap(other.data_bl);
+ }
+
+ void _update_op(Op* op,
+ vector<__le32> &cm,
+ vector<__le32> &om) {
+
+ switch (op->op) {
+ case OP_NOP:
+ break;
+
+ case OP_TOUCH:
+ case OP_REMOVE:
+ case OP_SETATTR:
+ case OP_SETATTRS:
+ case OP_RMATTR:
+ case OP_RMATTRS:
+ case OP_COLL_REMOVE:
+ case OP_OMAP_CLEAR:
+ case OP_OMAP_SETKEYS:
+ case OP_OMAP_RMKEYS:
+ case OP_OMAP_RMKEYRANGE:
+ case OP_OMAP_SETHEADER:
+ case OP_WRITE:
+ case OP_ZERO:
+ case OP_TRUNCATE:
+ case OP_SETALLOCHINT:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ break;
+
+ case OP_CLONERANGE2:
+ case OP_CLONE:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ op->dest_oid = om[op->dest_oid];
+ break;
+
+ case OP_MKCOLL:
+ case OP_RMCOLL:
+ case OP_COLL_SETATTR:
+ case OP_COLL_RMATTR:
+ case OP_COLL_SETATTRS:
+ case OP_COLL_HINT:
+ case OP_COLL_SET_BITS:
+ ceph_assert(op->cid < cm.size());
+ op->cid = cm[op->cid];
+ break;
+
+ case OP_COLL_ADD:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_cid < om.size());
+ op->cid = cm[op->cid];
+ op->dest_cid = cm[op->dest_cid];
+ op->oid = om[op->oid];
+ break;
+
+ case OP_COLL_MOVE_RENAME:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_cid < cm.size());
+ ceph_assert(op->dest_oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ op->dest_cid = cm[op->dest_cid];
+ op->dest_oid = om[op->dest_oid];
+ break;
+
+ case OP_TRY_RENAME:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ op->dest_oid = om[op->dest_oid];
+ break;
+
+ case OP_SPLIT_COLLECTION2:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->dest_cid < cm.size());
+ op->cid = cm[op->cid];
+ op->dest_cid = cm[op->dest_cid];
+ break;
+
+ case OP_MERGE_COLLECTION:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->dest_cid < cm.size());
+ op->cid = cm[op->cid];
+ op->dest_cid = cm[op->dest_cid];
+ break;
+
+ default:
+ ceph_abort_msg("Unknown OP");
+ }
+ }
+ void _update_op_bl(
+ bufferlist& bl,
+ vector<__le32> &cm,
+ vector<__le32> &om) {
+ for (auto& bp : bl.buffers()) {
+ ceph_assert(bp.length() % sizeof(Op) == 0);
+
+ char* raw_p = const_cast<char*>(bp.c_str());
+ char* raw_end = raw_p + bp.length();
+ while (raw_p < raw_end) {
+ _update_op(reinterpret_cast<Op*>(raw_p), cm, om);
+ raw_p += sizeof(Op);
+ }
+ }
+ }
+ /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction
+ void append(Transaction& other) {
+
+ data.ops = data.ops + other.data.ops;
+ if (other.data.largest_data_len > data.largest_data_len) {
+ data.largest_data_len = other.data.largest_data_len;
+ data.largest_data_off = other.data.largest_data_off;
+ data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl;
+ }
+ data.fadvise_flags = data.fadvise_flags | other.data.fadvise_flags;
+ on_applied.splice(on_applied.end(), other.on_applied);
+ on_commit.splice(on_commit.end(), other.on_commit);
+ on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync);
+
+ //append coll_index & object_index
+ vector<__le32> cm(other.coll_index.size());
+ map<coll_t, __le32>::iterator coll_index_p;
+ for (coll_index_p = other.coll_index.begin();
+ coll_index_p != other.coll_index.end();
+ ++coll_index_p) {
+ cm[coll_index_p->second] = _get_coll_id(coll_index_p->first);
+ }
+
+ vector<__le32> om(other.object_index.size());
+ map<ghobject_t, __le32>::iterator object_index_p;
+ for (object_index_p = other.object_index.begin();
+ object_index_p != other.object_index.end();
+ ++object_index_p) {
+ om[object_index_p->second] = _get_object_id(object_index_p->first);
+ }
+
+ //the other.op_bl SHOULD NOT be changes during append operation,
+ //we use additional bufferlist to avoid this problem
+ bufferlist other_op_bl;
+ {
+ bufferptr other_op_bl_ptr(other.op_bl.length());
+ other.op_bl.copy(0, other.op_bl.length(), other_op_bl_ptr.c_str());
+ other_op_bl.append(std::move(other_op_bl_ptr));
+ }
+
+ //update other_op_bl with cm & om
+ //When the other is appended to current transaction, all coll_index and
+ //object_index in other.op_buffer should be updated by new index of the
+ //combined transaction
+ _update_op_bl(other_op_bl, cm, om);
+
+ //append op_bl
+ op_bl.append(other_op_bl);
+ //append data_bl
+ data_bl.append(other.data_bl);
+ }
+
+ /** Inquires about the Transaction as a whole. */
+
+ /// How big is the encoded Transaction buffer?
+ uint64_t get_encoded_bytes() {
+ //layout: data_bl + op_bl + coll_index + object_index + data
+
+ // coll_index size, object_index size and sizeof(transaction_data)
+ // all here, so they may be computed at compile-time
+ size_t final_size = sizeof(__u32) * 2 + sizeof(data);
+
+ // coll_index second and object_index second
+ final_size += (coll_index.size() + object_index.size()) * sizeof(__le32);
+
+ // coll_index first
+ for (auto p = coll_index.begin(); p != coll_index.end(); ++p) {
+ final_size += p->first.encoded_size();
+ }
+
+ // object_index first
+ for (auto p = object_index.begin(); p != object_index.end(); ++p) {
+ final_size += p->first.encoded_size();
+ }
+
+ return data_bl.length() +
+ op_bl.length() +
+ final_size;
+ }
+
+ /// Retain old version for regression testing purposes
+ uint64_t get_encoded_bytes_test() {
+ using ceph::encode;
+ //layout: data_bl + op_bl + coll_index + object_index + data
+ bufferlist bl;
+ encode(coll_index, bl);
+ encode(object_index, bl);
+
+ return data_bl.length() +
+ op_bl.length() +
+ bl.length() +
+ sizeof(data);
+ }
+
+ uint64_t get_num_bytes() {
+ return get_encoded_bytes();
+ }
+ /// Size of largest data buffer to the "write" operation encountered so far
+ uint32_t get_data_length() {
+ return data.largest_data_len;
+ }
+ /// offset within the encoded buffer to the start of the largest data buffer that's encoded
+ uint32_t get_data_offset() {
+ if (data.largest_data_off_in_data_bl) {
+ return data.largest_data_off_in_data_bl +
+ sizeof(__u8) + // encode struct_v
+ sizeof(__u8) + // encode compat_v
+ sizeof(__u32) + // encode len
+ sizeof(__u32); // data_bl len
+ }
+ return 0; // none
+ }
+ /// offset of buffer as aligned to destination within object.
+ int get_data_alignment() {
+ if (!data.largest_data_len)
+ return 0;
+ return (0 - get_data_offset()) & ~CEPH_PAGE_MASK;
+ }
+ /// Is the Transaction empty (no operations)
+ bool empty() {
+ return !data.ops;
+ }
+ /// Number of operations in the transaction
+ int get_num_ops() {
+ return data.ops;
+ }
+
+ /**
+ * iterator
+ *
+ * Helper object to parse Transactions.
+ *
+ * ObjectStore instances use this object to step down the encoded
+ * buffer decoding operation codes and parameters as we go.
+ *
+ */
+ class iterator {
+ Transaction *t;
+
+ uint64_t ops;
+ char* op_buffer_p;
+
+ bufferlist::const_iterator data_bl_p;
+
+ public:
+ vector<coll_t> colls;
+ vector<ghobject_t> objects;
+
+ private:
+ explicit iterator(Transaction *t)
+ : t(t),
+ data_bl_p(t->data_bl.cbegin()),
+ colls(t->coll_index.size()),
+ objects(t->object_index.size()) {
+
+ ops = t->data.ops;
+ op_buffer_p = t->op_bl.c_str();
+
+ map<coll_t, __le32>::iterator coll_index_p;
+ for (coll_index_p = t->coll_index.begin();
+ coll_index_p != t->coll_index.end();
+ ++coll_index_p) {
+ colls[coll_index_p->second] = coll_index_p->first;
+ }
+
+ map<ghobject_t, __le32>::iterator object_index_p;
+ for (object_index_p = t->object_index.begin();
+ object_index_p != t->object_index.end();
+ ++object_index_p) {
+ objects[object_index_p->second] = object_index_p->first;
+ }
+ }
+
+ friend class Transaction;
+
+ public:
+
+ bool have_op() {
+ return ops > 0;
+ }
+ Op* decode_op() {
+ ceph_assert(ops > 0);
+
+ Op* op = reinterpret_cast<Op*>(op_buffer_p);
+ op_buffer_p += sizeof(Op);
+ ops--;
+
+ return op;
+ }
+ string decode_string() {
+ using ceph::decode;
+ string s;
+ decode(s, data_bl_p);
+ return s;
+ }
+ void decode_bp(bufferptr& bp) {
+ using ceph::decode;
+ decode(bp, data_bl_p);
+ }
+ void decode_bl(bufferlist& bl) {
+ using ceph::decode;
+ decode(bl, data_bl_p);
+ }
+ void decode_attrset(map<string,bufferptr>& aset) {
+ using ceph::decode;
+ decode(aset, data_bl_p);
+ }
+ void decode_attrset(map<string,bufferlist>& aset) {
+ using ceph::decode;
+ decode(aset, data_bl_p);
+ }
+ void decode_attrset_bl(bufferlist *pbl) {
+ decode_str_str_map_to_bl(data_bl_p, pbl);
+ }
+ void decode_keyset(set<string> &keys){
+ using ceph::decode;
+ decode(keys, data_bl_p);
+ }
+ void decode_keyset_bl(bufferlist *pbl){
+ decode_str_set_to_bl(data_bl_p, pbl);
+ }
+
+ const ghobject_t &get_oid(__le32 oid_id) {
+ ceph_assert(oid_id < objects.size());
+ return objects[oid_id];
+ }
+ const coll_t &get_cid(__le32 cid_id) {
+ ceph_assert(cid_id < colls.size());
+ return colls[cid_id];
+ }
+ uint32_t get_fadvise_flags() const {
+ return t->get_fadvise_flags();
+ }
+ };
+
+ iterator begin() {
+ return iterator(this);
+ }
+
+private:
+ void _build_actions_from_tbl();
+
+ /**
+ * Helper functions to encode the various mutation elements of a
+ * transaction. These are 1:1 with the operation codes (see
+ * enumeration above). These routines ensure that the
+ * encoder/creator of a transaction gets the right data in the
+ * right place. Sadly, there's no corresponding version nor any
+ * form of seat belts for the decoder.
+ */
+ Op* _get_next_op() {
+ if (op_bl.get_append_buffer_unused_tail_length() < sizeof(Op)) {
+ op_bl.reserve(sizeof(Op) * OPS_PER_PTR);
+ }
+ // append_hole ensures bptr merging. Even huge number of ops
+ // shouldn't result in overpopulating bl::_buffers.
+ char* const p = op_bl.append_hole(sizeof(Op)).c_str();
+ memset(p, 0, sizeof(Op));
+ return reinterpret_cast<Op*>(p);
+ }
+ __le32 _get_coll_id(const coll_t& coll) {
+ map<coll_t, __le32>::iterator c = coll_index.find(coll);
+ if (c != coll_index.end())
+ return c->second;
+
+ __le32 index_id = coll_id++;
+ coll_index[coll] = index_id;
+ return index_id;
+ }
+ __le32 _get_object_id(const ghobject_t& oid) {
+ map<ghobject_t, __le32>::iterator o = object_index.find(oid);
+ if (o != object_index.end())
+ return o->second;
+
+ __le32 index_id = object_id++;
+ object_index[oid] = index_id;
+ return index_id;
+ }
+
+public:
+ /// noop. 'nuf said
+ void nop() {
+ Op* _op = _get_next_op();
+ _op->op = OP_NOP;
+ data.ops = data.ops + 1;
+ }
+ /**
+ * touch
+ *
+ * Ensure the existance of an object in a collection. Create an
+ * empty object if necessary
+ */
+ void touch(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_TOUCH;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * Write data to an offset within an object. If the object is too
+ * small, it is expanded as needed. It is possible to specify an
+ * offset beyond the current end of an object and it will be
+ * expanded as needed. Simple implementations of ObjectStore will
+ * just zero the data between the old end of the object and the
+ * newly provided data. More sophisticated implementations of
+ * ObjectStore will omit the untouched data and store it as a
+ * "hole" in the file.
+ *
+ * Note that a 0-length write does not affect the size of the object.
+ */
+ void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len,
+ const bufferlist& write_data, uint32_t flags = 0) {
+ using ceph::encode;
+ uint32_t orig_len = data_bl.length();
+ Op* _op = _get_next_op();
+ _op->op = OP_WRITE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->off = off;
+ _op->len = len;
+ encode(write_data, data_bl);
+
+ ceph_assert(len == write_data.length());
+ data.fadvise_flags = data.fadvise_flags | flags;
+ if (write_data.length() > data.largest_data_len) {
+ data.largest_data_len = write_data.length();
+ data.largest_data_off = off;
+ data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to
+ }
+ data.ops = data.ops + 1;
+ }
+ /**
+ * zero out the indicated byte range within an object. Some
+ * ObjectStore instances may optimize this to release the
+ * underlying storage space.
+ *
+ * If the zero range extends beyond the end of the object, the object
+ * size is extended, just as if we were writing a buffer full of zeros.
+ * EXCEPT if the length is 0, in which case (just like a 0-length write)
+ * we do not adjust the object size.
+ */
+ void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
+ Op* _op = _get_next_op();
+ _op->op = OP_ZERO;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->off = off;
+ _op->len = len;
+ data.ops = data.ops + 1;
+ }
+ /// Discard all data in the object beyond the specified size.
+ void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) {
+ Op* _op = _get_next_op();
+ _op->op = OP_TRUNCATE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->off = off;
+ data.ops = data.ops + 1;
+ }
+ /// Remove an object. All four parts of the object are removed.
+ void remove(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_REMOVE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /// Set an xattr of an object
+ void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, bufferlist& val) {
+ string n(name);
+ setattr(cid, oid, n, val);
+ }
+ /// Set an xattr of an object
+ void setattr(const coll_t& cid, const ghobject_t& oid, const string& s, bufferlist& val) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_SETATTR;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(s, data_bl);
+ encode(val, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// Set multiple xattrs of an object
+ void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferptr>& attrset) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_SETATTRS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(attrset, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// Set multiple xattrs of an object
+ void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferlist>& attrset) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_SETATTRS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(attrset, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// remove an xattr from an object
+ void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) {
+ string n(name);
+ rmattr(cid, oid, n);
+ }
+ /// remove an xattr from an object
+ void rmattr(const coll_t& cid, const ghobject_t& oid, const string& s) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_RMATTR;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(s, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// remove all xattrs from an object
+ void rmattrs(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_RMATTRS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * Clone an object into another object.
+ *
+ * Low-cost (e.g., O(1)) cloning (if supported) is best, but
+ * fallback to an O(n) copy is allowed. All four parts of the
+ * object are cloned (data, xattrs, omap header, omap
+ * entries).
+ *
+ * The destination named object may already exist, in
+ * which case its previous contents are discarded.
+ */
+ void clone(const coll_t& cid, const ghobject_t& oid,
+ const ghobject_t& noid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_CLONE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->dest_oid = _get_object_id(noid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * Clone a byte range from one object to another.
+ *
+ * The data portion of the destination object receives a copy of a
+ * portion of the data from the source object. None of the other
+ * three parts of an object is copied from the source.
+ *
+ * The destination object size may be extended to the dstoff + len.
+ *
+ * The source range *must* overlap with the source object data. If it does
+ * not the result is undefined.
+ */
+ void clone_range(const coll_t& cid, const ghobject_t& oid,
+ const ghobject_t& noid,
+ uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
+ Op* _op = _get_next_op();
+ _op->op = OP_CLONERANGE2;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->dest_oid = _get_object_id(noid);
+ _op->off = srcoff;
+ _op->len = srclen;
+ _op->dest_off = dstoff;
+ data.ops = data.ops + 1;
+ }
+
+ /// Create the collection
+ void create_collection(const coll_t& cid, int bits) {
+ Op* _op = _get_next_op();
+ _op->op = OP_MKCOLL;
+ _op->cid = _get_coll_id(cid);
+ _op->split_bits = bits;
+ data.ops = data.ops + 1;
+ }
+
+ /**
+ * Give the collection a hint.
+ *
+ * @param cid - collection id.
+ * @param type - hint type.
+ * @param hint - the hint payload, which contains the customized
+ * data along with the hint type.
+ */
+ void collection_hint(const coll_t& cid, uint32_t type, const bufferlist& hint) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_HINT;
+ _op->cid = _get_coll_id(cid);
+ _op->hint_type = type;
+ encode(hint, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// remove the collection, the collection must be empty
+ void remove_collection(const coll_t& cid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_RMCOLL;
+ _op->cid = _get_coll_id(cid);
+ data.ops = data.ops + 1;
+ }
+ void collection_move(const coll_t& cid, const coll_t &oldcid, const ghobject_t& oid)
+ __attribute__ ((deprecated)) {
+ // NOTE: we encode this as a fixed combo of ADD + REMOVE. they
+ // always appear together, so this is effectively a single MOVE.
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_ADD;
+ _op->cid = _get_coll_id(oldcid);
+ _op->oid = _get_object_id(oid);
+ _op->dest_cid = _get_coll_id(cid);
+ data.ops = data.ops + 1;
+
+ _op = _get_next_op();
+ _op->op = OP_COLL_REMOVE;
+ _op->cid = _get_coll_id(oldcid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ const coll_t &cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_MOVE_RENAME;
+ _op->cid = _get_coll_id(oldcid);
+ _op->oid = _get_object_id(oldoid);
+ _op->dest_cid = _get_coll_id(cid);
+ _op->dest_oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ void try_rename(const coll_t &cid, const ghobject_t& oldoid,
+ const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_TRY_RENAME;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oldoid);
+ _op->dest_oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove omap from oid
+ void omap_clear(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid ///< [in] Object from which to remove omap
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_CLEAR;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /// Set keys on oid omap. Replaces duplicate keys.
+ void omap_setkeys(
+ const coll_t& cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
+ const map<string, bufferlist> &attrset ///< [in] Replacement keys and values
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_SETKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(attrset, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Set keys on an oid omap (bufferlist variant).
+ void omap_setkeys(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
+ const bufferlist &attrset_bl ///< [in] Replacement keys and values
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_SETKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data_bl.append(attrset_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove keys from oid omap
+ void omap_rmkeys(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const set<string> &keys ///< [in] Keys to clear
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(keys, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove keys from oid omap
+ void omap_rmkeys(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const bufferlist &keys_bl ///< [in] Keys to clear
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data_bl.append(keys_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove key range from oid omap
+ void omap_rmkeyrange(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap keys
+ const string& first, ///< [in] first key in range
+ const string& last ///< [in] first key past range, range is [first,last)
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYRANGE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(first, data_bl);
+ encode(last, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Set omap header
+ void omap_setheader(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object
+ const bufferlist &bl ///< [in] Header value
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_SETHEADER;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(bl, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Split collection based on given prefixes, objects matching the specified bits/rem are
+ /// moved to the new collection
+ void split_collection(
+ const coll_t &cid,
+ uint32_t bits,
+ uint32_t rem,
+ const coll_t &destination) {
+ Op* _op = _get_next_op();
+ _op->op = OP_SPLIT_COLLECTION2;
+ _op->cid = _get_coll_id(cid);
+ _op->dest_cid = _get_coll_id(destination);
+ _op->split_bits = bits;
+ _op->split_rem = rem;
+ data.ops = data.ops + 1;
+ }
+
+ /// Merge collection into another.
+ void merge_collection(
+ coll_t cid,
+ coll_t destination,
+ uint32_t bits) {
+ Op* _op = _get_next_op();
+ _op->op = OP_MERGE_COLLECTION;
+ _op->cid = _get_coll_id(cid);
+ _op->dest_cid = _get_coll_id(destination);
+ _op->split_bits = bits;
+ data.ops = data.ops + 1;
+ }
+
+ void collection_set_bits(
+ const coll_t &cid,
+ int bits) {
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_SET_BITS;
+ _op->cid = _get_coll_id(cid);
+ _op->split_bits = bits;
+ data.ops = data.ops + 1;
+ }
+
+ /// Set allocation hint for an object
+ /// make 0 values(expected_object_size, expected_write_size) noops for all implementations
+ void set_alloc_hint(
+ const coll_t &cid,
+ const ghobject_t &oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_SETALLOCHINT;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->expected_object_size = expected_object_size;
+ _op->expected_write_size = expected_write_size;
+ _op->alloc_hint_flags = flags;
+ data.ops = data.ops + 1;
+ }
+
+ void encode(bufferlist& bl) const {
+ //layout: data_bl + op_bl + coll_index + object_index + data
+ ENCODE_START(9, 9, bl);
+ encode(data_bl, bl);
+ encode(op_bl, bl);
+ encode(coll_index, bl);
+ encode(object_index, bl);
+ data.encode(bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl) {
+ DECODE_START(9, bl);
+ DECODE_OLDEST(9);
+
+ decode(data_bl, bl);
+ decode(op_bl, bl);
+ decode(coll_index, bl);
+ decode(object_index, bl);
+ data.decode(bl);
+ coll_id = coll_index.size();
+ object_id = object_index.size();
+
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f);
+ static void generate_test_instances(list<Transaction*>& o);
+ };
+
+ int queue_transaction(CollectionHandle& ch,
+ Transaction&& t,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) {
+ vector<Transaction> tls;
+ tls.push_back(std::move(t));
+ return queue_transactions(ch, tls, op, handle);
+ }
+
+ virtual int queue_transactions(
+ CollectionHandle& ch, vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) = 0;
+
+
+ public:
+ ObjectStore(CephContext* cct,
+ const std::string& path_) : path(path_), cct(cct) {}
+ virtual ~ObjectStore() {}
+
+ // no copying
+ explicit ObjectStore(const ObjectStore& o) = delete;
+ const ObjectStore& operator=(const ObjectStore& o) = delete;
+
+ // versioning
+ virtual int upgrade() {
+ return 0;
+ }
+
+ virtual void get_db_statistics(Formatter *f) { }
+ virtual void generate_db_histogram(Formatter *f) { }
+ virtual int flush_cache(ostream *os = NULL) { return -1; }
+ virtual void dump_perf_counters(Formatter *f) {}
+ virtual void dump_cache_stats(Formatter *f) {}
+ virtual void dump_cache_stats(ostream& os) {}
+
+ virtual string get_type() = 0;
+
+ // mgmt
+ virtual bool test_mount_in_use() = 0;
+ virtual int mount() = 0;
+ virtual int umount() = 0;
+ virtual int fsck(bool deep) {
+ return -EOPNOTSUPP;
+ }
+ virtual int repair(bool deep) {
+ return -EOPNOTSUPP;
+ }
+ virtual int quick_fix() {
+ return -EOPNOTSUPP;
+ }
+
+ virtual void set_cache_shards(unsigned num) { }
+
+ /**
+ * Returns 0 if the hobject is valid, -error otherwise
+ *
+ * Errors:
+ * -ENAMETOOLONG: locator/namespace/name too large
+ */
+ virtual int validate_hobject_key(const hobject_t &obj) const = 0;
+
+ virtual unsigned get_max_attr_name_length() = 0;
+ virtual int mkfs() = 0; // wipe
+ virtual int mkjournal() = 0; // journal only
+ virtual bool needs_journal() = 0; //< requires a journal
+ virtual bool wants_journal() = 0; //< prefers a journal
+ virtual bool allows_journal() = 0; //< allows a journal
+
+ /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda)
+ virtual int get_devices(std::set<string> *devls) {
+ return -EOPNOTSUPP;
+ }
+
+ /// true if a txn is readable immediately after it is queued.
+ virtual bool is_sync_onreadable() const {
+ return true;
+ }
+
+ /**
+ * is_rotational
+ *
+ * Check whether store is backed by a rotational (HDD) or non-rotational
+ * (SSD) device.
+ *
+ * This must be usable *before* the store is mounted.
+ *
+ * @return true for HDD, false for SSD
+ */
+ virtual bool is_rotational() {
+ return true;
+ }
+
+ /**
+ * is_journal_rotational
+ *
+ * Check whether journal is backed by a rotational (HDD) or non-rotational
+ * (SSD) device.
+ *
+ *
+ * @return true for HDD, false for SSD
+ */
+ virtual bool is_journal_rotational() {
+ return true;
+ }
+
+ virtual string get_default_device_class() {
+ return is_rotational() ? "hdd" : "ssd";
+ }
+
+ virtual int get_numa_node(
+ int *numa_node,
+ set<int> *nodes,
+ set<string> *failed) {
+ return -EOPNOTSUPP;
+ }
+
+
+ virtual bool can_sort_nibblewise() {
+ return false; // assume a backend cannot, unless it says otherwise
+ }
+
+ virtual int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) = 0;
+ virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) = 0;
+
+ virtual void collect_metadata(map<string,string> *pm) { }
+
+ /**
+ * write_meta - write a simple configuration key out-of-band
+ *
+ * Write a simple key/value pair for basic store configuration
+ * (e.g., a uuid or magic number) to an unopened/unmounted store.
+ * The default implementation writes this to a plaintext file in the
+ * path.
+ *
+ * A newline is appended.
+ *
+ * @param key key name (e.g., "fsid")
+ * @param value value (e.g., a uuid rendered as a string)
+ * @returns 0 for success, or an error code
+ */
+ virtual int write_meta(const std::string& key,
+ const std::string& value);
+
+ /**
+ * read_meta - read a simple configuration key out-of-band
+ *
+ * Read a simple key value to an unopened/mounted store.
+ *
+ * Trailing whitespace is stripped off.
+ *
+ * @param key key name
+ * @param value pointer to value string
+ * @returns 0 for success, or an error code
+ */
+ virtual int read_meta(const std::string& key,
+ std::string *value);
+
+ /**
+ * get ideal max value for collection_list()
+ *
+ * default to some arbitrary values; the implementation will override.
+ */
+ virtual int get_ideal_list_max() { return 64; }
+
+
+ /**
+ * get a collection handle
+ *
+ * Provide a trivial handle as a default to avoid converting legacy
+ * implementations.
+ */
+ virtual CollectionHandle open_collection(const coll_t &cid) = 0;
+
+ /**
+ * get a collection handle for a soon-to-be-created collection
+ *
+ * This handle must be used by queue_transaction that includes a
+ * create_collection call in order to become valid. It will become the
+ * reference to the created collection.
+ */
+ virtual CollectionHandle create_new_collection(const coll_t &cid) = 0;
+
+ /**
+ * set ContextQueue for a collection
+ *
+ * After that, oncommits of Transaction will queue into commit_queue.
+ * And osd ShardThread will call oncommits.
+ */
+ virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0;
+
+ /**
+ * Synchronous read operations
+ */
+
+ /**
+ * exists -- Test for existance of object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @returns true if object exists, false otherwise
+ */
+ virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0;
+ /**
+ * set_collection_opts -- set pool options for a collectioninformation for an object
+ *
+ * @param cid collection
+ * @param opts new collection options
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) = 0;
+
+ /**
+ * stat -- get information for an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param st output information for the object
+ * @param allow_eio if false, assert on -EIO operation failure
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int stat(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) = 0;
+ /**
+ * read -- read a byte range of data from an object
+ *
+ * Note: if reading from an offset past the end of the object, we
+ * return 0 (not, say, -EINVAL).
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param offset location offset of first byte to be read
+ * @param len number of bytes to be read
+ * @param bl output bufferlist
+ * @param op_flags is CEPH_OSD_OP_FLAG_*
+ * @returns number of bytes read on success, or negative error code on failure.
+ */
+ virtual int read(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0) = 0;
+
+ /**
+ * fiemap -- get extent map of data of an object
+ *
+ * Returns an encoded map of the extents of an object's data portion
+ * (map<offset,size>).
+ *
+ * A non-enlightened implementation is free to return the extent (offset, len)
+ * as the sole extent.
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param offset location offset of first byte to be read
+ * @param len number of bytes to be read
+ * @param bl output bufferlist for extent map information.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
+ uint64_t offset, size_t len, bufferlist& bl) = 0;
+ virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
+ uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) = 0;
+
+ /**
+ * getattr -- get an xattr of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param name name of attr to read
+ * @param value place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int getattr(CollectionHandle &c, const ghobject_t& oid,
+ const char *name, bufferptr& value) = 0;
+
+ /**
+ * getattr -- get an xattr of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param name name of attr to read
+ * @param value place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ int getattr(
+ CollectionHandle &c, const ghobject_t& oid,
+ const string& name, bufferlist& value) {
+ bufferptr bp;
+ int r = getattr(c, oid, name.c_str(), bp);
+ value.push_back(bp);
+ return r;
+ }
+
+ /**
+ * getattrs -- get all of the xattrs of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param aset place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ map<string,bufferptr>& aset) = 0;
+
+ /**
+ * getattrs -- get all of the xattrs of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param aset place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ map<string,bufferlist>& aset) {
+ map<string,bufferptr> bmap;
+ int r = getattrs(c, oid, bmap);
+ for (map<string,bufferptr>::iterator i = bmap.begin();
+ i != bmap.end();
+ ++i) {
+ aset[i->first].append(i->second);
+ }
+ return r;
+ }
+
+
+ // collections
+
+ /**
+ * list_collections -- get all of the collections known to this ObjectStore
+ *
+ * @param ls list of the collections in sorted order.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int list_collections(vector<coll_t>& ls) = 0;
+
+ /**
+ * does a collection exist?
+ *
+ * @param c collection
+ * @returns true if it exists, false otherwise
+ */
+ virtual bool collection_exists(const coll_t& c) = 0;
+
+ /**
+ * is a collection empty?
+ *
+ * @param c collection
+ * @param empty true if the specified collection is empty, false otherwise
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int collection_empty(CollectionHandle& c, bool *empty) = 0;
+
+ /**
+ * return the number of significant bits of the coll_t::pgid.
+ *
+ * This should return what the last create_collection or split_collection
+ * set. A legacy backend may return -EAGAIN if the value is unavailable
+ * (because we upgraded from an older version, e.g., FileStore).
+ */
+ virtual int collection_bits(CollectionHandle& c) = 0;
+
+
+ /**
+ * list contents of a collection that fall in the range [start, end) and no more than a specified many result
+ *
+ * @param c collection
+ * @param start list object that sort >= this value
+ * @param end list objects that sort < this value
+ * @param max return no more than this many results
+ * @param seq return no objects with snap < seq
+ * @param ls [out] result
+ * @param next [out] next item sorts >= this value
+ * @return zero on success, or negative error
+ */
+ virtual int collection_list(CollectionHandle &c,
+ const ghobject_t& start, const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next) = 0;
+
+ virtual int collection_list_legacy(CollectionHandle &c,
+ const ghobject_t& start,
+ const ghobject_t& end, int max,
+ std::vector<ghobject_t> *ls,
+ ghobject_t *next) {
+ return collection_list(c, start, end, max, ls, next);
+ }
+
+ /// OMAP
+ /// Get omap contents
+ virtual int omap_get(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ ) = 0;
+
+ /// Get omap header
+ virtual int omap_get_header(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) = 0;
+
+ /// Get keys defined on oid
+ virtual int omap_get_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ ) = 0;
+
+ /// Get key values
+ virtual int omap_get_values(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ ) = 0;
+
+ /// Filters keys into out which are defined on oid
+ virtual int omap_check_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ ) = 0;
+
+ /**
+ * Returns an object map iterator
+ *
+ * Warning! The returned iterator is an implicit lock on filestore
+ * operations in c. Do not use filestore methods on c while the returned
+ * iterator is live. (Filling in a transaction is no problem).
+ *
+ * @return iterator, null on error
+ */
+ virtual ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) = 0;
+
+ virtual int flush_journal() { return -EOPNOTSUPP; }
+
+ virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; }
+
+ virtual int snapshot(const string& name) { return -EOPNOTSUPP; }
+
+ /**
+ * Set and get internal fsid for this instance. No external data is modified
+ */
+ virtual void set_fsid(uuid_d u) = 0;
+ virtual uuid_d get_fsid() = 0;
+
+ /**
+ * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store
+ * - num objects - total (including witeouts) object count to measure used space for.
+ */
+ virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0;
+
+
+ // DEBUG
+ virtual void inject_data_error(const ghobject_t &oid) {}
+ virtual void inject_mdata_error(const ghobject_t &oid) {}
+
+ virtual void compact() {}
+ virtual bool has_builtin_csum() const {
+ return false;
+ }
+};
+WRITE_CLASS_ENCODER(ObjectStore::Transaction)
+WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData)
+
+ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx);
+
+#endif
diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc
new file mode 100644
index 00000000..ad390a1c
--- /dev/null
+++ b/src/os/Transaction.cc
@@ -0,0 +1,516 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ObjectStore.h"
+#include "common/Formatter.h"
+
+void ObjectStore::Transaction::dump(ceph::Formatter *f)
+{
+ f->open_array_section("ops");
+ iterator i = begin();
+ int op_num = 0;
+ bool stop_looping = false;
+ while (i.have_op() && !stop_looping) {
+ Transaction::Op *op = i.decode_op();
+ f->open_object_section("op");
+ f->dump_int("op_num", op_num);
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ f->dump_string("op_name", "nop");
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "touch");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "write");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_unsigned("length", len);
+ f->dump_unsigned("offset", off);
+ f->dump_unsigned("bufferlist length", bl.length());
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ f->dump_string("op_name", "zero");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_unsigned("offset", off);
+ f->dump_unsigned("length", len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ f->dump_string("op_name", "trim_cache");
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ f->dump_string("op_name", "truncate");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_unsigned("offset", off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "remove");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "setattr");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_string("name", name);
+ f->dump_unsigned("length", bl.length());
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ f->dump_string("op_name", "setattrs");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->open_object_section("attr_lens");
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ f->dump_unsigned(p->first.c_str(), p->second.length());
+ }
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ f->dump_string("op_name", "rmattr");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_string("name", name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "rmattrs");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ f->dump_string("op_name", "clone");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("src_oid") << oid;
+ f->dump_stream("dst_oid") << noid;
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ f->dump_string("op_name", "clonerange");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("src_oid") << oid;
+ f->dump_stream("dst_oid") << noid;
+ f->dump_unsigned("offset", off);
+ f->dump_unsigned("len", len);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ f->dump_string("op_name", "clonerange2");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("src_oid") << oid;
+ f->dump_stream("dst_oid") << noid;
+ f->dump_unsigned("src_offset", srcoff);
+ f->dump_unsigned("len", len);
+ f->dump_unsigned("dst_offset", dstoff);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ f->dump_string("op_name", "mkcoll");
+ f->dump_stream("collection") << cid;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ using ceph::decode;
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t type = op->hint_type;
+ f->dump_string("op_name", "coll_hint");
+ f->dump_stream("collection") << cid;
+ f->dump_unsigned("type", type);
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ f->dump_unsigned("pg_num", pg_num);
+ f->dump_unsigned("expected_num_objects", num_objs);
+ }
+ }
+ break;
+
+ case Transaction::OP_COLL_SET_BITS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ f->dump_string("op_name", "coll_set_bits");
+ f->dump_stream("collection") << cid;
+ f->dump_unsigned("bits", op->split_bits);
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ f->dump_string("op_name", "rmcoll");
+ f->dump_stream("collection") << cid;
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "collection_add");
+ f->dump_stream("src_collection") << ocid;
+ f->dump_stream("dst_collection") << ncid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "collection_remove");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->open_object_section("collection_move");
+ f->dump_stream("src_collection") << ocid;
+ f->dump_stream("dst_collection") << ncid;
+ f->dump_stream("oid") << oid;
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "collection_setattr");
+ f->dump_stream("collection") << cid;
+ f->dump_string("name", name);
+ f->dump_unsigned("length", bl.length());
+ }
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ string name = i.decode_string();
+ f->dump_string("op_name", "collection_rmattr");
+ f->dump_stream("collection") << cid;
+ f->dump_string("name", name);
+ }
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ f->dump_string("op_name", "collection_rename");
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "omap_clear");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ map<string, bufferlist> aset;
+ i.decode_attrset(aset);
+ f->dump_string("op_name", "omap_setkeys");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->open_object_section("attr_lens");
+ for (map<string, bufferlist>::iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ f->dump_unsigned(p->first.c_str(), p->second.length());
+ }
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ set<string> keys;
+ i.decode_keyset(keys);
+ f->dump_string("op_name", "omap_rmkeys");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->open_array_section("attrs");
+ for (auto& k : keys) {
+ f->dump_string("", k.c_str());
+ }
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "omap_setheader");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_stream("header_length") << bl.length();
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ f->dump_string("op_name", "op_split_collection_create");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("bits") << bits;
+ f->dump_stream("rem") << rem;
+ f->dump_stream("dest") << dest;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ f->dump_string("op_name", "op_split_collection");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("bits") << bits;
+ f->dump_stream("rem") << rem;
+ f->dump_stream("dest") << dest;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ coll_t dest = i.get_cid(op->dest_cid);
+ f->dump_string("op_name", "op_merge_collection");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("dest") << dest;
+ f->dump_stream("bits") << bits;
+ }
+ break;
+
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ f->dump_string("op_name", "op_omap_rmkeyrange");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_string("first", first);
+ f->dump_string("last", last);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ coll_t old_cid = i.get_cid(op->cid);
+ ghobject_t old_oid = i.get_oid(op->oid);
+ coll_t new_cid = i.get_cid(op->dest_cid);
+ ghobject_t new_oid = i.get_oid(op->dest_oid);
+ f->dump_string("op_name", "op_coll_move_rename");
+ f->dump_stream("old_collection") << old_cid;
+ f->dump_stream("old_oid") << old_oid;
+ f->dump_stream("new_collection") << new_cid;
+ f->dump_stream("new_oid") << new_oid;
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t old_oid = i.get_oid(op->oid);
+ ghobject_t new_oid = i.get_oid(op->dest_oid);
+ f->dump_string("op_name", "op_coll_move_rename");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("old_oid") << old_oid;
+ f->dump_stream("new_oid") << new_oid;
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ f->dump_string("op_name", "op_setallochint");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_stream("expected_object_size") << expected_object_size;
+ f->dump_stream("expected_write_size") << expected_write_size;
+ }
+ break;
+
+ default:
+ f->dump_string("op_name", "unknown");
+ f->dump_unsigned("op_code", op->op);
+ stop_looping = true;
+ break;
+ }
+ f->close_section();
+ op_num++;
+ }
+ f->close_section();
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transaction*>& o)
+{
+ o.push_back(new Transaction);
+
+ Transaction *t = new Transaction;
+ t->nop();
+ o.push_back(t);
+
+ t = new Transaction;
+ coll_t c(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+ coll_t c2(spg_t(pg_t(4,5), shard_id_t::NO_SHARD));
+ ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
+ ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
+ ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, ""));
+ t->touch(c, o1);
+ bufferlist bl;
+ bl.append("some data");
+ t->write(c, o1, 1, bl.length(), bl);
+ t->zero(c, o1, 22, 33);
+ t->truncate(c, o1, 99);
+ t->remove(c, o1);
+ o.push_back(t);
+
+ t = new Transaction;
+ t->setattr(c, o1, "key", bl);
+ map<string,bufferptr> m;
+ m["a"] = buffer::copy("this", 4);
+ m["b"] = buffer::copy("that", 4);
+ t->setattrs(c, o1, m);
+ t->rmattr(c, o1, "b");
+ t->rmattrs(c, o1);
+
+ t->clone(c, o1, o2);
+ t->clone(c, o1, o3);
+ t->clone_range(c, o1, o2, 1, 12, 99);
+
+ t->create_collection(c, 12);
+ t->collection_move_rename(c, o2, c2, o3);
+ t->remove_collection(c);
+ o.push_back(t);
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc
new file mode 100644
index 00000000..0ac9a15a
--- /dev/null
+++ b/src/os/bluestore/Allocator.cc
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Allocator.h"
+#include "StupidAllocator.h"
+#include "BitmapAllocator.h"
+#include "AvlAllocator.h"
+#include "HybridAllocator.h"
+#include "common/debug.h"
+#include "common/admin_socket.h"
+#define dout_subsys ceph_subsys_bluestore
+
+class Allocator::SocketHook : public AdminSocketHook {
+ Allocator *alloc;
+
+ friend class Allocator;
+ std::string name;
+public:
+ explicit SocketHook(Allocator *alloc,
+ const std::string& _name) :
+ alloc(alloc), name(_name)
+ {
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ if (name.empty()) {
+ name = to_string((uintptr_t)this);
+ }
+ if (admin_socket) {
+ int r = admin_socket->register_command(("bluestore allocator dump " + name).c_str(),
+ ("bluestore allocator dump " + name).c_str(),
+ this,
+ "dump allocator free regions");
+ if (r != 0)
+ alloc = nullptr; //some collision, disable
+ if (alloc) {
+ r = admin_socket->register_command(("bluestore allocator score " + name).c_str(),
+ ("bluestore allocator score " + name).c_str(),
+ this,
+ "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(("bluestore allocator fragmentation " + name).c_str(),
+ ("bluestore allocator fragmentation " + name).c_str(),
+ this,
+ "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
+ ceph_assert(r == 0);
+ }
+ }
+ }
+ ~SocketHook()
+ {
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ if (admin_socket && alloc) {
+ int r = admin_socket->unregister_command(("bluestore allocator dump " + name).c_str());
+ ceph_assert(r == 0);
+ r = admin_socket->unregister_command(("bluestore allocator score " + name).c_str());
+ ceph_assert(r == 0);
+ r = admin_socket->unregister_command(("bluestore allocator fragmentation " + name).c_str());
+ ceph_assert(r == 0);
+ }
+ }
+
+ bool call(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, bufferlist& out) override {
+ stringstream ss;
+ bool r = true;
+ if (command == "bluestore allocator dump " + name) {
+ Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
+ f->open_array_section("free_regions");
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ ceph_assert(len > 0);
+ f->open_object_section("free");
+ char off_hex[30];
+ char len_hex[30];
+ snprintf(off_hex, sizeof(off_hex) - 1, "0x%lx", off);
+ snprintf(len_hex, sizeof(len_hex) - 1, "0x%lx", len);
+ f->dump_string("offset", off_hex);
+ f->dump_string("length", len_hex);
+ f->close_section();
+ };
+ alloc->dump(iterated_allocation);
+
+
+ f->close_section();
+ f->flush(ss);
+ } else if (command == "bluestore allocator score " + name) {
+ Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
+ f->open_object_section("fragmentation_score");
+ f->dump_float("fragmentation_rating", alloc->get_fragmentation_score());
+ f->close_section();
+ f->flush(ss);
+ delete f;
+ } else if (command == "bluestore allocator fragmentation " + name) {
+ Formatter* f = Formatter::create(format, "json-pretty", "json-pretty");
+ f->open_object_section("fragmentation");
+ f->dump_float("fragmentation_rating", alloc->get_fragmentation());
+ f->close_section();
+ f->flush(ss);
+ delete f;
+ } else {
+ ss << "Invalid command" << std::endl;
+ r = false;
+ }
+ out.append(ss);
+ return r;
+ }
+
+};
+Allocator::Allocator(const std::string& name)
+{
+ asok_hook = new SocketHook(this, name);
+}
+
+
+Allocator::~Allocator()
+{
+ delete asok_hook;
+}
+
+const string& Allocator::get_name() const {
+ return asok_hook->name;
+}
+
+Allocator *Allocator::create(CephContext* cct, string type,
+ int64_t size, int64_t block_size, const std::string& name)
+{
+ Allocator* alloc = nullptr;
+ if (type == "stupid") {
+ alloc = new StupidAllocator(cct, name, block_size);
+ } else if (type == "bitmap") {
+ alloc = new BitmapAllocator(cct, size, block_size, name);
+ } else if (type == "avl") {
+ return new AvlAllocator(cct, size, block_size, name);
+ } else if (type == "hybrid") {
+ return new HybridAllocator(cct, size, block_size,
+ cct->_conf.get_val<uint64_t>("bluestore_hybrid_alloc_mem_cap"),
+ name);
+ }
+ if (alloc == nullptr) {
+ lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
+ << type << dendl;
+ }
+ return alloc;
+}
+
+void Allocator::release(const PExtentVector& release_vec)
+{
+ interval_set<uint64_t> release_set;
+ for (auto e : release_vec) {
+ release_set.insert(e.offset, e.length);
+ }
+ release(release_set);
+}
+
+/**
+ * Gives fragmentation a numeric value.
+ *
+ * Following algorithm applies value to each existing free unallocated block.
+ * Value of single block is a multiply of size and per-byte-value.
+ * Per-byte-value is greater for larger blocks.
+ * Assume block size X has value per-byte p; then block size 2*X will have per-byte value 1.1*p.
+ *
+ * This could be expressed in logarithms, but for speed this is interpolated inside ranges.
+ * [1] [2..3] [4..7] [8..15] ...
+ * ^ ^ ^ ^
+ * 1.1 1.1^2 1.1^3 1.1^4 ...
+ *
+ * Final score is obtained by proportion between score that would have been obtained
+ * in condition of absolute fragmentation and score in no fragmentation at all.
+ */
+double Allocator::get_fragmentation_score()
+{
+ // this value represents how much worth is 2X bytes in one chunk then in X + X bytes
+ static const double double_size_worth = 1.1 ;
+ std::vector<double> scales{1};
+ double score_sum = 0;
+ size_t sum = 0;
+
+ auto get_score = [&](size_t v) -> double {
+ size_t sc = sizeof(v) * 8 - clz(v) - 1; //assign to grade depending on log2(len)
+ while (scales.size() <= sc + 1) {
+ //unlikely expand scales vector
+ scales.push_back(scales[scales.size() - 1] * double_size_worth);
+ }
+
+ size_t sc_shifted = size_t(1) << sc;
+ double x = double(v - sc_shifted) / sc_shifted; //x is <0,1) in its scale grade
+ // linear extrapolation in its scale grade
+ double score = (sc_shifted ) * scales[sc] * (1-x) +
+ (sc_shifted * 2) * scales[sc+1] * x;
+ return score;
+ };
+
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ ceph_assert(len > 0);
+ score_sum += get_score(len);
+ sum += len;
+ };
+ dump(iterated_allocation);
+
+
+ double ideal = get_score(sum);
+ double terrible = sum * get_score(1);
+ return (ideal - score_sum) / (ideal - terrible);
+}
diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h
new file mode 100644
index 00000000..7b571395
--- /dev/null
+++ b/src/os/bluestore/Allocator.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_OS_BLUESTORE_ALLOCATOR_H
+#define CEPH_OS_BLUESTORE_ALLOCATOR_H
+
+#include <ostream>
+#include "include/ceph_assert.h"
+#include "os/bluestore/bluestore_types.h"
+#include <functional>
+
+class Allocator {
+public:
+ explicit Allocator(const std::string& name);
+ virtual ~Allocator();
+
+ /*
+ * Allocate required number of blocks in n number of extents.
+ * Min and Max number of extents are limited by:
+ * a. alloc unit
+ * b. max_alloc_size.
+ * as no extent can be lesser than alloc_unit and greater than max_alloc size.
+ * Apart from that extents can vary between these lower and higher limits according
+ * to free block search algorithm and availability of contiguous space.
+ */
+ virtual int64_t allocate(uint64_t want_size, uint64_t alloc_unit,
+ uint64_t max_alloc_size, int64_t hint,
+ PExtentVector *extents) = 0;
+
+ int64_t allocate(uint64_t want_size, uint64_t alloc_unit,
+ int64_t hint, PExtentVector *extents) {
+ return allocate(want_size, alloc_unit, want_size, hint, extents);
+ }
+
+ /* Bulk release. Implementations may override this method to handle the whole
+ * set at once. This could save e.g. unnecessary mutex dance. */
+ virtual void release(const interval_set<uint64_t>& release_set) = 0;
+ void release(const PExtentVector& release_set);
+
+ virtual void dump() = 0;
+ virtual void dump(std::function<void(uint64_t offset, uint64_t length)> notify) = 0;
+
+ virtual void init_add_free(uint64_t offset, uint64_t length) = 0;
+ virtual void init_rm_free(uint64_t offset, uint64_t length) = 0;
+
+ virtual uint64_t get_free() = 0;
+ virtual double get_fragmentation()
+ {
+ return 0.0;
+ }
+ virtual double get_fragmentation_score();
+ virtual void shutdown() = 0;
+
+ static Allocator *create(CephContext* cct, string type, int64_t size,
+ int64_t block_size, const std::string& name = "");
+
+ const string& get_name() const;
+
+private:
+ class SocketHook;
+ SocketHook* asok_hook = nullptr;
+};
+
+#endif
diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc
new file mode 100755
index 00000000..0ac70baa
--- /dev/null
+++ b/src/os/bluestore/AvlAllocator.cc
@@ -0,0 +1,430 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "AvlAllocator.h"
+
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "AvlAllocator "
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);
+
+namespace {
+ // a light-weight "range_seg_t", which only used as the key when searching in
+ // range_tree and range_size_tree
+ struct range_t {
+ uint64_t start;
+ uint64_t end;
+ };
+}
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+template<class Tree>
+uint64_t AvlAllocator::_block_picker(const Tree& t,
+ uint64_t *cursor,
+ uint64_t size,
+ uint64_t align)
+{
+ const auto compare = t.key_comp();
+ for (auto rs = t.lower_bound(range_t{*cursor, size}, compare);
+ rs != t.end(); ++rs) {
+ uint64_t offset = p2roundup(rs->start, align);
+ if (offset + size <= rs->end) {
+ *cursor = offset + size;
+ return offset;
+ }
+ }
+ /*
+ * If we know we've searched the whole tree (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0) {
+ return -1ULL;
+ }
+ *cursor = 0;
+ return _block_picker(t, cursor, size, align);
+}
+
+void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
+{
+ ceph_assert(size != 0);
+
+ uint64_t end = start + size;
+
+ auto rs_after = range_tree.upper_bound(range_t{start, end},
+ range_tree.key_comp());
+
+ /* Make sure we don't overlap with either of our neighbors */
+ auto rs_before = range_tree.end();
+ if (rs_after != range_tree.begin()) {
+ rs_before = std::prev(rs_after);
+ }
+
+ bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
+ bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);
+
+ if (merge_before && merge_after) {
+ _range_size_tree_rm(*rs_before);
+ _range_size_tree_rm(*rs_after);
+ rs_after->start = rs_before->start;
+ range_tree.erase_and_dispose(rs_before, dispose_rs{});
+ _range_size_tree_try_insert(*rs_after);
+ } else if (merge_before) {
+ _range_size_tree_rm(*rs_before);
+ rs_before->end = end;
+ _range_size_tree_try_insert(*rs_before);
+ } else if (merge_after) {
+ _range_size_tree_rm(*rs_after);
+ rs_after->start = start;
+ _range_size_tree_try_insert(*rs_after);
+ } else {
+ _try_insert_range(start, end, &rs_after);
+ }
+}
+
+void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
+ AvlAllocator::range_tree_t::iterator& rs)
+{
+ bool left_over = (rs->start != start);
+ bool right_over = (rs->end != end);
+
+ _range_size_tree_rm(*rs);
+
+ if (left_over && right_over) {
+ auto old_right_end = rs->end;
+ auto insert_pos = rs;
+ ceph_assert(insert_pos != range_tree.end());
+ ++insert_pos;
+ rs->end = start;
+
+ // Insert tail first to be sure insert_pos hasn't been disposed.
+ // This woulnd't dispose rs though since it's out of range_size_tree.
+ // Don't care about a small chance of 'not-the-best-choice-for-removal' case
+ // which might happen if rs has the lowest size.
+ _try_insert_range(end, old_right_end, &insert_pos);
+ _range_size_tree_try_insert(*rs);
+
+ } else if (left_over) {
+ rs->end = start;
+ _range_size_tree_try_insert(*rs);
+ } else if (right_over) {
+ rs->start = end;
+ _range_size_tree_try_insert(*rs);
+ } else {
+ range_tree.erase_and_dispose(rs, dispose_rs{});
+ }
+}
+
+void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
+{
+ uint64_t end = start + size;
+
+ ceph_assert(size != 0);
+ ceph_assert(size <= num_free);
+
+ auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
+ /* Make sure we completely overlap with someone */
+ ceph_assert(rs != range_tree.end());
+ ceph_assert(rs->start <= start);
+ ceph_assert(rs->end >= end);
+
+ _process_range_removal(start, end, rs);
+}
+
+void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
+ std::function<void(uint64_t, uint64_t, bool)> cb)
+{
+ uint64_t end = start + size;
+
+ ceph_assert(size != 0);
+
+ auto rs = range_tree.find(range_t{ start, end },
+ range_tree.key_comp());
+
+ if (rs == range_tree.end() || rs->start >= end) {
+ cb(start, size, false);
+ return;
+ }
+
+ do {
+
+ auto next_rs = rs;
+ ++next_rs;
+
+ if (start < rs->start) {
+ cb(start, rs->start - start, false);
+ start = rs->start;
+ }
+ auto range_end = std::min(rs->end, end);
+ _process_range_removal(start, range_end, rs);
+ cb(start, range_end - start, true);
+ start = range_end;
+
+ rs = next_rs;
+ } while (rs != range_tree.end() && rs->start < end && start < end);
+ if (start < end) {
+ cb(start, end - start, false);
+ }
+}
+
+int64_t AvlAllocator::_allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint, // unused, for now!
+ PExtentVector* extents)
+{
+ uint64_t allocated = 0;
+ while (allocated < want) {
+ uint64_t offset, length;
+ int r = _allocate(std::min(max_alloc_size, want - allocated),
+ unit, &offset, &length);
+ if (r < 0) {
+ // Allocation failed.
+ break;
+ }
+ extents->emplace_back(offset, length);
+ allocated += length;
+ }
+ return allocated ? allocated : -ENOSPC;
+}
+
+int AvlAllocator::_allocate(
+ uint64_t size,
+ uint64_t unit,
+ uint64_t *offset,
+ uint64_t *length)
+{
+ uint64_t max_size = 0;
+ if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
+ max_size = p->end - p->start;
+ }
+
+ bool force_range_size_alloc = false;
+ if (max_size < size) {
+ if (max_size < unit) {
+ return -ENOSPC;
+ }
+ size = p2align(max_size, unit);
+ ceph_assert(size > 0);
+ force_range_size_alloc = true;
+ }
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area (i.e. same cursor bucket) but it does
+ * not guarantee that other allocations sizes may exist in the same
+ * region.
+ */
+ const uint64_t align = size & -size;
+ ceph_assert(align != 0);
+ uint64_t *cursor = &lbas[cbits(align) - 1];
+
+ const int free_pct = num_free * 100 / num_total;
+ uint64_t start = 0;
+ /*
+ * If we're running low on space switch to using the size
+ * sorted AVL tree (best-fit).
+ */
+ if (force_range_size_alloc ||
+ max_size < range_size_alloc_threshold ||
+ free_pct < range_size_alloc_free_pct) {
+ *cursor = 0;
+ do {
+ start = _block_picker(range_size_tree, cursor, size, unit);
+ if (start != -1ULL || !force_range_size_alloc) {
+ break;
+ }
+ // try to collect smaller extents as we could fail to retrieve
+ // that large block due to misaligned extents
+ size = p2align(size >> 1, unit);
+ } while (size >= unit);
+ } else {
+ start = _block_picker(range_tree, cursor, size, unit);
+ }
+ if (start == -1ULL) {
+ return -ENOSPC;
+ }
+
+ _remove_from_tree(start, size);
+
+ *offset = start;
+ *length = size;
+ return 0;
+}
+
+void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
+{
+ for (auto p = release_set.begin(); p != release_set.end(); ++p) {
+ const auto offset = p.get_start();
+ const auto length = p.get_len();
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _add_to_tree(offset, length);
+ }
+}
+
+void AvlAllocator::_release(const PExtentVector& release_set) {
+ for (auto& e : release_set) {
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << e.offset
+ << " length 0x" << e.length
+ << std::dec << dendl;
+ _add_to_tree(e.offset, e.length);
+ }
+}
+
+void AvlAllocator::_shutdown()
+{
+ range_size_tree.clear();
+ range_tree.clear_and_dispose(dispose_rs{});
+}
+
+AvlAllocator::AvlAllocator(CephContext* cct,
+ int64_t device_size,
+ int64_t block_size,
+ uint64_t max_mem,
+ const std::string& name) :
+ Allocator(name),
+ num_total(device_size),
+ block_size(block_size),
+ range_size_alloc_threshold(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
+ range_size_alloc_free_pct(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+ range_count_cap(max_mem / sizeof(range_seg_t)),
+ cct(cct)
+{}
+
+AvlAllocator::AvlAllocator(CephContext* cct,
+ int64_t device_size,
+ int64_t block_size,
+ const std::string& name) :
+ Allocator(name),
+ num_total(device_size),
+ block_size(block_size),
+ range_size_alloc_threshold(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
+ range_size_alloc_free_pct(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+ cct(cct)
+{}
+
+AvlAllocator::~AvlAllocator()
+{
+ shutdown();
+}
+
+int64_t AvlAllocator::allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint, // unused, for now!
+ PExtentVector* extents)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " want 0x" << want
+ << " unit 0x" << unit
+ << " max_alloc_size 0x" << max_alloc_size
+ << " hint 0x" << hint
+ << std::dec << dendl;
+ ceph_assert(isp2(unit));
+ ceph_assert(want % unit == 0);
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want;
+ }
+ if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+ max_alloc_size >= cap) {
+ max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
+ }
+ std::lock_guard l(lock);
+ return _allocate(want, unit, max_alloc_size, hint, extents);
+}
+
+void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
+ std::lock_guard l(lock);
+ _release(release_set);
+}
+
+uint64_t AvlAllocator::get_free()
+{
+ std::lock_guard l(lock);
+ return num_free;
+}
+
+double AvlAllocator::get_fragmentation()
+{
+ std::lock_guard l(lock);
+ return _get_fragmentation();
+}
+
+void AvlAllocator::dump()
+{
+ std::lock_guard l(lock);
+ _dump();
+}
+
+void AvlAllocator::_dump() const
+{
+ ldout(cct, 0) << __func__ << " range_tree: " << dendl;
+ for (auto& rs : range_tree) {
+ ldout(cct, 0) << std::hex
+ << "0x" << rs.start << "~" << rs.end
+ << std::dec
+ << dendl;
+ }
+
+ ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
+ for (auto& rs : range_size_tree) {
+ ldout(cct, 0) << std::hex
+ << "0x" << rs.start << "~" << rs.end
+ << std::dec
+ << dendl;
+ }
+}
+
+void AvlAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ for (auto& rs : range_tree) {
+ notify(rs.start, rs.end - rs.start);
+ }
+}
+
+void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _add_to_tree(offset, length);
+}
+
+void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _remove_from_tree(offset, length);
+}
+
+void AvlAllocator::shutdown()
+{
+ std::lock_guard l(lock);
+ _shutdown();
+}
diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h
new file mode 100755
index 00000000..bcc3f8b0
--- /dev/null
+++ b/src/os/bluestore/AvlAllocator.h
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+#include <boost/intrusive/avl_set.hpp>
+
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+
+struct range_seg_t {
+ MEMPOOL_CLASS_HELPERS(); ///< memory monitoring
+ uint64_t start; ///< starting offset of this segment
+ uint64_t end; ///< ending offset (non-inclusive)
+
+ range_seg_t(uint64_t start, uint64_t end)
+ : start{start},
+ end{end}
+ {}
+ // Tree is sorted by offset, greater offsets at the end of the tree.
+ struct before_t {
+ template<typename KeyLeft, typename KeyRight>
+ bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
+ return lhs.end <= rhs.start;
+ }
+ };
+ boost::intrusive::avl_set_member_hook<> offset_hook;
+
+ // Tree is sorted by size, larger sizes at the end of the tree.
+ struct shorter_t {
+ template<typename KeyType>
+ bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
+ auto lhs_size = lhs.end - lhs.start;
+ auto rhs_size = rhs.end - rhs.start;
+ if (lhs_size < rhs_size) {
+ return true;
+ } else if (lhs_size > rhs_size) {
+ return false;
+ } else {
+ return lhs.start < rhs.start;
+ }
+ }
+ };
+ inline uint64_t length() const {
+ return end - start;
+ }
+ boost::intrusive::avl_set_member_hook<> size_hook;
+};
+
+class AvlAllocator : public Allocator {
+ struct dispose_rs {
+ void operator()(range_seg_t* p)
+ {
+ delete p;
+ }
+ };
+
+protected:
+ /*
+ * ctor intended for the usage from descendant class(es) which
+ * provides handling for spilled over entries
+ * (when entry count >= max_entries)
+ */
+ AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+ uint64_t max_mem,
+ const std::string& name);
+
+public:
+ AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+ const std::string& name);
+ ~AvlAllocator();
+ int64_t allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents) override;
+ void release(const interval_set<uint64_t>& release_set) override;
+ int64_t get_capacity() const {
+ return num_total;
+ }
+
+ uint64_t get_block_size() const {
+ return block_size;
+ }
+ uint64_t get_free() override;
+ double get_fragmentation() override;
+
+ void dump() override;
+ void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+ void shutdown() override;
+
+private:
+ template<class Tree>
+ uint64_t _block_picker(const Tree& t, uint64_t *cursor, uint64_t size,
+ uint64_t align);
+ int _allocate(
+ uint64_t size,
+ uint64_t unit,
+ uint64_t *offset,
+ uint64_t *length);
+
+ using range_tree_t =
+ boost::intrusive::avl_set<
+ range_seg_t,
+ boost::intrusive::compare<range_seg_t::before_t>,
+ boost::intrusive::member_hook<
+ range_seg_t,
+ boost::intrusive::avl_set_member_hook<>,
+ &range_seg_t::offset_hook>>;
+ range_tree_t range_tree; ///< main range tree
+ /*
+ * The range_size_tree should always contain the
+ * same number of segments as the range_tree.
+ * The only difference is that the range_size_tree
+ * is ordered by segment sizes.
+ */
+ using range_size_tree_t =
+ boost::intrusive::avl_multiset<
+ range_seg_t,
+ boost::intrusive::compare<range_seg_t::shorter_t>,
+ boost::intrusive::member_hook<
+ range_seg_t,
+ boost::intrusive::avl_set_member_hook<>,
+ &range_seg_t::size_hook>,
+ boost::intrusive::constant_time_size<true>>;
+ range_size_tree_t range_size_tree;
+
+ const int64_t num_total; ///< device size
+ const uint64_t block_size; ///< block size
+ uint64_t num_free = 0; ///< total bytes in freelist
+
+ /*
+ * This value defines the number of elements in the ms_lbas array.
+ * The value of 64 was chosen as it covers all power of 2 buckets
+ * up to UINT64_MAX.
+ * This is the equivalent of highest-bit of UINT64_MAX.
+ */
+ static constexpr unsigned MAX_LBAS = 64;
+ uint64_t lbas[MAX_LBAS] = {0};
+
+ /*
+ * Minimum size which forces the dynamic allocator to change
+ * it's allocation strategy. Once the allocator cannot satisfy
+ * an allocation of this size then it switches to using more
+ * aggressive strategy (i.e search by size rather than offset).
+ */
+ uint64_t range_size_alloc_threshold = 0;
+ /*
+ * The minimum free space, in percent, which must be available
+ * in allocator to continue allocations in a first-fit fashion.
+ * Once the allocator's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+ int range_size_alloc_free_pct = 0;
+
+ /*
+ * Max amount of range entries allowed. 0 - unlimited
+ */
+ uint64_t range_count_cap = 0;
+
+ void _range_size_tree_rm(range_seg_t& r) {
+ ceph_assert(num_free >= r.length());
+ num_free -= r.length();
+ range_size_tree.erase(r);
+
+ }
+ void _range_size_tree_try_insert(range_seg_t& r) {
+ if (_try_insert_range(r.start, r.end)) {
+ range_size_tree.insert(r);
+ num_free += r.length();
+ } else {
+ range_tree.erase_and_dispose(r, dispose_rs{});
+ }
+ }
+ bool _try_insert_range(uint64_t start,
+ uint64_t end,
+ range_tree_t::iterator* insert_pos = nullptr) {
+ bool res = !range_count_cap || range_size_tree.size() < range_count_cap;
+ bool remove_lowest = false;
+ if (!res) {
+ if (end - start > _lowest_size_available()) {
+ remove_lowest = true;
+ res = true;
+ }
+ }
+ if (!res) {
+ _spillover_range(start, end);
+ } else {
+ // NB: we should do insertion before the following removal
+ // to avoid potential iterator disposal insertion might depend on.
+ if (insert_pos) {
+ auto new_rs = new range_seg_t{ start, end };
+ range_tree.insert_before(*insert_pos, *new_rs);
+ range_size_tree.insert(*new_rs);
+ num_free += new_rs->length();
+ }
+ if (remove_lowest) {
+ auto r = range_size_tree.begin();
+ _range_size_tree_rm(*r);
+ _spillover_range(r->start, r->end);
+ range_tree.erase_and_dispose(*r, dispose_rs{});
+ }
+ }
+ return res;
+ }
+ virtual void _spillover_range(uint64_t start, uint64_t end) {
+ // this should be overriden when range count cap is present,
+ // i.e. (range_count_cap > 0)
+ ceph_assert(false);
+ }
+protected:
+ // called when extent to be released/marked free
+ virtual void _add_to_tree(uint64_t start, uint64_t size);
+
+protected:
+ CephContext* cct;
+ std::mutex lock;
+
+ double _get_fragmentation() const {
+ auto free_blocks = p2align(num_free, block_size) / block_size;
+ if (free_blocks <= 1) {
+ return .0;
+ }
+ return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
+ }
+ void _dump() const;
+
+ uint64_t _lowest_size_available() {
+ auto rs = range_size_tree.begin();
+ return rs != range_size_tree.end() ? rs->length() : 0;
+ }
+
+ int64_t _allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents);
+
+ void _release(const interval_set<uint64_t>& release_set);
+ void _release(const PExtentVector& release_set);
+ void _shutdown();
+
+ void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
+ void _remove_from_tree(uint64_t start, uint64_t size);
+ void _try_remove_from_tree(uint64_t start, uint64_t size,
+ std::function<void(uint64_t offset, uint64_t length, bool found)> cb);
+
+ uint64_t _get_free() const {
+ return num_free;
+ }
+};
diff --git a/src/os/bluestore/BitmapAllocator.cc b/src/os/bluestore/BitmapAllocator.cc
new file mode 100755
index 00000000..c24a333a
--- /dev/null
+++ b/src/os/bluestore/BitmapAllocator.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BitmapAllocator.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "fbmap_alloc " << this << " "
+
+BitmapAllocator::BitmapAllocator(CephContext* _cct,
+ int64_t capacity,
+ int64_t alloc_unit,
+ const std::string& name) :
+ Allocator(name),
+ cct(_cct)
+{
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/"
+ << alloc_unit << std::dec << dendl;
+ _init(capacity, alloc_unit, false);
+}
+
+int64_t BitmapAllocator::allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents)
+{
+ uint64_t allocated = 0;
+ size_t old_size = extents->size();
+ ldout(cct, 10) << __func__ << std::hex << " 0x" << want_size
+ << "/" << alloc_unit << "," << max_alloc_size << "," << hint
+ << std::dec << dendl;
+
+
+ _allocate_l2(want_size, alloc_unit, max_alloc_size, hint,
+ &allocated, extents);
+ if (!allocated) {
+ return -ENOSPC;
+ }
+ for (auto i = old_size; i < extents->size(); ++i) {
+ auto& e = (*extents)[i];
+ ldout(cct, 10) << __func__
+ << " extent: 0x" << std::hex << e.offset << "~" << e.length
+ << "/" << alloc_unit << "," << max_alloc_size << "," << hint
+ << std::dec << dendl;
+ }
+ return int64_t(allocated);
+}
+
+void BitmapAllocator::release(
+ const interval_set<uint64_t>& release_set)
+{
+ for (auto r : release_set) {
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << r.first << "~" << r.second
+ << std::dec << dendl;
+ }
+ _free_l2(release_set);
+ ldout(cct, 10) << __func__ << " done" << dendl;
+}
+
+
+void BitmapAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+
+ auto mas = get_min_alloc_size();
+ uint64_t offs = round_up_to(offset, mas);
+ uint64_t l = p2align(offset + length - offs, mas);
+
+ _mark_free(offs, l);
+ ldout(cct, 10) << __func__ << " done" << dendl;
+}
+void BitmapAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto mas = get_min_alloc_size();
+ uint64_t offs = round_up_to(offset, mas);
+ uint64_t l = p2align(offset + length - offs, mas);
+ _mark_allocated(offs, l);
+ ldout(cct, 10) << __func__ << " done" << dendl;
+}
+
+void BitmapAllocator::shutdown()
+{
+ ldout(cct, 1) << __func__ << dendl;
+ _shutdown();
+}
+
+void BitmapAllocator::dump()
+{
+ // bin -> interval count
+ std::map<size_t, size_t> bins_overall;
+ collect_stats(bins_overall);
+ auto it = bins_overall.begin();
+ while (it != bins_overall.end()) {
+ ldout(cct, 0) << __func__
+ << " bin " << it->first
+ << "(< " << byte_u_t((1 << (it->first + 1)) * get_min_alloc_size()) << ")"
+ << " : " << it->second << " extents"
+ << dendl;
+ ++it;
+ }
+}
+
+void BitmapAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ size_t alloc_size = get_min_alloc_size();
+ auto multiply_by_alloc_size = [alloc_size, notify](size_t off, size_t len) {
+ notify(off * alloc_size, len * alloc_size);
+ };
+ std::lock_guard lck(lock);
+ l1.dump(multiply_by_alloc_size);
+}
diff --git a/src/os/bluestore/BitmapAllocator.h b/src/os/bluestore/BitmapAllocator.h
new file mode 100755
index 00000000..51ebaa42
--- /dev/null
+++ b/src/os/bluestore/BitmapAllocator.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H
+#define CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "fastbmap_allocator_impl.h"
+#include "include/mempool.h"
+#include "common/debug.h"
+
+class BitmapAllocator : public Allocator,
+ public AllocatorLevel02<AllocatorLevel01Loose> {
+ CephContext* cct;
+
+public:
+ BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit, const std::string& name);
+ ~BitmapAllocator() override
+ {
+ }
+
+ int64_t allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents) override;
+
+ void release(
+ const interval_set<uint64_t>& release_set) override;
+
+ using Allocator::release;
+
+ uint64_t get_free() override
+ {
+ return get_available();
+ }
+
+ void dump() override;
+ void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
+ double get_fragmentation() override
+ {
+ return _get_fragmentation();
+ }
+
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+
+ void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc
new file mode 100644
index 00000000..13c60b49
--- /dev/null
+++ b/src/os/bluestore/BitmapFreelistManager.cc
@@ -0,0 +1,600 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BitmapFreelistManager.h"
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "freelist "
+
+void make_offset_key(uint64_t offset, std::string *key)
+{
+ key->reserve(10);
+ _key_encode_u64(offset, key);
+}
+
+struct XorMergeOperator : public KeyValueDB::MergeOperator {
+ void merge_nonexistent(
+ const char *rdata, size_t rlen, std::string *new_value) override {
+ *new_value = std::string(rdata, rlen);
+ }
+ void merge(
+ const char *ldata, size_t llen,
+ const char *rdata, size_t rlen,
+ std::string *new_value) override {
+ ceph_assert(llen == rlen);
+ *new_value = std::string(ldata, llen);
+ for (size_t i = 0; i < rlen; ++i) {
+ (*new_value)[i] ^= rdata[i];
+ }
+ }
+ // We use each operator name and each prefix to construct the
+ // overall RocksDB operator name for consistency check at open time.
+ const char *name() const override {
+ return "bitwise_xor";
+ }
+};
+
+void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
+{
+ std::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator);
+ db->set_merge_operator(prefix, merge_op);
+}
+
+BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
+ string meta_prefix,
+ string bitmap_prefix)
+ : FreelistManager(cct),
+ meta_prefix(meta_prefix),
+ bitmap_prefix(bitmap_prefix),
+ enumerate_bl_pos(0)
+{
+}
+
+int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
+ KeyValueDB::Transaction txn)
+{
+ bytes_per_block = granularity;
+ ceph_assert(isp2(bytes_per_block));
+ size = p2align(new_size, bytes_per_block);
+ blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
+
+ _init_misc();
+
+ blocks = size / bytes_per_block;
+ if (blocks / blocks_per_key * blocks_per_key != blocks) {
+ blocks = (blocks / blocks_per_key + 1) * blocks_per_key;
+ dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
+ << " to 0x" << (blocks * bytes_per_block)
+ << " (0x" << blocks << " blocks)" << std::dec << dendl;
+ // set past-eof blocks as allocated
+ _xor(size, blocks * bytes_per_block - size, txn);
+ }
+ dout(10) << __func__
+ << " size 0x" << std::hex << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " blocks 0x" << blocks
+ << " blocks_per_key 0x" << blocks_per_key
+ << std::dec << dendl;
+ {
+ bufferlist bl;
+ encode(bytes_per_block, bl);
+ txn->set(meta_prefix, "bytes_per_block", bl);
+ }
+ {
+ bufferlist bl;
+ encode(blocks_per_key, bl);
+ txn->set(meta_prefix, "blocks_per_key", bl);
+ }
+ {
+ bufferlist bl;
+ encode(blocks, bl);
+ txn->set(meta_prefix, "blocks", bl);
+ }
+ {
+ bufferlist bl;
+ encode(size, bl);
+ txn->set(meta_prefix, "size", bl);
+ }
+ return 0;
+}
+
+int BitmapFreelistManager::expand(uint64_t new_size, KeyValueDB::Transaction txn)
+{
+ assert(new_size > size);
+ ceph_assert(isp2(bytes_per_block));
+
+ uint64_t blocks0 = size / bytes_per_block;
+ if (blocks0 / blocks_per_key * blocks_per_key != blocks0) {
+ blocks0 = (blocks0 / blocks_per_key + 1) * blocks_per_key;
+ dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
+ << " to 0x" << (blocks0 * bytes_per_block)
+ << " (0x" << blocks0 << " blocks)" << std::dec << dendl;
+ // reset previous past-eof blocks to unallocated
+ _xor(size, blocks0 * bytes_per_block - size, txn);
+ }
+
+ size = p2align(new_size, bytes_per_block);
+ blocks = size / bytes_per_block;
+
+ if (blocks / blocks_per_key * blocks_per_key != blocks) {
+ blocks = (blocks / blocks_per_key + 1) * blocks_per_key;
+ dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
+ << " to 0x" << (blocks * bytes_per_block)
+ << " (0x" << blocks << " blocks)" << std::dec << dendl;
+ // set past-eof blocks as allocated
+ _xor(size, blocks * bytes_per_block - size, txn);
+ }
+
+ dout(10) << __func__
+ << " size 0x" << std::hex << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " blocks 0x" << blocks
+ << " blocks_per_key 0x" << blocks_per_key
+ << std::dec << dendl;
+ {
+ bufferlist bl;
+ encode(blocks, bl);
+ txn->set(meta_prefix, "blocks", bl);
+ }
+ {
+ bufferlist bl;
+ encode(size, bl);
+ txn->set(meta_prefix, "size", bl);
+ }
+ return 0;
+}
+
+int BitmapFreelistManager::init(KeyValueDB *kvdb)
+{
+ dout(1) << __func__ << dendl;
+
+ KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
+ it->lower_bound(string());
+
+ // load meta
+ while (it->valid()) {
+ string k = it->key();
+ if (k == "bytes_per_block") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(bytes_per_block, p);
+ dout(10) << __func__ << " bytes_per_block 0x" << std::hex
+ << bytes_per_block << std::dec << dendl;
+ } else if (k == "blocks") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(blocks, p);
+ dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
+ << dendl;
+ } else if (k == "size") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(size, p);
+ dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
+ << dendl;
+ } else if (k == "blocks_per_key") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(blocks_per_key, p);
+ dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
+ << std::dec << dendl;
+ } else {
+ derr << __func__ << " unrecognized meta " << k << dendl;
+ return -EIO;
+ }
+ it->next();
+ }
+
+ dout(10) << __func__ << std::hex
+ << " size 0x" << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " blocks 0x" << blocks
+ << " blocks_per_key 0x" << blocks_per_key
+ << std::dec << dendl;
+ _init_misc();
+ return 0;
+}
+
+void BitmapFreelistManager::_init_misc()
+{
+ bufferptr z(blocks_per_key >> 3);
+ memset(z.c_str(), 0xff, z.length());
+ all_set_bl.clear();
+ all_set_bl.append(z);
+
+ block_mask = ~(bytes_per_block - 1);
+
+ bytes_per_key = bytes_per_block * blocks_per_key;
+ key_mask = ~(bytes_per_key - 1);
+ dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
+ << ", key_mask 0x" << key_mask << std::dec
+ << dendl;
+}
+
+void BitmapFreelistManager::shutdown()
+{
+ dout(1) << __func__ << dendl;
+}
+
+void BitmapFreelistManager::enumerate_reset()
+{
+ std::lock_guard l(lock);
+ enumerate_offset = 0;
+ enumerate_bl_pos = 0;
+ enumerate_bl.clear();
+ enumerate_p.reset();
+}
+
+int get_next_clear_bit(bufferlist& bl, int start)
+{
+ const char *p = bl.c_str();
+ int bits = bl.length() << 3;
+ while (start < bits) {
+ // byte = start / 8 (or start >> 3)
+ // bit = start % 8 (or start & 7)
+ unsigned char byte_mask = 1 << (start & 7);
+ if ((p[start >> 3] & byte_mask) == 0) {
+ return start;
+ }
+ ++start;
+ }
+ return -1; // not found
+}
+
+int get_next_set_bit(bufferlist& bl, int start)
+{
+ const char *p = bl.c_str();
+ int bits = bl.length() << 3;
+ while (start < bits) {
+ int which_byte = start / 8;
+ int which_bit = start % 8;
+ unsigned char byte_mask = 1 << which_bit;
+ if (p[which_byte] & byte_mask) {
+ return start;
+ }
+ ++start;
+ }
+ return -1; // not found
+}
+
+bool BitmapFreelistManager::enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length)
+{
+ std::lock_guard l(lock);
+
+ // initial base case is a bit awkward
+ if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
+ dout(10) << __func__ << " start" << dendl;
+ enumerate_p = kvdb->get_iterator(bitmap_prefix);
+ enumerate_p->lower_bound(string());
+ // we assert that the first block is always allocated; it's true,
+ // and it simplifies our lives a bit.
+ ceph_assert(enumerate_p->valid());
+ string k = enumerate_p->key();
+ const char *p = k.c_str();
+ _key_decode_u64(p, &enumerate_offset);
+ enumerate_bl = enumerate_p->value();
+ ceph_assert(enumerate_offset == 0);
+ ceph_assert(get_next_set_bit(enumerate_bl, 0) == 0);
+ }
+
+ if (enumerate_offset >= size) {
+ dout(10) << __func__ << " end" << dendl;
+ return false;
+ }
+
+ // skip set bits to find offset
+ while (true) {
+ enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
+ if (enumerate_bl_pos >= 0) {
+ *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
+ dout(30) << __func__ << " found clear bit, key 0x" << std::hex
+ << enumerate_offset << " bit 0x" << enumerate_bl_pos
+ << " offset 0x" << *offset
+ << std::dec << dendl;
+ break;
+ }
+ dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
+ << std::dec << dendl;
+ enumerate_p->next();
+ enumerate_bl.clear();
+ if (!enumerate_p->valid()) {
+ enumerate_offset += bytes_per_key;
+ enumerate_bl_pos = 0;
+ *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
+ break;
+ }
+ string k = enumerate_p->key();
+ const char *p = k.c_str();
+ uint64_t next = enumerate_offset + bytes_per_key;
+ _key_decode_u64(p, &enumerate_offset);
+ enumerate_bl = enumerate_p->value();
+ enumerate_bl_pos = 0;
+ if (enumerate_offset > next) {
+ dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
+ << enumerate_offset << std::dec << dendl;
+ *offset = next;
+ break;
+ }
+ }
+
+ // skip clear bits to find the end
+ uint64_t end = 0;
+ if (enumerate_p->valid()) {
+ while (true) {
+ enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
+ if (enumerate_bl_pos >= 0) {
+ end = _get_offset(enumerate_offset, enumerate_bl_pos);
+ dout(30) << __func__ << " found set bit, key 0x" << std::hex
+ << enumerate_offset << " bit 0x" << enumerate_bl_pos
+ << " offset 0x" << end << std::dec
+ << dendl;
+ end = std::min(get_alloc_units() * bytes_per_block, end);
+ *length = end - *offset;
+ dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+ << std::dec << dendl;
+ return true;
+ }
+ dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
+ << std::dec << dendl;
+ enumerate_p->next();
+ enumerate_bl.clear();
+ enumerate_bl_pos = 0;
+ if (!enumerate_p->valid()) {
+ break;
+ }
+ string k = enumerate_p->key();
+ const char *p = k.c_str();
+ _key_decode_u64(p, &enumerate_offset);
+ enumerate_bl = enumerate_p->value();
+ }
+ }
+
+ if (enumerate_offset < size) {
+ end = get_alloc_units() * bytes_per_block;
+ *length = end - *offset;
+ dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+ << std::dec << dendl;
+ enumerate_offset = size;
+ enumerate_bl_pos = blocks_per_key;
+ return true;
+ }
+
+ dout(10) << __func__ << " end" << dendl;
+ return false;
+}
+
+void BitmapFreelistManager::dump(KeyValueDB *kvdb)
+{
+ enumerate_reset();
+ uint64_t offset, length;
+ while (enumerate_next(kvdb, &offset, &length)) {
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ }
+}
+
+void BitmapFreelistManager::_verify_range(KeyValueDB *kvdb,
+ uint64_t offset, uint64_t length,
+ int val)
+{
+ unsigned errors = 0;
+ uint64_t first_key = offset & key_mask;
+ uint64_t last_key = (offset + length - 1) & key_mask;
+ if (first_key == last_key) {
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ kvdb->get(bitmap_prefix, k, &bl);
+ if (bl.length() > 0) {
+ const char *p = bl.c_str();
+ unsigned s = (offset & ~key_mask) / bytes_per_block;
+ unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+ for (unsigned i = s; i <= e; ++i) {
+ int has = !!(p[i >> 3] & (1ull << (i & 7)));
+ if (has != val) {
+ derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
+ << i << " has 0x" << has << " expected 0x" << val
+ << std::dec << dendl;
+ ++errors;
+ }
+ }
+ } else {
+ if (val) {
+ derr << __func__ << " key 0x" << std::hex << first_key
+ << " not present, expected 0x" << val << std::dec << dendl;
+ ++errors;
+ }
+ }
+ } else {
+ // first key
+ {
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ kvdb->get(bitmap_prefix, k, &bl);
+ if (bl.length()) {
+ const char *p = bl.c_str();
+ unsigned s = (offset & ~key_mask) / bytes_per_block;
+ unsigned e = blocks_per_key;
+ for (unsigned i = s; i < e; ++i) {
+ int has = !!(p[i >> 3] & (1ull << (i & 7)));
+ if (has != val) {
+ derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
+ << i << " has 0x" << has << " expected 0x" << val << std::dec
+ << dendl;
+ ++errors;
+ }
+ }
+ } else {
+ if (val) {
+ derr << __func__ << " key 0x" << std::hex << first_key
+ << " not present, expected 0x" << val << std::dec << dendl;
+ ++errors;
+ }
+ }
+ first_key += bytes_per_key;
+ }
+ // middle keys
+ if (first_key < last_key) {
+ while (first_key < last_key) {
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ kvdb->get(bitmap_prefix, k, &bl);
+ if (bl.length() > 0) {
+ const char *p = bl.c_str();
+ for (unsigned i = 0; i < blocks_per_key; ++i) {
+ int has = !!(p[i >> 3] & (1ull << (i & 7)));
+ if (has != val) {
+ derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
+ << i << " has 0x" << has << " expected 0x" << val
+ << std::dec << dendl;
+ ++errors;
+ }
+ }
+ } else {
+ if (val) {
+ derr << __func__ << " key 0x" << std::hex << first_key
+ << " not present, expected 0x" << val << std::dec << dendl;
+ ++errors;
+ }
+ }
+ first_key += bytes_per_key;
+ }
+ }
+ ceph_assert(first_key == last_key);
+ {
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ kvdb->get(bitmap_prefix, k, &bl);
+ if (bl.length() > 0) {
+ const char *p = bl.c_str();
+ unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+ for (unsigned i = 0; i < e; ++i) {
+ int has = !!(p[i >> 3] & (1ull << (i & 7)));
+ if (has != val) {
+ derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
+ << i << " has 0x" << has << " expected 0x" << val << std::dec
+ << dendl;
+ ++errors;
+ }
+ }
+ } else {
+ if (val) {
+ derr << __func__ << " key 0x" << std::hex << first_key
+ << " not present, expected 0x" << val << std::dec << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+ if (errors) {
+ derr << __func__ << " saw " << errors << " errors" << dendl;
+ ceph_abort_msg("bitmap freelist errors");
+ }
+}
+
+void BitmapFreelistManager::allocate(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _xor(offset, length, txn);
+}
+
+void BitmapFreelistManager::release(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _xor(offset, length, txn);
+}
+
+void BitmapFreelistManager::_xor(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)
+{
+ // must be block aligned
+ ceph_assert((offset & block_mask) == offset);
+ ceph_assert((length & block_mask) == length);
+
+ uint64_t first_key = offset & key_mask;
+ uint64_t last_key = (offset + length - 1) & key_mask;
+ dout(20) << __func__ << " first_key 0x" << std::hex << first_key
+ << " last_key 0x" << last_key << std::dec << dendl;
+
+ if (first_key == last_key) {
+ bufferptr p(blocks_per_key >> 3);
+ p.zero();
+ unsigned s = (offset & ~key_mask) / bytes_per_block;
+ unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+ for (unsigned i = s; i <= e; ++i) {
+ p[i >> 3] ^= 1ull << (i & 7);
+ }
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ bl.append(p);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+ bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, bl);
+ } else {
+ // first key
+ {
+ bufferptr p(blocks_per_key >> 3);
+ p.zero();
+ unsigned s = (offset & ~key_mask) / bytes_per_block;
+ unsigned e = blocks_per_key;
+ for (unsigned i = s; i < e; ++i) {
+ p[i >> 3] ^= 1ull << (i & 7);
+ }
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ bl.append(p);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+ bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, bl);
+ first_key += bytes_per_key;
+ }
+ // middle keys
+ while (first_key < last_key) {
+ string k;
+ make_offset_key(first_key, &k);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
+ << ": ";
+ all_set_bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, all_set_bl);
+ first_key += bytes_per_key;
+ }
+ ceph_assert(first_key == last_key);
+ {
+ bufferptr p(blocks_per_key >> 3);
+ p.zero();
+ unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+ for (unsigned i = 0; i <= e; ++i) {
+ p[i >> 3] ^= 1ull << (i & 7);
+ }
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ bl.append(p);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+ bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, bl);
+ }
+ }
+}
diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h
new file mode 100644
index 00000000..9f076e77
--- /dev/null
+++ b/src/os/bluestore/BitmapFreelistManager.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H
+
+#include "FreelistManager.h"
+
+#include <string>
+#include <mutex>
+
+#include "common/ceph_mutex.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+
+class BitmapFreelistManager : public FreelistManager {
+ std::string meta_prefix, bitmap_prefix;
+ std::shared_ptr<KeyValueDB::MergeOperator> merge_op;
+ ceph::mutex lock = ceph::make_mutex("BitmapFreelistManager::lock");
+
+ uint64_t size; ///< size of device (bytes)
+ uint64_t bytes_per_block; ///< bytes per block (bdev_block_size)
+ uint64_t blocks_per_key; ///< blocks (bits) per key/value pair
+ uint64_t bytes_per_key; ///< bytes per key/value pair
+ uint64_t blocks; ///< size of device (blocks, size rounded up)
+
+ uint64_t block_mask; ///< mask to convert byte offset to block offset
+ uint64_t key_mask; ///< mask to convert offset to key offset
+
+ bufferlist all_set_bl;
+
+ KeyValueDB::Iterator enumerate_p;
+ uint64_t enumerate_offset; ///< logical offset; position
+ bufferlist enumerate_bl; ///< current key at enumerate_offset
+ int enumerate_bl_pos; ///< bit position in enumerate_bl
+
+ uint64_t _get_offset(uint64_t key_off, int bit) {
+ return key_off + bit * bytes_per_block;
+ }
+
+ void _init_misc();
+
+ void _verify_range(KeyValueDB *kvdb,
+ uint64_t offset, uint64_t length, int val);
+ void _xor(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn);
+
+public:
+ BitmapFreelistManager(CephContext* cct, string meta_prefix,
+ string bitmap_prefix);
+
+ static void setup_merge_operator(KeyValueDB *db, string prefix);
+
+ int create(uint64_t size, uint64_t granularity,
+ KeyValueDB::Transaction txn) override;
+
+ int expand(uint64_t new_size,
+ KeyValueDB::Transaction txn) override;
+
+ int init(KeyValueDB *kvdb) override;
+ void shutdown() override;
+
+ void dump(KeyValueDB *kvdb) override;
+
+ void enumerate_reset() override;
+ bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) override;
+
+ void allocate(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) override;
+ void release(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) override;
+
+ inline uint64_t get_size() const override {
+ return size;
+ }
+ inline uint64_t get_alloc_units() const override {
+ return size / bytes_per_block;
+ }
+ inline uint64_t get_alloc_size() const override {
+ return bytes_per_block;
+ }
+
+};
+
+#endif
diff --git a/src/os/bluestore/BlockDevice.cc b/src/os/bluestore/BlockDevice.cc
new file mode 100644
index 00000000..edfc2fb9
--- /dev/null
+++ b/src/os/bluestore/BlockDevice.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <libgen.h>
+#include <unistd.h>
+
+#include "BlockDevice.h"
+
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
+#include "KernelDevice.h"
+#endif
+
+#if defined(HAVE_SPDK)
+#include "NVMEDevice.h"
+#endif
+
+#if defined(HAVE_PMEM)
+#include "PMEMDevice.h"
+#include "libpmem.h"
+#endif
+
+#include "common/debug.h"
+#include "common/EventTrace.h"
+#include "common/errno.h"
+#include "include/compat.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev "
+
+void IOContext::aio_wait()
+{
+ std::unique_lock l(lock);
+ // see _aio_thread for waker logic
+ while (num_running.load() > 0) {
+ dout(10) << __func__ << " " << this
+ << " waiting for " << num_running.load() << " aios to complete"
+ << dendl;
+ cond.wait(l);
+ }
+ dout(20) << __func__ << " " << this << " done" << dendl;
+}
+
+uint64_t IOContext::get_num_ios() const
+{
+ // this is about the simplest model for transaction cost you can
+ // imagine. there is some fixed overhead cost by saying there is a
+ // minimum of one "io". and then we have some cost per "io" that is
+ // a configurable (with different hdd and ssd defaults), and add
+ // that to the bytes value.
+ uint64_t ios = 0;
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
+ ios += pending_aios.size();
+#endif
+#ifdef HAVE_SPDK
+ ios += total_nseg;
+#endif
+ return ios;
+}
+
+void IOContext::release_running_aios()
+{
+ ceph_assert(!num_running);
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
+ // release aio contexts (including pinned buffers).
+ running_aios.clear();
+#endif
+}
+
+BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
+ aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
+{
+ string type = "kernel";
+ char buf[PATH_MAX + 1];
+ int r = ::readlink(path.c_str(), buf, sizeof(buf) - 1);
+ if (r >= 0) {
+ buf[r] = '\0';
+ char *bname = ::basename(buf);
+ if (strncmp(bname, SPDK_PREFIX, sizeof(SPDK_PREFIX)-1) == 0)
+ type = "ust-nvme";
+ }
+
+#if defined(HAVE_PMEM)
+ if (type == "kernel") {
+ int is_pmem = 0;
+ size_t map_len = 0;
+ void *addr = pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDONLY, &map_len, &is_pmem);
+ if (addr != NULL) {
+ if (is_pmem)
+ type = "pmem";
+ else
+ dout(1) << path.c_str() << " isn't pmem file" << dendl;
+ pmem_unmap(addr, map_len);
+ } else {
+ dout(1) << "pmem_map_file:" << path.c_str() << " failed." << pmem_errormsg() << dendl;
+ }
+ }
+#endif
+
+ dout(1) << __func__ << " path " << path << " type " << type << dendl;
+
+#if defined(HAVE_PMEM)
+ if (type == "pmem") {
+ return new PMEMDevice(cct, cb, cbpriv);
+ }
+#endif
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
+ if (type == "kernel") {
+ return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);
+ }
+#endif
+#if defined(HAVE_SPDK)
+ if (type == "ust-nvme") {
+ return new NVMEDevice(cct, cb, cbpriv);
+ }
+#endif
+
+
+ derr << __func__ << " unknown backend " << type << dendl;
+ ceph_abort();
+ return NULL;
+}
+
+void BlockDevice::queue_reap_ioc(IOContext *ioc)
+{
+ std::lock_guard l(ioc_reap_lock);
+ if (ioc_reap_count.load() == 0)
+ ++ioc_reap_count;
+ ioc_reap_queue.push_back(ioc);
+}
+
+void BlockDevice::reap_ioc()
+{
+ if (ioc_reap_count.load()) {
+ std::lock_guard l(ioc_reap_lock);
+ for (auto p : ioc_reap_queue) {
+ dout(20) << __func__ << " reap ioc " << p << dendl;
+ delete p;
+ }
+ ioc_reap_queue.clear();
+ --ioc_reap_count;
+ }
+}
diff --git a/src/os/bluestore/BlockDevice.h b/src/os/bluestore/BlockDevice.h
new file mode 100644
index 00000000..315d46c1
--- /dev/null
+++ b/src/os/bluestore/BlockDevice.h
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_BLUESTORE_BLOCKDEVICE_H
+#define CEPH_OS_BLUESTORE_BLOCKDEVICE_H
+
+#include <atomic>
+#include <condition_variable>
+#include <list>
+#include <map>
+#include <mutex>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "acconfig.h"
+#include "common/ceph_mutex.h"
+
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
+#include "ceph_aio.h"
+#endif
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+#include "include/interval_set.h"
+#define SPDK_PREFIX "spdk:"
+
+#if defined(__linux__)
+#if !defined(F_SET_FILE_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
+#endif
+// These values match Linux definition
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+#define WRITE_LIFE_NOT_SET 0 // No hint information set
+#define WRITE_LIFE_NONE 1 // No hints about write life time
+#define WRITE_LIFE_SHORT 2 // Data written has a short life time
+#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time
+#define WRITE_LIFE_LONG 4 // Data written has a long life time
+#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time
+#define WRITE_LIFE_MAX 6
+#else
+// On systems don't have WRITE_LIFE_* only use one FD
+// And all files are created equal
+#define WRITE_LIFE_NOT_SET 0 // No hint information set
+#define WRITE_LIFE_NONE 0 // No hints about write life time
+#define WRITE_LIFE_SHORT 0 // Data written has a short life time
+#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time
+#define WRITE_LIFE_LONG 0 // Data written has a long life time
+#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time
+#define WRITE_LIFE_MAX 1
+#endif
+
+class CephContext;
+
+/// track in-flight io
+struct IOContext {
+private:
+ ceph::mutex lock = ceph::make_mutex("IOContext::lock");
+ ceph::condition_variable cond;
+ int r = 0;
+
+public:
+ CephContext* cct;
+ void *priv;
+#ifdef HAVE_SPDK
+ void *nvme_task_first = nullptr;
+ void *nvme_task_last = nullptr;
+ std::atomic_int total_nseg = {0};
+#endif
+
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
+ std::list<aio_t> pending_aios; ///< not yet submitted
+ std::list<aio_t> running_aios; ///< submitting or submitted
+#endif
+ std::atomic_int num_pending = {0};
+ std::atomic_int num_running = {0};
+ bool allow_eio;
+
+ explicit IOContext(CephContext* cct, void *p, bool allow_eio = false)
+ : cct(cct), priv(p), allow_eio(allow_eio)
+ {}
+
+ // no copying
+ IOContext(const IOContext& other) = delete;
+ IOContext &operator=(const IOContext& other) = delete;
+
+ bool has_pending_aios() {
+ return num_pending.load();
+ }
+ void release_running_aios();
+ void aio_wait();
+ uint64_t get_num_ios() const;
+
+ void try_aio_wake() {
+ assert(num_running >= 1);
+
+ std::lock_guard l(lock);
+ if (num_running.fetch_sub(1) == 1) {
+
+ // we might have some pending IOs submitted after the check
+ // as there is no lock protection for aio_submit.
+ // Hence we might have false conditional trigger.
+ // aio_wait has to handle that hence do not care here.
+ cond.notify_all();
+ }
+ }
+
+ void set_return_value(int _r) {
+ r = _r;
+ }
+
+ int get_return_value() const {
+ return r;
+ }
+};
+
+
+class BlockDevice {
+public:
+ CephContext* cct;
+ typedef void (*aio_callback_t)(void *handle, void *aio);
+private:
+ ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
+ std::vector<IOContext*> ioc_reap_queue;
+ std::atomic_int ioc_reap_count = {0};
+
+protected:
+ uint64_t size;
+ uint64_t block_size;
+ bool support_discard = false;
+ bool rotational = true;
+ bool lock_exclusive = true;
+
+public:
+ aio_callback_t aio_callback;
+ void *aio_callback_priv;
+ BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
+ : cct(cct),
+ size(0),
+ block_size(0),
+ aio_callback(cb),
+ aio_callback_priv(cbpriv)
+ {}
+ virtual ~BlockDevice() = default;
+
+ static BlockDevice *create(
+ CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
+ virtual bool supported_bdev_label() { return true; }
+ virtual bool is_rotational() { return rotational; }
+
+ virtual void aio_submit(IOContext *ioc) = 0;
+
+ void set_no_exclusive_lock() {
+ lock_exclusive = false;
+ }
+
+ uint64_t get_size() const { return size; }
+ uint64_t get_block_size() const { return block_size; }
+
+ /// hook to provide utilization of thinly-provisioned device
+ virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const {
+ return false;
+ }
+
+ virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0;
+
+ virtual int get_devname(std::string *out) {
+ return -ENOENT;
+ }
+ virtual int get_devices(std::set<std::string> *ls) {
+ std::string s;
+ if (get_devname(&s) == 0) {
+ ls->insert(s);
+ }
+ return 0;
+ }
+ virtual int get_numa_node(int *node) const {
+ return -EOPNOTSUPP;
+ }
+
+ virtual int read(
+ uint64_t off,
+ uint64_t len,
+ bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered) = 0;
+ virtual int read_random(
+ uint64_t off,
+ uint64_t len,
+ char *buf,
+ bool buffered) = 0;
+ virtual int write(
+ uint64_t off,
+ bufferlist& bl,
+ bool buffered,
+ int write_hint = WRITE_LIFE_NOT_SET) = 0;
+
+ virtual int aio_read(
+ uint64_t off,
+ uint64_t len,
+ bufferlist *pbl,
+ IOContext *ioc) = 0;
+ virtual int aio_write(
+ uint64_t off,
+ bufferlist& bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint = WRITE_LIFE_NOT_SET) = 0;
+ virtual int flush() = 0;
+ virtual int discard(uint64_t offset, uint64_t len) { return 0; }
+ virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; }
+ virtual void discard_drain() { return; }
+
+ void queue_reap_ioc(IOContext *ioc);
+ void reap_ioc();
+
+ // for managing buffered readers/writers
+ virtual int invalidate_cache(uint64_t off, uint64_t len) = 0;
+ virtual int open(const std::string& path) = 0;
+ virtual void close() = 0;
+
+protected:
+ bool is_valid_io(uint64_t off, uint64_t len) const {
+ return (off % block_size == 0 &&
+ len % block_size == 0 &&
+ len > 0 &&
+ off < size &&
+ off + len <= size);
+ }
+};
+
+#endif //CEPH_OS_BLUESTORE_BLOCKDEVICE_H
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
new file mode 100644
index 00000000..f7bda939
--- /dev/null
+++ b/src/os/bluestore/BlueFS.cc
@@ -0,0 +1,3665 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "boost/algorithm/string.hpp"
+#include "BlueFS.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "BlockDevice.h"
+#include "Allocator.h"
+#include "include/ceph_assert.h"
+#include "common/admin_socket.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluefs
+#undef dout_prefix
+#define dout_prefix *_dout << "bluefs "
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
+ bluefs_file_reader_buffer, bluefs_file_reader);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
+
+static void wal_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
+}
+
+static void db_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
+}
+
+static void slow_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
+}
+
+class BlueFS::SocketHook : public AdminSocketHook {
+ BlueFS* bluefs;
+public:
+ static BlueFS::SocketHook* create(BlueFS* bluefs)
+ {
+ BlueFS::SocketHook* hook = nullptr;
+ AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
+ if (admin_socket) {
+ hook = new BlueFS::SocketHook(bluefs);
+ int r = admin_socket->register_command("bluestore bluefs available",
+ "bluestore bluefs available "
+ "name=alloc_size,type=CephInt,req=false",
+ hook,
+ "Report available space for bluefs. "
+ "If alloc_size set, make simulation.");
+ if (r != 0) {
+ ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
+ delete hook;
+ hook = nullptr;
+ } else {
+ r = admin_socket->register_command("bluestore bluefs stats",
+ "bluestore bluefs stats",
+ hook,
+ "Dump internal statistics for bluefs.");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("bluefs debug_inject_read_zeros",
+ "bluefs debug_inject_read_zeros",
+ hook,
+ "Injects 8K zeros into next BlueFS read. Debug only.");
+ ceph_assert(r == 0);
+ }
+ }
+ return hook;
+ }
+
+ ~SocketHook() {
+ AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
+ admin_socket->unregister_commands(this);
+ }
+private:
+ SocketHook(BlueFS* bluefs) :
+ bluefs(bluefs) {}
+ bool call(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, bufferlist& out) override {
+ stringstream ss;
+ bool r = true;
+ if (command == "bluestore bluefs available") {
+ int64_t alloc_size = 0;
+ cmd_getval(bluefs->cct, cmdmap, "alloc_size", alloc_size);
+ if ((alloc_size & (alloc_size - 1)) != 0) {
+ ss << "Invalid allocation size:'" << alloc_size << std::endl;
+ }
+ if (alloc_size == 0)
+ alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
+ Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
+ f->open_object_section("bluefs_available_space");
+ for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
+ if (bluefs->bdev[dev]) {
+ f->open_object_section("dev");
+ f->dump_string("device", bluefs->get_device_name(dev));
+ ceph_assert(bluefs->alloc[dev]);
+ f->dump_int("free", bluefs->alloc[dev]->get_free());
+ f->close_section();
+ }
+ }
+ size_t extra_space = 0;
+ if (bluefs->slow_dev_expander) {
+ extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
+ }
+ f->dump_int("available_from_bluestore", extra_space);
+ f->close_section();
+ f->flush(ss);
+ delete f;
+ } else if (command == "bluestore bluefs stats") {
+ bluefs->dump_block_extents(ss);
+ bluefs->dump_volume_selector(ss);
+ } else if (command == "bluefs debug_inject_read_zeros") {
+ bluefs->inject_read_zeros++;
+ } else {
+ ss << "Invalid command" << std::endl;
+ r = false;
+ }
+ out.append(ss);
+ return r;
+ }
+};
+
+BlueFS::BlueFS(CephContext* cct)
+ : cct(cct),
+ bdev(MAX_BDEV),
+ ioc(MAX_BDEV),
+ block_all(MAX_BDEV)
+{
+ discard_cb[BDEV_WAL] = wal_discard_cb;
+ discard_cb[BDEV_DB] = db_discard_cb;
+ discard_cb[BDEV_SLOW] = slow_discard_cb;
+ asok_hook = SocketHook::create(this);
+}
+
+BlueFS::~BlueFS()
+{
+ delete asok_hook;
+ for (auto p : ioc) {
+ if (p)
+ p->aio_wait();
+ }
+ for (auto p : bdev) {
+ if (p) {
+ p->close();
+ delete p;
+ }
+ }
+ for (auto p : ioc) {
+ delete p;
+ }
+}
+
+void BlueFS::_init_logger()
+{
+ PerfCountersBuilder b(cct, "bluefs",
+ l_bluefs_first, l_bluefs_last);
+ b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
+ "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
+ "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
+ "Total bytes (main db device)",
+ "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
+ "Used bytes (main db device)",
+ "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
+ "Total bytes (wal device)",
+ "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
+ "Used bytes (wal device)",
+ "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
+ "Total bytes (slow device)",
+ "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
+ "Used bytes (slow device)",
+ "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_num_files, "num_files", "File count",
+ "f", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
+ "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
+ "Compactions of the metadata log");
+ b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
+ "Bytes written to the metadata log", "j",
+ PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
+ "Files written to WAL");
+ b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
+ "Files written to SSTs");
+ b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
+ "Bytes written to WAL", "wal",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
+ "Bytes written to SSTs", "sst",
+ PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
+ "Bytes written to WAL/SSTs at slow device", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
+ "Maximum bytes allocated from WAL");
+ b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
+ "Maximum bytes allocated from DB");
+ b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
+ "Maximum bytes allocated from SLOW");
+
+ b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
+ "random read requests processed");
+ b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
+ "Bytes requested in random read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
+ "random reads requests going to disk");
+ b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
+ "Bytes read from disk in random read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
+ "random read requests processed using prefetch buffer");
+ b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
+ "Bytes read from prefetch buffer in random read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluefs_read_count, "read_count",
+ "buffered read requests processed");
+ b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
+ "Bytes requested in buffered read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
+ "prefetch read requests processed");
+ b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
+ "Bytes requested in prefetch read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
+ "How many times bluefs read found page with all 0s");
+ b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
+ "How many times bluefs read found transient page with all 0s");
+
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+void BlueFS::_shutdown_logger()
+{
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+void BlueFS::_update_logger_stats()
+{
+ // we must be holding the lock
+ logger->set(l_bluefs_num_files, file_map.size());
+ logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
+
+ if (alloc[BDEV_WAL]) {
+ logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
+ logger->set(l_bluefs_wal_used_bytes,
+ block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
+ }
+ if (alloc[BDEV_DB]) {
+ logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
+ logger->set(l_bluefs_db_used_bytes,
+ block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
+ }
+ if (alloc[BDEV_SLOW]) {
+ logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
+ logger->set(l_bluefs_slow_used_bytes,
+ block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
+ }
+}
+
+int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
+ bool shared_with_bluestore)
+{
+ dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
+ ceph_assert(id < bdev.size());
+ ceph_assert(bdev[id] == NULL);
+ BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
+ discard_cb[id], static_cast<void*>(this));
+ if (shared_with_bluestore) {
+ b->set_no_exclusive_lock();
+ }
+ int r = b->open(path);
+ if (r < 0) {
+ delete b;
+ return r;
+ }
+ if (trim) {
+ b->discard(0, b->get_size());
+ }
+
+ dout(1) << __func__ << " bdev " << id << " path " << path
+ << " size " << byte_u_t(b->get_size()) << dendl;
+ bdev[id] = b;
+ ioc[id] = new IOContext(cct, NULL);
+ return 0;
+}
+
+bool BlueFS::bdev_support_label(unsigned id)
+{
+ ceph_assert(id < bdev.size());
+ ceph_assert(bdev[id]);
+ return bdev[id]->supported_bdev_label();
+}
+
+uint64_t BlueFS::get_block_device_size(unsigned id)
+{
+ if (id < bdev.size() && bdev[id])
+ return bdev[id]->get_size();
+ return 0;
+}
+
+void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length,
+ bool skip)
+{
+ dout(1) << __func__ << " bdev " << id
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " skip " << skip
+ << dendl;
+
+ ceph_assert(id < bdev.size());
+ ceph_assert(bdev[id]);
+ ceph_assert(bdev[id]->get_size() >= offset + length);
+ block_all[id].insert(offset, length);
+
+ if (id < alloc.size() && alloc[id]) {
+ if (!skip)
+ log_t.op_alloc_add(id, offset, length);
+
+ alloc[id]->init_add_free(offset, length);
+ }
+
+ if (logger)
+ logger->inc(l_bluefs_gift_bytes, length);
+ dout(10) << __func__ << " done" << dendl;
+}
+
+int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
+ PExtentVector *extents)
+{
+ std::unique_lock l(lock);
+ dout(1) << __func__ << " bdev " << id
+ << " want 0x" << std::hex << want << std::dec << dendl;
+ ceph_assert(id < alloc.size());
+ ceph_assert(alloc[id]);
+
+ int64_t got = alloc[id]->allocate(want, alloc_size[id], 0, extents);
+ ceph_assert(got != 0);
+ if (got < 0) {
+ derr << __func__ << " failed to allocate space to return to bluestore"
+ << dendl;
+ alloc[id]->dump();
+ return got;
+ }
+
+ for (auto& p : *extents) {
+ block_all[id].erase(p.offset, p.length);
+ log_t.op_alloc_rm(id, p.offset, p.length);
+ }
+
+ flush_bdev();
+ int r = _flush_and_sync_log(l);
+ ceph_assert(r == 0);
+
+ logger->inc(l_bluefs_reclaim_bytes, got);
+ dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
+ << " got " << *extents << dendl;
+ return 0;
+}
+
+void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << " bdev " << id << dendl;
+ ceph_assert(alloc[id]);
+ alloc[id]->release(to_release);
+}
+
+uint64_t BlueFS::get_used()
+{
+ std::lock_guard l(lock);
+ uint64_t used = 0;
+ for (unsigned id = 0; id < MAX_BDEV; ++id) {
+ if (alloc[id]) {
+ used += block_all[id].size() - alloc[id]->get_free();
+ }
+ }
+ return used;
+}
+
+uint64_t BlueFS::get_total(unsigned id)
+{
+ std::lock_guard l(lock);
+ ceph_assert(id < block_all.size());
+ return block_all[id].size();
+}
+
+uint64_t BlueFS::get_free(unsigned id)
+{
+ std::lock_guard l(lock);
+ ceph_assert(id < alloc.size());
+ return alloc[id]->get_free();
+}
+
+void BlueFS::dump_perf_counters(Formatter *f)
+{
+ f->open_object_section("bluefs_perf_counters");
+ logger->dump_formatted(f,0);
+ f->close_section();
+}
+
+void BlueFS::dump_block_extents(ostream& out)
+{
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (!bdev[i]) {
+ continue;
+ }
+ auto owned = get_total(i);
+ auto free = get_free(i);
+
+ out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
+ << " : own 0x" << block_all[i]
+ << " = 0x" << owned
+ << " : using 0x" << owned - free
+ << std::dec << "(" << byte_u_t(owned - free) << ")";
+ if (i == _get_slow_device_id()) {
+ ceph_assert(slow_dev_expander);
+ ceph_assert(alloc[i]);
+ free = slow_dev_expander->available_freespace(alloc_size[i]);
+ out << std::hex
+ << " : bluestore has 0x" << free
+ << std::dec << "(" << byte_u_t(free) << ") available";
+ }
+ out << "\n";
+ }
+}
+
+void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
+{
+ std::lock_guard l(lock);
+ usage->resize(bdev.size());
+ for (unsigned id = 0; id < bdev.size(); ++id) {
+ if (!bdev[id]) {
+ (*usage)[id] = make_pair(0, 0);
+ continue;
+ }
+ (*usage)[id].first = alloc[id]->get_free();
+ (*usage)[id].second = block_all[id].size();
+ uint64_t used =
+ (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
+ dout(10) << __func__ << " bdev " << id
+ << " free " << (*usage)[id].first
+ << " (" << byte_u_t((*usage)[id].first) << ")"
+ << " / " << (*usage)[id].second
+ << " (" << byte_u_t((*usage)[id].second) << ")"
+ << ", used " << used << "%"
+ << dendl;
+ }
+}
+
+int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " bdev " << id << dendl;
+ if (id >= block_all.size())
+ return -EINVAL;
+ *extents = block_all[id];
+ return 0;
+}
+
+int BlueFS::mkfs(uuid_d osd_uuid)
+{
+ std::unique_lock l(lock);
+ dout(1) << __func__
+ << " osd_uuid " << osd_uuid
+ << dendl;
+
+ // set volume selector if not provided before/outside
+ if (vselector == nullptr) {
+ vselector.reset(
+ new OriginalVolumeSelector(
+ get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+ }
+
+ _init_alloc();
+ _init_logger();
+
+ super.version = 1;
+ super.block_size = bdev[BDEV_DB]->get_block_size();
+ super.osd_uuid = osd_uuid;
+ super.uuid.generate_random();
+ dout(1) << __func__ << " uuid " << super.uuid << dendl;
+
+ // init log
+ FileRef log_file = new File;
+ log_file->fnode.ino = 1;
+ log_file->vselector_hint = vselector->get_hint_by_device(BDEV_WAL);
+ int r = _allocate(
+ vselector->select_prefer_bdev(log_file->vselector_hint),
+ cct->_conf->bluefs_max_log_runway,
+ &log_file->fnode);
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+ ceph_assert(r == 0);
+ log_writer = _create_writer(log_file);
+
+ // initial txn
+ log_t.op_init();
+ for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
+ interval_set<uint64_t>& p = block_all[bdev];
+ if (p.empty())
+ continue;
+ for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
+ dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
+ << std::hex << q.get_start() << "~" << q.get_len() << std::dec
+ << dendl;
+ log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
+ }
+ }
+ _flush_and_sync_log(l);
+
+ // write supers
+ super.log_fnode = log_file->fnode;
+ _write_super(BDEV_DB);
+ flush_bdev();
+
+ // clean up
+ super = bluefs_super_t();
+ _close_writer(log_writer);
+ log_writer = NULL;
+ block_all.clear();
+ vselector.reset(nullptr);
+ _stop_alloc();
+ _shutdown_logger();
+
+ dout(10) << __func__ << " success" << dendl;
+ return 0;
+}
+
+void BlueFS::_init_alloc()
+{
+ dout(20) << __func__ << dendl;
+ alloc.resize(MAX_BDEV);
+ alloc_size.resize(MAX_BDEV, 0);
+ pending_release.resize(MAX_BDEV);
+
+ if (bdev[BDEV_WAL]) {
+ alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
+ }
+ if (bdev[BDEV_SLOW]) {
+ alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
+ alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
+ } else {
+ alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
+ }
+ // new wal and db devices are never shared
+ if (bdev[BDEV_NEWWAL]) {
+ alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
+ }
+ if (bdev[BDEV_NEWDB]) {
+ alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
+ }
+
+ for (unsigned id = 0; id < bdev.size(); ++id) {
+ if (!bdev[id]) {
+ continue;
+ }
+ ceph_assert(bdev[id]->get_size());
+ std::string name = "bluefs-";
+ const char* devnames[] = {"wal","db","slow"};
+ if (id <= BDEV_SLOW)
+ name += devnames[id];
+ else
+ name += to_string(uintptr_t(this));
+ ceph_assert(alloc_size[id]);
+ dout(1) << __func__ << " id " << id
+ << " alloc_size 0x" << std::hex << alloc_size[id]
+ << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
+ alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
+ bdev[id]->get_size(),
+ alloc_size[id], name);
+ interval_set<uint64_t>& p = block_all[id];
+ for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
+ alloc[id]->init_add_free(q.get_start(), q.get_len());
+ }
+ }
+}
+
+void BlueFS::_stop_alloc()
+{
+ dout(20) << __func__ << dendl;
+ for (auto p : bdev) {
+ if (p)
+ p->discard_drain();
+ }
+
+ for (auto p : alloc) {
+ if (p != nullptr) {
+ p->shutdown();
+ delete p;
+ }
+ }
+ alloc.clear();
+}
+
+int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len,
+ ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
+{
+ dout(10) << __func__ << " dev " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " buffered" : "")
+ << dendl;
+ int r;
+ bufferlist bl;
+ r = bdev[ndev]->read(off, len, &bl, ioc, buffered);
+ if (r != 0) {
+ return r;
+ }
+ uint64_t block_size = bdev[ndev]->get_block_size();
+ if (inject_read_zeros) {
+ if (len >= block_size * 2) {
+ derr << __func__ << " injecting error, zeros at "
+ << int(ndev) << ": 0x" << std::hex << (off + len / 2)
+ << "~" << (block_size * 2) << std::dec << dendl;
+ //use beginning, replace 8K in the middle with zeros, use tail
+ bufferlist temp;
+ bl.splice(0, len / 2 - block_size, &temp);
+ temp.append_zero(block_size * 2);
+ bl.splice(block_size * 2, len / 2 - block_size, &temp);
+ bl = temp;
+ inject_read_zeros--;
+ }
+ }
+ //make a check if there is a block with all 0
+ uint64_t to_check_len = len;
+ uint64_t skip = p2nphase(off, block_size);
+ if (skip >= to_check_len) {
+ return r;
+ }
+ auto it = bl.begin();
+ it.seek(skip);
+ to_check_len -= skip;
+ bool all_zeros = false;
+ while (all_zeros == false && to_check_len >= block_size) {
+ // checking 0s step
+ unsigned block_left = block_size;
+ unsigned avail;
+ const char* data;
+ all_zeros = true;
+ while (all_zeros && block_left > 0) {
+ avail = it.get_ptr_and_advance(block_left, &data);
+ block_left -= avail;
+ all_zeros = mem_is_zero(data, avail);
+ }
+ // skipping step
+ while (block_left > 0) {
+ avail = it.get_ptr_and_advance(block_left, &data);
+ block_left -= avail;
+ }
+ to_check_len -= block_size;
+ }
+ if (all_zeros) {
+ logger->inc(l_bluefs_read_zeros_candidate, 1);
+ bufferlist bl_reread;
+ r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered);
+ if (r != 0) {
+ return r;
+ }
+ // check if both read gave the same
+ if (!bl.contents_equal(bl_reread)) {
+ // report problems to log, but continue, maybe it will be good now...
+ derr << __func__ << " initial read of " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len
+ << std::dec << ": different then re-read " << dendl;
+ logger->inc(l_bluefs_read_zeros_errors, 1);
+ }
+ // use second read will be better if is different
+ pbl->append(bl_reread);
+ } else {
+ pbl->append(bl);
+ }
+ return r;
+}
+
+int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+ dout(10) << __func__ << " dev " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " buffered" : "")
+ << dendl;
+ int r;
+ r = bdev[ndev]->read_random(off, len, buf, buffered);
+ if (r != 0) {
+ return r;
+ }
+ uint64_t block_size = bdev[ndev]->get_block_size();
+ if (inject_read_zeros) {
+ if (len >= block_size * 2) {
+ derr << __func__ << " injecting error, zeros at "
+ << int(ndev) << ": 0x" << std::hex << (off + len / 2)
+ << "~" << (block_size * 2) << std::dec << dendl;
+ //zero middle 8K
+ memset(buf + len / 2 - block_size, 0, block_size * 2);
+ inject_read_zeros--;
+ }
+ }
+ //make a check if there is a block with all 0
+ uint64_t to_check_len = len;
+ const char* data = buf;
+ uint64_t skip = p2nphase(off, block_size);
+ if (skip >= to_check_len) {
+ return r;
+ }
+ to_check_len -= skip;
+ data += skip;
+
+ bool all_zeros = false;
+ while (all_zeros == false && to_check_len >= block_size) {
+ if (mem_is_zero(data, block_size)) {
+ // at least one block is all zeros
+ all_zeros = true;
+ break;
+ }
+ data += block_size;
+ to_check_len -= block_size;
+ }
+ if (all_zeros) {
+ logger->inc(l_bluefs_read_zeros_candidate, 1);
+ std::unique_ptr<char[]> data_reread(new char[len]);
+ r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered);
+ if (r != 0) {
+ return r;
+ }
+ // check if both read gave the same
+ if (memcmp(buf, &data_reread[0], len) != 0) {
+ derr << __func__ << " initial read of " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len
+ << std::dec << ": different then re-read " << dendl;
+ logger->inc(l_bluefs_read_zeros_errors, 1);
+ // second read is probably better
+ memcpy(buf, &data_reread[0], len);
+ }
+ }
+ return r;
+}
+
+int BlueFS::mount()
+{
+ dout(1) << __func__ << dendl;
+
+ int r = _open_super();
+ if (r < 0) {
+ derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+
+ // set volume selector if not provided before/outside
+ if (vselector == nullptr) {
+ vselector.reset(
+ new OriginalVolumeSelector(
+ get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+ }
+
+ block_all.clear();
+ block_all.resize(MAX_BDEV);
+ _init_alloc();
+ _init_logger();
+
+ r = _replay(false, false);
+ if (r < 0) {
+ derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
+ _stop_alloc();
+ goto out;
+ }
+
+ // init freelist
+ for (auto& p : file_map) {
+ dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
+ for (auto& q : p.second->fnode.extents) {
+ alloc[q.bdev]->init_rm_free(q.offset, q.length);
+ }
+ }
+
+ // set up the log for future writes
+ log_writer = _create_writer(_get_file(1));
+ ceph_assert(log_writer->file->fnode.ino == 1);
+ log_writer->pos = log_writer->file->fnode.size;
+ dout(10) << __func__ << " log write pos set to 0x"
+ << std::hex << log_writer->pos << std::dec
+ << dendl;
+
+ return 0;
+
+ out:
+ super = bluefs_super_t();
+ return r;
+}
+
+void BlueFS::umount(bool avoid_compact)
+{
+ dout(1) << __func__ << dendl;
+
+ sync_metadata(avoid_compact);
+
+ _close_writer(log_writer);
+ log_writer = NULL;
+
+ vselector.reset(nullptr);
+ _stop_alloc();
+ file_map.clear();
+ dir_map.clear();
+ super = bluefs_super_t();
+ log_t.clear();
+ _shutdown_logger();
+}
+
+int BlueFS::prepare_new_device(int id)
+{
+ dout(1) << __func__ << dendl;
+
+ if(id == BDEV_NEWDB) {
+ int new_log_dev_cur = BDEV_WAL;
+ int new_log_dev_next = BDEV_WAL;
+ if (!bdev[BDEV_WAL]) {
+ new_log_dev_cur = BDEV_NEWDB;
+ new_log_dev_next = BDEV_DB;
+ }
+ _rewrite_log_sync(false,
+ BDEV_NEWDB,
+ new_log_dev_cur,
+ new_log_dev_next,
+ RENAME_DB2SLOW);
+ //}
+ } else if(id == BDEV_NEWWAL) {
+ _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL);
+ } else {
+ assert(false);
+ }
+ return 0;
+}
+
+void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
+{
+ if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
+ bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
+ if (bdev[BDEV_WAL])
+ bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
+}
+
+void BlueFS::get_devices(set<string> *ls)
+{
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ bdev[i]->get_devices(ls);
+ }
+ }
+}
+
+int BlueFS::fsck()
+{
+ std::lock_guard l(lock);
+ dout(1) << __func__ << dendl;
+ // hrm, i think we check everything on mount...
+ return 0;
+}
+
+int BlueFS::_write_super(int dev)
+{
+ // build superblock
+ bufferlist bl;
+ encode(super, bl);
+ uint32_t crc = bl.crc32c(-1);
+ encode(crc, bl);
+ dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
+ dout(10) << __func__ << " superblock " << super.version << dendl;
+ dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+ ceph_assert(bl.length() <= get_super_length());
+ bl.append_zero(get_super_length() - bl.length());
+
+ bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
+ dout(20) << __func__ << " v " << super.version
+ << " crc 0x" << std::hex << crc
+ << " offset 0x" << get_super_offset() << std::dec
+ << dendl;
+ return 0;
+}
+
+int BlueFS::_open_super()
+{
+ dout(10) << __func__ << dendl;
+
+ bufferlist bl;
+ uint32_t expected_crc, crc;
+ int r;
+
+ // always the second block
+ r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
+ &bl, ioc[BDEV_DB], false);
+ if (r < 0)
+ return r;
+
+ auto p = bl.cbegin();
+ decode(super, p);
+ {
+ bufferlist t;
+ t.substr_of(bl, 0, p.get_off());
+ crc = t.crc32c(-1);
+ }
+ decode(expected_crc, p);
+ if (crc != expected_crc) {
+ derr << __func__ << " bad crc on superblock, expected 0x"
+ << std::hex << expected_crc << " != actual 0x" << crc << std::dec
+ << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " superblock " << super.version << dendl;
+ dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+ return 0;
+}
+
+int BlueFS::_replay(bool noop, bool to_stdout)
+{
+ dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
+ ino_last = 1; // by the log
+ log_seq = 0;
+
+ FileRef log_file;
+ log_file = _get_file(1);
+ if (!noop) {
+ log_file->fnode = super.log_fnode;
+ log_file->vselector_hint =
+ vselector->get_hint_by_device(BDEV_WAL);
+ } else {
+ // do not use fnode from superblock in 'noop' mode - log_file's one should
+ // be fine and up-to-date
+ ceph_assert(log_file->fnode.ino == 1);
+ ceph_assert(log_file->fnode.extents.size() != 0);
+ }
+ dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " log_fnode " << super.log_fnode << std::endl;
+ }
+
+ FileReader *log_reader = new FileReader(
+ log_file, cct->_conf->bluefs_max_prefetch,
+ false, // !random
+ true); // ignore eof
+ while (true) {
+ ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
+ uint64_t pos = log_reader->buf.pos;
+ uint64_t read_pos = pos;
+ bufferlist bl;
+ {
+ int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
+ &bl, NULL);
+ if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
+ r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
+ }
+ assert(r == (int)super.block_size);
+ read_pos += r;
+ }
+ uint64_t more = 0;
+ uint64_t seq;
+ uuid_d uuid;
+ {
+ auto p = bl.cbegin();
+ __u8 a, b;
+ uint32_t len;
+ decode(a, p);
+ decode(b, p);
+ decode(len, p);
+ decode(uuid, p);
+ decode(seq, p);
+ if (len + 6 > bl.length()) {
+ more = round_up_to(len + 6 - bl.length(), super.block_size);
+ }
+ }
+ if (uuid != super.uuid) {
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
+ << dendl;
+ break;
+ }
+ if (seq != log_seq + 1) {
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: seq " << seq << " != expected " << log_seq + 1
+ << dendl;
+ break;
+ }
+ if (more) {
+ dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
+ << " more bytes" << dendl;
+ bufferlist t;
+ int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
+ if (r < (int)more) {
+ dout(10) << __func__ << " 0x" << std::hex << pos
+ << ": stop: len is 0x" << bl.length() + more << std::dec
+ << ", which is past eof" << dendl;
+ if (cct->_conf->bluefs_replay_recovery) {
+ //try to search for more data
+ r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
+ if (r < (int)more) {
+ //in normal mode we must read r==more, for recovery it is too strict
+ break;
+ }
+ }
+ }
+ ceph_assert(r == (int)more);
+ bl.claim_append(t);
+ read_pos += r;
+ }
+ bluefs_transaction_t t;
+ try {
+ auto p = bl.cbegin();
+ decode(t, p);
+ }
+ catch (buffer::error& e) {
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: failed to decode: " << e.what()
+ << dendl;
+ delete log_reader;
+ return -EIO;
+ }
+ ceph_assert(seq == t.seq);
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": " << t << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": " << t << std::endl;
+ }
+
+ auto p = t.op_bl.cbegin();
+ while (!p.end()) {
+ __u8 op;
+ decode(op, p);
+ switch (op) {
+
+ case bluefs_transaction_t::OP_INIT:
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_init" << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_init" << std::endl;
+ }
+
+ ceph_assert(t.seq == 1);
+ break;
+
+ case bluefs_transaction_t::OP_JUMP:
+ {
+ uint64_t next_seq;
+ uint64_t offset;
+ decode(next_seq, p);
+ decode(offset, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_jump seq " << next_seq
+ << " offset 0x" << std::hex << offset << std::dec << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_jump seq " << next_seq
+ << " offset 0x" << std::hex << offset << std::dec
+ << std::endl;
+ }
+
+ ceph_assert(next_seq >= log_seq);
+ log_seq = next_seq - 1; // we will increment it below
+ uint64_t skip = offset - read_pos;
+ if (skip) {
+ bufferlist junk;
+ int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
+ NULL);
+ if (r != (int)skip) {
+ dout(10) << __func__ << " 0x" << std::hex << read_pos
+ << ": stop: failed to skip to " << offset
+ << std::dec << dendl;
+ ceph_abort_msg("problem with op_jump");
+ }
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_JUMP_SEQ:
+ {
+ uint64_t next_seq;
+ decode(next_seq, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_jump_seq " << next_seq << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_jump_seq " << next_seq << std::endl;
+ }
+
+ ceph_assert(next_seq >= log_seq);
+ log_seq = next_seq - 1; // we will increment it below
+ }
+ break;
+
+ case bluefs_transaction_t::OP_ALLOC_ADD:
+ {
+ __u8 id;
+ uint64_t offset, length;
+ decode(id, p);
+ decode(offset, p);
+ decode(length, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_alloc_add " << " " << (int)id
+ << ":0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_alloc_add " << " " << (int)id
+ << ":0x" << std::hex << offset << "~" << length << std::dec
+ << std::endl;
+ }
+
+ if (!noop) {
+ block_all[id].insert(offset, length);
+ alloc[id]->init_add_free(offset, length);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_ALLOC_RM:
+ {
+ __u8 id;
+ uint64_t offset, length;
+ decode(id, p);
+ decode(offset, p);
+ decode(length, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_alloc_rm " << " " << (int)id
+ << ":0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_alloc_rm " << " " << (int)id
+ << ":0x" << std::hex << offset << "~" << length << std::dec
+ << std::endl;
+ }
+
+ if (!noop) {
+ block_all[id].erase(offset, length);
+ alloc[id]->init_rm_free(offset, length);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_LINK:
+ {
+ string dirname, filename;
+ uint64_t ino;
+ decode(dirname, p);
+ decode(filename, p);
+ decode(ino, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_link " << " " << dirname << "/" << filename
+ << " to " << ino
+ << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_link " << " " << dirname << "/" << filename
+ << " to " << ino
+ << std::endl;
+ }
+
+ if (!noop) {
+ FileRef file = _get_file(ino);
+ ceph_assert(file->fnode.ino);
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q != dir_map.end());
+ map<string,FileRef>::iterator r = q->second->file_map.find(filename);
+ ceph_assert(r == q->second->file_map.end());
+
+ vselector->sub_usage(file->vselector_hint, file->fnode);
+ file->vselector_hint =
+ vselector->get_hint_by_dir(dirname);
+ vselector->add_usage(file->vselector_hint, file->fnode);
+
+ q->second->file_map[filename] = file;
+ ++file->refs;
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_UNLINK:
+ {
+ string dirname, filename;
+ decode(dirname, p);
+ decode(filename, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_unlink " << " " << dirname << "/" << filename
+ << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_unlink " << " " << dirname << "/" << filename
+ << std::endl;
+ }
+
+ if (!noop) {
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q != dir_map.end());
+ map<string,FileRef>::iterator r = q->second->file_map.find(filename);
+ ceph_assert(r != q->second->file_map.end());
+ ceph_assert(r->second->refs > 0);
+ --r->second->refs;
+ q->second->file_map.erase(r);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_CREATE:
+ {
+ string dirname;
+ decode(dirname, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_create " << dirname << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_create " << dirname << std::endl;
+ }
+
+ if (!noop) {
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q == dir_map.end());
+ dir_map[dirname] = new Dir;
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_REMOVE:
+ {
+ string dirname;
+ decode(dirname, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_remove " << dirname << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_remove " << dirname << std::endl;
+ }
+
+ if (!noop) {
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q != dir_map.end());
+ ceph_assert(q->second->file_map.empty());
+ dir_map.erase(q);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_FILE_UPDATE:
+ {
+ bluefs_fnode_t fnode;
+ decode(fnode, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update " << " " << fnode << " " << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update " << " " << fnode << std::endl;
+ }
+
+ if (!noop) {
+ FileRef f = _get_file(fnode.ino);
+ if (fnode.ino != 1) {
+ vselector->sub_usage(f->vselector_hint, f->fnode);
+ }
+ f->fnode = fnode;
+ if (fnode.ino != 1) {
+ vselector->add_usage(f->vselector_hint, f->fnode);
+ }
+
+ if (fnode.ino > ino_last) {
+ ino_last = fnode.ino;
+ }
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_FILE_REMOVE:
+ {
+ uint64_t ino;
+ decode(ino, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_file_remove " << ino << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_file_remove " << ino << std::endl;
+ }
+
+ if (!noop) {
+ auto p = file_map.find(ino);
+ ceph_assert(p != file_map.end());
+ vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
+ file_map.erase(p);
+ }
+ }
+ break;
+
+ default:
+ derr << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: unrecognized op " << (int)op << dendl;
+ delete log_reader;
+ return -EIO;
+ }
+ }
+ ceph_assert(p.end());
+
+ // we successfully replayed the transaction; bump the seq and log size
+ ++log_seq;
+ log_file->fnode.size = log_reader->buf.pos;
+ }
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+
+ dout(10) << __func__ << " log file size was 0x"
+ << std::hex << log_file->fnode.size << std::dec << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " log file size was 0x"
+ << std::hex << log_file->fnode.size << std::dec << std::endl;
+ }
+
+ delete log_reader;
+
+ if (!noop) {
+ // verify file link counts are all >0
+ for (auto& p : file_map) {
+ if (p.second->refs == 0 &&
+ p.second->fnode.ino > 1) {
+ derr << __func__ << " file with link count 0: " << p.second->fnode
+ << dendl;
+ return -EIO;
+ }
+ }
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+ return 0;
+}
+
+int BlueFS::log_dump()
+{
+ // only dump log file's content
+ int r = _replay(true, true);
+ if (r < 0) {
+ derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int BlueFS::device_migrate_to_existing(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target)
+{
+ vector<byte> buf;
+ bool buffered = cct->_conf->bluefs_buffered_io;
+
+ dout(10) << __func__ << " devs_source " << devs_source
+ << " dev_target " << dev_target << dendl;
+ assert(dev_target < (int)MAX_BDEV);
+
+ int flags = 0;
+ flags |= devs_source.count(BDEV_DB) ?
+ (REMOVE_DB | RENAME_SLOW2DB) : 0;
+ flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+ int dev_target_new = dev_target;
+
+ // Slow device without separate DB one is addressed via BDEV_DB
+ // Hence need renaming.
+ if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
+ dev_target_new = BDEV_DB;
+ dout(0) << __func__ << " super to be written to " << dev_target << dendl;
+ }
+
+ for (auto& p : file_map) {
+ //do not copy log
+ if (p.second->fnode.ino == 1) {
+ continue;
+ }
+ dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
+
+ auto& fnode_extents = p.second->fnode.extents;
+
+ bool rewrite = false;
+ for (auto ext_it = fnode_extents.begin();
+ ext_it != p.second->fnode.extents.end();
+ ++ext_it) {
+ if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
+ rewrite = true;
+ break;
+ }
+ }
+ if (rewrite) {
+ dout(10) << __func__ << " migrating" << dendl;
+
+ // read entire file
+ bufferlist bl;
+ for (auto old_ext : fnode_extents) {
+ buf.resize(old_ext.length);
+ int r = bdev[old_ext.bdev]->read_random(
+ old_ext.offset,
+ old_ext.length,
+ (char*)&buf.at(0),
+ buffered);
+ if (r != 0) {
+ derr << __func__ << " failed to read 0x" << std::hex
+ << old_ext.offset << "~" << old_ext.length << std::dec
+ << " from " << (int)dev_target << dendl;
+ return -EIO;
+ }
+ bl.append((char*)&buf[0], old_ext.length);
+ }
+
+ // write entire file
+ PExtentVector extents;
+ auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+ if (l < 0) {
+ derr << __func__ << " unable to allocate len 0x" << std::hex
+ << bl.length() << std::dec << " from " << (int)dev_target
+ << ": " << cpp_strerror(l) << dendl;
+ return -ENOSPC;
+ }
+
+ uint64_t off = 0;
+ for (auto& i : extents) {
+ bufferlist cur;
+ uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
+ ceph_assert(cur_len > 0);
+ cur.substr_of(bl, off, cur_len);
+ int r = bdev[dev_target]->write(i.offset, cur, buffered);
+ ceph_assert(r == 0);
+ off += cur_len;
+ }
+
+ // release old extents
+ for (auto old_ext : fnode_extents) {
+ PExtentVector to_release;
+ to_release.emplace_back(old_ext.offset, old_ext.length);
+ alloc[old_ext.bdev]->release(to_release);
+ }
+
+ // update fnode
+ fnode_extents.clear();
+ for (auto& i : extents) {
+ fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+ }
+ } else {
+ for (auto ext_it = fnode_extents.begin();
+ ext_it != p.second->fnode.extents.end();
+ ++ext_it) {
+ if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
+ dout(20) << __func__ << " " << " ... adjusting extent 0x"
+ << std::hex << ext_it->offset << std::dec
+ << " bdev " << dev_target << " -> " << dev_target_new
+ << dendl;
+ ext_it->bdev = dev_target_new;
+ }
+ }
+ }
+ }
+ // new logging device in the current naming scheme
+ int new_log_dev_cur = bdev[BDEV_WAL] ?
+ BDEV_WAL :
+ bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
+
+ // new logging device in new naming scheme
+ int new_log_dev_next = new_log_dev_cur;
+
+ if (devs_source.count(new_log_dev_cur)) {
+ // SLOW device is addressed via BDEV_DB too hence either WAL or DB
+ new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
+ BDEV_DB :
+ BDEV_WAL;
+
+ dout(0) << __func__ << " log moved from " << new_log_dev_cur
+ << " to " << new_log_dev_next << dendl;
+
+ new_log_dev_cur =
+ (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
+ BDEV_SLOW :
+ new_log_dev_next;
+ }
+
+ _rewrite_log_sync(
+ false,
+ (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
+ new_log_dev_cur,
+ new_log_dev_next,
+ flags);
+ return 0;
+}
+
+int BlueFS::device_migrate_to_new(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target)
+{
+ vector<byte> buf;
+ bool buffered = cct->_conf->bluefs_buffered_io;
+
+ dout(10) << __func__ << " devs_source " << devs_source
+ << " dev_target " << dev_target << dendl;
+ assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
+
+ int flags = 0;
+
+ flags |= devs_source.count(BDEV_DB) ?
+ (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
+ 0;
+ flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+ int dev_target_new = dev_target; //FIXME: remove, makes no sense
+
+ for (auto& p : file_map) {
+ //do not copy log
+ if (p.second->fnode.ino == 1) {
+ continue;
+ }
+ dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
+
+ auto& fnode_extents = p.second->fnode.extents;
+
+ bool rewrite = false;
+ for (auto ext_it = fnode_extents.begin();
+ ext_it != p.second->fnode.extents.end();
+ ++ext_it) {
+ if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
+ rewrite = true;
+ break;
+ }
+ }
+ if (rewrite) {
+ dout(10) << __func__ << " migrating" << dendl;
+
+ // read entire file
+ bufferlist bl;
+ for (auto old_ext : fnode_extents) {
+ buf.resize(old_ext.length);
+ int r = bdev[old_ext.bdev]->read_random(
+ old_ext.offset,
+ old_ext.length,
+ (char*)&buf.at(0),
+ buffered);
+ if (r != 0) {
+ derr << __func__ << " failed to read 0x" << std::hex
+ << old_ext.offset << "~" << old_ext.length << std::dec
+ << " from " << (int)dev_target << dendl;
+ return -EIO;
+ }
+ bl.append((char*)&buf[0], old_ext.length);
+ }
+
+ // write entire file
+ PExtentVector extents;
+ auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+ if (l < 0) {
+ derr << __func__ << " unable to allocate len 0x" << std::hex
+ << bl.length() << std::dec << " from " << (int)dev_target
+ << ": " << cpp_strerror(l) << dendl;
+ return -ENOSPC;
+ }
+
+ uint64_t off = 0;
+ for (auto& i : extents) {
+ bufferlist cur;
+ uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
+ ceph_assert(cur_len > 0);
+ cur.substr_of(bl, off, cur_len);
+ int r = bdev[dev_target]->write(i.offset, cur, buffered);
+ ceph_assert(r == 0);
+ off += cur_len;
+ }
+
+ // release old extents
+ for (auto old_ext : fnode_extents) {
+ PExtentVector to_release;
+ to_release.emplace_back(old_ext.offset, old_ext.length);
+ alloc[old_ext.bdev]->release(to_release);
+ }
+
+ // update fnode
+ fnode_extents.clear();
+ for (auto& i : extents) {
+ fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+ }
+ }
+ }
+ // new logging device in the current naming scheme
+ int new_log_dev_cur =
+ bdev[BDEV_NEWWAL] ?
+ BDEV_NEWWAL :
+ bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
+ BDEV_WAL :
+ bdev[BDEV_NEWDB] ?
+ BDEV_NEWDB :
+ bdev[BDEV_DB] && !(flags & REMOVE_DB)?
+ BDEV_DB :
+ BDEV_SLOW;
+
+ // new logging device in new naming scheme
+ int new_log_dev_next =
+ new_log_dev_cur == BDEV_NEWWAL ?
+ BDEV_WAL :
+ new_log_dev_cur == BDEV_NEWDB ?
+ BDEV_DB :
+ new_log_dev_cur;
+
+ int super_dev =
+ dev_target == BDEV_NEWDB ?
+ BDEV_NEWDB :
+ bdev[BDEV_DB] ?
+ BDEV_DB :
+ BDEV_SLOW;
+
+ _rewrite_log_sync(
+ false,
+ super_dev,
+ new_log_dev_cur,
+ new_log_dev_next,
+ flags);
+ return 0;
+}
+
+BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
+{
+ auto p = file_map.find(ino);
+ if (p == file_map.end()) {
+ FileRef f = new File;
+ file_map[ino] = f;
+ dout(30) << __func__ << " ino " << ino << " = " << f
+ << " (new)" << dendl;
+ return f;
+ } else {
+ dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
+ return p->second;
+ }
+}
+
+void BlueFS::_drop_link(FileRef file)
+{
+ dout(20) << __func__ << " had refs " << file->refs
+ << " on " << file->fnode << dendl;
+ ceph_assert(file->refs > 0);
+ --file->refs;
+ if (file->refs == 0) {
+ dout(20) << __func__ << " destroying " << file->fnode << dendl;
+ ceph_assert(file->num_reading.load() == 0);
+ vselector->sub_usage(file->vselector_hint, file->fnode);
+ log_t.op_file_remove(file->fnode.ino);
+ for (auto& r : file->fnode.extents) {
+ pending_release[r.bdev].insert(r.offset, r.length);
+ }
+ file_map.erase(file->fnode.ino);
+ file->deleted = true;
+
+ if (file->dirty_seq) {
+ ceph_assert(file->dirty_seq > log_seq_stable);
+ ceph_assert(dirty_files.count(file->dirty_seq));
+ auto it = dirty_files[file->dirty_seq].iterator_to(*file);
+ dirty_files[file->dirty_seq].erase(it);
+ file->dirty_seq = 0;
+ }
+ }
+}
+
+int64_t BlueFS::_read_random(
+ FileReader *h, ///< [in] read from here
+ uint64_t off, ///< [in] offset
+ size_t len, ///< [in] this many bytes
+ char *out) ///< [out] optional: or copy it here
+{
+ auto* buf = &h->buf;
+
+ int64_t ret = 0;
+ dout(10) << __func__ << " h " << h
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " from " << h->file->fnode << dendl;
+
+ ++h->file->num_reading;
+
+ if (!h->ignore_eof &&
+ off + len > h->file->fnode.size) {
+ if (off > h->file->fnode.size)
+ len = 0;
+ else
+ len = h->file->fnode.size - off;
+ dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
+ << std::hex << len << std::dec << dendl;
+ }
+ logger->inc(l_bluefs_read_random_count, 1);
+ logger->inc(l_bluefs_read_random_bytes, len);
+
+ std::shared_lock s_lock(h->lock);
+ buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
+ while (len > 0) {
+ if (off < buf->bl_off || off >= buf->get_buf_end()) {
+ s_lock.unlock();
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(off, &x_off);
+ ceph_assert(p != h->file->fnode.extents.end());
+ uint64_t l = std::min(p->length - x_off, static_cast<uint64_t>(len));
+ //hard cap to 1GB
+ l = std::min(l, uint64_t(1) << 30);
+ dout(20) << __func__ << " read random 0x"
+ << std::hex << x_off << "~" << l << std::dec
+ << " of " << *p << dendl;
+ int r;
+ if (!cct->_conf->bluefs_check_for_zeros) {
+ r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
+ cct->_conf->bluefs_buffered_io);
+ } else {
+ r = read_random(p->bdev, p->offset + x_off, l, out,
+ cct->_conf->bluefs_buffered_io);
+ }
+ ceph_assert(r == 0);
+ off += l;
+ len -= l;
+ ret += l;
+ out += l;
+
+ logger->inc(l_bluefs_read_random_disk_count, 1);
+ logger->inc(l_bluefs_read_random_disk_bytes, l);
+ if (len > 0) {
+ s_lock.lock();
+ }
+ } else {
+ auto left = buf->get_buf_remaining(off);
+ int64_t r = std::min(len, left);
+ logger->inc(l_bluefs_read_random_buffer_count, 1);
+ logger->inc(l_bluefs_read_random_buffer_bytes, r);
+ dout(20) << __func__ << " left 0x" << std::hex << left
+ << " 0x" << off << "~" << len << std::dec
+ << dendl;
+
+ if (out) {
+ // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
+ memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
+ out += r;
+ }
+
+ dout(30) << __func__ << " result chunk (0x"
+ << std::hex << r << std::dec << " bytes):\n";
+ bufferlist t;
+ t.substr_of(buf->bl, off - buf->bl_off, r);
+ t.hexdump(*_dout);
+ *_dout << dendl;
+
+ off += r;
+ len -= r;
+ ret += r;
+ buf->pos += r;
+ }
+ }
+ dout(20) << __func__ << " got " << ret << dendl;
+ --h->file->num_reading;
+ return ret;
+}
+
+int64_t BlueFS::_read(
+ FileReader *h, ///< [in] read from here
+ FileReaderBuffer *buf, ///< [in] reader state
+ uint64_t off, ///< [in] offset
+ size_t len, ///< [in] this many bytes
+ bufferlist *outbl, ///< [out] optional: reference the result here
+ char *out) ///< [out] optional: or copy it here
+{
+ bool prefetch = !outbl && !out;
+ dout(10) << __func__ << " h " << h
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " from " << h->file->fnode
+ << (prefetch ? " prefetch" : "")
+ << dendl;
+
+ ++h->file->num_reading;
+
+ if (!h->ignore_eof &&
+ off + len > h->file->fnode.size) {
+ if (off > h->file->fnode.size)
+ len = 0;
+ else
+ len = h->file->fnode.size - off;
+ dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
+ << std::hex << len << std::dec << dendl;
+ }
+ logger->inc(l_bluefs_read_count, 1);
+ logger->inc(l_bluefs_read_bytes, len);
+ if (prefetch) {
+ logger->inc(l_bluefs_read_prefetch_count, 1);
+ logger->inc(l_bluefs_read_prefetch_bytes, len);
+ }
+
+ if (outbl)
+ outbl->clear();
+
+ int64_t ret = 0;
+ std::shared_lock s_lock(h->lock);
+ while (len > 0) {
+ size_t left;
+ if (off < buf->bl_off || off >= buf->get_buf_end()) {
+ s_lock.unlock();
+ std::unique_lock u_lock(h->lock);
+ buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
+ if (off < buf->bl_off || off >= buf->get_buf_end()) {
+ // if precondition hasn't changed during locking upgrade.
+ buf->bl.clear();
+ buf->bl_off = off & super.block_mask();
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(buf->bl_off, &x_off);
+ if (p == h->file->fnode.extents.end()) {
+ dout(5) << __func__ << " reading less then required "
+ << ret << "<" << ret + len << dendl;
+ break;
+ }
+
+ uint64_t want = round_up_to(len + (off & ~super.block_mask()),
+ super.block_size);
+ want = std::max(want, buf->max_prefetch);
+ uint64_t l = std::min(p->length - x_off, want);
+ //hard cap to 1GB
+ l = std::min(l, uint64_t(1) << 30);
+ uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
+ if (!h->ignore_eof &&
+ buf->bl_off + l > eof_offset) {
+ l = eof_offset - buf->bl_off;
+ }
+ dout(20) << __func__ << " fetching 0x"
+ << std::hex << x_off << "~" << l << std::dec
+ << " of " << *p << dendl;
+ int r;
+ if (!cct->_conf->bluefs_check_for_zeros) {
+ r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+ cct->_conf->bluefs_buffered_io);
+ } else {
+ r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+ cct->_conf->bluefs_buffered_io);
+ }
+ ceph_assert(r == 0);
+ }
+ u_lock.unlock();
+ s_lock.lock();
+ // we should recheck if buffer is valid after lock downgrade
+ continue;
+ }
+ left = buf->get_buf_remaining(off);
+ dout(20) << __func__ << " left 0x" << std::hex << left
+ << " len 0x" << len << std::dec << dendl;
+
+ int64_t r = std::min(len, left);
+ if (outbl) {
+ bufferlist t;
+ t.substr_of(buf->bl, off - buf->bl_off, r);
+ outbl->claim_append(t);
+ }
+ if (out) {
+ // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
+ memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
+ out += r;
+ }
+
+ dout(30) << __func__ << " result chunk (0x"
+ << std::hex << r << std::dec << " bytes):\n";
+ bufferlist t;
+ t.substr_of(buf->bl, off - buf->bl_off, r);
+ t.hexdump(*_dout);
+ *_dout << dendl;
+
+ off += r;
+ len -= r;
+ ret += r;
+ buf->pos += r;
+ }
+ dout(20) << __func__ << " got " << ret << dendl;
+ ceph_assert(!outbl || (int)outbl->length() == ret);
+ --h->file->num_reading;
+ return ret;
+}
+
+void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
+{
+ dout(10) << __func__ << " file " << f->fnode
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (offset & ~super.block_mask()) {
+ offset &= super.block_mask();
+ length = round_up_to(length, super.block_size);
+ }
+ uint64_t x_off = 0;
+ auto p = f->fnode.seek(offset, &x_off);
+ while (length > 0 && p != f->fnode.extents.end()) {
+ uint64_t x_len = std::min(p->length - x_off, length);
+ bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
+ dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
+ << std:: dec << " of " << *p << dendl;
+ offset += x_len;
+ length -= x_len;
+ }
+}
+
+uint64_t BlueFS::_estimate_log_size()
+{
+ int avg_dir_size = 40; // fixme
+ int avg_file_size = 12;
+ uint64_t size = 4096 * 2;
+ size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
+ for (auto& p : block_all)
+ size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
+ size += dir_map.size() + (1 + avg_dir_size);
+ size += file_map.size() * (1 + avg_dir_size + avg_file_size);
+ return round_up_to(size, super.block_size);
+}
+
+void BlueFS::compact_log()
+{
+ std::unique_lock<ceph::mutex> l(lock);
+ if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
+ if (cct->_conf->bluefs_compact_log_sync) {
+ _compact_log_sync();
+ } else {
+ _compact_log_async(l);
+ }
+ }
+}
+
+bool BlueFS::_should_compact_log()
+{
+ uint64_t current = log_writer->file->fnode.size;
+ uint64_t expected = _estimate_log_size();
+ float ratio = (float)current / (float)expected;
+ dout(10) << __func__ << " current 0x" << std::hex << current
+ << " expected " << expected << std::dec
+ << " ratio " << ratio
+ << (new_log ? " (async compaction in progress)" : "")
+ << dendl;
+ if (new_log ||
+ current < cct->_conf->bluefs_log_compact_min_size ||
+ ratio < cct->_conf->bluefs_log_compact_min_ratio) {
+ return false;
+ }
+ return true;
+}
+
+void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
+ int flags)
+{
+ t->seq = 1;
+ t->uuid = super.uuid;
+ dout(20) << __func__ << " op_init" << dendl;
+
+ t->op_init();
+ for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
+ interval_set<uint64_t>& p = block_all[bdev];
+ for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
+ auto bdev_new = bdev;
+ if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
+ continue;
+ }
+ if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
+ continue;
+ }
+ if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+ bdev_new = BDEV_DB;
+ }
+ if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+ bdev_new = BDEV_SLOW;
+ }
+ if (bdev == BDEV_NEWDB) {
+ // REMOVE_DB xor RENAME_DB
+ ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+ ceph_assert(!(flags & RENAME_SLOW2DB));
+ bdev_new = BDEV_DB;
+ }
+ if (bdev == BDEV_NEWWAL) {
+ ceph_assert(flags & REMOVE_WAL);
+ bdev_new = BDEV_WAL;
+ }
+ dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
+ << std::hex << q.get_start() << "~" << q.get_len() << std::dec
+ << dendl;
+ t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
+ }
+ }
+ for (auto& p : file_map) {
+ if (p.first == 1)
+ continue;
+ ceph_assert(p.first > 1);
+
+ for(auto& e : p.second->fnode.extents) {
+ auto bdev = e.bdev;
+ auto bdev_new = bdev;
+ ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
+ if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+ bdev_new = BDEV_DB;
+ }
+ if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+ bdev_new = BDEV_SLOW;
+ }
+ if (bdev == BDEV_NEWDB) {
+ // REMOVE_DB xor RENAME_DB
+ ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+ ceph_assert(!(flags & RENAME_SLOW2DB));
+ bdev_new = BDEV_DB;
+ }
+ if (bdev == BDEV_NEWWAL) {
+ ceph_assert(flags & REMOVE_WAL);
+ bdev_new = BDEV_WAL;
+ }
+ e.bdev = bdev_new;
+ }
+ dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
+ t->op_file_update(p.second->fnode);
+ }
+ for (auto& p : dir_map) {
+ dout(20) << __func__ << " op_dir_create " << p.first << dendl;
+ t->op_dir_create(p.first);
+ for (auto& q : p.second->file_map) {
+ dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
+ << " to " << q.second->fnode.ino << dendl;
+ t->op_dir_link(p.first, q.first, q.second->fnode.ino);
+ }
+ }
+}
+
+void BlueFS::_compact_log_sync()
+{
+ dout(10) << __func__ << dendl;
+ auto prefer_bdev =
+ vselector->select_prefer_bdev(log_writer->file->vselector_hint);
+ _rewrite_log_sync(true,
+ BDEV_DB,
+ prefer_bdev,
+ prefer_bdev,
+ 0);
+ logger->inc(l_bluefs_log_compactions);
+}
+
+void BlueFS::_rewrite_log_sync(bool allocate_with_fallback,
+ int super_dev,
+ int log_dev,
+ int log_dev_new,
+ int flags)
+{
+ File *log_file = log_writer->file.get();
+
+ // clear out log (be careful who calls us!!!)
+ log_t.clear();
+
+ dout(20) << __func__ << " super_dev:" << super_dev
+ << " log_dev:" << log_dev
+ << " log_dev_new:" << log_dev_new
+ << " flags:" << flags
+ << dendl;
+ bluefs_transaction_t t;
+ _compact_log_dump_metadata(&t, flags);
+
+ dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
+ t.op_jump_seq(log_seq);
+
+ bufferlist bl;
+ encode(t, bl);
+ _pad_bl(bl);
+
+ uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
+ dout(20) << __func__ << " need " << need << dendl;
+
+ bluefs_fnode_t old_fnode;
+ int r;
+ log_file->fnode.swap_extents(old_fnode);
+ if (allocate_with_fallback) {
+ r = _allocate(log_dev, need, &log_file->fnode);
+ ceph_assert(r == 0);
+ } else {
+ PExtentVector extents;
+ r = _allocate_without_fallback(log_dev,
+ need,
+ &extents);
+ ceph_assert(r == 0);
+ for (auto& p : extents) {
+ log_file->fnode.append_extent(
+ bluefs_extent_t(log_dev, p.offset, p.length));
+ }
+ }
+
+ _close_writer(log_writer);
+
+ log_file->fnode.size = bl.length();
+ vselector->sub_usage(log_file->vselector_hint, old_fnode);
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+
+ log_writer = _create_writer(log_file);
+ log_writer->append(bl);
+ r = _flush(log_writer, true);
+ ceph_assert(r == 0);
+#ifdef HAVE_LIBAIO
+ if (!cct->_conf->bluefs_sync_write) {
+ list<aio_t> completed_ios;
+ _claim_completed_aios(log_writer, &completed_ios);
+ wait_for_aio(log_writer);
+ completed_ios.clear();
+ }
+#endif
+ flush_bdev();
+
+ super.log_fnode = log_file->fnode;
+ // rename device if needed
+ if (log_dev != log_dev_new) {
+ dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
+ for (auto& p : super.log_fnode.extents) {
+ p.bdev = log_dev_new;
+ }
+ }
+ dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
+
+ ++super.version;
+ _write_super(super_dev);
+ flush_bdev();
+
+ dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
+ for (auto& r : old_fnode.extents) {
+ pending_release[r.bdev].insert(r.offset, r.length);
+ }
+}
+
+/*
+ * 1. Allocate a new extent to continue the log, and then log an event
+ * that jumps the log write position to the new extent. At this point, the
+ * old extent(s) won't be written to, and reflect everything to compact.
+ * New events will be written to the new region that we'll keep.
+ *
+ * 2. While still holding the lock, encode a bufferlist that dumps all of the
+ * in-memory fnodes and names. This will become the new beginning of the
+ * log. The last event will jump to the log continuation extent from #1.
+ *
+ * 3. Queue a write to a new extent for the new beginnging of the log.
+ *
+ * 4. Drop lock and wait
+ *
+ * 5. Retake the lock.
+ *
+ * 6. Update the log_fnode to splice in the new beginning.
+ *
+ * 7. Write the new superblock.
+ *
+ * 8. Release the old log space. Clean up.
+ */
+void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
+{
+ dout(10) << __func__ << dendl;
+ File *log_file = log_writer->file.get();
+ ceph_assert(!new_log);
+ ceph_assert(!new_log_writer);
+
+ // create a new log [writer] so that we know compaction is in progress
+ // (see _should_compact_log)
+ new_log = new File;
+ new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
+
+ // 0. wait for any racing flushes to complete. (We do not want to block
+ // in _flush_sync_log with jump_to set or else a racing thread might flush
+ // our entries and our jump_to update won't be correct.)
+ while (log_flushing) {
+ dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
+ log_cond.wait(l);
+ }
+
+ vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
+
+ // 1. allocate new log space and jump to it.
+ old_log_jump_to = log_file->fnode.get_allocated();
+ dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
+ << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
+ int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+ cct->_conf->bluefs_max_log_runway,
+ &log_file->fnode);
+ ceph_assert(r == 0);
+ //adjust usage as flush below will need it
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+ dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+
+ // update the log file change and log a jump to the offset where we want to
+ // write the new entries
+ log_t.op_file_update(log_file->fnode);
+ log_t.op_jump(log_seq, old_log_jump_to);
+
+ flush_bdev(); // FIXME?
+
+ _flush_and_sync_log(l, 0, old_log_jump_to);
+
+ // 2. prepare compacted log
+ bluefs_transaction_t t;
+ //avoid record two times in log_t and _compact_log_dump_metadata.
+ log_t.clear();
+ _compact_log_dump_metadata(&t, 0);
+
+ uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
+ std::max(alloc_size[BDEV_DB],
+ alloc_size[BDEV_SLOW]));
+
+ // conservative estimate for final encoded size
+ new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
+ max_alloc_size);
+ t.op_jump(log_seq, new_log_jump_to);
+
+ // allocate
+ //FIXME: check if we want DB here?
+ r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
+ &new_log->fnode);
+ ceph_assert(r == 0);
+
+ // we might have some more ops in log_t due to _allocate call
+ t.claim_ops(log_t);
+
+ bufferlist bl;
+ encode(t, bl);
+ _pad_bl(bl);
+
+ dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
+ << std::dec << dendl;
+
+ new_log_writer = _create_writer(new_log);
+ new_log_writer->append(bl);
+
+ // 3. flush
+ r = _flush(new_log_writer, true);
+ ceph_assert(r == 0);
+
+ // 4. wait
+ _flush_bdev_safely(new_log_writer);
+
+ // 5. update our log fnode
+ // discard first old_log_jump_to extents
+
+ dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
+ << " of " << log_file->fnode.extents << dendl;
+ uint64_t discarded = 0;
+ mempool::bluefs::vector<bluefs_extent_t> old_extents;
+ while (discarded < old_log_jump_to) {
+ ceph_assert(!log_file->fnode.extents.empty());
+ bluefs_extent_t& e = log_file->fnode.extents.front();
+ bluefs_extent_t temp = e;
+ if (discarded + e.length <= old_log_jump_to) {
+ dout(10) << __func__ << " remove old log extent " << e << dendl;
+ discarded += e.length;
+ log_file->fnode.pop_front_extent();
+ } else {
+ dout(10) << __func__ << " remove front of old log extent " << e << dendl;
+ uint64_t drop = old_log_jump_to - discarded;
+ temp.length = drop;
+ e.offset += drop;
+ e.length -= drop;
+ discarded += drop;
+ dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
+ }
+ old_extents.push_back(temp);
+ }
+ auto from = log_file->fnode.extents.begin();
+ auto to = log_file->fnode.extents.end();
+ while (from != to) {
+ new_log->fnode.append_extent(*from);
+ ++from;
+ }
+
+ vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
+
+ // clear the extents from old log file, they are added to new log
+ log_file->fnode.clear_extents();
+ // swap the log files. New log file is the log file now.
+ new_log->fnode.swap_extents(log_file->fnode);
+
+ log_writer->pos = log_writer->file->fnode.size =
+ log_writer->pos - old_log_jump_to + new_log_jump_to;
+
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+
+ // 6. write the super block to reflect the changes
+ dout(10) << __func__ << " writing super" << dendl;
+ super.log_fnode = log_file->fnode;
+ ++super.version;
+ _write_super(BDEV_DB);
+
+ lock.unlock();
+ flush_bdev();
+ lock.lock();
+
+ // 7. release old space
+ dout(10) << __func__ << " release old log extents " << old_extents << dendl;
+ for (auto& r : old_extents) {
+ pending_release[r.bdev].insert(r.offset, r.length);
+ }
+
+ // delete the new log, remove from the dirty files list
+ _close_writer(new_log_writer);
+ if (new_log->dirty_seq) {
+ ceph_assert(dirty_files.count(new_log->dirty_seq));
+ auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
+ dirty_files[new_log->dirty_seq].erase(it);
+ }
+ new_log_writer = nullptr;
+ new_log = nullptr;
+ log_cond.notify_all();
+
+ dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+ logger->inc(l_bluefs_log_compactions);
+}
+
+void BlueFS::_pad_bl(bufferlist& bl)
+{
+ uint64_t partial = bl.length() % super.block_size;
+ if (partial) {
+ dout(10) << __func__ << " padding with 0x" << std::hex
+ << super.block_size - partial << " zeros" << std::dec << dendl;
+ bl.append_zero(super.block_size - partial);
+ }
+}
+
+
+int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
+ uint64_t want_seq,
+ uint64_t jump_to)
+{
+ while (log_flushing) {
+ dout(10) << __func__ << " want_seq " << want_seq
+ << " log is currently flushing, waiting" << dendl;
+ ceph_assert(!jump_to);
+ log_cond.wait(l);
+ }
+ if (want_seq && want_seq <= log_seq_stable) {
+ dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
+ << log_seq_stable << ", done" << dendl;
+ ceph_assert(!jump_to);
+ return 0;
+ }
+ if (log_t.empty() && dirty_files.empty()) {
+ dout(10) << __func__ << " want_seq " << want_seq
+ << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
+ ceph_assert(!jump_to);
+ return 0;
+ }
+
+ vector<interval_set<uint64_t>> to_release(pending_release.size());
+ to_release.swap(pending_release);
+
+ uint64_t seq = log_t.seq = ++log_seq;
+ ceph_assert(want_seq == 0 || want_seq <= seq);
+ log_t.uuid = super.uuid;
+
+ // log dirty files
+ auto lsi = dirty_files.find(seq);
+ if (lsi != dirty_files.end()) {
+ dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
+ for (auto &f : lsi->second) {
+ dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
+ log_t.op_file_update(f.fnode);
+ }
+ }
+
+ dout(10) << __func__ << " " << log_t << dendl;
+ ceph_assert(!log_t.empty());
+
+ // allocate some more space (before we run out)?
+ int64_t runway = log_writer->file->fnode.get_allocated() -
+ log_writer->get_effective_write_pos();
+ bool just_expanded_log = false;
+ if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
+ dout(10) << __func__ << " allocating more log runway (0x"
+ << std::hex << runway << std::dec << " remaining)" << dendl;
+ while (new_log_writer) {
+ dout(10) << __func__ << " waiting for async compaction" << dendl;
+ log_cond.wait(l);
+ }
+ vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
+ int r = _allocate(
+ vselector->select_prefer_bdev(log_writer->file->vselector_hint),
+ cct->_conf->bluefs_max_log_runway,
+ &log_writer->file->fnode);
+ ceph_assert(r == 0);
+ vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
+ log_t.op_file_update(log_writer->file->fnode);
+ just_expanded_log = true;
+ }
+
+ bufferlist bl;
+ bl.reserve(super.block_size);
+ encode(log_t, bl);
+ // pad to block boundary
+ size_t realign = super.block_size - (bl.length() % super.block_size);
+ if (realign && realign != super.block_size)
+ bl.append_zero(realign);
+
+ logger->inc(l_bluefs_logged_bytes, bl.length());
+
+ if (just_expanded_log) {
+ ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
+ }
+
+ log_writer->append(bl);
+
+ log_t.clear();
+ log_t.seq = 0; // just so debug output is less confusing
+ log_flushing = true;
+
+ int r = _flush(log_writer, true);
+ ceph_assert(r == 0);
+
+ if (jump_to) {
+ dout(10) << __func__ << " jumping log offset from 0x" << std::hex
+ << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
+ log_writer->pos = jump_to;
+ vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
+ log_writer->file->fnode.size = jump_to;
+ vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
+ }
+
+ _flush_bdev_safely(log_writer);
+
+ log_flushing = false;
+ log_cond.notify_all();
+
+ // clean dirty files
+ if (seq > log_seq_stable) {
+ log_seq_stable = seq;
+ dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
+
+ auto p = dirty_files.begin();
+ while (p != dirty_files.end()) {
+ if (p->first > log_seq_stable) {
+ dout(20) << __func__ << " done cleaning up dirty files" << dendl;
+ break;
+ }
+
+ auto l = p->second.begin();
+ while (l != p->second.end()) {
+ File *file = &*l;
+ ceph_assert(file->dirty_seq > 0);
+ ceph_assert(file->dirty_seq <= log_seq_stable);
+ dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
+ file->dirty_seq = 0;
+ p->second.erase(l++);
+ }
+
+ ceph_assert(p->second.empty());
+ dirty_files.erase(p++);
+ }
+ } else {
+ dout(20) << __func__ << " log_seq_stable " << log_seq_stable
+ << " already >= out seq " << seq
+ << ", we lost a race against another log flush, done" << dendl;
+ }
+
+ for (unsigned i = 0; i < to_release.size(); ++i) {
+ if (!to_release[i].empty()) {
+ /* OK, now we have the guarantee alloc[i] won't be null. */
+ int r = 0;
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev[i]->queue_discard(to_release[i]);
+ if (r == 0)
+ continue;
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
+ bdev[i]->discard(p.get_start(), p.get_len());
+ }
+ }
+ alloc[i]->release(to_release[i]);
+ }
+ }
+
+ _update_logger_stats();
+
+ return 0;
+}
+
+int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
+{
+ dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
+ << " 0x" << offset << "~" << length << std::dec
+ << " to " << h->file->fnode << dendl;
+ ceph_assert(!h->file->deleted);
+ ceph_assert(h->file->num_readers.load() == 0);
+
+ h->buffer_appender.flush();
+
+ bool buffered;
+ if (h->file->fnode.ino == 1)
+ buffered = false;
+ else
+ buffered = cct->_conf->bluefs_buffered_io;
+
+ if (offset + length <= h->pos)
+ return 0;
+ if (offset < h->pos) {
+ length -= h->pos - offset;
+ offset = h->pos;
+ dout(10) << " still need 0x"
+ << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ }
+ ceph_assert(offset <= h->file->fnode.size);
+
+ uint64_t allocated = h->file->fnode.get_allocated();
+ vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
+ // do not bother to dirty the file if we are overwriting
+ // previously allocated extents.
+ bool must_dirty = false;
+ if (allocated < offset + length) {
+ // we should never run out of log space here; see the min runway check
+ // in _flush_and_sync_log.
+ ceph_assert(h->file->fnode.ino != 1);
+ int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
+ offset + length - allocated,
+ &h->file->fnode);
+ if (r < 0) {
+ derr << __func__ << " allocated: 0x" << std::hex << allocated
+ << " offset: 0x" << offset << " length: 0x" << length << std::dec
+ << dendl;
+ vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
+ ceph_abort_msg("bluefs enospc");
+ return r;
+ }
+ if (cct->_conf->bluefs_preextend_wal_files &&
+ h->writer_type == WRITER_WAL) {
+ // NOTE: this *requires* that rocksdb also has log recycling
+ // enabled and is therefore doing robust CRCs on the log
+ // records. otherwise, we will fail to reply the rocksdb log
+ // properly due to garbage on the device.
+ h->file->fnode.size = h->file->fnode.get_allocated();
+ dout(10) << __func__ << " extending WAL size to 0x" << std::hex
+ << h->file->fnode.size << std::dec << " to include allocated"
+ << dendl;
+ }
+ must_dirty = true;
+ }
+ if (h->file->fnode.size < offset + length) {
+ h->file->fnode.size = offset + length;
+ if (h->file->fnode.ino > 1) {
+ // we do not need to dirty the log file (or it's compacting
+ // replacement) when the file size changes because replay is
+ // smart enough to discover it on its own.
+ must_dirty = true;
+ }
+ }
+ if (must_dirty) {
+ h->file->fnode.mtime = ceph_clock_now();
+ ceph_assert(h->file->fnode.ino >= 1);
+ if (h->file->dirty_seq == 0) {
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was clean)" << dendl;
+ } else {
+ if (h->file->dirty_seq != log_seq + 1) {
+ // need re-dirty, erase from list first
+ ceph_assert(dirty_files.count(h->file->dirty_seq));
+ auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
+ dirty_files[h->file->dirty_seq].erase(it);
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was " << h->file->dirty_seq << ")" << dendl;
+ } else {
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (unchanged, do nothing) " << dendl;
+ }
+ }
+ }
+ dout(20) << __func__ << " file now " << h->file->fnode << dendl;
+
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(offset, &x_off);
+ ceph_assert(p != h->file->fnode.extents.end());
+ dout(20) << __func__ << " in " << *p << " x_off 0x"
+ << std::hex << x_off << std::dec << dendl;
+
+ unsigned partial = x_off & ~super.block_mask();
+ bufferlist bl;
+ if (partial) {
+ dout(20) << __func__ << " using partial tail 0x"
+ << std::hex << partial << std::dec << dendl;
+ ceph_assert(h->tail_block.length() == partial);
+ bl.claim_append_piecewise(h->tail_block);
+ x_off -= partial;
+ offset -= partial;
+ length += partial;
+ dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
+ for (auto p : h->iocv) {
+ if (p) {
+ p->aio_wait();
+ }
+ }
+ }
+ if (length == partial + h->buffer.length()) {
+ bl.claim_append_piecewise(h->buffer);
+ } else {
+ bufferlist t;
+ h->buffer.splice(0, length, &t);
+ bl.claim_append_piecewise(t);
+ t.substr_of(h->buffer, length, h->buffer.length() - length);
+ h->buffer.swap(t);
+ dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
+ << " unflushed" << dendl;
+ }
+ ceph_assert(bl.length() == length);
+
+ switch (h->writer_type) {
+ case WRITER_WAL:
+ logger->inc(l_bluefs_bytes_written_wal, length);
+ break;
+ case WRITER_SST:
+ logger->inc(l_bluefs_bytes_written_sst, length);
+ break;
+ }
+
+ dout(30) << "dump:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ h->pos = offset + length;
+ h->tail_block.clear();
+
+ uint64_t bloff = 0;
+ uint64_t bytes_written_slow = 0;
+ while (length > 0) {
+ uint64_t x_len = std::min(p->length - x_off, length);
+ bufferlist t;
+ t.substr_of(bl, bloff, x_len);
+ unsigned tail = x_len & ~super.block_mask();
+ if (tail) {
+ size_t zlen = super.block_size - tail;
+ dout(20) << __func__ << " caching tail of 0x"
+ << std::hex << tail
+ << " and padding block with 0x" << zlen
+ << std::dec << dendl;
+ h->tail_block.substr_of(bl, bl.length() - tail, tail);
+ if (h->file->fnode.ino > 1) {
+ // we are using the page_aligned_appender, and can safely use
+ // the tail of the raw buffer.
+ const bufferptr &last = t.back();
+ if (last.unused_tail_length() < zlen) {
+ derr << " wtf, last is " << last << " from " << t << dendl;
+ ceph_assert(last.unused_tail_length() >= zlen);
+ }
+ bufferptr z = last;
+ z.set_offset(last.offset() + last.length());
+ z.set_length(zlen);
+ z.zero();
+ t.append(z, 0, zlen);
+ } else {
+ t.append_zero(zlen);
+ }
+ }
+ if (cct->_conf->bluefs_sync_write) {
+ bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
+ } else {
+ bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
+ }
+ h->dirty_devs[p->bdev] = true;
+ if (p->bdev == BDEV_SLOW) {
+ bytes_written_slow += t.length();
+ }
+
+ bloff += x_len;
+ length -= x_len;
+ ++p;
+ x_off = 0;
+ }
+ logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
+ bdev[i]->aio_submit(h->iocv[i]);
+ }
+ }
+ }
+ vselector->add_usage(h->file->vselector_hint, h->file->fnode);
+ dout(20) << __func__ << " h " << h << " pos now 0x"
+ << std::hex << h->pos << std::dec << dendl;
+ return 0;
+}
+
+#ifdef HAVE_LIBAIO
+// we need to retire old completed aios so they don't stick around in
+// memory indefinitely (along with their bufferlist refs).
+void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
+{
+ for (auto p : h->iocv) {
+ if (p) {
+ ls->splice(ls->end(), p->running_aios);
+ }
+ }
+ dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
+}
+
+void BlueFS::wait_for_aio(FileWriter *h)
+{
+ // NOTE: this is safe to call without a lock, as long as our reference is
+ // stable.
+ dout(10) << __func__ << " " << h << dendl;
+ utime_t start = ceph_clock_now();
+ for (auto p : h->iocv) {
+ if (p) {
+ p->aio_wait();
+ }
+ }
+ dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
+}
+#endif
+
+int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
+{
+ bool flushed = false;
+ int r = _flush(h, force, &flushed);
+ if (r == 0 && flushed) {
+ _maybe_compact_log(l);
+ }
+ return r;
+}
+
+int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
+{
+ h->buffer_appender.flush();
+ uint64_t length = h->buffer.length();
+ uint64_t offset = h->pos;
+ if (flushed) {
+ *flushed = false;
+ }
+ if (!force &&
+ length < cct->_conf->bluefs_min_flush_size) {
+ dout(10) << __func__ << " " << h << " ignoring, length " << length
+ << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
+ << dendl;
+ return 0;
+ }
+ if (length == 0) {
+ dout(10) << __func__ << " " << h << " no dirty data on "
+ << h->file->fnode << dendl;
+ return 0;
+ }
+ dout(10) << __func__ << " " << h << " 0x"
+ << std::hex << offset << "~" << length << std::dec
+ << " to " << h->file->fnode << dendl;
+ ceph_assert(h->pos <= h->file->fnode.size);
+ int r = _flush_range(h, offset, length);
+ if (flushed) {
+ *flushed = true;
+ }
+ return r;
+}
+
+int BlueFS::_truncate(FileWriter *h, uint64_t offset)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
+ << " file " << h->file->fnode << dendl;
+ if (h->file->deleted) {
+ dout(10) << __func__ << " deleted, no-op" << dendl;
+ return 0;
+ }
+
+ // we never truncate internal log files
+ ceph_assert(h->file->fnode.ino > 1);
+
+ h->buffer_appender.flush();
+
+ // truncate off unflushed data?
+ if (h->pos < offset &&
+ h->pos + h->buffer.length() > offset) {
+ bufferlist t;
+ dout(20) << __func__ << " tossing out last " << offset - h->pos
+ << " unflushed bytes" << dendl;
+ t.substr_of(h->buffer, 0, offset - h->pos);
+ h->buffer.swap(t);
+ ceph_abort_msg("actually this shouldn't happen");
+ }
+ if (h->buffer.length()) {
+ int r = _flush(h, true);
+ if (r < 0)
+ return r;
+ }
+ if (offset == h->file->fnode.size) {
+ return 0; // no-op!
+ }
+ if (offset > h->file->fnode.size) {
+ ceph_abort_msg("truncate up not supported");
+ }
+ ceph_assert(h->file->fnode.size >= offset);
+ vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
+ h->file->fnode.size = offset;
+ vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
+ log_t.op_file_update(h->file->fnode);
+ return 0;
+}
+
+int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
+{
+ dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
+ int r = _flush(h, true);
+ if (r < 0)
+ return r;
+ uint64_t old_dirty_seq = h->file->dirty_seq;
+
+ _flush_bdev_safely(h);
+
+ if (old_dirty_seq) {
+ uint64_t s = log_seq;
+ dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
+ << ") on " << h->file->fnode << ", flushing log" << dendl;
+ _flush_and_sync_log(l, old_dirty_seq);
+ ceph_assert(h->file->dirty_seq == 0 || // cleaned
+ h->file->dirty_seq > s); // or redirtied by someone else
+ }
+ return 0;
+}
+
+void BlueFS::_flush_bdev_safely(FileWriter *h)
+{
+ std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
+ h->dirty_devs.fill(false);
+#ifdef HAVE_LIBAIO
+ if (!cct->_conf->bluefs_sync_write) {
+ list<aio_t> completed_ios;
+ _claim_completed_aios(h, &completed_ios);
+ lock.unlock();
+ wait_for_aio(h);
+ completed_ios.clear();
+ flush_bdev(flush_devs);
+ lock.lock();
+ } else
+#endif
+ {
+ lock.unlock();
+ flush_bdev(flush_devs);
+ lock.lock();
+ }
+}
+
+void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
+{
+ // NOTE: this is safe to call without a lock.
+ dout(20) << __func__ << dendl;
+ for (unsigned i = 0; i < MAX_BDEV; i++) {
+ if (dirty_bdevs[i])
+ bdev[i]->flush();
+ }
+}
+
+void BlueFS::flush_bdev()
+{
+ // NOTE: this is safe to call without a lock.
+ dout(20) << __func__ << dendl;
+ for (auto p : bdev) {
+ if (p)
+ p->flush();
+ }
+}
+
+const char* BlueFS::get_device_name(unsigned id)
+{
+ if (id >= MAX_BDEV) return "BDEV_INV";
+ const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
+ return names[id];
+}
+
+int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
+{
+ int r = -ENOSPC;
+ if (slow_dev_expander) {
+ auto id = _get_slow_device_id();
+ auto min_alloc_size = alloc_size[id];
+ ceph_assert(id <= alloc.size() && alloc[id]);
+ auto min_need = round_up_to(need, min_alloc_size);
+ need = std::max(need,
+ slow_dev_expander->get_recommended_expansion_delta(
+ alloc[id]->get_free(), block_all[id].size()));
+
+ need = round_up_to(need, min_alloc_size);
+ dout(10) << __func__ << " expanding slow device by 0x"
+ << std::hex << need << std::dec
+ << dendl;
+ r = slow_dev_expander->allocate_freespace(min_need, need, extents);
+ }
+ return r;
+}
+
+int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
+ PExtentVector* extents)
+{
+ dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+ << " from " << (int)id << dendl;
+ assert(id < alloc.size());
+ if (!alloc[id]) {
+ return -ENOENT;
+ }
+ extents->reserve(4); // 4 should be (more than) enough for most allocations
+ uint64_t min_alloc_size = alloc_size[id];
+ uint64_t left = round_up_to(len, min_alloc_size);
+ int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
+ if (alloc_len < 0 || alloc_len < (int64_t)left) {
+ if (alloc_len > 0) {
+ alloc[id]->release(*extents);
+ }
+ if (bdev[id])
+ derr << __func__ << " failed to allocate 0x" << std::hex << left
+ << " on bdev " << (int)id
+ << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
+ else
+ derr << __func__ << " failed to allocate 0x" << std::hex << left
+ << " on bdev " << (int)id << ", dne" << std::dec << dendl;
+ if (alloc[id])
+ alloc[id]->dump();
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+int BlueFS::_allocate(uint8_t id, uint64_t len,
+ bluefs_fnode_t* node)
+{
+ dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+ << " from " << (int)id << dendl;
+ ceph_assert(id < alloc.size());
+ int64_t alloc_len = 0;
+ PExtentVector extents;
+ uint64_t hint = 0;
+ if (alloc[id]) {
+ if (!node->extents.empty() && node->extents.back().bdev == id) {
+ hint = node->extents.back().end();
+ }
+ extents.reserve(4); // 4 should be (more than) enough for most allocations
+ alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
+ alloc_size[id], hint, &extents);
+ }
+ if (!alloc[id] ||
+ alloc_len < 0 ||
+ alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
+ if (alloc_len > 0) {
+ alloc[id]->release(extents);
+ }
+ if (id != BDEV_SLOW) {
+ if (bdev[id]) {
+ dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
+ << " on bdev " << (int)id
+ << ", free 0x" << alloc[id]->get_free()
+ << "; fallback to bdev " << (int)id + 1
+ << std::dec << dendl;
+ }
+ return _allocate(id + 1, len, node);
+ }
+ dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
+ << " on bdev " << (int)id << ", free 0x"
+ << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
+ << "; fallback to slow device expander "
+ << std::dec << dendl;
+ extents.clear();
+ if (_expand_slow_device(len, extents) == 0) {
+ id = _get_slow_device_id();
+ for (auto& e : extents) {
+ _add_block_extent(id, e.offset, e.length);
+ }
+ extents.clear();
+ auto* last_alloc = alloc[id];
+ ceph_assert(last_alloc);
+ // try again
+ alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
+ alloc_size[id], hint, &extents);
+ if (alloc_len < 0 || alloc_len < (int64_t)len) {
+ if (alloc_len > 0) {
+ last_alloc->release(extents);
+ }
+ derr << __func__ << " failed to allocate 0x" << std::hex << len
+ << " on bdev " << (int)id
+ << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
+ return -ENOSPC;
+ }
+ } else {
+ derr << __func__ << " failed to expand slow device to fit +0x"
+ << std::hex << len << std::dec
+ << dendl;
+ return -ENOSPC;
+ }
+ } else {
+ uint64_t total_allocated =
+ block_all[id].size() - alloc[id]->get_free();
+ if (max_bytes[id] < total_allocated) {
+ logger->set(max_bytes_pcounters[id], total_allocated);
+ max_bytes[id] = total_allocated;
+ }
+ }
+
+ for (auto& p : extents) {
+ node->append_extent(bluefs_extent_t(id, p.offset, p.length));
+ }
+
+ return 0;
+}
+
+int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
+{
+ dout(10) << __func__ << " file " << f->fnode << " 0x"
+ << std::hex << off << "~" << len << std::dec << dendl;
+ if (f->deleted) {
+ dout(10) << __func__ << " deleted, no-op" << dendl;
+ return 0;
+ }
+ ceph_assert(f->fnode.ino > 1);
+ uint64_t allocated = f->fnode.get_allocated();
+ if (off + len > allocated) {
+ uint64_t want = off + len - allocated;
+ vselector->sub_usage(f->vselector_hint, f->fnode);
+
+ int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
+ want,
+ &f->fnode);
+ vselector->add_usage(f->vselector_hint, f->fnode);
+ if (r < 0)
+ return r;
+ log_t.op_file_update(f->fnode);
+ }
+ return 0;
+}
+
+void BlueFS::sync_metadata(bool avoid_compact)
+{
+ std::unique_lock l(lock);
+ if (log_t.empty() && dirty_files.empty()) {
+ dout(10) << __func__ << " - no pending log events" << dendl;
+ } else {
+ dout(10) << __func__ << dendl;
+ utime_t start = ceph_clock_now();
+ flush_bdev(); // FIXME?
+ _flush_and_sync_log(l);
+ dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
+ }
+
+ if (!avoid_compact) {
+ _maybe_compact_log(l);
+ }
+}
+
+void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
+{
+ if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
+ _should_compact_log()) {
+ if (cct->_conf->bluefs_compact_log_sync) {
+ _compact_log_sync();
+ } else {
+ _compact_log_async(l);
+ }
+ }
+}
+
+int BlueFS::open_for_write(
+ const string& dirname,
+ const string& filename,
+ FileWriter **h,
+ bool overwrite)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ DirRef dir;
+ if (p == dir_map.end()) {
+ // implicitly create the dir
+ dout(20) << __func__ << " dir " << dirname
+ << " does not exist" << dendl;
+ return -ENOENT;
+ } else {
+ dir = p->second;
+ }
+
+ FileRef file;
+ bool create = false;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ if (overwrite) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " does not exist" << dendl;
+ return -ENOENT;
+ }
+ file = new File;
+ file->fnode.ino = ++ino_last;
+ file_map[ino_last] = file;
+ dir->file_map[filename] = file;
+ ++file->refs;
+ create = true;
+ } else {
+ // overwrite existing file?
+ file = q->second;
+ if (overwrite) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " already exists, overwrite in place" << dendl;
+ } else {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " already exists, truncate + overwrite" << dendl;
+ vselector->sub_usage(file->vselector_hint, file->fnode);
+ file->fnode.size = 0;
+ for (auto& p : file->fnode.extents) {
+ pending_release[p.bdev].insert(p.offset, p.length);
+ }
+
+ file->fnode.clear_extents();
+ }
+ }
+ ceph_assert(file->fnode.ino > 1);
+
+ file->fnode.mtime = ceph_clock_now();
+ file->vselector_hint = vselector->get_hint_by_dir(dirname);
+
+ dout(20) << __func__ << " mapping " << dirname << "/" << filename
+ << " vsel_hint " << file->vselector_hint
+ << dendl;
+
+ log_t.op_file_update(file->fnode);
+ if (create)
+ log_t.op_dir_link(dirname, filename, file->fnode.ino);
+
+ *h = _create_writer(file);
+
+ if (boost::algorithm::ends_with(filename, ".log")) {
+ (*h)->writer_type = BlueFS::WRITER_WAL;
+ if (logger && !overwrite) {
+ logger->inc(l_bluefs_files_written_wal);
+ }
+ } else if (boost::algorithm::ends_with(filename, ".sst")) {
+ (*h)->writer_type = BlueFS::WRITER_SST;
+ if (logger) {
+ logger->inc(l_bluefs_files_written_sst);
+ }
+ }
+
+ dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
+ return 0;
+}
+
+BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
+{
+ FileWriter *w = new FileWriter(f);
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ w->iocv[i] = new IOContext(cct, NULL);
+ }
+ }
+ return w;
+}
+
+void BlueFS::_close_writer(FileWriter *h)
+{
+ dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
+ h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
+ for (unsigned i=0; i<MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ if (h->iocv[i]) {
+ h->iocv[i]->aio_wait();
+ bdev[i]->queue_reap_ioc(h->iocv[i]);
+ }
+ }
+ }
+ delete h;
+}
+
+int BlueFS::open_for_read(
+ const string& dirname,
+ const string& filename,
+ FileReader **h,
+ bool random)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename
+ << (random ? " (random)":" (sequential)") << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ File *file = q->second.get();
+
+ *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
+ random, false);
+ dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
+ return 0;
+}
+
+int BlueFS::rename(
+ const string& old_dirname, const string& old_filename,
+ const string& new_dirname, const string& new_filename)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << old_dirname << "/" << old_filename
+ << " -> " << new_dirname << "/" << new_filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(old_dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef old_dir = p->second;
+ map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
+ if (q == old_dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
+ << ") file " << old_filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ FileRef file = q->second;
+
+ p = dir_map.find(new_dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef new_dir = p->second;
+ q = new_dir->file_map.find(new_filename);
+ if (q != new_dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
+ << ") file " << new_filename
+ << " already exists, unlinking" << dendl;
+ ceph_assert(q->second != file);
+ log_t.op_dir_unlink(new_dirname, new_filename);
+ _drop_link(q->second);
+ }
+
+ dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
+ << " " << file->fnode << dendl;
+
+ new_dir->file_map[new_filename] = file;
+ old_dir->file_map.erase(old_filename);
+
+ log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
+ log_t.op_dir_unlink(old_dirname, old_filename);
+ return 0;
+}
+
+int BlueFS::mkdir(const string& dirname)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p != dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
+ return -EEXIST;
+ }
+ dir_map[dirname] = new Dir;
+ log_t.op_dir_create(dirname);
+ return 0;
+}
+
+int BlueFS::rmdir(const string& dirname)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ if (!dir->file_map.empty()) {
+ dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
+ return -ENOTEMPTY;
+ }
+ dir_map.erase(dirname);
+ log_t.op_dir_remove(dirname);
+ return 0;
+}
+
+bool BlueFS::dir_exists(const string& dirname)
+{
+ std::lock_guard l(lock);
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ bool exists = p != dir_map.end();
+ dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
+ return exists;
+}
+
+int BlueFS::stat(const string& dirname, const string& filename,
+ uint64_t *size, utime_t *mtime)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ File *file = q->second.get();
+ dout(10) << __func__ << " " << dirname << "/" << filename
+ << " " << file->fnode << dendl;
+ if (size)
+ *size = file->fnode.size;
+ if (mtime)
+ *mtime = file->fnode.mtime;
+ return 0;
+}
+
+int BlueFS::lock_file(const string& dirname, const string& filename,
+ FileLock **plock)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ File *file;
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " not found, creating" << dendl;
+ file = new File;
+ file->fnode.ino = ++ino_last;
+ file->fnode.mtime = ceph_clock_now();
+ file_map[ino_last] = file;
+ dir->file_map[filename] = file;
+ ++file->refs;
+ log_t.op_file_update(file->fnode);
+ log_t.op_dir_link(dirname, filename, file->fnode.ino);
+ } else {
+ file = q->second.get();
+ if (file->locked) {
+ dout(10) << __func__ << " already locked" << dendl;
+ return -ENOLCK;
+ }
+ }
+ file->locked = true;
+ *plock = new FileLock(file);
+ dout(10) << __func__ << " locked " << file->fnode
+ << " with " << *plock << dendl;
+ return 0;
+}
+
+int BlueFS::unlock_file(FileLock *fl)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
+ ceph_assert(fl->file->locked);
+ fl->file->locked = false;
+ delete fl;
+ return 0;
+}
+
+int BlueFS::readdir(const string& dirname, vector<string> *ls)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << dendl;
+ if (dirname.empty()) {
+ // list dirs
+ ls->reserve(dir_map.size() + 2);
+ for (auto& q : dir_map) {
+ ls->push_back(q.first);
+ }
+ } else {
+ // list files in dir
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ ls->reserve(dir->file_map.size() + 2);
+ for (auto& q : dir->file_map) {
+ ls->push_back(q.first);
+ }
+ }
+ ls->push_back(".");
+ ls->push_back("..");
+ return 0;
+}
+
+int BlueFS::unlink(const string& dirname, const string& filename)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " file " << dirname << "/" << filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ FileRef file = q->second;
+ if (file->locked) {
+ dout(20) << __func__ << " file " << dirname << "/" << filename
+ << " is locked" << dendl;
+ return -EBUSY;
+ }
+ dir->file_map.erase(filename);
+ log_t.op_dir_unlink(dirname, filename);
+ _drop_link(file);
+ return 0;
+}
+
+bool BlueFS::wal_is_rotational()
+{
+ if (bdev[BDEV_WAL]) {
+ return bdev[BDEV_WAL]->is_rotational();
+ } else if (bdev[BDEV_DB]) {
+ return bdev[BDEV_DB]->is_rotational();
+ }
+ return bdev[BDEV_SLOW]->is_rotational();
+}
+
+/*
+ Algorithm.
+ do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
+ Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
+ and try if using it will produce healthy bluefs transaction.
+ We encode already known bluefs log extents and search disk for these bytes.
+ When we find it, we decode following bytes as extent.
+ We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
+ */
+int BlueFS::do_replay_recovery_read(FileReader *log_reader,
+ size_t replay_pos,
+ size_t read_offset,
+ size_t read_len,
+ bufferlist* bl) {
+ dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
+ " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
+
+ bluefs_fnode_t& log_fnode = log_reader->file->fnode;
+ bufferlist bin_extents;
+ encode(log_fnode.extents, bin_extents);
+ dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
+
+ // cannot process if too small to effectively search
+ ceph_assert(bin_extents.length() >= 32);
+ bufferlist last_32;
+ last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
+
+ //read fixed part from replay_pos to end of bluefs_log extents
+ bufferlist fixed;
+ uint64_t e_off = 0;
+ auto e = log_fnode.seek(replay_pos, &e_off);
+ ceph_assert(e != log_fnode.extents.end());
+ int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
+ cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+ //capture dev of last good extent
+ uint8_t last_e_dev = e->bdev;
+ uint64_t last_e_off = e->offset;
+ ++e;
+ while (e != log_fnode.extents.end()) {
+ r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
+ cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+ last_e_dev = e->bdev;
+ ++e;
+ }
+ ceph_assert(replay_pos + fixed.length() == read_offset);
+
+ dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
+
+ struct compare {
+ bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
+ if (a.bdev < b.bdev) return true;
+ if (a.offset < b.offset) return true;
+ return a.length < b.length;
+ }
+ };
+ std::set<bluefs_extent_t, compare> extents_rejected;
+ for (int dcnt = 0; dcnt < 3; dcnt++) {
+ uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
+ if (bdev[dev] == nullptr) continue;
+ dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
+ interval_set<uint64_t> disk_regions;
+ disk_regions.insert(0, bdev[dev]->get_size());
+ for (auto f : file_map) {
+ auto& e = f.second->fnode.extents;
+ for (auto& p : e) {
+ if (p.bdev == dev) {
+ disk_regions.erase(p.offset, p.length);
+ }
+ }
+ }
+ size_t disk_regions_count = disk_regions.num_intervals();
+ dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
+
+ auto reg = disk_regions.lower_bound(last_e_off);
+ //for all except first, start from beginning
+ last_e_off = 0;
+ if (reg == disk_regions.end()) {
+ reg = disk_regions.begin();
+ }
+ const uint64_t chunk_size = 4 * 1024 * 1024;
+ const uint64_t page_size = 4096;
+ const uint64_t max_extent_size = 16;
+ uint64_t overlay_size = last_32.length() + max_extent_size;
+ for (size_t i = 0; i < disk_regions_count; reg++, i++) {
+ if (reg == disk_regions.end()) {
+ reg = disk_regions.begin();
+ }
+ uint64_t pos = reg.get_start();
+ uint64_t len = reg.get_len();
+
+ std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
+ char* raw_data = raw_data_p.get();
+ memset(raw_data, 0, page_size);
+
+ while (len > last_32.length()) {
+ uint64_t chunk_len = len > chunk_size ? chunk_size : len;
+ dout(5) << __func__ << " read "
+ << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
+ r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+
+ //search for fixed_last_32
+ char* chunk_b = raw_data + page_size;
+ char* chunk_e = chunk_b + chunk_len;
+
+ char* search_b = chunk_b - overlay_size;
+ char* search_e = chunk_e;
+
+ for (char* sp = search_b; ; sp += last_32.length()) {
+ sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
+ if (sp == nullptr) {
+ break;
+ }
+
+ char* n = sp + last_32.length();
+ dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
+ bufferlist test;
+ test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
+ bluefs_extent_t ne;
+ try {
+ bufferlist::const_iterator p = test.begin();
+ decode(ne, p);
+ } catch (buffer::error& e) {
+ continue;
+ }
+ if (extents_rejected.count(ne) != 0) {
+ dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
+ continue;
+ }
+ //insert as rejected already. if we succeed, it wouldn't make difference.
+ extents_rejected.insert(ne);
+
+ if (ne.bdev >= MAX_BDEV ||
+ bdev[ne.bdev] == nullptr ||
+ ne.length > 16 * 1024 * 1024 ||
+ (ne.length & 4095) != 0 ||
+ ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
+ (ne.offset & 4095) != 0) {
+ dout(5) << __func__ << " refusing extent " << ne << dendl;
+ continue;
+ }
+ dout(5) << __func__ << " checking extent " << ne << dendl;
+
+ //read candidate extent - whole
+ bufferlist candidate;
+ candidate.append(fixed);
+ r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
+ cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+
+ //check if transaction & crc is ok
+ bluefs_transaction_t t;
+ try {
+ bufferlist::const_iterator p = candidate.begin();
+ decode(t, p);
+ }
+ catch (buffer::error& e) {
+ dout(5) << __func__ << " failed match" << dendl;
+ continue;
+ }
+
+ //success, it seems a probable candidate
+ uint64_t l = std::min<uint64_t>(ne.length, read_len);
+ //trim to required size
+ bufferlist requested_read;
+ requested_read.substr_of(candidate, fixed.length(), l);
+ bl->append(requested_read);
+ dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
+ log_fnode.append_extent(ne);
+ log_fnode.recalc_allocated();
+ log_reader->buf.pos += l;
+ return l;
+ }
+ //save overlay for next search
+ memcpy(search_b, chunk_e - overlay_size, overlay_size);
+ pos += chunk_len;
+ len -= chunk_len;
+ }
+ }
+ }
+ return 0;
+}
+
+// ===============================================
+// OriginalVolumeSelector
+
+void* OriginalVolumeSelector::get_hint_by_device(uint8_t dev) const {
+ return reinterpret_cast<void*>(dev);
+}
+void* OriginalVolumeSelector::get_hint_by_dir(const string& dirname) const {
+ uint8_t res = BlueFS::BDEV_DB;
+ if (dirname.length() > 5) {
+ // the "db.slow" and "db.wal" directory names are hard-coded at
+ // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow")) {
+ res = BlueFS::BDEV_SLOW;
+ }
+ else if (boost::algorithm::ends_with(dirname, ".wal")) {
+ res = BlueFS::BDEV_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
+
+uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
+{
+ return (uint8_t)(reinterpret_cast<uint64_t>(hint));
+}
+
+void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ res.emplace_back(base, db_total);
+ res.emplace_back(base + ".slow", slow_total);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "OriginalVolumeSelector: "
+
+void OriginalVolumeSelector::dump(ostream& sout) {
+ sout<< "wal_total:" << wal_total
+ << ", db_total:" << db_total
+ << ", slow_total:" << slow_total
+ << std::endl;
+}
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
new file mode 100644
index 00000000..2115870f
--- /dev/null
+++ b/src/os/bluestore/BlueFS.h
@@ -0,0 +1,682 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
+#define CEPH_OS_BLUESTORE_BLUEFS_H
+
+#include <atomic>
+#include <mutex>
+#include <limits>
+
+#include "bluefs_types.h"
+#include "common/RefCountedObj.h"
+#include "BlockDevice.h"
+
+#include "boost/intrusive/list.hpp"
+#include <boost/intrusive_ptr.hpp>
+
+class PerfCounters;
+
+class Allocator;
+
+enum {
+ l_bluefs_first = 732600,
+ l_bluefs_gift_bytes,
+ l_bluefs_reclaim_bytes,
+ l_bluefs_db_total_bytes,
+ l_bluefs_db_used_bytes,
+ l_bluefs_wal_total_bytes,
+ l_bluefs_wal_used_bytes,
+ l_bluefs_slow_total_bytes,
+ l_bluefs_slow_used_bytes,
+ l_bluefs_num_files,
+ l_bluefs_log_bytes,
+ l_bluefs_log_compactions,
+ l_bluefs_logged_bytes,
+ l_bluefs_files_written_wal,
+ l_bluefs_files_written_sst,
+ l_bluefs_bytes_written_wal,
+ l_bluefs_bytes_written_sst,
+ l_bluefs_bytes_written_slow,
+ l_bluefs_max_bytes_wal,
+ l_bluefs_max_bytes_db,
+ l_bluefs_max_bytes_slow,
+ l_bluefs_read_random_count,
+ l_bluefs_read_random_bytes,
+ l_bluefs_read_random_disk_count,
+ l_bluefs_read_random_disk_bytes,
+ l_bluefs_read_random_buffer_count,
+ l_bluefs_read_random_buffer_bytes,
+ l_bluefs_read_count,
+ l_bluefs_read_bytes,
+ l_bluefs_read_prefetch_count,
+ l_bluefs_read_prefetch_bytes,
+ l_bluefs_read_zeros_candidate,
+ l_bluefs_read_zeros_errors,
+
+ l_bluefs_last,
+};
+
+class BlueFSDeviceExpander {
+protected:
+ ~BlueFSDeviceExpander() {}
+public:
+ virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
+ uint64_t bluefs_total) = 0;
+ virtual int allocate_freespace(
+ uint64_t min_size,
+ uint64_t size,
+ PExtentVector& extents) = 0;
+ /** Reports amount of space that can be transferred to BlueFS.
+ * This gives either current state, when alloc_size is currently used
+ * BlueFS's size, or simulation when alloc_size is different.
+ * @params
+ * alloc_size - allocation unit size to check
+ */
+ virtual size_t available_freespace(uint64_t alloc_size) = 0;
+};
+
+class BlueFSVolumeSelector {
+public:
+ typedef std::vector<std::pair<std::string, uint64_t>> paths;
+
+ virtual ~BlueFSVolumeSelector() {
+ }
+ virtual void* get_hint_by_device(uint8_t dev) const = 0;
+ virtual void* get_hint_by_dir(const string& dirname) const = 0;
+
+ virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
+ virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
+ virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
+ virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
+ virtual uint8_t select_prefer_bdev(void* hint) = 0;
+ virtual void get_paths(const std::string& base, paths& res) const = 0;
+ virtual void dump(ostream& sout) = 0;
+};
+class BlueFS;
+
+class BlueFS {
+public:
+ CephContext* cct;
+ static constexpr unsigned MAX_BDEV = 5;
+ static constexpr unsigned BDEV_WAL = 0;
+ static constexpr unsigned BDEV_DB = 1;
+ static constexpr unsigned BDEV_SLOW = 2;
+ static constexpr unsigned BDEV_NEWWAL = 3;
+ static constexpr unsigned BDEV_NEWDB = 4;
+
+ enum {
+ WRITER_UNKNOWN,
+ WRITER_WAL,
+ WRITER_SST,
+ };
+
+ struct File : public RefCountedObject {
+ MEMPOOL_CLASS_HELPERS();
+
+ bluefs_fnode_t fnode;
+ int refs;
+ uint64_t dirty_seq;
+ bool locked;
+ bool deleted;
+ boost::intrusive::list_member_hook<> dirty_item;
+
+ std::atomic_int num_readers, num_writers;
+ std::atomic_int num_reading;
+
+ void* vselector_hint = nullptr;
+
+ File()
+ : RefCountedObject(NULL, 0),
+ refs(0),
+ dirty_seq(0),
+ locked(false),
+ deleted(false),
+ num_readers(0),
+ num_writers(0),
+ num_reading(0),
+ vselector_hint(nullptr)
+ {}
+ ~File() override {
+ ceph_assert(num_readers.load() == 0);
+ ceph_assert(num_writers.load() == 0);
+ ceph_assert(num_reading.load() == 0);
+ ceph_assert(!locked);
+ }
+
+ friend void intrusive_ptr_add_ref(File *f) {
+ f->get();
+ }
+ friend void intrusive_ptr_release(File *f) {
+ f->put();
+ }
+ };
+ typedef boost::intrusive_ptr<File> FileRef;
+
+ typedef boost::intrusive::list<
+ File,
+ boost::intrusive::member_hook<
+ File,
+ boost::intrusive::list_member_hook<>,
+ &File::dirty_item> > dirty_file_list_t;
+
+ struct Dir : public RefCountedObject {
+ MEMPOOL_CLASS_HELPERS();
+
+ mempool::bluefs::map<string,FileRef> file_map;
+
+ Dir() : RefCountedObject(NULL, 0) {}
+
+ friend void intrusive_ptr_add_ref(Dir *d) {
+ d->get();
+ }
+ friend void intrusive_ptr_release(Dir *d) {
+ d->put();
+ }
+ };
+ typedef boost::intrusive_ptr<Dir> DirRef;
+
+ struct FileWriter {
+ MEMPOOL_CLASS_HELPERS();
+
+ FileRef file;
+ uint64_t pos; ///< start offset for buffer
+ bufferlist buffer; ///< new data to write (at end of file)
+ bufferlist tail_block; ///< existing partial block at end of file, if any
+ bufferlist::page_aligned_appender buffer_appender; //< for const char* only
+ int writer_type = 0; ///< WRITER_*
+ int write_hint = WRITE_LIFE_NOT_SET;
+
+ ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
+ std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
+ std::array<bool, MAX_BDEV> dirty_devs;
+
+ FileWriter(FileRef f)
+ : file(f),
+ pos(0),
+ buffer_appender(buffer.get_page_aligned_appender(
+ g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
+ ++file->num_writers;
+ iocv.fill(nullptr);
+ dirty_devs.fill(false);
+ if (f->fnode.ino == 1) {
+ write_hint = WRITE_LIFE_MEDIUM;
+ }
+ }
+ // NOTE: caller must call BlueFS::close_writer()
+ ~FileWriter() {
+ --file->num_writers;
+ }
+
+ // note: BlueRocksEnv uses this append exclusively, so it's safe
+ // to use buffer_appender exclusively here (e.g., it's notion of
+ // offset will remain accurate).
+ void append(const char *buf, size_t len) {
+ uint64_t l0 = buffer.length();
+ ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
+ buffer_appender.append(buf, len);
+ }
+
+ // note: used internally only, for ino 1 or 0.
+ void append(ceph::buffer::list& bl) {
+ uint64_t l0 = buffer.length();
+ ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
+ buffer.claim_append(bl);
+ }
+
+ uint64_t get_effective_write_pos() {
+ buffer_appender.flush();
+ return pos + buffer.length();
+ }
+ };
+
+ struct FileReaderBuffer {
+ MEMPOOL_CLASS_HELPERS();
+
+ uint64_t bl_off; ///< prefetch buffer logical offset
+ bufferlist bl; ///< prefetch buffer
+ uint64_t pos; ///< current logical offset
+ uint64_t max_prefetch; ///< max allowed prefetch
+
+ explicit FileReaderBuffer(uint64_t mpf)
+ : bl_off(0),
+ pos(0),
+ max_prefetch(mpf) {}
+
+ uint64_t get_buf_end() {
+ return bl_off + bl.length();
+ }
+ uint64_t get_buf_remaining(uint64_t p) {
+ if (p >= bl_off && p < bl_off + bl.length())
+ return bl_off + bl.length() - p;
+ return 0;
+ }
+
+ void skip(size_t n) {
+ pos += n;
+ }
+ void seek(uint64_t offset) {
+ pos = offset;
+ }
+ };
+
+ struct FileReader {
+ MEMPOOL_CLASS_HELPERS();
+
+ FileRef file;
+ FileReaderBuffer buf;
+ bool random;
+ bool ignore_eof; ///< used when reading our log file
+
+ ceph::shared_mutex lock {
+ ceph::make_shared_mutex(std::string(), false, false, false)
+ };
+
+
+ FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
+ : file(f),
+ buf(mpf),
+ random(rand),
+ ignore_eof(ie) {
+ ++file->num_readers;
+ }
+ ~FileReader() {
+ --file->num_readers;
+ }
+ };
+
+ struct FileLock {
+ MEMPOOL_CLASS_HELPERS();
+
+ FileRef file;
+ explicit FileLock(FileRef f) : file(f) {}
+ };
+
+private:
+ ceph::mutex lock = ceph::make_mutex("BlueFS::lock");
+
+ PerfCounters *logger = nullptr;
+
+ uint64_t max_bytes[MAX_BDEV] = {0};
+ uint64_t max_bytes_pcounters[MAX_BDEV] = {
+ l_bluefs_max_bytes_wal,
+ l_bluefs_max_bytes_db,
+ l_bluefs_max_bytes_slow,
+ };
+
+ // cache
+ mempool::bluefs::map<string, DirRef> dir_map; ///< dirname -> Dir
+ mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
+
+ // map of dirty files, files of same dirty_seq are grouped into list.
+ map<uint64_t, dirty_file_list_t> dirty_files;
+
+ bluefs_super_t super; ///< latest superblock (as last written)
+ uint64_t ino_last = 0; ///< last assigned ino (this one is in use)
+ uint64_t log_seq = 0; ///< last used log seq (by current pending log_t)
+ uint64_t log_seq_stable = 0; ///< last stable/synced log seq
+ FileWriter *log_writer = 0; ///< writer for the log
+ bluefs_transaction_t log_t; ///< pending, unwritten log transaction
+ bool log_flushing = false; ///< true while flushing the log
+ ceph::condition_variable log_cond;
+
+ uint64_t new_log_jump_to = 0;
+ uint64_t old_log_jump_to = 0;
+ FileRef new_log = nullptr;
+ FileWriter *new_log_writer = nullptr;
+
+ /*
+ * There are up to 3 block devices:
+ *
+ * BDEV_DB db/ - the primary db device
+ * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
+ * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
+ */
+ vector<BlockDevice*> bdev; ///< block devices we can use
+ vector<IOContext*> ioc; ///< IOContexts for bdevs
+ vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
+ vector<Allocator*> alloc; ///< allocators for bdevs
+ vector<uint64_t> alloc_size; ///< alloc size for each device
+ vector<interval_set<uint64_t>> pending_release; ///< extents to release
+
+ BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
+
+ BlueFSDeviceExpander* slow_dev_expander = nullptr;
+ std::unique_ptr<BlueFSVolumeSelector> vselector;
+
+ class SocketHook;
+ SocketHook* asok_hook = nullptr;
+ // used to trigger zeros into read (debug / verify)
+ std::atomic<uint64_t> inject_read_zeros{0};
+
+ void _init_logger();
+ void _shutdown_logger();
+ void _update_logger_stats();
+
+ void _init_alloc();
+ void _stop_alloc();
+
+ void _pad_bl(bufferlist& bl); ///< pad bufferlist to block size w/ zeros
+
+ FileRef _get_file(uint64_t ino);
+ void _drop_link(FileRef f);
+
+ unsigned _get_slow_device_id() {
+ return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
+ }
+ const char* get_device_name(unsigned id);
+ int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
+ int _allocate(uint8_t bdev, uint64_t len,
+ bluefs_fnode_t* node);
+ int _allocate_without_fallback(uint8_t id, uint64_t len,
+ PExtentVector* extents);
+
+ int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
+ int _flush(FileWriter *h, bool focce, std::unique_lock<ceph::mutex>& l);
+ int _flush(FileWriter *h, bool force, bool *flushed = nullptr);
+ int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);
+
+#ifdef HAVE_LIBAIO
+ void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
+ void wait_for_aio(FileWriter *h); // safe to call without a lock
+#endif
+
+ int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
+ uint64_t want_seq = 0,
+ uint64_t jump_to = 0);
+ uint64_t _estimate_log_size();
+ bool _should_compact_log();
+
+ enum {
+ REMOVE_DB = 1,
+ REMOVE_WAL = 2,
+ RENAME_SLOW2DB = 4,
+ RENAME_DB2SLOW = 8,
+ };
+ void _compact_log_dump_metadata(bluefs_transaction_t *t,
+ int flags);
+ void _compact_log_sync();
+ void _compact_log_async(std::unique_lock<ceph::mutex>& l);
+
+ void _rewrite_log_sync(bool allocate_with_fallback,
+ int super_dev,
+ int log_dev,
+ int new_log_dev,
+ int flags);
+
+ //void _aio_finish(void *priv);
+
+ void _flush_bdev_safely(FileWriter *h);
+ void flush_bdev(); // this is safe to call without a lock
+ void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock
+
+ int _preallocate(FileRef f, uint64_t off, uint64_t len);
+ int _truncate(FileWriter *h, uint64_t off);
+
+ int64_t _read(
+ FileReader *h, ///< [in] read from here
+ FileReaderBuffer *buf, ///< [in] reader state
+ uint64_t offset, ///< [in] offset
+ size_t len, ///< [in] this many bytes
+ bufferlist *outbl, ///< [out] optional: reference the result here
+ char *out); ///< [out] optional: or copy it here
+ int64_t _read_random(
+ FileReader *h, ///< [in] read from here
+ uint64_t offset, ///< [in] offset
+ size_t len, ///< [in] this many bytes
+ char *out); ///< [out] optional: or copy it here
+
+ void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);
+
+ int _open_super();
+ int _write_super(int dev);
+ int _replay(bool noop, bool to_stdout = false); ///< replay journal
+
+ FileWriter *_create_writer(FileRef f);
+ void _close_writer(FileWriter *h);
+
+ // always put the super in the second 4k block. FIXME should this be
+ // block size independent?
+ unsigned get_super_offset() {
+ return 4096;
+ }
+ unsigned get_super_length() {
+ return 4096;
+ }
+
+ void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len,
+ bool skip=false);
+
+public:
+ BlueFS(CephContext* cct);
+ ~BlueFS();
+
+ // the super is always stored on bdev 0
+ int mkfs(uuid_d osd_uuid);
+ int mount();
+ void umount(bool avoid_compact = false);
+ int prepare_new_device(int id);
+
+ int log_dump();
+
+ void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id);
+ void get_devices(set<string> *ls);
+ uint64_t get_alloc_size(int id) {
+ return alloc_size[id];
+ }
+ int fsck();
+
+ int device_migrate_to_new(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target);
+ int device_migrate_to_existing(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target);
+
+ uint64_t get_used();
+ uint64_t get_total(unsigned id);
+ uint64_t get_free(unsigned id);
+ void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
+ void dump_perf_counters(Formatter *f);
+
+ void dump_block_extents(ostream& out);
+
+ /// get current extents that we own for given block device
+ int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
+
+ int open_for_write(
+ const string& dir,
+ const string& file,
+ FileWriter **h,
+ bool overwrite);
+
+ int open_for_read(
+ const string& dir,
+ const string& file,
+ FileReader **h,
+ bool random = false);
+
+ void close_writer(FileWriter *h) {
+ std::lock_guard l(lock);
+ _close_writer(h);
+ }
+
+ int rename(const string& old_dir, const string& old_file,
+ const string& new_dir, const string& new_file);
+
+ int readdir(const string& dirname, vector<string> *ls);
+
+ int unlink(const string& dirname, const string& filename);
+ int mkdir(const string& dirname);
+ int rmdir(const string& dirname);
+ bool wal_is_rotational();
+
+ bool dir_exists(const string& dirname);
+ int stat(const string& dirname, const string& filename,
+ uint64_t *size, utime_t *mtime);
+
+ int lock_file(const string& dirname, const string& filename, FileLock **p);
+ int unlock_file(FileLock *l);
+
+ void compact_log();
+
+ /// sync any uncommitted state to disk
+ void sync_metadata(bool avoid_compact);
+ /// test and compact log, if necessary
+ void _maybe_compact_log(std::unique_lock<ceph::mutex>& l);
+
+ void set_slow_device_expander(BlueFSDeviceExpander* a) {
+ slow_dev_expander = a;
+ }
+ void set_volume_selector(BlueFSVolumeSelector* s) {
+ vselector.reset(s);
+ }
+ void dump_volume_selector(ostream& sout) {
+ vselector->dump(sout);
+ }
+ void get_vselector_paths(const std::string& base,
+ BlueFSVolumeSelector::paths& res) const {
+ return vselector->get_paths(base, res);
+ }
+
+ int add_block_device(unsigned bdev, const string& path, bool trim,
+ bool shared_with_bluestore=false);
+ bool bdev_support_label(unsigned id);
+ uint64_t get_block_device_size(unsigned bdev);
+
+ /// gift more block space
+ void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len,
+ bool skip=false) {
+ std::unique_lock l(lock);
+ _add_block_extent(bdev, offset, len, skip);
+ int r = _flush_and_sync_log(l);
+ ceph_assert(r == 0);
+ }
+
+ /// reclaim block space
+ int reclaim_blocks(unsigned bdev, uint64_t want,
+ PExtentVector *extents);
+
+ // handler for discard event
+ void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
+
+ void flush(FileWriter *h, bool force = false) {
+ std::unique_lock l(lock);
+ int r = _flush(h, force, l);
+ ceph_assert(r == 0);
+ }
+
+ void append_try_flush(FileWriter *h, const char* buf, size_t len) {
+ size_t max_size = 1ull << 30; // cap to 1GB
+ while (len > 0) {
+ bool need_flush = true;
+ auto l0 = h->buffer.length();
+ if (l0 < max_size) {
+ size_t l = std::min(len, max_size - l0);
+ h->append(buf, l);
+ buf += l;
+ len -= l;
+ need_flush = h->buffer.length() >= cct->_conf->bluefs_min_flush_size;
+ }
+ if (need_flush) {
+ flush(h, true);
+ // make sure we've made any progress with flush hence the
+ // loop doesn't iterate forever
+ ceph_assert(h->buffer.length() < max_size);
+ }
+ }
+ }
+ void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
+ std::lock_guard l(lock);
+ _flush_range(h, offset, length);
+ }
+ int fsync(FileWriter *h) {
+ std::unique_lock l(lock);
+ int r = _fsync(h, l);
+ _maybe_compact_log(l);
+ return r;
+ }
+ int64_t read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
+ bufferlist *outbl, char *out) {
+ // no need to hold the global lock here; we only touch h and
+ // h->file, and read vs write or delete is already protected (via
+ // atomics and asserts).
+ return _read(h, buf, offset, len, outbl, out);
+ }
+ int64_t read_random(FileReader *h, uint64_t offset, size_t len,
+ char *out) {
+ // no need to hold the global lock here; we only touch h and
+ // h->file, and read vs write or delete is already protected (via
+ // atomics and asserts).
+ return _read_random(h, offset, len, out);
+ }
+ void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
+ std::lock_guard l(lock);
+ _invalidate_cache(f, offset, len);
+ }
+ int preallocate(FileRef f, uint64_t offset, uint64_t len) {
+ std::lock_guard l(lock);
+ return _preallocate(f, offset, len);
+ }
+ int truncate(FileWriter *h, uint64_t offset) {
+ std::lock_guard l(lock);
+ return _truncate(h, offset);
+ }
+ int do_replay_recovery_read(FileReader *log,
+ size_t log_pos,
+ size_t read_offset,
+ size_t read_len,
+ bufferlist* bl);
+
+ /// test purpose methods
+ const PerfCounters* get_perf_counters() const {
+ return logger;
+ }
+
+private:
+ // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
+ // They are used for checking if read values are all 0, and reread if so.
+ int read(uint8_t ndev, uint64_t off, uint64_t len,
+ ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
+ int read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);
+};
+
+class OriginalVolumeSelector : public BlueFSVolumeSelector {
+ uint64_t wal_total;
+ uint64_t db_total;
+ uint64_t slow_total;
+
+public:
+ OriginalVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total)
+ : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}
+
+ void* get_hint_by_device(uint8_t dev) const override;
+ void* get_hint_by_dir(const string& dirname) const override;
+
+ void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ // do nothing
+ return;
+ }
+ void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ // do nothing
+ return;
+ }
+ void add_usage(void* hint, uint64_t fsize) override {
+ // do nothing
+ return;
+ }
+ void sub_usage(void* hint, uint64_t fsize) override {
+ // do nothing
+ return;
+ }
+
+ uint8_t select_prefer_bdev(void* hint) override;
+ void get_paths(const std::string& base, paths& res) const override;
+ void dump(ostream& sout) override;
+};
+
+#endif
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
new file mode 100644
index 00000000..df626395
--- /dev/null
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -0,0 +1,583 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BlueRocksEnv.h"
+#include "BlueFS.h"
+#include "include/stringify.h"
+#include "kv/RocksDBStore.h"
+#include "string.h"
+
+rocksdb::Status err_to_status(int r)
+{
+ switch (r) {
+ case 0:
+ return rocksdb::Status::OK();
+ case -ENOENT:
+ return rocksdb::Status::NotFound(rocksdb::Status::kNone);
+ case -EINVAL:
+ return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
+ case -EIO:
+ case -EEXIST:
+ return rocksdb::Status::IOError(rocksdb::Status::kNone);
+ case -ENOLCK:
+ return rocksdb::Status::IOError(strerror(r));
+ default:
+ // FIXME :(
+ ceph_abort_msg("unrecognized error code");
+ return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
+ }
+}
+
+// A file abstraction for reading sequentially through a file
+class BlueRocksSequentialFile : public rocksdb::SequentialFile {
+ BlueFS *fs;
+ BlueFS::FileReader *h;
+ public:
+ BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
+ ~BlueRocksSequentialFile() override {
+ delete h;
+ }
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
+ int64_t r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch);
+ ceph_assert(r >= 0);
+ *result = rocksdb::Slice(scratch, r);
+ return rocksdb::Status::OK();
+ }
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+ // slower that reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ rocksdb::Status Skip(uint64_t n) override {
+ h->buf.skip(n);
+ return rocksdb::Status::OK();
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+ fs->invalidate_cache(h->file, offset, length);
+ return rocksdb::Status::OK();
+ }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
+ BlueFS *fs;
+ BlueFS::FileReader *h;
+ public:
+ BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
+ ~BlueRocksRandomAccessFile() override {
+ delete h;
+ }
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // Safe for concurrent use by multiple threads.
+ rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
+ char* scratch) const override {
+ int64_t r = fs->read_random(h, offset, n, scratch);
+ ceph_assert(r >= 0);
+ *result = rocksdb::Slice(scratch, r);
+ return rocksdb::Status::OK();
+ }
+
+ // Tries to get an unique ID for this file that will be the same each time
+ // the file is opened (and will stay the same while the file is open).
+ // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+ // ID can be created this function returns the length of the ID and places it
+ // in "id"; otherwise, this function returns 0, in which case "id"
+ // may not have been modified.
+ //
+ // This function guarantees, for IDs from a given environment, two unique ids
+ // cannot be made equal to eachother by adding arbitrary bytes to one of
+ // them. That is, no unique ID is the prefix of another.
+ //
+ // This function guarantees that the returned ID will not be interpretable as
+ // a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return snprintf(id, max_size, "%016llx",
+ (unsigned long long)h->file->fnode.ino);
+ };
+
+ // Readahead the file starting from offset by n bytes for caching.
+ rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
+ fs->read(h, &h->buf, offset, n, nullptr, nullptr);
+ return rocksdb::Status::OK();
+ }
+
+ //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+ void Hint(AccessPattern pattern) override {
+ if (pattern == RANDOM)
+ h->buf.max_prefetch = 4096;
+ else if (pattern == SEQUENTIAL)
+ h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+ fs->invalidate_cache(h->file, offset, length);
+ return rocksdb::Status::OK();
+ }
+};
+
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class BlueRocksWritableFile : public rocksdb::WritableFile {
+ BlueFS *fs;
+ BlueFS::FileWriter *h;
+ public:
+ BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
+ ~BlueRocksWritableFile() override {
+ fs->close_writer(h);
+ }
+
+ // Indicates if the class makes use of unbuffered I/O
+ /*bool UseOSBuffer() const {
+ return true;
+ }*/
+
+ // This is needed when you want to allocate
+ // AlignedBuffer for use with file I/O classes
+ // Used for unbuffered file I/O when UseOSBuffer() returns false
+ /*size_t GetRequiredBufferAlignment() const {
+ return c_DefaultPageSize;
+ }*/
+
+ rocksdb::Status Append(const rocksdb::Slice& data) override {
+ fs->append_try_flush(h, data.data(), data.size());
+ return rocksdb::Status::OK();
+ }
+
+ // Positioned write for unbuffered access default forward
+ // to simple append as most of the tests are buffered by default
+ rocksdb::Status PositionedAppend(
+ const rocksdb::Slice& /* data */,
+ uint64_t /* offset */) override {
+ return rocksdb::Status::NotSupported();
+ }
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+ // size due to whole pages writes. The behavior is undefined if called
+ // with other writes to follow.
+ rocksdb::Status Truncate(uint64_t size) override {
+ // we mirror the posix env, which does nothing here; instead, it
+ // truncates to the final size on close. whatever!
+ return rocksdb::Status::OK();
+ //int r = fs->truncate(h, size);
+ // return err_to_status(r);
+ }
+
+ rocksdb::Status Close() override {
+ Flush();
+
+ // mimic posix env, here. shrug.
+ size_t block_size;
+ size_t last_allocated_block;
+ GetPreallocationStatus(&block_size, &last_allocated_block);
+ if (last_allocated_block > 0) {
+ int r = fs->truncate(h, h->pos);
+ if (r < 0)
+ return err_to_status(r);
+ }
+
+ return rocksdb::Status::OK();
+ }
+
+ rocksdb::Status Flush() override {
+ fs->flush(h);
+ return rocksdb::Status::OK();
+ }
+
+ rocksdb::Status Sync() override { // sync data
+ fs->fsync(h);
+ return rocksdb::Status::OK();
+ }
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush().
+ bool IsSyncThreadSafe() const override {
+ return true;
+ }
+
+ // Indicates the upper layers if the current WritableFile implementation
+ // uses direct IO.
+ bool UseDirectIO() const {
+ return false;
+ }
+
+ void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
+ h->write_hint = (const int)hint;
+ }
+
+ /*
+ * Get the size of valid data in the file.
+ */
+ uint64_t GetFileSize() override {
+ return h->file->fnode.size + h->buffer.length();;
+ }
+
+ // For documentation, refer to RandomAccessFile::GetUniqueId()
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return snprintf(id, max_size, "%016llx",
+ (unsigned long long)h->file->fnode.ino);
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+ fs->invalidate_cache(h->file, offset, length);
+ return rocksdb::Status::OK();
+ }
+
+ using rocksdb::WritableFile::RangeSync;
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion.
+ // Default implementation does nothing.
+ rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
+ // round down to page boundaries
+ int partial = offset & 4095;
+ offset -= partial;
+ nbytes += partial;
+ nbytes &= ~4095;
+ if (nbytes)
+ fs->flush_range(h, offset, nbytes);
+ return rocksdb::Status::OK();
+ }
+
+ protected:
+ using rocksdb::WritableFile::Allocate;
+ /*
+ * Pre-allocate space for a file.
+ */
+ rocksdb::Status Allocate(off_t offset, off_t len) {
+ int r = fs->preallocate(h->file, offset, len);
+ return err_to_status(r);
+ }
+};
+
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class BlueRocksDirectory : public rocksdb::Directory {
+ BlueFS *fs;
+ public:
+ explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
+
+ // Fsync directory. Can be called concurrently from multiple threads.
+ rocksdb::Status Fsync() override {
+ // it is sufficient to flush the log.
+ fs->sync_metadata(false);
+ return rocksdb::Status::OK();
+ }
+};
+
+// Identifies a locked file.
+class BlueRocksFileLock : public rocksdb::FileLock {
+ public:
+ BlueFS *fs;
+ BlueFS::FileLock *lock;
+ BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
+ ~BlueRocksFileLock() override {
+ }
+};
+
+
+// --------------------
+// --- BlueRocksEnv ---
+// --------------------
+
+BlueRocksEnv::BlueRocksEnv(BlueFS *f)
+ : EnvWrapper(Env::Default()), // forward most of it to POSIX
+ fs(f)
+{
+
+}
+
+rocksdb::Status BlueRocksEnv::NewSequentialFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::SequentialFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ if (fname[0] == '/')
+ return target()->NewSequentialFile(fname, result, options);
+ std::string dir, file;
+ split(fname, &dir, &file);
+ BlueFS::FileReader *h;
+ int r = fs->open_for_read(dir, file, &h, false);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksSequentialFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::RandomAccessFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ std::string dir, file;
+ split(fname, &dir, &file);
+ BlueFS::FileReader *h;
+ int r = fs->open_for_read(dir, file, &h, true);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksRandomAccessFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewWritableFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ std::string dir, file;
+ split(fname, &dir, &file);
+ BlueFS::FileWriter *h;
+ int r = fs->open_for_write(dir, file, &h, false);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksWritableFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::ReuseWritableFile(
+ const std::string& new_fname,
+ const std::string& old_fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ std::string old_dir, old_file;
+ split(old_fname, &old_dir, &old_file);
+ std::string new_dir, new_file;
+ split(new_fname, &new_dir, &new_file);
+
+ int r = fs->rename(old_dir, old_file, new_dir, new_file);
+ if (r < 0)
+ return err_to_status(r);
+
+ BlueFS::FileWriter *h;
+ r = fs->open_for_write(new_dir, new_file, &h, true);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksWritableFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewDirectory(
+ const std::string& name,
+ std::unique_ptr<rocksdb::Directory>* result)
+{
+ if (!fs->dir_exists(name))
+ return rocksdb::Status::NotFound(name, strerror(ENOENT));
+ result->reset(new BlueRocksDirectory(fs));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
+{
+ if (fname[0] == '/')
+ return target()->FileExists(fname);
+ std::string dir, file;
+ split(fname, &dir, &file);
+ if (fs->stat(dir, file, NULL, NULL) == 0)
+ return rocksdb::Status::OK();
+ return err_to_status(-ENOENT);
+}
+
+rocksdb::Status BlueRocksEnv::GetChildren(
+ const std::string& dir,
+ std::vector<std::string>* result)
+{
+ result->clear();
+ int r = fs->readdir(dir, result);
+ if (r < 0)
+ return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
+{
+ std::string dir, file;
+ split(fname, &dir, &file);
+ int r = fs->unlink(dir, file);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
+{
+ int r = fs->mkdir(dirname);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
+{
+ int r = fs->mkdir(dirname);
+ if (r < 0 && r != -EEXIST)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
+{
+ int r = fs->rmdir(dirname);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetFileSize(
+ const std::string& fname,
+ uint64_t* file_size)
+{
+ std::string dir, file;
+ split(fname, &dir, &file);
+ int r = fs->stat(dir, file, file_size, NULL);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime)
+{
+ std::string dir, file;
+ split(fname, &dir, &file);
+ utime_t mtime;
+ int r = fs->stat(dir, file, NULL, &mtime);
+ if (r < 0)
+ return err_to_status(r);
+ *file_mtime = mtime.sec();
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::RenameFile(
+ const std::string& src,
+ const std::string& target)
+{
+ std::string old_dir, old_file;
+ split(src, &old_dir, &old_file);
+ std::string new_dir, new_file;
+ split(target, &new_dir, &new_file);
+
+ int r = fs->rename(old_dir, old_file, new_dir, new_file);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::LinkFile(
+ const std::string& src,
+ const std::string& target)
+{
+ ceph_abort();
+}
+
+rocksdb::Status BlueRocksEnv::AreFilesSame(
+ const std::string& first,
+ const std::string& second, bool* res)
+{
+ for (auto& path : {first, second}) {
+ if (fs->dir_exists(path)) {
+ continue;
+ }
+ std::string dir, file;
+ split(path, &dir, &file);
+ int r = fs->stat(dir, file, nullptr, nullptr);
+ if (!r) {
+ continue;
+ } else if (r == -ENOENT) {
+ return rocksdb::Status::NotFound("AreFilesSame", path);
+ } else {
+ return err_to_status(r);
+ }
+ }
+ *res = (first == second);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::LockFile(
+ const std::string& fname,
+ rocksdb::FileLock** lock)
+{
+ std::string dir, file;
+ split(fname, &dir, &file);
+ BlueFS::FileLock *l = NULL;
+ int r = fs->lock_file(dir, file, &l);
+ if (r < 0)
+ return err_to_status(r);
+ *lock = new BlueRocksFileLock(fs, l);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
+{
+ BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
+ int r = fs->unlock_file(l->lock);
+ if (r < 0)
+ return err_to_status(r);
+ delete lock;
+ lock = nullptr;
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetAbsolutePath(
+ const std::string& db_path,
+ std::string* output_path)
+{
+ // this is a lie...
+ *output_path = "/" + db_path;
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewLogger(
+ const std::string& fname,
+ std::shared_ptr<rocksdb::Logger>* result)
+{
+ // ignore the filename :)
+ result->reset(create_rocksdb_ceph_logger());
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
+{
+ static int foo = 0;
+ *path = "temp_" + stringify(++foo);
+ return rocksdb::Status::OK();
+}
diff --git a/src/os/bluestore/BlueRocksEnv.h b/src/os/bluestore/BlueRocksEnv.h
new file mode 100644
index 00000000..82cffcd8
--- /dev/null
+++ b/src/os/bluestore/BlueRocksEnv.h
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEROCKSENV_H
+#define CEPH_OS_BLUESTORE_BLUEROCKSENV_H
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/env_mirror.h"
+
+#include "include/ceph_assert.h"
+#include "kv/RocksDBStore.h"
+
+class BlueFS;
+
+class BlueRocksEnv : public rocksdb::EnvWrapper {
+ void split(const std::string &fn, std::string *dir, std::string *file) {
+ size_t slash = fn.rfind('/');
+ *file = fn.substr(slash + 1);
+ while (slash && fn[slash-1] == '/')
+ --slash;
+ *dir = fn.substr(0, slash);
+ }
+
+public:
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure, stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ rocksdb::Status NewSequentialFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::SequentialFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure, stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ rocksdb::Status NewRandomAccessFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::RandomAccessFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure, stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ rocksdb::Status NewWritableFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ rocksdb::Status ReuseWritableFile(
+ const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ rocksdb::Status NewDirectory(
+ const std::string& name,
+ std::unique_ptr<rocksdb::Directory>* result) override;
+
+ // Returns OK if the named file exists.
+ // NotFound if the named file does not exist,
+ // the calling process does not have permission to determine
+ // whether this file exists, or if the path is invalid.
+ // IOError if an IO Error was encountered
+ rocksdb::Status FileExists(const std::string& fname) override;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *results are dropped.
+ rocksdb::Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) override;
+
+ // Delete the named file.
+ rocksdb::Status DeleteFile(const std::string& fname) override;
+
+ // Create the specified directory. Returns error if directory exists.
+ rocksdb::Status CreateDir(const std::string& dirname) override;
+
+ // Create directory if missing. Return Ok if it exists, or successful in
+ // Creating.
+ rocksdb::Status CreateDirIfMissing(const std::string& dirname) override;
+
+ // Delete the specified directory.
+ rocksdb::Status DeleteDir(const std::string& dirname) override;
+
+ // Store the size of fname in *file_size.
+ rocksdb::Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
+
+ // Store the last modification time of fname in *file_mtime.
+ rocksdb::Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override;
+ // Rename file src to target.
+ rocksdb::Status RenameFile(const std::string& src,
+ const std::string& target) override;
+ // Hard Link file src to target.
+ rocksdb::Status LinkFile(const std::string& src, const std::string& target) override;
+
+ // Tell if two files are identical
+ rocksdb::Status AreFilesSame(const std::string& first,
+ const std::string& second, bool* res) override;
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ rocksdb::Status LockFile(const std::string& fname, rocksdb::FileLock** lock) override;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ rocksdb::Status UnlockFile(rocksdb::FileLock* lock) override;
+
+ // *path is set to a temporary directory that can be used for testing. It may
+ // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ rocksdb::Status GetTestDirectory(std::string* path) override;
+
+ // Create and return a log file for storing informational messages.
+ rocksdb::Status NewLogger(
+ const std::string& fname,
+ std::shared_ptr<rocksdb::Logger>* result) override;
+
+ // Get full directory name for this db.
+ rocksdb::Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) override;
+
+ explicit BlueRocksEnv(BlueFS *f);
+private:
+ BlueFS *fs;
+};
+
+#endif
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
new file mode 100644
index 00000000..d701ef4d
--- /dev/null
+++ b/src/os/bluestore/BlueStore.cc
@@ -0,0 +1,15265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <boost/container/flat_set.hpp>
+#include "boost/algorithm/string.hpp"
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "BlueStore.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/intarith.h"
+#include "include/stringify.h"
+#include "include/str_map.h"
+#include "include/util.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/PriorityCache.h"
+#include "Allocator.h"
+#include "FreelistManager.h"
+#include "BlueFS.h"
+#include "BlueRocksEnv.h"
+#include "auth/Crypto.h"
+#include "common/EventTrace.h"
+#include "perfglue/heap_profiler.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+
+using bid_t = decltype(BlueStore::Blob::id);
+
+// bluestore_cache_onode
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+ bluestore_cache_onode);
+
+// bluestore_cache_other
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
+ bluestore_Buffer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
+ bluestore_Extent);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
+ bluestore_Blob);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
+ bluestore_SharedBlob);
+
+// bluestore_txc
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
+ bluestore_txc);
+
+
+// kv store prefixes
+const string PREFIX_SUPER = "S"; // field -> value
+const string PREFIX_STAT = "T"; // field -> value(int64 array)
+const string PREFIX_COLL = "C"; // collection name -> cnode_t
+const string PREFIX_OBJ = "O"; // object name -> onode_t
+const string PREFIX_OMAP = "M"; // u64 + keyname -> value
+const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
+const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
+const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
+const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
+const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
+
+const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
+
+// write a label in the first block. always use this size. note that
+// bluefs makes a matching assumption about the location of its
+// superblock (always the second block of the device).
+#define BDEV_LABEL_BLOCK_SIZE 4096
+
+// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
+#define SUPER_RESERVED 8192
+
+#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
+
+
+/*
+ * extent map blob encoding
+ *
+ * we use the low bits of the blobid field to indicate some common scenarios
+ * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
+ */
+#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
+#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
+#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
+#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
+#define BLOBID_SHIFT_BITS 4
+
+/*
+ * object name key structure
+ *
+ * encoded u8: shard + 2^7 (so that it sorts properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * escaped string: namespace
+ *
+ * escaped string: key or object name
+ * 1 char: '<', '=', or '>'. if =, then object key == object name, and
+ * we are done. otherwise, we are followed by the object name.
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ * 'o'
+ */
+#define ONODE_KEY_SUFFIX 'o'
+
+/*
+ * extent shard key
+ *
+ * object prefix key
+ * u32
+ * 'x'
+ */
+#define EXTENT_SHARD_KEY_SUFFIX 'x'
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does. We do this by escaping anything <= to '#' with #
+ * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
+ * hex digits.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ * NOTE: There is a bug in this implementation: due to implicit
+ * character type conversion in comparison it may produce unexpected
+ * ordering. Unfortunately fixing the bug would mean invalidating the
+ * keys in existing deployments. Instead we do additional sorting
+ * where it is needed.
+ */
+template<typename S>
+static void append_escaped(const string &in, S *out)
+{
+ char hexbyte[in.length() * 3 + 1];
+ char* ptr = &hexbyte[0];
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if (*i <= '#') { // bug: unexpected result for *i > 0x7f
+ *ptr++ = '#';
+ *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
+ *ptr++ = "0123456789abcdef"[*i & 0x0f];
+ } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
+ *ptr++ = '~';
+ *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
+ *ptr++ = "0123456789abcdef"[*i & 0x0f];
+ } else {
+ *ptr++ = *i;
+ }
+ }
+ *ptr++ = '!';
+ out->append(hexbyte, ptr - &hexbyte[0]);
+}
+
+inline unsigned h2i(char c)
+{
+ if ((c >= '0') && (c <= '9')) {
+ return c - 0x30;
+ } else if ((c >= 'a') && (c <= 'f')) {
+ return c - 'a' + 10;
+ } else if ((c >= 'A') && (c <= 'F')) {
+ return c - 'A' + 10;
+ } else {
+ return 256; // make it always larger than 255
+ }
+}
+
+static int decode_escaped(const char *p, string *out)
+{
+ char buff[256];
+ char* ptr = &buff[0];
+ char* max = &buff[252];
+ const char *orig_p = p;
+ while (*p && *p != '!') {
+ if (*p == '#' || *p == '~') {
+ unsigned hex = 0;
+ p++;
+ hex = h2i(*p++) << 4;
+ if (hex > 255) {
+ return -EINVAL;
+ }
+ hex |= h2i(*p++);
+ if (hex > 255) {
+ return -EINVAL;
+ }
+ *ptr++ = hex;
+ } else {
+ *ptr++ = *p++;
+ }
+ if (ptr > max) {
+ out->append(buff, ptr-buff);
+ ptr = &buff[0];
+ }
+ }
+ if (ptr != buff) {
+ out->append(buff, ptr-buff);
+ }
+ return p - orig_p;
+}
+
+// some things we encode in binary (as le32 or le64); print the
+// resulting key strings nicely
+template<typename S>
+static string pretty_binary_string(const S& in)
+{
+ char buf[10];
+ string out;
+ out.reserve(in.length() * 3);
+ enum { NONE, HEX, STRING } mode = NONE;
+ unsigned from = 0, i;
+ for (i=0; i < in.length(); ++i) {
+ if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
+ (mode == HEX && in.length() - i >= 4 &&
+ ((in[i] < 32 || (unsigned char)in[i] > 126) ||
+ (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
+ (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
+ (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
+ if (mode == STRING) {
+ out.append(in.c_str() + from, i - from);
+ out.push_back('\'');
+ }
+ if (mode != HEX) {
+ out.append("0x");
+ mode = HEX;
+ }
+ if (in.length() - i >= 4) {
+ // print a whole u32 at once
+ snprintf(buf, sizeof(buf), "%08x",
+ (uint32_t)(((unsigned char)in[i] << 24) |
+ ((unsigned char)in[i+1] << 16) |
+ ((unsigned char)in[i+2] << 8) |
+ ((unsigned char)in[i+3] << 0)));
+ i += 3;
+ } else {
+ snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
+ }
+ out.append(buf);
+ } else {
+ if (mode != STRING) {
+ out.push_back('\'');
+ mode = STRING;
+ from = i;
+ }
+ }
+ }
+ if (mode == STRING) {
+ out.append(in.c_str() + from, i - from);
+ out.push_back('\'');
+ }
+ return out;
+}
+
+template<typename T>
+static void _key_encode_shard(shard_id_t shard, T *key)
+{
+ key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
+}
+
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+ pshard->id = (uint8_t)*key - (uint8_t)0x80;
+ return key + 1;
+}
+
+static void get_coll_range(const coll_t& cid, int bits,
+ ghobject_t *temp_start, ghobject_t *temp_end,
+ ghobject_t *start, ghobject_t *end)
+{
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ start->shard_id = pgid.shard;
+ *temp_start = *start;
+
+ start->hobj.pool = pgid.pool();
+ temp_start->hobj.pool = -2ll - pgid.pool();
+
+ *end = *start;
+ *temp_end = *temp_start;
+
+ uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
+ start->hobj.set_bitwise_key_u32(reverse_hash);
+ temp_start->hobj.set_bitwise_key_u32(reverse_hash);
+
+ uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
+ if (end_hash > 0xffffffffull)
+ end_hash = 0xffffffffull;
+
+ end->hobj.set_bitwise_key_u32(end_hash);
+ temp_end->hobj.set_bitwise_key_u32(end_hash);
+ } else {
+ start->shard_id = shard_id_t::NO_SHARD;
+ start->hobj.pool = -1ull;
+
+ *end = *start;
+ start->hobj.set_bitwise_key_u32(0);
+ end->hobj.set_bitwise_key_u32(0xffffffff);
+
+ // no separate temp section
+ *temp_start = *end;
+ *temp_end = *end;
+ }
+
+ start->generation = 0;
+ end->generation = 0;
+ temp_start->generation = 0;
+ temp_end->generation = 0;
+}
+
+static void get_shared_blob_key(uint64_t sbid, string *key)
+{
+ key->clear();
+ _key_encode_u64(sbid, key);
+}
+
+static int get_key_shared_blob(const string& key, uint64_t *sbid)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t))
+ return -1;
+ _key_decode_u64(p, sbid);
+ return 0;
+}
+
+template<typename S>
+static void _key_encode_prefix(const ghobject_t& oid, S *key)
+{
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+}
+
+static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
+{
+ p = _key_decode_shard(p, &oid->shard_id);
+
+ uint64_t pool;
+ p = _key_decode_u64(p, &pool);
+ oid->hobj.pool = pool - 0x8000000000000000ull;
+
+ unsigned hash;
+ p = _key_decode_u32(p, &hash);
+
+ oid->hobj.set_bitwise_key_u32(hash);
+
+ return p;
+}
+
+#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
+
+template<typename S>
+static int get_key_object(const S& key, ghobject_t *oid)
+{
+ int r;
+ const char *p = key.c_str();
+
+ if (key.length() < ENCODED_KEY_PREFIX_LEN)
+ return -1;
+
+ p = _key_decode_prefix(p, oid);
+
+ if (key.length() == ENCODED_KEY_PREFIX_LEN)
+ return -2;
+
+ r = decode_escaped(p, &oid->hobj.nspace);
+ if (r < 0)
+ return -2;
+ p += r + 1;
+
+ string k;
+ r = decode_escaped(p, &k);
+ if (r < 0)
+ return -3;
+ p += r + 1;
+ if (*p == '=') {
+ // no key
+ ++p;
+ oid->hobj.oid.name = k;
+ } else if (*p == '<' || *p == '>') {
+ // key + name
+ ++p;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -5;
+ p += r + 1;
+ oid->hobj.set_key(k);
+ } else {
+ // malformed
+ return -6;
+ }
+
+ p = _key_decode_u64(p, &oid->hobj.snap.val);
+ p = _key_decode_u64(p, &oid->generation);
+
+ if (*p != ONODE_KEY_SUFFIX) {
+ return -7;
+ }
+ p++;
+ if (*p) {
+ // if we get something other than a null terminator here,
+ // something goes wrong.
+ return -8;
+ }
+
+ return 0;
+}
+
+template<typename S>
+static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
+{
+ key->clear();
+
+ size_t max_len = ENCODED_KEY_PREFIX_LEN +
+ (oid.hobj.nspace.length() * 3 + 1) +
+ (oid.hobj.get_key().length() * 3 + 1) +
+ 1 + // for '<', '=', or '>'
+ (oid.hobj.oid.name.length() * 3 + 1) +
+ 8 + 8 + 1;
+ key->reserve(max_len);
+
+ _key_encode_prefix(oid, key);
+
+ append_escaped(oid.hobj.nspace, key);
+
+ if (oid.hobj.get_key().length()) {
+ // is a key... could be < = or >.
+ append_escaped(oid.hobj.get_key(), key);
+ // (ASCII chars < = and > sort in that order, yay)
+ int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
+ if (r) {
+ key->append(r > 0 ? ">" : "<");
+ append_escaped(oid.hobj.oid.name, key);
+ } else {
+ // same as no key
+ key->append("=");
+ }
+ } else {
+ // no key
+ append_escaped(oid.hobj.oid.name, key);
+ key->append("=");
+ }
+
+ _key_encode_u64(oid.hobj.snap, key);
+ _key_encode_u64(oid.generation, key);
+
+ key->push_back(ONODE_KEY_SUFFIX);
+
+ // sanity check
+ if (true) {
+ ghobject_t t;
+ int r = get_key_object(*key, &t);
+ if (r || t != oid) {
+ derr << " r " << r << dendl;
+ derr << "key " << pretty_binary_string(*key) << dendl;
+ derr << "oid " << oid << dendl;
+ derr << " t " << t << dendl;
+ ceph_assert(r == 0 && t == oid);
+ }
+ }
+}
+
+
+// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
+// char lets us quickly test whether it is a shard key without decoding any
+// of the prefix bytes.
+template<typename S>
+static void get_extent_shard_key(const S& onode_key, uint32_t offset,
+ string *key)
+{
+ key->clear();
+ key->reserve(onode_key.length() + 4 + 1);
+ key->append(onode_key.c_str(), onode_key.size());
+ _key_encode_u32(offset, key);
+ key->push_back(EXTENT_SHARD_KEY_SUFFIX);
+}
+
+static void rewrite_extent_shard_key(uint32_t offset, string *key)
+{
+ ceph_assert(key->size() > sizeof(uint32_t) + 1);
+ ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
+ _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
+}
+
+template<typename S>
+static void generate_extent_shard_key_and_apply(
+ const S& onode_key,
+ uint32_t offset,
+ string *key,
+ std::function<void(const string& final_key)> apply)
+{
+ if (key->empty()) { // make full key
+ ceph_assert(!onode_key.empty());
+ get_extent_shard_key(onode_key, offset, key);
+ } else {
+ rewrite_extent_shard_key(offset, key);
+ }
+ apply(*key);
+}
+
+int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
+{
+ ceph_assert(key.size() > sizeof(uint32_t) + 1);
+ ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
+ int okey_len = key.size() - sizeof(uint32_t) - 1;
+ *onode_key = key.substr(0, okey_len);
+ const char *p = key.data() + okey_len;
+ _key_decode_u32(p, offset);
+ return 0;
+}
+
+static bool is_extent_shard_key(const string& key)
+{
+ return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
+}
+
+// '-' < '.' < '~'
+static void get_omap_header(uint64_t id, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('-');
+}
+
+// hmm, I don't think there's any need to escape the user key since we
+// have a clean prefix.
+static void get_omap_key(uint64_t id, const string& key, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('.');
+ out->append(key);
+}
+
+static void rewrite_omap_key(uint64_t id, string old, string *out)
+{
+ _key_encode_u64(id, out);
+ out->append(old.c_str() + out->length(), old.size() - out->length());
+}
+
+static void decode_omap_key(const string& key, string *user_key)
+{
+ *user_key = key.substr(sizeof(uint64_t) + 1);
+}
+
+static void get_omap_tail(uint64_t id, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('~');
+}
+
+static void get_deferred_key(uint64_t seq, string *out)
+{
+ _key_encode_u64(seq, out);
+}
+
+static void get_pool_stat_key(int64_t pool_id, string *key)
+{
+ key->clear();
+ _key_encode_u64(pool_id, key);
+}
+
+static int get_key_pool_stat(const string& key, uint64_t* pool_id)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t))
+ return -1;
+ _key_decode_u64(p, pool_id);
+ return 0;
+}
+
+template <int LogLevelV>
+void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
+{
+ uint64_t pos = 0;
+ for (auto& s : em.shards) {
+ dout(LogLevelV) << __func__ << " shard " << *s.shard_info
+ << (s.loaded ? " (loaded)" : "")
+ << (s.dirty ? " (dirty)" : "")
+ << dendl;
+ }
+ for (auto& e : em.extent_map) {
+ dout(LogLevelV) << __func__ << " " << e << dendl;
+ ceph_assert(e.logical_offset >= pos);
+ pos = e.logical_offset + e.length;
+ const bluestore_blob_t& blob = e.blob->get_blob();
+ if (blob.has_csum()) {
+ vector<uint64_t> v;
+ unsigned n = blob.get_csum_count();
+ for (unsigned i = 0; i < n; ++i)
+ v.push_back(blob.get_csum_item(i));
+ dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
+ << dendl;
+ }
+ std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
+ for (auto& i : e.blob->shared_blob->bc.buffer_map) {
+ dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
+ << "~" << i.second->length << std::dec
+ << " " << *i.second << dendl;
+ }
+ }
+}
+
+template <int LogLevelV>
+void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
+{
+ if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
+ return;
+ dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
+ << " nid " << o.onode.nid
+ << " size 0x" << std::hex << o.onode.size
+ << " (" << std::dec << o.onode.size << ")"
+ << " expected_object_size " << o.onode.expected_object_size
+ << " expected_write_size " << o.onode.expected_write_size
+ << " in " << o.onode.extent_map_shards.size() << " shards"
+ << ", " << o.extent_map.spanning_blob_map.size()
+ << " spanning blobs"
+ << dendl;
+ for (auto p = o.onode.attrs.begin();
+ p != o.onode.attrs.end();
+ ++p) {
+ dout(LogLevelV) << __func__ << " attr " << p->first
+ << " len " << p->second.length() << dendl;
+ }
+ _dump_extent_map<LogLevelV>(cct, o.extent_map);
+}
+
+template <int LogLevelV>
+void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
+{
+ dout(LogLevelV) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+}
+
+// merge operators
+
+struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
+ void merge_nonexistent(
+ const char *rdata, size_t rlen, std::string *new_value) override {
+ *new_value = std::string(rdata, rlen);
+ }
+ void merge(
+ const char *ldata, size_t llen,
+ const char *rdata, size_t rlen,
+ std::string *new_value) override {
+ ceph_assert(llen == rlen);
+ ceph_assert((rlen % 8) == 0);
+ new_value->resize(rlen);
+ const ceph_le64* lv = (const ceph_le64*)ldata;
+ const ceph_le64* rv = (const ceph_le64*)rdata;
+ ceph_le64* nv = &(ceph_le64&)new_value->at(0);
+ for (size_t i = 0; i < rlen >> 3; ++i) {
+ nv[i] = lv[i] + rv[i];
+ }
+ }
+ // We use each operator name and each prefix to construct the
+ // overall RocksDB operator name for consistency check at open time.
+ const char *name() const override {
+ return "int64_array";
+ }
+};
+
+
+// Buffer
+
+ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
+{
+ out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
+ << b.offset << "~" << b.length << std::dec
+ << " " << BlueStore::Buffer::get_state_name(b.state);
+ if (b.flags)
+ out << " " << BlueStore::Buffer::get_flag_name(b.flags);
+ return out << ")";
+}
+
+namespace {
+
+/*
+ * Due to a bug in key string encoding (see a comment for append_escaped)
+ * the KeyValueDB iterator does not lexicographically sort the same
+ * way that ghobject_t does: objects with the same hash may have wrong order.
+ *
+ * This is the iterator wrapper that fixes the keys order.
+ */
+
+class CollectionListIterator {
+public:
+ CollectionListIterator(const KeyValueDB::Iterator &it)
+ : m_it(it) {
+ }
+ virtual ~CollectionListIterator() {
+ }
+
+ virtual bool valid() const = 0;
+ virtual const ghobject_t &oid() const = 0;
+ virtual void lower_bound(const ghobject_t &oid) = 0;
+ virtual void upper_bound(const ghobject_t &oid) = 0;
+ virtual void next() = 0;
+
+ virtual int cmp(const ghobject_t &oid) const = 0;
+
+ bool is_ge(const ghobject_t &oid) const {
+ return cmp(oid) >= 0;
+ }
+
+ bool is_lt(const ghobject_t &oid) const {
+ return cmp(oid) < 0;
+ }
+
+protected:
+ KeyValueDB::Iterator m_it;
+};
+
+class SimpleCollectionListIterator : public CollectionListIterator {
+public:
+ SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
+ : CollectionListIterator(it), m_cct(cct) {
+ }
+
+ bool valid() const override {
+ return m_it->valid();
+ }
+
+ const ghobject_t &oid() const override {
+ ceph_assert(valid());
+
+ return m_oid;
+ }
+
+ void lower_bound(const ghobject_t &oid) override {
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ m_it->lower_bound(key);
+ get_oid();
+ }
+
+ void upper_bound(const ghobject_t &oid) override {
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ m_it->upper_bound(key);
+ get_oid();
+ }
+
+ void next() override {
+ ceph_assert(valid());
+
+ m_it->next();
+ get_oid();
+ }
+
+ int cmp(const ghobject_t &oid) const override {
+ ceph_assert(valid());
+
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ return m_it->key().compare(key);
+ }
+
+private:
+ CephContext *m_cct;
+ ghobject_t m_oid;
+
+ void get_oid() {
+ if (!valid()) {
+ return;
+ }
+
+ if (is_extent_shard_key(m_it->key())) {
+ next();
+ return;
+ }
+
+ m_oid = ghobject_t();
+ int r = get_key_object(m_it->key(), &m_oid);
+ ceph_assert(r == 0);
+ }
+};
+
+class SortedCollectionListIterator : public CollectionListIterator {
+public:
+ SortedCollectionListIterator(const KeyValueDB::Iterator &it)
+ : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
+ }
+
+ bool valid() const override {
+ return m_chunk_iter != m_chunk.end();
+ }
+
+ const ghobject_t &oid() const override {
+ ceph_assert(valid());
+
+ return m_chunk_iter->first;
+ }
+
+ void lower_bound(const ghobject_t &oid) override {
+ std::string key;
+ _key_encode_prefix(oid, &key);
+
+ m_it->lower_bound(key);
+ m_chunk_iter = m_chunk.end();
+ if (!get_next_chunk()) {
+ return;
+ }
+
+ if (this->oid().shard_id != oid.shard_id ||
+ this->oid().hobj.pool != oid.hobj.pool ||
+ this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ return;
+ }
+
+ m_chunk_iter = m_chunk.lower_bound(oid);
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ void upper_bound(const ghobject_t &oid) override {
+ lower_bound(oid);
+
+ if (valid() && this->oid() == oid) {
+ next();
+ }
+ }
+
+ void next() override {
+ ceph_assert(valid());
+
+ m_chunk_iter++;
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ int cmp(const ghobject_t &oid) const override {
+ ceph_assert(valid());
+
+ if (this->oid() < oid) {
+ return -1;
+ }
+ if (this->oid() > oid) {
+ return 1;
+ }
+ return 0;
+ }
+
+private:
+ std::map<ghobject_t, std::string> m_chunk;
+ std::map<ghobject_t, std::string>::iterator m_chunk_iter;
+
+ bool get_next_chunk() {
+ while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+ m_it->next();
+ }
+
+ if (!m_it->valid()) {
+ return false;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(m_it->key(), &oid);
+ ceph_assert(r == 0);
+
+ m_chunk.clear();
+ while (true) {
+ m_chunk.insert({oid, m_it->key()});
+
+ do {
+ m_it->next();
+ } while (m_it->valid() && is_extent_shard_key(m_it->key()));
+
+ if (!m_it->valid()) {
+ break;
+ }
+
+ ghobject_t next;
+ r = get_key_object(m_it->key(), &next);
+ ceph_assert(r == 0);
+ if (next.shard_id != oid.shard_id ||
+ next.hobj.pool != oid.hobj.pool ||
+ next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ break;
+ }
+ oid = next;
+ }
+
+ m_chunk_iter = m_chunk.begin();
+ return true;
+ }
+};
+
+} // anonymous namespace
+
+// Garbage Collector
+
+void BlueStore::GarbageCollector::process_protrusive_extents(
+ const BlueStore::ExtentMap& extent_map,
+ uint64_t start_offset,
+ uint64_t end_offset,
+ uint64_t start_touch_offset,
+ uint64_t end_touch_offset,
+ uint64_t min_alloc_size)
+{
+ ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset);
+
+ uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
+ uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
+
+ dout(30) << __func__ << " (hex): [" << std::hex
+ << lookup_start_offset << ", " << lookup_end_offset
+ << ")" << std::dec << dendl;
+
+ for (auto it = extent_map.seek_lextent(lookup_start_offset);
+ it != extent_map.extent_map.end() &&
+ it->logical_offset < lookup_end_offset;
+ ++it) {
+ uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
+ uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
+
+ dout(30) << __func__ << " " << *it
+ << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
+ << dendl;
+
+ Blob* b = it->blob.get();
+
+ if (it->logical_offset >=start_touch_offset &&
+ it->logical_end() <= end_touch_offset) {
+ // Process extents within the range affected by
+ // the current write request.
+ // Need to take into account if existing extents
+ // can be merged with them (uncompressed case)
+ if (!b->get_blob().is_compressed()) {
+ if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+ --blob_info_counted->expected_allocations; // don't need to allocate
+ // new AU for compressed
+ // data since another
+ // collocated uncompressed
+ // blob already exists
+ dout(30) << __func__ << " --expected:"
+ << alloc_unit_start << dendl;
+ }
+ used_alloc_unit = alloc_unit_end;
+ blob_info_counted = nullptr;
+ }
+ } else if (b->get_blob().is_compressed()) {
+
+ // additionally we take compressed blobs that were not impacted
+ // by the write into account too
+ BlobInfo& bi =
+ affected_blobs.emplace(
+ b, BlobInfo(b->get_referenced_bytes())).first->second;
+
+ int adjust =
+ (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
+ bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
+ dout(30) << __func__ << " expected_allocations="
+ << bi.expected_allocations << " end_au:"
+ << alloc_unit_end << dendl;
+
+ blob_info_counted = &bi;
+ used_alloc_unit = alloc_unit_end;
+
+ ceph_assert(it->length <= bi.referenced_bytes);
+ bi.referenced_bytes -= it->length;
+ dout(30) << __func__ << " affected_blob:" << *b
+ << " unref 0x" << std::hex << it->length
+ << " referenced = 0x" << bi.referenced_bytes
+ << std::dec << dendl;
+ // NOTE: we can't move specific blob to resulting GC list here
+ // when reference counter == 0 since subsequent extents might
+ // decrement its expected_allocation.
+ // Hence need to enumerate all the extents first.
+ if (!bi.collect_candidate) {
+ bi.first_lextent = it;
+ bi.collect_candidate = true;
+ }
+ bi.last_lextent = it;
+ } else {
+ if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+ // don't need to allocate new AU for compressed data since another
+ // collocated uncompressed blob already exists
+ --blob_info_counted->expected_allocations;
+ dout(30) << __func__ << " --expected_allocations:"
+ << alloc_unit_start << dendl;
+ }
+ used_alloc_unit = alloc_unit_end;
+ blob_info_counted = nullptr;
+ }
+ }
+
+ for (auto b_it = affected_blobs.begin();
+ b_it != affected_blobs.end();
+ ++b_it) {
+ Blob* b = b_it->first;
+ BlobInfo& bi = b_it->second;
+ if (bi.referenced_bytes == 0) {
+ uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
+ int64_t blob_expected_for_release =
+ round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
+
+ dout(30) << __func__ << " " << *(b_it->first)
+ << " expected4release=" << blob_expected_for_release
+ << " expected_allocations=" << bi.expected_allocations
+ << dendl;
+ int64_t benefit = blob_expected_for_release - bi.expected_allocations;
+ if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
+ if (bi.collect_candidate) {
+ auto it = bi.first_lextent;
+ bool bExit = false;
+ do {
+ if (it->blob.get() == b) {
+ extents_to_collect.insert(it->logical_offset, it->length);
+ }
+ bExit = it == bi.last_lextent;
+ ++it;
+ } while (!bExit);
+ }
+ expected_for_release += blob_expected_for_release;
+ expected_allocations += bi.expected_allocations;
+ }
+ }
+ }
+}
+
+int64_t BlueStore::GarbageCollector::estimate(
+ uint64_t start_offset,
+ uint64_t length,
+ const BlueStore::ExtentMap& extent_map,
+ const BlueStore::old_extent_map_t& old_extents,
+ uint64_t min_alloc_size)
+{
+
+ affected_blobs.clear();
+ extents_to_collect.clear();
+ used_alloc_unit = boost::optional<uint64_t >();
+ blob_info_counted = nullptr;
+
+ uint64_t gc_start_offset = start_offset;
+ uint64_t gc_end_offset = start_offset + length;
+
+ uint64_t end_offset = start_offset + length;
+
+ for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
+ Blob* b = it->e.blob.get();
+ if (b->get_blob().is_compressed()) {
+
+ // update gc_start_offset/gc_end_offset if needed
+ gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
+ gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
+
+ auto o = it->e.logical_offset;
+ auto l = it->e.length;
+
+ uint64_t ref_bytes = b->get_referenced_bytes();
+ // micro optimization to bypass blobs that have no more references
+ if (ref_bytes != 0) {
+ dout(30) << __func__ << " affected_blob:" << *b
+ << " unref 0x" << std::hex << o << "~" << l
+ << std::dec << dendl;
+ affected_blobs.emplace(b, BlobInfo(ref_bytes));
+ }
+ }
+ }
+ dout(30) << __func__ << " gc range(hex): [" << std::hex
+ << gc_start_offset << ", " << gc_end_offset
+ << ")" << std::dec << dendl;
+
+ // enumerate preceeding extents to check if they reference affected blobs
+ if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
+ process_protrusive_extents(extent_map,
+ gc_start_offset,
+ gc_end_offset,
+ start_offset,
+ end_offset,
+ min_alloc_size);
+ }
+ return expected_for_release - expected_allocations;
+}
+
+// Cache
+
+BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
+ PerfCounters *logger)
+{
+ Cache *c = nullptr;
+
+ if (type == "lru")
+ c = new LRUCache(cct);
+ else if (type == "2q")
+ c = new TwoQCache(cct);
+ else
+ ceph_abort_msg("unrecognized cache type");
+
+ c->logger = logger;
+ return c;
+}
+
+void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
+{
+ std::lock_guard l(lock);
+ if (cct->_conf->objectstore_blackhole) {
+ // do not trim if we are throwing away IOs a layer down
+ return;
+ }
+ _trim(onode_max, buffer_max);
+}
+
+void BlueStore::Cache::trim_all()
+{
+ std::lock_guard l(lock);
+ // we should not be shutting down after the blackhole is enabled
+ assert(!cct->_conf->objectstore_blackhole);
+ _trim(0, 0);
+}
+
+// LRUCache
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
+
+void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
+{
+ auto p = onode_lru.iterator_to(*o);
+ onode_lru.erase(p);
+ onode_lru.push_front(*o);
+}
+
+void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
+{
+ dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
+ << " buffers " << buffer_size << " / " << buffer_max
+ << dendl;
+
+ _audit("trim start");
+
+ // buffers
+ while (buffer_size > buffer_max) {
+ auto i = buffer_lru.rbegin();
+ if (i == buffer_lru.rend()) {
+ // stop if buffer_lru is now empty
+ break;
+ }
+
+ Buffer *b = &*i;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
+ }
+
+ // onodes
+ if (onode_max >= onode_lru.size() ||
+ last_pinned == onode_lru.begin()) {
+ return; // don't even try
+ }
+ uint64_t num = onode_lru.size() - onode_max;
+
+ auto p = last_pinned;
+ last_pinned = onode_lru.end();
+ ceph_assert(p != onode_lru.begin());
+ --p;
+ int skipped = 0;
+ int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
+ while (num > 0) {
+ Onode *o = &*p;
+ int refs = o->nref.load();
+ if (refs > 1) {
+ dout(20) << __func__ << " " << o->oid << " has " << refs
+ << " refs, skipping" << dendl;
+ if (++skipped >= max_skipped) {
+ dout(15) << __func__ << " maximum skip pinned reached; stopping with "
+ << num << " left to trim" << dendl;
+ last_pinned = p;
+ break;
+ }
+
+ if (p == onode_lru.begin()) {
+ break;
+ } else {
+ p--;
+ num--;
+ continue;
+ }
+ }
+ dout(30) << __func__ << " rm " << o->oid << dendl;
+ if (p != onode_lru.begin()) {
+ _onode_lru_erase(p--);
+ } else {
+ _onode_lru_erase(p);
+ num = 1; // fake num to end the loop
+ // in we might still have some pinned onodes
+ }
+ o->get(); // paranoia
+ o->c->onode_map.remove(o->oid);
+ o->put();
+ --num;
+ }
+}
+
+#ifdef DEBUG_CACHE
+void BlueStore::LRUCache::_audit(const char *when)
+{
+ dout(10) << __func__ << " " << when << " start" << dendl;
+ uint64_t s = 0;
+ for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
+ s += i->length;
+ }
+ if (s != buffer_size) {
+ derr << __func__ << " buffer_size " << buffer_size << " actual " << s
+ << dendl;
+ for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
+ derr << __func__ << " " << *i << dendl;
+ }
+ ceph_assert(s == buffer_size);
+ }
+ dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
+ << " ok" << dendl;
+}
+#endif
+
+// TwoQCache
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
+
+
+void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
+{
+ auto p = onode_lru.iterator_to(*o);
+ _onode_lru_erase(p);
+ onode_lru.push_front(*o);
+}
+
+void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
+{
+ dout(20) << __func__ << " level " << level << " near " << near
+ << " on " << *b
+ << " which has cache_private " << b->cache_private << dendl;
+ if (near) {
+ b->cache_private = near->cache_private;
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
+ break;
+ case BUFFER_HOT:
+ buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ } else if (b->cache_private == BUFFER_NEW) {
+ b->cache_private = BUFFER_WARM_IN;
+ if (level > 0) {
+ buffer_warm_in.push_front(*b);
+ } else {
+ // take caller hint to start at the back of the warm queue
+ buffer_warm_in.push_back(*b);
+ }
+ } else {
+ // we got a hint from discard
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // stay in warm_in. move to front, even though 2Q doesn't actually
+ // do this.
+ dout(20) << __func__ << " move to front of warm " << *b << dendl;
+ buffer_warm_in.push_front(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ b->cache_private = BUFFER_HOT;
+ // move to hot. fall-thru
+ case BUFFER_HOT:
+ dout(20) << __func__ << " move to front of hot " << *b << dendl;
+ buffer_hot.push_front(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ }
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ buffer_list_bytes[b->cache_private] += b->length;
+ }
+}
+
+void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
+{
+ dout(20) << __func__ << " " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(buffer_list_bytes[b->cache_private] >= b->length);
+ buffer_list_bytes[b->cache_private] -= b->length;
+ }
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
+ break;
+ case BUFFER_WARM_OUT:
+ buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
+ break;
+ case BUFFER_HOT:
+ buffer_hot.erase(buffer_hot.iterator_to(*b));
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+}
+
+void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
+{
+ TwoQCache *src = static_cast<TwoQCache*>(srcc);
+ src->_rm_buffer(b);
+
+ // preserve which list we're on (even if we can't preserve the order!)
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ ceph_assert(!b->is_empty());
+ buffer_warm_in.push_back(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ buffer_warm_out.push_back(*b);
+ break;
+ case BUFFER_HOT:
+ ceph_assert(!b->is_empty());
+ buffer_hot.push_back(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ buffer_list_bytes[b->cache_private] += b->length;
+ }
+}
+
+void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
+{
+ dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
+ buffer_list_bytes[b->cache_private] += delta;
+ }
+}
+
+void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
+{
+ dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
+ << " buffers " << buffer_bytes << " / " << buffer_max
+ << dendl;
+
+ _audit("trim start");
+
+ // buffers
+ if (buffer_bytes > buffer_max) {
+ uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
+ uint64_t khot = buffer_max - kin;
+
+ // pre-calculate kout based on average buffer size too,
+ // which is typical(the warm_in and hot lists may change later)
+ uint64_t kout = 0;
+ uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
+ if (buffer_num) {
+ uint64_t buffer_avg_size = buffer_bytes / buffer_num;
+ ceph_assert(buffer_avg_size);
+ uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
+ kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
+ }
+
+ if (buffer_list_bytes[BUFFER_HOT] < khot) {
+ // hot is small, give slack to warm_in
+ kin += khot - buffer_list_bytes[BUFFER_HOT];
+ } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
+ // warm_in is small, give slack to hot
+ khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
+ }
+
+ // adjust warm_in list
+ int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
+ uint64_t evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = buffer_warm_in.rbegin();
+ if (p == buffer_warm_in.rend()) {
+ // stop if warm_in list is now empty
+ break;
+ }
+
+ Buffer *b = &*p;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
+ buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->state = Buffer::STATE_EMPTY;
+ b->data.clear();
+ buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
+ buffer_warm_out.push_front(*b);
+ b->cache_private = BUFFER_WARM_OUT;
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from warm_in list, done evicting warm_in buffers"
+ << dendl;
+ }
+
+ // adjust hot list
+ to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
+ evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = buffer_hot.rbegin();
+ if (p == buffer_hot.rend()) {
+ // stop if hot list is now empty
+ break;
+ }
+
+ Buffer *b = &*p;
+ dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
+ ceph_assert(b->is_clean());
+ // adjust evict size before buffer goes invalid
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->space->_rm_buffer(this, b);
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from hot list, done evicting hot buffers"
+ << dendl;
+ }
+
+ // adjust warm out list too, if necessary
+ int64_t num = buffer_warm_out.size() - kout;
+ while (num-- > 0) {
+ Buffer *b = &*buffer_warm_out.rbegin();
+ ceph_assert(b->is_empty());
+ dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
+ }
+ }
+
+ // onodes
+ if (onode_max >= onode_lru.size() ||
+ last_pinned == onode_lru.begin()) {
+ return; // don't even try
+ }
+ uint64_t num = onode_lru.size() - onode_max;
+
+ auto p = last_pinned;
+ last_pinned = onode_lru.end();
+ ceph_assert(p != onode_lru.begin());
+ --p;
+ int skipped = 0;
+ int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
+ while (num > 0) {
+ Onode *o = &*p;
+ dout(20) << __func__ << " considering " << o << dendl;
+ int refs = o->nref.load();
+ if (refs > 1) {
+ dout(20) << __func__ << " " << o->oid << " has " << refs
+ << " refs; skipping" << dendl;
+ if (++skipped >= max_skipped) {
+ dout(15) << __func__ << " maximum skip pinned reached; stopping with "
+ << num << " left to trim" << dendl;
+ last_pinned = p;
+ break;
+ }
+
+ if (p == onode_lru.begin()) {
+ break;
+ } else {
+ p--;
+ num--;
+ continue;
+ }
+ }
+ dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<<onode_lru.size()<< dendl;
+ if (p != onode_lru.begin()) {
+ _onode_lru_erase(p--);
+ } else {
+ _onode_lru_erase(p);
+ num = 1; // fake num to end the loop
+ // in we might still have some pinned onodes
+ }
+ o->get(); // paranoia
+ o->c->onode_map.remove(o->oid);
+ o->put();
+ --num;
+ }
+}
+
+#ifdef DEBUG_CACHE
+void BlueStore::TwoQCache::_audit(const char *when)
+{
+ dout(10) << __func__ << " " << when << " start" << dendl;
+ uint64_t s = 0;
+ for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
+ s += i->length;
+ }
+
+ uint64_t hot_bytes = s;
+ if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
+ derr << __func__ << " hot_list_bytes "
+ << buffer_list_bytes[BUFFER_HOT]
+ << " != actual " << hot_bytes
+ << dendl;
+ ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
+ }
+
+ for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
+ s += i->length;
+ }
+
+ uint64_t warm_in_bytes = s - hot_bytes;
+ if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
+ derr << __func__ << " warm_in_list_bytes "
+ << buffer_list_bytes[BUFFER_WARM_IN]
+ << " != actual " << warm_in_bytes
+ << dendl;
+ ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
+ }
+
+ if (s != buffer_bytes) {
+ derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
+ << dendl;
+ ceph_assert(s == buffer_bytes);
+ }
+
+ dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
+}
+#endif
+
+
+// BufferSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
+
+void BlueStore::BufferSpace::_clear(Cache* cache)
+{
+ // note: we already hold cache->lock
+ ldout(cache->cct, 20) << __func__ << dendl;
+ while (!buffer_map.empty()) {
+ _rm_buffer(cache, buffer_map.begin());
+ }
+}
+
+int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
+{
+ // note: we already hold cache->lock
+ ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
+ << std::dec << dendl;
+ int cache_private = 0;
+ cache->_audit("discard start");
+ auto i = _data_lower_bound(offset);
+ uint32_t end = offset + length;
+ while (i != buffer_map.end()) {
+ Buffer *b = i->second.get();
+ if (b->offset >= end) {
+ break;
+ }
+ if (b->cache_private > cache_private) {
+ cache_private = b->cache_private;
+ }
+ if (b->offset < offset) {
+ int64_t front = offset - b->offset;
+ if (b->end() > end) {
+ // drop middle (split)
+ uint32_t tail = b->end() - end;
+ if (b->data.length()) {
+ bufferlist bl;
+ bl.substr_of(b->data, b->length - tail, tail);
+ Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
+ nb->maybe_rebuild();
+ _add_buffer(cache, nb, 0, b);
+ } else {
+ _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
+ 0, b);
+ }
+ if (!b->is_writing()) {
+ cache->_adjust_buffer_size(b, front - (int64_t)b->length);
+ }
+ b->truncate(front);
+ b->maybe_rebuild();
+ cache->_audit("discard end 1");
+ break;
+ } else {
+ // drop tail
+ if (!b->is_writing()) {
+ cache->_adjust_buffer_size(b, front - (int64_t)b->length);
+ }
+ b->truncate(front);
+ b->maybe_rebuild();
+ ++i;
+ continue;
+ }
+ }
+ if (b->end() <= end) {
+ // drop entire buffer
+ _rm_buffer(cache, i++);
+ continue;
+ }
+ // drop front
+ uint32_t keep = b->end() - end;
+ if (b->data.length()) {
+ bufferlist bl;
+ bl.substr_of(b->data, b->length - keep, keep);
+ Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
+ nb->maybe_rebuild();
+ _add_buffer(cache, nb, 0, b);
+ } else {
+ _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
+ }
+ _rm_buffer(cache, i);
+ cache->_audit("discard end 2");
+ break;
+ }
+ return cache_private;
+}
+
+void BlueStore::BufferSpace::read(
+ Cache* cache,
+ uint32_t offset,
+ uint32_t length,
+ BlueStore::ready_regions_t& res,
+ interval_set<uint32_t>& res_intervals,
+ int flags)
+{
+ res.clear();
+ res_intervals.clear();
+ uint32_t want_bytes = length;
+ uint32_t end = offset + length;
+
+ {
+ std::lock_guard l(cache->lock);
+ for (auto i = _data_lower_bound(offset);
+ i != buffer_map.end() && offset < end && i->first < end;
+ ++i) {
+ Buffer *b = i->second.get();
+ ceph_assert(b->end() > offset);
+
+ bool val = false;
+ if (flags & BYPASS_CLEAN_CACHE)
+ val = b->is_writing();
+ else
+ val = b->is_writing() || b->is_clean();
+ if (val) {
+ if (b->offset < offset) {
+ uint32_t skip = offset - b->offset;
+ uint32_t l = min(length, b->length - skip);
+ res[offset].substr_of(b->data, skip, l);
+ res_intervals.insert(offset, l);
+ offset += l;
+ length -= l;
+ if (!b->is_writing()) {
+ cache->_touch_buffer(b);
+ }
+ continue;
+ }
+ if (b->offset > offset) {
+ uint32_t gap = b->offset - offset;
+ if (length <= gap) {
+ break;
+ }
+ offset += gap;
+ length -= gap;
+ }
+ if (!b->is_writing()) {
+ cache->_touch_buffer(b);
+ }
+ if (b->length > length) {
+ res[offset].substr_of(b->data, 0, length);
+ res_intervals.insert(offset, length);
+ break;
+ } else {
+ res[offset].append(b->data);
+ res_intervals.insert(offset, b->length);
+ if (b->length == length)
+ break;
+ offset += b->length;
+ length -= b->length;
+ }
+ }
+ }
+ }
+
+ uint64_t hit_bytes = res_intervals.size();
+ ceph_assert(hit_bytes <= want_bytes);
+ uint64_t miss_bytes = want_bytes - hit_bytes;
+ cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
+ cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
+}
+
+void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
+{
+ auto i = writing.begin();
+ while (i != writing.end()) {
+ if (i->seq > seq) {
+ break;
+ }
+ if (i->seq < seq) {
+ ++i;
+ continue;
+ }
+
+ Buffer *b = &*i;
+ ceph_assert(b->is_writing());
+
+ if (b->flags & Buffer::FLAG_NOCACHE) {
+ writing.erase(i++);
+ ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
+ buffer_map.erase(b->offset);
+ } else {
+ b->state = Buffer::STATE_CLEAN;
+ writing.erase(i++);
+ b->maybe_rebuild();
+ b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+ cache->_add_buffer(b, 1, nullptr);
+ ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
+ }
+ }
+
+ cache->_audit("finish_write end");
+}
+
+void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
+{
+ std::lock_guard lk(cache->lock);
+ if (buffer_map.empty())
+ return;
+
+ auto p = --buffer_map.end();
+ while (true) {
+ if (p->second->end() <= pos)
+ break;
+
+ if (p->second->offset < pos) {
+ ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
+ size_t left = pos - p->second->offset;
+ size_t right = p->second->length - left;
+ if (p->second->data.length()) {
+ bufferlist bl;
+ bl.substr_of(p->second->data, left, right);
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
+ 0, p->second.get());
+ } else {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
+ 0, p->second.get());
+ }
+ cache->_adjust_buffer_size(p->second.get(), -right);
+ p->second->truncate(left);
+ break;
+ }
+
+ ceph_assert(p->second->end() > pos);
+ ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
+ if (p->second->data.length()) {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ p->second->offset - pos, p->second->data),
+ 0, p->second.get());
+ } else {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ p->second->offset - pos, p->second->length),
+ 0, p->second.get());
+ }
+ if (p == buffer_map.begin()) {
+ _rm_buffer(cache, p);
+ break;
+ } else {
+ _rm_buffer(cache, p--);
+ }
+ }
+ ceph_assert(writing.empty());
+}
+
+// OnodeSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
+
+BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
+{
+ std::lock_guard l(cache->lock);
+ auto p = onode_map.find(oid);
+ if (p != onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
+ << " raced, returning existing " << p->second
+ << dendl;
+ return p->second;
+ }
+ ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
+ onode_map[oid] = o;
+ cache->_add_onode(o, 1);
+ return o;
+}
+
+BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
+{
+ ldout(cache->cct, 30) << __func__ << dendl;
+ OnodeRef o;
+ bool hit = false;
+
+ {
+ std::lock_guard l(cache->lock);
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
+ if (p == onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
+ } else {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
+ << dendl;
+ cache->_touch_onode(p->second);
+ hit = true;
+ o = p->second;
+ }
+ }
+
+ if (hit) {
+ cache->logger->inc(l_bluestore_onode_hits);
+ } else {
+ cache->logger->inc(l_bluestore_onode_misses);
+ }
+ return o;
+}
+
+void BlueStore::OnodeSpace::clear()
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 10) << __func__ << dendl;
+ for (auto &p : onode_map) {
+ cache->_rm_onode(p.second);
+ }
+ onode_map.clear();
+}
+
+bool BlueStore::OnodeSpace::empty()
+{
+ std::lock_guard l(cache->lock);
+ return onode_map.empty();
+}
+
+void BlueStore::OnodeSpace::rename(
+ OnodeRef& oldo,
+ const ghobject_t& old_oid,
+ const ghobject_t& new_oid,
+ const mempool::bluestore_cache_meta::string& new_okey)
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
+ << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+ po = onode_map.find(old_oid);
+ pn = onode_map.find(new_oid);
+ ceph_assert(po != pn);
+
+ ceph_assert(po != onode_map.end());
+ if (pn != onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
+ << dendl;
+ cache->_rm_onode(pn->second);
+ onode_map.erase(pn);
+ }
+ OnodeRef o = po->second;
+
+ // install a non-existent onode at old location
+ oldo.reset(new Onode(o->c, old_oid, o->key));
+ po->second = oldo;
+ cache->_add_onode(po->second, 1);
+
+ // add at new position and fix oid, key
+ onode_map.insert(make_pair(new_oid, o));
+ cache->_touch_onode(o);
+ o->oid = new_oid;
+ o->key = new_okey;
+}
+
+bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 20) << __func__ << dendl;
+ for (auto& i : onode_map) {
+ if (f(i.second)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <int LogLevelV = 30>
+void BlueStore::OnodeSpace::dump(CephContext *cct)
+{
+ for (auto& i : onode_map) {
+ ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl;
+ }
+}
+
+// SharedBlob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
+
+ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
+{
+ out << "SharedBlob(" << &sb;
+
+ if (sb.loaded) {
+ out << " loaded " << *sb.persistent;
+ } else {
+ out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
+ }
+ return out << ")";
+}
+
+BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
+ : coll(_coll), sbid_unloaded(i)
+{
+ ceph_assert(sbid_unloaded > 0);
+ if (get_cache()) {
+ get_cache()->add_blob();
+ }
+}
+
+BlueStore::SharedBlob::~SharedBlob()
+{
+ if (loaded && persistent) {
+ delete persistent;
+ }
+}
+
+void BlueStore::SharedBlob::put()
+{
+ if (--nref == 0) {
+ ldout(coll->store->cct, 20) << __func__ << " " << this
+ << " removing self from set " << get_parent()
+ << dendl;
+ again:
+ auto coll_snap = coll;
+ if (coll_snap) {
+ std::lock_guard l(coll_snap->cache->lock);
+ if (coll_snap != coll) {
+ goto again;
+ }
+ if (!coll_snap->shared_blob_set.remove(this, true)) {
+ // race with lookup
+ return;
+ }
+ bc._clear(coll_snap->cache);
+ coll_snap->cache->rm_blob();
+ }
+ delete this;
+ }
+}
+
+void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
+{
+ ceph_assert(persistent);
+ persistent->ref_map.get(offset, length);
+}
+
+void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
+ PExtentVector *r,
+ bool *unshare)
+{
+ ceph_assert(persistent);
+ persistent->ref_map.put(offset, length, r,
+ unshare && !*unshare ? unshare : nullptr);
+}
+
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+ while (true) {
+ Cache *cache = coll->cache;
+ std::lock_guard l(cache->lock);
+ if (coll->cache != cache) {
+ ldout(coll->store->cct, 20) << __func__
+ << " raced with sb cache update, was " << cache
+ << ", now " << coll->cache << ", retrying"
+ << dendl;
+ continue;
+ }
+ bc._finish_write(cache, seq);
+ break;
+ }
+}
+
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+template <int LogLevelV = 30>
+void BlueStore::SharedBlobSet::dump(CephContext *cct)
+{
+ std::lock_guard l(lock);
+ for (auto& i : sb_map) {
+ ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
+ }
+}
+
+// Blob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
+
+ostream& operator<<(ostream& out, const BlueStore::Blob& b)
+{
+ out << "Blob(" << &b;
+ if (b.is_spanning()) {
+ out << " spanning " << b.id;
+ }
+ out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
+ if (b.shared_blob) {
+ out << " " << *b.shared_blob;
+ } else {
+ out << " (shared_blob=NULL)";
+ }
+ out << ")";
+ return out;
+}
+
+void BlueStore::Blob::discard_unallocated(Collection *coll)
+{
+ if (get_blob().is_shared()) {
+ return;
+ }
+ if (get_blob().is_compressed()) {
+ bool discard = false;
+ bool all_invalid = true;
+ for (auto e : get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ discard = true;
+ } else {
+ all_invalid = false;
+ }
+ }
+ ceph_assert(discard == all_invalid); // in case of compressed blob all
+ // or none pextents are invalid.
+ if (discard) {
+ shared_blob->bc.discard(shared_blob->get_cache(), 0,
+ get_blob().get_logical_length());
+ }
+ } else {
+ size_t pos = 0;
+ for (auto e : get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
+ << "~" << e.length
+ << std::dec << dendl;
+ shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
+ }
+ pos += e.length;
+ }
+ if (get_blob().can_prune_tail()) {
+ dirty_blob().prune_tail();
+ used_in_blob.prune_tail(get_blob().get_ondisk_length());
+ auto cct = coll->store->cct; //used by dout
+ dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
+ }
+ }
+}
+
+void BlueStore::Blob::get_ref(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length)
+{
+ // Caller has to initialize Blob's logical length prior to increment
+ // references. Otherwise one is neither unable to determine required
+ // amount of counters in case of per-au tracking nor obtain min_release_size
+ // for single counter mode.
+ ceph_assert(get_blob().get_logical_length() != 0);
+ auto cct = coll->store->cct;
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " " << *this << dendl;
+
+ if (used_in_blob.is_empty()) {
+ uint32_t min_release_size =
+ get_blob().get_release_size(coll->store->min_alloc_size);
+ uint64_t l = get_blob().get_logical_length();
+ dout(20) << __func__ << " init 0x" << std::hex << l << ", "
+ << min_release_size << std::dec << dendl;
+ used_in_blob.init(l, min_release_size);
+ }
+ used_in_blob.get(
+ offset,
+ length);
+}
+
+bool BlueStore::Blob::put_ref(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length,
+ PExtentVector *r)
+{
+ PExtentVector logical;
+
+ auto cct = coll->store->cct;
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " " << *this << dendl;
+
+ bool empty = used_in_blob.put(
+ offset,
+ length,
+ &logical);
+ r->clear();
+ // nothing to release
+ if (!empty && logical.empty()) {
+ return false;
+ }
+
+ bluestore_blob_t& b = dirty_blob();
+ return b.release_extents(empty, logical, r);
+}
+
+bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
+ uint32_t target_blob_size,
+ uint32_t b_offset,
+ uint32_t *length0) {
+ ceph_assert(min_alloc_size);
+ ceph_assert(target_blob_size);
+ if (!get_blob().is_mutable()) {
+ return false;
+ }
+
+ uint32_t length = *length0;
+ uint32_t end = b_offset + length;
+
+ // Currently for the sake of simplicity we omit blob reuse if data is
+ // unaligned with csum chunk. Later we can perform padding if needed.
+ if (get_blob().has_csum() &&
+ ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
+ (end % get_blob().get_csum_chunk_size()) != 0)) {
+ return false;
+ }
+
+ auto blen = get_blob().get_logical_length();
+ uint32_t new_blen = blen;
+
+ // make sure target_blob_size isn't less than current blob len
+ target_blob_size = std::max(blen, target_blob_size);
+
+ if (b_offset >= blen) {
+ // new data totally stands out of the existing blob
+ new_blen = end;
+ } else {
+ // new data overlaps with the existing blob
+ new_blen = std::max(blen, end);
+
+ uint32_t overlap = 0;
+ if (new_blen > blen) {
+ overlap = blen - b_offset;
+ } else {
+ overlap = length;
+ }
+
+ if (!get_blob().is_unallocated(b_offset, overlap)) {
+ // abort if any piece of the overlap has already been allocated
+ return false;
+ }
+ }
+
+ if (new_blen > blen) {
+ int64_t overflow = int64_t(new_blen) - target_blob_size;
+ // Unable to decrease the provided length to fit into max_blob_size
+ if (overflow >= length) {
+ return false;
+ }
+
+ // FIXME: in some cases we could reduce unused resolution
+ if (get_blob().has_unused()) {
+ return false;
+ }
+
+ if (overflow > 0) {
+ new_blen -= overflow;
+ length -= overflow;
+ *length0 = length;
+ }
+
+ if (new_blen > blen) {
+ dirty_blob().add_tail(new_blen);
+ used_in_blob.add_tail(new_blen,
+ get_blob().get_release_size(min_alloc_size));
+ }
+ }
+ return true;
+}
+
+void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
+{
+ auto cct = coll->store->cct; //used by dout
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " start " << *this << dendl;
+ ceph_assert(blob.can_split());
+ ceph_assert(used_in_blob.can_split());
+ bluestore_blob_t &lb = dirty_blob();
+ bluestore_blob_t &rb = r->dirty_blob();
+
+ used_in_blob.split(
+ blob_offset,
+ &(r->used_in_blob));
+
+ lb.split(blob_offset, rb);
+ shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
+
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " finish " << *this << dendl;
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " and " << *r << dendl;
+}
+
+#ifndef CACHE_BLOB_BL
+void BlueStore::Blob::decode(
+ Collection *coll,
+ bufferptr::const_iterator& p,
+ uint64_t struct_v,
+ uint64_t* sbid,
+ bool include_ref_map)
+{
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(*sbid, p);
+ }
+ if (include_ref_map) {
+ if (struct_v > 1) {
+ used_in_blob.decode(p);
+ } else {
+ used_in_blob.clear();
+ bluestore_extent_ref_map_t legacy_ref_map;
+ legacy_ref_map.decode(p);
+ for (auto r : legacy_ref_map.ref_map) {
+ get_ref(
+ coll,
+ r.first,
+ r.second.refs * r.second.length);
+ }
+ }
+ }
+}
+#endif
+
+// Extent
+
+ostream& operator<<(ostream& out, const BlueStore::Extent& e)
+{
+ return out << std::hex << "0x" << e.logical_offset << "~" << e.length
+ << ": 0x" << e.blob_offset << "~" << e.length << std::dec
+ << " " << *e.blob;
+}
+
+// OldExtent
+BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
+ uint32_t lo,
+ uint32_t o,
+ uint32_t l,
+ BlobRef& b) {
+ OldExtent* oe = new OldExtent(lo, o, l, b);
+ b->put_ref(c.get(), o, l, &(oe->r));
+ oe->blob_empty = !b->is_referenced();
+ return oe;
+}
+
+// ExtentMap
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
+
+BlueStore::ExtentMap::ExtentMap(Onode *o)
+ : onode(o),
+ inline_bl(
+ o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
+}
+
+void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
+ CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
+ uint64_t& length, uint64_t& dstoff) {
+
+ auto cct = onode->c->store->cct;
+ bool inject_21040 =
+ cct->_conf->bluestore_debug_inject_bug21040;
+ vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
+ for (auto& e : oldo->extent_map.extent_map) {
+ e.blob->last_encoded_id = -1;
+ }
+
+ int n = 0;
+ uint64_t end = srcoff + length;
+ uint32_t dirty_range_begin = 0;
+ uint32_t dirty_range_end = 0;
+ bool src_dirty = false;
+ for (auto ep = oldo->extent_map.seek_lextent(srcoff);
+ ep != oldo->extent_map.extent_map.end();
+ ++ep) {
+ auto& e = *ep;
+ if (e.logical_offset >= end) {
+ break;
+ }
+ dout(20) << __func__ << " src " << e << dendl;
+ BlobRef cb;
+ bool blob_duped = true;
+ if (e.blob->last_encoded_id >= 0) {
+ cb = id_to_blob[e.blob->last_encoded_id];
+ blob_duped = false;
+ } else {
+ // dup the blob
+ const bluestore_blob_t& blob = e.blob->get_blob();
+ // make sure it is shared
+ if (!blob.is_shared()) {
+ c->make_blob_shared(b->_assign_blobid(txc), e.blob);
+ if (!inject_21040 && !src_dirty) {
+ src_dirty = true;
+ dirty_range_begin = e.logical_offset;
+ } else if (inject_21040 &&
+ dirty_range_begin == 0 && dirty_range_end == 0) {
+ dirty_range_begin = e.logical_offset;
+ }
+ ceph_assert(e.logical_end() > 0);
+ // -1 to exclude next potential shard
+ dirty_range_end = e.logical_end() - 1;
+ } else {
+ c->load_shared_blob(e.blob->shared_blob);
+ }
+ cb = new Blob();
+ e.blob->last_encoded_id = n;
+ id_to_blob[n] = cb;
+ e.blob->dup(*cb);
+ // bump the extent refs on the copied blob's extents
+ for (auto p : blob.get_extents()) {
+ if (p.is_valid()) {
+ e.blob->shared_blob->get_ref(p.offset, p.length);
+ }
+ }
+ txc->write_shared_blob(e.blob->shared_blob);
+ dout(20) << __func__ << " new " << *cb << dendl;
+ }
+
+ int skip_front, skip_back;
+ if (e.logical_offset < srcoff) {
+ skip_front = srcoff - e.logical_offset;
+ } else {
+ skip_front = 0;
+ }
+ if (e.logical_end() > end) {
+ skip_back = e.logical_end() - end;
+ } else {
+ skip_back = 0;
+ }
+
+ Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
+ e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
+ newo->extent_map.extent_map.insert(*ne);
+ ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
+ // fixme: we may leave parts of new blob unreferenced that could
+ // be freed (relative to the shared_blob).
+ txc->statfs_delta.stored() += ne->length;
+ if (e.blob->get_blob().is_compressed()) {
+ txc->statfs_delta.compressed_original() += ne->length;
+ if (blob_duped) {
+ txc->statfs_delta.compressed() +=
+ cb->get_blob().get_compressed_payload_length();
+ }
+ }
+ dout(20) << __func__ << " dst " << *ne << dendl;
+ ++n;
+ }
+ if ((!inject_21040 && src_dirty) ||
+ (inject_21040 && dirty_range_end > dirty_range_begin)) {
+ oldo->extent_map.dirty_range(dirty_range_begin,
+ dirty_range_end - dirty_range_begin);
+ txc->write_onode(oldo);
+ }
+ txc->write_onode(newo);
+
+ if (dstoff + length > newo->onode.size) {
+ newo->onode.size = dstoff + length;
+ }
+ newo->extent_map.dirty_range(dstoff, length);
+}
+void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
+ bool force)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
+ if (onode->onode.extent_map_shards.empty()) {
+ if (inline_bl.length() == 0) {
+ unsigned n;
+ // we need to encode inline_bl to measure encoded length
+ bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+ inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
+ ceph_assert(!never_happen);
+ size_t len = inline_bl.length();
+ dout(20) << __func__ << " inline shard " << len << " bytes from " << n
+ << " extents" << dendl;
+ if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
+ request_reshard(0, OBJECT_MAX_SIZE);
+ return;
+ }
+ }
+ // will persist in the onode key.
+ } else {
+ // pending shard update
+ struct dirty_shard_t {
+ Shard *shard;
+ bufferlist bl;
+ dirty_shard_t(Shard *s) : shard(s) {}
+ };
+ vector<dirty_shard_t> encoded_shards;
+ // allocate slots for all shards in a single call instead of
+ // doing multiple allocations - one per each dirty shard
+ encoded_shards.reserve(shards.size());
+
+ auto p = shards.begin();
+ auto prev_p = p;
+ while (p != shards.end()) {
+ ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
+ auto n = p;
+ ++n;
+ if (p->dirty) {
+ uint32_t endoff;
+ if (n == shards.end()) {
+ endoff = OBJECT_MAX_SIZE;
+ } else {
+ endoff = n->shard_info->offset;
+ }
+ encoded_shards.emplace_back(dirty_shard_t(&(*p)));
+ bufferlist& bl = encoded_shards.back().bl;
+ if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
+ bl, &p->extents)) {
+ if (force) {
+ derr << __func__ << " encode_some needs reshard" << dendl;
+ ceph_assert(!force);
+ }
+ }
+ size_t len = bl.length();
+
+ dout(20) << __func__ << " shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " is " << len
+ << " bytes (was " << p->shard_info->bytes << ") from "
+ << p->extents << " extents" << dendl;
+
+ if (!force) {
+ if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
+ // we are big; reshard ourselves
+ request_reshard(p->shard_info->offset, endoff);
+ }
+ // avoid resharding the trailing shard, even if it is small
+ else if (n != shards.end() &&
+ len < g_conf()->bluestore_extent_map_shard_min_size) {
+ ceph_assert(endoff != OBJECT_MAX_SIZE);
+ if (p == shards.begin()) {
+ // we are the first shard, combine with next shard
+ request_reshard(p->shard_info->offset, endoff + 1);
+ } else {
+ // combine either with the previous shard or the next,
+ // whichever is smaller
+ if (prev_p->shard_info->bytes > n->shard_info->bytes) {
+ request_reshard(p->shard_info->offset, endoff + 1);
+ } else {
+ request_reshard(prev_p->shard_info->offset, endoff);
+ }
+ }
+ }
+ }
+ }
+ prev_p = p;
+ p = n;
+ }
+ if (needs_reshard()) {
+ return;
+ }
+
+ // schedule DB update for dirty shards
+ string key;
+ for (auto& it : encoded_shards) {
+ it.shard->dirty = false;
+ it.shard->shard_info->bytes = it.bl.length();
+ generate_extent_shard_key_and_apply(
+ onode->key,
+ it.shard->shard_info->offset,
+ &key,
+ [&](const string& final_key) {
+ t->set(PREFIX_OBJ, final_key, it.bl);
+ }
+ );
+ }
+ }
+}
+
+bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
+{
+ if (spanning_blob_map.empty())
+ return 0;
+ bid_t bid = spanning_blob_map.rbegin()->first + 1;
+ // bid is valid and available.
+ if (bid >= 0)
+ return bid;
+ // Find next unused bid;
+ bid = rand() % (numeric_limits<bid_t>::max() + 1);
+ const auto begin_bid = bid;
+ do {
+ if (!spanning_blob_map.count(bid))
+ return bid;
+ else {
+ bid++;
+ if (bid < 0) bid = 0;
+ }
+ } while (bid != begin_bid);
+ auto cct = onode->c->store->cct; // used by dout
+ _dump_onode<0>(cct, *onode);
+ ceph_abort_msg("no available blob id");
+}
+
+void BlueStore::ExtentMap::reshard(
+ KeyValueDB *db,
+ KeyValueDB::Transaction t)
+{
+ auto cct = onode->c->store->cct; // used by dout
+
+ dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
+ << needs_reshard_end << ")" << std::dec
+ << " of " << onode->onode.extent_map_shards.size()
+ << " shards on " << onode->oid << dendl;
+ for (auto& p : spanning_blob_map) {
+ dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
+ << dendl;
+ }
+ // determine shard index range
+ unsigned si_begin = 0, si_end = 0;
+ if (!shards.empty()) {
+ while (si_begin + 1 < shards.size() &&
+ shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
+ ++si_begin;
+ }
+ needs_reshard_begin = shards[si_begin].shard_info->offset;
+ for (si_end = si_begin; si_end < shards.size(); ++si_end) {
+ if (shards[si_end].shard_info->offset >= needs_reshard_end) {
+ needs_reshard_end = shards[si_end].shard_info->offset;
+ break;
+ }
+ }
+ if (si_end == shards.size()) {
+ needs_reshard_end = OBJECT_MAX_SIZE;
+ }
+ dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
+ << " over 0x[" << std::hex << needs_reshard_begin << ","
+ << needs_reshard_end << ")" << std::dec << dendl;
+ }
+
+ fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
+
+ // we may need to fault in a larger interval later must have all
+ // referring extents for spanning blobs loaded in order to have
+ // accurate use_tracker values.
+ uint32_t spanning_scan_begin = needs_reshard_begin;
+ uint32_t spanning_scan_end = needs_reshard_end;
+
+ // remove old keys
+ string key;
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ generate_extent_shard_key_and_apply(
+ onode->key, shards[i].shard_info->offset, &key,
+ [&](const string& final_key) {
+ t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ }
+
+ // calculate average extent size
+ unsigned bytes = 0;
+ unsigned extents = 0;
+ if (onode->onode.extent_map_shards.empty()) {
+ bytes = inline_bl.length();
+ extents = extent_map.size();
+ } else {
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ bytes += shards[i].shard_info->bytes;
+ extents += shards[i].extents;
+ }
+ }
+ unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
+ unsigned slop = target *
+ cct->_conf->bluestore_extent_map_shard_target_size_slop;
+ unsigned extent_avg = bytes / std::max(1u, extents);
+ dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
+ << ", slop " << slop << dendl;
+
+ // reshard
+ unsigned estimate = 0;
+ unsigned offset = needs_reshard_begin;
+ vector<bluestore_onode_t::shard_info> new_shard_info;
+ unsigned max_blob_end = 0;
+ Extent dummy(needs_reshard_begin);
+ for (auto e = extent_map.lower_bound(dummy);
+ e != extent_map.end();
+ ++e) {
+ if (e->logical_offset >= needs_reshard_end) {
+ break;
+ }
+ dout(30) << " extent " << *e << dendl;
+
+ // disfavor shard boundaries that span a blob
+ bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
+ if (estimate &&
+ estimate + extent_avg > target + (would_span ? slop : 0)) {
+ // new shard
+ if (offset == needs_reshard_begin) {
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = offset;
+ dout(20) << __func__ << " new shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ }
+ offset = e->logical_offset;
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = offset;
+ dout(20) << __func__ << " new shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ estimate = 0;
+ }
+ estimate += extent_avg;
+ unsigned bs = e->blob_start();
+ if (bs < spanning_scan_begin) {
+ spanning_scan_begin = bs;
+ }
+ uint32_t be = e->blob_end();
+ if (be > max_blob_end) {
+ max_blob_end = be;
+ }
+ if (be > spanning_scan_end) {
+ spanning_scan_end = be;
+ }
+ }
+ if (new_shard_info.empty() && (si_begin > 0 ||
+ si_end < shards.size())) {
+ // we resharded a partial range; we must produce at least one output
+ // shard
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = needs_reshard_begin;
+ dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
+ << std::dec << " (singleton degenerate case)" << dendl;
+ }
+
+ auto& sv = onode->onode.extent_map_shards;
+ dout(20) << __func__ << " new " << new_shard_info << dendl;
+ dout(20) << __func__ << " old " << sv << dendl;
+ if (sv.empty()) {
+ // no old shards to keep
+ sv.swap(new_shard_info);
+ init_shards(true, true);
+ } else {
+ // splice in new shards
+ sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
+ shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
+ sv.insert(
+ sv.begin() + si_begin,
+ new_shard_info.begin(),
+ new_shard_info.end());
+ shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
+ si_end = si_begin + new_shard_info.size();
+
+ ceph_assert(sv.size() == shards.size());
+
+ // note that we need to update every shard_info of shards here,
+ // as sv might have been totally re-allocated above
+ for (unsigned i = 0; i < shards.size(); i++) {
+ shards[i].shard_info = &sv[i];
+ }
+
+ // mark newly added shards as dirty
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ shards[i].loaded = true;
+ shards[i].dirty = true;
+ }
+ }
+ dout(20) << __func__ << " fin " << sv << dendl;
+ inline_bl.clear();
+
+ if (sv.empty()) {
+ // no more shards; unspan all previously spanning blobs
+ auto p = spanning_blob_map.begin();
+ while (p != spanning_blob_map.end()) {
+ p->second->id = -1;
+ dout(30) << __func__ << " un-spanning " << *p->second << dendl;
+ p = spanning_blob_map.erase(p);
+ }
+ } else {
+ // identify new spanning blobs
+ dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
+ << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
+ if (spanning_scan_begin < needs_reshard_begin) {
+ fault_range(db, spanning_scan_begin,
+ needs_reshard_begin - spanning_scan_begin);
+ }
+ if (spanning_scan_end > needs_reshard_end) {
+ fault_range(db, needs_reshard_end,
+ spanning_scan_end - needs_reshard_end);
+ }
+ auto sp = sv.begin() + si_begin;
+ auto esp = sv.end();
+ unsigned shard_start = sp->offset;
+ unsigned shard_end;
+ ++sp;
+ if (sp == esp) {
+ shard_end = OBJECT_MAX_SIZE;
+ } else {
+ shard_end = sp->offset;
+ }
+ Extent dummy(needs_reshard_begin);
+
+ bool was_too_many_blobs_check = false;
+ auto too_many_blobs_threshold =
+ g_conf()->bluestore_debug_too_many_blobs_threshold;
+ auto& dumped_onodes = onode->c->cache->dumped_onodes;
+ decltype(onode->c->cache->dumped_onodes)::value_type* oid_slot = nullptr;
+ decltype(onode->c->cache->dumped_onodes)::value_type* oldest_slot = nullptr;
+
+ for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
+ if (e->logical_offset >= needs_reshard_end) {
+ break;
+ }
+ dout(30) << " extent " << *e << dendl;
+ while (e->logical_offset >= shard_end) {
+ shard_start = shard_end;
+ ceph_assert(sp != esp);
+ ++sp;
+ if (sp == esp) {
+ shard_end = OBJECT_MAX_SIZE;
+ } else {
+ shard_end = sp->offset;
+ }
+ dout(30) << __func__ << " shard 0x" << std::hex << shard_start
+ << " to 0x" << shard_end << std::dec << dendl;
+ }
+
+ if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
+ if (!e->blob->is_spanning()) {
+ // We have two options: (1) split the blob into pieces at the
+ // shard boundaries (and adjust extents accordingly), or (2)
+ // mark it spanning. We prefer to cut the blob if we can. Note that
+ // we may have to split it multiple times--potentially at every
+ // shard boundary.
+ bool must_span = false;
+ BlobRef b = e->blob;
+ if (b->can_split()) {
+ uint32_t bstart = e->blob_start();
+ uint32_t bend = e->blob_end();
+ for (const auto& sh : shards) {
+ if (bstart < sh.shard_info->offset &&
+ bend > sh.shard_info->offset) {
+ uint32_t blob_offset = sh.shard_info->offset - bstart;
+ if (b->can_split_at(blob_offset)) {
+ dout(20) << __func__ << " splitting blob, bstart 0x"
+ << std::hex << bstart << " blob_offset 0x"
+ << blob_offset << std::dec << " " << *b << dendl;
+ b = split_blob(b, blob_offset, sh.shard_info->offset);
+ // switch b to the new right-hand side, in case it
+ // *also* has to get split.
+ bstart += blob_offset;
+ onode->c->store->logger->inc(l_bluestore_blob_split);
+ } else {
+ must_span = true;
+ break;
+ }
+ }
+ }
+ } else {
+ must_span = true;
+ }
+ if (must_span) {
+ auto bid = allocate_spanning_blob_id();
+ b->id = bid;
+ spanning_blob_map[b->id] = b;
+ dout(20) << __func__ << " adding spanning " << *b << dendl;
+ if (!was_too_many_blobs_check &&
+ too_many_blobs_threshold &&
+ spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
+
+ was_too_many_blobs_check = true;
+ for (size_t i = 0; i < dumped_onodes.size(); ++i) {
+ if (dumped_onodes[i].first == onode->oid) {
+ oid_slot = &dumped_onodes[i];
+ break;
+ }
+ if (!oldest_slot || (oldest_slot &&
+ dumped_onodes[i].second < oldest_slot->second)) {
+ oldest_slot = &dumped_onodes[i];
+ }
+ }
+ }
+ }
+ }
+ } else {
+ if (e->blob->is_spanning()) {
+ spanning_blob_map.erase(e->blob->id);
+ e->blob->id = -1;
+ dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
+ }
+ }
+ }
+ bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
+ (oid_slot &&
+ (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
+ if (do_dump) {
+ dout(0) << __func__
+ << " spanning blob count exceeds threshold, "
+ << spanning_blob_map.size() << " spanning blobs"
+ << dendl;
+ _dump_onode<0>(cct, *onode);
+ if (oid_slot) {
+ oid_slot->second = mono_clock::now();
+ } else {
+ ceph_assert(oldest_slot);
+ oldest_slot->first = onode->oid;
+ oldest_slot->second = mono_clock::now();
+ }
+ }
+ }
+
+ clear_needs_reshard();
+}
+
+bool BlueStore::ExtentMap::encode_some(
+ uint32_t offset,
+ uint32_t length,
+ bufferlist& bl,
+ unsigned *pn)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ Extent dummy(offset);
+ auto start = extent_map.lower_bound(dummy);
+ uint32_t end = offset + length;
+
+ __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+
+ unsigned n = 0;
+ size_t bound = 0;
+ bool must_reshard = false;
+ for (auto p = start;
+ p != extent_map.end() && p->logical_offset < end;
+ ++p, ++n) {
+ ceph_assert(p->logical_offset >= offset);
+ p->blob->last_encoded_id = -1;
+ if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " hit new spanning blob " << *p << dendl;
+ request_reshard(p->blob_start(), p->blob_end());
+ must_reshard = true;
+ }
+ if (!must_reshard) {
+ denc_varint(0, bound); // blobid
+ denc_varint(0, bound); // logical_offset
+ denc_varint(0, bound); // len
+ denc_varint(0, bound); // blob_offset
+
+ p->blob->bound_encode(
+ bound,
+ struct_v,
+ p->blob->shared_blob->get_sbid(),
+ false);
+ }
+ }
+ if (must_reshard) {
+ return true;
+ }
+
+ denc(struct_v, bound);
+ denc_varint(0, bound); // number of extents
+
+ {
+ auto app = bl.get_contiguous_appender(bound);
+ denc(struct_v, app);
+ denc_varint(n, app);
+ if (pn) {
+ *pn = n;
+ }
+
+ n = 0;
+ uint64_t pos = 0;
+ uint64_t prev_len = 0;
+ for (auto p = start;
+ p != extent_map.end() && p->logical_offset < end;
+ ++p, ++n) {
+ unsigned blobid;
+ bool include_blob = false;
+ if (p->blob->is_spanning()) {
+ blobid = p->blob->id << BLOBID_SHIFT_BITS;
+ blobid |= BLOBID_FLAG_SPANNING;
+ } else if (p->blob->last_encoded_id < 0) {
+ p->blob->last_encoded_id = n + 1; // so it is always non-zero
+ include_blob = true;
+ blobid = 0; // the decoder will infer the id from n
+ } else {
+ blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
+ }
+ if (p->logical_offset == pos) {
+ blobid |= BLOBID_FLAG_CONTIGUOUS;
+ }
+ if (p->blob_offset == 0) {
+ blobid |= BLOBID_FLAG_ZEROOFFSET;
+ }
+ if (p->length == prev_len) {
+ blobid |= BLOBID_FLAG_SAMELENGTH;
+ } else {
+ prev_len = p->length;
+ }
+ denc_varint(blobid, app);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ denc_varint_lowz(p->logical_offset - pos, app);
+ }
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(p->blob_offset, app);
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(p->length, app);
+ }
+ pos = p->logical_end();
+ if (include_blob) {
+ p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
+ }
+ }
+ }
+ /*derr << __func__ << bl << dendl;
+ derr << __func__ << ":";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ */
+ return false;
+}
+
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ /*
+ derr << __func__ << ":";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ */
+
+ ceph_assert(bl.get_num_buffers() <= 1);
+ auto p = bl.front().begin_deep();
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level below.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ uint32_t num;
+ denc_varint(num, p);
+ vector<BlobRef> blobs(num);
+ uint64_t pos = 0;
+ uint64_t prev_len = 0;
+ unsigned n = 0;
+
+ while (!p.end()) {
+ Extent *le = new Extent();
+ uint64_t blobid;
+ denc_varint(blobid, p);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ uint64_t gap;
+ denc_varint_lowz(gap, p);
+ pos += gap;
+ }
+ le->logical_offset = pos;
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(le->blob_offset, p);
+ } else {
+ le->blob_offset = 0;
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(prev_len, p);
+ }
+ le->length = prev_len;
+
+ if (blobid & BLOBID_FLAG_SPANNING) {
+ dout(30) << __func__ << " getting spanning blob "
+ << (blobid >> BLOBID_SHIFT_BITS) << dendl;
+ le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
+ } else {
+ blobid >>= BLOBID_SHIFT_BITS;
+ if (blobid) {
+ le->assign_blob(blobs[blobid - 1]);
+ ceph_assert(le->blob);
+ } else {
+ Blob *b = new Blob();
+ uint64_t sbid = 0;
+ b->decode(onode->c, p, struct_v, &sbid, false);
+ blobs[n] = b;
+ onode->c->open_shared_blob(sbid, b);
+ le->assign_blob(b);
+ }
+ // we build ref_map dynamically for non-spanning blobs
+ le->blob->get_ref(
+ onode->c,
+ le->blob_offset,
+ le->length);
+ }
+ pos += prev_len;
+ ++n;
+ extent_map.insert(*le);
+ }
+
+ ceph_assert(n == num);
+ return num;
+}
+
+void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
+{
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ __u8 struct_v = 2;
+
+ denc(struct_v, p);
+ denc_varint((uint32_t)0, p);
+ size_t key_size = 0;
+ denc_varint((uint32_t)0, key_size);
+ p += spanning_blob_map.size() * key_size;
+ for (const auto& i : spanning_blob_map) {
+ i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
+ }
+}
+
+void BlueStore::ExtentMap::encode_spanning_blobs(
+ bufferlist::contiguous_appender& p)
+{
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ __u8 struct_v = 2;
+
+ denc(struct_v, p);
+ denc_varint(spanning_blob_map.size(), p);
+ for (auto& i : spanning_blob_map) {
+ denc_varint(i.second->id, p);
+ i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
+ }
+}
+
+void BlueStore::ExtentMap::decode_spanning_blobs(
+ bufferptr::const_iterator& p)
+{
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ unsigned n;
+ denc_varint(n, p);
+ while (n--) {
+ BlobRef b(new Blob());
+ denc_varint(b->id, p);
+ spanning_blob_map[b->id] = b;
+ uint64_t sbid = 0;
+ b->decode(onode->c, p, struct_v, &sbid, true);
+ onode->c->open_shared_blob(sbid, b);
+ }
+}
+
+void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
+{
+ shards.resize(onode->onode.extent_map_shards.size());
+ unsigned i = 0;
+ for (auto &s : onode->onode.extent_map_shards) {
+ shards[i].shard_info = &s;
+ shards[i].loaded = loaded;
+ shards[i].dirty = dirty;
+ ++i;
+ }
+}
+
+void BlueStore::ExtentMap::fault_range(
+ KeyValueDB *db,
+ uint32_t offset,
+ uint32_t length)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto start = seek_shard(offset);
+ auto last = seek_shard(offset + length);
+
+ if (start < 0)
+ return;
+
+ ceph_assert(last >= start);
+ string key;
+ while (start <= last) {
+ ceph_assert((size_t)start < shards.size());
+ auto p = &shards[start];
+ if (!p->loaded) {
+ dout(30) << __func__ << " opening shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << dendl;
+ bufferlist v;
+ generate_extent_shard_key_and_apply(
+ onode->key, p->shard_info->offset, &key,
+ [&](const string& final_key) {
+ int r = db->get(PREFIX_OBJ, final_key, &v);
+ if (r < 0) {
+ derr << __func__ << " missing shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " for " << onode->oid
+ << dendl;
+ ceph_assert(r >= 0);
+ }
+ }
+ );
+ p->extents = decode_some(v);
+ p->loaded = true;
+ dout(20) << __func__ << " open shard 0x" << std::hex
+ << p->shard_info->offset
+ << " for range 0x" << offset << "~" << length << std::dec
+ << " (" << v.length() << " bytes)" << dendl;
+ ceph_assert(p->dirty == false);
+ ceph_assert(v.length() == p->shard_info->bytes);
+ onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
+ } else {
+ onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
+ }
+ ++start;
+ }
+}
+
+void BlueStore::ExtentMap::dirty_range(
+ uint32_t offset,
+ uint32_t length)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ if (shards.empty()) {
+ dout(20) << __func__ << " mark inline shard dirty" << dendl;
+ inline_bl.clear();
+ return;
+ }
+ auto start = seek_shard(offset);
+ if (length == 0) {
+ length = 1;
+ }
+ auto last = seek_shard(offset + length - 1);
+ if (start < 0)
+ return;
+
+ ceph_assert(last >= start);
+ while (start <= last) {
+ ceph_assert((size_t)start < shards.size());
+ auto p = &shards[start];
+ if (!p->loaded) {
+ derr << __func__ << "on write 0x" << std::hex << offset
+ << "~" << length << " shard 0x" << p->shard_info->offset
+ << std::dec << " is not loaded, can't mark dirty" << dendl;
+ ceph_abort_msg("can't mark unloaded shard dirty");
+ }
+ if (!p->dirty) {
+ dout(20) << __func__ << " mark shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " dirty" << dendl;
+ p->dirty = true;
+ }
+ ++start;
+ }
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
+ uint64_t offset)
+{
+ Extent dummy(offset);
+ return extent_map.find(dummy);
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
+ uint64_t offset)
+{
+ Extent dummy(offset);
+ auto fp = extent_map.lower_bound(dummy);
+ if (fp != extent_map.begin()) {
+ --fp;
+ if (fp->logical_end() <= offset) {
+ ++fp;
+ }
+ }
+ return fp;
+}
+
+BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
+ uint64_t offset) const
+{
+ Extent dummy(offset);
+ auto fp = extent_map.lower_bound(dummy);
+ if (fp != extent_map.begin()) {
+ --fp;
+ if (fp->logical_end() <= offset) {
+ ++fp;
+ }
+ }
+ return fp;
+}
+
+bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
+{
+ auto fp = seek_lextent(offset);
+ if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
+ return false;
+ }
+ return true;
+}
+
+int BlueStore::ExtentMap::compress_extent_map(
+ uint64_t offset,
+ uint64_t length)
+{
+ auto cct = onode->c->store->cct; //used by dout
+ if (extent_map.empty())
+ return 0;
+ int removed = 0;
+ auto p = seek_lextent(offset);
+ if (p != extent_map.begin()) {
+ --p; // start to the left of offset
+ }
+ // the caller should have just written to this region
+ ceph_assert(p != extent_map.end());
+
+ // identify the *next* shard
+ auto pshard = shards.begin();
+ while (pshard != shards.end() &&
+ p->logical_offset >= pshard->shard_info->offset) {
+ ++pshard;
+ }
+ uint64_t shard_end;
+ if (pshard != shards.end()) {
+ shard_end = pshard->shard_info->offset;
+ } else {
+ shard_end = OBJECT_MAX_SIZE;
+ }
+
+ auto n = p;
+ for (++n; n != extent_map.end(); p = n++) {
+ if (n->logical_offset > offset + length) {
+ break; // stop after end
+ }
+ while (n != extent_map.end() &&
+ p->logical_end() == n->logical_offset &&
+ p->blob == n->blob &&
+ p->blob_offset + p->length == n->blob_offset &&
+ n->logical_offset < shard_end) {
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " next shard 0x" << shard_end << std::dec
+ << " merging " << *p << " and " << *n << dendl;
+ p->length += n->length;
+ rm(n++);
+ ++removed;
+ }
+ if (n == extent_map.end()) {
+ break;
+ }
+ if (n->logical_offset >= shard_end) {
+ ceph_assert(pshard != shards.end());
+ ++pshard;
+ if (pshard != shards.end()) {
+ shard_end = pshard->shard_info->offset;
+ } else {
+ shard_end = OBJECT_MAX_SIZE;
+ }
+ }
+ }
+ if (removed) {
+ onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
+ }
+ return removed;
+}
+
+void BlueStore::ExtentMap::punch_hole(
+ CollectionRef &c,
+ uint64_t offset,
+ uint64_t length,
+ old_extent_map_t *old_extents)
+{
+ auto p = seek_lextent(offset);
+ uint64_t end = offset + length;
+ while (p != extent_map.end()) {
+ if (p->logical_offset >= end) {
+ break;
+ }
+ if (p->logical_offset < offset) {
+ if (p->logical_end() > end) {
+ // split and deref middle
+ uint64_t front = offset - p->logical_offset;
+ OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
+ length, p->blob);
+ old_extents->push_back(*oe);
+ add(end,
+ p->blob_offset + front + length,
+ p->length - front - length,
+ p->blob);
+ p->length = front;
+ break;
+ } else {
+ // deref tail
+ ceph_assert(p->logical_end() > offset); // else seek_lextent bug
+ uint64_t keep = offset - p->logical_offset;
+ OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
+ p->length - keep, p->blob);
+ old_extents->push_back(*oe);
+ p->length = keep;
+ ++p;
+ continue;
+ }
+ }
+ if (p->logical_offset + p->length <= end) {
+ // deref whole lextent
+ OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+ p->length, p->blob);
+ old_extents->push_back(*oe);
+ rm(p++);
+ continue;
+ }
+ // deref head
+ uint64_t keep = p->logical_end() - end;
+ OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+ p->length - keep, p->blob);
+ old_extents->push_back(*oe);
+
+ add(end, p->blob_offset + p->length - keep, keep, p->blob);
+ rm(p);
+ break;
+ }
+}
+
+BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
+ CollectionRef &c,
+ uint64_t logical_offset,
+ uint64_t blob_offset, uint64_t length, BlobRef b,
+ old_extent_map_t *old_extents)
+{
+ // We need to have completely initialized Blob to increment its ref counters.
+ ceph_assert(b->get_blob().get_logical_length() != 0);
+
+ // Do get_ref prior to punch_hole to prevent from putting reused blob into
+ // old_extents list if we overwre the blob totally
+ // This might happen during WAL overwrite.
+ b->get_ref(onode->c, blob_offset, length);
+
+ if (old_extents) {
+ punch_hole(c, logical_offset, length, old_extents);
+ }
+
+ Extent *le = new Extent(logical_offset, blob_offset, length, b);
+ extent_map.insert(*le);
+ if (spans_shard(logical_offset, length)) {
+ request_reshard(logical_offset, logical_offset + length);
+ }
+ return le;
+}
+
+BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
+ BlobRef lb,
+ uint32_t blob_offset,
+ uint32_t pos)
+{
+ auto cct = onode->c->store->cct; //used by dout
+
+ uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
+ dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
+ << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
+ << dendl;
+ BlobRef rb = onode->c->new_blob();
+ lb->split(onode->c, blob_offset, rb.get());
+
+ for (auto ep = seek_lextent(pos);
+ ep != extent_map.end() && ep->logical_offset < end_pos;
+ ++ep) {
+ if (ep->blob != lb) {
+ continue;
+ }
+ if (ep->logical_offset < pos) {
+ // split extent
+ size_t left = pos - ep->logical_offset;
+ Extent *ne = new Extent(pos, 0, ep->length - left, rb);
+ extent_map.insert(*ne);
+ ep->length = left;
+ dout(30) << __func__ << " split " << *ep << dendl;
+ dout(30) << __func__ << " to " << *ne << dendl;
+ } else {
+ // switch blob
+ ceph_assert(ep->blob_offset >= blob_offset);
+
+ ep->blob = rb;
+ ep->blob_offset -= blob_offset;
+ dout(30) << __func__ << " adjusted " << *ep << dendl;
+ }
+ }
+ return rb;
+}
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
+
+BlueStore::Onode* BlueStore::Onode::decode(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& v)
+{
+ Onode* on = new Onode(c.get(), oid, key);
+ on->exists = true;
+ auto p = v.front().begin_deep();
+ on->onode.decode(p);
+ for (auto& i : on->onode.attrs) {
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+
+ // initialize extent_map
+ on->extent_map.decode_spanning_blobs(p);
+ if (on->onode.extent_map_shards.empty()) {
+ denc(on->extent_map.inline_bl, p);
+ on->extent_map.decode_some(on->extent_map.inline_bl);
+ on->extent_map.inline_bl.reassign_to_mempool(
+ mempool::mempool_bluestore_cache_data);
+ }
+ else {
+ on->extent_map.init_shards(false, false);
+ }
+ return on;
+}
+
+void BlueStore::Onode::flush()
+{
+ if (flushing_count.load()) {
+ ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
+ std::unique_lock l(flush_lock);
+ while (flushing_count.load()) {
+ flush_cond.wait(l);
+ }
+ }
+ ldout(c->store->cct, 20) << __func__ << " done" << dendl;
+}
+
+// =======================================================
+// WriteContext
+
+/// Checks for writes to the same pextent within a blob
+bool BlueStore::WriteContext::has_conflict(
+ BlobRef b,
+ uint64_t loffs,
+ uint64_t loffs_end,
+ uint64_t min_alloc_size)
+{
+ ceph_assert((loffs % min_alloc_size) == 0);
+ ceph_assert((loffs_end % min_alloc_size) == 0);
+ for (auto w : writes) {
+ if (b == w.b) {
+ auto loffs2 = p2align(w.logical_offset, min_alloc_size);
+ auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
+ if ((loffs <= loffs2 && loffs_end > loffs2) ||
+ (loffs >= loffs2 && loffs < loffs2_end)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// =======================================================
+
+// DeferredBatch
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
+
+void BlueStore::DeferredBatch::prepare_write(
+ CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ bufferlist::const_iterator& blp)
+{
+ _discard(cct, offset, length);
+ auto i = iomap.insert(make_pair(offset, deferred_io()));
+ ceph_assert(i.second); // this should be a new insertion
+ i.first->second.seq = seq;
+ blp.copy(length, i.first->second.bl);
+ i.first->second.bl.reassign_to_mempool(
+ mempool::mempool_bluestore_writing_deferred);
+ dout(20) << __func__ << " seq " << seq
+ << " 0x" << std::hex << offset << "~" << length
+ << " crc " << i.first->second.bl.crc32c(-1)
+ << std::dec << dendl;
+ seq_bytes[seq] += length;
+#ifdef DEBUG_DEFERRED
+ _audit(cct);
+#endif
+}
+
+void BlueStore::DeferredBatch::_discard(
+ CephContext *cct, uint64_t offset, uint64_t length)
+{
+ generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto p = iomap.lower_bound(offset);
+ if (p != iomap.begin()) {
+ --p;
+ auto end = p->first + p->second.bl.length();
+ if (end > offset) {
+ bufferlist head;
+ head.substr_of(p->second.bl, 0, offset - p->first);
+ dout(20) << __func__ << " keep head " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << head.length() << std::dec << dendl;
+ auto i = seq_bytes.find(p->second.seq);
+ ceph_assert(i != seq_bytes.end());
+ if (end > offset + length) {
+ bufferlist tail;
+ tail.substr_of(p->second.bl, offset + length - p->first,
+ end - (offset + length));
+ dout(20) << __func__ << " keep tail " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << tail.length() << std::dec << dendl;
+ auto &n = iomap[offset + length];
+ n.bl.swap(tail);
+ n.seq = p->second.seq;
+ i->second -= length;
+ } else {
+ i->second -= end - offset;
+ }
+ ceph_assert(i->second >= 0);
+ p->second.bl.swap(head);
+ }
+ ++p;
+ }
+ while (p != iomap.end()) {
+ if (p->first >= offset + length) {
+ break;
+ }
+ auto i = seq_bytes.find(p->second.seq);
+ ceph_assert(i != seq_bytes.end());
+ auto end = p->first + p->second.bl.length();
+ if (end > offset + length) {
+ unsigned drop_front = offset + length - p->first;
+ unsigned keep_tail = end - (offset + length);
+ dout(20) << __func__ << " truncate front " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
+ << " to 0x" << (offset + length) << "~" << keep_tail
+ << std::dec << dendl;
+ auto &s = iomap[offset + length];
+ s.seq = p->second.seq;
+ s.bl.substr_of(p->second.bl, drop_front, keep_tail);
+ i->second -= drop_front;
+ } else {
+ dout(20) << __func__ << " drop " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << std::dec << dendl;
+ i->second -= p->second.bl.length();
+ }
+ ceph_assert(i->second >= 0);
+ p = iomap.erase(p);
+ }
+}
+
+void BlueStore::DeferredBatch::_audit(CephContext *cct)
+{
+ map<uint64_t,int> sb;
+ for (auto p : seq_bytes) {
+ sb[p.first] = 0; // make sure we have the same set of keys
+ }
+ uint64_t pos = 0;
+ for (auto& p : iomap) {
+ ceph_assert(p.first >= pos);
+ sb[p.second.seq] += p.second.bl.length();
+ pos = p.first + p.second.bl.length();
+ }
+ ceph_assert(sb == seq_bytes);
+}
+
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
+
+BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid)
+ : CollectionImpl(cid),
+ store(store_),
+ cache(c),
+ lock("BlueStore::Collection::lock", true, false),
+ exists(true),
+ onode_map(c),
+ commit_queue(nullptr)
+{
+}
+
+bool BlueStore::Collection::flush_commit(Context *c)
+{
+ return osr->flush_commit(c);
+}
+
+void BlueStore::Collection::flush()
+{
+ osr->flush();
+}
+
+void BlueStore::Collection::flush_all_but_last()
+{
+ osr->flush_all_but_last();
+}
+
+void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
+{
+ ceph_assert(!b->shared_blob);
+ const bluestore_blob_t& blob = b->get_blob();
+ if (!blob.is_shared()) {
+ b->shared_blob = new SharedBlob(this);
+ return;
+ }
+
+ b->shared_blob = shared_blob_set.lookup(sbid);
+ if (b->shared_blob) {
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " had " << *b->shared_blob << dendl;
+ } else {
+ b->shared_blob = new SharedBlob(sbid, this);
+ shared_blob_set.add(this, b->shared_blob.get());
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " opened " << *b->shared_blob
+ << dendl;
+ }
+}
+
+void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
+{
+ if (!sb->is_loaded()) {
+
+ bufferlist v;
+ string key;
+ auto sbid = sb->get_sbid();
+ get_shared_blob_key(sbid, &key);
+ int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
+ if (r < 0) {
+ lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " not found at key "
+ << pretty_binary_string(key) << dendl;
+ ceph_abort_msg("uh oh, missing shared_blob");
+ }
+
+ sb->loaded = true;
+ sb->persistent = new bluestore_shared_blob_t(sbid);
+ auto p = v.cbegin();
+ decode(*(sb->persistent), p);
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " loaded shared_blob " << *sb << dendl;
+ }
+}
+
+void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
+{
+ ldout(store->cct, 10) << __func__ << " " << *b << dendl;
+ ceph_assert(!b->shared_blob->is_loaded());
+
+ // update blob
+ bluestore_blob_t& blob = b->dirty_blob();
+ blob.set_flag(bluestore_blob_t::FLAG_SHARED);
+
+ // update shared blob
+ b->shared_blob->loaded = true;
+ b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
+ shared_blob_set.add(this, b->shared_blob.get());
+ for (auto p : blob.get_extents()) {
+ if (p.is_valid()) {
+ b->shared_blob->get_ref(
+ p.offset,
+ p.length);
+ }
+ }
+ ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
+}
+
+uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
+{
+ ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
+ ceph_assert(sb->is_loaded());
+
+ uint64_t sbid = sb->get_sbid();
+ shared_blob_set.remove(sb);
+ sb->loaded = false;
+ delete sb->persistent;
+ sb->sbid_unloaded = 0;
+ ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
+ return sbid;
+}
+
+BlueStore::OnodeRef BlueStore::Collection::get_onode(
+ const ghobject_t& oid,
+ bool create)
+{
+ ceph_assert(create ? lock.is_wlocked() : lock.is_locked());
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!oid.match(cnode.bits, pgid.ps())) {
+ lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+ << pgid << " bits " << cnode.bits << dendl;
+ ceph_abort();
+ }
+ }
+
+ OnodeRef o = onode_map.lookup(oid);
+ if (o)
+ return o;
+
+ string key;
+ get_object_key(store->cct, oid, &key);
+
+ ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+ << pretty_binary_string(key) << dendl;
+
+ bufferlist v;
+ int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
+ ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+ Onode *on;
+ if (v.length() == 0) {
+ ceph_assert(r == -ENOENT);
+ if (!store->cct->_conf->bluestore_debug_misc &&
+ !create)
+ return OnodeRef();
+
+ // new object, new onode
+ on = new Onode(this, oid, key);
+ } else {
+ // loaded
+ ceph_assert(r >= 0);
+ on = Onode::decode(this, oid, key, v);
+ }
+ o.reset(on);
+ return onode_map.add(oid, o);
+}
+
+void BlueStore::Collection::split_cache(
+ Collection *dest)
+{
+ ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
+
+ // lock (one or both) cache shards
+ std::lock(cache->lock, dest->cache->lock);
+ std::lock_guard l(cache->lock, std::adopt_lock);
+ std::lock_guard l2(dest->cache->lock, std::adopt_lock);
+
+ int destbits = dest->cnode.bits;
+ spg_t destpg;
+ bool is_pg = dest->cid.is_pg(&destpg);
+ ceph_assert(is_pg);
+
+ auto p = onode_map.onode_map.begin();
+ while (p != onode_map.onode_map.end()) {
+ OnodeRef o = p->second;
+ if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
+ // onode does not belong to this child
+ ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
+ << dendl;
+ ++p;
+ } else {
+ ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
+ << dendl;
+
+ cache->_rm_onode(p->second);
+ p = onode_map.onode_map.erase(p);
+
+ o->c = dest;
+ dest->cache->_add_onode(o, 1);
+ dest->onode_map.onode_map[o->oid] = o;
+ dest->onode_map.cache = dest->cache;
+
+ // move over shared blobs and buffers. cover shared blobs from
+ // both extent map and spanning blob map (the full extent map
+ // may not be faulted in)
+ vector<SharedBlob*> sbvec;
+ for (auto& e : o->extent_map.extent_map) {
+ sbvec.push_back(e.blob->shared_blob.get());
+ }
+ for (auto& b : o->extent_map.spanning_blob_map) {
+ sbvec.push_back(b.second->shared_blob.get());
+ }
+ for (auto sb : sbvec) {
+ if (sb->coll == dest) {
+ ldout(store->cct, 20) << __func__ << " already moved " << *sb
+ << dendl;
+ continue;
+ }
+ ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
+ if (sb->get_sbid()) {
+ ldout(store->cct, 20) << __func__
+ << " moving registration " << *sb << dendl;
+ shared_blob_set.remove(sb);
+ dest->shared_blob_set.add(dest, sb);
+ }
+ sb->coll = dest;
+ if (dest->cache != cache) {
+ for (auto& i : sb->bc.buffer_map) {
+ if (!i.second->is_writing()) {
+ ldout(store->cct, 20) << __func__ << " moving " << *i.second
+ << dendl;
+ dest->cache->_move_buffer(cache, i.second.get());
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+// =======================================================
+
+// MempoolThread
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
+
+void *BlueStore::MempoolThread::entry()
+{
+ std::unique_lock l(lock);
+
+ uint32_t prev_config_change = store->config_changed.load();
+ uint64_t base = store->osd_memory_base;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+ uint64_t target = store->osd_memory_target;
+ uint64_t min = store->osd_memory_cache_min;
+ uint64_t max = min;
+
+ // When setting the maximum amount of memory to use for cache, first
+ // assume some base amount of memory for the OSD and then fudge in
+ // some overhead for fragmentation that scales with cache usage.
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ binned_kv_cache = store->db->get_priority_cache();
+ if (store->cache_autotune && binned_kv_cache != nullptr) {
+ pcm = std::make_shared<PriorityCache::Manager>(
+ store->cct, min, max, target, true);
+ pcm->insert("kv", binned_kv_cache, true);
+ pcm->insert("meta", meta_cache, true);
+ pcm->insert("data", data_cache, true);
+ }
+
+ utime_t next_balance = ceph_clock_now();
+ utime_t next_resize = ceph_clock_now();
+
+ bool interval_stats_trim = false;
+ while (!stop) {
+ // Update pcm cache settings if related configuration was changed
+ uint32_t cur_config_change = store->config_changed.load();
+ if (cur_config_change != prev_config_change) {
+ _update_cache_settings();
+ prev_config_change = cur_config_change;
+ }
+
+ // Before we trim, check and see if it's time to rebalance/resize.
+ double autotune_interval = store->cache_autotune_interval;
+ double resize_interval = store->osd_memory_cache_resize_interval;
+
+ if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+ _adjust_cache_settings();
+
+ // Log events at 5 instead of 20 when balance happens.
+ interval_stats_trim = true;
+
+ if (pcm != nullptr) {
+ pcm->balance();
+ }
+
+ next_balance = ceph_clock_now();
+ next_balance += autotune_interval;
+ }
+ if (resize_interval > 0 && next_resize < ceph_clock_now()) {
+ if (ceph_using_tcmalloc() && pcm != nullptr) {
+ pcm->tune_memory();
+ }
+ next_resize = ceph_clock_now();
+ next_resize += resize_interval;
+ }
+
+ // Now Trim
+ _trim_shards(interval_stats_trim);
+ interval_stats_trim = false;
+
+ store->_update_cache_logger();
+ auto wait = ceph::make_timespan(
+ store->cct->_conf->bluestore_cache_trim_interval);
+ cond.wait_for(l, wait);
+ }
+ stop = false;
+ return NULL;
+}
+
+void BlueStore::MempoolThread::_adjust_cache_settings()
+{
+ if (binned_kv_cache != nullptr) {
+ binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
+ }
+ meta_cache->set_cache_ratio(store->cache_meta_ratio);
+ data_cache->set_cache_ratio(store->cache_data_ratio);
+}
+
+void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
+{
+ auto cct = store->cct;
+ size_t num_shards = store->cache_shards.size();
+
+ int64_t kv_used = store->db->get_cache_usage();
+ int64_t meta_used = meta_cache->_get_used_bytes();
+ int64_t data_used = data_cache->_get_used_bytes();
+
+ uint64_t cache_size = store->cache_size;
+ int64_t kv_alloc =
+ static_cast<int64_t>(store->cache_kv_ratio * cache_size);
+ int64_t meta_alloc =
+ static_cast<int64_t>(store->cache_meta_ratio * cache_size);
+ int64_t data_alloc =
+ static_cast<int64_t>(store->cache_data_ratio * cache_size);
+
+ if (pcm != nullptr && binned_kv_cache != nullptr) {
+ cache_size = pcm->get_tuned_mem();
+ kv_alloc = binned_kv_cache->get_committed_size();
+ meta_alloc = meta_cache->get_committed_size();
+ data_alloc = data_cache->get_committed_size();
+ }
+
+ if (interval_stats) {
+ ldout(cct, 5) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
+ } else {
+ ldout(cct, 20) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
+ }
+
+ uint64_t max_shard_onodes = static_cast<uint64_t>(
+ (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
+ uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
+
+ auto debug_max_onodes = g_conf()->bluestore_debug_max_cached_onodes;
+ if (debug_max_onodes) {
+ max_shard_onodes = debug_max_onodes;
+ }
+ ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
+ << " max_shard_buffer: " << max_shard_buffer << dendl;
+
+ for (auto i : store->cache_shards) {
+ i->trim(max_shard_onodes, max_shard_buffer);
+ }
+}
+
+void BlueStore::MempoolThread::_update_cache_settings()
+{
+ // Nothing to do if pcm is not used.
+ if (pcm == nullptr) {
+ return;
+ }
+
+ auto cct = store->cct;
+ uint64_t target = store->osd_memory_target;
+ uint64_t base = store->osd_memory_base;
+ uint64_t min = store->osd_memory_cache_min;
+ uint64_t max = min;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ // set pcm cache levels
+ pcm->set_target_memory(target);
+ pcm->set_min_memory(min);
+ pcm->set_max_memory(max);
+
+ ldout(cct, 5) << __func__ << " updated pcm target: " << target
+ << " pcm min: " << min
+ << " pcm max: " << max
+ << dendl;
+}
+
+// =======================================================
+
+// OmapIteratorImpl
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
+
+BlueStore::OmapIteratorImpl::OmapIteratorImpl(
+ CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+ : c(c), o(o), it(it)
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.has_omap()) {
+ get_omap_key(o->onode.nid, string(), &head);
+ get_omap_tail(o->onode.nid, &tail);
+ it->lower_bound(head);
+ }
+}
+
+string BlueStore::OmapIteratorImpl::_stringify() const
+{
+ stringstream s;
+ s << " omap_iterator(cid = " << c->cid
+ <<", oid = " << o->oid << ")";
+ return s.str();
+}
+
+int BlueStore::OmapIteratorImpl::seek_to_first()
+{
+ RWLock::RLocker l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ it->lower_bound(head);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_seek_to_first_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ return 0;
+}
+
+int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+ RWLock::RLocker l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ string key;
+ get_omap_key(o->onode.nid, after, &key);
+ ldout(c->store->cct,20) << __func__ << " after " << after << " key "
+ << pretty_binary_string(key) << dendl;
+ it->upper_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency_fn(
+ __func__,
+ l_bluestore_omap_upper_bound_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age,
+ [&] (const ceph::timespan& lat) {
+ return ", after = " + after +
+ _stringify();
+ }
+ );
+ return 0;
+}
+
+int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+ RWLock::RLocker l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ string key;
+ get_omap_key(o->onode.nid, to, &key);
+ ldout(c->store->cct,20) << __func__ << " to " << to << " key "
+ << pretty_binary_string(key) << dendl;
+ it->lower_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency_fn(
+ __func__,
+ l_bluestore_omap_lower_bound_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age,
+ [&] (const ceph::timespan& lat) {
+ return ", to = " + to +
+ _stringify();
+ }
+ );
+ return 0;
+}
+
+bool BlueStore::OmapIteratorImpl::valid()
+{
+ RWLock::RLocker l(c->lock);
+ bool r = o->onode.has_omap() && it && it->valid() &&
+ it->raw_key().second < tail;
+ if (it && it->valid()) {
+ ldout(c->store->cct,20) << __func__ << " is at "
+ << pretty_binary_string(it->raw_key().second)
+ << dendl;
+ }
+ return r;
+}
+
+int BlueStore::OmapIteratorImpl::next()
+{
+ int r = -1;
+ RWLock::RLocker l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ it->next();
+ r = 0;
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_next_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ return r;
+}
+
+string BlueStore::OmapIteratorImpl::key()
+{
+ RWLock::RLocker l(c->lock);
+ ceph_assert(it->valid());
+ string db_key = it->raw_key().second;
+ string user_key;
+ decode_omap_key(db_key, &user_key);
+
+ return user_key;
+}
+
+bufferlist BlueStore::OmapIteratorImpl::value()
+{
+ RWLock::RLocker l(c->lock);
+ ceph_assert(it->valid());
+ return it->value();
+}
+
+
+// =====================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << path << ") "
+
+
+static void aio_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
+ c->aio_finish(store);
+}
+
+static void discard_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ store->handle_discard(*tmp);
+}
+
+void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(alloc);
+ alloc->release(to_release);
+}
+
+BlueStore::BlueStore(CephContext *cct, const string& path)
+ : ObjectStore(cct, path),
+ throttle_bytes(cct, "bluestore_throttle_bytes",
+ cct->_conf->bluestore_throttle_bytes),
+ throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
+ cct->_conf->bluestore_throttle_bytes +
+ cct->_conf->bluestore_throttle_deferred_bytes),
+ deferred_finisher(cct, "defered_finisher", "dfin"),
+ finisher(cct, "commit_finisher", "cfin"),
+ kv_sync_thread(this),
+ kv_finalize_thread(this),
+ mempool_thread(this)
+{
+ _init_logger();
+ cct->_conf.add_observer(this);
+ set_cache_shards(1);
+}
+
+BlueStore::BlueStore(CephContext *cct,
+ const string& path,
+ uint64_t _min_alloc_size)
+ : ObjectStore(cct, path),
+ throttle_bytes(cct, "bluestore_throttle_bytes",
+ cct->_conf->bluestore_throttle_bytes),
+ throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
+ cct->_conf->bluestore_throttle_bytes +
+ cct->_conf->bluestore_throttle_deferred_bytes),
+ deferred_finisher(cct, "defered_finisher", "dfin"),
+ finisher(cct, "commit_finisher", "cfin"),
+ kv_sync_thread(this),
+ kv_finalize_thread(this),
+ min_alloc_size(_min_alloc_size),
+ min_alloc_size_order(ctz(_min_alloc_size)),
+ mempool_thread(this)
+{
+ _init_logger();
+ cct->_conf.add_observer(this);
+ set_cache_shards(1);
+}
+
+BlueStore::~BlueStore()
+{
+ cct->_conf.remove_observer(this);
+ _shutdown_logger();
+ ceph_assert(!mounted);
+ ceph_assert(db == NULL);
+ ceph_assert(bluefs == NULL);
+ ceph_assert(fsid_fd < 0);
+ ceph_assert(path_fd < 0);
+ for (auto i : cache_shards) {
+ delete i;
+ }
+ cache_shards.clear();
+}
+
+const char **BlueStore::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "bluestore_csum_type",
+ "bluestore_compression_mode",
+ "bluestore_compression_algorithm",
+ "bluestore_compression_min_blob_size",
+ "bluestore_compression_min_blob_size_ssd",
+ "bluestore_compression_min_blob_size_hdd",
+ "bluestore_compression_max_blob_size",
+ "bluestore_compression_max_blob_size_ssd",
+ "bluestore_compression_max_blob_size_hdd",
+ "bluestore_compression_required_ratio",
+ "bluestore_max_alloc_size",
+ "bluestore_prefer_deferred_size",
+ "bluestore_prefer_deferred_size_hdd",
+ "bluestore_prefer_deferred_size_ssd",
+ "bluestore_deferred_batch_ops",
+ "bluestore_deferred_batch_ops_hdd",
+ "bluestore_deferred_batch_ops_ssd",
+ "bluestore_throttle_bytes",
+ "bluestore_throttle_deferred_bytes",
+ "bluestore_throttle_cost_per_io_hdd",
+ "bluestore_throttle_cost_per_io_ssd",
+ "bluestore_throttle_cost_per_io",
+ "bluestore_max_blob_size",
+ "bluestore_max_blob_size_ssd",
+ "bluestore_max_blob_size_hdd",
+ "osd_memory_target",
+ "osd_memory_target_cgroup_limit_ratio",
+ "osd_memory_base",
+ "osd_memory_cache_min",
+ "osd_memory_expected_fragmentation",
+ "bluestore_cache_autotune",
+ "bluestore_cache_autotune_interval",
+ "bluestore_warn_on_legacy_statfs",
+ NULL
+ };
+ return KEYS;
+}
+
+void BlueStore::handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ if (changed.count("bluestore_warn_on_legacy_statfs")) {
+ _check_legacy_statfs_alert();
+ }
+
+ if (changed.count("bluestore_csum_type")) {
+ _set_csum();
+ }
+ if (changed.count("bluestore_compression_mode") ||
+ changed.count("bluestore_compression_algorithm") ||
+ changed.count("bluestore_compression_min_blob_size") ||
+ changed.count("bluestore_compression_max_blob_size")) {
+ if (bdev) {
+ _set_compression();
+ }
+ }
+ if (changed.count("bluestore_max_blob_size") ||
+ changed.count("bluestore_max_blob_size_ssd") ||
+ changed.count("bluestore_max_blob_size_hdd")) {
+ if (bdev) {
+ // only after startup
+ _set_blob_size();
+ }
+ }
+ if (changed.count("bluestore_prefer_deferred_size") ||
+ changed.count("bluestore_prefer_deferred_size_hdd") ||
+ changed.count("bluestore_prefer_deferred_size_ssd") ||
+ changed.count("bluestore_max_alloc_size") ||
+ changed.count("bluestore_deferred_batch_ops") ||
+ changed.count("bluestore_deferred_batch_ops_hdd") ||
+ changed.count("bluestore_deferred_batch_ops_ssd")) {
+ if (bdev) {
+ // only after startup
+ _set_alloc_sizes();
+ }
+ }
+ if (changed.count("bluestore_throttle_cost_per_io") ||
+ changed.count("bluestore_throttle_cost_per_io_hdd") ||
+ changed.count("bluestore_throttle_cost_per_io_ssd")) {
+ if (bdev) {
+ _set_throttle_params();
+ }
+ }
+ if (changed.count("bluestore_throttle_bytes")) {
+ throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
+ throttle_deferred_bytes.reset_max(
+ conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
+ }
+ if (changed.count("bluestore_throttle_deferred_bytes")) {
+ throttle_deferred_bytes.reset_max(
+ conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
+ }
+ if (changed.count("osd_memory_target") ||
+ changed.count("osd_memory_base") ||
+ changed.count("osd_memory_cache_min") ||
+ changed.count("osd_memory_expected_fragmentation")) {
+ _update_osd_memory_options();
+ }
+}
+
+void BlueStore::_set_compression()
+{
+ auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
+ if (m) {
+ _clear_compression_alert();
+ comp_mode = *m;
+ } else {
+ derr << __func__ << " unrecognized value '"
+ << cct->_conf->bluestore_compression_mode
+ << "' for bluestore_compression_mode, reverting to 'none'"
+ << dendl;
+ comp_mode = Compressor::COMP_NONE;
+ string s("unknown mode: ");
+ s += cct->_conf->bluestore_compression_mode;
+ _set_compression_alert(true, s.c_str());
+ }
+
+ compressor = nullptr;
+
+ if (cct->_conf->bluestore_compression_min_blob_size) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
+ } else {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
+ }
+ }
+
+ if (cct->_conf->bluestore_compression_max_blob_size) {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
+ } else {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
+ }
+ }
+
+ auto& alg_name = cct->_conf->bluestore_compression_algorithm;
+ if (!alg_name.empty()) {
+ compressor = Compressor::create(cct, alg_name);
+ if (!compressor) {
+ derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
+ << dendl;
+ _set_compression_alert(false, alg_name.c_str());
+ }
+ }
+
+ dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
+ << " alg " << (compressor ? compressor->get_type_name() : "(none)")
+ << " min_blob " << comp_min_blob_size
+ << " max_blob " << comp_max_blob_size
+ << dendl;
+}
+
+void BlueStore::_set_csum()
+{
+ csum_type = Checksummer::CSUM_NONE;
+ int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
+ if (t > Checksummer::CSUM_NONE)
+ csum_type = t;
+
+ dout(10) << __func__ << " csum_type "
+ << Checksummer::get_csum_type_string(csum_type)
+ << dendl;
+}
+
+void BlueStore::_set_throttle_params()
+{
+ if (cct->_conf->bluestore_throttle_cost_per_io) {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
+ } else {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
+ }
+ }
+
+ dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
+ << dendl;
+}
+void BlueStore::_set_blob_size()
+{
+ if (cct->_conf->bluestore_max_blob_size) {
+ max_blob_size = cct->_conf->bluestore_max_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
+ } else {
+ max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
+ }
+ }
+ dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
+ << std::dec << dendl;
+}
+
+void BlueStore::_update_osd_memory_options()
+{
+ osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+ config_changed++;
+ dout(10) << __func__
+ << " osd_memory_target " << osd_memory_target
+ << " osd_memory_base " << osd_memory_base
+ << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
+ << " osd_memory_cache_min " << osd_memory_cache_min
+ << dendl;
+}
+
+int BlueStore::_set_cache_sizes()
+{
+ ceph_assert(bdev);
+ cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
+ cache_autotune_interval =
+ cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+ osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ osd_memory_expected_fragmentation =
+ cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+ osd_memory_cache_resize_interval =
+ cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
+
+ if (cct->_conf->bluestore_cache_size) {
+ cache_size = cct->_conf->bluestore_cache_size;
+ } else {
+ // choose global cache size based on backend type
+ if (bdev->is_rotational()) {
+ cache_size = cct->_conf->bluestore_cache_size_hdd;
+ } else {
+ cache_size = cct->_conf->bluestore_cache_size_ssd;
+ }
+ }
+
+ cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
+ if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
+ if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ if (cache_meta_ratio + cache_kv_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+ << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
+ << dendl;
+ return -EINVAL;
+ }
+
+ cache_data_ratio =
+ (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
+ if (cache_data_ratio < 0) {
+ // deal with floating point imprecision
+ cache_data_ratio = 0;
+ }
+
+ dout(1) << __func__ << " cache_size " << cache_size
+ << " meta " << cache_meta_ratio
+ << " kv " << cache_kv_ratio
+ << " data " << cache_data_ratio
+ << dendl;
+ return 0;
+}
+
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::write_meta(key, value);
+ }
+ label.meta[key] = value;
+ r = _write_bdev_label(cct, p, label);
+ ceph_assert(r == 0);
+ return ObjectStore::write_meta(key, value);
+}
+
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::read_meta(key, value);
+ }
+ auto i = label.meta.find(key);
+ if (i == label.meta.end()) {
+ return ObjectStore::read_meta(key, value);
+ }
+ *value = i->second;
+ return 0;
+}
+
+void BlueStore::_init_logger()
+{
+ PerfCountersBuilder b(cct, "bluestore",
+ l_bluestore_first, l_bluestore_last);
+ b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
+ "Average kv_thread flush latency",
+ "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
+ "Average kv_thread commit latency");
+ b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
+ "Average kv_sync thread latency",
+ "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
+ "Average kv_finalize thread latency",
+ "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
+ "Average prepare state latency");
+ b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
+ "Average aio_wait state latency",
+ "io_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
+ "Average io_done state latency");
+ b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
+ "Average kv_queued state latency");
+ b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
+ "Average kv_commiting state latency");
+ b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
+ "Average kv_done state latency");
+ b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
+ "Average deferred_queued state latency");
+ b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
+ "Average aio_wait state latency");
+ b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
+ "Average cleanup state latency");
+ b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
+ "Average finishing state latency");
+ b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
+ "Average done state latency");
+ b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
+ "Average submit throttle latency",
+ "th_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
+ "Average submit latency",
+ "s_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
+ "Average commit latency",
+ "c_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_read_lat, "read_lat",
+ "Average read latency",
+ "r_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
+ "Average read onode metadata latency");
+ b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
+ "Average read latency");
+ b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
+ "Average compress latency");
+ b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
+ "Average decompress latency");
+ b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
+ "Average checksum latency");
+ b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
+ "Sum for beneficial compress ops");
+ b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
+ "Sum for compress ops rejected due to low net gain of space");
+ b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
+ "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
+ "Sum for deferred write op");
+ b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
+ "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
+ "Sum for write penalty read ops");
+ b.add_u64(l_bluestore_allocated, "bluestore_allocated",
+ "Sum for allocated bytes");
+ b.add_u64(l_bluestore_stored, "bluestore_stored",
+ "Sum for stored bytes");
+ b.add_u64(l_bluestore_compressed, "bluestore_compressed",
+ "Sum for stored compressed bytes",
+ "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
+ "Sum for bytes allocated for compressed data",
+ "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
+ "Sum for original bytes that were compressed",
+ "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_onodes, "bluestore_onodes",
+ "Number of onodes in cache");
+ b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
+ "Sum for onode-lookups hit in the cache");
+ b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
+ "Sum for onode-lookups missed in the cache");
+ b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
+ "Sum for onode-shard lookups hit in the cache");
+ b.add_u64_counter(l_bluestore_onode_shard_misses,
+ "bluestore_onode_shard_misses",
+ "Sum for onode-shard lookups missed in the cache");
+ b.add_u64(l_bluestore_extents, "bluestore_extents",
+ "Number of extents in cache");
+ b.add_u64(l_bluestore_blobs, "bluestore_blobs",
+ "Number of blobs in cache");
+ b.add_u64(l_bluestore_buffers, "bluestore_buffers",
+ "Number of buffers in cache");
+ b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
+ "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
+ "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
+ "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
+ "Large aligned writes into fresh blobs");
+ b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
+ "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
+ "Large aligned writes into fresh blobs (blobs)");
+ b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
+ "Small writes into existing or sparse small blobs");
+ b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
+ "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_small_unused,
+ "bluestore_write_small_unused",
+ "Small writes into unused portion of existing blob");
+ b.add_u64_counter(l_bluestore_write_small_deferred,
+ "bluestore_write_small_deferred",
+ "Small overwrites using deferred");
+ b.add_u64_counter(l_bluestore_write_small_pre_read,
+ "bluestore_write_small_pre_read",
+ "Small writes that required we read some data (possibly "
+ "cached) to fill out the block");
+ b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
+ "Small write into new (sparse) blob");
+
+ b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
+ b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
+ "Onode extent map reshard events");
+ b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
+ "Sum for blob splitting due to resharding");
+ b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
+ "Sum for extents that have been removed due to compression");
+ b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
+ "Sum for extents that have been merged due to garbage "
+ "collection");
+ b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
+ "Read EIO errors propagated to high level callers");
+ b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
+ "Read operations that required at least one retry due to failed checksum validation");
+ b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
+ "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
+ b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
+ "Average omap iterator seek_to_first call latency");
+ b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
+ "Average omap iterator upper_bound call latency");
+ b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
+ "Average omap iterator lower_bound call latency");
+ b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
+ "Average omap iterator next call latency");
+ b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
+ "Average omap get_keys call latency");
+ b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
+ "Average omap get_values call latency");
+ b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
+ "Average collection listing latency");
+ b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
+ "Average removal latency");
+
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+int BlueStore::_reload_logger()
+{
+ struct store_statfs_t store_statfs;
+ int r = statfs(&store_statfs);
+ if (r >= 0) {
+ logger->set(l_bluestore_allocated, store_statfs.allocated);
+ logger->set(l_bluestore_stored, store_statfs.data_stored);
+ logger->set(l_bluestore_compressed, store_statfs.data_compressed);
+ logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
+ logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
+ }
+ return r;
+}
+
+void BlueStore::_shutdown_logger()
+{
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid)
+{
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0)
+ return r;
+ *fsid = label.osd_uuid;
+ return 0;
+}
+
+int BlueStore::_open_path()
+{
+ // sanity check(s)
+ auto osd_max_object_size =
+ cct->_conf.get_val<Option::size_t>("osd_max_object_size");
+ if (osd_max_object_size >= (size_t)OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size >= 0x" << std::hex << OBJECT_MAX_SIZE
+ << "; BlueStore has hard limit of 0x" << OBJECT_MAX_SIZE << "." << std::dec << dendl;
+ return -EINVAL;
+ }
+ ceph_assert(path_fd < 0);
+ path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
+ if (path_fd < 0) {
+ int r = -errno;
+ derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void BlueStore::_close_path()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(path_fd));
+ path_fd = -1;
+}
+
+int BlueStore::_write_bdev_label(CephContext *cct,
+ string path, bluestore_bdev_label_t label)
+{
+ dout(10) << __func__ << " path " << path << " label " << label << dendl;
+ bufferlist bl;
+ encode(label, bl);
+ uint32_t crc = bl.crc32c(-1);
+ encode(crc, bl);
+ ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
+ bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
+ z.zero();
+ bl.append(std::move(z));
+
+ int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
+ if (fd < 0) {
+ fd = -errno;
+ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+ int r = bl.write_fd(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to write to " << path
+ << ": " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to fsync " << path
+ << ": " << cpp_strerror(r) << dendl;
+ }
+out:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+}
+
+int BlueStore::_read_bdev_label(CephContext* cct, string path,
+ bluestore_bdev_label_t *label)
+{
+ dout(10) << __func__ << dendl;
+ int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
+ if (fd < 0) {
+ fd = -errno;
+ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+ bufferlist bl;
+ int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ if (r < 0) {
+ derr << __func__ << " failed to read from " << path
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ uint32_t crc, expected_crc;
+ auto p = bl.cbegin();
+ try {
+ decode(*label, p);
+ bufferlist t;
+ t.substr_of(bl, 0, p.get_off());
+ crc = t.crc32c(-1);
+ decode(expected_crc, p);
+ }
+ catch (buffer::error& e) {
+ dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
+ << ": " << e.what()
+ << dendl;
+ return -ENOENT;
+ }
+ if (crc != expected_crc) {
+ derr << __func__ << " bad crc on label, expected " << expected_crc
+ << " != actual " << crc << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " got " << *label << dendl;
+ return 0;
+}
+
+int BlueStore::_check_or_set_bdev_label(
+ string path, uint64_t size, string desc, bool create)
+{
+ bluestore_bdev_label_t label;
+ if (create) {
+ label.osd_uuid = fsid;
+ label.size = size;
+ label.btime = ceph_clock_now();
+ label.description = desc;
+ int r = _write_bdev_label(cct, path, label);
+ if (r < 0)
+ return r;
+ } else {
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0)
+ return r;
+ if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
+ dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+ << " and fsid " << fsid << " check bypassed" << dendl;
+ }
+ else if (label.osd_uuid != fsid) {
+ derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+ << " does not match our fsid " << fsid << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+void BlueStore::_set_alloc_sizes(void)
+{
+ max_alloc_size = cct->_conf->bluestore_max_alloc_size;
+
+ if (cct->_conf->bluestore_prefer_deferred_size) {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
+ } else {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
+ }
+ }
+
+ if (cct->_conf->bluestore_deferred_batch_ops) {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
+ } else {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
+ }
+ }
+
+ dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+ << std::dec << " order " << (int)min_alloc_size_order
+ << " max_alloc_size 0x" << std::hex << max_alloc_size
+ << " prefer_deferred_size 0x" << prefer_deferred_size
+ << std::dec
+ << " deferred_batch_ops " << deferred_batch_ops
+ << dendl;
+}
+
+int BlueStore::_open_bdev(bool create)
+{
+ ceph_assert(bdev == NULL);
+ string p = path + "/block";
+ bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
+ int r = bdev->open(p);
+ if (r < 0)
+ goto fail;
+
+ if (create && cct->_conf->bdev_enable_discard) {
+ bdev->discard(0, bdev->get_size());
+ }
+
+ if (bdev->supported_bdev_label()) {
+ r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
+ if (r < 0)
+ goto fail_close;
+ }
+
+ // initialize global block parameters
+ block_size = bdev->get_block_size();
+ block_mask = ~(block_size - 1);
+ block_size_order = ctz(block_size);
+ ceph_assert(block_size == 1u << block_size_order);
+ // and set cache_size based on device type
+ r = _set_cache_sizes();
+ if (r < 0) {
+ goto fail_close;
+ }
+ return 0;
+
+ fail_close:
+ bdev->close();
+ fail:
+ delete bdev;
+ bdev = NULL;
+ return r;
+}
+
+void BlueStore::_validate_bdev()
+{
+ ceph_assert(bdev);
+ ceph_assert(min_alloc_size); // _get_odisk_reserved depends on that
+ uint64_t dev_size = bdev->get_size();
+ if (dev_size <
+ _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
+ dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
+ << " is too small, disable bluestore_bluefs_min for now"
+ << dendl;
+ ceph_assert(dev_size >= _get_ondisk_reserved());
+
+ int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
+ ceph_assert(r == 0);
+ }
+}
+
+void BlueStore::_close_bdev()
+{
+ ceph_assert(bdev);
+ bdev->close();
+ delete bdev;
+ bdev = NULL;
+}
+
+int BlueStore::_open_fm(KeyValueDB::Transaction t)
+{
+ ceph_assert(fm == NULL);
+ fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
+ ceph_assert(fm);
+ if (t) {
+ // create mode. initialize freespace
+ dout(20) << __func__ << " initializing freespace" << dendl;
+ {
+ bufferlist bl;
+ bl.append(freelist_type);
+ t->set(PREFIX_SUPER, "freelist_type", bl);
+ }
+ // being able to allocate in units less than bdev block size
+ // seems to be a bad idea.
+ ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
+ fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
+
+ // allocate superblock reserved space. note that we do not mark
+ // bluefs space as allocated in the freelist; we instead rely on
+ // bluefs_extents.
+ auto reserved = _get_ondisk_reserved();
+ fm->allocate(0, reserved, t);
+
+ if (cct->_conf->bluestore_bluefs) {
+ ceph_assert(bluefs_extents.num_intervals() == 1);
+ interval_set<uint64_t>::iterator p = bluefs_extents.begin();
+ reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
+ dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
+ << " for bluefs" << dendl;
+ }
+
+ if (cct->_conf->bluestore_debug_prefill > 0) {
+ uint64_t end = bdev->get_size() - reserved;
+ dout(1) << __func__ << " pre-fragmenting freespace, using "
+ << cct->_conf->bluestore_debug_prefill << " with max free extent "
+ << cct->_conf->bluestore_debug_prefragment_max << dendl;
+ uint64_t start = p2roundup(reserved, min_alloc_size);
+ uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
+ float r = cct->_conf->bluestore_debug_prefill;
+ r /= 1.0 - r;
+ bool stop = false;
+
+ while (!stop && start < end) {
+ uint64_t l = (rand() % max_b + 1) * min_alloc_size;
+ if (start + l > end) {
+ l = end - start;
+ l = p2align(l, min_alloc_size);
+ }
+ ceph_assert(start + l <= end);
+
+ uint64_t u = 1 + (uint64_t)(r * (double)l);
+ u = p2roundup(u, min_alloc_size);
+ if (start + l + u > end) {
+ u = end - (start + l);
+ // trim to align so we don't overflow again
+ u = p2align(u, min_alloc_size);
+ stop = true;
+ }
+ ceph_assert(start + l + u <= end);
+
+ dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
+ << " use 0x" << u << std::dec << dendl;
+
+ if (u == 0) {
+ // break if u has been trimmed to nothing
+ break;
+ }
+
+ fm->allocate(start + l, u, t);
+ start += l + u;
+ }
+ }
+ }
+
+ int r = fm->init(db);
+ if (r < 0) {
+ derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
+ delete fm;
+ fm = NULL;
+ return r;
+ }
+ // if space size tracked by free list manager is that higher than actual
+ // dev size one can hit out-of-space allocation which will result
+ // in data loss and/or assertions
+ // Probably user altered the device size somehow.
+ // The only fix for now is to redeploy OSD.
+ if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
+ ostringstream ss;
+ ss << "slow device size mismatch detected, "
+ << " fm size(" << fm->get_size()
+ << ") > slow device size(" << bdev->get_size()
+ << "), Please stop using this OSD as it might cause data loss.";
+ _set_disk_size_mismatch_alert(ss.str());
+ }
+ return 0;
+}
+
+void BlueStore::_close_fm()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(fm);
+ fm->shutdown();
+ delete fm;
+ fm = NULL;
+}
+
+int BlueStore::_open_alloc()
+{
+ ceph_assert(alloc == NULL);
+ ceph_assert(bdev->get_size());
+
+ if (bluefs) {
+ bluefs_extents.clear();
+ auto r = bluefs->get_block_extents(bluefs_shared_bdev, &bluefs_extents);
+ if (r < 0) {
+ lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
+ << cpp_strerror(r) << dendl;
+
+ return r;
+ }
+ dout(10) << __func__ << " bluefs extents 0x"
+ << std::hex << bluefs_extents << std::dec
+ << dendl;
+ }
+
+ alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
+ bdev->get_size(),
+ min_alloc_size, "block");
+ if (!alloc) {
+ lderr(cct) << __func__ << " Allocator::unknown alloc type "
+ << cct->_conf->bluestore_allocator
+ << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t num = 0, bytes = 0;
+
+ dout(1) << __func__ << " opening allocation metadata" << dendl;
+ // initialize from freelist
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ alloc->init_add_free(offset, length);
+ ++num;
+ bytes += length;
+ }
+ fm->enumerate_reset();
+
+ // also mark bluefs space as allocated
+ for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
+ alloc->init_rm_free(e.get_start(), e.get_len());
+ }
+
+ dout(1) << __func__ << " loaded " << byte_u_t(bytes)
+ << " in " << num << " extents"
+ << " available " << byte_u_t(alloc->get_free())
+ << dendl;
+
+ return 0;
+}
+
+void BlueStore::_close_alloc()
+{
+ ceph_assert(bdev);
+ bdev->discard_drain();
+
+ ceph_assert(alloc);
+ alloc->shutdown();
+ delete alloc;
+ alloc = NULL;
+ bluefs_extents.clear();
+}
+
+int BlueStore::_open_fsid(bool create)
+{
+ ceph_assert(fsid_fd < 0);
+ int flags = O_RDWR|O_CLOEXEC;
+ if (create)
+ flags |= O_CREAT;
+ fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+ if (fsid_fd < 0) {
+ int err = -errno;
+ derr << __func__ << " " << cpp_strerror(err) << dendl;
+ return err;
+ }
+ return 0;
+}
+
+int BlueStore::_read_fsid(uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0) {
+ derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str)) {
+ derr << __func__ << " unparsable uuid " << fsid_str << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int BlueStore::_write_fsid()
+{
+ int r = ::ftruncate(fsid_fd, 0);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ string str = stringify(fsid) + "\n";
+ r = safe_write(fsid_fd, str.c_str(), str.length());
+ if (r < 0) {
+ derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = ::fsync(fsid_fd);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void BlueStore::_close_fsid()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+}
+
+int BlueStore::_lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ derr << __func__ << " failed to lock " << path << "/fsid"
+ << " (is another ceph-osd still running?)"
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool BlueStore::is_rotational()
+{
+ if (bdev) {
+ return bdev->is_rotational();
+ }
+
+ bool rotational = true;
+ int r = _open_path();
+ if (r < 0)
+ goto out;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+ rotational = bdev->is_rotational();
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ out:
+ return rotational;
+}
+
+bool BlueStore::is_journal_rotational()
+{
+ if (!bluefs) {
+ dout(5) << __func__ << " bluefs disabled, default to store media type"
+ << dendl;
+ return is_rotational();
+ }
+ dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
+ return bluefs->wal_is_rotational();
+}
+
+bool BlueStore::test_mount_in_use()
+{
+ // most error conditions mean the mount is not in use (e.g., because
+ // it doesn't exist). only if we fail to lock do we conclude it is
+ // in use.
+ bool ret = false;
+ int r = _open_path();
+ if (r < 0)
+ return false;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _lock_fsid();
+ if (r < 0)
+ ret = true; // if we can't lock, it is in use
+ _close_fsid();
+ out_path:
+ _close_path();
+ return ret;
+}
+
+int BlueStore::_minimal_open_bluefs(bool create)
+{
+ int r;
+ bluefs = new BlueFS(cct);
+
+ string bfn;
+ struct stat st;
+
+ bfn = path + "/block.db";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(
+ BlueFS::BDEV_DB, bfn,
+ create && cct->_conf->bdev_enable_discard);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB),
+ "bluefs db", create);
+ if (r < 0) {
+ derr << __func__
+ << " check block device(" << bfn << ") label returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ if (create) {
+ bluefs->add_block_extent(
+ BlueFS::BDEV_DB,
+ SUPER_RESERVED,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
+ }
+ bluefs_shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_single_shared_device = false;
+ } else {
+ r = -errno;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ bluefs_shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ // shared device
+ bfn = path + "/block";
+ // never trim here
+ r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false,
+ true /* shared with bluestore */);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ if (create) {
+ // note: we always leave the first SUPER_RESERVED (8k) of the device unused
+ uint64_t initial =
+ bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
+ cct->_conf->bluestore_bluefs_gift_ratio);
+ initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
+ uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
+ if (alloc_size % min_alloc_size) {
+ derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
+ << alloc_size << " is not a multiple of "
+ << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ r = -EINVAL;
+ goto free_bluefs;
+ }
+ // align to bluefs's alloc_size
+ initial = p2roundup(initial, alloc_size);
+ // put bluefs in the middle of the device in case it is an HDD
+ uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
+ //avoiding superblock overwrite
+ start = std::max(alloc_size, start);
+ ceph_assert(start >=_get_ondisk_reserved());
+
+ bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
+ bluefs_extents.insert(start, initial);
+ ++out_of_sync_fm;
+ }
+
+ bfn = path + "/block.wal";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
+ create && cct->_conf->bdev_enable_discard);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL),
+ "bluefs wal", create);
+ if (r < 0) {
+ derr << __func__ << " check block device(" << bfn
+ << ") label returned: " << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ if (create) {
+ bluefs->add_block_extent(
+ BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
+ BDEV_LABEL_BLOCK_SIZE);
+ }
+ bluefs_single_shared_device = false;
+ } else {
+ r = 0;
+ if (::lstat(bfn.c_str(), &st) != -1) {
+ r = -errno;
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ return 0;
+
+free_bluefs:
+ ceph_assert(bluefs);
+ delete bluefs;
+ bluefs = NULL;
+ return r;
+}
+
+int BlueStore::_open_bluefs(bool create)
+{
+ int r = _minimal_open_bluefs(create);
+ if (r < 0) {
+ return r;
+ }
+ RocksDBBlueFSVolumeSelector* vselector = nullptr;
+ if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
+
+ string options = cct->_conf->bluestore_rocksdb_options;
+
+ rocksdb::Options rocks_opts;
+ int r = RocksDBStore::ParseOptionsFromStringStatic(
+ cct,
+ options,
+ rocks_opts,
+ nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
+ vselector =
+ new RocksDBBlueFSVolumeSelector(
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
+ 1024 * 1024 * 1024, //FIXME: set expected l0 size here
+ rocks_opts.max_bytes_for_level_base,
+ rocks_opts.max_bytes_for_level_multiplier,
+ reserved_factor,
+ cct->_conf->bluestore_volume_selection_reserved,
+ cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
+ }
+ if (create) {
+ bluefs->mkfs(fsid);
+ }
+ bluefs->set_volume_selector(vselector);
+ r = bluefs->mount();
+ if (r < 0) {
+ derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+}
+
+void BlueStore::_close_bluefs(bool cold_close)
+{
+ bluefs->umount(cold_close);
+ _minimal_close_bluefs();
+}
+
+void BlueStore::_minimal_close_bluefs()
+{
+ delete bluefs;
+ bluefs = NULL;
+}
+
+int BlueStore::_is_bluefs(bool create, bool* ret)
+{
+ if (create) {
+ *ret = cct->_conf->bluestore_bluefs;
+ } else {
+ string s;
+ int r = read_meta("bluefs", &s);
+ if (r < 0) {
+ derr << __func__ << " unable to read 'bluefs' meta" << dendl;
+ return -EIO;
+ }
+ if (s == "1") {
+ *ret = true;
+ } else if (s == "0") {
+ *ret = false;
+ } else {
+ derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
+ << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+/*
+* opens both DB and dependant super_meta, FreelistManager and allocator
+* in the proper order
+*/
+int BlueStore::_open_db_and_around(bool read_only)
+{
+ int r;
+ bool do_bluefs = false;
+ _is_bluefs(false, &do_bluefs); // ignore err code
+ if (do_bluefs) {
+ // open in read-only first to read FM list and init allocator
+ // as they might be needed for some BlueFS procedures
+ r = _open_db(false, false, true);
+ if (r < 0)
+ return r;
+
+ r = _open_super_meta();
+ if (r < 0) {
+ goto out_db;
+ }
+
+ r = _open_fm(nullptr);
+ if (r < 0)
+ goto out_db;
+
+ r = _open_alloc();
+ if (r < 0)
+ goto out_fm;
+
+ // now open in R/W mode
+ if (!read_only) {
+ _close_db(true);
+
+ r = _open_db(false, false, false);
+ if (r < 0) {
+ _close_alloc();
+ _close_fm();
+ return r;
+ }
+ }
+ } else {
+ r = _open_db(false, false);
+ if (r < 0) {
+ return r;
+ }
+ r = _open_super_meta();
+ if (r < 0) {
+ goto out_db;
+ }
+
+ r = _open_fm(nullptr);
+ if (r < 0)
+ goto out_db;
+
+ r = _open_alloc();
+ if (r < 0)
+ goto out_fm;
+ }
+ return 0;
+
+ out_fm:
+ _close_fm();
+ out_db:
+ _close_db(read_only);
+ return r;
+}
+
+void BlueStore::_close_db_and_around(bool read_only)
+{
+ if (bluefs) {
+ if (!read_only && out_of_sync_fm.fetch_and(0)) {
+ _sync_bluefs_and_fm();
+ }
+ _close_db(read_only);
+ while(!read_only && out_of_sync_fm.fetch_and(0)) {
+ // if seen some allocations during close - repeat open_db, sync fm, close
+ dout(0) << __func__ << " syncing FreelistManager" << dendl;
+ int r = _open_db(false, false, false);
+ if (r < 0) {
+ derr << __func__
+ << " unable to open db, FreelistManager is probably out of sync"
+ << dendl;
+ break;
+ }
+ _sync_bluefs_and_fm();
+ _close_db(false);
+ }
+ if (!_kv_only) {
+ _close_alloc();
+ _close_fm();
+ }
+ } else {
+ _close_alloc();
+ _close_fm();
+ _close_db(read_only);
+ }
+}
+
+// updates legacy bluefs related recs in DB to a state valid for
+// downgrades from nautilus.
+void BlueStore::_sync_bluefs_and_fm()
+{
+ if (cct->_conf->bluestore_bluefs_db_compatibility) {
+ bufferlist bl;
+ encode(bluefs_extents, bl);
+ dout(20) << __func__ << " bluefs_extents at KV is now 0x"
+ << std::hex << bluefs_extents << std::dec
+ << dendl;
+ KeyValueDB::Transaction synct = db->get_transaction();
+ synct->set(PREFIX_SUPER, "bluefs_extents", bl);
+ synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
+
+ // Nice thing is that we don't need to update FreelistManager here.
+ // It always has corresponding bits set to 'Free' for both Nautilus+ and
+ // pre-Nautilis releases.
+ // So once we get an extent to bluefs_extents this means it's
+ // been free in allocator and hence it's free in FM too.
+
+ db->submit_transaction_sync(synct);
+ }
+}
+
+int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
+{
+ int r;
+ ceph_assert(!db);
+ ceph_assert(!(create && read_only));
+ string fn = path + "/db";
+ string options;
+ stringstream err;
+ std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
+
+ string kv_backend;
+ std::vector<KeyValueDB::ColumnFamily> cfs;
+
+ if (create) {
+ kv_backend = cct->_conf->bluestore_kvbackend;
+ } else {
+ r = read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
+ return -EIO;
+ }
+ }
+ dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+ bool do_bluefs;
+ r = _is_bluefs(create, &do_bluefs);
+ if (r < 0) {
+ return r;
+ }
+ dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
+
+ map<string,string> kv_options;
+ // force separate wal dir for all new deployments.
+ kv_options["separate_wal_dir"] = 1;
+ rocksdb::Env *env = NULL;
+ if (do_bluefs) {
+ dout(10) << __func__ << " initializing bluefs" << dendl;
+ if (kv_backend != "rocksdb") {
+ derr << " backend must be rocksdb to use bluefs" << dendl;
+ return -EINVAL;
+ }
+
+ r = _open_bluefs(create);
+ if (r < 0) {
+ return r;
+ }
+
+ if (cct->_conf->bluestore_bluefs_env_mirror) {
+ rocksdb::Env* a = new BlueRocksEnv(bluefs);
+ rocksdb::Env* b = rocksdb::Env::Default();
+ if (create) {
+ string cmd = "rm -rf " + path + "/db " +
+ path + "/db.slow " +
+ path + "/db.wal";
+ int r = system(cmd.c_str());
+ (void)r;
+ }
+ env = new rocksdb::EnvMirror(b, a, false, true);
+ } else {
+ env = new BlueRocksEnv(bluefs);
+
+ // simplify the dir names, too, as "seen" by rocksdb
+ fn = "db";
+ }
+ bluefs->set_slow_device_expander(this);
+ BlueFSVolumeSelector::paths paths;
+ bluefs->get_vselector_paths(fn, paths);
+
+ if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
+ // we have both block.db and block; tell rocksdb!
+ // note: the second (last) size value doesn't really matter
+ ostringstream db_paths;
+ bool first = true;
+ for (auto& p : paths) {
+ if (!first) {
+ db_paths << " ";
+ }
+ first = false;
+ db_paths << p.first << "," << p.second;
+
+ }
+ kv_options["db_paths"] = db_paths.str();
+ dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
+ }
+
+ if (create) {
+ for (auto& p : paths) {
+ env->CreateDir(p.first);
+ }
+ // Selectors don't provide wal path so far hence create explicitly
+ env->CreateDir(fn + ".wal");
+ } else {
+ std::vector<std::string> res;
+ // check for dir presence
+ auto r = env->GetChildren(fn+".wal", &res);
+ if (r.IsNotFound()) {
+ kv_options.erase("separate_wal_dir");
+ }
+ }
+ } else {
+ string walfn = path + "/db.wal";
+
+ if (create) {
+ int r = ::mkdir(fn.c_str(), 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // wal_dir, too!
+ r = ::mkdir(walfn.c_str(), 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << walfn
+ << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ } else {
+ struct stat st;
+ r = ::stat(walfn.c_str(), &st);
+ if (r < 0 && errno == ENOENT) {
+ kv_options.erase("separate_wal_dir");
+ }
+ }
+ }
+
+
+ db = KeyValueDB::create(cct,
+ kv_backend,
+ fn,
+ kv_options,
+ static_cast<void*>(env));
+ if (!db) {
+ derr << __func__ << " error creating db" << dendl;
+ if (bluefs) {
+ _close_bluefs(read_only);
+ }
+ // delete env manually here since we can't depend on db to do this
+ // under this case
+ delete env;
+ env = NULL;
+ return -EIO;
+ }
+
+ FreelistManager::setup_merge_operators(db);
+ db->set_merge_operator(PREFIX_STAT, merge_op);
+ db->set_cache_size(cache_kv_ratio * cache_size);
+
+ if (kv_backend == "rocksdb") {
+ options = cct->_conf->bluestore_rocksdb_options;
+
+ map<string,string> cf_map;
+ cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
+ get_str_map,
+ &cf_map,
+ " \t");
+ for (auto& i : cf_map) {
+ dout(10) << "column family " << i.first << ": " << i.second << dendl;
+ cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
+ }
+ }
+
+ db->init(options);
+ if (to_repair_db)
+ return 0;
+ if (create) {
+ if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
+ r = db->create_and_open(err, cfs);
+ } else {
+ r = db->create_and_open(err);
+ }
+ } else {
+ // we pass in cf list here, but it is only used if the db already has
+ // column families created.
+ r = read_only ?
+ db->open_read_only(err, cfs) :
+ db->open(err, cfs);
+ }
+ if (r) {
+ derr << __func__ << " erroring opening db: " << err.str() << dendl;
+ _close_db(read_only);
+ return -EIO;
+ }
+ dout(1) << __func__ << " opened " << kv_backend
+ << " path " << fn << " options " << options << dendl;
+ return 0;
+}
+
+void BlueStore::_close_db(bool cold_close)
+{
+ ceph_assert(db);
+ delete db;
+ db = NULL;
+ if (bluefs) {
+ _close_bluefs(cold_close);
+ }
+}
+
+void BlueStore::_dump_alloc_on_failure()
+{
+ auto dump_interval =
+ cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
+ if (dump_interval > 0 &&
+ next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
+ alloc->dump();
+ next_dump_on_bluefs_alloc_failure = ceph_clock_now();
+ next_dump_on_bluefs_alloc_failure += dump_interval;
+ }
+}
+
+
+int BlueStore::allocate_bluefs_freespace(
+ uint64_t min_size,
+ uint64_t size,
+ PExtentVector* extents_out)
+{
+ ceph_assert(min_size <= size);
+ if (size) {
+ // round up to alloc size
+ uint64_t alloc_size = bluefs->get_alloc_size(bluefs_shared_bdev);
+ min_size = p2roundup(min_size, alloc_size);
+ size = p2roundup(size, alloc_size);
+
+ PExtentVector extents_local;
+ PExtentVector* extents = extents_out ? extents_out : &extents_local;
+
+
+ uint64_t gift;
+ uint64_t allocated = 0;
+ int64_t alloc_len;
+ do {
+ // hard cap to fit into 32 bits
+ gift = std::min<uint64_t>(size, 1ull << 31);
+ dout(10) << __func__ << " gifting " << gift
+ << " (" << byte_u_t(gift) << ")" << dendl;
+
+ alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
+ if (alloc_len > 0) {
+ allocated += alloc_len;
+ size -= alloc_len;
+ }
+
+ if (alloc_len < 0 ||
+ (alloc_len < (int64_t)gift && (min_size > allocated))) {
+ derr << __func__
+ << " failed to allocate on 0x" << std::hex << gift
+ << " min_size 0x" << min_size
+ << " > allocated total 0x" << allocated
+ << " bluefs_shared_alloc_size 0x" << alloc_size
+ << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
+ << " available 0x " << alloc->get_free()
+ << std::dec << dendl;
+
+ _dump_alloc_on_failure();
+ alloc->release(*extents);
+ extents->clear();
+ return -ENOSPC;
+ }
+ } while (size && alloc_len > 0);
+ for (auto& e : *extents) {
+ dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
+ bluefs_extents.insert(e.offset, e.length);
+ ++out_of_sync_fm;
+ // apply to bluefs if not requested from outside
+ if (!extents_out) {
+ bluefs->add_block_extent(bluefs_shared_bdev, e.offset, e.length);
+ }
+ }
+ }
+ return 0;
+}
+
+size_t BlueStore::available_freespace(uint64_t alloc_size) {
+ size_t total = 0;
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ //only count in size that is alloc_size aligned
+ size_t dist_to_alignment;
+ size_t offset_in_block = off & (alloc_size - 1);
+ if (offset_in_block == 0)
+ dist_to_alignment = 0;
+ else
+ dist_to_alignment = alloc_size - offset_in_block;
+ if (dist_to_alignment >= len)
+ return;
+ len -= dist_to_alignment;
+ total += p2align(len, alloc_size);
+ };
+ alloc->dump(iterated_allocation);
+ return total;
+}
+
+int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
+{
+ float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
+
+ uint64_t my_free = alloc->get_free();
+ uint64_t total = bdev->get_size();
+ float my_free_ratio = (float)my_free / (float)total;
+
+ uint64_t total_free = bluefs_free + my_free;
+
+ float bluefs_ratio = (float)bluefs_free / (float)total_free;
+
+ dout(10) << __func__
+ << " bluefs " << byte_u_t(bluefs_free)
+ << " free (" << bluefs_free_ratio
+ << ") bluestore " << byte_u_t(my_free)
+ << " free (" << my_free_ratio
+ << "), bluefs_ratio " << bluefs_ratio
+ << dendl;
+
+ uint64_t gift = 0;
+ uint64_t reclaim = 0;
+ if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
+ gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
+ dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
+ << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
+ << ", should gift " << byte_u_t(gift) << dendl;
+ } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
+ reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
+ if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
+ reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
+ dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
+ << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
+ << ", should reclaim " << byte_u_t(reclaim) << dendl;
+ }
+
+ // don't take over too much of the freespace
+ uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
+ if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
+ cct->_conf->bluestore_bluefs_min < free_cap) {
+ uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
+ dout(10) << __func__ << " bluefs_total " << bluefs_total
+ << " < min " << cct->_conf->bluestore_bluefs_min
+ << ", should gift " << byte_u_t(g) << dendl;
+ if (g > gift)
+ gift = g;
+ reclaim = 0;
+ }
+ uint64_t min_free = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
+ if (bluefs_free < min_free &&
+ min_free < free_cap) {
+ uint64_t g = min_free - bluefs_free;
+ dout(10) << __func__ << " bluefs_free " << bluefs_free
+ << " < min " << min_free
+ << ", should gift " << byte_u_t(g) << dendl;
+ if (g > gift)
+ gift = g;
+ reclaim = 0;
+ }
+ ceph_assert((int64_t)gift >= 0);
+ ceph_assert((int64_t)reclaim >= 0);
+ return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
+}
+
+int BlueStore::_balance_bluefs_freespace()
+{
+ int ret = 0;
+ ceph_assert(bluefs);
+
+ vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
+ bluefs->get_usage(&bluefs_usage);
+ ceph_assert(bluefs_usage.size() > bluefs_shared_bdev);
+
+ bool clear_alert = true;
+ if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
+ auto& p = bluefs_usage[bluefs_shared_bdev];
+ if (p.first != p.second) {
+ auto& db = bluefs_usage[BlueFS::BDEV_DB];
+ ostringstream ss;
+ ss << "spilled over " << byte_u_t(p.second - p.first)
+ << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
+ << " used of " << byte_u_t(db.second) << ") to slow device";
+ _set_spillover_alert(ss.str());
+ clear_alert = false;
+ }
+ }
+ if (clear_alert) {
+ _clear_spillover_alert();
+ }
+
+ // fixme: look at primary bdev only for now
+ int64_t delta = _get_bluefs_size_delta(
+ bluefs_usage[bluefs_shared_bdev].first,
+ bluefs_usage[bluefs_shared_bdev].second);
+
+ // reclaim from bluefs?
+ if (delta < 0) {
+ // round up to alloc size
+ uint64_t alloc_size = bluefs->get_alloc_size(bluefs_shared_bdev);
+ auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
+
+ // hard cap to fit into 32 bits
+ reclaim = std::min<uint64_t>(reclaim, 1ull << 31);
+ dout(10) << __func__ << " reclaiming " << reclaim
+ << " (" << byte_u_t(reclaim) << ")" << dendl;
+
+ while (reclaim > 0) {
+ // NOTE: this will block and do IO.
+ PExtentVector extents;
+ int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
+ &extents);
+ if (r < 0) {
+ derr << __func__ << " failed to reclaim space from bluefs"
+ << dendl;
+ break;
+ }
+ for (auto e : extents) {
+ ++out_of_sync_fm;
+ bluefs_extents.erase(e.offset, e.length);
+ bluefs_extents_reclaiming.insert(e.offset, e.length);
+ reclaim -= e.length;
+ }
+ }
+
+ ret = 1;
+ }
+
+ return ret;
+}
+
+int BlueStore::_open_collections()
+{
+ dout(10) << __func__ << dendl;
+ collections_had_errors = false;
+ ceph_assert(coll_map.empty());
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (cid.parse(it->key())) {
+ CollectionRef c(
+ new Collection(
+ this,
+ cache_shards[cid.hash_to_shard(cache_shards.size())],
+ cid));
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(c->cnode, p);
+ } catch (buffer::error& e) {
+ derr << __func__ << " failed to decode cnode, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ return -EIO;
+ }
+ dout(20) << __func__ << " opened " << cid << " " << c
+ << " " << c->cnode << dendl;
+ _osr_attach(c.get());
+ coll_map[cid] = c;
+
+ } else {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ collections_had_errors = true;
+ }
+ }
+ return 0;
+}
+
+void BlueStore::_fsck_collections(int64_t* errors)
+{
+ if (collections_had_errors) {
+ dout(10) << __func__ << dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (!cid.parse(it->key())) {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ if (errors) {
+ (*errors)++;
+ }
+ }
+ }
+ }
+}
+
+void BlueStore::_open_statfs()
+{
+ osd_pools.clear();
+ vstatfs.reset();
+
+ bufferlist bl;
+ int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
+ if (r >= 0) {
+ per_pool_stat_collection = false;
+ if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
+ auto it = bl.cbegin();
+ vstatfs.decode(it);
+ dout(10) << __func__ << " store_statfs is found" << dendl;
+ } else {
+ dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
+ }
+ _check_legacy_statfs_alert();
+ } else {
+ per_pool_stat_collection = true;
+ dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+
+ uint64_t pool_id;
+ int r = get_key_pool_stat(it->key(), &pool_id);
+ ceph_assert(r == 0);
+
+ bufferlist bl;
+ bl = it->value();
+ auto p = bl.cbegin();
+ auto& st = osd_pools[pool_id];
+ try {
+ st.decode(p);
+ vstatfs += st;
+
+ dout(30) << __func__ << " pool " << pool_id
+ << " statfs " << st << dendl;
+ } catch (buffer::error& e) {
+ derr << __func__ << " failed to decode pool stats, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ }
+ }
+ }
+ dout(30) << __func__ << " statfs " << vstatfs << dendl;
+
+}
+
+int BlueStore::_setup_block_symlink_or_file(
+ string name,
+ string epath,
+ uint64_t size,
+ bool create)
+{
+ dout(20) << __func__ << " name " << name << " path " << epath
+ << " size " << size << " create=" << (int)create << dendl;
+ int r = 0;
+ int flags = O_RDWR|O_CLOEXEC;
+ if (create)
+ flags |= O_CREAT;
+ if (epath.length()) {
+ r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to create " << name << " symlink to "
+ << epath << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
+ int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
+ if (fd < 0) {
+ r = -errno;
+ derr << __func__ << " failed to open " << epath << " file: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ // write the Transport ID of the NVMe device
+ // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
+ // where "0000:02:00.0" is the selector of a PCI device, see
+ // the first column of "lspci -mm -n -D"
+ string trid{"trtype:PCIe "};
+ trid += "traddr:";
+ trid += epath.substr(strlen(SPDK_PREFIX));
+ r = ::write(fd, trid.c_str(), trid.size());
+ ceph_assert(r == static_cast<int>(trid.size()));
+ dout(1) << __func__ << " created " << name << " symlink to "
+ << epath << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ }
+ if (size) {
+ int fd = ::openat(path_fd, name.c_str(), flags, 0644);
+ if (fd >= 0) {
+ // block file is present
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 &&
+ S_ISREG(st.st_mode) && // if it is a regular file
+ st.st_size == 0) { // and is 0 bytes
+ r = ::ftruncate(fd, size);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to resize " << name << " file to "
+ << size << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+
+ if (cct->_conf->bluestore_block_preallocate_file) {
+ r = ::ceph_posix_fallocate(fd, 0, size);
+ if (r > 0) {
+ derr << __func__ << " failed to prefallocate " << name << " file to "
+ << size << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return -r;
+ }
+ }
+ dout(1) << __func__ << " resized " << name << " file to "
+ << byte_u_t(size) << dendl;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ int r = -errno;
+ if (r != -ENOENT) {
+ derr << __func__ << " failed to open " << name << " file: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+int BlueStore::mkfs()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+ int r;
+ uuid_d old_fsid;
+
+ if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size "
+ << cct->_conf->osd_max_object_size << " > bluestore max "
+ << OBJECT_MAX_SIZE << dendl;
+ return -EINVAL;
+ }
+
+ {
+ string done;
+ r = read_meta("mkfs_done", &done);
+ if (r == 0) {
+ dout(1) << __func__ << " already created" << dendl;
+ if (cct->_conf->bluestore_fsck_on_mkfs) {
+ r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+ if (r < 0) {
+ derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ if (r > 0) {
+ derr << __func__ << " fsck found " << r << " errors" << dendl;
+ r = -EIO;
+ }
+ }
+ return r; // idempotent
+ }
+ }
+
+ {
+ string type;
+ r = read_meta("type", &type);
+ if (r == 0) {
+ if (type != "bluestore") {
+ derr << __func__ << " expected bluestore, but type is " << type << dendl;
+ return -EIO;
+ }
+ } else {
+ r = write_meta("type", "bluestore");
+ if (r < 0)
+ return r;
+ }
+ }
+
+ freelist_type = "bitmap";
+
+ r = _open_path();
+ if (r < 0)
+ return r;
+
+ r = _open_fsid(true);
+ if (r < 0)
+ goto out_path_fd;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = _read_fsid(&old_fsid);
+ if (r < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __func__ << " generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+ }
+ // we'll write it later.
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __func__ << " on-disk fsid " << old_fsid
+ << " != provided " << fsid << dendl;
+ r = -EINVAL;
+ goto out_close_fsid;
+ }
+ fsid = old_fsid;
+ }
+
+ r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
+ cct->_conf->bluestore_block_size,
+ cct->_conf->bluestore_block_create);
+ if (r < 0)
+ goto out_close_fsid;
+ if (cct->_conf->bluestore_bluefs) {
+ r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
+ cct->_conf->bluestore_block_wal_size,
+ cct->_conf->bluestore_block_wal_create);
+ if (r < 0)
+ goto out_close_fsid;
+ r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
+ cct->_conf->bluestore_block_db_size,
+ cct->_conf->bluestore_block_db_create);
+ if (r < 0)
+ goto out_close_fsid;
+ }
+
+ r = _open_bdev(true);
+ if (r < 0)
+ goto out_close_fsid;
+
+ // choose min_alloc_size
+ if (cct->_conf->bluestore_min_alloc_size) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+ } else {
+ ceph_assert(bdev);
+ if (bdev->is_rotational()) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+ } else {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+ }
+ }
+ _validate_bdev();
+
+ // make sure min_alloc_size is power of 2 aligned.
+ if (!isp2(min_alloc_size)) {
+ derr << __func__ << " min_alloc_size 0x"
+ << std::hex << min_alloc_size << std::dec
+ << " is not power of 2 aligned!"
+ << dendl;
+ r = -EINVAL;
+ goto out_close_bdev;
+ }
+
+ r = _open_db(true);
+ if (r < 0)
+ goto out_close_bdev;
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ r = _open_fm(t);
+ if (r < 0)
+ goto out_close_db;
+ {
+ bufferlist bl;
+ encode((uint64_t)0, bl);
+ t->set(PREFIX_SUPER, "nid_max", bl);
+ t->set(PREFIX_SUPER, "blobid_max", bl);
+ }
+
+ {
+ bufferlist bl;
+ encode((uint64_t)min_alloc_size, bl);
+ t->set(PREFIX_SUPER, "min_alloc_size", bl);
+ }
+
+ ondisk_format = latest_ondisk_format;
+ _prepare_ondisk_format_super(t);
+ db->submit_transaction_sync(t);
+ }
+
+ r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
+ if (r < 0)
+ goto out_close_fm;
+
+ r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
+ if (r < 0)
+ goto out_close_fm;
+
+ if (fsid != old_fsid) {
+ r = _write_fsid();
+ if (r < 0) {
+ derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+ goto out_close_fm;
+ }
+ }
+
+ if (out_of_sync_fm.fetch_and(0)) {
+ _sync_bluefs_and_fm();
+ }
+
+ out_close_fm:
+ _close_fm();
+ out_close_db:
+ _close_db(false);
+ out_close_bdev:
+ _close_bdev();
+ out_close_fsid:
+ _close_fsid();
+ out_path_fd:
+ _close_path();
+
+ if (r == 0 &&
+ cct->_conf->bluestore_fsck_on_mkfs) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ r = -EIO;
+ }
+ }
+
+ if (r == 0) {
+ // indicate success by writing the 'mkfs_done' file
+ r = write_meta("mkfs_done", "yes");
+ }
+
+ if (r < 0) {
+ derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+ } else {
+ dout(0) << __func__ << " success" << dendl;
+ }
+ return r;
+}
+
+int BlueStore::_mount_for_bluefs()
+{
+ int r = _open_path();
+ ceph_assert(r == 0);
+ r = _open_fsid(false);
+ ceph_assert(r == 0);
+ r = _read_fsid(&fsid);
+ ceph_assert(r == 0);
+ r = _lock_fsid();
+ ceph_assert(r == 0);
+ r = _open_bluefs(false);
+ ceph_assert(r == 0);
+ return r;
+}
+
+void BlueStore::_umount_for_bluefs()
+{
+ _close_bluefs(false);
+ _close_fsid();
+ _close_path();
+}
+
+int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
+{
+ dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+ int r;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ r = _mount_for_bluefs();
+
+ int reserved = 0;
+ if (id == BlueFS::BDEV_NEWWAL) {
+ string p = path + "/block.wal";
+ r = _setup_block_symlink_or_file("block.wal", dev_path,
+ cct->_conf->bluestore_block_wal_size,
+ true);
+ ceph_assert(r == 0);
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
+ cct->_conf->bdev_enable_discard);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+ r = _check_or_set_bdev_label(
+ p,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+ "bluefs wal",
+ true);
+ ceph_assert(r == 0);
+ }
+
+ reserved = BDEV_LABEL_BLOCK_SIZE;
+ } else if (id == BlueFS::BDEV_NEWDB) {
+ string p = path + "/block.db";
+ r = _setup_block_symlink_or_file("block.db", dev_path,
+ cct->_conf->bluestore_block_db_size,
+ true);
+ ceph_assert(r == 0);
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
+ cct->_conf->bdev_enable_discard);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+ r = _check_or_set_bdev_label(
+ p,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+ "bluefs db",
+ true);
+ ceph_assert(r == 0);
+ }
+ reserved = SUPER_RESERVED;
+ }
+
+ bluefs->umount();
+ bluefs->mount();
+
+ bluefs->add_block_extent(
+ id,
+ reserved,
+ bluefs->get_block_device_size(id) - reserved, true);
+
+ r = bluefs->prepare_new_device(id);
+ ceph_assert(r == 0);
+
+ if (r < 0) {
+ derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+ } else {
+ dout(0) << __func__ << " success" << dendl;
+ }
+
+ _umount_for_bluefs();
+ return r;
+}
+
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
+ int id)
+{
+ dout(10) << __func__ << " id:" << id << dendl;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ int r = _mount_for_bluefs();
+
+ // require bluestore_bluefs_min_free to be free at target device!
+ uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
+ for(auto src_id : devs_source) {
+ used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
+ }
+ uint64_t target_free = bluefs->get_free(id);
+ if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
+ // will need to remount full BlueStore instance to allocate more space
+ _umount_for_bluefs();
+
+ r = mount();
+ ceph_assert(r == 0);
+ dout(1) << __func__
+ << " Allocating more space at slow device for BlueFS: +"
+ << used_space - target_free << " bytes" << dendl;
+ r = allocate_bluefs_freespace(
+ used_space - target_free,
+ used_space - target_free,
+ nullptr);
+
+ umount();
+ if (r != 0) {
+ derr << __func__
+ << " can't migrate, unable to allocate extra space: "
+ << used_space - target_free << " at target:" << id
+ << dendl;
+ return -ENOSPC;
+ }
+
+ r = _mount_for_bluefs();
+ ceph_assert(r == 0);
+ } else if (target_free < used_space) {
+ derr << __func__
+ << " can't migrate, free space at target: " << target_free
+ << " is less than required space: " << used_space
+ << dendl;
+ return -ENOSPC;
+ }
+ r = bluefs->device_migrate_to_existing(cct, devs_source, id);
+ if (r < 0) {
+ derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+ goto shutdown;
+ }
+
+ if (devs_source.count(BlueFS::BDEV_DB)) {
+ r = unlink(string(path + "/block.db").c_str());
+ ceph_assert(r == 0);
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ r = unlink(string(path + "/block.wal").c_str());
+ ceph_assert(r == 0);
+ }
+
+shutdown:
+ _umount_for_bluefs();
+ return r;
+}
+
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+ int id,
+ const string& dev_path)
+{
+ dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+ int r;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ r = _mount_for_bluefs();
+
+ int reserved = 0;
+ string link_db;
+ string link_wal;
+ if (devs_source.count(BlueFS::BDEV_DB) &&
+ bluefs_shared_bdev != BlueFS::BDEV_DB) {
+ link_db = path + "/block.db";
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ link_wal = path + "/block.wal";
+ }
+
+ size_t target_size;
+ string target_name;
+ if (id == BlueFS::BDEV_NEWWAL) {
+ target_name = "block.wal";
+ target_size = cct->_conf->bluestore_block_wal_size;
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
+ cct->_conf->bdev_enable_discard);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+ r = _check_or_set_bdev_label(
+ dev_path,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+ "bluefs wal",
+ true);
+ ceph_assert(r == 0);
+ }
+ reserved = BDEV_LABEL_BLOCK_SIZE;
+ } else if (id == BlueFS::BDEV_NEWDB) {
+ target_name = "block.db";
+ target_size = cct->_conf->bluestore_block_db_size;
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
+ cct->_conf->bdev_enable_discard);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+ r = _check_or_set_bdev_label(
+ dev_path,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+ "bluefs db",
+ true);
+ ceph_assert(r == 0);
+ }
+ reserved = SUPER_RESERVED;
+ }
+
+ bluefs->umount();
+ bluefs->mount();
+
+ bluefs->add_block_extent(
+ id, reserved, bluefs->get_block_device_size(id) - reserved);
+
+ r = bluefs->device_migrate_to_new(cct, devs_source, id);
+
+ if (r < 0) {
+ derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+ goto shutdown;
+ }
+
+ if (!link_db.empty()) {
+ r = unlink(link_db.c_str());
+ ceph_assert(r == 0);
+ }
+ if (!link_wal.empty()) {
+ r = unlink(link_wal.c_str());
+ ceph_assert(r == 0);
+ }
+ r = _setup_block_symlink_or_file(
+ target_name,
+ dev_path,
+ target_size,
+ true);
+ ceph_assert(r == 0);
+ dout(0) << __func__ << " success" << dendl;
+
+shutdown:
+ _umount_for_bluefs();
+ return r;
+}
+
+string BlueStore::get_device_path(unsigned id)
+{
+ string res;
+ if (id < BlueFS::MAX_BDEV) {
+ switch (id) {
+ case BlueFS::BDEV_WAL:
+ res = path + "/block.wal";
+ break;
+ case BlueFS::BDEV_DB:
+ if (id == bluefs_shared_bdev) {
+ res = path + "/block";
+ } else {
+ res = path + "/block.db";
+ }
+ break;
+ case BlueFS::BDEV_SLOW:
+ res = path + "/block";
+ break;
+ }
+ }
+ return res;
+}
+
+int BlueStore::expand_devices(ostream& out)
+{
+ int r = cold_open();
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ out << "Expanding DB/WAL..." << std::endl;
+ for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
+ if (devid == bluefs_shared_bdev ) {
+ continue;
+ }
+ uint64_t size = bluefs->get_block_device_size(devid);
+ if (size == 0) {
+ // no bdev
+ continue;
+ }
+
+ interval_set<uint64_t> before;
+ bluefs->get_block_extents(devid, &before);
+ ceph_assert(!before.empty());
+ uint64_t end = before.range_end();
+ if (end < size) {
+ out << devid
+ <<" : expanding " << " from 0x" << std::hex
+ << end << " to 0x" << size << std::dec << std::endl;
+ bluefs->add_block_extent(devid, end, size-end);
+ string p = get_device_path(devid);
+ const char* path = p.c_str();
+ if (path == nullptr) {
+ derr << devid
+ <<": can't find device path " << dendl;
+ continue;
+ }
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0) {
+ derr << "unable to read label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ continue;
+ }
+ label.size = size;
+ r = _write_bdev_label(cct, path, label);
+ if (r < 0) {
+ derr << "unable to write label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ continue;
+ }
+ out << devid
+ <<" : size label updated to " << size
+ << std::endl;
+ }
+ }
+ uint64_t size0 = fm->get_size();
+ uint64_t size = bdev->get_size();
+ cold_close();
+ if (size0 < size) {
+ out << "Expanding Main..." << std::endl;
+ int r = _mount(false);
+ ceph_assert(r == 0);
+
+ out << bluefs_shared_bdev
+ <<" : expanding " << " from 0x" << std::hex
+ << size0 << " to 0x" << size << std::dec << std::endl;
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ r = fm->expand(size, txn);
+ ceph_assert(r == 0);
+ db->submit_transaction_sync(txn);
+
+ // always reference to slow device here
+ string p = get_device_path(BlueFS::BDEV_SLOW);
+ ceph_assert(!p.empty());
+ const char* path = p.c_str();
+ bluestore_bdev_label_t label;
+ r = _read_bdev_label(cct, path, &label);
+ if (r < 0) {
+ derr << "unable to read label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ } else {
+ label.size = size;
+ r = _write_bdev_label(cct, path, label);
+ if (r < 0) {
+ derr << "unable to write label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ } else {
+ out << bluefs_shared_bdev
+ <<" : size label updated to " << size
+ << std::endl;
+ }
+ }
+ umount();
+ }
+ return r;
+}
+
+int BlueStore::dump_bluefs_sizes(ostream& out)
+{
+ int r = cold_open();
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ cold_close();
+ return r;
+}
+
+void BlueStore::set_cache_shards(unsigned num)
+{
+ dout(10) << __func__ << " " << num << dendl;
+ size_t old = cache_shards.size();
+ ceph_assert(num >= old);
+ cache_shards.resize(num);
+ for (unsigned i = old; i < num; ++i) {
+ cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
+ }
+}
+
+int BlueStore::_mount(bool kv_only, bool open_db)
+{
+ dout(1) << __func__ << " path " << path << dendl;
+
+ _kv_only = kv_only;
+
+ {
+ string type;
+ int r = read_meta("type", &type);
+ if (r < 0) {
+ derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (type != "bluestore") {
+ derr << __func__ << " expected bluestore, but type is " << type << dendl;
+ return -EIO;
+ }
+ }
+
+ if (cct->_conf->bluestore_fsck_on_mount) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ return -EIO;
+ }
+ }
+
+ if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size "
+ << cct->_conf->osd_max_object_size << " > bluestore max "
+ << OBJECT_MAX_SIZE << dendl;
+ return -EINVAL;
+ }
+
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+
+ if (open_db) {
+ r = _open_db_and_around(false);
+ } else {
+ // we can bypass db open exclusively in case of kv_only mode
+ ceph_assert(kv_only);
+ r = _open_db(false, true);
+ if (r < 0)
+ goto out_bdev;
+ }
+
+ if (kv_only)
+ return 0;
+
+ r = _upgrade_super();
+ if (r < 0) {
+ goto out_db;
+ }
+
+ r = _open_collections();
+ if (r < 0)
+ goto out_db;
+
+ r = _reload_logger();
+ if (r < 0)
+ goto out_coll;
+
+ _kv_start();
+
+ r = _deferred_replay();
+ if (r < 0)
+ goto out_stop;
+
+ mempool_thread.init();
+
+ if (!per_pool_stat_collection &&
+ cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
+ dout(1) << __func__ << " quick-fix on mount" << dendl;
+ _fsck_on_open(FSCK_SHALLOW, true);
+
+ //reread statfs
+ //FIXME minor: replace with actual open/close?
+ _open_statfs();
+
+ _check_legacy_statfs_alert();
+ }
+
+ mounted = true;
+ return 0;
+
+ out_stop:
+ _kv_stop();
+ out_coll:
+ _flush_cache();
+ out_db:
+ _close_db_and_around(false);
+ out_bdev:
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ return r;
+}
+
+int BlueStore::umount()
+{
+ ceph_assert(_kv_only || mounted);
+ dout(1) << __func__ << dendl;
+
+ _osr_drain_all();
+
+ mounted = false;
+ if (!_kv_only) {
+ mempool_thread.shutdown();
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ _flush_cache();
+ dout(20) << __func__ << " closing" << dendl;
+
+ }
+ _close_db_and_around(false);
+ _close_bdev();
+ _close_fsid();
+ _close_path();
+
+ if (cct->_conf->bluestore_fsck_on_umount) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+int BlueStore::cold_open()
+{
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+ r = _open_db_and_around(true);
+ if (r < 0) {
+ goto out_bdev;
+ }
+ return 0;
+ out_bdev:
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ return r;
+}
+int BlueStore::cold_close()
+{
+ _close_db_and_around(true);
+ _close_bdev();
+ _close_fsid();
+ _close_path();
+ return 0;
+}
+
+static void apply(uint64_t off,
+ uint64_t len,
+ uint64_t granularity,
+ BlueStore::mempool_dynamic_bitset &bitset,
+ std::function<void(uint64_t,
+ BlueStore::mempool_dynamic_bitset &)> f) {
+ auto end = round_up_to(off + len, granularity);
+ while (off < end) {
+ uint64_t pos = off / granularity;
+ f(pos, bitset);
+ off += granularity;
+ }
+}
+
+int _fsck_sum_extents(
+ const PExtentVector& extents,
+ bool compressed,
+ store_statfs_t& expected_statfs)
+{
+ for (auto e : extents) {
+ if (!e.is_valid())
+ continue;
+ expected_statfs.allocated += e.length;
+ if (compressed) {
+ expected_statfs.data_compressed_allocated += e.length;
+ }
+ }
+ return 0;
+}
+
+int BlueStore::_fsck_check_extents(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const PExtentVector& extents,
+ bool compressed,
+ mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
+ BlueStoreRepairer* repairer,
+ store_statfs_t& expected_statfs,
+ FSCKDepth depth)
+{
+ dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
+ int errors = 0;
+ for (auto e : extents) {
+ if (!e.is_valid())
+ continue;
+ expected_statfs.allocated += e.length;
+ if (compressed) {
+ expected_statfs.data_compressed_allocated += e.length;
+ }
+ if (depth != FSCK_SHALLOW) {
+ bool already = false;
+ apply(
+ e.offset, e.length, granularity, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ if (bs.test(pos)) {
+ if (repairer) {
+ repairer->note_misreference(
+ pos * min_alloc_size, min_alloc_size, !already);
+ }
+ if (!already) {
+ derr << "fsck error: " << oid << " extent " << e
+ << " or a subset is already allocated (misreferenced)" << dendl;
+ ++errors;
+ already = true;
+ }
+ }
+ else
+ bs.set(pos);
+ });
+ if (repairer) {
+ repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
+ }
+
+ if (e.end() > bdev->get_size()) {
+ derr << "fsck error: " << oid << " extent " << e
+ << " past end of block device" << dendl;
+ ++errors;
+ }
+ }
+ }
+ return errors;
+}
+
+void BlueStore::_fsck_check_pool_statfs(
+ BlueStore::per_pool_statfs& expected_pool_statfs,
+ int64_t& errors,
+ int64_t& warnings,
+ BlueStoreRepairer* repairer)
+{
+ auto it = db->get_iterator(PREFIX_STAT);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
+ if (repairer) {
+ ++errors;
+ repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
+ derr << "fsck error: " << "legacy statfs record found, removing"
+ << dendl;
+ }
+ continue;
+ }
+ uint64_t pool_id;
+ if (get_key_pool_stat(key, &pool_id) < 0) {
+ derr << "fsck error: bad key " << key
+ << "in statfs namespece" << dendl;
+ if (repairer) {
+ repairer->remove_key(db, PREFIX_STAT, key);
+ }
+ ++errors;
+ continue;
+ }
+
+ volatile_statfs vstatfs;
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ vstatfs.decode(blp);
+ } catch (buffer::error& e) {
+ derr << "fsck error: failed to decode Pool StatFS record"
+ << pretty_binary_string(key) << dendl;
+ if (repairer) {
+ dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
+ << pretty_binary_string(key)
+ << "', removing" << dendl;
+ repairer->remove_key(db, PREFIX_STAT, key);
+ }
+ ++errors;
+ vstatfs.reset();
+ }
+ auto stat_it = expected_pool_statfs.find(pool_id);
+ if (stat_it == expected_pool_statfs.end()) {
+ if (vstatfs.is_empty()) {
+ // we don't consider that as an error since empty pool statfs
+ // are left in DB for now
+ dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
+ << std::hex << pool_id << std::dec << dendl;
+ if (repairer) {
+ // but we need to increment error count in case of repair
+ // to have proper counters at the end
+ // (as repairer increments recovery counter anyway).
+ ++errors;
+ }
+ } else {
+ derr << "fsck error: found stray Pool StatFS record for pool id 0x"
+ << std::hex << pool_id << std::dec << dendl;
+ ++errors;
+ }
+ if (repairer) {
+ repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ continue;
+ }
+ store_statfs_t statfs;
+ vstatfs.publish(&statfs);
+ if (!(stat_it->second == statfs)) {
+ derr << "fsck error: actual " << statfs
+ << " != expected " << stat_it->second
+ << " for pool "
+ << std::hex << pool_id << std::dec << dendl;
+ if (repairer) {
+ repairer->fix_statfs(db, key, stat_it->second);
+ }
+ ++errors;
+ }
+ expected_pool_statfs.erase(stat_it);
+ }
+ } // if (it)
+ for (auto& s : expected_pool_statfs) {
+ if (s.second.is_zero()) {
+ // we might lack empty statfs recs in DB
+ continue;
+ }
+ derr << "fsck error: missing Pool StatFS record for pool "
+ << std::hex << s.first << std::dec << dendl;
+ if (repairer) {
+ string key;
+ get_pool_stat_key(s.first, &key);
+ repairer->fix_statfs(db, key, s.second);
+ }
+ ++errors;
+ }
+ if (!per_pool_stat_collection &&
+ cct->_conf->bluestore_fsck_error_on_no_per_pool_stats &&
+ repairer) {
+ // by virtue of running this method, we correct the top-level
+ // error of having global stats
+ repairer->inc_repaired();
+ }
+}
+
+BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
+ BlueStore::FSCKDepth depth,
+ int64_t pool_id,
+ BlueStore::CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value,
+ mempool::bluestore_fsck::list<string>& expecting_shards,
+ map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+ const BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto& num_objects = ctx.num_objects;
+ auto& num_extents = ctx.num_extents;
+ auto& num_blobs = ctx.num_blobs;
+ auto& num_sharded_objects = ctx.num_sharded_objects;
+ auto& num_spanning_blobs = ctx.num_spanning_blobs;
+ auto used_blocks = ctx.used_blocks;
+ auto sb_info_lock = ctx.sb_info_lock;
+ auto& sb_info = ctx.sb_info;
+ auto repairer = ctx.repairer;
+
+ store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
+ &ctx.expected_pool_statfs[pool_id] :
+ &ctx.expected_store_statfs;
+
+ dout(10) << __func__ << " " << oid << dendl;
+ OnodeRef o;
+ o.reset(Onode::decode(c, oid, key, value));
+ ++num_objects;
+
+ num_spanning_blobs += o->extent_map.spanning_blob_map.size();
+
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ _dump_onode<30>(cct, *o);
+ // shards
+ if (!o->extent_map.shards.empty()) {
+ ++num_sharded_objects;
+ if (depth != FSCK_SHALLOW) {
+ for (auto& s : o->extent_map.shards) {
+ dout(20) << __func__ << " shard " << *s.shard_info << dendl;
+ expecting_shards.push_back(string());
+ get_extent_shard_key(o->key, s.shard_info->offset,
+ &expecting_shards.back());
+ if (s.shard_info->offset >= o->onode.size) {
+ derr << "fsck error: " << oid << " shard 0x" << std::hex
+ << s.shard_info->offset << " past EOF at 0x" << o->onode.size
+ << std::dec << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+
+ // lextents
+ uint64_t pos = 0;
+ mempool::bluestore_fsck::map<BlobRef,
+ bluestore_blob_use_tracker_t> ref_map;
+ for (auto& l : o->extent_map.extent_map) {
+ dout(20) << __func__ << " " << l << dendl;
+ if (l.logical_offset < pos) {
+ derr << "fsck error: " << oid << " lextent at 0x"
+ << std::hex << l.logical_offset
+ << " overlaps with the previous, which ends at 0x" << pos
+ << std::dec << dendl;
+ ++errors;
+ }
+ if (depth != FSCK_SHALLOW &&
+ o->extent_map.spans_shard(l.logical_offset, l.length)) {
+ derr << "fsck error: " << oid << " lextent at 0x"
+ << std::hex << l.logical_offset << "~" << l.length
+ << " spans a shard boundary"
+ << std::dec << dendl;
+ ++errors;
+ }
+ pos = l.logical_offset + l.length;
+ res_statfs->data_stored += l.length;
+ ceph_assert(l.blob);
+ const bluestore_blob_t& blob = l.blob->get_blob();
+
+ auto& ref = ref_map[l.blob];
+ if (ref.is_empty()) {
+ uint32_t min_release_size = blob.get_release_size(min_alloc_size);
+ uint32_t l = blob.get_logical_length();
+ ref.init(l, min_release_size);
+ }
+ ref.get(
+ l.blob_offset,
+ l.length);
+ ++num_extents;
+ if (depth != FSCK_SHALLOW &&
+ blob.has_unused()) {
+ ceph_assert(referenced);
+ auto p = referenced->find(l.blob);
+ bluestore_blob_t::unused_t* pu;
+ if (p == referenced->end()) {
+ pu = &(*referenced)[l.blob];
+ }
+ else {
+ pu = &p->second;
+ }
+ uint64_t blob_len = blob.get_logical_length();
+ ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
+ ceph_assert(l.blob_offset + l.length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
+ uint64_t start = l.blob_offset / chunk_size;
+ uint64_t end =
+ round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ (*pu) |= (1u << i);
+ }
+ }
+ } //for (auto& l : o->extent_map.extent_map)
+
+ for (auto& i : ref_map) {
+ ++num_blobs;
+ const bluestore_blob_t& blob = i.first->get_blob();
+ bool equal =
+ depth == FSCK_SHALLOW ? true :
+ i.first->get_blob_use_tracker().equal(i.second);
+ if (!equal) {
+ derr << "fsck error: " << oid << " blob " << *i.first
+ << " doesn't match expected ref_map " << i.second << dendl;
+ ++errors;
+ }
+ if (blob.is_compressed()) {
+ res_statfs->data_compressed += blob.get_compressed_payload_length();
+ res_statfs->data_compressed_original +=
+ i.first->get_referenced_bytes();
+ }
+ if (blob.is_shared()) {
+ if (i.first->shared_blob->get_sbid() > blobid_max) {
+ derr << "fsck error: " << oid << " blob " << blob
+ << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
+ << blobid_max << dendl;
+ ++errors;
+ }
+ else if (i.first->shared_blob->get_sbid() == 0) {
+ derr << "fsck error: " << oid << " blob " << blob
+ << " marked as shared but has uninitialized sbid"
+ << dendl;
+ ++errors;
+ }
+ // the below lock is optional and provided in multithreading mode only
+ if (sb_info_lock) {
+ sb_info_lock->lock();
+ }
+ sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
+ ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
+ ceph_assert(sbi.pool_id == INT64_MIN ||
+ sbi.pool_id == oid.hobj.get_logical_pool());
+ sbi.cid = c->cid;
+ sbi.pool_id = oid.hobj.get_logical_pool();
+ sbi.sb = i.first->shared_blob;
+ sbi.oids.push_back(oid);
+ sbi.compressed = blob.is_compressed();
+ for (auto e : blob.get_extents()) {
+ if (e.is_valid()) {
+ sbi.ref_map.get(e.offset, e.length);
+ }
+ }
+ if (sb_info_lock) {
+ sb_info_lock->unlock();
+ }
+ } else if (depth != FSCK_SHALLOW) {
+ ceph_assert(used_blocks);
+ errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
+ blob.is_compressed(),
+ *used_blocks,
+ fm->get_alloc_size(),
+ repairer,
+ *res_statfs,
+ depth);
+ } else {
+ errors += _fsck_sum_extents(
+ blob.get_extents(),
+ blob.is_compressed(),
+ *res_statfs);
+ }
+ } // for (auto& i : ref_map)
+
+ {
+ auto &sbm = o->extent_map.spanning_blob_map;
+ size_t broken = 0;
+ BlobRef first_broken;
+ for (auto it = sbm.begin(); it != sbm.end();) {
+ auto it1 = it++;
+ if (ref_map.count(it1->second) == 0) {
+ if (!broken) {
+ first_broken = it1->second;
+ ++errors;
+ }
+ broken++;
+ if (repairer) {
+ sbm.erase(it1);
+ }
+ }
+ }
+ if (broken) {
+ derr << "fsck error: " << oid << " - " << broken
+ << " zombie spanning blob(s) found, the first one: "
+ << *first_broken << dendl;
+ if(repairer) {
+ auto txn = repairer->fix_spanning_blobs(db);
+ _record_onode(o, txn);
+ }
+ }
+ }
+
+ return o;
+}
+
+#include "common/WorkQueue.h"
+
+class ShallowFSCKThreadPool : public ThreadPool
+{
+public:
+ ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
+ ThreadPool(cct_, nm, tn, n) {
+ }
+ void worker(ThreadPool::WorkThread* wt) override {
+ int next_wq = 0;
+ while (!_stop) {
+ next_wq %= work_queues.size();
+ WorkQueue_ *wq = work_queues[next_wq++];
+
+ void* item = wq->_void_dequeue();
+ if (item) {
+ processing++;
+ TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
+ wq->_void_process(item, tp_handle);
+ processing--;
+ }
+ }
+ }
+ template <size_t BatchLen>
+ struct FSCKWorkQueue : public ThreadPool::WorkQueue_
+ {
+ struct Entry {
+ int64_t pool_id;
+ BlueStore::CollectionRef c;
+ ghobject_t oid;
+ string key;
+ bufferlist value;
+ };
+ struct Batch {
+ std::atomic<size_t> running = { 0 };
+ size_t entry_count = 0;
+ std::array<Entry, BatchLen> entries;
+
+ int64_t errors = 0;
+ int64_t warnings = 0;
+ uint64_t num_objects = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_sharded_objects = 0;
+ uint64_t num_spanning_blobs = 0;
+ store_statfs_t expected_store_statfs;
+ BlueStore::per_pool_statfs expected_pool_statfs;
+ };
+
+ size_t batchCount;
+ BlueStore* store = nullptr;
+
+ mempool::bluestore_fsck::list<string>* expecting_shards = nullptr;
+ ceph::mutex* sb_info_lock = nullptr;
+ BlueStore::sb_info_map_t* sb_info = nullptr;
+ BlueStoreRepairer* repairer = nullptr;
+
+ Batch* batches = nullptr;
+ size_t last_batch_pos = 0;
+ bool batch_acquired = false;
+
+ FSCKWorkQueue(std::string n,
+ size_t _batchCount,
+ BlueStore* _store,
+ mempool::bluestore_fsck::list<string>& _expecting_shards,
+ ceph::mutex* _sb_info_lock,
+ BlueStore::sb_info_map_t& _sb_info,
+ BlueStoreRepairer* _repairer) :
+ WorkQueue_(n, time_t(), time_t()),
+ batchCount(_batchCount),
+ store(_store),
+ expecting_shards(&_expecting_shards),
+ sb_info_lock(_sb_info_lock),
+ sb_info(&_sb_info),
+ repairer(_repairer)
+ {
+ batches = new Batch[batchCount];
+ }
+ ~FSCKWorkQueue() {
+ delete[] batches;
+ }
+
+ /// Remove all work items from the queue.
+ void _clear() override {
+ //do nothing
+ }
+ /// Check whether there is anything to do.
+ bool _empty() override {
+ ceph_assert(false);
+ }
+
+ /// Get the next work item to process.
+ void* _void_dequeue() override {
+ size_t pos = rand() % batchCount;
+ size_t pos0 = pos;
+ do {
+ auto& batch = batches[pos];
+ if (batch.running.fetch_add(1) == 0) {
+ if (batch.entry_count) {
+ return &batch;
+ }
+ }
+ batch.running--;
+ pos++;
+ pos %= batchCount;
+ } while (pos != pos0);
+ return nullptr;
+ }
+ /** @brief Process the work item.
+ * This function will be called several times in parallel
+ * and must therefore be thread-safe. */
+ void _void_process(void* item, TPHandle& handle) override {
+ Batch* batch = (Batch*)item;
+
+ BlueStore::FSCK_ObjectCtx ctx(
+ batch->errors,
+ batch->warnings,
+ batch->num_objects,
+ batch->num_extents,
+ batch->num_blobs,
+ batch->num_sharded_objects,
+ batch->num_spanning_blobs,
+ nullptr, // used_blocks
+ nullptr, // used_omap_head;
+ nullptr, // used_per_pool_omap_head;
+ nullptr, // used_pgmeta_omap_head;
+ sb_info_lock,
+ *sb_info,
+ batch->expected_store_statfs,
+ batch->expected_pool_statfs,
+ repairer);
+
+ for (size_t i = 0; i < batch->entry_count; i++) {
+ auto& entry = batch->entries[i];
+
+ store->fsck_check_objects_shallow(
+ BlueStore::FSCK_SHALLOW,
+ entry.pool_id,
+ entry.c,
+ entry.oid,
+ entry.key,
+ entry.value,
+ *expecting_shards,
+ nullptr, // referenced
+ ctx);
+ }
+ //std::cout << "processed " << batch << std::endl;
+ batch->entry_count = 0;
+ batch->running--;
+ }
+ /** @brief Synchronously finish processing a work item.
+ * This function is called after _void_process with the global thread pool lock held,
+ * so at most one copy will execute simultaneously for a given thread pool.
+ * It can be used for non-thread-safe finalization. */
+ void _void_process_finish(void*) override {
+ ceph_assert(false);
+ }
+
+ bool queue(
+ int64_t pool_id,
+ BlueStore::CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value) {
+ bool res = false;
+ size_t pos0 = last_batch_pos;
+ if (!batch_acquired) {
+ do {
+ auto& batch = batches[last_batch_pos];
+ if (batch.running.fetch_add(1) == 0) {
+ if (batch.entry_count < BatchLen) {
+ batch_acquired = true;
+ break;
+ }
+ }
+ batch.running.fetch_sub(1);
+ last_batch_pos++;
+ last_batch_pos %= batchCount;
+ } while (last_batch_pos != pos0);
+ }
+ if (batch_acquired) {
+ auto& batch = batches[last_batch_pos];
+ ceph_assert(batch.running);
+ ceph_assert(batch.entry_count < BatchLen);
+
+ auto& entry = batch.entries[batch.entry_count];
+ entry.pool_id = pool_id;
+ entry.c = c;
+ entry.oid = oid;
+ entry.key = key;
+ entry.value = value;
+
+ ++batch.entry_count;
+ if (batch.entry_count == BatchLen) {
+ batch_acquired = false;
+ batch.running.fetch_sub(1);
+ last_batch_pos++;
+ last_batch_pos %= batchCount;
+ }
+ res = true;
+ }
+ return res;
+ }
+
+ void finalize(ThreadPool& tp,
+ BlueStore::FSCK_ObjectCtx& ctx) {
+ if (batch_acquired) {
+ auto& batch = batches[last_batch_pos];
+ ceph_assert(batch.running);
+ batch.running.fetch_sub(1);
+ }
+ tp.stop();
+
+ for (size_t i = 0; i < batchCount; i++) {
+ auto& batch = batches[i];
+
+ //process leftovers if any
+ if (batch.entry_count) {
+ TPHandle tp_handle(store->cct,
+ nullptr,
+ timeout_interval,
+ suicide_interval);
+ ceph_assert(batch.running == 0);
+
+ batch.running++; // just to be on-par with the regular call
+ _void_process(&batch, tp_handle);
+ }
+ ceph_assert(batch.entry_count == 0);
+
+ ctx.errors += batch.errors;
+ ctx.warnings += batch.warnings;
+ ctx.num_objects += batch.num_objects;
+ ctx.num_extents += batch.num_extents;
+ ctx.num_blobs += batch.num_blobs;
+ ctx.num_sharded_objects += batch.num_sharded_objects;
+ ctx.num_spanning_blobs += batch.num_spanning_blobs;
+ ctx.expected_store_statfs.add(batch.expected_store_statfs);
+
+ for (auto it = batch.expected_pool_statfs.begin();
+ it != batch.expected_pool_statfs.end();
+ it++) {
+ ctx.expected_pool_statfs[it->first].add(it->second);
+ }
+ }
+ }
+ };
+};
+
+void BlueStore::_fsck_check_objects(FSCKDepth depth,
+ BlueStore::FSCK_ObjectCtx& ctx)
+{
+ //no need for the below lock when in non-shallow mode as
+ // there is no multithreading in this case
+ if (depth != FSCK_SHALLOW) {
+ ctx.sb_info_lock = nullptr;
+ }
+
+ auto& errors = ctx.errors;
+ auto used_omap_head = ctx.used_omap_head;
+ auto used_pgmeta_omap_head = ctx.used_pgmeta_omap_head;
+ auto sb_info_lock = ctx.sb_info_lock;
+ auto& sb_info = ctx.sb_info;
+ auto repairer = ctx.repairer;
+
+ uint64_t_btree_t used_nids;
+
+ size_t processed_myself = 0;
+
+ auto it = db->get_iterator(PREFIX_OBJ);
+ mempool::bluestore_fsck::list<string> expecting_shards;
+ if (it) {
+ const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
+ typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
+ std::unique_ptr<WQ> wq(
+ new WQ(
+ "FSCKWorkQueue",
+ (thread_count ? : 1) * 32,
+ this,
+ expecting_shards,
+ sb_info_lock,
+ sb_info,
+ repairer));
+
+ ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
+
+ thread_pool.add_work_queue(wq.get());
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ //not the best place but let's check anyway
+ ceph_assert(sb_info_lock);
+ thread_pool.start();
+ }
+
+ //fill global if not overriden below
+ CollectionRef c;
+ int64_t pool_id = -1;
+ spg_t pgid;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ if (depth == FSCK_SHALLOW) {
+ continue;
+ }
+ while (!expecting_shards.empty() &&
+ expecting_shards.front() < it->key()) {
+ derr << "fsck error: missing shard key "
+ << pretty_binary_string(expecting_shards.front())
+ << dendl;
+ ++errors;
+ expecting_shards.pop_front();
+ }
+ if (!expecting_shards.empty() &&
+ expecting_shards.front() == it->key()) {
+ // all good
+ expecting_shards.pop_front();
+ continue;
+ }
+
+ uint32_t offset;
+ string okey;
+ get_key_extent_shard(it->key(), &okey, &offset);
+ derr << "fsck error: stray shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ if (expecting_shards.empty()) {
+ derr << "fsck error: " << pretty_binary_string(it->key())
+ << " is unexpected" << dendl;
+ ++errors;
+ continue;
+ }
+ while (expecting_shards.front() > it->key()) {
+ derr << "fsck error: saw " << pretty_binary_string(it->key())
+ << dendl;
+ derr << "fsck error: exp "
+ << pretty_binary_string(expecting_shards.front()) << dendl;
+ ++errors;
+ expecting_shards.pop_front();
+ if (expecting_shards.empty()) {
+ break;
+ }
+ }
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0) {
+ derr << "fsck error: bad object key "
+ << pretty_binary_string(it->key()) << dendl;
+ ++errors;
+ continue;
+ }
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ derr << "fsck error: stray object " << oid
+ << " not owned by any collection" << dendl;
+ ++errors;
+ continue;
+ }
+ pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+ dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+ << dendl;
+ }
+
+ if (depth != FSCK_SHALLOW &&
+ !expecting_shards.empty()) {
+ for (auto& k : expecting_shards) {
+ derr << "fsck error: missing shard key "
+ << pretty_binary_string(k) << dendl;
+ }
+ ++errors;
+ expecting_shards.clear();
+ }
+
+ bool queued = false;
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ queued = wq->queue(
+ pool_id,
+ c,
+ oid,
+ it->key(),
+ it->value());
+ }
+ OnodeRef o;
+ map<BlobRef, bluestore_blob_t::unused_t> referenced;
+
+ if (!queued) {
+ ++processed_myself;
+
+ o = fsck_check_objects_shallow(
+ depth,
+ pool_id,
+ c,
+ oid,
+ it->key(),
+ it->value(),
+ expecting_shards,
+ &referenced,
+ ctx);
+ }
+
+ if (depth != FSCK_SHALLOW) {
+ ceph_assert(o != nullptr);
+ if (o->onode.nid) {
+ if (o->onode.nid > nid_max) {
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
+ << " > nid_max " << nid_max << dendl;
+ ++errors;
+ }
+ if (used_nids.count(o->onode.nid)) {
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
+ << " already in use" << dendl;
+ ++errors;
+ continue; // go for next object
+ }
+ used_nids.insert(o->onode.nid);
+ }
+ for (auto& i : referenced) {
+ dout(20) << __func__ << " referenced 0x" << std::hex << i.second
+ << std::dec << " for " << *i.first << dendl;
+ const bluestore_blob_t& blob = i.first->get_blob();
+ if (i.second & blob.unused) {
+ derr << "fsck error: " << oid << " blob claims unused 0x"
+ << std::hex << blob.unused
+ << " but extents reference 0x" << i.second << std::dec
+ << " on blob " << *i.first << dendl;
+ ++errors;
+ }
+ if (blob.has_csum()) {
+ uint64_t blob_len = blob.get_logical_length();
+ uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
+ unsigned csum_count = blob.get_csum_count();
+ unsigned csum_chunk_size = blob.get_csum_chunk_size();
+ for (unsigned p = 0; p < csum_count; ++p) {
+ unsigned pos = p * csum_chunk_size;
+ unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
+ unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
+ unsigned mask = 1u << firstbit;
+ for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
+ mask |= 1u << b;
+ }
+ if ((blob.unused & mask) == mask) {
+ // this csum chunk region is marked unused
+ if (blob.get_csum_item(p) != 0) {
+ derr << "fsck error: " << oid
+ << " blob claims csum chunk 0x" << std::hex << pos
+ << "~" << csum_chunk_size
+ << " is unused (mask 0x" << mask << " of unused 0x"
+ << blob.unused << ") but csum is non-zero 0x"
+ << blob.get_csum_item(p) << std::dec << " on blob "
+ << *i.first << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+ }
+ // omap
+ if (o->onode.has_omap()) {
+ ceph_assert(used_omap_head);
+ ceph_assert(used_pgmeta_omap_head);
+ auto m =
+ o->onode.is_pgmeta_omap() ? used_pgmeta_omap_head : used_omap_head;
+ if (m->count(o->onode.nid)) {
+ derr << "fsck error: " << oid << " omap_head " << o->onode.nid
+ << " already in use" << dendl;
+ ++errors;
+ } else {
+ m->insert(o->onode.nid);
+ }
+ }
+ if (depth == FSCK_DEEP) {
+ bufferlist bl;
+ uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
+ uint64_t offset = 0;
+ do {
+ uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
+ int r = _do_read(c.get(), o, offset, l, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ if (r < 0) {
+ ++errors;
+ derr << "fsck error: " << oid << std::hex
+ << " error during read: "
+ << " " << offset << "~" << l
+ << " " << cpp_strerror(r) << std::dec
+ << dendl;
+ break;
+ }
+ offset += l;
+ } while (offset < o->onode.size);
+ } // deep
+ } //if (depth != FSCK_SHALLOW)
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ wq->finalize(thread_pool, ctx);
+ if (processed_myself) {
+ // may be needs more threads?
+ dout(0) << __func__ << " partial offload"
+ << ", done myself " << processed_myself
+ << " of " << ctx.num_objects
+ << "objects, threads " << thread_count
+ << dendl;
+ }
+ }
+ } // if (it)
+}
+/**
+An overview for currently implemented repair logics
+performed in fsck in two stages: detection(+preparation) and commit.
+Detection stage (in processing order):
+ (Issue -> Repair action to schedule)
+ - Detect undecodable keys for Shared Blobs -> Remove
+ - Detect undecodable records for Shared Blobs -> Remove
+ (might trigger missed Shared Blob detection below)
+ - Detect stray records for Shared Blobs -> Remove
+ - Detect misreferenced pextents -> Fix
+ Prepare Bloom-like filter to track cid/oid -> pextent
+ Prepare list of extents that are improperly referenced
+ Enumerate Onode records that might use 'misreferenced' pextents
+ (Bloom-like filter applied to reduce computation)
+ Per each questinable Onode enumerate all blobs and identify broken ones
+ (i.e. blobs having 'misreferences')
+ Rewrite each broken blob data by allocating another extents and
+ copying data there
+ If blob is shared - unshare it and mark corresponding Shared Blob
+ for removal
+ Release previously allocated space
+ Update Extent Map
+ - Detect missed Shared Blobs -> Recreate
+ - Detect undecodable deferred transaction -> Remove
+ - Detect Freelist Manager's 'false free' entries -> Mark as used
+ - Detect Freelist Manager's leaked entries -> Mark as free
+ - Detect statfs inconsistency - Update
+ Commit stage (separate DB commit per each step):
+ - Apply leaked FM entries fix
+ - Apply 'false free' FM entries fix
+ - Apply 'Remove' actions
+ - Apply fix for misreference pextents
+ - Apply Shared Blob recreate
+ (can be merged with the step above if misreferences were dectected)
+ - Apply StatFS update
+*/
+int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
+{
+ dout(1) << __func__
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << dendl;
+
+ // in deep mode we need R/W write access to be able to replay deferred ops
+ bool read_only = !(repair || depth == FSCK_DEEP);
+
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_db_and_around(read_only);
+ if (r < 0)
+ goto out_bdev;
+
+ if (!read_only) {
+ r = _upgrade_super();
+ if (r < 0) {
+ goto out_db;
+ }
+ }
+
+ r = _open_collections();
+ if (r < 0)
+ goto out_db;
+
+ mempool_thread.init();
+
+ // we need finisher and kv_{sync,finalize}_thread *just* for replay
+ // enable in repair or deep mode modes only
+ if (!read_only) {
+ _kv_start();
+ r = _deferred_replay();
+ _kv_stop();
+ }
+ if (r < 0)
+ goto out_scan;
+
+ r = _fsck_on_open(depth, repair);
+
+out_scan:
+ mempool_thread.shutdown();
+ _flush_cache();
+out_db:
+ _close_db_and_around(false);
+out_bdev:
+ _close_bdev();
+out_fsid:
+ _close_fsid();
+out_path:
+ _close_path();
+
+ return r;
+}
+
+int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
+{
+ dout(1) << __func__
+ << " <<<START>>>"
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << " start" << dendl;
+ int64_t errors = 0;
+ int64_t warnings = 0;
+ unsigned repaired = 0;
+
+ uint64_t_btree_t used_omap_head;
+ uint64_t_btree_t used_per_pool_omap_head;
+ uint64_t_btree_t used_pgmeta_omap_head;
+ uint64_t_btree_t used_sbids;
+
+ mempool_dynamic_bitset used_blocks;
+ KeyValueDB::Iterator it;
+ store_statfs_t expected_store_statfs, actual_statfs;
+ per_pool_statfs expected_pool_statfs;
+
+ sb_info_map_t sb_info;
+
+ uint64_t num_objects = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_spanning_blobs = 0;
+ uint64_t num_shared_blobs = 0;
+ uint64_t num_sharded_objects = 0;
+ BlueStoreRepairer repairer;
+
+ utime_t start = ceph_clock_now();
+
+ _fsck_collections(&errors);
+ used_blocks.resize(fm->get_alloc_units());
+ apply(
+ 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ bs.set(pos);
+ }
+ );
+ if (repair) {
+ repairer.get_space_usage_tracker().init(
+ bdev->get_size(),
+ min_alloc_size);
+ }
+
+ if (bluefs) {
+ if( cct->_conf->bluestore_bluefs_db_compatibility) {
+ interval_set<uint64_t> bluefs_extents_db;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "bluefs_extents", &bl);
+ auto p = bl.cbegin();
+ auto prev_errors = errors;
+ try {
+ decode(bluefs_extents_db, p);
+ bluefs_extents_db.union_of(bluefs_extents);
+ bluefs_extents_db.subtract(bluefs_extents);
+ if (!bluefs_extents_db.empty()) {
+ derr << "fsck error: bluefs_extents inconsistency, "
+ << "downgrade to previous releases might be broken."
+ << dendl;
+ ++errors;
+ }
+ }
+ catch (buffer::error& e) {
+ derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
+ ++errors;
+ }
+ if (errors != prev_errors && repair) {
+ repairer.fix_bluefs_extents(out_of_sync_fm);
+ }
+ }
+
+ for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
+ apply(
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ bs.set(pos);
+ });
+ }
+ int r = bluefs->fsck();
+ if (r < 0) {
+ return r;
+ }
+ if (r > 0)
+ errors += r;
+ }
+
+ if (!per_pool_stat_collection) {
+ const char *w;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
+ w = "error";
+ ++errors;
+ } else {
+ w = "warning";
+ ++warnings;
+ }
+ derr << "fsck " << w << ": store not yet converted to per-pool stats"
+ << dendl;
+ }
+ // get expected statfs; reset unaffected fields to be able to compare
+ // structs
+ statfs(&actual_statfs);
+ actual_statfs.total = 0;
+ actual_statfs.internally_reserved = 0;
+ actual_statfs.available = 0;
+ actual_statfs.internal_metadata = 0;
+ actual_statfs.omap_allocated = 0;
+
+ if (g_conf()->bluestore_debug_fsck_abort) {
+ dout(1) << __func__ << " debug abort" << dendl;
+ goto out_scan;
+ }
+ // walk PREFIX_OBJ
+ {
+ dout(1) << __func__ << " walking object keyspace" << dendl;
+ ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
+ BlueStore::FSCK_ObjectCtx ctx(
+ errors,
+ warnings,
+ num_objects,
+ num_extents,
+ num_blobs,
+ num_sharded_objects,
+ num_spanning_blobs,
+ &used_blocks,
+ &used_omap_head,
+ nullptr,
+ &used_pgmeta_omap_head,
+ &sb_info_lock,
+ sb_info,
+ expected_store_statfs,
+ expected_pool_statfs,
+ repair ? &repairer : nullptr);
+ _fsck_check_objects(depth,
+ ctx);
+ }
+
+ dout(1) << __func__ << " checking shared_blobs" << dendl;
+ it = db->get_iterator(PREFIX_SHARED_BLOB);
+ if (it) {
+ // FIXME minor: perhaps simplify for shallow mode?
+ // fill global if not overriden below
+ auto expected_statfs = &expected_store_statfs;
+
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ uint64_t sbid;
+ if (get_key_shared_blob(key, &sbid)) {
+ derr << "fsck error: bad key '" << key
+ << "' in shared blob namespace" << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ ++errors;
+ continue;
+ }
+ auto p = sb_info.find(sbid);
+ if (p == sb_info.end()) {
+ derr << "fsck error: found stray shared blob data for sbid 0x"
+ << std::hex << sbid << std::dec << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ ++errors;
+ } else {
+ ++num_shared_blobs;
+ sb_info_t& sbi = p->second;
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ } catch (buffer::error& e) {
+ ++errors;
+ // Force update and don't report as missing
+ sbi.updated = sbi.passed = true;
+
+ derr << "fsck error: failed to decode Shared Blob"
+ << pretty_binary_string(it->key()) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable Shared Blob, key:'"
+ << pretty_binary_string(it->key())
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+ }
+ continue;
+ }
+ dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
+ if (shared_blob.ref_map != sbi.ref_map) {
+ derr << "fsck error: shared blob 0x" << std::hex << sbid
+ << std::dec << " ref_map " << shared_blob.ref_map
+ << " != expected " << sbi.ref_map << dendl;
+ sbi.updated = true; // will update later in repair mode only!
+ ++errors;
+ }
+ PExtentVector extents;
+ for (auto &r : shared_blob.ref_map.ref_map) {
+ extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
+ }
+ if (per_pool_stat_collection || repair) {
+ expected_statfs = &expected_pool_statfs[sbi.pool_id];
+ }
+ errors += _fsck_check_extents(sbi.cid,
+ p->second.oids.front(),
+ extents,
+ p->second.compressed,
+ used_blocks,
+ fm->get_alloc_size(),
+ repair ? &repairer : nullptr,
+ *expected_statfs,
+ depth);
+ sbi.passed = true;
+ }
+ }
+ } // if (it)
+
+ if (repair && repairer.preprocess_misreference(db)) {
+
+ dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
+ auto& space_tracker = repairer.get_space_usage_tracker();
+ auto& misref_extents = repairer.get_misreferences();
+ interval_set<uint64_t> to_release;
+ it = db->get_iterator(PREFIX_OBJ);
+ if (it) {
+ // fill global if not overriden below
+ auto expected_statfs = &expected_store_statfs;
+
+ CollectionRef c;
+ spg_t pgid;
+ KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
+ bool bypass_rest = false;
+ for (it->lower_bound(string()); it->valid() && !bypass_rest;
+ it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0 || !space_tracker.is_used(oid)) {
+ continue;
+ }
+
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ continue;
+ }
+ if (per_pool_stat_collection || repair) {
+ auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+ expected_statfs = &expected_pool_statfs[pool_id];
+ }
+ }
+ if (!space_tracker.is_used(c->cid)) {
+ continue;
+ }
+
+ dout(20) << __func__ << " check misreference for col:" << c->cid
+ << " obj:" << oid << dendl;
+
+ OnodeRef o;
+ o.reset(Onode::decode(c, oid, it->key(), it->value()));
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ mempool::bluestore_fsck::set<BlobRef> blobs;
+
+ for (auto& e : o->extent_map.extent_map) {
+ blobs.insert(e.blob);
+ }
+ bool need_onode_update = false;
+ bool first_dump = true;
+ for(auto b : blobs) {
+ bool broken_blob = false;
+ auto& pextents = b->dirty_blob().dirty_extents();
+ for (auto& e : pextents) {
+ if (!e.is_valid()) {
+ continue;
+ }
+ // for the sake of simplicity and proper shared blob handling
+ // always rewrite the whole blob even when it's partially
+ // misreferenced.
+ if (misref_extents.intersects(e.offset, e.length)) {
+ if (first_dump) {
+ first_dump = false;
+ _dump_onode<10>(cct, *o);
+ }
+ broken_blob = true;
+ break;
+ }
+ }
+ if (!broken_blob)
+ continue;
+ bool compressed = b->get_blob().is_compressed();
+ need_onode_update = true;
+ dout(10) << __func__
+ << " fix misreferences in oid:" << oid
+ << " " << *b << dendl;
+ uint64_t b_off = 0;
+ PExtentVector pext_to_release;
+ pext_to_release.reserve(pextents.size());
+ // rewriting all valid pextents
+ for (auto e = pextents.begin(); e != pextents.end();
+ b_off += e->length, e++) {
+ if (!e->is_valid()) {
+ continue;
+ }
+ PExtentVector exts;
+ int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
+ 0, 0, &exts);
+ if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
+ derr << __func__
+ << " failed to allocate 0x" << std::hex << e->length
+ << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
+ << " min_alloc_size 0x" << min_alloc_size
+ << " available 0x " << alloc->get_free()
+ << std::dec << dendl;
+ if (alloc_len > 0) {
+ alloc->release(exts);
+ }
+ bypass_rest = true;
+ break;
+ }
+ expected_statfs->allocated += e->length;
+ if (compressed) {
+ expected_statfs->data_compressed_allocated += e->length;
+ }
+
+ bufferlist bl;
+ IOContext ioc(cct, NULL, true); // allow EIO
+ r = bdev->read(e->offset, e->length, &bl, &ioc, false);
+ if (r < 0) {
+ derr << __func__ << " failed to read from 0x" << std::hex << e->offset
+ <<"~" << e->length << std::dec << dendl;
+ ceph_abort_msg("read failed, wtf");
+ }
+ pext_to_release.push_back(*e);
+ e = pextents.erase(e);
+ e = pextents.insert(e, exts.begin(), exts.end());
+ b->get_blob().map_bl(
+ b_off, bl,
+ [&](uint64_t offset, bufferlist& t) {
+ int r = bdev->write(offset, t, false);
+ ceph_assert(r == 0);
+ });
+ e += exts.size() - 1;
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
+
+ if (b->get_blob().is_shared()) {
+ b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
+
+ auto sb_it = sb_info.find(b->shared_blob->get_sbid());
+ ceph_assert(sb_it != sb_info.end());
+ sb_info_t& sbi = sb_it->second;
+
+ for (auto& r : sbi.ref_map.ref_map) {
+ expected_statfs->allocated -= r.second.length;
+ if (sbi.compressed) {
+ // NB: it's crucial to use compressed flag from sb_info_t
+ // as we originally used that value while accumulating
+ // expected_statfs
+ expected_statfs->data_compressed_allocated -= r.second.length;
+ }
+ }
+ sbi.updated = sbi.passed = true;
+ sbi.ref_map.clear();
+
+ // relying on blob's pextents to decide what to release.
+ for (auto& p : pext_to_release) {
+ to_release.union_insert(p.offset, p.length);
+ }
+ } else {
+ for (auto& p : pext_to_release) {
+ expected_statfs->allocated -= p.length;
+ if (compressed) {
+ expected_statfs->data_compressed_allocated -= p.length;
+ }
+ to_release.union_insert(p.offset, p.length);
+ }
+ }
+ if (bypass_rest) {
+ break;
+ }
+ } // for(auto b : blobs)
+ if (need_onode_update) {
+ o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
+ _record_onode(o, txn);
+ }
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+
+ for (auto it = to_release.begin(); it != to_release.end(); ++it) {
+ dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
+ << "~" << it.get_len() << std::dec << dendl;
+ fm->release(it.get_start(), it.get_len(), txn);
+ }
+ alloc->release(to_release);
+ to_release.clear();
+ } // if (it) {
+ } //if (repair && repairer.preprocess_misreference()) {
+
+ if (depth != FSCK_SHALLOW) {
+ for (auto &p : sb_info) {
+ sb_info_t& sbi = p.second;
+ if (!sbi.passed) {
+ derr << "fsck error: missing " << *sbi.sb << dendl;
+ ++errors;
+ }
+ if (repair && (!sbi.passed || sbi.updated)) {
+ auto sbid = p.first;
+ if (sbi.ref_map.empty()) {
+ ceph_assert(sbi.passed);
+ dout(20) << __func__ << " " << *sbi.sb
+ << " is empty, removing" << dendl;
+ repairer.fix_shared_blob(db, sbid, nullptr);
+ } else {
+ bufferlist bl;
+ bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
+ encode(persistent, bl);
+ dout(20) << __func__ << " " << *sbi.sb
+ << " is " << bl.length() << " bytes, updating" << dendl;
+
+ repairer.fix_shared_blob(db, sbid, &bl);
+ }
+ }
+ }
+ }
+ sb_info.clear();
+
+ // check global stats only if fscking (not repairing) w/o per-pool stats
+ if (!per_pool_stat_collection &&
+ !repair &&
+ !(actual_statfs == expected_store_statfs)) {
+ derr << "fsck error: actual " << actual_statfs
+ << " != expected " << expected_store_statfs << dendl;
+ if (repair) {
+ repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
+ expected_store_statfs);
+ }
+ ++errors;
+ }
+
+ dout(1) << __func__ << " checking pool_statfs" << dendl;
+ _fsck_check_pool_statfs(expected_pool_statfs,
+ errors, warnings, repair ? &repairer : nullptr);
+
+ if (depth != FSCK_SHALLOW) {
+ dout(1) << __func__ << " checking for stray omap data" << dendl;
+ it = db->get_iterator(PREFIX_OMAP);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t omap_head;
+ _key_decode_u64(it->key().c_str(), &omap_head);
+ if (used_omap_head.count(omap_head) == 0) {
+ derr << "fsck error: found stray omap data on omap_head "
+ << omap_head << dendl;
+ ++errors;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PGMETA_OMAP);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t omap_head;
+ _key_decode_u64(it->key().c_str(), &omap_head);
+ if (used_pgmeta_omap_head.count(omap_head) == 0) {
+ derr << "fsck error: found stray (pgmeta) omap data on omap_head "
+ << omap_head << dendl;
+ ++errors;
+ }
+ }
+ }
+ dout(1) << __func__ << " checking deferred events" << dendl;
+ it = db->get_iterator(PREFIX_DEFERRED);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ bluestore_deferred_transaction_t wt;
+ try {
+ decode(wt, p);
+ } catch (buffer::error& e) {
+ derr << "fsck error: failed to decode deferred txn "
+ << pretty_binary_string(it->key()) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable deferred TXN record, key: '"
+ << pretty_binary_string(it->key())
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+ }
+ continue;
+ }
+ dout(20) << __func__ << " deferred " << wt.seq
+ << " ops " << wt.ops.size()
+ << " released 0x" << std::hex << wt.released << std::dec << dendl;
+ for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
+ apply(
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ bs.set(pos);
+ }
+ );
+ }
+ }
+ }
+
+ dout(1) << __func__ << " checking freelist vs allocated" << dendl;
+ {
+ // remove bluefs_extents from used set since the freelist doesn't
+ // know they are allocated.
+ for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
+ apply(
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ bs.reset(pos);
+ }
+ );
+ }
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ bool intersects = false;
+ apply(
+ offset, length, fm->get_alloc_size(), used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ if (bs.test(pos)) {
+ if (offset == SUPER_RESERVED &&
+ length == min_alloc_size - SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << std::dec << dendl;
+ } else {
+ intersects = true;
+ if (repair) {
+ repairer.fix_false_free(db, fm,
+ pos * min_alloc_size,
+ min_alloc_size);
+ }
+ }
+ } else {
+ bs.set(pos);
+ }
+ }
+ );
+ if (intersects) {
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
+ }
+ }
+ fm->enumerate_reset();
+ size_t count = used_blocks.count();
+ if (used_blocks.size() != count) {
+ ceph_assert(used_blocks.size() > count);
+ used_blocks.flip();
+ size_t start = used_blocks.find_first();
+ while (start != decltype(used_blocks)::npos) {
+ size_t cur = start;
+ while (true) {
+ size_t next = used_blocks.find_next(cur);
+ if (next != cur + 1) {
+ ++errors;
+ derr << "fsck error: leaked extent 0x" << std::hex
+ << ((uint64_t)start * fm->get_alloc_size()) << "~"
+ << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
+ << dendl;
+ if (repair) {
+ repairer.fix_leaked(db,
+ fm,
+ start * min_alloc_size,
+ (cur + 1 - start) * min_alloc_size);
+ }
+ start = next;
+ break;
+ }
+ cur = next;
+ }
+ }
+ used_blocks.flip();
+ }
+ }
+ }
+ if (repair) {
+ dout(5) << __func__ << " applying repair results" << dendl;
+ repaired = repairer.apply(db);
+ dout(5) << __func__ << " repair applied" << dendl;
+ }
+
+out_scan:
+ dout(2) << __func__ << " " << num_objects << " objects, "
+ << num_sharded_objects << " of them sharded. "
+ << dendl;
+ dout(2) << __func__ << " " << num_extents << " extents to "
+ << num_blobs << " blobs, "
+ << num_spanning_blobs << " spanning, "
+ << num_shared_blobs << " shared."
+ << dendl;
+
+ utime_t duration = ceph_clock_now() - start;
+ dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired
+ << " repaired, " << (errors - (int)repaired) << " remaining in "
+ << duration << " seconds" << dendl;
+ return errors - (int)repaired;
+}
+
+/// methods to inject various errors fsck can repair
+void BlueStore::inject_broken_shared_blob_key(const string& key,
+ const bufferlist& bl)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ db->submit_transaction_sync(txn);
+};
+
+void BlueStore::inject_leaked(uint64_t len)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ PExtentVector exts;
+ int64_t alloc_len = alloc->allocate(len, min_alloc_size,
+ min_alloc_size * 256, 0, &exts);
+ ceph_assert(alloc_len >= (int64_t)len);
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
+{
+ KeyValueDB::Transaction txn;
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ RWLock::WLocker l(c->lock); // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ bool injected = false;
+ txn = db->get_transaction();
+ auto& em = o->extent_map.extent_map;
+ std::vector<const PExtentVector*> v;
+ if (em.size()) {
+ v.push_back(&em.begin()->blob->get_blob().get_extents());
+ }
+ if (em.size() > 1) {
+ auto it = em.end();
+ --it;
+ v.push_back(&(it->blob->get_blob().get_extents()));
+ }
+ for (auto pext : v) {
+ if (pext->size()) {
+ auto p = pext->begin();
+ while (p != pext->end()) {
+ if (p->is_valid()) {
+ dout(20) << __func__ << " release 0x" << std::hex << p->offset
+ << "~" << p->length << std::dec << dendl;
+ fm->release(p->offset, p->length, txn);
+ injected = true;
+ break;
+ }
+ ++p;
+ }
+ }
+ }
+ ceph_assert(injected);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
+{
+ BlueStoreRepairer repairer;
+ repairer.fix_statfs(db, key, new_statfs);
+ repairer.apply(db);
+}
+
+void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ volatile_statfs v;
+ v = new_statfs;
+ bufferlist bl;
+ v.encode(bl);
+ t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset)
+{
+ OnodeRef o1;
+ CollectionRef c1 = _get_collection(cid1);
+ ceph_assert(c1);
+ {
+ RWLock::WLocker l(c1->lock); // just to avoid internal asserts
+ o1 = c1->get_onode(oid1, false);
+ ceph_assert(o1);
+ o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ OnodeRef o2;
+ CollectionRef c2 = _get_collection(cid2);
+ ceph_assert(c2);
+ {
+ RWLock::WLocker l(c2->lock); // just to avoid internal asserts
+ o2 = c2->get_onode(oid2, false);
+ ceph_assert(o2);
+ o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+ Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+ // require onode/extent layout to be the same (and simple)
+ // to make things easier
+ ceph_assert(o1->onode.extent_map_shards.empty());
+ ceph_assert(o2->onode.extent_map_shards.empty());
+ ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(e1.logical_offset == e2.logical_offset);
+ ceph_assert(e1.length == e2.length);
+ ceph_assert(e1.blob_offset == e2.blob_offset);
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ // along with misreference error this will create space leaks errors
+ e2.blob->dirty_blob() = e1.blob->get_blob();
+ o2->extent_map.dirty_range(offset, e2.length);
+ o2->extent_map.update(txn, false);
+
+ _record_onode(o2, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
+ int16_t blob_id)
+{
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ RWLock::WLocker l(c->lock); // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ BlobRef b = c->new_blob();
+ b->id = blob_id;
+ o->extent_map.spanning_blob_map[blob_id] = b;
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::collect_metadata(map<string,string> *pm)
+{
+ dout(10) << __func__ << dendl;
+ bdev->collect_metadata("bluestore_bdev_", pm);
+ if (bluefs) {
+ (*pm)["bluefs"] = "1";
+ (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
+ bluefs->collect_metadata(pm, bluefs_shared_bdev);
+ } else {
+ (*pm)["bluefs"] = "0";
+ }
+
+ // report numa mapping for underlying devices
+ int node = -1;
+ set<int> nodes;
+ set<string> failed;
+ int r = get_numa_node(&node, &nodes, &failed);
+ if (r >= 0) {
+ if (!failed.empty()) {
+ (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
+ }
+ if (!nodes.empty()) {
+ dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
+ (*pm)["objectstore_numa_nodes"] = stringify(nodes);
+ }
+ if (node >= 0) {
+ (*pm)["objectstore_numa_node"] = stringify(node);
+ }
+ }
+}
+
+int BlueStore::get_numa_node(
+ int *final_node,
+ set<int> *out_nodes,
+ set<string> *out_failed)
+{
+ int node = -1;
+ set<string> devices;
+ get_devices(&devices);
+ set<int> nodes;
+ set<string> failed;
+ for (auto& devname : devices) {
+ int n;
+ BlkDev bdev(devname);
+ int r = bdev.get_numa_node(&n);
+ if (r < 0) {
+ dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
+ << dendl;
+ failed.insert(devname);
+ continue;
+ }
+ dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
+ << dendl;
+ nodes.insert(n);
+ if (node < 0) {
+ node = n;
+ }
+ }
+ if (node >= 0 && nodes.size() == 1 && failed.empty()) {
+ *final_node = node;
+ }
+ if (out_nodes) {
+ *out_nodes = nodes;
+ }
+ if (out_failed) {
+ *out_failed = failed;
+ }
+ return 0;
+}
+
+int BlueStore::get_devices(set<string> *ls)
+{
+ if (bdev) {
+ bdev->get_devices(ls);
+ if (bluefs) {
+ bluefs->get_devices(ls);
+ }
+ return 0;
+ }
+
+ // grumble, we haven't started up yet.
+ int r = _open_path();
+ if (r < 0)
+ goto out;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+ r = _minimal_open_bluefs(false);
+ if (r < 0)
+ goto out_bdev;
+ bdev->get_devices(ls);
+ if (bluefs) {
+ bluefs->get_devices(ls);
+ }
+ r = 0;
+ _minimal_close_bluefs();
+ out_bdev:
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ out:
+ return r;
+}
+
+void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
+{
+ buf->reset();
+
+ buf->omap_allocated = db->estimate_prefix_size(PREFIX_OMAP);
+
+ uint64_t bfree = alloc->get_free();
+
+ if (bluefs) {
+ int64_t bluefs_total = bluefs->get_total(bluefs_shared_bdev);
+ int64_t bluefs_free = bluefs->get_free(bluefs_shared_bdev);
+ // part of our shared device is "free" according to BlueFS, but we
+ // can't touch bluestore_bluefs_min of it.
+ int64_t shared_available = std::min(
+ bluefs_free,
+ int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
+ buf->internally_reserved = bluefs_total - shared_available;
+ if (shared_available > 0) {
+ bfree += shared_available;
+ }
+ // include dedicated db, too, if that isn't the shared device.
+ if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
+ buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+ }
+ // call any non-omap bluefs space "internal metadata"
+ buf->internal_metadata =
+ std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
+ - buf->omap_allocated;
+ }
+
+ uint64_t thin_total, thin_avail;
+ if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
+ buf->total += thin_total;
+
+ // we are limited by both the size of the virtual device and the
+ // underlying physical device.
+ bfree = std::min(bfree, thin_avail);
+
+ buf->allocated = thin_total - thin_avail;
+ } else {
+ buf->total += bdev->get_size();
+ }
+ buf->available = bfree;
+}
+
+int BlueStore::statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts)
+{
+ if (alerts) {
+ alerts->clear();
+ _log_alerts(*alerts);
+ }
+ _get_statfs_overall(buf);
+ {
+ std::lock_guard l(vstatfs_lock);
+ buf->allocated = vstatfs.allocated();
+ buf->data_stored = vstatfs.stored();
+ buf->data_compressed = vstatfs.compressed();
+ buf->data_compressed_original = vstatfs.compressed_original();
+ buf->data_compressed_allocated = vstatfs.compressed_allocated();
+ }
+
+ dout(20) << __func__ << " " << *buf << dendl;
+ return 0;
+}
+
+int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
+{
+ dout(20) << __func__ << " pool " << pool_id<< dendl;
+
+ if (!per_pool_stat_collection) {
+ dout(20) << __func__ << " not supported in legacy mode " << dendl;
+ return -ENOTSUP;
+ }
+ buf->reset();
+
+ {
+ std::lock_guard l(vstatfs_lock);
+ osd_pools[pool_id].publish(buf);
+ }
+ dout(10) << __func__ << *buf << dendl;
+ return 0;
+}
+
+void BlueStore::_check_legacy_statfs_alert()
+{
+ string s;
+ if (!per_pool_stat_collection &&
+ cct->_conf->bluestore_warn_on_legacy_statfs) {
+ s = "legacy statfs reporting detected, "
+ "suggest to run store repair to get consistent statistic reports";
+ }
+ std::lock_guard l(qlock);
+ legacy_statfs_alert = s;
+}
+
+// ---------------
+// cache
+
+BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
+{
+ RWLock::RLocker l(coll_lock);
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+void BlueStore::_queue_reap_collection(CollectionRef& c)
+{
+ dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+ // _reap_collections and this in the same thread,
+ // so no need a lock.
+ removed_collections.push_back(c);
+}
+
+void BlueStore::_reap_collections()
+{
+
+ list<CollectionRef> removed_colls;
+ {
+ // _queue_reap_collection and this in the same thread.
+ // So no need a lock.
+ if (!removed_collections.empty())
+ removed_colls.swap(removed_collections);
+ else
+ return;
+ }
+
+ list<CollectionRef>::iterator p = removed_colls.begin();
+ while (p != removed_colls.end()) {
+ CollectionRef c = *p;
+ dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+ if (c->onode_map.map_any([&](OnodeRef o) {
+ ceph_assert(!o->exists);
+ if (o->flushing_count.load()) {
+ dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
+ << " flush_txns " << o->flushing_count << dendl;
+ return true;
+ }
+ return false;
+ })) {
+ ++p;
+ continue;
+ }
+ c->onode_map.clear();
+ p = removed_colls.erase(p);
+ dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
+ }
+ if (removed_colls.empty()) {
+ dout(10) << __func__ << " all reaped" << dendl;
+ } else {
+ removed_collections.splice(removed_collections.begin(), removed_colls);
+ }
+}
+
+void BlueStore::_update_cache_logger()
+{
+ uint64_t num_onodes = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_buffers = 0;
+ uint64_t num_buffer_bytes = 0;
+ for (auto c : cache_shards) {
+ c->add_stats(&num_onodes, &num_extents, &num_blobs,
+ &num_buffers, &num_buffer_bytes);
+ }
+ logger->set(l_bluestore_onodes, num_onodes);
+ logger->set(l_bluestore_extents, num_extents);
+ logger->set(l_bluestore_blobs, num_blobs);
+ logger->set(l_bluestore_buffers, num_buffers);
+ logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
+}
+
+// ---------------
+// read operations
+
+ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
+{
+ return _get_collection(cid);
+}
+
+ObjectStore::CollectionHandle BlueStore::create_new_collection(
+ const coll_t& cid)
+{
+ RWLock::WLocker l(coll_lock);
+ Collection *c = new Collection(
+ this,
+ cache_shards[cid.hash_to_shard(cache_shards.size())],
+ cid);
+ new_coll_map[cid] = c;
+ _osr_attach(c);
+ return c;
+}
+
+void BlueStore::set_collection_commit_queue(
+ const coll_t& cid,
+ ContextQueue *commit_queue)
+{
+ if (commit_queue) {
+ RWLock::RLocker l(coll_lock);
+ if (coll_map.count(cid)) {
+ coll_map[cid]->commit_queue = commit_queue;
+ } else if (new_coll_map.count(cid)) {
+ new_coll_map[cid]->commit_queue = commit_queue;
+ }
+ }
+}
+
+
+bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return false;
+
+ bool r = true;
+
+ {
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ r = false;
+ }
+
+ return r;
+}
+
+int BlueStore::stat(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ if (!c->exists)
+ return -ENOENT;
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+
+ {
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return -ENOENT;
+ st->st_size = o->onode.size;
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ }
+
+ int r = 0;
+ if (_debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ return r;
+}
+int BlueStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ Collection *c = static_cast<Collection *>(ch.get());
+ dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ RWLock::WLocker l(c->lock);
+ c->pool_opts = opts;
+ return 0;
+}
+
+int BlueStore::read(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ auto start = mono_clock::now();
+ Collection *c = static_cast<Collection *>(c_.get());
+ const coll_t &cid = c->get_cid();
+ dout(15) << __func__ << " " << cid << " " << oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ bl.clear();
+ int r;
+ {
+ RWLock::RLocker l(c->lock);
+ auto start1 = mono_clock::now();
+ OnodeRef o = c->get_onode(oid, false);
+ log_latency("get_onode@read",
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start1,
+ cct->_conf->bluestore_log_op_age);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (offset == length && offset == 0)
+ length = o->onode.size;
+
+ r = _do_read(c, o, offset, length, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
+ }
+
+ out:
+ if (r >= 0 && _debug_data_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->bluestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ r = -EIO;
+ }
+ dout(10) << __func__ << " " << cid << " " << oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ log_latency(__func__,
+ l_bluestore_read_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+// --------------------------------------------------------
+// intermediate data structures used while reading
+struct region_t {
+ uint64_t logical_offset;
+ uint64_t blob_xoffset; //region offset within the blob
+ uint64_t length;
+
+ // used later in read process
+ uint64_t front = 0;
+
+ region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
+ : logical_offset(offset),
+ blob_xoffset(b_offs),
+ length(len),
+ front(front){}
+ region_t(const region_t& from)
+ : logical_offset(from.logical_offset),
+ blob_xoffset(from.blob_xoffset),
+ length(from.length),
+ front(from.front){}
+
+ friend ostream& operator<<(ostream& out, const region_t& r) {
+ return out << "0x" << std::hex << r.logical_offset << ":"
+ << r.blob_xoffset << "~" << r.length << std::dec;
+ }
+};
+
+// merged blob read request
+struct read_req_t {
+ uint64_t r_off = 0;
+ uint64_t r_len = 0;
+ bufferlist bl;
+ std::list<region_t> regs; // original read regions
+
+ read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
+
+ friend ostream& operator<<(ostream& out, const read_req_t& r) {
+ out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+ for (const auto& reg : r.regs)
+ out << reg;
+ return out << "]}" << std::dec;
+ }
+};
+
+typedef list<read_req_t> regions2read_t;
+typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
+
+int BlueStore::_do_read(
+ Collection *c,
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags,
+ uint64_t retry_count)
+{
+ FUNCTRACE(cct);
+ int r = 0;
+ int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size 0x" << o->onode.size << " (" << std::dec
+ << o->onode.size << ")" << dendl;
+ bl.clear();
+
+ if (offset >= o->onode.size) {
+ return r;
+ }
+
+ // generally, don't buffer anything, unless the client explicitly requests
+ // it.
+ bool buffered = false;
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered read" << dendl;
+ buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_read &&
+ (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered read" << dendl;
+ buffered = true;
+ }
+
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+
+ auto start = mono_clock::now();
+ o->extent_map.fault_range(db, offset, length);
+ log_latency(__func__,
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ _dump_onode<30>(cct, *o);
+
+ ready_regions_t ready_regions;
+
+ // for deep-scrub, we only read dirty cache and bypass clean cache in
+ // order to read underlying block device in case there are silent disk errors.
+ if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
+ dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
+ read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
+ }
+
+ // build blob-wise list to of stuff read (that isn't cached)
+ blobs2read_t blobs2read;
+ unsigned left = length;
+ uint64_t pos = offset;
+ unsigned num_regions = 0;
+ auto lp = o->extent_map.seek_lextent(offset);
+ while (left > 0 && lp != o->extent_map.extent_map.end()) {
+ if (pos < lp->logical_offset) {
+ unsigned hole = lp->logical_offset - pos;
+ if (hole >= left) {
+ break;
+ }
+ dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
+ << std::dec << dendl;
+ pos += hole;
+ left -= hole;
+ }
+ BlobRef& bptr = lp->blob;
+ unsigned l_off = pos - lp->logical_offset;
+ unsigned b_off = l_off + lp->blob_offset;
+ unsigned b_len = std::min(left, lp->length - l_off);
+
+ ready_regions_t cache_res;
+ interval_set<uint32_t> cache_interval;
+ bptr->shared_blob->bc.read(
+ bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
+ read_cache_policy);
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need 0x" << b_off << "~" << b_len
+ << " cache has 0x" << cache_interval
+ << std::dec << dendl;
+
+ auto pc = cache_res.begin();
+ uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
+ while (b_len > 0) {
+ unsigned l;
+ if (pc != cache_res.end() &&
+ pc->first == b_off) {
+ l = pc->second.length();
+ ready_regions[pos].claim(pc->second);
+ dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
+ << b_off << "~" << l << std::dec << dendl;
+ ++pc;
+ } else {
+ l = b_len;
+ if (pc != cache_res.end()) {
+ ceph_assert(pc->first > b_off);
+ l = pc->first - b_off;
+ }
+ dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
+ << b_off << "~" << l << std::dec << dendl;
+ // merge regions
+ {
+ uint64_t r_off = b_off;
+ uint64_t r_len = l;
+ uint64_t front = r_off % chunk_size;
+ if (front) {
+ r_off -= front;
+ r_len += front;
+ }
+ unsigned tail = r_len % chunk_size;
+ if (tail) {
+ r_len += chunk_size - tail;
+ }
+ bool merged = false;
+ regions2read_t& r2r = blobs2read[bptr];
+ if (r2r.size()) {
+ read_req_t& pre = r2r.back();
+ if (r_off <= (pre.r_off + pre.r_len)) {
+ front += (r_off - pre.r_off);
+ pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+ pre.regs.emplace_back(region_t(pos, b_off, l, front));
+ merged = true;
+ }
+ }
+ if (!merged) {
+ read_req_t req(r_off, r_len);
+ req.regs.emplace_back(region_t(pos, b_off, l, front));
+ r2r.emplace_back(std::move(req));
+ }
+ }
+ ++num_regions;
+ }
+ pos += l;
+ b_off += l;
+ left -= l;
+ b_len -= l;
+ }
+ ++lp;
+ }
+
+ // read raw blob data. use aio if we have >1 blobs to read.
+ start = mono_clock::now(); // for the sake of simplicity
+ // measure the whole block below.
+ // The error isn't that much...
+ vector<bufferlist> compressed_blob_bls;
+ IOContext ioc(cct, NULL, true); // allow EIO
+ for (auto& p : blobs2read) {
+ const BlobRef& bptr = p.first;
+ regions2read_t& r2r = p.second;
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need " << r2r << std::dec << dendl;
+ if (bptr->get_blob().is_compressed()) {
+ // read the whole thing
+ if (compressed_blob_bls.empty()) {
+ // ensure we avoid any reallocation on subsequent blobs
+ compressed_blob_bls.reserve(blobs2read.size());
+ }
+ compressed_blob_bls.push_back(bufferlist());
+ bufferlist& bl = compressed_blob_bls.back();
+ r = bptr->get_blob().map(
+ 0, bptr->get_blob().get_ondisk_length(),
+ [&](uint64_t offset, uint64_t length) {
+ int r;
+ // use aio if there are more regions to read than those in this blob
+ if (num_regions > r2r.size()) {
+ r = bdev->aio_read(offset, length, &bl, &ioc);
+ } else {
+ r = bdev->read(offset, length, &bl, &ioc, false);
+ }
+ if (r < 0)
+ return r;
+ return 0;
+ });
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ ceph_assert(r == 0);
+ }
+ } else {
+ // read the pieces
+ for (auto& req : r2r) {
+ dout(20) << __func__ << " region 0x" << std::hex
+ << req.regs.front().logical_offset
+ << ": 0x" << req.regs.front().blob_xoffset
+ << " reading 0x" << req.r_off
+ << "~" << req.r_len << std::dec
+ << dendl;
+
+ // read it
+ r = bptr->get_blob().map(
+ req.r_off, req.r_len,
+ [&](uint64_t offset, uint64_t length) {
+ int r;
+ // use aio if there is more than one region to read
+ if (num_regions > 1) {
+ r = bdev->aio_read(offset, length, &req.bl, &ioc);
+ } else {
+ r = bdev->read(offset, length, &req.bl, &ioc, false);
+ }
+ if (r < 0)
+ return r;
+ return 0;
+ });
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+ << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ ceph_assert(r == 0);
+ }
+ ceph_assert(req.bl.length() == req.r_len);
+ }
+ }
+ }
+
+ int64_t num_ios = length;
+ if (ioc.has_pending_aios()) {
+ num_ios = -ioc.get_num_ios();
+ bdev->aio_submit(&ioc);
+ dout(20) << __func__ << " waiting for aio" << dendl;
+ ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ ceph_assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
+ }
+ log_latency_fn(__func__,
+ l_bluestore_read_wait_aio_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+ );
+
+ // enumerate and decompress desired blobs
+ auto p = compressed_blob_bls.begin();
+ blobs2read_t::iterator b2r_it = blobs2read.begin();
+ while (b2r_it != blobs2read.end()) {
+ const BlobRef& bptr = b2r_it->first;
+ regions2read_t& r2r = b2r_it->second;
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need 0x" << r2r << std::dec << dendl;
+ if (bptr->get_blob().is_compressed()) {
+ ceph_assert(p != compressed_blob_bls.end());
+ bufferlist& compressed_bl = *p++;
+ if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
+ r2r.front().regs.front().logical_offset) < 0) {
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+ }
+ bufferlist raw_bl;
+ r = _decompress(compressed_bl, &raw_bl);
+ if (r < 0)
+ return r;
+ if (buffered) {
+ bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
+ raw_bl);
+ }
+ for (auto& req : r2r) {
+ for (auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(
+ raw_bl, r.blob_xoffset, r.length);
+ }
+ }
+ } else {
+ for (auto& req : r2r) {
+ if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+ req.regs.front().logical_offset) < 0) {
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+ }
+ if (buffered) {
+ bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
+ req.r_off, req.bl);
+ }
+
+ // prune and keep result
+ for (const auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+ }
+ }
+ }
+ ++b2r_it;
+ }
+
+ // generate a resulting buffer
+ auto pr = ready_regions.begin();
+ auto pr_end = ready_regions.end();
+ pos = 0;
+ while (pos < length) {
+ if (pr != pr_end && pr->first == pos + offset) {
+ dout(30) << __func__ << " assemble 0x" << std::hex << pos
+ << ": data from 0x" << pr->first << "~" << pr->second.length()
+ << std::dec << dendl;
+ pos += pr->second.length();
+ bl.claim_append(pr->second);
+ ++pr;
+ } else {
+ uint64_t l = length - pos;
+ if (pr != pr_end) {
+ ceph_assert(pr->first > pos + offset);
+ l = pr->first - (pos + offset);
+ }
+ dout(30) << __func__ << " assemble 0x" << std::hex << pos
+ << ": zeros for 0x" << (pos + offset) << "~" << l
+ << std::dec << dendl;
+ bl.append_zero(l);
+ pos += l;
+ }
+ }
+ ceph_assert(bl.length() == length);
+ ceph_assert(pos == length);
+ ceph_assert(pr == pr_end);
+ r = bl.length();
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+ << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+ }
+ return r;
+}
+
+int BlueStore::_verify_csum(OnodeRef& o,
+ const bluestore_blob_t* blob, uint64_t blob_xoffset,
+ const bufferlist& bl,
+ uint64_t logical_offset) const
+{
+ int bad;
+ uint64_t bad_csum;
+ auto start = mono_clock::now();
+ int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+ if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+ (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+ derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
+ bad = blob_xoffset;
+ r = -1;
+ bad_csum = 0xDEADBEEF;
+ }
+ if (r < 0) {
+ if (r == -1) {
+ PExtentVector pex;
+ blob->map(
+ bad,
+ blob->get_csum_chunk_size(),
+ [&](uint64_t offset, uint64_t length) {
+ pex.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ derr << __func__ << " bad "
+ << Checksummer::get_csum_type_string(blob->csum_type)
+ << "/0x" << std::hex << blob->get_csum_chunk_size()
+ << " checksum at blob offset 0x" << bad
+ << ", got 0x" << bad_csum << ", expected 0x"
+ << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
+ << ", device location " << pex
+ << ", logical extent 0x" << std::hex
+ << (logical_offset + bad - blob_xoffset) << "~"
+ << blob->get_csum_chunk_size() << std::dec
+ << ", object " << o->oid
+ << dendl;
+ } else {
+ derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
+ }
+ }
+ log_latency(__func__,
+ l_bluestore_csum_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ if (cct->_conf->bluestore_ignore_data_csum) {
+ return 0;
+ }
+ return r;
+}
+
+int BlueStore::_decompress(bufferlist& source, bufferlist* result)
+{
+ int r = 0;
+ auto start = mono_clock::now();
+ auto i = source.cbegin();
+ bluestore_compression_header_t chdr;
+ decode(chdr, i);
+ int alg = int(chdr.type);
+ CompressorRef cp = compressor;
+ if (!cp || (int)cp->get_type() != alg) {
+ cp = Compressor::create(cct, alg);
+ }
+
+ if (!cp.get()) {
+ // if compressor isn't available - error, because cannot return
+ // decompressed data?
+
+ const char* alg_name = Compressor::get_comp_alg_name(alg);
+ derr << __func__ << " can't load decompressor " << alg_name << dendl;
+ _set_compression_alert(false, alg_name);
+ r = -EIO;
+ } else {
+ r = cp->decompress(i, chdr.length, *result);
+ if (r < 0) {
+ derr << __func__ << " decompression failed with exit code " << r << dendl;
+ r = -EIO;
+ }
+ }
+ log_latency(__func__,
+ l_bluestore_decompress_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+// this stores fiemap into interval_set, other variations
+// use it internally
+int BlueStore::_fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ interval_set<uint64_t>& destset)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ if (!c->exists)
+ return -ENOENT;
+ {
+ RWLock::RLocker l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ return -ENOENT;
+ }
+ _dump_onode<30>(cct, *o);
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size 0x" << o->onode.size << std::dec << dendl;
+
+ boost::intrusive::set<Extent>::iterator ep, eend;
+ if (offset >= o->onode.size)
+ goto out;
+
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+
+ o->extent_map.fault_range(db, offset, length);
+ eend = o->extent_map.extent_map.end();
+ ep = o->extent_map.seek_lextent(offset);
+ while (length > 0) {
+ dout(20) << __func__ << " offset " << offset << dendl;
+ if (ep != eend && ep->logical_offset + ep->length <= offset) {
+ ++ep;
+ continue;
+ }
+
+ uint64_t x_len = length;
+ if (ep != eend && ep->logical_offset <= offset) {
+ uint64_t x_off = offset - ep->logical_offset;
+ x_len = std::min(x_len, ep->length - x_off);
+ dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
+ << x_len << std::dec << " blob " << ep->blob << dendl;
+ destset.insert(offset, x_len);
+ length -= x_len;
+ offset += x_len;
+ if (x_off + x_len == ep->length)
+ ++ep;
+ continue;
+ }
+ if (ep != eend &&
+ ep->logical_offset > offset &&
+ ep->logical_offset - offset < x_len) {
+ x_len = ep->logical_offset - offset;
+ }
+ offset += x_len;
+ length -= x_len;
+ }
+ }
+
+ out:
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size = 0x(" << destset << ")" << std::dec << dendl;
+ return 0;
+}
+
+int BlueStore::fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl)
+{
+ interval_set<uint64_t> m;
+ int r = _fiemap(c_, oid, offset, length, m);
+ if (r >= 0) {
+ encode(m, bl);
+ }
+ return r;
+}
+
+int BlueStore::fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ map<uint64_t, uint64_t>& destmap)
+{
+ interval_set<uint64_t> m;
+ int r = _fiemap(c_, oid, offset, length, m);
+ if (r >= 0) {
+ m.move_into(destmap);
+ }
+ return r;
+}
+
+int BlueStore::getattr(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ const char *name,
+ bufferptr& value)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ RWLock::RLocker l(c->lock);
+ mempool::bluestore_cache_meta::string k(name);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (!o->onode.attrs.count(k)) {
+ r = -ENODATA;
+ goto out;
+ }
+ value = o->onode.attrs[k];
+ r = 0;
+ }
+ out:
+ if (r == 0 && _debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::getattrs(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ map<string,bufferptr>& aset)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ RWLock::RLocker l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ for (auto& i : o->onode.attrs) {
+ aset.emplace(i.first.c_str(), i.second);
+ }
+ r = 0;
+ }
+
+ out:
+ if (r == 0 && _debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ dout(10) << __func__ << " " << c->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::list_collections(vector<coll_t>& ls)
+{
+ RWLock::RLocker l(coll_lock);
+ ls.reserve(coll_map.size());
+ for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p)
+ ls.push_back(p->first);
+ return 0;
+}
+
+bool BlueStore::collection_exists(const coll_t& c)
+{
+ RWLock::RLocker l(coll_lock);
+ return coll_map.count(c);
+}
+
+int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+ &ls, &next);
+ if (r < 0) {
+ derr << __func__ << " collection_list returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ *empty = ls.empty();
+ dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+ return 0;
+}
+
+int BlueStore::collection_bits(CollectionHandle& ch)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
+ return c->cnode.bits;
+}
+
+int BlueStore::collection_list(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ RWLock::RLocker l(c->lock);
+ r = _collection_list(c, start, end, max, false, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::collection_list_legacy(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ RWLock::RLocker l(c->lock);
+ r = _collection_list(c, start, end, max, true, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::_collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
+ bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+
+ if (!c->exists)
+ return -ENOENT;
+
+ auto start_time = mono_clock::now();
+ int r = 0;
+ ghobject_t static_next;
+ std::unique_ptr<CollectionListIterator> it;
+ ghobject_t coll_range_temp_start, coll_range_temp_end;
+ ghobject_t coll_range_start, coll_range_end;
+ bool set_next = false;
+ ghobject_t pend;
+ bool temp;
+
+ if (!pnext)
+ pnext = &static_next;
+
+ if (start.is_max() || start.hobj.is_max()) {
+ goto out;
+ }
+ get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
+ &coll_range_temp_end, &coll_range_start, &coll_range_end);
+ dout(20) << __func__
+ << " range " << coll_range_temp_start
+ << " to " << coll_range_temp_end
+ << " and " << coll_range_start
+ << " to " << coll_range_end
+ << " start " << start << dendl;
+ if (legacy) {
+ it = std::make_unique<SimpleCollectionListIterator>(
+ cct, db->get_iterator(PREFIX_OBJ));
+ } else {
+ it = std::make_unique<SortedCollectionListIterator>(
+ db->get_iterator(PREFIX_OBJ));
+ }
+ if (start == ghobject_t() ||
+ start.hobj == hobject_t() ||
+ start == c->cid.get_min_hobj()) {
+ it->upper_bound(coll_range_temp_start);
+ temp = true;
+ } else {
+ if (start.hobj.is_temp()) {
+ temp = true;
+ ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
+ } else {
+ temp = false;
+ ceph_assert(start >= coll_range_start && start < coll_range_end);
+ }
+ dout(20) << __func__ << " temp=" << (int)temp << dendl;
+ it->lower_bound(start);
+ }
+ if (end.hobj.is_max()) {
+ pend = temp ? coll_range_temp_end : coll_range_end;
+ } else {
+ if (end.hobj.is_temp()) {
+ if (temp)
+ pend = end;
+ else
+ goto out;
+ } else {
+ pend = temp ? coll_range_temp_end : end;
+ }
+ }
+ dout(20) << __func__ << " pend " << pend << dendl;
+ while (true) {
+ if (!it->valid() || it->is_ge(pend)) {
+ if (!it->valid())
+ dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+ else
+ dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
+ if (temp) {
+ if (end.hobj.is_temp()) {
+ if (it->valid() && it->is_lt(coll_range_temp_end)) {
+ *pnext = it->oid();
+ set_next = true;
+ }
+ break;
+ }
+ dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+ temp = false;
+ it->upper_bound(coll_range_start);
+ if (end.hobj.is_max())
+ pend = coll_range_end;
+ else
+ pend = end;
+ dout(30) << __func__ << " pend " << pend << dendl;
+ continue;
+ }
+ if (it->valid() && it->is_lt(coll_range_end)) {
+ *pnext = it->oid();
+ set_next = true;
+ }
+ break;
+ }
+ dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
+ if (ls->size() >= (unsigned)max) {
+ dout(20) << __func__ << " reached max " << max << dendl;
+ *pnext = it->oid();
+ set_next = true;
+ break;
+ }
+ ls->push_back(it->oid());
+ it->next();
+ }
+out:
+ if (!set_next) {
+ *pnext = ghobject_t::get_max();
+ }
+ log_latency_fn(
+ __func__,
+ l_bluestore_clist_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_collection_list_age,
+ [&] (const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " start " << start << " end " << end
+ << " max " << max;
+ return ostr.str();
+ }
+ );
+ return r;
+}
+
+int BlueStore::omap_get(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ KeyValueDB::Iterator it = db->get_iterator(prefix);
+ string head, tail;
+ get_omap_header(o->onode.nid, &head);
+ get_omap_tail(o->onode.nid, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ *header = it->value();
+ } else if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ string user_key;
+ decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ (*out)[user_key] = it->value();
+ }
+ it->next();
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_header(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ string head;
+ get_omap_header(o->onode.nid, &head);
+ if (db->get(o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
+ head, header) >= 0) {
+ dout(30) << __func__ << " got header" << dendl;
+ } else {
+ dout(30) << __func__ << " no header" << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_keys(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ auto start1 = mono_clock::now();
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ KeyValueDB::Iterator it = db->get_iterator(prefix);
+ string head, tail;
+ get_omap_key(o->onode.nid, string(), &head);
+ get_omap_tail(o->onode.nid, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ }
+ string user_key;
+ decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ keys->insert(user_key);
+ it->next();
+ }
+ }
+ out:
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_get_keys_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_values(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ RWLock::RLocker l(c->lock);
+ auto start1 = mono_clock::now();
+ int r = 0;
+ string final_key;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ {
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ o->flush();
+ _key_encode_u64(o->onode.nid, &final_key);
+ final_key.push_back('.');
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ final_key.resize(9); // keep prefix
+ final_key += *p;
+ bufferlist val;
+ if (db->get(prefix, final_key, &val) >= 0) {
+ dout(30) << __func__ << " got " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ out->insert(make_pair(*p, val));
+ }
+ }
+ }
+ out:
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_get_values_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_check_keys(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ string final_key;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ {
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ o->flush();
+ _key_encode_u64(o->onode.nid, &final_key);
+ final_key.push_back('.');
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ final_key.resize(9); // keep prefix
+ final_key += *p;
+ bufferlist val;
+ if (db->get(prefix, final_key, &val) >= 0) {
+ dout(30) << __func__ << " have " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ out->insert(*p);
+ } else {
+ dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ }
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
+ CollectionHandle &c_, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+ if (!c->exists) {
+ return ObjectMap::ObjectMapIterator();
+ }
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ o->flush();
+ dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
+ KeyValueDB::Iterator it = db->get_iterator(
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP);
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
+}
+
+// -----------------
+// write helpers
+
+uint64_t BlueStore::_get_ondisk_reserved() const {
+ return round_up_to(
+ std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
+}
+
+void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
+{
+ dout(10) << __func__ << " ondisk_format " << ondisk_format
+ << " min_compat_ondisk_format " << min_compat_ondisk_format
+ << dendl;
+ ceph_assert(ondisk_format == latest_ondisk_format);
+ {
+ bufferlist bl;
+ encode(ondisk_format, bl);
+ t->set(PREFIX_SUPER, "ondisk_format", bl);
+ }
+ {
+ bufferlist bl;
+ encode(min_compat_ondisk_format, bl);
+ t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
+ }
+}
+
+int BlueStore::_open_super_meta()
+{
+ // nid
+ {
+ nid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "nid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t v;
+ decode(v, p);
+ nid_max = v;
+ } catch (buffer::error& e) {
+ derr << __func__ << " unable to read nid_max" << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " old nid_max " << nid_max << dendl;
+ nid_last = nid_max.load();
+ }
+
+ // blobid
+ {
+ blobid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "blobid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t v;
+ decode(v, p);
+ blobid_max = v;
+ } catch (buffer::error& e) {
+ derr << __func__ << " unable to read blobid_max" << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
+ blobid_last = blobid_max.load();
+ }
+
+ // freelist
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "freelist_type", &bl);
+ if (bl.length()) {
+ freelist_type = std::string(bl.c_str(), bl.length());
+ dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
+ } else {
+ ceph_abort_msg("Not Support extent freelist manager");
+ }
+ }
+
+ // ondisk format
+ int32_t compat_ondisk_format = 0;
+ {
+ bufferlist bl;
+ int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
+ if (r < 0) {
+ // base case: kraken bluestore is v1 and readable by v1
+ dout(20) << __func__ << " missing ondisk_format; assuming kraken"
+ << dendl;
+ ondisk_format = 1;
+ compat_ondisk_format = 1;
+ } else {
+ auto p = bl.cbegin();
+ try {
+ decode(ondisk_format, p);
+ } catch (buffer::error& e) {
+ derr << __func__ << " unable to read ondisk_format" << dendl;
+ return -EIO;
+ }
+ bl.clear();
+ {
+ r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
+ ceph_assert(!r);
+ auto p = bl.cbegin();
+ try {
+ decode(compat_ondisk_format, p);
+ } catch (buffer::error& e) {
+ derr << __func__ << " unable to read compat_ondisk_format" << dendl;
+ return -EIO;
+ }
+ }
+ }
+ dout(10) << __func__ << " ondisk_format " << ondisk_format
+ << " compat_ondisk_format " << compat_ondisk_format
+ << dendl;
+ }
+
+ if (latest_ondisk_format < compat_ondisk_format) {
+ derr << __func__ << " compat_ondisk_format is "
+ << compat_ondisk_format << " but we only understand version "
+ << latest_ondisk_format << dendl;
+ return -EPERM;
+ }
+
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "min_alloc_size", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t val;
+ decode(val, p);
+ min_alloc_size = val;
+ min_alloc_size_order = ctz(val);
+ ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
+ } catch (buffer::error& e) {
+ derr << __func__ << " unable to read min_alloc_size" << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+ << std::dec << dendl;
+ }
+ _open_statfs();
+ _set_alloc_sizes();
+ _set_throttle_params();
+
+ _set_csum();
+ _set_compression();
+ _set_blob_size();
+
+ _validate_bdev();
+ return 0;
+}
+
+int BlueStore::_upgrade_super()
+{
+ dout(1) << __func__ << " from " << ondisk_format << ", latest "
+ << latest_ondisk_format << dendl;
+ if (ondisk_format < latest_ondisk_format) {
+ ceph_assert(ondisk_format > 0);
+ ceph_assert(ondisk_format < latest_ondisk_format);
+
+ if (ondisk_format == 1) {
+ // changes:
+ // - super: added ondisk_format
+ // - super: added min_readable_ondisk_format
+ // - super: added min_compat_ondisk_format
+ // - super: added min_alloc_size
+ // - super: removed min_min_alloc_size
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t val;
+ decode(val, p);
+ min_alloc_size = val;
+ } catch (buffer::error& e) {
+ derr << __func__ << " failed to read min_min_alloc_size" << dendl;
+ return -EIO;
+ }
+ t->set(PREFIX_SUPER, "min_alloc_size", bl);
+ t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
+ }
+ ondisk_format = 2;
+ _prepare_ondisk_format_super(t);
+ int r = db->submit_transaction_sync(t);
+ ceph_assert(r == 0);
+ }
+ }
+ // done
+ dout(1) << __func__ << " done" << dendl;
+ return 0;
+}
+
+void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
+{
+ if (o->onode.nid) {
+ ceph_assert(o->exists);
+ return;
+ }
+ uint64_t nid = ++nid_last;
+ dout(20) << __func__ << " " << nid << dendl;
+ o->onode.nid = nid;
+ txc->last_nid = nid;
+ o->exists = true;
+}
+
+uint64_t BlueStore::_assign_blobid(TransContext *txc)
+{
+ uint64_t bid = ++blobid_last;
+ dout(20) << __func__ << " " << bid << dendl;
+ txc->last_blobid = bid;
+ return bid;
+}
+
+void BlueStore::get_db_statistics(Formatter *f)
+{
+ db->get_statistics(f);
+}
+
+BlueStore::TransContext *BlueStore::_txc_create(
+ Collection *c, OpSequencer *osr,
+ list<Context*> *on_commits)
+{
+ TransContext *txc = new TransContext(cct, c, osr, on_commits);
+ txc->t = db->get_transaction();
+ osr->queue_new(txc);
+ dout(20) << __func__ << " osr " << osr << " = " << txc
+ << " seq " << txc->seq << dendl;
+ return txc;
+}
+
+void BlueStore::_txc_calc_cost(TransContext *txc)
+{
+ // one "io" for the kv commit
+ auto ios = 1 + txc->ioc.get_num_ios();
+ auto cost = throttle_cost_per_io.load();
+ txc->cost = ios * cost + txc->bytes;
+ dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
+ << ios << " ios * " << cost << " + " << txc->bytes
+ << " bytes)" << dendl;
+}
+
+void BlueStore::_txc_update_store_statfs(TransContext *txc)
+{
+ if (txc->statfs_delta.is_empty())
+ return;
+
+ logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
+ logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
+ logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
+ logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
+ logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
+
+ bufferlist bl;
+ txc->statfs_delta.encode(bl);
+ if (per_pool_stat_collection) {
+ string key;
+ get_pool_stat_key(txc->osd_pool_id, &key);
+ txc->t->merge(PREFIX_STAT, key, bl);
+
+ std::lock_guard l(vstatfs_lock);
+ auto& stats = osd_pools[txc->osd_pool_id];
+ stats += txc->statfs_delta;
+
+ vstatfs += txc->statfs_delta; //non-persistent in this mode
+
+ } else {
+ txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+
+ std::lock_guard l(vstatfs_lock);
+ vstatfs += txc->statfs_delta;
+ }
+ txc->statfs_delta.reset();
+}
+
+void BlueStore::_txc_state_proc(TransContext *txc)
+{
+ while (true) {
+ dout(10) << __func__ << " txc " << txc
+ << " " << txc->get_state_name() << dendl;
+ switch (txc->state) {
+ case TransContext::STATE_PREPARE:
+ txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
+ if (txc->ioc.has_pending_aios()) {
+ txc->state = TransContext::STATE_AIO_WAIT;
+ txc->had_ios = true;
+ _txc_aio_submit(txc);
+ return;
+ }
+ // ** fall-thru **
+
+ case TransContext::STATE_AIO_WAIT:
+ {
+ utime_t lat = txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
+ if (lat >= cct->_conf->bluestore_log_op_age) {
+ dout(0) << __func__ << " slow aio_wait, txc = " << txc
+ << ", latency = " << lat
+ << dendl;
+ }
+ }
+
+ _txc_finish_io(txc); // may trigger blocked txc's too
+ return;
+
+ case TransContext::STATE_IO_DONE:
+ ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
+ if (txc->had_ios) {
+ ++txc->osr->txc_with_unstable_io;
+ }
+ txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
+ txc->state = TransContext::STATE_KV_QUEUED;
+ if (cct->_conf->bluestore_sync_submit_transaction) {
+ if (txc->last_nid >= nid_max ||
+ txc->last_blobid >= blobid_max) {
+ dout(20) << __func__
+ << " last_{nid,blobid} exceeds max, submit via kv thread"
+ << dendl;
+ } else if (txc->osr->kv_committing_serially) {
+ dout(20) << __func__ << " prior txc submitted via kv thread, us too"
+ << dendl;
+ // note: this is starvation-prone. once we have a txc in a busy
+ // sequencer that is committing serially it is possible to keep
+ // submitting new transactions fast enough that we get stuck doing
+ // so. the alternative is to block here... fixme?
+ } else if (txc->osr->txc_with_unstable_io) {
+ dout(20) << __func__ << " prior txc(s) with unstable ios "
+ << txc->osr->txc_with_unstable_io.load() << dendl;
+ } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
+ rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
+ == 0) {
+ dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
+ << dendl;
+ } else {
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
+ ceph_assert(r == 0);
+ txc->state = TransContext::STATE_KV_SUBMITTED;
+ _txc_applied_kv(txc);
+ }
+ }
+ {
+ std::lock_guard l(kv_lock);
+ kv_queue.push_back(txc);
+ kv_cond.notify_one();
+ if (txc->state != TransContext::STATE_KV_SUBMITTED) {
+ kv_queue_unsubmitted.push_back(txc);
+ ++txc->osr->kv_committing_serially;
+ }
+ if (txc->had_ios)
+ kv_ios++;
+ kv_throttle_costs += txc->cost;
+ }
+ return;
+ case TransContext::STATE_KV_SUBMITTED:
+ _txc_committed_kv(txc);
+ // ** fall-thru **
+
+ case TransContext::STATE_KV_DONE:
+ txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
+ if (txc->deferred_txn) {
+ txc->state = TransContext::STATE_DEFERRED_QUEUED;
+ _deferred_queue(txc);
+ return;
+ }
+ txc->state = TransContext::STATE_FINISHING;
+ break;
+
+ case TransContext::STATE_DEFERRED_CLEANUP:
+ txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
+ txc->state = TransContext::STATE_FINISHING;
+ // ** fall-thru **
+
+ case TransContext::STATE_FINISHING:
+ txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
+ _txc_finish(txc);
+ return;
+
+ default:
+ derr << __func__ << " unexpected txc " << txc
+ << " state " << txc->get_state_name() << dendl;
+ ceph_abort_msg("unexpected txc state");
+ return;
+ }
+ }
+}
+
+void BlueStore::_txc_finish_io(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << dendl;
+
+ /*
+ * we need to preserve the order of kv transactions,
+ * even though aio will complete in any order.
+ */
+
+ OpSequencer *osr = txc->osr.get();
+ std::lock_guard l(osr->qlock);
+ txc->state = TransContext::STATE_IO_DONE;
+ txc->ioc.release_running_aios();
+ OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
+ while (p != osr->q.begin()) {
+ --p;
+ if (p->state < TransContext::STATE_IO_DONE) {
+ dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
+ << p->get_state_name() << dendl;
+ return;
+ }
+ if (p->state > TransContext::STATE_IO_DONE) {
+ ++p;
+ break;
+ }
+ }
+ do {
+ _txc_state_proc(&*p++);
+ } while (p != osr->q.end() &&
+ p->state == TransContext::STATE_IO_DONE);
+
+ if (osr->kv_submitted_waiters) {
+ osr->qcond.notify_all();
+ }
+}
+
+void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
+{
+ dout(20) << __func__ << " txc " << txc
+ << " onodes " << txc->onodes
+ << " shared_blobs " << txc->shared_blobs
+ << dendl;
+
+ // finalize onodes
+ for (auto o : txc->onodes) {
+ _record_onode(o, t);
+ o->flushing_count++;
+ }
+
+ // objects we modified but didn't affect the onode
+ auto p = txc->modified_objects.begin();
+ while (p != txc->modified_objects.end()) {
+ if (txc->onodes.count(*p) == 0) {
+ (*p)->flushing_count++;
+ ++p;
+ } else {
+ // remove dups with onodes list to avoid problems in _txc_finish
+ p = txc->modified_objects.erase(p);
+ }
+ }
+
+ // finalize shared_blobs
+ for (auto sb : txc->shared_blobs) {
+ string key;
+ auto sbid = sb->get_sbid();
+ get_shared_blob_key(sbid, &key);
+ if (sb->persistent->empty()) {
+ dout(20) << __func__ << " shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << " is empty" << dendl;
+ t->rmkey(PREFIX_SHARED_BLOB, key);
+ } else {
+ bufferlist bl;
+ encode(*(sb->persistent), bl);
+ dout(20) << __func__ << " shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << " is " << bl.length() << " " << *sb << dendl;
+ t->set(PREFIX_SHARED_BLOB, key, bl);
+ }
+ }
+}
+
+void BlueStore::BSPerfTracker::update_from_perfcounters(
+ PerfCounters &logger)
+{
+ os_commit_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_bluestore_commit_lat));
+ os_apply_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_bluestore_commit_lat));
+}
+
+void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
+{
+ dout(20) << __func__ << " txc " << txc << std::hex
+ << " allocated 0x" << txc->allocated
+ << " released 0x" << txc->released
+ << std::dec << dendl;
+
+ // We have to handle the case where we allocate *and* deallocate the
+ // same region in this transaction. The freelist doesn't like that.
+ // (Actually, the only thing that cares is the BitmapFreelistManager
+ // debug check. But that's important.)
+ interval_set<uint64_t> tmp_allocated, tmp_released;
+ interval_set<uint64_t> *pallocated = &txc->allocated;
+ interval_set<uint64_t> *preleased = &txc->released;
+ if (!txc->allocated.empty() && !txc->released.empty()) {
+ interval_set<uint64_t> overlap;
+ overlap.intersection_of(txc->allocated, txc->released);
+ if (!overlap.empty()) {
+ tmp_allocated = txc->allocated;
+ tmp_allocated.subtract(overlap);
+ tmp_released = txc->released;
+ tmp_released.subtract(overlap);
+ dout(20) << __func__ << " overlap 0x" << std::hex << overlap
+ << ", new allocated 0x" << tmp_allocated
+ << " released 0x" << tmp_released << std::dec
+ << dendl;
+ pallocated = &tmp_allocated;
+ preleased = &tmp_released;
+ }
+ }
+
+ // update freelist with non-overlap sets
+ for (interval_set<uint64_t>::iterator p = pallocated->begin();
+ p != pallocated->end();
+ ++p) {
+ fm->allocate(p.get_start(), p.get_len(), t);
+ }
+ for (interval_set<uint64_t>::iterator p = preleased->begin();
+ p != preleased->end();
+ ++p) {
+ dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
+ << "~" << p.get_len() << std::dec << dendl;
+ fm->release(p.get_start(), p.get_len(), t);
+ }
+
+ _txc_update_store_statfs(txc);
+}
+
+void BlueStore::_txc_applied_kv(TransContext *txc)
+{
+ for (auto ls : { &txc->onodes, &txc->modified_objects }) {
+ for (auto& o : *ls) {
+ dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
+ << dendl;
+ if (--o->flushing_count == 0) {
+ std::lock_guard l(o->flush_lock);
+ o->flush_cond.notify_all();
+ }
+ }
+ }
+}
+
+void BlueStore::_txc_committed_kv(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << dendl;
+ {
+ std::lock_guard l(txc->osr->qlock);
+ txc->state = TransContext::STATE_KV_DONE;
+ if (txc->ch->commit_queue) {
+ txc->ch->commit_queue->queue(txc->oncommits);
+ } else {
+ finisher.queue(txc->oncommits);
+ }
+ }
+ txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
+ log_latency_fn(
+ __func__,
+ l_bluestore_commit_lat,
+ ceph::make_timespan(ceph_clock_now() - txc->start),
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) {
+ return ", txc = " + stringify(txc);
+ }
+ );
+}
+
+void BlueStore::_txc_finish(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+ ceph_assert(txc->state == TransContext::STATE_FINISHING);
+
+ for (auto& sb : txc->shared_blobs_written) {
+ sb->finish_write(txc->seq);
+ }
+ txc->shared_blobs_written.clear();
+
+ while (!txc->removed_collections.empty()) {
+ _queue_reap_collection(txc->removed_collections.front());
+ txc->removed_collections.pop_front();
+ }
+
+ OpSequencerRef osr = txc->osr;
+ bool empty = false;
+ bool submit_deferred = false;
+ OpSequencer::q_list_t releasing_txc;
+ {
+ std::lock_guard l(osr->qlock);
+ txc->state = TransContext::STATE_DONE;
+ bool notify = false;
+ while (!osr->q.empty()) {
+ TransContext *txc = &osr->q.front();
+ dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
+ << dendl;
+ if (txc->state != TransContext::STATE_DONE) {
+ if (txc->state == TransContext::STATE_PREPARE &&
+ deferred_aggressive) {
+ // for _osr_drain_preceding()
+ notify = true;
+ }
+ if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
+ osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
+ submit_deferred = true;
+ }
+ break;
+ }
+
+ osr->q.pop_front();
+ releasing_txc.push_back(*txc);
+ notify = true;
+ }
+ if (notify) {
+ osr->qcond.notify_all();
+ }
+ if (osr->q.empty()) {
+ dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+ empty = true;
+ }
+ }
+ while (!releasing_txc.empty()) {
+ // release to allocator only after all preceding txc's have also
+ // finished any deferred writes that potentially land in these
+ // blocks
+ auto txc = &releasing_txc.front();
+ _txc_release_alloc(txc);
+ releasing_txc.pop_front();
+ txc->log_state_latency(logger, l_bluestore_state_done_lat);
+ delete txc;
+ }
+
+ if (submit_deferred) {
+ // we're pinning memory; flush! we could be more fine-grained here but
+ // i'm not sure it's worth the bother.
+ deferred_try_submit();
+ }
+
+ if (empty && osr->zombie) {
+ std::lock_guard l(zombie_osr_lock);
+ if (zombie_osr_set.erase(osr->cid)) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ } else {
+ dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
+ << dendl;
+ }
+ }
+ }
+
+void BlueStore::_txc_release_alloc(TransContext *txc)
+{
+ // it's expected we're called with lazy_release_lock already taken!
+ if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
+ int r = 0;
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev->queue_discard(txc->released);
+ if (r == 0) {
+ dout(10) << __func__ << "(queued) " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
+ goto out;
+ }
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
+ bdev->discard(p.get_start(), p.get_len());
+ }
+ }
+ dout(10) << __func__ << "(sync) " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
+ alloc->release(txc->released);
+ }
+
+out:
+ txc->allocated.clear();
+ txc->released.clear();
+}
+
+void BlueStore::_osr_attach(Collection *c)
+{
+ // note: caller has RWLock on coll_map
+ auto q = coll_map.find(c->cid);
+ if (q != coll_map.end()) {
+ c->osr = q->second->osr;
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " reusing osr " << c->osr << " from existing coll "
+ << q->second << dendl;
+ } else {
+ std::lock_guard l(zombie_osr_lock);
+ auto p = zombie_osr_set.find(c->cid);
+ if (p == zombie_osr_set.end()) {
+ c->osr = new OpSequencer(this, c->cid);
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " fresh osr " << c->osr << dendl;
+ } else {
+ c->osr = p->second;
+ zombie_osr_set.erase(p);
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " resurrecting zombie osr " << c->osr << dendl;
+ c->osr->zombie = false;
+ }
+ }
+}
+
+void BlueStore::_osr_register_zombie(OpSequencer *osr)
+{
+ std::lock_guard l(zombie_osr_lock);
+ dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
+ osr->zombie = true;
+ auto i = zombie_osr_set.emplace(osr->cid, osr);
+ // this is either a new insertion or the same osr is already there
+ ceph_assert(i.second || i.first->second == osr);
+}
+
+void BlueStore::_osr_drain_preceding(TransContext *txc)
+{
+ OpSequencer *osr = txc->osr.get();
+ dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
+ ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+ {
+ // submit anything pending
+ deferred_lock.lock();
+ if (osr->deferred_pending && !osr->deferred_running) {
+ _deferred_submit_unlock(osr);
+ } else {
+ deferred_lock.unlock();
+ }
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+ osr->drain_preceding(txc);
+ --deferred_aggressive;
+ dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+void BlueStore::_osr_drain(OpSequencer *osr)
+{
+ dout(10) << __func__ << " " << osr << dendl;
+ ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+ {
+ // submit anything pending
+ deferred_lock.lock();
+ if (osr->deferred_pending && !osr->deferred_running) {
+ _deferred_submit_unlock(osr);
+ } else {
+ deferred_lock.unlock();
+ }
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+ osr->drain();
+ --deferred_aggressive;
+ dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+void BlueStore::_osr_drain_all()
+{
+ dout(10) << __func__ << dendl;
+
+ set<OpSequencerRef> s;
+ vector<OpSequencerRef> zombies;
+ {
+ RWLock::RLocker l(coll_lock);
+ for (auto& i : coll_map) {
+ s.insert(i.second->osr);
+ }
+ }
+ {
+ std::lock_guard l(zombie_osr_lock);
+ for (auto& i : zombie_osr_set) {
+ s.insert(i.second);
+ zombies.push_back(i.second);
+ }
+ }
+ dout(20) << __func__ << " osr_set " << s << dendl;
+
+ ++deferred_aggressive;
+ {
+ // submit anything pending
+ deferred_try_submit();
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+ {
+ std::lock_guard l(kv_finalize_lock);
+ kv_finalize_cond.notify_one();
+ }
+ for (auto osr : s) {
+ dout(20) << __func__ << " drain " << osr << dendl;
+ osr->drain();
+ }
+ --deferred_aggressive;
+
+ {
+ std::lock_guard l(zombie_osr_lock);
+ for (auto& osr : zombies) {
+ if (zombie_osr_set.erase(osr->cid)) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ ceph_assert(osr->q.empty());
+ } else if (osr->zombie) {
+ dout(10) << __func__ << " empty zombie osr " << osr
+ << " already reaped" << dendl;
+ ceph_assert(osr->q.empty());
+ } else {
+ dout(10) << __func__ << " empty zombie osr " << osr
+ << " resurrected" << dendl;
+ }
+ }
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+}
+
+
+void BlueStore::_kv_start()
+{
+ dout(10) << __func__ << dendl;
+
+ deferred_finisher.start();
+ finisher.start();
+ kv_sync_thread.create("bstore_kv_sync");
+ kv_finalize_thread.create("bstore_kv_final");
+}
+
+void BlueStore::_kv_stop()
+{
+ dout(10) << __func__ << dendl;
+ {
+ std::unique_lock l(kv_lock);
+ while (!kv_sync_started) {
+ kv_cond.wait(l);
+ }
+ kv_stop = true;
+ kv_cond.notify_all();
+ }
+ {
+ std::unique_lock l(kv_finalize_lock);
+ while (!kv_finalize_started) {
+ kv_finalize_cond.wait(l);
+ }
+ kv_finalize_stop = true;
+ kv_finalize_cond.notify_all();
+ }
+ kv_sync_thread.join();
+ kv_finalize_thread.join();
+ ceph_assert(removed_collections.empty());
+ {
+ std::lock_guard l(kv_lock);
+ kv_stop = false;
+ }
+ {
+ std::lock_guard l(kv_finalize_lock);
+ kv_finalize_stop = false;
+ }
+ dout(10) << __func__ << " stopping finishers" << dendl;
+ deferred_finisher.wait_for_empty();
+ deferred_finisher.stop();
+ finisher.wait_for_empty();
+ finisher.stop();
+ dout(10) << __func__ << " stopped" << dendl;
+}
+
+void BlueStore::_kv_sync_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
+ std::unique_lock l(kv_lock);
+ ceph_assert(!kv_sync_started);
+ kv_sync_started = true;
+ kv_cond.notify_all();
+
+ auto t0 = mono_clock::now();
+ timespan twait = ceph::make_timespan(0);
+ size_t kv_submitted = 0;
+
+ while (true) {
+ auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
+ auto observation_period =
+ ceph::make_timespan(period);
+ auto elapsed = mono_clock::now() - t0;
+ if (period && elapsed >= observation_period) {
+ dout(5) << __func__ << " utilization: idle "
+ << twait << " of " << elapsed
+ << ", submitted: " << kv_submitted
+ <<dendl;
+ t0 = mono_clock::now();
+ twait = ceph::make_timespan(0);
+ kv_submitted = 0;
+ }
+ ceph_assert(kv_committing.empty());
+ if (kv_queue.empty() &&
+ ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
+ !deferred_aggressive)) {
+ if (kv_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ auto t = mono_clock::now();
+ kv_cond.wait(l);
+ twait += mono_clock::now() - t;
+
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ deque<TransContext*> kv_submitting;
+ deque<DeferredBatch*> deferred_done, deferred_stable;
+ uint64_t aios = 0, costs = 0;
+
+ dout(20) << __func__ << " committing " << kv_queue.size()
+ << " submitting " << kv_queue_unsubmitted.size()
+ << " deferred done " << deferred_done_queue.size()
+ << " stable " << deferred_stable_queue.size()
+ << dendl;
+ kv_committing.swap(kv_queue);
+ kv_submitting.swap(kv_queue_unsubmitted);
+ deferred_done.swap(deferred_done_queue);
+ deferred_stable.swap(deferred_stable_queue);
+ aios = kv_ios;
+ costs = kv_throttle_costs;
+ kv_ios = 0;
+ kv_throttle_costs = 0;
+ l.unlock();
+
+ dout(30) << __func__ << " committing " << kv_committing << dendl;
+ dout(30) << __func__ << " submitting " << kv_submitting << dendl;
+ dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
+ dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+ auto start = mono_clock::now();
+
+ bool force_flush = false;
+ // if bluefs is sharing the same device as data (only), then we
+ // can rely on the bluefs commit to flush the device and make
+ // deferred aios stable. that means that if we do have done deferred
+ // txcs AND we are not on a single device, we need to force a flush.
+ if (bluefs_single_shared_device && bluefs) {
+ if (aios) {
+ force_flush = true;
+ } else if (kv_committing.empty() && deferred_stable.empty()) {
+ force_flush = true; // there's nothing else to commit!
+ } else if (deferred_aggressive) {
+ force_flush = true;
+ }
+ } else {
+ if (aios || !deferred_done.empty()) {
+ force_flush = true;
+ } else {
+ dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
+ }
+ }
+
+ if (force_flush) {
+ dout(20) << __func__ << " num_aios=" << aios
+ << " force_flush=" << (int)force_flush
+ << ", flushing, deferred done->stable" << dendl;
+ // flush/barrier on block device
+ bdev->flush();
+
+ // if we flush then deferred done are now deferred stable
+ deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
+ deferred_done.end());
+ deferred_done.clear();
+ }
+ auto after_flush = mono_clock::now();
+
+ // we will use one final transaction to force a sync
+ KeyValueDB::Transaction synct = db->get_transaction();
+
+ // increase {nid,blobid}_max? note that this covers both the
+ // case where we are approaching the max and the case we passed
+ // it. in either case, we increase the max in the earlier txn
+ // we submit.
+ uint64_t new_nid_max = 0, new_blobid_max = 0;
+ if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
+ KeyValueDB::Transaction t =
+ kv_submitting.empty() ? synct : kv_submitting.front()->t;
+ new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
+ bufferlist bl;
+ encode(new_nid_max, bl);
+ t->set(PREFIX_SUPER, "nid_max", bl);
+ dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
+ }
+ if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
+ KeyValueDB::Transaction t =
+ kv_submitting.empty() ? synct : kv_submitting.front()->t;
+ new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
+ bufferlist bl;
+ encode(new_blobid_max, bl);
+ t->set(PREFIX_SUPER, "blobid_max", bl);
+ dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
+ }
+
+ for (auto txc : kv_committing) {
+ if (txc->state == TransContext::STATE_KV_QUEUED) {
+ txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
+ ceph_assert(r == 0);
+ ++kv_submitted;
+ txc->state = TransContext::STATE_KV_SUBMITTED;
+ _txc_applied_kv(txc);
+ --txc->osr->kv_committing_serially;
+ if (txc->osr->kv_submitted_waiters) {
+ std::lock_guard l(txc->osr->qlock);
+ txc->osr->qcond.notify_all();
+ }
+
+ } else {
+ ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
+ txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
+ }
+ if (txc->had_ios) {
+ --txc->osr->txc_with_unstable_io;
+ }
+ }
+
+ // release throttle *before* we commit. this allows new ops
+ // to be prepared and enter pipeline while we are waiting on
+ // the kv commit sync/flush. then hopefully on the next
+ // iteration there will already be ops awake. otherwise, we
+ // end up going to sleep, and then wake up when the very first
+ // transaction is ready for commit.
+ throttle_bytes.put(costs);
+
+ if (bluefs &&
+ after_flush - bluefs_last_balance >
+ ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
+ bluefs_last_balance = after_flush;
+ int r = _balance_bluefs_freespace();
+ ceph_assert(r >= 0);
+ }
+
+ // cleanup sync deferred keys
+ for (auto b : deferred_stable) {
+ for (auto& txc : b->txcs) {
+ bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
+ ceph_assert(wt.released.empty()); // only kraken did this
+ string key;
+ get_deferred_key(wt.seq, &key);
+ synct->rm_single_key(PREFIX_DEFERRED, key);
+ }
+ }
+
+ // submit synct synchronously (block and wait for it to commit)
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
+ ceph_assert(r == 0);
+
+ {
+ std::unique_lock m(kv_finalize_lock);
+ if (kv_committing_to_finalize.empty()) {
+ kv_committing_to_finalize.swap(kv_committing);
+ } else {
+ kv_committing_to_finalize.insert(
+ kv_committing_to_finalize.end(),
+ kv_committing.begin(),
+ kv_committing.end());
+ kv_committing.clear();
+ }
+ if (deferred_stable_to_finalize.empty()) {
+ deferred_stable_to_finalize.swap(deferred_stable);
+ } else {
+ deferred_stable_to_finalize.insert(
+ deferred_stable_to_finalize.end(),
+ deferred_stable.begin(),
+ deferred_stable.end());
+ deferred_stable.clear();
+ }
+ kv_finalize_cond.notify_one();
+ }
+
+ if (new_nid_max) {
+ nid_max = new_nid_max;
+ dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+ }
+ if (new_blobid_max) {
+ blobid_max = new_blobid_max;
+ dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
+ }
+
+ {
+ auto finish = mono_clock::now();
+ ceph::timespan dur_flush = after_flush - start;
+ ceph::timespan dur_kv = finish - after_flush;
+ ceph::timespan dur = finish - start;
+ dout(20) << __func__ << " committed " << kv_committing.size()
+ << " cleaned " << deferred_stable.size()
+ << " in " << dur
+ << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
+ << dendl;
+ log_latency("kv_flush",
+ l_bluestore_kv_flush_lat,
+ dur_flush,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("kv_commit",
+ l_bluestore_kv_commit_lat,
+ dur_kv,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("kv_sync",
+ l_bluestore_kv_sync_lat,
+ dur,
+ cct->_conf->bluestore_log_op_age);
+ }
+
+ if (bluefs) {
+ if (!bluefs_extents_reclaiming.empty()) {
+ dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
+ << bluefs_extents_reclaiming << std::dec << dendl;
+ int r = 0;
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev->queue_discard(bluefs_extents_reclaiming);
+ if (r == 0) {
+ goto clear;
+ }
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
+ bdev->discard(p.get_start(), p.get_len());
+ }
+ }
+
+ alloc->release(bluefs_extents_reclaiming);
+clear:
+ bluefs_extents_reclaiming.clear();
+ }
+ }
+
+ l.lock();
+ // previously deferred "done" are now "stable" by virtue of this
+ // commit cycle.
+ deferred_stable_queue.swap(deferred_done);
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ kv_sync_started = false;
+}
+
+void BlueStore::_kv_finalize_thread()
+{
+ deque<TransContext*> kv_committed;
+ deque<DeferredBatch*> deferred_stable;
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock l(kv_finalize_lock);
+ ceph_assert(!kv_finalize_started);
+ kv_finalize_started = true;
+ kv_finalize_cond.notify_all();
+ while (true) {
+ ceph_assert(kv_committed.empty());
+ ceph_assert(deferred_stable.empty());
+ if (kv_committing_to_finalize.empty() &&
+ deferred_stable_to_finalize.empty()) {
+ if (kv_finalize_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ kv_finalize_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ kv_committed.swap(kv_committing_to_finalize);
+ deferred_stable.swap(deferred_stable_to_finalize);
+ l.unlock();
+ dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
+ dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+ auto start = mono_clock::now();
+
+ while (!kv_committed.empty()) {
+ TransContext *txc = kv_committed.front();
+ ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
+ _txc_state_proc(txc);
+ kv_committed.pop_front();
+ }
+
+ for (auto b : deferred_stable) {
+ auto p = b->txcs.begin();
+ while (p != b->txcs.end()) {
+ TransContext *txc = &*p;
+ p = b->txcs.erase(p); // unlink here because
+ _txc_state_proc(txc); // this may destroy txc
+ }
+ delete b;
+ }
+ deferred_stable.clear();
+
+ if (!deferred_aggressive) {
+ if (deferred_queue_size >= deferred_batch_ops.load() ||
+ throttle_deferred_bytes.past_midpoint()) {
+ deferred_try_submit();
+ }
+ }
+
+ // this is as good a place as any ...
+ _reap_collections();
+
+ logger->set(l_bluestore_fragmentation,
+ (uint64_t)(alloc->get_fragmentation() * 1000));
+
+ log_latency("kv_final",
+ l_bluestore_kv_final_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ kv_finalize_started = false;
+}
+
+bluestore_deferred_op_t *BlueStore::_get_deferred_op(
+ TransContext *txc, OnodeRef o)
+{
+ if (!txc->deferred_txn) {
+ txc->deferred_txn = new bluestore_deferred_transaction_t;
+ }
+ txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
+ return &txc->deferred_txn->ops.back();
+}
+
+void BlueStore::_deferred_queue(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
+ deferred_lock.lock();
+ if (!txc->osr->deferred_pending &&
+ !txc->osr->deferred_running) {
+ deferred_queue.push_back(*txc->osr);
+ }
+ if (!txc->osr->deferred_pending) {
+ txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
+ }
+ ++deferred_queue_size;
+ txc->osr->deferred_pending->txcs.push_back(*txc);
+ bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
+ for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
+ const auto& op = *opi;
+ ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
+ bufferlist::const_iterator p = op.data.begin();
+ for (auto e : op.extents) {
+ txc->osr->deferred_pending->prepare_write(
+ cct, wt.seq, e.offset, e.length, p);
+ }
+ }
+ if (deferred_aggressive &&
+ !txc->osr->deferred_running) {
+ _deferred_submit_unlock(txc->osr.get());
+ } else {
+ deferred_lock.unlock();
+ }
+}
+
+void BlueStore::deferred_try_submit()
+{
+ dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
+ << deferred_queue_size << " txcs" << dendl;
+ std::lock_guard l(deferred_lock);
+ vector<OpSequencerRef> osrs;
+ osrs.reserve(deferred_queue.size());
+ for (auto& osr : deferred_queue) {
+ osrs.push_back(&osr);
+ }
+ for (auto& osr : osrs) {
+ if (osr->deferred_pending) {
+ if (!osr->deferred_running) {
+ _deferred_submit_unlock(osr.get());
+ deferred_lock.lock();
+ } else {
+ dout(20) << __func__ << " osr " << osr << " already has running"
+ << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
+ }
+ }
+}
+
+void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
+{
+ dout(10) << __func__ << " osr " << osr
+ << " " << osr->deferred_pending->iomap.size() << " ios pending "
+ << dendl;
+ ceph_assert(osr->deferred_pending);
+ ceph_assert(!osr->deferred_running);
+
+ auto b = osr->deferred_pending;
+ deferred_queue_size -= b->seq_bytes.size();
+ ceph_assert(deferred_queue_size >= 0);
+
+ osr->deferred_running = osr->deferred_pending;
+ osr->deferred_pending = nullptr;
+
+ deferred_lock.unlock();
+
+ for (auto& txc : b->txcs) {
+ txc.log_state_latency(logger, l_bluestore_state_deferred_queued_lat);
+ }
+ uint64_t start = 0, pos = 0;
+ bufferlist bl;
+ auto i = b->iomap.begin();
+ while (true) {
+ if (i == b->iomap.end() || i->first != pos) {
+ if (bl.length()) {
+ dout(20) << __func__ << " write 0x" << std::hex
+ << start << "~" << bl.length()
+ << " crc " << bl.crc32c(-1) << std::dec << dendl;
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ logger->inc(l_bluestore_deferred_write_ops);
+ logger->inc(l_bluestore_deferred_write_bytes, bl.length());
+ int r = bdev->aio_write(start, bl, &b->ioc, false);
+ ceph_assert(r == 0);
+ }
+ }
+ if (i == b->iomap.end()) {
+ break;
+ }
+ start = 0;
+ pos = i->first;
+ bl.clear();
+ }
+ dout(20) << __func__ << " seq " << i->second.seq << " 0x"
+ << std::hex << pos << "~" << i->second.bl.length() << std::dec
+ << dendl;
+ if (!bl.length()) {
+ start = pos;
+ }
+ pos += i->second.bl.length();
+ bl.claim_append(i->second.bl);
+ ++i;
+ }
+
+ bdev->aio_submit(&b->ioc);
+}
+
+struct C_DeferredTrySubmit : public Context {
+ BlueStore *store;
+ C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+ void finish(int r) {
+ store->deferred_try_submit();
+ }
+};
+
+void BlueStore::_deferred_aio_finish(OpSequencer *osr)
+{
+ dout(10) << __func__ << " osr " << osr << dendl;
+ ceph_assert(osr->deferred_running);
+ DeferredBatch *b = osr->deferred_running;
+
+ {
+ std::lock_guard l(deferred_lock);
+ ceph_assert(osr->deferred_running == b);
+ osr->deferred_running = nullptr;
+ if (!osr->deferred_pending) {
+ dout(20) << __func__ << " dequeueing" << dendl;
+ auto q = deferred_queue.iterator_to(*osr);
+ deferred_queue.erase(q);
+ } else if (deferred_aggressive) {
+ dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
+ deferred_finisher.queue(new C_DeferredTrySubmit(this));
+ } else {
+ dout(20) << __func__ << " leaving queued, more pending" << dendl;
+ }
+ }
+
+ {
+ uint64_t costs = 0;
+ {
+ std::lock_guard l2(osr->qlock);
+ for (auto& i : b->txcs) {
+ TransContext *txc = &i;
+ txc->log_state_latency(logger, l_bluestore_state_deferred_aio_wait_lat);
+ txc->state = TransContext::STATE_DEFERRED_CLEANUP;
+ costs += txc->cost;
+ }
+ }
+ throttle_deferred_bytes.put(costs);
+ std::lock_guard l(kv_lock);
+ deferred_done_queue.emplace_back(b);
+ }
+
+ // in the normal case, do not bother waking up the kv thread; it will
+ // catch us on the next commit anyway.
+ if (deferred_aggressive) {
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+}
+
+int BlueStore::_deferred_replay()
+{
+ dout(10) << __func__ << " start" << dendl;
+ int count = 0;
+ int r = 0;
+ CollectionRef ch = _get_collection(coll_t::meta());
+ bool fake_ch = false;
+ if (!ch) {
+ // hmm, replaying initial mkfs?
+ ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
+ fake_ch = true;
+ }
+ OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
+ for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
+ dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
+ << dendl;
+ bluestore_deferred_transaction_t *deferred_txn =
+ new bluestore_deferred_transaction_t;
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(*deferred_txn, p);
+ } catch (buffer::error& e) {
+ derr << __func__ << " failed to decode deferred txn "
+ << pretty_binary_string(it->key()) << dendl;
+ delete deferred_txn;
+ r = -EIO;
+ goto out;
+ }
+ TransContext *txc = _txc_create(ch.get(), osr, nullptr);
+ txc->deferred_txn = deferred_txn;
+ txc->state = TransContext::STATE_KV_DONE;
+ _txc_state_proc(txc);
+ }
+ out:
+ dout(20) << __func__ << " draining osr" << dendl;
+ _osr_register_zombie(osr);
+ _osr_drain_all();
+ if (fake_ch) {
+ new_coll_map.clear();
+ }
+ dout(10) << __func__ << " completed " << count << " events" << dendl;
+ return r;
+}
+
+// ---------------------------
+// transactions
+
+int BlueStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ FUNCTRACE(cct);
+ list<Context *> on_applied, on_commit, on_applied_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &on_applied, &on_commit, &on_applied_sync);
+
+ auto start = mono_clock::now();
+
+ Collection *c = static_cast<Collection*>(ch.get());
+ OpSequencer *osr = c->osr.get();
+ dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
+
+ // prepare
+ TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
+ &on_commit);
+
+ for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
+ txc->bytes += (*p).get_num_bytes();
+ _txc_add_transaction(txc, &(*p));
+ }
+ _txc_calc_cost(txc);
+
+ _txc_write_nodes(txc, txc->t);
+
+ // journal deferred items
+ if (txc->deferred_txn) {
+ txc->deferred_txn->seq = ++deferred_seq;
+ bufferlist bl;
+ encode(*txc->deferred_txn, bl);
+ string key;
+ get_deferred_key(txc->deferred_txn->seq, &key);
+ txc->t->set(PREFIX_DEFERRED, key, bl);
+ }
+
+ _txc_finalize_kv(txc, txc->t);
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ auto tstart = mono_clock::now();
+ throttle_bytes.get(txc->cost);
+ if (txc->deferred_txn) {
+ // ensure we do not block here because of deferred writes
+ if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
+ dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
+ << dendl;
+ ++deferred_aggressive;
+ deferred_try_submit();
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+ throttle_deferred_bytes.get(txc->cost);
+ --deferred_aggressive;
+ }
+ }
+ auto tend = mono_clock::now();
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ logger->inc(l_bluestore_txc);
+
+ // execute (start)
+ _txc_state_proc(txc);
+
+ // we're immediately readable (unlike FileStore)
+ for (auto c : on_applied_sync) {
+ c->complete(0);
+ }
+ if (!on_applied.empty()) {
+ if (c->commit_queue) {
+ c->commit_queue->queue(on_applied);
+ } else {
+ finisher.queue(on_applied);
+ }
+ }
+
+ log_latency("submit_transact",
+ l_bluestore_submit_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("throttle_transact",
+ l_bluestore_throttle_lat,
+ tend - tstart,
+ cct->_conf->bluestore_log_op_age);
+ return 0;
+}
+
+void BlueStore::_txc_aio_submit(TransContext *txc)
+{
+ dout(10) << __func__ << " txc " << txc << dendl;
+ bdev->aio_submit(&txc->ioc);
+}
+
+void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+ Transaction::iterator i = t->begin();
+
+ _dump_transaction<30>(cct, t);
+
+ vector<CollectionRef> cvec(i.colls.size());
+ unsigned j = 0;
+ for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+ ++p, ++j) {
+ cvec[j] = _get_collection(*p);
+ }
+
+ vector<OnodeRef> ovec(i.objects.size());
+
+ for (int pos = 0; i.have_op(); ++pos) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ // no coll or obj
+ if (op->op == Transaction::OP_NOP)
+ continue;
+
+
+ // collection operations
+ CollectionRef &c = cvec[op->cid];
+
+ // initialize osd_pool_id and do a smoke test that all collections belong
+ // to the same pool
+ spg_t pgid;
+ if (!!c ? c->cid.is_pg(&pgid) : false) {
+ ceph_assert(txc->osd_pool_id == META_POOL_ID ||
+ txc->osd_pool_id == pgid.pool());
+ txc->osd_pool_id = pgid.pool();
+ }
+
+ switch (op->op) {
+ case Transaction::OP_RMCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ r = _remove_collection(txc, cid, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ ceph_assert(!c);
+ const coll_t &cid = i.get_cid(op->cid);
+ r = _create_collection(txc, cid, op->split_bits, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ uint32_t bits = op->split_bits;
+ r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ uint32_t type = op->hint_type;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ dout(10) << __func__ << " collection hint objects is a no-op, "
+ << " pg_num " << pg_num << " num_objects " << num_objs
+ << dendl;
+ } else {
+ // Ignore the hint
+ dout(10) << __func__ << " unknown collection hint " << type << dendl;
+ }
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ ceph_abort_msg("not implemented");
+ break;
+ }
+ if (r < 0) {
+ derr << __func__ << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ _dump_transaction<0>(cct, t);
+ ceph_abort_msg("unexpected error");
+ }
+
+ // these operations implicity create the object
+ bool create = false;
+ if (op->op == Transaction::OP_TOUCH ||
+ op->op == Transaction::OP_WRITE ||
+ op->op == Transaction::OP_ZERO) {
+ create = true;
+ }
+
+ // object operations
+ RWLock::WLocker l(c->lock);
+ OnodeRef &o = ovec[op->oid];
+ if (!o) {
+ ghobject_t oid = i.get_oid(op->oid);
+ o = c->get_onode(oid, create);
+ }
+ if (!create && (!o || !o->exists)) {
+ dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+ << i.get_oid(op->oid) << dendl;
+ r = -ENOENT;
+ goto endop;
+ }
+
+ switch (op->op) {
+ case Transaction::OP_TOUCH:
+ r = _touch(txc, c, o);
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(txc, c, o, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(txc, c, o, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ r = _truncate(txc, c, o, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ r = _remove(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ string name = i.decode_string();
+ bufferptr bp;
+ i.decode_bp(bp);
+ r = _setattr(txc, c, o, name, bp);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(txc, c, o, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ string name = i.decode_string();
+ r = _rmattr(txc, c, o, name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ r = _rmattrs(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ no = c->get_onode(noid, true);
+ }
+ r = _clone(txc, c, o, no);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ no = c->get_onode(noid, true);
+ }
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ case Transaction::OP_TRY_RENAME:
+ {
+ ceph_assert(op->cid == op->dest_cid);
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ no = c->get_onode(noid, false);
+ }
+ r = _rename(txc, c, o, no, noid);
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ r = _omap_clear(txc, c, o);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ bufferlist aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(txc, c, o, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ bufferlist keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(txc, c, o, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkey_range(txc, c, o, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(txc, c, o, bl);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ r = _set_alloc_hint(txc, c, o,
+ op->expected_object_size,
+ op->expected_write_size,
+ op->alloc_hint_flags);
+ }
+ break;
+
+ default:
+ derr << __func__ << " bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ endop:
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD ||
+ op->op == Transaction::OP_SETATTR ||
+ op->op == Transaction::OP_SETATTRS ||
+ op->op == Transaction::OP_RMATTR ||
+ op->op == Transaction::OP_OMAP_SETKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+ op->op == Transaction::OP_OMAP_SETHEADER))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from bluestore, misconfigured cluster";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ derr << __func__ << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)"
+ << dendl;
+ derr << msg << dendl;
+ _dump_transaction<0>(cct, t);
+ ceph_abort_msg("unexpected error");
+ }
+ }
+ }
+}
+
+
+
+// -----------------
+// write operations
+
+int BlueStore::_touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ _assign_nid(txc, o);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_pad_zeros(
+ bufferlist *bl, uint64_t *offset,
+ uint64_t chunk_size)
+{
+ auto length = bl->length();
+ dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
+ << " chunk_size 0x" << chunk_size << std::dec << dendl;
+ dout(40) << "before:\n";
+ bl->hexdump(*_dout);
+ *_dout << dendl;
+ // front
+ size_t front_pad = *offset % chunk_size;
+ size_t back_pad = 0;
+ size_t pad_count = 0;
+ if (front_pad) {
+ size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
+ bufferptr z = buffer::create_small_page_aligned(chunk_size);
+ z.zero(0, front_pad, false);
+ pad_count += front_pad;
+ bl->copy(0, front_copy, z.c_str() + front_pad);
+ if (front_copy + front_pad < chunk_size) {
+ back_pad = chunk_size - (length + front_pad);
+ z.zero(front_pad + length, back_pad, false);
+ pad_count += back_pad;
+ }
+ bufferlist old, t;
+ old.swap(*bl);
+ t.substr_of(old, front_copy, length - front_copy);
+ bl->append(z);
+ bl->claim_append(t);
+ *offset -= front_pad;
+ length += pad_count;
+ }
+
+ // back
+ uint64_t end = *offset + length;
+ unsigned back_copy = end % chunk_size;
+ if (back_copy) {
+ ceph_assert(back_pad == 0);
+ back_pad = chunk_size - back_copy;
+ ceph_assert(back_copy <= length);
+ bufferptr tail(chunk_size);
+ bl->copy(length - back_copy, back_copy, tail.c_str());
+ tail.zero(back_copy, back_pad, false);
+ bufferlist old;
+ old.swap(*bl);
+ bl->substr_of(old, 0, length - back_copy);
+ bl->append(tail);
+ length += back_pad;
+ pad_count += back_pad;
+ }
+ dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
+ << back_pad << " on front/back, now 0x" << *offset << "~"
+ << length << std::dec << dendl;
+ dout(40) << "after:\n";
+ bl->hexdump(*_dout);
+ *_dout << dendl;
+ if (pad_count)
+ logger->inc(l_bluestore_write_pad_bytes, pad_count);
+ ceph_assert(bl->length() == length);
+}
+
+void BlueStore::_do_write_small(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ ceph_assert(length < min_alloc_size);
+ uint64_t end_offs = offset + length;
+
+ logger->inc(l_bluestore_write_small);
+ logger->inc(l_bluestore_write_small_bytes, length);
+
+ bufferlist bl;
+ blp.copy(length, bl);
+
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+ uint32_t alloc_len = min_alloc_size;
+ auto offset0 = p2align<uint64_t>(offset, alloc_len);
+
+ bool any_change;
+
+ // search suitable extent in both forward and reverse direction in
+ // [offset - target_max_blob_size, offset + target_max_blob_size] range
+ // then check if blob can be reused via can_reuse_blob func or apply
+ // direct/deferred write (the latter for extents including or higher
+ // than 'offset' only).
+ o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
+
+ // Look for an existing mutable blob we can use.
+ auto begin = o->extent_map.extent_map.begin();
+ auto end = o->extent_map.extent_map.end();
+ auto ep = o->extent_map.seek_lextent(offset);
+ if (ep != begin) {
+ --ep;
+ if (ep->blob_end() <= offset) {
+ ++ep;
+ }
+ }
+ auto prev_ep = ep;
+ if (prev_ep != begin) {
+ --prev_ep;
+ } else {
+ prev_ep = end; // to avoid this extent check as it's a duplicate
+ }
+
+ boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
+ // We don't want to have more blobs than min alloc units fit
+ // into 2 max blobs
+ size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
+ bool above_blob_threshold = false;
+
+ inspected_blobs.reserve(blob_threshold);
+
+ uint64_t max_off = 0;
+ auto start_ep = ep;
+ auto end_ep = ep; // exclusively
+ do {
+ any_change = false;
+
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ BlobRef b = ep->blob;
+ if (!above_blob_threshold) {
+ inspected_blobs.insert(&b->get_blob());
+ above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+ }
+ max_off = ep->logical_end();
+ auto bstart = ep->blob_start();
+
+ dout(20) << __func__ << " considering " << *b
+ << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+ if (bstart >= end_offs) {
+ dout(20) << __func__ << " ignoring distant " << *b << dendl;
+ } else if (!b->get_blob().is_mutable()) {
+ dout(20) << __func__ << " ignoring immutable " << *b << dendl;
+ } else if (ep->logical_offset % min_alloc_size !=
+ ep->blob_offset % min_alloc_size) {
+ dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
+ } else {
+ uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+ // can we pad our head/tail out with zeros?
+ uint64_t head_pad, tail_pad;
+ head_pad = p2phase(offset, chunk_size);
+ tail_pad = p2nphase(end_offs, chunk_size);
+ if (head_pad || tail_pad) {
+ o->extent_map.fault_range(db, offset - head_pad,
+ end_offs - offset + head_pad + tail_pad);
+ }
+ if (head_pad &&
+ o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
+ head_pad = 0;
+ }
+ if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
+ tail_pad = 0;
+ }
+
+ uint64_t b_off = offset - head_pad - bstart;
+ uint64_t b_len = length + head_pad + tail_pad;
+
+ // direct write into unused blocks of an existing mutable blob?
+ if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
+ b->get_blob().get_ondisk_length() >= b_off + b_len &&
+ b->get_blob().is_unused(b_off, b_len) &&
+ b->get_blob().is_allocated(b_off, b_len)) {
+ _apply_padding(head_pad, tail_pad, bl);
+
+ dout(20) << __func__ << " write to unused 0x" << std::hex
+ << b_off << "~" << b_len
+ << " pad 0x" << head_pad << " + 0x" << tail_pad
+ << std::dec << " of mutable " << *b << dendl;
+ _buffer_cache_write(txc, b, b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ if (b_len <= prefer_deferred_size) {
+ dout(20) << __func__ << " deferring small 0x" << std::hex
+ << b_len << std::dec << " unused write via deferred" << dendl;
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ b->get_blob().map(
+ b_off, b_len,
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ op->data = bl;
+ } else {
+ b->get_blob().map_bl(
+ b_off, bl,
+ [&](uint64_t offset, bufferlist& t) {
+ bdev->aio_write(offset, t,
+ &txc->ioc, wctx->buffered);
+ });
+ }
+ }
+ b->dirty_blob().calc_csum(b_off, bl);
+ dout(20) << __func__ << " lex old " << *ep << dendl;
+ Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
+ b,
+ &wctx->old_extents);
+ b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ // read some data to fill out the chunk?
+ uint64_t head_read = p2phase(b_off, chunk_size);
+ uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
+ if ((head_read || tail_read) &&
+ (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
+ head_read + tail_read < min_alloc_size) {
+ b_off -= head_read;
+ b_len += head_read + tail_read;
+
+ } else {
+ head_read = tail_read = 0;
+ }
+
+ // chunk-aligned deferred overwrite?
+ if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
+ b_off % chunk_size == 0 &&
+ b_len % chunk_size == 0 &&
+ b->get_blob().is_allocated(b_off, b_len)) {
+
+ _apply_padding(head_pad, tail_pad, bl);
+
+ dout(20) << __func__ << " reading head 0x" << std::hex << head_read
+ << " and tail 0x" << tail_read << std::dec << dendl;
+ if (head_read) {
+ bufferlist head_bl;
+ int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
+ head_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)head_read);
+ size_t zlen = head_read - r;
+ if (zlen) {
+ head_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ head_bl.claim_append(bl);
+ bl.swap(head_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ if (tail_read) {
+ bufferlist tail_bl;
+ int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
+ tail_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)tail_read);
+ size_t zlen = tail_read - r;
+ if (zlen) {
+ tail_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ bl.claim_append(tail_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ logger->inc(l_bluestore_write_small_pre_read);
+
+ _buffer_cache_write(txc, b, b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ if (b->get_blob().csum_type) {
+ b->dirty_blob().calc_csum(b_off, bl);
+ }
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ int r = b->get_blob().map(
+ b_off, b_len,
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ ceph_assert(r == 0);
+ op->data.claim(bl);
+ dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
+ << b_len << std::dec << " of mutable " << *b
+ << " at " << op->extents << dendl;
+ }
+
+ Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
+ b, &wctx->old_extents);
+ b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ logger->inc(l_bluestore_write_small_deferred);
+ return;
+ }
+ // try to reuse blob if we can
+ if (b->can_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+ ceph_assert(alloc_len == min_alloc_size); // expecting data always
+ // fit into reused blob
+ // Need to check for pending writes desiring to
+ // reuse the same pextent. The rationale is that during GC two chunks
+ // from garbage blobs(compressed?) can share logical space within the same
+ // AU. That's in turn might be caused by unaligned len in clone_range2.
+ // Hence the second write will fail in an attempt to reuse blob at
+ // do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+
+ // we can't reuse pad_head/pad_tail since they might be truncated
+ // due to existent extents
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, chunk_size);
+
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ }
+ }
+ ++ep;
+ end_ep = ep;
+ any_change = true;
+ } // if (ep != end && ep->logical_offset < offset + max_bsize)
+
+ // check extent for reuse in reverse order
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ BlobRef b = prev_ep->blob;
+ if (!above_blob_threshold) {
+ inspected_blobs.insert(&b->get_blob());
+ above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+ }
+ start_ep = prev_ep;
+ auto bstart = prev_ep->blob_start();
+ dout(20) << __func__ << " considering " << *b
+ << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+ if (b->can_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+ ceph_assert(alloc_len == min_alloc_size); // expecting data always
+ // fit into reused blob
+ // Need to check for pending writes desiring to
+ // reuse the same pextent. The rationale is that during GC two chunks
+ // from garbage blobs(compressed?) can share logical space within the same
+ // AU. That's in turn might be caused by unaligned len in clone_range2.
+ // Hence the second write will fail in an attempt to reuse blob at
+ // do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+
+ uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, chunk_size);
+
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ }
+ if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
+ } while (any_change);
+
+ if (above_blob_threshold) {
+ dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
+ << " " << std::hex << min_off << "~" << max_off << std::dec
+ << dendl;
+ ceph_assert(start_ep != end_ep);
+ for (auto ep = start_ep; ep != end_ep; ++ep) {
+ dout(20) << __func__ << " inserting for GC "
+ << std::hex << ep->logical_offset << "~" << ep->length
+ << std::dec << dendl;
+
+ wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
+ }
+ // insert newly written extent to GC
+ wctx->extents_to_gc.union_insert(offset, length);
+ dout(20) << __func__ << " inserting (last) for GC "
+ << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ }
+ // new blob.
+ BlobRef b = c->new_blob();
+ uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, block_size);
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
+ // doesn't match disk one only
+ true);
+
+ return;
+}
+
+void BlueStore::_do_write_big(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " target_blob_size 0x" << wctx->target_blob_size << std::dec
+ << " compress " << (int)wctx->compress
+ << dendl;
+ logger->inc(l_bluestore_write_big);
+ logger->inc(l_bluestore_write_big_bytes, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ while (length > 0) {
+ bool new_blob = false;
+ uint32_t l = std::min(max_bsize, length);
+ BlobRef b;
+ uint32_t b_off = 0;
+
+ //attempting to reuse existing blob
+ if (!wctx->compress) {
+ // look for an existing mutable blob we can reuse
+ auto begin = o->extent_map.extent_map.begin();
+ auto end = o->extent_map.extent_map.end();
+ auto ep = o->extent_map.seek_lextent(offset);
+ auto prev_ep = ep;
+ if (prev_ep != begin) {
+ --prev_ep;
+ } else {
+ prev_ep = end; // to avoid this extent check as it's a duplicate
+ }
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+ // search suitable extent in both forward and reverse direction in
+ // [offset - target_max_blob_size, offset + target_max_blob_size] range
+ // then check if blob can be reused via can_reuse_blob func.
+ bool any_change;
+ do {
+ any_change = false;
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ if (offset >= ep->blob_start() &&
+ ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+ offset - ep->blob_start(),
+ &l)) {
+ b = ep->blob;
+ b_off = offset - ep->blob_start();
+ prev_ep = end; // to avoid check below
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else {
+ ++ep;
+ any_change = true;
+ }
+ }
+
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+ offset - prev_ep->blob_start(),
+ &l)) {
+ b = prev_ep->blob;
+ b_off = offset - prev_ep->blob_start();
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ }
+ } while (b == nullptr && any_change);
+ }
+ if (b == nullptr) {
+ b = c->new_blob();
+ b_off = 0;
+ new_blob = true;
+ }
+
+ bufferlist t;
+ blp.copy(l, t);
+ wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
+ offset += l;
+ length -= l;
+ logger->inc(l_bluestore_write_big_blobs);
+ }
+}
+
+int BlueStore::_do_alloc_write(
+ TransContext *txc,
+ CollectionRef coll,
+ OnodeRef o,
+ WriteContext *wctx)
+{
+ dout(20) << __func__ << " txc " << txc
+ << " " << wctx->writes.size() << " blobs"
+ << dendl;
+ if (wctx->writes.empty()) {
+ return 0;
+ }
+
+ CompressorRef c;
+ double crr = 0;
+ if (wctx->compress) {
+ c = select_option(
+ "compression_algorithm",
+ compressor,
+ [&]() {
+ string val;
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
+ CompressorRef cp = compressor;
+ if (!cp || cp->get_type_name() != val) {
+ cp = Compressor::create(cct, val);
+ if (!cp) {
+ if (_set_compression_alert(false, val.c_str())) {
+ derr << __func__ << " unable to initialize " << val.c_str()
+ << " compressor" << dendl;
+ }
+ }
+ }
+ return boost::optional<CompressorRef>(cp);
+ }
+ return boost::optional<CompressorRef>();
+ }
+ );
+
+ crr = select_option(
+ "compression_required_ratio",
+ cct->_conf->bluestore_compression_required_ratio,
+ [&]() {
+ double val;
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+ return boost::optional<double>(val);
+ }
+ return boost::optional<double>();
+ }
+ );
+ }
+
+ // checksum
+ int64_t csum = csum_type.load();
+ csum = select_option(
+ "csum_type",
+ csum,
+ [&]() {
+ int64_t val;
+ if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+ return boost::optional<int64_t>(val);
+ }
+ return boost::optional<int64_t>();
+ }
+ );
+
+ // compress (as needed) and calc needed space
+ uint64_t need = 0;
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ for (auto& wi : wctx->writes) {
+ if (c && wi.blob_length > min_alloc_size) {
+ auto start = mono_clock::now();
+
+ // compress
+ ceph_assert(wi.b_off == 0);
+ ceph_assert(wi.blob_length == wi.bl.length());
+
+ // FIXME: memory alignment here is bad
+ bufferlist t;
+ int r = c->compress(wi.bl, t);
+ uint64_t want_len_raw = wi.blob_length * crr;
+ uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
+ bool rejected = false;
+ uint64_t compressed_len = t.length();
+ // do an approximate (fast) estimation for resulting blob size
+ // that doesn't take header overhead into account
+ uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
+ if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
+ bluestore_compression_header_t chdr;
+ chdr.type = c->get_type();
+ chdr.length = t.length();
+ encode(chdr, wi.compressed_bl);
+ wi.compressed_bl.claim_append(t);
+
+ compressed_len = wi.compressed_bl.length();
+ result_len = p2roundup(compressed_len, min_alloc_size);
+ if (result_len <= want_len && result_len < wi.blob_length) {
+ // Cool. We compressed at least as much as we were hoping to.
+ // pad out to min_alloc_size
+ wi.compressed_bl.append_zero(result_len - compressed_len);
+ wi.compressed_len = compressed_len;
+ wi.compressed = true;
+ logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
+ dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
+ << " -> 0x" << compressed_len << " => 0x" << result_len
+ << " with " << c->get_type()
+ << std::dec << dendl;
+ txc->statfs_delta.compressed() += compressed_len;
+ txc->statfs_delta.compressed_original() += wi.blob_length;
+ txc->statfs_delta.compressed_allocated() += result_len;
+ logger->inc(l_bluestore_compress_success_count);
+ need += result_len;
+ } else {
+ rejected = true;
+ }
+ } else if (r != 0) {
+ dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " bytes compressed using " << c->get_type_name()
+ << std::dec
+ << " failed with errcode = " << r
+ << ", leaving uncompressed"
+ << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
+ } else {
+ rejected = true;
+ }
+
+ if (rejected) {
+ dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " compressed to 0x" << compressed_len << " -> 0x" << result_len
+ << " with " << c->get_type()
+ << ", which is more than required 0x" << want_len_raw
+ << " -> 0x" << want_len
+ << ", leaving uncompressed"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
+ }
+ log_latency("compress@_do_alloc_write",
+ l_bluestore_compress_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age );
+ } else {
+ need += wi.blob_length;
+ }
+ }
+ PExtentVector prealloc;
+ prealloc.reserve(2 * wctx->writes.size());;
+ int64_t prealloc_left = 0;
+ prealloc_left = alloc->allocate(
+ need, min_alloc_size, need,
+ 0, &prealloc);
+ if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
+ derr << __func__ << " failed to allocate 0x" << std::hex << need
+ << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
+ << " min_alloc_size 0x" << min_alloc_size
+ << " available 0x " << alloc->get_free()
+ << std::dec << dendl;
+ if (prealloc.size()) {
+ alloc->release(prealloc);
+ }
+ return -ENOSPC;
+ }
+
+ dout(20) << __func__ << " prealloc " << prealloc << dendl;
+ auto prealloc_pos = prealloc.begin();
+
+ for (auto& wi : wctx->writes) {
+ BlobRef b = wi.b;
+ bluestore_blob_t& dblob = b->dirty_blob();
+ uint64_t b_off = wi.b_off;
+ bufferlist *l = &wi.bl;
+ uint64_t final_length = wi.blob_length;
+ uint64_t csum_length = wi.blob_length;
+ if (wi.compressed) {
+ final_length = wi.compressed_bl.length();
+ csum_length = final_length;
+ unsigned csum_order = ctz(csum_length);
+ l = &wi.compressed_bl;
+ dblob.set_compressed(wi.blob_length, wi.compressed_len);
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__ << " initialize csum setting for compressed blob " << *b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length
+ << " blob_length 0x" << wi.blob_length
+ << " compressed_length 0x" << wi.compressed_len << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
+ } else if (wi.new_blob) {
+ unsigned csum_order;
+ // initialize newly created blob only
+ ceph_assert(dblob.is_mutable());
+ if (l->length() != wi.blob_length) {
+ // hrm, maybe we could do better here, but let's not bother.
+ dout(20) << __func__ << " forcing csum_order to block_size_order "
+ << block_size_order << dendl;
+ csum_order = block_size_order;
+ } else {
+ csum_order = std::min(wctx->csum_order, ctz(l->length()));
+ }
+ // try to align blob with max_blob_size to improve
+ // its reuse ratio, e.g. in case of reverse write
+ uint32_t suggested_boff =
+ (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
+ if ((suggested_boff % (1 << csum_order)) == 0 &&
+ suggested_boff + final_length <= max_bsize &&
+ suggested_boff > b_off) {
+ dout(20) << __func__ << " forcing blob_offset to 0x"
+ << std::hex << suggested_boff << std::dec << dendl;
+ ceph_assert(suggested_boff >= b_off);
+ csum_length += suggested_boff - b_off;
+ b_off = suggested_boff;
+ }
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__ << " initialize csum setting for new blob " << *b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
+ }
+
+ PExtentVector extents;
+ int64_t left = final_length;
+ while (left > 0) {
+ ceph_assert(prealloc_left > 0);
+ if (prealloc_pos->length <= left) {
+ prealloc_left -= prealloc_pos->length;
+ left -= prealloc_pos->length;
+ txc->statfs_delta.allocated() += prealloc_pos->length;
+ extents.push_back(*prealloc_pos);
+ ++prealloc_pos;
+ } else {
+ extents.emplace_back(prealloc_pos->offset, left);
+ prealloc_pos->offset += left;
+ prealloc_pos->length -= left;
+ prealloc_left -= left;
+ txc->statfs_delta.allocated() += left;
+ left = 0;
+ break;
+ }
+ }
+ for (auto& p : extents) {
+ txc->allocated.insert(p.offset, p.length);
+ }
+ dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
+
+ dout(20) << __func__ << " blob " << *b << dendl;
+ if (dblob.has_csum()) {
+ dblob.calc_csum(b_off, *l);
+ }
+
+ if (wi.mark_unused) {
+ ceph_assert(!dblob.is_compressed());
+ auto b_end = b_off + wi.bl.length();
+ if (b_off) {
+ dblob.add_unused(0, b_off);
+ }
+ uint64_t llen = dblob.get_logical_length();
+ if (b_end < llen) {
+ dblob.add_unused(b_end, llen - b_end);
+ }
+ }
+
+ Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
+ b_off + (wi.b_off0 - wi.b_off),
+ wi.length0,
+ wi.b,
+ nullptr);
+ wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ _buffer_cache_write(txc, wi.b, b_off, wi.bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ // queue io
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ if (l->length() <= prefer_deferred_size.load()) {
+ dout(20) << __func__ << " deferring small 0x" << std::hex
+ << l->length() << std::dec << " write via deferred" << dendl;
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ int r = b->get_blob().map(
+ b_off, l->length(),
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ ceph_assert(r == 0);
+ op->data = *l;
+ logger->inc(l_bluestore_write_small_deferred);
+ } else {
+ b->get_blob().map_bl(
+ b_off, *l,
+ [&](uint64_t offset, bufferlist& t) {
+ bdev->aio_write(offset, t, &txc->ioc, false);
+ });
+ logger->inc(l_bluestore_write_small_new);
+ }
+ }
+ }
+ ceph_assert(prealloc_pos == prealloc.end());
+ ceph_assert(prealloc_left == 0);
+ return 0;
+}
+
+void BlueStore::_wctx_finish(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ WriteContext *wctx,
+ set<SharedBlob*> *maybe_unshared_blobs)
+{
+ auto oep = wctx->old_extents.begin();
+ while (oep != wctx->old_extents.end()) {
+ auto &lo = *oep;
+ oep = wctx->old_extents.erase(oep);
+ dout(20) << __func__ << " lex_old " << lo.e << dendl;
+ BlobRef b = lo.e.blob;
+ const bluestore_blob_t& blob = b->get_blob();
+ if (blob.is_compressed()) {
+ if (lo.blob_empty) {
+ txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
+ }
+ txc->statfs_delta.compressed_original() -= lo.e.length;
+ }
+ auto& r = lo.r;
+ txc->statfs_delta.stored() -= lo.e.length;
+ if (!r.empty()) {
+ dout(20) << __func__ << " blob release " << r << dendl;
+ if (blob.is_shared()) {
+ PExtentVector final;
+ c->load_shared_blob(b->shared_blob);
+ bool unshare = false;
+ bool* unshare_ptr =
+ !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
+ for (auto e : r) {
+ b->shared_blob->put_ref(
+ e.offset, e.length, &final,
+ unshare_ptr);
+ }
+ if (unshare) {
+ ceph_assert(maybe_unshared_blobs);
+ maybe_unshared_blobs->insert(b->shared_blob.get());
+ }
+ dout(20) << __func__ << " shared_blob release " << final
+ << " from " << *b->shared_blob << dendl;
+ txc->write_shared_blob(b->shared_blob);
+ r.clear();
+ r.swap(final);
+ }
+ }
+ // we can't invalidate our logical extents as we drop them because
+ // other lextents (either in our onode or others) may still
+ // reference them. but we can throw out anything that is no
+ // longer allocated. Note that this will leave behind edge bits
+ // that are no longer referenced but not deallocated (until they
+ // age out of the cache naturally).
+ b->discard_unallocated(c.get());
+ for (auto e : r) {
+ dout(20) << __func__ << " release " << e << dendl;
+ txc->released.insert(e.offset, e.length);
+ txc->statfs_delta.allocated() -= e.length;
+ if (blob.is_compressed()) {
+ txc->statfs_delta.compressed_allocated() -= e.length;
+ }
+ }
+ delete &lo;
+ if (b->is_spanning() && !b->is_referenced()) {
+ dout(20) << __func__ << " spanning_blob_map removing empty " << *b
+ << dendl;
+ o->extent_map.spanning_blob_map.erase(b->id);
+ }
+ }
+}
+
+void BlueStore::_do_write_data(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ WriteContext *wctx)
+{
+ uint64_t end = offset + length;
+ bufferlist::iterator p = bl.begin();
+
+ if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
+ (length != min_alloc_size)) {
+ // we fall within the same block
+ _do_write_small(txc, c, o, offset, length, p, wctx);
+ } else {
+ uint64_t head_offset, head_length;
+ uint64_t middle_offset, middle_length;
+ uint64_t tail_offset, tail_length;
+
+ head_offset = offset;
+ head_length = p2nphase(offset, min_alloc_size);
+
+ tail_offset = p2align(end, min_alloc_size);
+ tail_length = p2phase(end, min_alloc_size);
+
+ middle_offset = head_offset + head_length;
+ middle_length = length - head_length - tail_length;
+
+ if (head_length) {
+ _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
+ }
+
+ if (middle_length) {
+ _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
+ }
+
+ if (tail_length) {
+ _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
+ }
+ }
+}
+
+void BlueStore::_choose_write_options(
+ CollectionRef& c,
+ OnodeRef o,
+ uint32_t fadvise_flags,
+ WriteContext *wctx)
+{
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered write" << dendl;
+ wctx->buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_write &&
+ (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered write" << dendl;
+ wctx->buffered = true;
+ }
+
+ // apply basic csum block size
+ wctx->csum_order = block_size_order;
+
+ // compression parameters
+ unsigned alloc_hints = o->onode.alloc_hint_flags;
+ auto cm = select_option(
+ "compression_mode",
+ comp_mode.load(),
+ [&]() {
+ string val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
+ return boost::optional<Compressor::CompressionMode>(
+ Compressor::get_comp_mode_type(val));
+ }
+ return boost::optional<Compressor::CompressionMode>();
+ }
+ );
+
+ wctx->compress = (cm != Compressor::COMP_NONE) &&
+ ((cm == Compressor::COMP_FORCE) ||
+ (cm == Compressor::COMP_AGGRESSIVE &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
+ (cm == Compressor::COMP_PASSIVE &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
+
+ if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
+ (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
+
+ dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
+
+ if (o->onode.expected_write_size) {
+ wctx->csum_order = std::max(min_alloc_size_order,
+ (uint8_t)ctz(o->onode.expected_write_size));
+ } else {
+ wctx->csum_order = min_alloc_size_order;
+ }
+
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
+ "compression_max_blob_size",
+ comp_max_blob_size.load(),
+ [&]() {
+ int64_t val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
+ return boost::optional<uint64_t>((uint64_t)val);
+ }
+ return boost::optional<uint64_t>();
+ }
+ );
+ }
+ } else {
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
+ "compression_min_blob_size",
+ comp_min_blob_size.load(),
+ [&]() {
+ int64_t val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
+ return boost::optional<uint64_t>((uint64_t)val);
+ }
+ return boost::optional<uint64_t>();
+ }
+ );
+ }
+ }
+
+ uint64_t max_bsize = max_blob_size.load();
+ if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
+ wctx->target_blob_size = max_bsize;
+ }
+
+ // set the min blob size floor at 2x the min_alloc_size, or else we
+ // won't be able to allocate a smaller extent for the compressed
+ // data.
+ if (wctx->compress &&
+ wctx->target_blob_size < min_alloc_size * 2) {
+ wctx->target_blob_size = min_alloc_size * 2;
+ }
+
+ dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
+ << " target_blob_size 0x" << std::hex << wctx->target_blob_size
+ << " compress=" << (int)wctx->compress
+ << " buffered=" << (int)wctx->buffered
+ << std::dec << dendl;
+}
+
+int BlueStore::_do_gc(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ const WriteContext& wctx,
+ uint64_t *dirty_start,
+ uint64_t *dirty_end)
+{
+
+ bool dirty_range_updated = false;
+ WriteContext wctx_gc;
+ wctx_gc.fork(wctx); // make a clone for garbage collection
+
+ auto & extents_to_collect = wctx.extents_to_gc;
+ for (auto it = extents_to_collect.begin();
+ it != extents_to_collect.end();
+ ++it) {
+ bufferlist bl;
+ auto offset = (*it).first;
+ auto length = (*it).second;
+ dout(20) << __func__ << " processing " << std::hex
+ << offset << "~" << length << std::dec
+ << dendl;
+ int r = _do_read(c.get(), o, offset, length, bl, 0);
+ ceph_assert(r == (int)length);
+
+ _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
+ logger->inc(l_bluestore_gc_merged, length);
+
+ if (*dirty_start > offset) {
+ *dirty_start = offset;
+ dirty_range_updated = true;
+ }
+
+ if (*dirty_end < offset + length) {
+ *dirty_end = offset + length;
+ dirty_range_updated = true;
+ }
+ }
+ if (dirty_range_updated) {
+ o->extent_map.fault_range(db, *dirty_start, *dirty_end);
+ }
+
+ dout(30) << __func__ << " alloc write" << dendl;
+ int r = _do_alloc_write(txc, c, o, &wctx_gc);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ _wctx_finish(txc, c, o, &wctx_gc);
+ return 0;
+}
+
+int BlueStore::_do_write(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ int r = 0;
+
+ dout(20) << __func__
+ << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length
+ << " - have 0x" << o->onode.size
+ << " (" << std::dec << o->onode.size << ")"
+ << " bytes"
+ << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
+ << dendl;
+ _dump_onode<30>(cct, *o);
+
+ if (length == 0) {
+ return 0;
+ }
+
+ uint64_t end = offset + length;
+
+ GarbageCollector gc(c->store->cct);
+ int64_t benefit = 0;
+ auto dirty_start = offset;
+ auto dirty_end = end;
+
+ WriteContext wctx;
+ _choose_write_options(c, o, fadvise_flags, &wctx);
+ o->extent_map.fault_range(db, offset, length);
+ _do_write_data(txc, c, o, offset, length, bl, &wctx);
+ r = _do_alloc_write(txc, c, o, &wctx);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+
+ if (wctx.extents_to_gc.empty() ||
+ wctx.extents_to_gc.range_start() > offset ||
+ wctx.extents_to_gc.range_end() < offset + length) {
+ benefit = gc.estimate(offset,
+ length,
+ o->extent_map,
+ wctx.old_extents,
+ min_alloc_size);
+ }
+
+ // NB: _wctx_finish() will empty old_extents
+ // so we must do gc estimation before that
+ _wctx_finish(txc, c, o, &wctx);
+ if (end > o->onode.size) {
+ dout(20) << __func__ << " extending size to 0x" << std::hex << end
+ << std::dec << dendl;
+ o->onode.size = end;
+ }
+
+ if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
+ wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
+ dout(20) << __func__
+ << " perform garbage collection for compressed extents, "
+ << "expected benefit = " << benefit << " AUs" << dendl;
+ }
+ if (!wctx.extents_to_gc.empty()) {
+ dout(20) << __func__ << " perform garbage collection" << dendl;
+
+ r = _do_gc(txc, c, o,
+ wctx,
+ &dirty_start, &dirty_end);
+ if (r < 0) {
+ derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+ dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
+ << "~" << dirty_end - dirty_start << std::dec << dendl;
+ }
+ o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
+ o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
+
+ r = 0;
+
+ out:
+ return r;
+}
+
+int BlueStore::_write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+ if (offset + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _assign_nid(txc, o);
+ r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
+ txc->write_onode(o);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+ if (offset + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _assign_nid(txc, o);
+ r = _do_zero(txc, c, o, offset, length);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+
+ _dump_onode<30>(cct, *o);
+
+ WriteContext wctx;
+ o->extent_map.fault_range(db, offset, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
+ o->extent_map.dirty_range(offset, length);
+ _wctx_finish(txc, c, o, &wctx);
+
+ if (length > 0 && offset + length > o->onode.size) {
+ o->onode.size = offset + length;
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ }
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_truncate(
+ TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
+ set<SharedBlob*> *maybe_unshared_blobs)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec << dendl;
+
+ _dump_onode<30>(cct, *o);
+
+ if (offset == o->onode.size)
+ return;
+
+ if (offset < o->onode.size) {
+ WriteContext wctx;
+ uint64_t length = o->onode.size - offset;
+ o->extent_map.fault_range(db, offset, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
+ o->extent_map.dirty_range(offset, length);
+ _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
+
+ // if we have shards past EOF, ask for a reshard
+ if (!o->onode.extent_map_shards.empty() &&
+ o->onode.extent_map_shards.back().offset >= offset) {
+ dout(10) << __func__ << " request reshard past EOF" << dendl;
+ if (offset) {
+ o->extent_map.request_reshard(offset - 1, offset + length);
+ } else {
+ o->extent_map.request_reshard(0, length);
+ }
+ }
+ }
+
+ o->onode.size = offset;
+
+ txc->write_onode(o);
+}
+
+int BlueStore::_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec
+ << dendl;
+ int r = 0;
+ if (offset >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _do_truncate(txc, c, o, offset);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_remove(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o)
+{
+ set<SharedBlob*> maybe_unshared_blobs;
+ bool is_gen = !o->oid.is_no_gen();
+ _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
+ if (o->onode.has_omap()) {
+ o->flush();
+ _do_omap_clear(txc,
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
+ o->onode.nid);
+ }
+ o->exists = false;
+ string key;
+ for (auto &s : o->extent_map.shards) {
+ dout(20) << __func__ << " removing shard 0x" << std::hex
+ << s.shard_info->offset << std::dec << dendl;
+ generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
+ [&](const string& final_key) {
+ txc->t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ }
+ txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
+ txc->note_removed_object(o);
+ o->extent_map.clear();
+ o->onode = bluestore_onode_t();
+ _debug_obj_on_delete(o->oid);
+
+ if (!is_gen || maybe_unshared_blobs.empty()) {
+ return 0;
+ }
+
+ // see if we can unshare blobs still referenced by the head
+ dout(10) << __func__ << " gen and maybe_unshared_blobs "
+ << maybe_unshared_blobs << dendl;
+ ghobject_t nogen = o->oid;
+ nogen.generation = ghobject_t::NO_GEN;
+ OnodeRef h = c->onode_map.lookup(nogen);
+
+ if (!h || !h->exists) {
+ return 0;
+ }
+
+ dout(20) << __func__ << " checking for unshareable blobs on " << h
+ << " " << h->oid << dendl;
+ map<SharedBlob*,bluestore_extent_ref_map_t> expect;
+ for (auto& e : h->extent_map.extent_map) {
+ const bluestore_blob_t& b = e.blob->get_blob();
+ SharedBlob *sb = e.blob->shared_blob.get();
+ if (b.is_shared() &&
+ sb->loaded &&
+ maybe_unshared_blobs.count(sb)) {
+ if (b.is_compressed()) {
+ expect[sb].get(0, b.get_ondisk_length());
+ } else {
+ b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
+ expect[sb].get(off, len);
+ return 0;
+ });
+ }
+ }
+ }
+
+ vector<SharedBlob*> unshared_blobs;
+ unshared_blobs.reserve(maybe_unshared_blobs.size());
+ for (auto& p : expect) {
+ dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
+ if (p.first->persistent->ref_map == p.second) {
+ SharedBlob *sb = p.first;
+ dout(20) << __func__ << " unsharing " << *sb << dendl;
+ unshared_blobs.push_back(sb);
+ txc->unshare_blob(sb);
+ uint64_t sbid = c->make_blob_unshared(sb);
+ string key;
+ get_shared_blob_key(sbid, &key);
+ txc->t->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ }
+
+ if (unshared_blobs.empty()) {
+ return 0;
+ }
+
+ for (auto& e : h->extent_map.extent_map) {
+ const bluestore_blob_t& b = e.blob->get_blob();
+ SharedBlob *sb = e.blob->shared_blob.get();
+ if (b.is_shared() &&
+ std::find(unshared_blobs.begin(), unshared_blobs.end(),
+ sb) != unshared_blobs.end()) {
+ dout(20) << __func__ << " unsharing " << e << dendl;
+ bluestore_blob_t& blob = e.blob->dirty_blob();
+ blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
+ h->extent_map.dirty_range(e.logical_offset, 1);
+ }
+ }
+ txc->write_onode(h);
+
+ return 0;
+}
+
+int BlueStore::_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " onode " << o.get()
+ << " txc "<< txc << dendl;
+
+ auto start_time = mono_clock::now();
+ int r = _do_remove(txc, c, o);
+ log_latency_fn(
+ __func__,
+ l_bluestore_remove_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_op_age,
+ [&](const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " oid =" << o->oid;
+ return ostr.str();
+ }
+ );
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << dendl;
+ int r = 0;
+ if (val.is_partial()) {
+ auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
+ val.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ } else {
+ auto& b = o->onode.attrs[name.c_str()] = val;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << dendl;
+ int r = 0;
+ for (map<string,bufferptr>::const_iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ if (p->second.is_partial()) {
+ auto& b = o->onode.attrs[p->first.c_str()] =
+ bufferptr(p->second.c_str(), p->second.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ } else {
+ auto& b = o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << " = " << r << dendl;
+ return r;
+}
+
+
+int BlueStore::_rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << dendl;
+ int r = 0;
+ auto it = o->onode.attrs.find(name.c_str());
+ if (it == o->onode.attrs.end())
+ goto out;
+
+ o->onode.attrs.erase(it);
+ txc->write_onode(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+
+ if (o->onode.attrs.empty())
+ goto out;
+
+ o->onode.attrs.clear();
+ txc->write_onode(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix,
+ uint64_t id)
+{
+ string prefix, tail;
+ get_omap_header(id, &prefix);
+ get_omap_tail(id, &tail);
+ txc->t->rm_range_keys(omap_prefix, prefix, tail);
+ txc->t->rmkey(omap_prefix, tail);
+ dout(20) << __func__ << " remove range start: "
+ << pretty_binary_string(prefix) << " end: "
+ << pretty_binary_string(tail) << dendl;
+}
+
+int BlueStore::_omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ if (o->onode.has_omap()) {
+ o->flush();
+ _do_omap_clear(txc,
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP,
+ o->onode.nid);
+ o->onode.clear_omap_flag();
+ txc->write_onode(o);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist &bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ auto p = bl.cbegin();
+ __u32 num;
+ if (!o->onode.has_omap()) {
+ o->onode.set_omap_flag();
+ if (o->oid.is_pgmeta()) {
+ o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
+ }
+ txc->write_onode(o);
+
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ string key_tail;
+ bufferlist tail;
+ get_omap_tail(o->onode.nid, &key_tail);
+ txc->t->set(prefix, key_tail, tail);
+ } else {
+ txc->note_modified_object(o);
+ }
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ string final_key;
+ _key_encode_u64(o->onode.nid, &final_key);
+ final_key.push_back('.');
+ decode(num, p);
+ while (num--) {
+ string key;
+ bufferlist value;
+ decode(key, p);
+ decode(value, p);
+ final_key.resize(9); // keep prefix
+ final_key += key;
+ dout(20) << __func__ << " " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->set(prefix, final_key, value);
+ }
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ string key;
+ if (!o->onode.has_omap()) {
+ o->onode.set_omap_flag();
+ if (o->oid.is_pgmeta()) {
+ o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
+ }
+ txc->write_onode(o);
+
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ string key_tail;
+ bufferlist tail;
+ get_omap_tail(o->onode.nid, &key_tail);
+ txc->t->set(prefix, key_tail, tail);
+ } else {
+ txc->note_modified_object(o);
+ }
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ get_omap_header(o->onode.nid, &key);
+ txc->t->set(prefix, key, bl);
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ auto p = bl.cbegin();
+ __u32 num;
+ string final_key;
+
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ {
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ _key_encode_u64(o->onode.nid, &final_key);
+ final_key.push_back('.');
+ decode(num, p);
+ while (num--) {
+ string key;
+ decode(key, p);
+ final_key.resize(9); // keep prefix
+ final_key += key;
+ dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->rmkey(prefix, final_key);
+ }
+ }
+ txc->note_modified_object(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ string key_first, key_last;
+ int r = 0;
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ {
+ const string& prefix =
+ o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ o->flush();
+ get_omap_key(o->onode.nid, first, &key_first);
+ get_omap_key(o->onode.nid, last, &key_last);
+ txc->t->rm_range_keys(prefix, key_first, key_last);
+ dout(20) << __func__ << " remove range start: "
+ << pretty_binary_string(key_first) << " end: "
+ << pretty_binary_string(key_last) << dendl;
+ }
+ txc->note_modified_object(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_set_alloc_hint(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+ << dendl;
+ int r = 0;
+ o->onode.expected_object_size = expected_object_size;
+ o->onode.expected_write_size = expected_write_size;
+ o->onode.alloc_hint_flags = flags;
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << dendl;
+ int r = 0;
+ if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+ derr << __func__ << " mismatched hash on " << oldo->oid
+ << " and " << newo->oid << dendl;
+ return -EINVAL;
+ }
+
+ _assign_nid(txc, newo);
+
+ // clone data
+ oldo->flush();
+ _do_truncate(txc, c, newo, 0);
+ if (cct->_conf->bluestore_clone_cow) {
+ _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
+ } else {
+ bufferlist bl;
+ r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+ r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+ }
+
+ // clone attrs
+ newo->onode.attrs = oldo->onode.attrs;
+
+ // clone omap
+ if (newo->onode.has_omap()) {
+ dout(20) << __func__ << " clearing old omap data" << dendl;
+ newo->flush();
+ _do_omap_clear(txc,
+ newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP
+ : PREFIX_OMAP,
+ newo->onode.nid);
+ newo->onode.clear_omap_flag();
+ }
+ if (oldo->onode.has_omap()) {
+ dout(20) << __func__ << " copying omap data" << dendl;
+ newo->onode.set_omap_flag();
+ if (newo->oid.is_pgmeta()) {
+ newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP;
+ }
+ const string& prefix =
+ newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP;
+ KeyValueDB::Iterator it = db->get_iterator(prefix);
+ string head, tail;
+ get_omap_header(oldo->onode.nid, &head);
+ get_omap_tail(oldo->onode.nid, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ dout(30) << __func__ << " got header/data "
+ << pretty_binary_string(it->key()) << dendl;
+ string key;
+ rewrite_omap_key(newo->onode.nid, it->key(), &key);
+ txc->t->set(prefix, key, it->value());
+ }
+ it->next();
+ }
+ string new_tail;
+ bufferlist new_tail_value;
+ get_omap_tail(newo->onode.nid, &new_tail);
+ txc->t->set(prefix, new_tail, new_tail_value);
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_clone_range(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff,
+ uint64_t length,
+ uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid
+ << " 0x" << std::hex << srcoff << "~" << length << " -> "
+ << " 0x" << dstoff << "~" << length << std::dec << dendl;
+ oldo->extent_map.fault_range(db, srcoff, length);
+ newo->extent_map.fault_range(db, dstoff, length);
+ _dump_onode<30>(cct, *oldo);
+ _dump_onode<30>(cct, *newo);
+
+ oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
+ _dump_onode<30>(cct, *oldo);
+ _dump_onode<30>(cct, *newo);
+ return 0;
+}
+
+int BlueStore::_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+ << " to offset 0x" << dstoff << std::dec << dendl;
+ int r = 0;
+
+ if (srcoff + length >= OBJECT_MAX_SIZE ||
+ dstoff + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ goto out;
+ }
+ if (srcoff + length > oldo->onode.size) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ _assign_nid(txc, newo);
+
+ if (length > 0) {
+ if (cct->_conf->bluestore_clone_cow) {
+ _do_zero(txc, c, newo, dstoff, length);
+ _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
+ } else {
+ bufferlist bl;
+ r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
+ if (r < 0)
+ goto out;
+ r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
+ if (r < 0)
+ goto out;
+ }
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+ << " to offset 0x" << dstoff << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << new_oid << dendl;
+ int r;
+ ghobject_t old_oid = oldo->oid;
+ mempool::bluestore_cache_meta::string new_okey;
+
+ if (newo) {
+ if (newo->exists) {
+ r = -EEXIST;
+ goto out;
+ }
+ ceph_assert(txc->onodes.count(newo) == 0);
+ }
+
+ txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
+
+ // rewrite shards
+ {
+ oldo->extent_map.fault_range(db, 0, oldo->onode.size);
+ get_object_key(cct, new_oid, &new_okey);
+ string key;
+ for (auto &s : oldo->extent_map.shards) {
+ generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
+ [&](const string& final_key) {
+ txc->t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ s.dirty = true;
+ }
+ }
+
+ newo = oldo;
+ txc->write_onode(newo);
+
+ // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
+ // Onode in the old slot
+ c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
+ r = 0;
+
+ // hold a ref to new Onode in old name position, to ensure we don't drop
+ // it from the cache before this txc commits (or else someone may come along
+ // and read newo's metadata via the old name).
+ txc->note_modified_object(oldo);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+ << new_oid << " = " << r << dendl;
+ return r;
+}
+
+// collections
+
+int BlueStore::_create_collection(
+ TransContext *txc,
+ const coll_t &cid,
+ unsigned bits,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+ int r;
+ bufferlist bl;
+
+ {
+ RWLock::WLocker l(coll_lock);
+ if (*c) {
+ r = -EEXIST;
+ goto out;
+ }
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ *c = p->second;
+ (*c)->cnode.bits = bits;
+ coll_map[cid] = *c;
+ new_coll_map.erase(p);
+ }
+ encode((*c)->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(cid), bl);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << dendl;
+ int r;
+
+ (*c)->flush_all_but_last();
+ {
+ RWLock::WLocker l(coll_lock);
+ if (!*c) {
+ r = -ENOENT;
+ goto out;
+ }
+ size_t nonexistent_count = 0;
+ ceph_assert((*c)->exists);
+ if ((*c)->onode_map.map_any([&](OnodeRef o) {
+ if (o->exists) {
+ dout(1) << __func__ << " " << o->oid << " " << o
+ << " exists in onode_map" << dendl;
+ return true;
+ }
+ ++nonexistent_count;
+ return false;
+ })) {
+ r = -ENOTEMPTY;
+ goto out;
+ }
+
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ // Enumerate onodes in db, up to nonexistent_count + 1
+ // then check if all of them are marked as non-existent.
+ // Bypass the check if (next != ghobject_t::get_max())
+ r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+ nonexistent_count + 1, false, &ls, &next);
+ if (r >= 0) {
+ // If true mean collecton has more objects than nonexistent_count,
+ // so bypass check.
+ bool exists = (!next.is_max());
+ for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+ dout(10) << __func__ << " oid " << *it << dendl;
+ auto onode = (*c)->onode_map.lookup(*it);
+ exists = !onode || onode->exists;
+ if (exists) {
+ dout(1) << __func__ << " " << *it
+ << " exists in db, "
+ << (!onode ? "not present in ram" : "present in ram")
+ << dendl;
+ }
+ }
+ if (!exists) {
+ _do_remove_collection(txc, c);
+ r = 0;
+ } else {
+ dout(10) << __func__ << " " << cid
+ << " is non-empty" << dendl;
+ r = -ENOTEMPTY;
+ }
+ }
+ }
+
+ out:
+ dout(10) << __func__ << " " << cid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_remove_collection(TransContext *txc,
+ CollectionRef *c)
+{
+ coll_map.erase((*c)->cid);
+ txc->removed_collections.push_back(*c);
+ (*c)->exists = false;
+ _osr_register_zombie((*c)->osr.get());
+ txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
+ c->reset();
+}
+
+int BlueStore::_split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem)
+{
+ dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ RWLock::WLocker l(c->lock);
+ RWLock::WLocker l2(d->lock);
+ int r;
+
+ // flush all previous deferred writes on this sequencer. this is a bit
+ // heavyweight, but we need to make sure all deferred writes complete
+ // before we split as the new collection's sequencer may need to order
+ // this after those writes, and we don't bother with the complexity of
+ // moving those TransContexts over to the new osr.
+ _osr_drain_preceding(txc);
+
+ // move any cached items (onodes and referenced shared blobs) that will
+ // belong to the child collection post-split. leave everything else behind.
+ // this may include things that don't strictly belong to the now-smaller
+ // parent split, but the OSD will always send us a split for every new
+ // child.
+
+ spg_t pgid, dest_pgid;
+ bool is_pg = c->cid.is_pg(&pgid);
+ ceph_assert(is_pg);
+ is_pg = d->cid.is_pg(&dest_pgid);
+ ceph_assert(is_pg);
+
+ // the destination should initially be empty.
+ ceph_assert(d->onode_map.empty());
+ ceph_assert(d->shared_blob_set.empty());
+ ceph_assert(d->cnode.bits == bits);
+
+ c->split_cache(d.get());
+
+ // adjust bits. note that this will be redundant for all but the first
+ // split call for this parent (first child).
+ c->cnode.bits = bits;
+ ceph_assert(d->cnode.bits == bits);
+ r = 0;
+
+ bufferlist bl;
+ encode(c->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
+
+ dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_merge_collection(
+ TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits)
+{
+ dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
+ << " bits " << bits << dendl;
+ RWLock::WLocker l((*c)->lock);
+ RWLock::WLocker l2(d->lock);
+ int r;
+
+ coll_t cid = (*c)->cid;
+
+ // flush all previous deferred writes on the source collection to ensure
+ // that all deferred writes complete before we merge as the target collection's
+ // sequencer may need to order new ops after those writes.
+
+ _osr_drain((*c)->osr.get());
+
+ // move any cached items (onodes and referenced shared blobs) that will
+ // belong to the child collection post-split. leave everything else behind.
+ // this may include things that don't strictly belong to the now-smaller
+ // parent split, but the OSD will always send us a split for every new
+ // child.
+
+ spg_t pgid, dest_pgid;
+ bool is_pg = cid.is_pg(&pgid);
+ ceph_assert(is_pg);
+ is_pg = d->cid.is_pg(&dest_pgid);
+ ceph_assert(is_pg);
+
+ // adjust bits. note that this will be redundant for all but the first
+ // merge call for the parent/target.
+ d->cnode.bits = bits;
+
+ // behavior depends on target (d) bits, so this after that is updated.
+ (*c)->split_cache(d.get());
+
+ // remove source collection
+ {
+ RWLock::WLocker l3(coll_lock);
+ _do_remove_collection(txc, c);
+ }
+
+ r = 0;
+
+ bufferlist bl;
+ encode(d->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
+
+ dout(10) << __func__ << " " << cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::log_latency(
+ const char* name,
+ int idx,
+ const ceph::timespan& l,
+ double lat_threshold,
+ const char* info) const
+{
+ logger->tinc(idx, l);
+ if (lat_threshold > 0.0 &&
+ l >= make_timespan(lat_threshold)) {
+ dout(0) << __func__ << " slow operation observed for " << name
+ << ", latency = " << l
+ << info
+ << dendl;
+ }
+}
+
+void BlueStore::log_latency_fn(
+ const char* name,
+ int idx,
+ const ceph::timespan& l,
+ double lat_threshold,
+ std::function<string (const ceph::timespan& lat)> fn) const
+{
+ logger->tinc(idx, l);
+ if (lat_threshold > 0.0 &&
+ l >= make_timespan(lat_threshold)) {
+ dout(0) << __func__ << " slow operation observed for " << name
+ << ", latency = " << l
+ << fn(l)
+ << dendl;
+ }
+}
+
+
+// DB key value Histogram
+#define KEY_SLAB 32
+#define VALUE_SLAB 64
+
+const string prefix_onode = "o";
+const string prefix_onode_shard = "x";
+const string prefix_other = "Z";
+
+int BlueStore::DBHistogram::get_key_slab(size_t sz)
+{
+ return (sz/KEY_SLAB);
+}
+
+string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
+{
+ int lower_bound = slab * KEY_SLAB;
+ int upper_bound = (slab + 1) * KEY_SLAB;
+ string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
+ return ret;
+}
+
+int BlueStore::DBHistogram::get_value_slab(size_t sz)
+{
+ return (sz/VALUE_SLAB);
+}
+
+string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
+{
+ int lower_bound = slab * VALUE_SLAB;
+ int upper_bound = (slab + 1) * VALUE_SLAB;
+ string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
+ return ret;
+}
+
+void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
+ const string &prefix, size_t key_size, size_t value_size)
+{
+ uint32_t key_slab = get_key_slab(key_size);
+ uint32_t value_slab = get_value_slab(value_size);
+ key_hist[prefix][key_slab].count++;
+ key_hist[prefix][key_slab].max_len =
+ std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
+ key_hist[prefix][key_slab].val_map[value_slab].count++;
+ key_hist[prefix][key_slab].val_map[value_slab].max_len =
+ std::max<size_t>(value_size,
+ key_hist[prefix][key_slab].val_map[value_slab].max_len);
+}
+
+void BlueStore::DBHistogram::dump(Formatter *f)
+{
+ f->open_object_section("rocksdb_value_distribution");
+ for (auto i : value_hist) {
+ f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
+ }
+ f->close_section();
+
+ f->open_object_section("rocksdb_key_value_histogram");
+ for (auto i : key_hist) {
+ f->dump_string("prefix", i.first);
+ f->open_object_section("key_hist");
+ for ( auto k : i.second) {
+ f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
+ f->dump_unsigned("max_len", k.second.max_len);
+ f->open_object_section("value_hist");
+ for ( auto j : k.second.val_map) {
+ f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
+ f->dump_unsigned("max_len", j.second.max_len);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+//Itrerates through the db and collects the stats
+void BlueStore::generate_db_histogram(Formatter *f)
+{
+ //globals
+ uint64_t num_onodes = 0;
+ uint64_t num_shards = 0;
+ uint64_t num_super = 0;
+ uint64_t num_coll = 0;
+ uint64_t num_omap = 0;
+ uint64_t num_pgmeta_omap = 0;
+ uint64_t num_deferred = 0;
+ uint64_t num_alloc = 0;
+ uint64_t num_stat = 0;
+ uint64_t num_others = 0;
+ uint64_t num_shared_shards = 0;
+ size_t max_key_size =0, max_value_size = 0;
+ uint64_t total_key_size = 0, total_value_size = 0;
+ size_t key_size = 0, value_size = 0;
+ DBHistogram hist;
+
+ auto start = coarse_mono_clock::now();
+
+ KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+ iter->seek_to_first();
+ while (iter->valid()) {
+ dout(30) << __func__ << " Key: " << iter->key() << dendl;
+ key_size = iter->key_size();
+ value_size = iter->value_size();
+ hist.value_hist[hist.get_value_slab(value_size)]++;
+ max_key_size = std::max(max_key_size, key_size);
+ max_value_size = std::max(max_value_size, value_size);
+ total_key_size += key_size;
+ total_value_size += value_size;
+
+ pair<string,string> key(iter->raw_key());
+
+ if (key.first == PREFIX_SUPER) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
+ num_super++;
+ } else if (key.first == PREFIX_STAT) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
+ num_stat++;
+ } else if (key.first == PREFIX_COLL) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
+ num_coll++;
+ } else if (key.first == PREFIX_OBJ) {
+ if (key.second.back() == ONODE_KEY_SUFFIX) {
+ hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
+ num_onodes++;
+ } else {
+ hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
+ num_shards++;
+ }
+ } else if (key.first == PREFIX_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PGMETA_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
+ num_pgmeta_omap++;
+ } else if (key.first == PREFIX_DEFERRED) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
+ num_deferred++;
+ } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
+ num_alloc++;
+ } else if (key.first == PREFIX_SHARED_BLOB) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
+ num_shared_shards++;
+ } else {
+ hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
+ num_others++;
+ }
+ iter->next();
+ }
+
+ ceph::timespan duration = coarse_mono_clock::now() - start;
+ f->open_object_section("rocksdb_key_value_stats");
+ f->dump_unsigned("num_onodes", num_onodes);
+ f->dump_unsigned("num_shards", num_shards);
+ f->dump_unsigned("num_super", num_super);
+ f->dump_unsigned("num_coll", num_coll);
+ f->dump_unsigned("num_omap", num_omap);
+ f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
+ f->dump_unsigned("num_deferred", num_deferred);
+ f->dump_unsigned("num_alloc", num_alloc);
+ f->dump_unsigned("num_stat", num_stat);
+ f->dump_unsigned("num_shared_shards", num_shared_shards);
+ f->dump_unsigned("num_others", num_others);
+ f->dump_unsigned("max_key_size", max_key_size);
+ f->dump_unsigned("max_value_size", max_value_size);
+ f->dump_unsigned("total_key_size", total_key_size);
+ f->dump_unsigned("total_value_size", total_value_size);
+ f->close_section();
+
+ hist.dump(f);
+
+ dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
+
+}
+
+void BlueStore::_flush_cache()
+{
+ dout(10) << __func__ << dendl;
+ for (auto i : cache_shards) {
+ i->trim_all();
+ ceph_assert(i->empty());
+ }
+ for (auto& p : coll_map) {
+ if (!p.second->onode_map.empty()) {
+ derr << __func__ << " stray onodes on " << p.first << dendl;
+ p.second->onode_map.dump<0>(cct);
+ }
+ if (!p.second->shared_blob_set.empty()) {
+ derr << __func__ << " stray shared blobs on " << p.first << dendl;
+ p.second->shared_blob_set.dump<0>(cct);
+ }
+ ceph_assert(p.second->onode_map.empty());
+ ceph_assert(p.second->shared_blob_set.empty());
+ }
+ coll_map.clear();
+}
+
+// For external caller.
+// We use a best-effort policy instead, e.g.,
+// we don't care if there are still some pinned onodes/data in the cache
+// after this command is completed.
+int BlueStore::flush_cache(ostream *os)
+{
+ dout(10) << __func__ << dendl;
+ for (auto i : cache_shards) {
+ i->trim_all();
+ }
+
+ return 0;
+}
+
+void BlueStore::_apply_padding(uint64_t head_pad,
+ uint64_t tail_pad,
+ bufferlist& padded)
+{
+ if (head_pad) {
+ padded.prepend_zero(head_pad);
+ }
+ if (tail_pad) {
+ padded.append_zero(tail_pad);
+ }
+ if (head_pad || tail_pad) {
+ dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
+ << " tail 0x" << tail_pad << std::dec << dendl;
+ logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
+ }
+}
+
+void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
+{
+ // finalize extent_map shards
+ o->extent_map.update(txn, false);
+ if (o->extent_map.needs_reshard()) {
+ o->extent_map.reshard(db, txn);
+ o->extent_map.update(txn, true);
+ if (o->extent_map.needs_reshard()) {
+ dout(20) << __func__ << " warning: still wants reshard, check options?"
+ << dendl;
+ o->extent_map.clear_needs_reshard();
+ }
+ logger->inc(l_bluestore_onode_reshard);
+ }
+
+ // bound encode
+ size_t bound = 0;
+ denc(o->onode, bound);
+ o->extent_map.bound_encode_spanning_blobs(bound);
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, bound);
+ }
+
+ // encode
+ bufferlist bl;
+ unsigned onode_part, blob_part, extent_part;
+ {
+ auto p = bl.get_contiguous_appender(bound, true);
+ denc(o->onode, p);
+ onode_part = p.get_logical_offset();
+ o->extent_map.encode_spanning_blobs(p);
+ blob_part = p.get_logical_offset() - onode_part;
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, p);
+ }
+ extent_part = p.get_logical_offset() - onode_part - blob_part;
+ }
+
+ dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
+ << " (" << onode_part << " bytes onode + "
+ << blob_part << " bytes spanning blobs + "
+ << extent_part << " bytes inline extents)"
+ << dendl;
+
+
+ txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+}
+
+void BlueStore::_log_alerts(osd_alert_list_t& alerts)
+{
+ std::lock_guard l(qlock);
+
+ if (!disk_size_mismatch_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_DISK_SIZE_MISMATCH",
+ disk_size_mismatch_alert);
+ }
+ if (!legacy_statfs_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_LEGACY_STATFS",
+ legacy_statfs_alert);
+ }
+ if (!spillover_alert.empty() &&
+ cct->_conf->bluestore_warn_on_bluefs_spillover) {
+ alerts.emplace(
+ "BLUEFS_SPILLOVER",
+ spillover_alert);
+ }
+ string s0(failed_cmode);
+
+ if (!failed_compressors.empty()) {
+ if (!s0.empty()) {
+ s0 += ", ";
+ }
+ s0 += "unable to load:";
+ bool first = true;
+ for (auto& s : failed_compressors) {
+ if (first) {
+ first = false;
+ } else {
+ s0 += ", ";
+ }
+ s0 += s;
+ }
+ alerts.emplace(
+ "BLUESTORE_NO_COMPRESSION",
+ s0);
+ }
+}
+
+// ===========================================
+// BlueStoreRepairer
+
+size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
+ const interval_set<uint64_t>& extents)
+{
+ ceph_assert(granularity); // initialized
+ // can't call for the second time
+ ceph_assert(!was_filtered_out);
+ ceph_assert(collections_bfs.size() == objects_bfs.size());
+
+ uint64_t prev_pos = 0;
+ uint64_t npos = collections_bfs.size();
+
+ bloom_vector collections_reduced;
+ bloom_vector objects_reduced;
+
+ for (auto e : extents) {
+ if (e.second == 0) {
+ continue;
+ }
+ uint64_t pos = max(e.first / granularity, prev_pos);
+ uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
+ while (pos != npos && pos < end_pos) {
+ ceph_assert( collections_bfs[pos].element_count() ==
+ objects_bfs[pos].element_count());
+ if (collections_bfs[pos].element_count()) {
+ collections_reduced.push_back(std::move(collections_bfs[pos]));
+ objects_reduced.push_back(std::move(objects_bfs[pos]));
+ }
+ ++pos;
+ }
+ prev_pos = end_pos;
+ }
+ collections_reduced.swap(collections_bfs);
+ objects_reduced.swap(objects_bfs);
+ was_filtered_out = true;
+ return collections_bfs.size();
+}
+
+bool BlueStoreRepairer::remove_key(KeyValueDB *db,
+ const string& prefix,
+ const string& key)
+{
+ if (!remove_key_txn) {
+ remove_key_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ remove_key_txn->rmkey(prefix, key);
+
+ return true;
+}
+
+bool BlueStoreRepairer::fix_shared_blob(
+ KeyValueDB *db,
+ uint64_t sbid,
+ const bufferlist* bl)
+{
+ KeyValueDB::Transaction txn;
+ if (fix_misreferences_txn) { // reuse this txn
+ txn = fix_misreferences_txn;
+ } else {
+ if (!fix_shared_blob_txn) {
+ fix_shared_blob_txn = db->get_transaction();
+ }
+ txn = fix_shared_blob_txn;
+ }
+ string key;
+ get_shared_blob_key(sbid, &key);
+
+ ++to_repair_cnt;
+ if (bl) {
+ txn->set(PREFIX_SHARED_BLOB, key, *bl);
+ } else {
+ txn->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ return true;
+}
+
+bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
+ const string& key,
+ const store_statfs_t& new_statfs)
+{
+ if (!fix_statfs_txn) {
+ fix_statfs_txn = db->get_transaction();
+ }
+ BlueStore::volatile_statfs vstatfs;
+ vstatfs = new_statfs;
+ bufferlist bl;
+ vstatfs.encode(bl);
+ ++to_repair_cnt;
+ fix_statfs_txn->set(PREFIX_STAT, key, bl);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ if (!fix_fm_leaked_txn) {
+ fix_fm_leaked_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->release(offset, len, fix_fm_leaked_txn);
+ return true;
+}
+bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ if (!fix_fm_false_free_txn) {
+ fix_fm_false_free_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->allocate(offset, len, fix_fm_false_free_txn);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
+{
+ // this is just a stub to count num of repairs properly,
+ // actual repair happens in BlueStore::_close_db_and_around()
+ // while doing _sync_bluefs_and_fm
+ ++out_of_sync_flag;
+ ++to_repair_cnt;
+ return true;
+}
+KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db)
+{
+ if (!fix_onode_txn) {
+ fix_onode_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ return fix_onode_txn;
+}
+
+bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
+{
+ if (misreferenced_extents.size()) {
+ size_t n = space_usage_tracker.filter_out(misreferenced_extents);
+ ceph_assert(n > 0);
+ if (!fix_misreferences_txn) {
+ fix_misreferences_txn = db->get_transaction();
+ }
+ return true;
+ }
+ return false;
+}
+
+unsigned BlueStoreRepairer::apply(KeyValueDB* db)
+{
+ if (fix_fm_leaked_txn) {
+ db->submit_transaction_sync(fix_fm_leaked_txn);
+ fix_fm_leaked_txn = nullptr;
+ }
+ if (fix_fm_false_free_txn) {
+ db->submit_transaction_sync(fix_fm_false_free_txn);
+ fix_fm_false_free_txn = nullptr;
+ }
+ if (remove_key_txn) {
+ db->submit_transaction_sync(remove_key_txn);
+ remove_key_txn = nullptr;
+ }
+ if (fix_misreferences_txn) {
+ db->submit_transaction_sync(fix_misreferences_txn);
+ fix_misreferences_txn = nullptr;
+ }
+ if (fix_onode_txn) {
+ db->submit_transaction_sync(fix_onode_txn);
+ fix_onode_txn = nullptr;
+ }
+ if (fix_shared_blob_txn) {
+ db->submit_transaction_sync(fix_shared_blob_txn);
+ fix_shared_blob_txn = nullptr;
+ }
+
+ if (fix_statfs_txn) {
+ db->submit_transaction_sync(fix_statfs_txn);
+ fix_statfs_txn = nullptr;
+ }
+ unsigned repaired = to_repair_cnt;
+ to_repair_cnt = 0;
+ return repaired;
+}
+
+// =======================================================
+// RocksDBBlueFSVolumeSelector
+
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+ ceph_assert(h != nullptr);
+ uint64_t hint = reinterpret_cast<uint64_t>(h);
+ uint8_t res;
+ switch (hint) {
+ case LEVEL_SLOW:
+ res = BlueFS::BDEV_SLOW;
+ if (db_avail4slow > 0) {
+ // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data
+ // - observed maximum spillovers
+ uint64_t max_db_use = 0; // max db usage we potentially observed
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+ // this could go to db hence using it in the estimation
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ uint64_t avail = min(
+ db_avail4slow,
+ max_db_use < db_total ? db_total - max_db_use : 0);
+
+ // considering current DB dev usage for SLOW data
+ if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+ res = BlueFS::BDEV_DB;
+ }
+ }
+ break;
+ case LEVEL_WAL:
+ res = BlueFS::BDEV_WAL;
+ break;
+ case LEVEL_DB:
+ default:
+ res = BlueFS::BDEV_DB;
+ break;
+ }
+ return res;
+}
+
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
+ res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
+}
+
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
+ uint8_t res = LEVEL_DB;
+ if (dirname.length() > 5) {
+ // the "db.slow" and "db.wal" directory names are hard-coded at
+ // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow")) {
+ res = LEVEL_SLOW;
+ }
+ else if (boost::algorithm::ends_with(dirname, ".wal")) {
+ res = LEVEL_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
+
+void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
+ auto max_x = per_level_per_dev_usage.get_max_x();
+ auto max_y = per_level_per_dev_usage.get_max_y();
+ sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
+ << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
+ << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
+ << ", db_avail:" << db_avail4slow << std::endl
+ << "Usage matrix:" << std::endl;
+ constexpr std::array<const char*, 7> names{ {
+ "DEV/LEV",
+ "WAL",
+ "DB",
+ "SLOW",
+ "*",
+ "*",
+ "REAL"
+ } };
+ const size_t width = 12;
+ for (size_t i = 0; i < names.size(); ++i) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << names[i];
+ }
+ sout << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_WAL:
+ sout << "WAL"; break;
+ case LEVEL_DB:
+ sout << "DB"; break;
+ case LEVEL_SLOW:
+ sout << "SLOW"; break;
+ case LEVEL_MAX:
+ sout << "TOTALS"; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_usage.at(max_x - 1, l)))
+ << std::endl;
+ }
+ ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+ ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+ sout << "MAXIMUMS:" << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_WAL:
+ sout << "WAL"; break;
+ case LEVEL_DB:
+ sout << "DB"; break;
+ case LEVEL_SLOW:
+ sout << "SLOW"; break;
+ case LEVEL_MAX:
+ sout << "TOTALS"; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
+ if (l < max_y - 1) {
+ sout << std::endl;
+ }
+ }
+}
+
+// =======================================================
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
new file mode 100644
index 00000000..159e9296
--- /dev/null
+++ b/src/os/bluestore/BlueStore.h
@@ -0,0 +1,3602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_H
+#define CEPH_OSD_BLUESTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/unordered_set.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/dynamic_bitset.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "include/ceph_assert.h"
+#include "include/unordered_map.h"
+#include "include/mempool.h"
+#include "common/bloom_filter.hpp"
+#include "common/Finisher.h"
+#include "common/Throttle.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "compressor/Compressor.h"
+#include "os/ObjectStore.h"
+
+#include "bluestore_types.h"
+#include "BlockDevice.h"
+#include "BlueFS.h"
+#include "common/EventTrace.h"
+
+class Allocator;
+class FreelistManager;
+class BlueStoreRepairer;
+
+//#define DEBUG_CACHE
+//#define DEBUG_DEFERRED
+
+
+
+// constants for Buffer::optimize()
+#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
+
+
+enum {
+ l_bluestore_first = 732430,
+ l_bluestore_kv_flush_lat,
+ l_bluestore_kv_commit_lat,
+ l_bluestore_kv_sync_lat,
+ l_bluestore_kv_final_lat,
+ l_bluestore_state_prepare_lat,
+ l_bluestore_state_aio_wait_lat,
+ l_bluestore_state_io_done_lat,
+ l_bluestore_state_kv_queued_lat,
+ l_bluestore_state_kv_committing_lat,
+ l_bluestore_state_kv_done_lat,
+ l_bluestore_state_deferred_queued_lat,
+ l_bluestore_state_deferred_aio_wait_lat,
+ l_bluestore_state_deferred_cleanup_lat,
+ l_bluestore_state_finishing_lat,
+ l_bluestore_state_done_lat,
+ l_bluestore_throttle_lat,
+ l_bluestore_submit_lat,
+ l_bluestore_commit_lat,
+ l_bluestore_read_lat,
+ l_bluestore_read_onode_meta_lat,
+ l_bluestore_read_wait_aio_lat,
+ l_bluestore_compress_lat,
+ l_bluestore_decompress_lat,
+ l_bluestore_csum_lat,
+ l_bluestore_compress_success_count,
+ l_bluestore_compress_rejected_count,
+ l_bluestore_write_pad_bytes,
+ l_bluestore_deferred_write_ops,
+ l_bluestore_deferred_write_bytes,
+ l_bluestore_write_penalty_read_ops,
+ l_bluestore_allocated,
+ l_bluestore_stored,
+ l_bluestore_compressed,
+ l_bluestore_compressed_allocated,
+ l_bluestore_compressed_original,
+ l_bluestore_onodes,
+ l_bluestore_onode_hits,
+ l_bluestore_onode_misses,
+ l_bluestore_onode_shard_hits,
+ l_bluestore_onode_shard_misses,
+ l_bluestore_extents,
+ l_bluestore_blobs,
+ l_bluestore_buffers,
+ l_bluestore_buffer_bytes,
+ l_bluestore_buffer_hit_bytes,
+ l_bluestore_buffer_miss_bytes,
+ l_bluestore_write_big,
+ l_bluestore_write_big_bytes,
+ l_bluestore_write_big_blobs,
+ l_bluestore_write_small,
+ l_bluestore_write_small_bytes,
+ l_bluestore_write_small_unused,
+ l_bluestore_write_small_deferred,
+ l_bluestore_write_small_pre_read,
+ l_bluestore_write_small_new,
+ l_bluestore_txc,
+ l_bluestore_onode_reshard,
+ l_bluestore_blob_split,
+ l_bluestore_extent_compress,
+ l_bluestore_gc_merged,
+ l_bluestore_read_eio,
+ l_bluestore_reads_with_retries,
+ l_bluestore_fragmentation,
+ l_bluestore_omap_seek_to_first_lat,
+ l_bluestore_omap_upper_bound_lat,
+ l_bluestore_omap_lower_bound_lat,
+ l_bluestore_omap_next_lat,
+ l_bluestore_omap_get_keys_lat,
+ l_bluestore_omap_get_values_lat,
+ l_bluestore_clist_lat,
+ l_bluestore_remove_lat,
+ l_bluestore_last
+};
+
+#define META_POOL_ID ((uint64_t)-1ull)
+
+class BlueStore : public ObjectStore,
+ public BlueFSDeviceExpander,
+ public md_config_obs_t {
+ // -----------------------------------------------------
+ // types
+public:
+ // config observer
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+
+ //handler for discard event
+ void handle_discard(interval_set<uint64_t>& to_release);
+
+ void _set_csum();
+ void _set_compression();
+ void _set_throttle_params();
+ int _set_cache_sizes();
+
+ class TransContext;
+
+ typedef map<uint64_t, bufferlist> ready_regions_t;
+
+
+ struct BufferSpace;
+ struct Collection;
+ typedef boost::intrusive_ptr<Collection> CollectionRef;
+
+ struct AioContext {
+ virtual void aio_finish(BlueStore *store) = 0;
+ virtual ~AioContext() {}
+ };
+
+ /// cached buffer
+ struct Buffer {
+ MEMPOOL_CLASS_HELPERS();
+
+ enum {
+ STATE_EMPTY, ///< empty buffer -- used for cache history
+ STATE_CLEAN, ///< clean data that is up to date
+ STATE_WRITING, ///< data that is being written (io not yet complete)
+ };
+ static const char *get_state_name(int s) {
+ switch (s) {
+ case STATE_EMPTY: return "empty";
+ case STATE_CLEAN: return "clean";
+ case STATE_WRITING: return "writing";
+ default: return "???";
+ }
+ }
+ enum {
+ FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
+ // NOTE: fix operator<< when you define a second flag
+ };
+ static const char *get_flag_name(int s) {
+ switch (s) {
+ case FLAG_NOCACHE: return "nocache";
+ default: return "???";
+ }
+ }
+
+ BufferSpace *space;
+ uint16_t state; ///< STATE_*
+ uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
+ uint32_t flags; ///< FLAG_*
+ uint64_t seq;
+ uint32_t offset, length;
+ bufferlist data;
+
+ boost::intrusive::list_member_hook<> lru_item;
+ boost::intrusive::list_member_hook<> state_item;
+
+ Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
+ unsigned f = 0)
+ : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
+ Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
+ unsigned f = 0)
+ : space(space), state(s), flags(f), seq(q), offset(o),
+ length(b.length()), data(b) {}
+
+ bool is_empty() const {
+ return state == STATE_EMPTY;
+ }
+ bool is_clean() const {
+ return state == STATE_CLEAN;
+ }
+ bool is_writing() const {
+ return state == STATE_WRITING;
+ }
+
+ uint32_t end() const {
+ return offset + length;
+ }
+
+ void truncate(uint32_t newlen) {
+ ceph_assert(newlen < length);
+ if (data.length()) {
+ bufferlist t;
+ t.substr_of(data, 0, newlen);
+ data.claim(t);
+ }
+ length = newlen;
+ }
+ void maybe_rebuild() {
+ if (data.length() &&
+ (data.get_num_buffers() > 1 ||
+ data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
+ data.rebuild();
+ }
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_string("state", get_state_name(state));
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("data_length", data.length());
+ }
+ };
+
+ struct Cache;
+
+ /// map logical extent range (object) onto buffers
+ struct BufferSpace {
+ enum {
+ BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
+ };
+
+ typedef boost::intrusive::list<
+ Buffer,
+ boost::intrusive::member_hook<
+ Buffer,
+ boost::intrusive::list_member_hook<>,
+ &Buffer::state_item> > state_list_t;
+
+ mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
+ buffer_map;
+
+ // we use a bare intrusive list here instead of std::map because
+ // it uses less memory and we expect this to be very small (very
+ // few IOs in flight to the same Blob at the same time).
+ state_list_t writing; ///< writing buffers, sorted by seq, ascending
+
+ ~BufferSpace() {
+ ceph_assert(buffer_map.empty());
+ ceph_assert(writing.empty());
+ }
+
+ void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
+ cache->_audit("_add_buffer start");
+ buffer_map[b->offset].reset(b);
+ if (b->is_writing()) {
+ b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
+ if (writing.empty() || writing.rbegin()->seq <= b->seq) {
+ writing.push_back(*b);
+ } else {
+ auto it = writing.begin();
+ while (it->seq < b->seq) {
+ ++it;
+ }
+
+ ceph_assert(it->seq >= b->seq);
+ // note that this will insert b before it
+ // hence the order is maintained
+ writing.insert(it, *b);
+ }
+ } else {
+ b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+ cache->_add_buffer(b, level, near);
+ }
+ cache->_audit("_add_buffer end");
+ }
+ void _rm_buffer(Cache* cache, Buffer *b) {
+ _rm_buffer(cache, buffer_map.find(b->offset));
+ }
+ void _rm_buffer(Cache* cache,
+ map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
+ ceph_assert(p != buffer_map.end());
+ cache->_audit("_rm_buffer start");
+ if (p->second->is_writing()) {
+ writing.erase(writing.iterator_to(*p->second));
+ } else {
+ cache->_rm_buffer(p->second.get());
+ }
+ buffer_map.erase(p);
+ cache->_audit("_rm_buffer end");
+ }
+
+ map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
+ uint32_t offset) {
+ auto i = buffer_map.lower_bound(offset);
+ if (i != buffer_map.begin()) {
+ --i;
+ if (i->first + i->second->length <= offset)
+ ++i;
+ }
+ return i;
+ }
+
+ // must be called under protection of the Cache lock
+ void _clear(Cache* cache);
+
+ // return value is the highest cache_private of a trimmed buffer, or 0.
+ int discard(Cache* cache, uint32_t offset, uint32_t length) {
+ std::lock_guard l(cache->lock);
+ return _discard(cache, offset, length);
+ }
+ int _discard(Cache* cache, uint32_t offset, uint32_t length);
+
+ void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
+ unsigned flags) {
+ std::lock_guard l(cache->lock);
+ Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
+ flags);
+ b->cache_private = _discard(cache, offset, bl.length());
+ _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
+ }
+ void _finish_write(Cache* cache, uint64_t seq);
+ void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
+ std::lock_guard l(cache->lock);
+ Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
+ b->cache_private = _discard(cache, offset, bl.length());
+ _add_buffer(cache, b, 1, nullptr);
+ }
+
+ void read(Cache* cache, uint32_t offset, uint32_t length,
+ BlueStore::ready_regions_t& res,
+ interval_set<uint32_t>& res_intervals,
+ int flags = 0);
+
+ void truncate(Cache* cache, uint32_t offset) {
+ discard(cache, offset, (uint32_t)-1 - offset);
+ }
+
+ void split(Cache* cache, size_t pos, BufferSpace &r);
+
+ void dump(Cache* cache, Formatter *f) const {
+ std::lock_guard l(cache->lock);
+ f->open_array_section("buffers");
+ for (auto& i : buffer_map) {
+ f->open_object_section("buffer");
+ ceph_assert(i.first == i.second->offset);
+ i.second->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ };
+
+ struct SharedBlobSet;
+
+ /// in-memory shared blob state (incl cached buffers)
+ struct SharedBlob {
+ MEMPOOL_CLASS_HELPERS();
+
+ std::atomic_int nref = {0}; ///< reference count
+ bool loaded = false;
+
+ CollectionRef coll;
+ union {
+ uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
+ bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
+ };
+ BufferSpace bc; ///< buffer cache
+
+ SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
+ if (get_cache()) {
+ get_cache()->add_blob();
+ }
+ }
+ SharedBlob(uint64_t i, Collection *_coll);
+ ~SharedBlob();
+
+ uint64_t get_sbid() const {
+ return loaded ? persistent->sbid : sbid_unloaded;
+ }
+
+ friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
+ friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
+
+ friend ostream& operator<<(ostream& out, const SharedBlob& sb);
+
+ void get() {
+ ++nref;
+ }
+ void put();
+
+ /// get logical references
+ void get_ref(uint64_t offset, uint32_t length);
+
+ /// put logical references, and get back any released extents
+ void put_ref(uint64_t offset, uint32_t length,
+ PExtentVector *r, bool *unshare);
+
+ void finish_write(uint64_t seq);
+
+ friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
+ return l.get_sbid() == r.get_sbid();
+ }
+ inline Cache* get_cache() {
+ return coll ? coll->cache : nullptr;
+ }
+ inline SharedBlobSet* get_parent() {
+ return coll ? &(coll->shared_blob_set) : nullptr;
+ }
+ inline bool is_loaded() const {
+ return loaded;
+ }
+
+ };
+ typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
+
+ /// a lookup table of SharedBlobs
+ struct SharedBlobSet {
+ /// protect lookup, insertion, removal
+ ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");
+
+ // we use a bare pointer because we don't want to affect the ref
+ // count
+ mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;
+
+ SharedBlobRef lookup(uint64_t sbid) {
+ std::lock_guard l(lock);
+ auto p = sb_map.find(sbid);
+ if (p == sb_map.end() ||
+ p->second->nref == 0) {
+ return nullptr;
+ }
+ return p->second;
+ }
+
+ void add(Collection* coll, SharedBlob *sb) {
+ std::lock_guard l(lock);
+ sb_map[sb->get_sbid()] = sb;
+ sb->coll = coll;
+ }
+
+ bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
+ std::lock_guard l(lock);
+ ceph_assert(sb->get_parent() == this);
+ if (verify_nref_is_zero && sb->nref != 0) {
+ return false;
+ }
+ // only remove if it still points to us
+ auto p = sb_map.find(sb->get_sbid());
+ if (p != sb_map.end() &&
+ p->second == sb) {
+ sb_map.erase(p);
+ }
+ return true;
+ }
+
+ bool empty() {
+ std::lock_guard l(lock);
+ return sb_map.empty();
+ }
+
+ template <int LogLevelV>
+ void dump(CephContext *cct);
+ };
+
+//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
+
+ /// in-memory blob metadata and associated cached buffers (if any)
+ struct Blob {
+ MEMPOOL_CLASS_HELPERS();
+
+ std::atomic_int nref = {0}; ///< reference count
+ int16_t id = -1; ///< id, for spanning blobs only, >= 0
+ int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
+ SharedBlobRef shared_blob; ///< shared blob state (if any)
+
+ private:
+ mutable bluestore_blob_t blob; ///< decoded blob metadata
+#ifdef CACHE_BLOB_BL
+ mutable bufferlist blob_bl; ///< cached encoded blob, blob is dirty if empty
+#endif
+ /// refs from this shard. ephemeral if id<0, persisted if spanning.
+ bluestore_blob_use_tracker_t used_in_blob;
+
+ public:
+
+ friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
+ friend void intrusive_ptr_release(Blob *b) { b->put(); }
+
+ friend ostream& operator<<(ostream& out, const Blob &b);
+
+ const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
+ return used_in_blob;
+ }
+ bool is_referenced() const {
+ return used_in_blob.is_not_empty();
+ }
+ uint32_t get_referenced_bytes() const {
+ return used_in_blob.get_referenced_bytes();
+ }
+
+ bool is_spanning() const {
+ return id >= 0;
+ }
+
+ bool can_split() const {
+ std::lock_guard l(shared_blob->get_cache()->lock);
+ // splitting a BufferSpace writing list is too hard; don't try.
+ return shared_blob->bc.writing.empty() &&
+ used_in_blob.can_split() &&
+ get_blob().can_split();
+ }
+
+ bool can_split_at(uint32_t blob_offset) const {
+ return used_in_blob.can_split_at(blob_offset) &&
+ get_blob().can_split_at(blob_offset);
+ }
+
+ bool can_reuse_blob(uint32_t min_alloc_size,
+ uint32_t target_blob_size,
+ uint32_t b_offset,
+ uint32_t *length0);
+
+ void dup(Blob& o) {
+ o.shared_blob = shared_blob;
+ o.blob = blob;
+#ifdef CACHE_BLOB_BL
+ o.blob_bl = blob_bl;
+#endif
+ }
+
+ inline const bluestore_blob_t& get_blob() const {
+ return blob;
+ }
+ inline bluestore_blob_t& dirty_blob() {
+#ifdef CACHE_BLOB_BL
+ blob_bl.clear();
+#endif
+ return blob;
+ }
+
+ /// discard buffers for unallocated regions
+ void discard_unallocated(Collection *coll);
+
+ /// get logical references
+ void get_ref(Collection *coll, uint32_t offset, uint32_t length);
+ /// put logical references, and get back any released extents
+ bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
+ PExtentVector *r);
+
+ /// split the blob
+ void split(Collection *coll, uint32_t blob_offset, Blob *o);
+
+ void get() {
+ ++nref;
+ }
+ void put() {
+ if (--nref == 0)
+ delete this;
+ }
+
+
+#ifdef CACHE_BLOB_BL
+ void _encode() const {
+ if (blob_bl.length() == 0 ) {
+ encode(blob, blob_bl);
+ } else {
+ ceph_assert(blob_bl.length());
+ }
+ }
+ void bound_encode(
+ size_t& p,
+ bool include_ref_map) const {
+ _encode();
+ p += blob_bl.length();
+ if (include_ref_map) {
+ used_in_blob.bound_encode(p);
+ }
+ }
+ void encode(
+ bufferlist::contiguous_appender& p,
+ bool include_ref_map) const {
+ _encode();
+ p.append(blob_bl);
+ if (include_ref_map) {
+ used_in_blob.encode(p);
+ }
+ }
+ void decode(
+ Collection */*coll*/,
+ bufferptr::const_iterator& p,
+ bool include_ref_map) {
+ const char *start = p.get_pos();
+ denc(blob, p);
+ const char *end = p.get_pos();
+ blob_bl.clear();
+ blob_bl.append(start, end - start);
+ if (include_ref_map) {
+ used_in_blob.decode(p);
+ }
+ }
+#else
+ void bound_encode(
+ size_t& p,
+ uint64_t struct_v,
+ uint64_t sbid,
+ bool include_ref_map) const {
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(sbid, p);
+ }
+ if (include_ref_map) {
+ used_in_blob.bound_encode(p);
+ }
+ }
+ void encode(
+ bufferlist::contiguous_appender& p,
+ uint64_t struct_v,
+ uint64_t sbid,
+ bool include_ref_map) const {
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(sbid, p);
+ }
+ if (include_ref_map) {
+ used_in_blob.encode(p);
+ }
+ }
+ void decode(
+ Collection *coll,
+ bufferptr::const_iterator& p,
+ uint64_t struct_v,
+ uint64_t* sbid,
+ bool include_ref_map);
+#endif
+ };
+ typedef boost::intrusive_ptr<Blob> BlobRef;
+ typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;
+
+ /// a logical extent, pointing to (some portion of) a blob
+ typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
+ struct Extent : public ExtentBase {
+ MEMPOOL_CLASS_HELPERS();
+
+ uint32_t logical_offset = 0; ///< logical offset
+ uint32_t blob_offset = 0; ///< blob offset
+ uint32_t length = 0; ///< length
+ BlobRef blob; ///< the blob with our data
+
+ /// ctor for lookup only
+ explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
+ /// ctor for delayed initialization (see decode_some())
+ explicit Extent() : ExtentBase() {
+ }
+ /// ctor for general usage
+ Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
+ : ExtentBase(),
+ logical_offset(lo), blob_offset(o), length(l) {
+ assign_blob(b);
+ }
+ ~Extent() {
+ if (blob) {
+ blob->shared_blob->get_cache()->rm_extent();
+ }
+ }
+
+ void assign_blob(const BlobRef& b) {
+ ceph_assert(!blob);
+ blob = b;
+ blob->shared_blob->get_cache()->add_extent();
+ }
+
+ // comparators for intrusive_set
+ friend bool operator<(const Extent &a, const Extent &b) {
+ return a.logical_offset < b.logical_offset;
+ }
+ friend bool operator>(const Extent &a, const Extent &b) {
+ return a.logical_offset > b.logical_offset;
+ }
+ friend bool operator==(const Extent &a, const Extent &b) {
+ return a.logical_offset == b.logical_offset;
+ }
+
+ uint32_t blob_start() const {
+ return logical_offset - blob_offset;
+ }
+
+ uint32_t blob_end() const {
+ return blob_start() + blob->get_blob().get_logical_length();
+ }
+
+ uint32_t logical_end() const {
+ return logical_offset + length;
+ }
+
+ // return true if any piece of the blob is out of
+ // the given range [o, o + l].
+ bool blob_escapes_range(uint32_t o, uint32_t l) const {
+ return blob_start() < o || blob_end() > o + l;
+ }
+ };
+ typedef boost::intrusive::set<Extent> extent_map_t;
+
+
+ friend ostream& operator<<(ostream& out, const Extent& e);
+
+ struct OldExtent {
+ boost::intrusive::list_member_hook<> old_extent_item;
+ Extent e;
+ PExtentVector r;
+ bool blob_empty; // flag to track the last removed extent that makes blob
+ // empty - required to update compression stat properly
+ OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
+ : e(lo, o, l, b), blob_empty(false) {
+ }
+ static OldExtent* create(CollectionRef c,
+ uint32_t lo,
+ uint32_t o,
+ uint32_t l,
+ BlobRef& b);
+ };
+ typedef boost::intrusive::list<
+ OldExtent,
+ boost::intrusive::member_hook<
+ OldExtent,
+ boost::intrusive::list_member_hook<>,
+ &OldExtent::old_extent_item> > old_extent_map_t;
+
+ struct Onode;
+
+ /// a sharded extent map, mapping offsets to lextents to blobs
+ struct ExtentMap {
+ Onode *onode;
+ extent_map_t extent_map; ///< map of Extents to Blobs
+ blob_map_t spanning_blob_map; ///< blobs that span shards
+ typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+ struct Shard {
+ bluestore_onode_t::shard_info *shard_info = nullptr;
+ unsigned extents = 0; ///< count extents in this shard
+ bool loaded = false; ///< true if shard is loaded
+ bool dirty = false; ///< true if shard is dirty and needs reencoding
+ };
+ mempool::bluestore_cache_meta::vector<Shard> shards; ///< shards
+
+ bufferlist inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
+
+ uint32_t needs_reshard_begin = 0;
+ uint32_t needs_reshard_end = 0;
+
+ void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
+ uint64_t&, uint64_t&, uint64_t&);
+
+ bool needs_reshard() const {
+ return needs_reshard_end > needs_reshard_begin;
+ }
+ void clear_needs_reshard() {
+ needs_reshard_begin = needs_reshard_end = 0;
+ }
+ void request_reshard(uint32_t begin, uint32_t end) {
+ if (begin < needs_reshard_begin) {
+ needs_reshard_begin = begin;
+ }
+ if (end > needs_reshard_end) {
+ needs_reshard_end = end;
+ }
+ }
+
+ struct DeleteDisposer {
+ void operator()(Extent *e) { delete e; }
+ };
+
+ ExtentMap(Onode *o);
+ ~ExtentMap() {
+ extent_map.clear_and_dispose(DeleteDisposer());
+ }
+
+ void clear() {
+ extent_map.clear_and_dispose(DeleteDisposer());
+ shards.clear();
+ inline_bl.clear();
+ clear_needs_reshard();
+ }
+
+ bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
+ unsigned *pn);
+ unsigned decode_some(bufferlist& bl);
+
+ void bound_encode_spanning_blobs(size_t& p);
+ void encode_spanning_blobs(bufferlist::contiguous_appender& p);
+ void decode_spanning_blobs(bufferptr::const_iterator& p);
+
+ BlobRef get_spanning_blob(int id) {
+ auto p = spanning_blob_map.find(id);
+ ceph_assert(p != spanning_blob_map.end());
+ return p->second;
+ }
+
+ void update(KeyValueDB::Transaction t, bool force);
+ decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
+ void reshard(
+ KeyValueDB *db,
+ KeyValueDB::Transaction t);
+
+ /// initialize Shards from the onode
+ void init_shards(bool loaded, bool dirty);
+
+ /// return index of shard containing offset
+ /// or -1 if not found
+ int seek_shard(uint32_t offset) {
+ size_t end = shards.size();
+ size_t mid, left = 0;
+ size_t right = end; // one passed the right end
+
+ while (left < right) {
+ mid = left + (right - left) / 2;
+ if (offset >= shards[mid].shard_info->offset) {
+ size_t next = mid + 1;
+ if (next >= end || offset < shards[next].shard_info->offset)
+ return mid;
+ //continue to search forwards
+ left = next;
+ } else {
+ //continue to search backwards
+ right = mid;
+ }
+ }
+
+ return -1; // not found
+ }
+
+ /// check if a range spans a shard
+ bool spans_shard(uint32_t offset, uint32_t length) {
+ if (shards.empty()) {
+ return false;
+ }
+ int s = seek_shard(offset);
+ ceph_assert(s >= 0);
+ if (s == (int)shards.size() - 1) {
+ return false; // last shard
+ }
+ if (offset + length <= shards[s+1].shard_info->offset) {
+ return false;
+ }
+ return true;
+ }
+
+ /// ensure that a range of the map is loaded
+ void fault_range(KeyValueDB *db,
+ uint32_t offset, uint32_t length);
+
+ /// ensure a range of the map is marked dirty
+ void dirty_range(uint32_t offset, uint32_t length);
+
+ /// for seek_lextent test
+ extent_map_t::iterator find(uint64_t offset);
+
+ /// seek to the first lextent including or after offset
+ extent_map_t::iterator seek_lextent(uint64_t offset);
+ extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
+
+ /// add a new Extent
+ void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
+ extent_map.insert(*new Extent(lo, o, l, b));
+ }
+
+ /// remove (and delete) an Extent
+ void rm(extent_map_t::iterator p) {
+ extent_map.erase_and_dispose(p, DeleteDisposer());
+ }
+
+ bool has_any_lextents(uint64_t offset, uint64_t length);
+
+ /// consolidate adjacent lextents in extent_map
+ int compress_extent_map(uint64_t offset, uint64_t length);
+
+ /// punch a logical hole. add lextents to deref to target list.
+ void punch_hole(CollectionRef &c,
+ uint64_t offset, uint64_t length,
+ old_extent_map_t *old_extents);
+
+ /// put new lextent into lextent_map overwriting existing ones if
+ /// any and update references accordingly
+ Extent *set_lextent(CollectionRef &c,
+ uint64_t logical_offset,
+ uint64_t offset, uint64_t length,
+ BlobRef b,
+ old_extent_map_t *old_extents);
+
+ /// split a blob (and referring extents)
+ BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
+ };
+
+ /// Compressed Blob Garbage collector
+ /*
+ The primary idea of the collector is to estimate a difference between
+ allocation units(AU) currently present for compressed blobs and new AUs
+ required to store that data uncompressed.
+ Estimation is performed for protrusive extents within a logical range
+ determined by a concatenation of old_extents collection and specific(current)
+ write request.
+ The root cause for old_extents use is the need to handle blob ref counts
+ properly. Old extents still hold blob refs and hence we need to traverse
+ the collection to determine if blob to be released.
+ Protrusive extents are extents that fit into the blob set in action
+ (ones that are below the logical range from above) but not removed totally
+ due to the current write.
+ E.g. for
+ extent1 <loffs = 100, boffs = 100, len = 100> ->
+ blob1<compressed, len_on_disk=4096, logical_len=8192>
+ extent2 <loffs = 200, boffs = 200, len = 100> ->
+ blob2<raw, len_on_disk=4096, llen=4096>
+ extent3 <loffs = 300, boffs = 300, len = 100> ->
+ blob1<compressed, len_on_disk=4096, llen=8192>
+ extent4 <loffs = 4096, boffs = 0, len = 100> ->
+ blob3<raw, len_on_disk=4096, llen=4096>
+ write(300~100)
+ protrusive extents are within the following ranges <0~300, 400~8192-400>
+ In this case existing AUs that might be removed due to GC (i.e. blob1)
+ use 2x4K bytes.
+ And new AUs expected after GC = 0 since extent1 to be merged into blob2.
+ Hence we should do a collect.
+ */
+ class GarbageCollector
+ {
+ public:
+ /// return amount of allocation units that might be saved due to GC
+ int64_t estimate(
+ uint64_t offset,
+ uint64_t length,
+ const ExtentMap& extent_map,
+ const old_extent_map_t& old_extents,
+ uint64_t min_alloc_size);
+
+ /// return a collection of extents to perform GC on
+ const interval_set<uint64_t>& get_extents_to_collect() const {
+ return extents_to_collect;
+ }
+ GarbageCollector(CephContext* _cct) : cct(_cct) {}
+
+ private:
+ struct BlobInfo {
+ uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
+ int64_t expected_allocations = 0; ///< new alloc units required
+ ///< in case of gc fulfilled
+ bool collect_candidate = false; ///< indicate if blob has any extents
+ ///< eligible for GC.
+ extent_map_t::const_iterator first_lextent; ///< points to the first
+ ///< lextent referring to
+ ///< the blob if any.
+ ///< collect_candidate flag
+ ///< determines the validity
+ extent_map_t::const_iterator last_lextent; ///< points to the last
+ ///< lextent referring to
+ ///< the blob if any.
+
+ BlobInfo(uint64_t ref_bytes) :
+ referenced_bytes(ref_bytes) {
+ }
+ };
+ CephContext* cct;
+ map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
+ ///< copies that are affected by the
+ ///< specific write
+
+ ///< protrusive extents that should be collected if GC takes place
+ interval_set<uint64_t> extents_to_collect;
+
+ boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
+ ///< unit when traversing
+ ///< protrusive extents.
+ ///< Other extents mapped to
+ ///< this AU to be ignored
+ ///< (except the case where
+ ///< uncompressed extent follows
+ ///< compressed one - see below).
+ BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
+ ///< caused expected_allocations
+ ///< counter increment at this blob.
+ ///< if uncompressed extent follows
+ ///< a decrement for the
+ ///< expected_allocations counter
+ ///< is needed
+ int64_t expected_allocations = 0; ///< new alloc units required in case
+ ///< of gc fulfilled
+ int64_t expected_for_release = 0; ///< alloc units currently used by
+ ///< compressed blobs that might
+ ///< gone after GC
+
+ protected:
+ void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
+ uint64_t start_offset,
+ uint64_t end_offset,
+ uint64_t start_touch_offset,
+ uint64_t end_touch_offset,
+ uint64_t min_alloc_size);
+ };
+
+ struct OnodeSpace;
+
+ /// an in-memory object
+ struct Onode {
+ MEMPOOL_CLASS_HELPERS();
+
+ std::atomic_int nref; ///< reference count
+ Collection *c;
+
+ ghobject_t oid;
+
+ /// key under PREFIX_OBJ where we are stored
+ mempool::bluestore_cache_meta::string key;
+
+ boost::intrusive::list_member_hook<> lru_item;
+
+ bluestore_onode_t onode; ///< metadata stored as value in kv store
+ bool exists; ///< true if object logically exists
+
+ ExtentMap extent_map;
+
+ // track txc's that have not been committed to kv store (and whose
+ // effects cannot be read via the kvdb read methods)
+ std::atomic<int> flushing_count = {0};
+ /// protect flush_txns
+ ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
+ ceph::condition_variable flush_cond; ///< wait here for uncommitted txns
+
+ Onode(Collection *c, const ghobject_t& o,
+ const mempool::bluestore_cache_meta::string& k)
+ : nref(0),
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ extent_map(this) {
+ }
+ Onode(Collection* c, const ghobject_t& o,
+ const string& k)
+ : nref(0),
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ extent_map(this) {
+ }
+ Onode(Collection* c, const ghobject_t& o,
+ const char* k)
+ : nref(0),
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ extent_map(this) {
+ }
+
+ static Onode* decode(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& v);
+
+ void flush();
+ void get() {
+ ++nref;
+ }
+ void put() {
+ if (--nref == 0)
+ delete this;
+ }
+ };
+ typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+
+ /// a cache (shard) of onodes and buffers
+ struct Cache {
+ CephContext* cct;
+ PerfCounters *logger;
+
+ /// protect lru and other structures
+ ceph::recursive_mutex lock = {
+ ceph::make_recursive_mutex("BlueStore::Cache::lock") };
+
+ std::atomic<uint64_t> num_extents = {0};
+ std::atomic<uint64_t> num_blobs = {0};
+
+ std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;
+
+ static Cache *create(CephContext* cct, string type, PerfCounters *logger);
+
+ Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
+ virtual ~Cache() {}
+
+ virtual void _add_onode(OnodeRef& o, int level) = 0;
+ virtual void _rm_onode(OnodeRef& o) = 0;
+ virtual void _touch_onode(OnodeRef& o) = 0;
+
+ virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
+ virtual void _rm_buffer(Buffer *b) = 0;
+ virtual void _move_buffer(Cache *src, Buffer *b) = 0;
+ virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
+ virtual void _touch_buffer(Buffer *b) = 0;
+
+ virtual uint64_t _get_num_onodes() = 0;
+ virtual uint64_t _get_buffer_bytes() = 0;
+
+ void add_extent() {
+ ++num_extents;
+ }
+ void rm_extent() {
+ --num_extents;
+ }
+
+ void add_blob() {
+ ++num_blobs;
+ }
+ void rm_blob() {
+ --num_blobs;
+ }
+
+ void trim(uint64_t onode_max, uint64_t buffer_max);
+
+ void trim_all();
+
+ virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
+
+ virtual void add_stats(uint64_t *onodes, uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) = 0;
+
+ bool empty() {
+ std::lock_guard l(lock);
+ return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
+ }
+
+#ifdef DEBUG_CACHE
+ virtual void _audit(const char *s) = 0;
+#else
+ void _audit(const char *s) { /* no-op */ }
+#endif
+ };
+
+ /// simple LRU cache for onodes and buffers
+ struct LRUCache : public Cache {
+ private:
+ typedef boost::intrusive::list<
+ Onode,
+ boost::intrusive::member_hook<
+ Onode,
+ boost::intrusive::list_member_hook<>,
+ &Onode::lru_item> > onode_lru_list_t;
+ typedef boost::intrusive::list<
+ Buffer,
+ boost::intrusive::member_hook<
+ Buffer,
+ boost::intrusive::list_member_hook<>,
+ &Buffer::lru_item> > buffer_lru_list_t;
+
+ onode_lru_list_t onode_lru;
+ onode_lru_list_t::iterator last_pinned;
+
+ buffer_lru_list_t buffer_lru;
+ uint64_t buffer_size = 0;
+
+ void _onode_lru_erase(onode_lru_list_t::iterator it) {
+ if (it == last_pinned) {
+ last_pinned = onode_lru.end();
+ }
+ onode_lru.erase(it);
+ }
+
+ public:
+ LRUCache(CephContext* cct) : Cache(cct), last_pinned(onode_lru.end()){}
+ uint64_t _get_num_onodes() override {
+ return onode_lru.size();
+ }
+ void _add_onode(OnodeRef& o, int level) override {
+ if (level > 0)
+ onode_lru.push_front(*o);
+ else
+ onode_lru.push_back(*o);
+ }
+ void _rm_onode(OnodeRef& o) override {
+ auto q = onode_lru.iterator_to(*o);
+ _onode_lru_erase(q);
+ }
+ void _touch_onode(OnodeRef& o) override;
+
+ uint64_t _get_buffer_bytes() override {
+ return buffer_size;
+ }
+ void _add_buffer(Buffer *b, int level, Buffer *near) override {
+ if (near) {
+ auto q = buffer_lru.iterator_to(*near);
+ buffer_lru.insert(q, *b);
+ } else if (level > 0) {
+ buffer_lru.push_front(*b);
+ } else {
+ buffer_lru.push_back(*b);
+ }
+ buffer_size += b->length;
+ }
+ void _rm_buffer(Buffer *b) override {
+ ceph_assert(buffer_size >= b->length);
+ buffer_size -= b->length;
+ auto q = buffer_lru.iterator_to(*b);
+ buffer_lru.erase(q);
+ }
+ void _move_buffer(Cache *src, Buffer *b) override {
+ src->_rm_buffer(b);
+ _add_buffer(b, 0, nullptr);
+ }
+ void _adjust_buffer_size(Buffer *b, int64_t delta) override {
+ ceph_assert((int64_t)buffer_size + delta >= 0);
+ buffer_size += delta;
+ }
+ void _touch_buffer(Buffer *b) override {
+ auto p = buffer_lru.iterator_to(*b);
+ buffer_lru.erase(p);
+ buffer_lru.push_front(*b);
+ _audit("_touch_buffer end");
+ }
+
+ void _trim(uint64_t onode_max, uint64_t buffer_max) override;
+
+ void add_stats(uint64_t *onodes, uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ std::lock_guard l(lock);
+ *onodes += onode_lru.size();
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += buffer_lru.size();
+ *bytes += buffer_size;
+ }
+
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override;
+#endif
+ };
+
+ // 2Q cache for buffers, LRU for onodes
+ struct TwoQCache : public Cache {
+ private:
+ // stick with LRU for onodes for now (fixme?)
+ typedef boost::intrusive::list<
+ Onode,
+ boost::intrusive::member_hook<
+ Onode,
+ boost::intrusive::list_member_hook<>,
+ &Onode::lru_item> > onode_lru_list_t;
+ typedef boost::intrusive::list<
+ Buffer,
+ boost::intrusive::member_hook<
+ Buffer,
+ boost::intrusive::list_member_hook<>,
+ &Buffer::lru_item> > buffer_list_t;
+
+ onode_lru_list_t onode_lru;
+ onode_lru_list_t::iterator last_pinned;
+
+ buffer_list_t buffer_hot; ///< "Am" hot buffers
+ buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers
+ buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
+ uint64_t buffer_bytes = 0; ///< bytes
+
+ enum {
+ BUFFER_NEW = 0,
+ BUFFER_WARM_IN, ///< in buffer_warm_in
+ BUFFER_WARM_OUT, ///< in buffer_warm_out
+ BUFFER_HOT, ///< in buffer_hot
+ BUFFER_TYPE_MAX
+ };
+
+ uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
+
+ void _onode_lru_erase(onode_lru_list_t::iterator it) {
+ if (it == last_pinned) {
+ last_pinned = onode_lru.end();
+ }
+ onode_lru.erase(it);
+ }
+ public:
+ TwoQCache(CephContext* cct) : Cache(cct), last_pinned(onode_lru.end()){}
+ uint64_t _get_num_onodes() override {
+ return onode_lru.size();
+ }
+ void _add_onode(OnodeRef& o, int level) override {
+ if (level > 0)
+ onode_lru.push_front(*o);
+ else
+ onode_lru.push_back(*o);
+ }
+ void _rm_onode(OnodeRef& o) override {
+ auto q = onode_lru.iterator_to(*o);
+ _onode_lru_erase(q);
+ }
+ void _touch_onode(OnodeRef& o) override;
+
+ uint64_t _get_buffer_bytes() override {
+ return buffer_bytes;
+ }
+ void _add_buffer(Buffer *b, int level, Buffer *near) override;
+ void _rm_buffer(Buffer *b) override;
+ void _move_buffer(Cache *src, Buffer *b) override;
+ void _adjust_buffer_size(Buffer *b, int64_t delta) override;
+ void _touch_buffer(Buffer *b) override {
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // do nothing (somewhat counter-intuitively!)
+ break;
+ case BUFFER_WARM_OUT:
+ // move from warm_out to hot LRU
+ ceph_abort_msg("this happens via discard hint");
+ break;
+ case BUFFER_HOT:
+ // move to front of hot LRU
+ buffer_hot.erase(buffer_hot.iterator_to(*b));
+ buffer_hot.push_front(*b);
+ break;
+ }
+ _audit("_touch_buffer end");
+ }
+
+ void _trim(uint64_t onode_max, uint64_t buffer_max) override;
+
+ void add_stats(uint64_t *onodes, uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ std::lock_guard l(lock);
+ *onodes += onode_lru.size();
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += buffer_hot.size() + buffer_warm_in.size();
+ *bytes += buffer_bytes;
+ }
+
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override;
+#endif
+ };
+
+ struct OnodeSpace {
+ private:
+ Cache *cache;
+
+ /// forward lookups
+ mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;
+
+ friend class Collection; // for split_cache()
+
+ public:
+ OnodeSpace(Cache *c) : cache(c) {}
+ ~OnodeSpace() {
+ clear();
+ }
+
+ OnodeRef add(const ghobject_t& oid, OnodeRef o);
+ OnodeRef lookup(const ghobject_t& o);
+ void remove(const ghobject_t& oid) {
+ onode_map.erase(oid);
+ }
+ void rename(OnodeRef& o, const ghobject_t& old_oid,
+ const ghobject_t& new_oid,
+ const mempool::bluestore_cache_meta::string& new_okey);
+ void clear();
+ bool empty();
+
+ template <int LogLevelV>
+ void dump(CephContext *cct);
+
+ /// return true if f true for any item
+ bool map_any(std::function<bool(OnodeRef)> f);
+ };
+
+ class OpSequencer;
+ typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+ struct Collection : public CollectionImpl {
+ BlueStore *store;
+ OpSequencerRef osr;
+ Cache *cache; ///< our cache shard
+ bluestore_cnode_t cnode;
+ RWLock lock;
+
+ bool exists;
+
+ SharedBlobSet shared_blob_set; ///< open SharedBlobs
+
+ // cache onodes on a per-collection basis to avoid lock
+ // contention.
+ OnodeSpace onode_map;
+
+ //pool options
+ pool_opts_t pool_opts;
+ ContextQueue *commit_queue;
+
+ OnodeRef get_onode(const ghobject_t& oid, bool create);
+
+ // the terminology is confusing here, sorry!
+ //
+ // blob_t shared_blob_t
+ // !shared unused -> open
+ // shared !loaded -> open + shared
+ // shared loaded -> open + shared + loaded
+ //
+ // i.e.,
+ // open = SharedBlob is instantiated
+ // shared = blob_t shared flag is set; SharedBlob is hashed.
+ // loaded = SharedBlob::shared_blob_t is loaded from kv store
+ void open_shared_blob(uint64_t sbid, BlobRef b);
+ void load_shared_blob(SharedBlobRef sb);
+ void make_blob_shared(uint64_t sbid, BlobRef b);
+ uint64_t make_blob_unshared(SharedBlob *sb);
+
+ BlobRef new_blob() {
+ BlobRef b = new Blob();
+ b->shared_blob = new SharedBlob(this);
+ return b;
+ }
+
+ bool contains(const ghobject_t& oid) {
+ if (cid.is_meta())
+ return oid.hobj.pool == -1;
+ spg_t spgid;
+ if (cid.is_pg(&spgid))
+ return
+ spgid.pgid.contains(cnode.bits, oid) &&
+ oid.shard_id == spgid.shard;
+ return false;
+ }
+
+ void split_cache(Collection *dest);
+
+ bool flush_commit(Context *c) override;
+ void flush() override;
+ void flush_all_but_last();
+
+ Collection(BlueStore *ns, Cache *ca, coll_t c);
+ };
+
+ class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+ CollectionRef c;
+ OnodeRef o;
+ KeyValueDB::Iterator it;
+ string head, tail;
+
+ string _stringify() const;
+
+ public:
+ OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
+ int seek_to_first() override;
+ int upper_bound(const string &after) override;
+ int lower_bound(const string &to) override;
+ bool valid() override;
+ int next() override;
+ string key() override;
+ bufferlist value() override;
+ int status() override {
+ return 0;
+ }
+ };
+
+ struct volatile_statfs{
+ enum {
+ STATFS_ALLOCATED = 0,
+ STATFS_STORED,
+ STATFS_COMPRESSED_ORIGINAL,
+ STATFS_COMPRESSED,
+ STATFS_COMPRESSED_ALLOCATED,
+ STATFS_LAST
+ };
+ int64_t values[STATFS_LAST];
+ volatile_statfs() {
+ memset(this, 0, sizeof(volatile_statfs));
+ }
+ void reset() {
+ *this = volatile_statfs();
+ }
+ void publish(store_statfs_t* buf) const {
+ buf->allocated = allocated();
+ buf->data_stored = stored();
+ buf->data_compressed = compressed();
+ buf->data_compressed_original = compressed_original();
+ buf->data_compressed_allocated = compressed_allocated();
+ }
+
+ volatile_statfs& operator+=(const volatile_statfs& other) {
+ for (size_t i = 0; i < STATFS_LAST; ++i) {
+ values[i] += other.values[i];
+ }
+ return *this;
+ }
+ int64_t& allocated() {
+ return values[STATFS_ALLOCATED];
+ }
+ int64_t& stored() {
+ return values[STATFS_STORED];
+ }
+ int64_t& compressed_original() {
+ return values[STATFS_COMPRESSED_ORIGINAL];
+ }
+ int64_t& compressed() {
+ return values[STATFS_COMPRESSED];
+ }
+ int64_t& compressed_allocated() {
+ return values[STATFS_COMPRESSED_ALLOCATED];
+ }
+ int64_t allocated() const {
+ return values[STATFS_ALLOCATED];
+ }
+ int64_t stored() const {
+ return values[STATFS_STORED];
+ }
+ int64_t compressed_original() const {
+ return values[STATFS_COMPRESSED_ORIGINAL];
+ }
+ int64_t compressed() const {
+ return values[STATFS_COMPRESSED];
+ }
+ int64_t compressed_allocated() const {
+ return values[STATFS_COMPRESSED_ALLOCATED];
+ }
+ volatile_statfs& operator=(const store_statfs_t& st) {
+ values[STATFS_ALLOCATED] = st.allocated;
+ values[STATFS_STORED] = st.data_stored;
+ values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
+ values[STATFS_COMPRESSED] = st.data_compressed;
+ values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
+ return *this;
+ }
+ bool is_empty() {
+ return values[STATFS_ALLOCATED] == 0 &&
+ values[STATFS_STORED] == 0 &&
+ values[STATFS_COMPRESSED] == 0 &&
+ values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
+ values[STATFS_COMPRESSED_ALLOCATED] == 0;
+ }
+ void decode(bufferlist::const_iterator& it) {
+ using ceph::decode;
+ for (size_t i = 0; i < STATFS_LAST; i++) {
+ decode(values[i], it);
+ }
+ }
+
+ void encode(bufferlist& bl) {
+ using ceph::encode;
+ for (size_t i = 0; i < STATFS_LAST; i++) {
+ encode(values[i], bl);
+ }
+ }
+ };
+
+ struct TransContext final : public AioContext {
+ MEMPOOL_CLASS_HELPERS();
+
+ typedef enum {
+ STATE_PREPARE,
+ STATE_AIO_WAIT,
+ STATE_IO_DONE,
+ STATE_KV_QUEUED, // queued for kv_sync_thread submission
+ STATE_KV_SUBMITTED, // submitted to kv; not yet synced
+ STATE_KV_DONE,
+ STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
+ STATE_DEFERRED_CLEANUP, // remove deferred kv record
+ STATE_DEFERRED_DONE,
+ STATE_FINISHING,
+ STATE_DONE,
+ } state_t;
+
+ state_t state = STATE_PREPARE;
+
+ const char *get_state_name() {
+ switch (state) {
+ case STATE_PREPARE: return "prepare";
+ case STATE_AIO_WAIT: return "aio_wait";
+ case STATE_IO_DONE: return "io_done";
+ case STATE_KV_QUEUED: return "kv_queued";
+ case STATE_KV_SUBMITTED: return "kv_submitted";
+ case STATE_KV_DONE: return "kv_done";
+ case STATE_DEFERRED_QUEUED: return "deferred_queued";
+ case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
+ case STATE_DEFERRED_DONE: return "deferred_done";
+ case STATE_FINISHING: return "finishing";
+ case STATE_DONE: return "done";
+ }
+ return "???";
+ }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+ const char *get_state_latency_name(int state) {
+ switch (state) {
+ case l_bluestore_state_prepare_lat: return "prepare";
+ case l_bluestore_state_aio_wait_lat: return "aio_wait";
+ case l_bluestore_state_io_done_lat: return "io_done";
+ case l_bluestore_state_kv_queued_lat: return "kv_queued";
+ case l_bluestore_state_kv_committing_lat: return "kv_committing";
+ case l_bluestore_state_kv_done_lat: return "kv_done";
+ case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
+ case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
+ case l_bluestore_state_finishing_lat: return "finishing";
+ case l_bluestore_state_done_lat: return "done";
+ }
+ return "???";
+ }
+#endif
+
+ utime_t log_state_latency(PerfCounters *logger, int state) {
+ utime_t lat, now = ceph_clock_now();
+ lat = now - last_stamp;
+ logger->tinc(state, lat);
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+ if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
+ double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
+ OID_ELAPSED("", usecs, get_state_latency_name(state));
+ }
+#endif
+ last_stamp = now;
+ return lat;
+ }
+
+ CollectionRef ch;
+ OpSequencerRef osr; // this should be ch->osr
+ boost::intrusive::list_member_hook<> sequencer_item;
+
+ uint64_t bytes = 0, cost = 0;
+
+ set<OnodeRef> onodes; ///< these need to be updated/written
+ set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
+ set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
+ set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
+
+ KeyValueDB::Transaction t; ///< then we will commit this
+ list<Context*> oncommits; ///< more commit completions
+ list<CollectionRef> removed_collections; ///< colls we removed
+
+ boost::intrusive::list_member_hook<> deferred_queue_item;
+ bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
+
+ interval_set<uint64_t> allocated, released;
+ volatile_statfs statfs_delta; ///< overall store statistics delta
+ uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on
+
+ IOContext ioc;
+ bool had_ios = false; ///< true if we submitted IOs before our kv txn
+
+ uint64_t seq = 0;
+ utime_t start;
+ utime_t last_stamp;
+
+ uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
+ uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
+
+ explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
+ list<Context*> *on_commits)
+ : ch(c),
+ osr(o),
+ ioc(cct, this),
+ start(ceph_clock_now()) {
+ last_stamp = start;
+ if (on_commits) {
+ oncommits.swap(*on_commits);
+ }
+ }
+ ~TransContext() {
+ delete deferred_txn;
+ }
+
+ void write_onode(OnodeRef &o) {
+ onodes.insert(o);
+ }
+ void write_shared_blob(SharedBlobRef &sb) {
+ shared_blobs.insert(sb);
+ }
+ void unshare_blob(SharedBlob *sb) {
+ shared_blobs.erase(sb);
+ }
+
+ /// note we logically modified object (when onode itself is unmodified)
+ void note_modified_object(OnodeRef &o) {
+ // onode itself isn't written, though
+ modified_objects.insert(o);
+ }
+ void note_removed_object(OnodeRef& o) {
+ onodes.erase(o);
+ modified_objects.insert(o);
+ }
+
+ void aio_finish(BlueStore *store) override {
+ store->txc_aio_finish(this);
+ }
+ };
+
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::deferred_queue_item> > deferred_queue_t;
+
+ struct DeferredBatch final : public AioContext {
+ OpSequencer *osr;
+ struct deferred_io {
+ bufferlist bl; ///< data
+ uint64_t seq; ///< deferred transaction seq
+ };
+ map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
+ deferred_queue_t txcs; ///< txcs in this batch
+ IOContext ioc; ///< our aios
+ /// bytes of pending io for each deferred seq (may be 0)
+ map<uint64_t,int> seq_bytes;
+
+ void _discard(CephContext *cct, uint64_t offset, uint64_t length);
+ void _audit(CephContext *cct);
+
+ DeferredBatch(CephContext *cct, OpSequencer *osr)
+ : osr(osr), ioc(cct, this) {}
+
+ /// prepare a write
+ void prepare_write(CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ bufferlist::const_iterator& p);
+
+ void aio_finish(BlueStore *store) override {
+ store->_deferred_aio_finish(osr);
+ }
+ };
+
+ class OpSequencer : public RefCountedObject {
+ public:
+ ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
+ ceph::condition_variable qcond;
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::sequencer_item> > q_list_t;
+ q_list_t q; ///< transactions
+
+ boost::intrusive::list_member_hook<> deferred_osr_queue_item;
+
+ DeferredBatch *deferred_running = nullptr;
+ DeferredBatch *deferred_pending = nullptr;
+
+ BlueStore *store;
+ coll_t cid;
+
+ uint64_t last_seq = 0;
+
+ std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
+
+ std::atomic_int kv_committing_serially = {0};
+
+ std::atomic_int kv_submitted_waiters = {0};
+
+ std::atomic_bool zombie = {false}; ///< in zombie_osr set (collection going away)
+
+ OpSequencer(BlueStore *store, const coll_t& c)
+ : RefCountedObject(store->cct, 0),
+ store(store), cid(c) {
+ }
+ ~OpSequencer() {
+ ceph_assert(q.empty());
+ }
+
+ void queue_new(TransContext *txc) {
+ std::lock_guard l(qlock);
+ txc->seq = ++last_seq;
+ q.push_back(*txc);
+ }
+
+ void drain() {
+ std::unique_lock l(qlock);
+ while (!q.empty())
+ qcond.wait(l);
+ }
+
+ void drain_preceding(TransContext *txc) {
+ std::unique_lock l(qlock);
+ while (!q.empty() && &q.front() != txc)
+ qcond.wait(l);
+ }
+
+ bool _is_all_kv_submitted() {
+ // caller must hold qlock & q.empty() must not empty
+ ceph_assert(!q.empty());
+ TransContext *txc = &q.back();
+ if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
+ return true;
+ }
+ return false;
+ }
+
+ void flush() {
+ std::unique_lock l(qlock);
+ while (true) {
+ // set flag before the check because the condition
+ // may become true outside qlock, and we need to make
+ // sure those threads see waiters and signal qcond.
+ ++kv_submitted_waiters;
+ if (q.empty() || _is_all_kv_submitted()) {
+ --kv_submitted_waiters;
+ return;
+ }
+ qcond.wait(l);
+ --kv_submitted_waiters;
+ }
+ }
+
+ void flush_all_but_last() {
+ std::unique_lock l(qlock);
+ assert (q.size() >= 1);
+ while (true) {
+ // set flag before the check because the condition
+ // may become true outside qlock, and we need to make
+ // sure those threads see waiters and signal qcond.
+ ++kv_submitted_waiters;
+ if (q.size() <= 1) {
+ --kv_submitted_waiters;
+ return;
+ } else {
+ auto it = q.rbegin();
+ it++;
+ if (it->state >= TransContext::STATE_KV_SUBMITTED) {
+ --kv_submitted_waiters;
+ return;
+ }
+ }
+ qcond.wait(l);
+ --kv_submitted_waiters;
+ }
+ }
+
+ bool flush_commit(Context *c) {
+ std::lock_guard l(qlock);
+ if (q.empty()) {
+ return true;
+ }
+ TransContext *txc = &q.back();
+ if (txc->state >= TransContext::STATE_KV_DONE) {
+ return true;
+ }
+ txc->oncommits.push_back(c);
+ return false;
+ }
+ };
+
+ typedef boost::intrusive::list<
+ OpSequencer,
+ boost::intrusive::member_hook<
+ OpSequencer,
+ boost::intrusive::list_member_hook<>,
+ &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
+
+ struct KVSyncThread : public Thread {
+ BlueStore *store;
+ explicit KVSyncThread(BlueStore *s) : store(s) {}
+ void *entry() override {
+ store->_kv_sync_thread();
+ return NULL;
+ }
+ };
+ struct KVFinalizeThread : public Thread {
+ BlueStore *store;
+ explicit KVFinalizeThread(BlueStore *s) : store(s) {}
+ void *entry() {
+ store->_kv_finalize_thread();
+ return NULL;
+ }
+ };
+
+ struct DBHistogram {
+ struct value_dist {
+ uint64_t count;
+ uint32_t max_len;
+ };
+
+ struct key_dist {
+ uint64_t count;
+ uint32_t max_len;
+ map<int, struct value_dist> val_map; ///< slab id to count, max length of value and key
+ };
+
+ map<string, map<int, struct key_dist> > key_hist;
+ map<int, uint64_t> value_hist;
+ int get_key_slab(size_t sz);
+ string get_key_slab_to_range(int slab);
+ int get_value_slab(size_t sz);
+ string get_value_slab_to_range(int slab);
+ void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
+ const string &prefix, size_t key_size, size_t value_size);
+ void dump(Formatter *f);
+ };
+
+ // --------------------------------------------------------
+ // members
+private:
+ BlueFS *bluefs = nullptr;
+ unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
+ bool bluefs_single_shared_device = true;
+ mono_time bluefs_last_balance;
+ utime_t next_dump_on_bluefs_alloc_failure;
+
+ KeyValueDB *db = nullptr;
+ BlockDevice *bdev = nullptr;
+ std::string freelist_type;
+ FreelistManager *fm = nullptr;
+ Allocator *alloc = nullptr;
+ uuid_d fsid;
+ int path_fd = -1; ///< open handle to $path
+ int fsid_fd = -1; ///< open handle (locked) to $path/fsid
+ bool mounted = false;
+
+ RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
+ mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
+ bool collections_had_errors = false;
+ map<coll_t,CollectionRef> new_coll_map;
+
+ vector<Cache*> cache_shards;
+
+ /// protect zombie_osr_set
+ ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
+ std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
+
+ std::atomic<uint64_t> nid_last = {0};
+ std::atomic<uint64_t> nid_max = {0};
+ std::atomic<uint64_t> blobid_last = {0};
+ std::atomic<uint64_t> blobid_max = {0};
+
+ Throttle throttle_bytes; ///< submit to commit
+ Throttle throttle_deferred_bytes; ///< submit to deferred complete
+
+ interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
+ interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
+
+ ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
+ std::atomic<uint64_t> deferred_seq = {0};
+ deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
+ int deferred_queue_size = 0; ///< num txc's queued across all osrs
+ atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
+ Finisher deferred_finisher, finisher;
+
+ KVSyncThread kv_sync_thread;
+ ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
+ ceph::condition_variable kv_cond;
+ bool _kv_only = false;
+ bool kv_sync_started = false;
+ bool kv_stop = false;
+ bool kv_finalize_started = false;
+ bool kv_finalize_stop = false;
+ deque<TransContext*> kv_queue; ///< ready, already submitted
+ deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
+ deque<TransContext*> kv_committing; ///< currently syncing
+ deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
+
+ KVFinalizeThread kv_finalize_thread;
+ ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
+ ceph::condition_variable kv_finalize_cond;
+ deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
+ deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
+
+ PerfCounters *logger = nullptr;
+
+ list<CollectionRef> removed_collections;
+
+ RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
+ set<ghobject_t> debug_data_error_objects;
+ set<ghobject_t> debug_mdata_error_objects;
+
+ std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
+
+ uint64_t block_size = 0; ///< block size of block device (power of 2)
+ uint64_t block_mask = 0; ///< mask to get just the block offset
+ size_t block_size_order = 0; ///< bits to shift to get block size
+
+ uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
+ ///< bits for min_alloc_size
+ uint8_t min_alloc_size_order = 0;
+ static_assert(std::numeric_limits<uint8_t>::max() >
+ std::numeric_limits<decltype(min_alloc_size)>::digits,
+ "not enough bits for min_alloc_size");
+
+ ///< maximum allocation unit (power of 2)
+ std::atomic<uint64_t> max_alloc_size = {0};
+
+ ///< number threshold for forced deferred writes
+ std::atomic<int> deferred_batch_ops = {0};
+
+ ///< size threshold for forced deferred writes
+ std::atomic<uint64_t> prefer_deferred_size = {0};
+
+ ///< approx cost per io, in bytes
+ std::atomic<uint64_t> throttle_cost_per_io = {0};
+
+ std::atomic<Compressor::CompressionMode> comp_mode =
+ {Compressor::COMP_NONE}; ///< compression mode
+ CompressorRef compressor;
+ std::atomic<uint64_t> comp_min_blob_size = {0};
+ std::atomic<uint64_t> comp_max_blob_size = {0};
+
+ std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
+
+ uint64_t kv_ios = 0;
+ uint64_t kv_throttle_costs = 0;
+
+ // cache trim control
+ uint64_t cache_size = 0; ///< total cache size
+ double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
+ double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
+ double cache_data_ratio = 0; ///< cache ratio dedicated to object data
+ bool cache_autotune = false; ///< cache autotune setting
+ double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
+ uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
+ uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
+ double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
+ uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
+ double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
+ std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
+
+ typedef map<uint64_t, volatile_statfs> osd_pools_map;
+
+ ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
+ volatile_statfs vstatfs;
+ osd_pools_map osd_pools; // protected by vstatfs_lock as well
+
+ bool per_pool_stat_collection = true;
+
+ struct MempoolThread : public Thread {
+ public:
+ BlueStore *store;
+
+ ceph::condition_variable cond;
+ ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
+ bool stop = false;
+ uint64_t autotune_cache_size = 0;
+ std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
+ std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
+
+ struct MempoolCache : public PriorityCache::PriCache {
+ BlueStore *store;
+ int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+ int64_t committed_bytes = 0;
+ double cache_ratio = 0;
+
+ MempoolCache(BlueStore *s) : store(s) {};
+
+ virtual uint64_t _get_used_bytes() const = 0;
+
+ virtual int64_t request_cache_bytes(
+ PriorityCache::Priority pri, uint64_t total_cache) const {
+ int64_t assigned = get_cache_bytes(pri);
+
+ switch (pri) {
+ // All cache items are currently shoved into the PRI1 priority
+ case PriorityCache::Priority::PRI1:
+ {
+ int64_t request = _get_used_bytes();
+ return(request > assigned) ? request - assigned : 0;
+ }
+ default:
+ break;
+ }
+ return -EOPNOTSUPP;
+ }
+
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+ return cache_bytes[pri];
+ }
+ virtual int64_t get_cache_bytes() const {
+ int64_t total = 0;
+
+ for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
+ PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
+ total += get_cache_bytes(pri);
+ }
+ return total;
+ }
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] = bytes;
+ }
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] += bytes;
+ }
+ virtual int64_t commit_cache_size(uint64_t total_cache) {
+ committed_bytes = PriorityCache::get_chunk(
+ get_cache_bytes(), total_cache);
+ return committed_bytes;
+ }
+ virtual int64_t get_committed_size() const {
+ return committed_bytes;
+ }
+ virtual double get_cache_ratio() const {
+ return cache_ratio;
+ }
+ virtual void set_cache_ratio(double ratio) {
+ cache_ratio = ratio;
+ }
+ virtual string get_cache_name() const = 0;
+ };
+
+ struct MetaCache : public MempoolCache {
+ MetaCache(BlueStore *s) : MempoolCache(s) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ return mempool::bluestore_Buffer::allocated_bytes() +
+ mempool::bluestore_Blob::allocated_bytes() +
+ mempool::bluestore_Extent::allocated_bytes() +
+ mempool::bluestore_cache_meta::allocated_bytes() +
+ mempool::bluestore_cache_other::allocated_bytes() +
+ mempool::bluestore_cache_onode::allocated_bytes() +
+ mempool::bluestore_SharedBlob::allocated_bytes() +
+ mempool::bluestore_inline_bl::allocated_bytes();
+ }
+
+ virtual string get_cache_name() const {
+ return "BlueStore Meta Cache";
+ }
+
+ uint64_t _get_num_onodes() const {
+ uint64_t onode_num =
+ mempool::bluestore_cache_onode::allocated_items();
+ return (2 > onode_num) ? 2 : onode_num;
+ }
+
+ double get_bytes_per_onode() const {
+ return (double)_get_used_bytes() / (double)_get_num_onodes();
+ }
+ };
+ std::shared_ptr<MetaCache> meta_cache;
+
+ struct DataCache : public MempoolCache {
+ DataCache(BlueStore *s) : MempoolCache(s) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ uint64_t bytes = 0;
+ for (auto i : store->cache_shards) {
+ bytes += i->_get_buffer_bytes();
+ }
+ return bytes;
+ }
+ virtual string get_cache_name() const {
+ return "BlueStore Data Cache";
+ }
+ };
+ std::shared_ptr<DataCache> data_cache;
+
+ public:
+ explicit MempoolThread(BlueStore *s)
+ : store(s),
+ meta_cache(new MetaCache(s)),
+ data_cache(new DataCache(s)) {}
+
+ void *entry() override;
+ void init() {
+ ceph_assert(stop == false);
+ create("bstore_mempool");
+ }
+ void shutdown() {
+ lock.lock();
+ stop = true;
+ cond.notify_all();
+ lock.unlock();
+ join();
+ }
+
+ private:
+ void _adjust_cache_settings();
+ void _trim_shards(bool interval_stats);
+ void _tune_cache_size(bool interval_stats);
+ void _balance_cache(
+ const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches);
+ void _balance_cache_pri(
+ int64_t *mem_avail,
+ const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
+ PriorityCache::Priority pri);
+ void _update_cache_settings();
+ } mempool_thread;
+
+ // --------------------------------------------------------
+ // private methods
+
+ void _init_logger();
+ void _shutdown_logger();
+ int _reload_logger();
+
+ int _open_path();
+ void _close_path();
+ int _open_fsid(bool create);
+ int _lock_fsid();
+ int _read_fsid(uuid_d *f);
+ int _write_fsid();
+ void _close_fsid();
+ void _set_alloc_sizes();
+ void _set_blob_size();
+ void _set_finisher_num();
+ void _update_osd_memory_options();
+
+ int _open_bdev(bool create);
+ // Verifies if disk space is enough for reserved + min bluefs
+ // and alters the latter if needed.
+ // Depends on min_alloc_size hence should be called after
+ // its initialization (and outside of _open_bdev)
+ void _validate_bdev();
+ void _close_bdev();
+
+ int _minimal_open_bluefs(bool create);
+ void _minimal_close_bluefs();
+ int _open_bluefs(bool create);
+ void _close_bluefs(bool cold_close);
+
+ // Limited (u)mount intended for BlueFS operations only
+ int _mount_for_bluefs();
+ void _umount_for_bluefs();
+
+
+ int _is_bluefs(bool create, bool* ret);
+ /*
+ * opens both DB and dependant super_meta, FreelistManager and allocator
+ * in the proper order
+ */
+ int _open_db_and_around(bool read_only);
+ void _close_db_and_around(bool read_only);
+
+ // updates legacy bluefs related recs in DB to a state valid for
+ // downgrades from nautilus.
+ void _sync_bluefs_and_fm();
+
+ /*
+ * @warning to_repair_db means that we open this db to repair it, will not
+ * hold the rocksdb's file lock.
+ */
+ int _open_db(bool create,
+ bool to_repair_db=false,
+ bool read_only = false);
+ void _close_db(bool read_only);
+ int _open_fm(KeyValueDB::Transaction t);
+ void _close_fm();
+ int _open_alloc();
+ void _close_alloc();
+ int _open_collections();
+ void _fsck_collections(int64_t* errors);
+ void _close_collections();
+
+ int _setup_block_symlink_or_file(string name, string path, uint64_t size,
+ bool create);
+
+public:
+ static int _write_bdev_label(CephContext* cct,
+ string path, bluestore_bdev_label_t label);
+ static int _read_bdev_label(CephContext* cct, string path,
+ bluestore_bdev_label_t *label);
+private:
+ int _check_or_set_bdev_label(string path, uint64_t size, string desc,
+ bool create);
+
+ int _open_super_meta();
+
+ void _open_statfs();
+ void _get_statfs_overall(struct store_statfs_t *buf);
+
+ void _dump_alloc_on_failure();
+
+ int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total);
+ int _balance_bluefs_freespace();
+
+ CollectionRef _get_collection(const coll_t& cid);
+ void _queue_reap_collection(CollectionRef& c);
+ void _reap_collections();
+ void _update_cache_logger();
+
+ void _assign_nid(TransContext *txc, OnodeRef o);
+ uint64_t _assign_blobid(TransContext *txc);
+
+ template <int LogLevelV>
+ friend void _dump_onode(CephContext *cct, const Onode& o);
+ template <int LogLevelV>
+ friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
+ template <int LogLevelV>
+ friend void _dump_transaction(CephContext *cct, Transaction *t);
+
+ TransContext *_txc_create(Collection *c, OpSequencer *osr,
+ list<Context*> *on_commits);
+ void _txc_update_store_statfs(TransContext *txc);
+ void _txc_add_transaction(TransContext *txc, Transaction *t);
+ void _txc_calc_cost(TransContext *txc);
+ void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
+ void _txc_state_proc(TransContext *txc);
+ void _txc_aio_submit(TransContext *txc);
+public:
+ void txc_aio_finish(void *p) {
+ _txc_state_proc(static_cast<TransContext*>(p));
+ }
+private:
+ void _txc_finish_io(TransContext *txc);
+ void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
+ void _txc_applied_kv(TransContext *txc);
+ void _txc_committed_kv(TransContext *txc);
+ void _txc_finish(TransContext *txc);
+ void _txc_release_alloc(TransContext *txc);
+
+ void _osr_attach(Collection *c);
+ void _osr_register_zombie(OpSequencer *osr);
+ void _osr_drain(OpSequencer *osr);
+ void _osr_drain_preceding(TransContext *txc);
+ void _osr_drain_all();
+
+ void _kv_start();
+ void _kv_stop();
+ void _kv_sync_thread();
+ void _kv_finalize_thread();
+
+ bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
+ void _deferred_queue(TransContext *txc);
+public:
+ void deferred_try_submit();
+private:
+ void _deferred_submit_unlock(OpSequencer *osr);
+ void _deferred_aio_finish(OpSequencer *osr);
+ int _deferred_replay();
+
+public:
+ using mempool_dynamic_bitset =
+ boost::dynamic_bitset<uint64_t,
+ mempool::bluestore_fsck::pool_allocator<uint64_t>>;
+ using per_pool_statfs =
+ mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
+
+ enum FSCKDepth {
+ FSCK_REGULAR,
+ FSCK_DEEP,
+ FSCK_SHALLOW
+ };
+
+private:
+ int _fsck_check_extents(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const PExtentVector& extents,
+ bool compressed,
+ mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
+ BlueStoreRepairer* repairer,
+ store_statfs_t& expected_statfs,
+ FSCKDepth depth);
+
+ void _fsck_check_pool_statfs(
+ per_pool_statfs& expected_pool_statfs,
+ int64_t& errors,
+ int64_t &warnings,
+ BlueStoreRepairer* repairer);
+
+ int _fsck(FSCKDepth depth, bool repair);
+ int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
+
+ void _buffer_cache_write(
+ TransContext *txc,
+ BlobRef b,
+ uint64_t offset,
+ bufferlist& bl,
+ unsigned flags) {
+ b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
+ flags);
+ txc->shared_blobs_written.insert(b->shared_blob);
+ }
+
+ int _collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end,
+ int max, bool legacy, vector<ghobject_t> *ls, ghobject_t *next);
+
+ template <typename T, typename F>
+ T select_option(const std::string& opt_name, T val1, F f) {
+ //NB: opt_name reserved for future use
+ boost::optional<T> val2 = f();
+ if (val2) {
+ return *val2;
+ }
+ return val1;
+ }
+
+ void _apply_padding(uint64_t head_pad,
+ uint64_t tail_pad,
+ bufferlist& padded);
+
+ void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
+
+ // -- ondisk version ---
+public:
+ const int32_t latest_ondisk_format = 2; ///< our version
+ const int32_t min_readable_ondisk_format = 1; ///< what we can read
+ const int32_t min_compat_ondisk_format = 2; ///< who can read us
+
+private:
+ int32_t ondisk_format = 0; ///< value detected on mount
+
+ int _upgrade_super(); ///< upgrade (called during open_super)
+ uint64_t _get_ondisk_reserved() const;
+ void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
+
+ // --- public interface ---
+public:
+ BlueStore(CephContext *cct, const string& path);
+ BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
+ ~BlueStore() override;
+
+ string get_type() override {
+ return "bluestore";
+ }
+
+ bool needs_journal() override { return false; };
+ bool wants_journal() override { return false; };
+ bool allows_journal() override { return false; };
+
+ int get_devices(set<string> *ls) override;
+
+ bool is_rotational() override;
+ bool is_journal_rotational() override;
+
+ string get_default_device_class() override {
+ string device_class;
+ map<string, string> metadata;
+ collect_metadata(&metadata);
+ auto it = metadata.find("bluestore_bdev_type");
+ if (it != metadata.end()) {
+ device_class = it->second;
+ }
+ return device_class;
+ }
+
+ int get_numa_node(
+ int *numa_node,
+ set<int> *nodes,
+ set<string> *failed) override;
+
+ static int get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid);
+
+ bool test_mount_in_use() override;
+
+private:
+ int _mount(bool kv_only, bool open_db=true);
+public:
+ int mount() override {
+ return _mount(false);
+ }
+ int umount() override;
+
+ int start_kv_only(KeyValueDB **pdb, bool open_db=true) {
+ int r = _mount(true, open_db);
+ if (r < 0)
+ return r;
+ *pdb = db;
+ return 0;
+ }
+
+ int write_meta(const std::string& key, const std::string& value) override;
+ int read_meta(const std::string& key, std::string *value) override;
+
+ int cold_open();
+ int cold_close();
+
+ int fsck(bool deep) override {
+ return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
+ }
+ int repair(bool deep) override {
+ return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
+ }
+ int quick_fix() override {
+ return _fsck(FSCK_SHALLOW, true);
+ }
+
+ void set_cache_shards(unsigned num) override;
+ void dump_cache_stats(Formatter *f) override {
+ int onode_count = 0, buffers_bytes = 0;
+ for (auto i: cache_shards) {
+ onode_count += i->_get_num_onodes();
+ buffers_bytes += i->_get_buffer_bytes();
+ }
+ f->dump_int("bluestore_onode", onode_count);
+ f->dump_int("bluestore_buffers", buffers_bytes);
+ }
+ void dump_cache_stats(ostream& ss) override {
+ int onode_count = 0, buffers_bytes = 0;
+ for (auto i: cache_shards) {
+ onode_count += i->_get_num_onodes();
+ buffers_bytes += i->_get_buffer_bytes();
+ }
+ ss << "bluestore_onode: " << onode_count;
+ ss << "bluestore_buffers: " << buffers_bytes;
+ }
+
+ int validate_hobject_key(const hobject_t &obj) const override {
+ return 0;
+ }
+ unsigned get_max_attr_name_length() override {
+ return 256; // arbitrary; there is no real limit internally
+ }
+
+ int mkfs() override;
+ int mkjournal() override {
+ return 0;
+ }
+
+ void get_db_statistics(Formatter *f) override;
+ void generate_db_histogram(Formatter *f) override;
+ void _flush_cache();
+ int flush_cache(ostream *os = NULL) override;
+ void dump_perf_counters(Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+
+ int add_new_bluefs_device(int id, const string& path);
+ int migrate_to_existing_bluefs_device(const set<int>& devs_source,
+ int id);
+ int migrate_to_new_bluefs_device(const set<int>& devs_source,
+ int id,
+ const string& path);
+ int expand_devices(ostream& out);
+ string get_device_path(unsigned id);
+
+ int dump_bluefs_sizes(ostream& out);
+
+public:
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
+
+ void collect_metadata(map<string,string> *pm) override;
+
+ bool exists(CollectionHandle &c, const ghobject_t& oid) override;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ int stat(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override;
+ int read(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0) override;
+ int _do_read(
+ Collection *c,
+ OnodeRef o,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0,
+ uint64_t retry_count = 0);
+
+private:
+ int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
+ uint64_t offset, size_t len, interval_set<uint64_t>& destset);
+public:
+ int fiemap(CollectionHandle &c, const ghobject_t& oid,
+ uint64_t offset, size_t len, bufferlist& bl) override;
+ int fiemap(CollectionHandle &c, const ghobject_t& oid,
+ uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
+
+
+ int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
+ bufferptr& value) override;
+
+ int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ map<string,bufferptr>& aset) override;
+
+ int list_collections(vector<coll_t>& ls) override;
+
+ CollectionHandle open_collection(const coll_t &c) override;
+ CollectionHandle create_new_collection(const coll_t& cid) override;
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override;
+
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override;
+ int collection_bits(CollectionHandle& c) override;
+
+ int collection_list(CollectionHandle &c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next) override;
+
+ int collection_list_legacy(CollectionHandle &c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls,
+ ghobject_t *next) override;
+
+ int omap_get(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ ) override;
+
+ /// Get omap header
+ int omap_get_header(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) override;
+
+ /// Get keys defined on oid
+ int omap_get_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ ) override;
+
+ /// Get key values
+ int omap_get_values(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ ) override;
+
+ /// Filters keys into out which are defined on oid
+ int omap_check_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ ) override;
+
+ ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) override;
+
+ void set_fsid(uuid_d u) override {
+ fsid = u;
+ }
+ uuid_d get_fsid() override {
+ return fsid;
+ }
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+ return num_objects * 300; //assuming per-object overhead is 300 bytes
+ }
+
+ struct BSPerfTracker {
+ PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+ PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+ objectstore_perf_stat_t get_cur_stats() const {
+ objectstore_perf_stat_t ret;
+ ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
+ ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
+ return ret;
+ }
+
+ void update_from_perfcounters(PerfCounters &logger);
+ } perf_tracker;
+
+ objectstore_perf_stat_t get_cur_stats() override {
+ perf_tracker.update_from_perfcounters(*logger);
+ return perf_tracker.get_cur_stats();
+ }
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+ const PerfCounters* get_bluefs_perf_counters() const {
+ return bluefs->get_perf_counters();
+ }
+
+ int queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) override;
+
+ // error injection
+ void inject_data_error(const ghobject_t& o) override {
+ RWLock::WLocker l(debug_read_error_lock);
+ debug_data_error_objects.insert(o);
+ }
+ void inject_mdata_error(const ghobject_t& o) override {
+ RWLock::WLocker l(debug_read_error_lock);
+ debug_mdata_error_objects.insert(o);
+ }
+
+ /// methods to inject various errors fsck can repair
+ void inject_broken_shared_blob_key(const string& key,
+ const bufferlist& bl);
+ void inject_leaked(uint64_t len);
+ void inject_false_free(coll_t cid, ghobject_t oid);
+ void inject_statfs(const string& key, const store_statfs_t& new_statfs);
+ void inject_global_statfs(const store_statfs_t& new_statfs);
+ void inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset);
+ void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id);
+ // resets global per_pool_omap in DB
+
+ void compact() override {
+ ceph_assert(db);
+ db->compact();
+ }
+ bool has_builtin_csum() const override {
+ return true;
+ }
+
+ /*
+ Allocate space for BlueFS from slow device.
+ Either automatically applies allocated extents to underlying
+ BlueFS (extents == nullptr) or just return them (non-null extents) provided
+ */
+ int allocate_bluefs_freespace(
+ uint64_t min_size,
+ uint64_t size,
+ PExtentVector* extents);
+
+ inline void log_latency(const char* name,
+ int idx,
+ const ceph::timespan& lat,
+ double lat_threshold,
+ const char* info = "") const;
+
+ inline void log_latency_fn(const char* name,
+ int idx,
+ const ceph::timespan& lat,
+ double lat_threshold,
+ std::function<string (const ceph::timespan& lat)> fn) const;
+
+private:
+ bool _debug_data_eio(const ghobject_t& o) {
+ if (!cct->_conf->bluestore_debug_inject_read_err) {
+ return false;
+ }
+ RWLock::RLocker l(debug_read_error_lock);
+ return debug_data_error_objects.count(o);
+ }
+ bool _debug_mdata_eio(const ghobject_t& o) {
+ if (!cct->_conf->bluestore_debug_inject_read_err) {
+ return false;
+ }
+ RWLock::RLocker l(debug_read_error_lock);
+ return debug_mdata_error_objects.count(o);
+ }
+ void _debug_obj_on_delete(const ghobject_t& o) {
+ if (cct->_conf->bluestore_debug_inject_read_err) {
+ RWLock::WLocker l(debug_read_error_lock);
+ debug_data_error_objects.erase(o);
+ debug_mdata_error_objects.erase(o);
+ }
+ }
+private:
+ ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
+ string failed_cmode;
+ set<string> failed_compressors;
+ string spillover_alert;
+ string legacy_statfs_alert;
+ string disk_size_mismatch_alert;
+
+ void _log_alerts(osd_alert_list_t& alerts);
+ bool _set_compression_alert(bool cmode, const char* s) {
+ std::lock_guard l(qlock);
+ if (cmode) {
+ bool ret = failed_cmode.empty();
+ failed_cmode = s;
+ return ret;
+ }
+ return failed_compressors.emplace(s).second;
+ }
+ void _clear_compression_alert() {
+ std::lock_guard l(qlock);
+ failed_compressors.clear();
+ failed_cmode.clear();
+ }
+
+ void _set_spillover_alert(const string& s) {
+ std::lock_guard l(qlock);
+ spillover_alert = s;
+ }
+ void _clear_spillover_alert() {
+ std::lock_guard l(qlock);
+ spillover_alert.clear();
+ }
+
+ void _check_legacy_statfs_alert();
+ void _set_disk_size_mismatch_alert(const string& s) {
+ std::lock_guard l(qlock);
+ disk_size_mismatch_alert = s;
+ }
+
+private:
+
+ // --------------------------------------------------------
+ // read processing internal methods
+ int _verify_csum(
+ OnodeRef& o,
+ const bluestore_blob_t* blob,
+ uint64_t blob_xoffset,
+ const bufferlist& bl,
+ uint64_t logical_offset) const;
+ int _decompress(bufferlist& source, bufferlist* result);
+
+
+ // --------------------------------------------------------
+ // write ops
+
+ struct WriteContext {
+ bool buffered = false; ///< buffered write
+ bool compress = false; ///< compressed write
+ uint64_t target_blob_size = 0; ///< target (max) blob size
+ unsigned csum_order = 0; ///< target checksum chunk order
+
+ old_extent_map_t old_extents; ///< must deref these blobs
+ interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
+
+ struct write_item {
+ uint64_t logical_offset; ///< write logical offset
+ BlobRef b;
+ uint64_t blob_length;
+ uint64_t b_off;
+ bufferlist bl;
+ uint64_t b_off0; ///< original offset in a blob prior to padding
+ uint64_t length0; ///< original data length prior to padding
+
+ bool mark_unused;
+ bool new_blob; ///< whether new blob was created
+
+ bool compressed = false;
+ bufferlist compressed_bl;
+ size_t compressed_len = 0;
+
+ write_item(
+ uint64_t logical_offs,
+ BlobRef b,
+ uint64_t blob_len,
+ uint64_t o,
+ bufferlist& bl,
+ uint64_t o0,
+ uint64_t l0,
+ bool _mark_unused,
+ bool _new_blob)
+ :
+ logical_offset(logical_offs),
+ b(b),
+ blob_length(blob_len),
+ b_off(o),
+ bl(bl),
+ b_off0(o0),
+ length0(l0),
+ mark_unused(_mark_unused),
+ new_blob(_new_blob) {}
+ };
+ vector<write_item> writes; ///< blobs we're writing
+
+ /// partial clone of the context
+ void fork(const WriteContext& other) {
+ buffered = other.buffered;
+ compress = other.compress;
+ target_blob_size = other.target_blob_size;
+ csum_order = other.csum_order;
+ }
+ void write(
+ uint64_t loffs,
+ BlobRef b,
+ uint64_t blob_len,
+ uint64_t o,
+ bufferlist& bl,
+ uint64_t o0,
+ uint64_t len0,
+ bool _mark_unused,
+ bool _new_blob) {
+ writes.emplace_back(loffs,
+ b,
+ blob_len,
+ o,
+ bl,
+ o0,
+ len0,
+ _mark_unused,
+ _new_blob);
+ }
+ /// Checks for writes to the same pextent within a blob
+ bool has_conflict(
+ BlobRef b,
+ uint64_t loffs,
+ uint64_t loffs_end,
+ uint64_t min_alloc_size);
+ };
+
+ void _do_write_small(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx);
+ void _do_write_big(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx);
+ int _do_alloc_write(
+ TransContext *txc,
+ CollectionRef c,
+ OnodeRef o,
+ WriteContext *wctx);
+ void _wctx_finish(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ WriteContext *wctx,
+ set<SharedBlob*> *maybe_unshared_blobs=0);
+
+ int _write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len,
+ bufferlist& bl,
+ uint32_t fadvise_flags);
+ void _pad_zeros(bufferlist *bl, uint64_t *offset,
+ uint64_t chunk_size);
+
+ void _choose_write_options(CollectionRef& c,
+ OnodeRef o,
+ uint32_t fadvise_flags,
+ WriteContext *wctx);
+
+ int _do_gc(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ const WriteContext& wctx,
+ uint64_t *dirty_start,
+ uint64_t *dirty_end);
+
+ int _do_write(TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags);
+ void _do_write_data(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ WriteContext *wctx);
+
+ int _touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _do_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len);
+ int _zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len);
+ void _do_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ set<SharedBlob*> *maybe_unshared_blobs=0);
+ int _truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset);
+ int _remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _do_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o);
+ int _setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val);
+ int _setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset);
+ int _rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name);
+ int _rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ void _do_omap_clear(TransContext *txc, const string& prefix, uint64_t id);
+ int _omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl);
+ int _omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& header);
+ int _omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl);
+ int _omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last);
+ int _set_alloc_hint(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+ int _do_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff);
+ int _clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo);
+ int _clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff);
+ int _rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid);
+ int _create_collection(TransContext *txc, const coll_t &cid,
+ unsigned bits, CollectionRef *c);
+ int _remove_collection(TransContext *txc, const coll_t &cid,
+ CollectionRef *c);
+ void _do_remove_collection(TransContext *txc, CollectionRef *c);
+ int _split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem);
+ int _merge_collection(TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits);
+
+private:
+ std::atomic<uint64_t> out_of_sync_fm = {0};
+ // --------------------------------------------------------
+ // BlueFSDeviceExpander implementation
+ uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
+ uint64_t bluefs_total) override {
+ auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total);
+ return delta > 0 ? delta : 0;
+ }
+ int allocate_freespace(
+ uint64_t min_size,
+ uint64_t size,
+ PExtentVector& extents) override {
+ return allocate_bluefs_freespace(min_size, size, &extents);
+ };
+ size_t available_freespace(uint64_t alloc_size) override;
+
+public:
+ struct sb_info_t {
+ coll_t cid;
+ int64_t pool_id = INT64_MIN;
+ list<ghobject_t> oids;
+ BlueStore::SharedBlobRef sb;
+ bluestore_extent_ref_map_t ref_map;
+ bool compressed = false;
+ bool passed = false;
+ bool updated = false;
+ };
+ typedef btree::btree_set<
+ uint64_t, std::less<uint64_t>,
+ mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
+
+ typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t;
+ struct FSCK_ObjectCtx {
+ int64_t& errors;
+ int64_t& warnings;
+ uint64_t& num_objects;
+ uint64_t& num_extents;
+ uint64_t& num_blobs;
+ uint64_t& num_sharded_objects;
+ uint64_t& num_spanning_blobs;
+
+ mempool_dynamic_bitset* used_blocks;
+ uint64_t_btree_t* used_omap_head;
+ uint64_t_btree_t* used_per_pool_omap_head;
+ uint64_t_btree_t* used_pgmeta_omap_head;
+
+ ceph::mutex* sb_info_lock;
+ sb_info_map_t& sb_info;
+
+ store_statfs_t& expected_store_statfs;
+ per_pool_statfs& expected_pool_statfs;
+ BlueStoreRepairer* repairer;
+
+ FSCK_ObjectCtx(int64_t& e,
+ int64_t& w,
+ uint64_t& _num_objects,
+ uint64_t& _num_extents,
+ uint64_t& _num_blobs,
+ uint64_t& _num_sharded_objects,
+ uint64_t& _num_spanning_blobs,
+ mempool_dynamic_bitset* _ub,
+ uint64_t_btree_t* _used_omap_head,
+ uint64_t_btree_t* _used_per_pool_omap_head,
+ uint64_t_btree_t* _used_pgmeta_omap_head,
+ ceph::mutex* _sb_info_lock,
+ sb_info_map_t& _sb_info,
+ store_statfs_t& _store_statfs,
+ per_pool_statfs& _pool_statfs,
+ BlueStoreRepairer* _repairer) :
+ errors(e),
+ warnings(w),
+ num_objects(_num_objects),
+ num_extents(_num_extents),
+ num_blobs(_num_blobs),
+ num_sharded_objects(_num_sharded_objects),
+ num_spanning_blobs(_num_spanning_blobs),
+ used_blocks(_ub),
+ used_omap_head(_used_omap_head),
+ used_per_pool_omap_head(_used_per_pool_omap_head),
+ used_pgmeta_omap_head(_used_pgmeta_omap_head),
+ sb_info_lock(_sb_info_lock),
+ sb_info(_sb_info),
+ expected_store_statfs(_store_statfs),
+ expected_pool_statfs(_pool_statfs),
+ repairer(_repairer) {
+ }
+ };
+
+ OnodeRef fsck_check_objects_shallow(
+ FSCKDepth depth,
+ int64_t pool_id,
+ CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value,
+ mempool::bluestore_fsck::list<string>& expecting_shards,
+ map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+ const BlueStore::FSCK_ObjectCtx& ctx);
+
+private:
+ void _fsck_check_objects(FSCKDepth depth,
+ FSCK_ObjectCtx& ctx);
+};
+
+inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) {
+ return out
+ << " allocated:"
+ << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
+ << " stored:"
+ << s.values[BlueStore::volatile_statfs::STATFS_STORED]
+ << " compressed:"
+ << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
+ << " compressed_orig:"
+ << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
+ << " compressed_alloc:"
+ << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
+}
+
+static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(BlueStore::Onode *o) {
+ o->put();
+}
+
+static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
+ o->put();
+}
+
+class BlueStoreRepairer
+{
+public:
+ // to simplify future potential migration to mempools
+ using fsck_interval = interval_set<uint64_t>;
+
+ // Structure to track what pextents are used for specific cid/oid.
+ // Similar to Bloom filter positive and false-positive matches are
+ // possible only.
+ // Maintains two lists of bloom filters for both cids and oids
+ // where each list entry is a BF for specific disk pextent
+ // The length of the extent per filter is measured on init.
+ // Allows to filter out 'uninteresting' pextents to speadup subsequent
+ // 'is_used' access.
+ struct StoreSpaceTracker {
+ const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
+ const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
+ const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
+ static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
+
+ typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
+ bloom_vector collections_bfs;
+ bloom_vector objects_bfs;
+
+ bool was_filtered_out = false;
+ uint64_t granularity = 0; // extent length for a single filter
+
+ StoreSpaceTracker() {
+ }
+ StoreSpaceTracker(const StoreSpaceTracker& from) :
+ collections_bfs(from.collections_bfs),
+ objects_bfs(from.objects_bfs),
+ granularity(from.granularity) {
+ }
+
+ void init(uint64_t total,
+ uint64_t min_alloc_size,
+ uint64_t mem_cap = DEF_MEM_CAP) {
+ ceph_assert(!granularity); // not initialized yet
+ ceph_assert(min_alloc_size && isp2(min_alloc_size));
+ ceph_assert(mem_cap);
+
+ total = round_up_to(total, min_alloc_size);
+ granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
+
+ if (!granularity) {
+ granularity = min_alloc_size;
+ } else {
+ granularity = round_up_to(granularity, min_alloc_size);
+ }
+
+ uint64_t entries = round_up_to(total, granularity) / granularity;
+ collections_bfs.resize(entries,
+ bloom_filter(BLOOM_FILTER_SALT_COUNT,
+ BLOOM_FILTER_TABLE_SIZE,
+ 0,
+ BLOOM_FILTER_EXPECTED_COUNT));
+ objects_bfs.resize(entries,
+ bloom_filter(BLOOM_FILTER_SALT_COUNT,
+ BLOOM_FILTER_TABLE_SIZE,
+ 0,
+ BLOOM_FILTER_EXPECTED_COUNT));
+ }
+ inline uint32_t get_hash(const coll_t& cid) const {
+ return cid.hash_to_shard(1);
+ }
+ inline void set_used(uint64_t offset, uint64_t len,
+ const coll_t& cid, const ghobject_t& oid) {
+ ceph_assert(granularity); // initialized
+
+ // can't call this func after filter_out has been applied
+ ceph_assert(!was_filtered_out);
+ if (!len) {
+ return;
+ }
+ auto pos = offset / granularity;
+ auto end_pos = (offset + len - 1) / granularity;
+ while (pos <= end_pos) {
+ collections_bfs[pos].insert(get_hash(cid));
+ objects_bfs[pos].insert(oid.hobj.get_hash());
+ ++pos;
+ }
+ }
+ // filter-out entries unrelated to the specified(broken) extents.
+ // 'is_used' calls are permitted after that only
+ size_t filter_out(const fsck_interval& extents);
+
+ // determines if collection's present after filtering-out
+ inline bool is_used(const coll_t& cid) const {
+ ceph_assert(was_filtered_out);
+ for(auto& bf : collections_bfs) {
+ if (bf.contains(get_hash(cid))) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // determines if object's present after filtering-out
+ inline bool is_used(const ghobject_t& oid) const {
+ ceph_assert(was_filtered_out);
+ for(auto& bf : objects_bfs) {
+ if (bf.contains(oid.hobj.get_hash())) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // determines if collection's present before filtering-out
+ inline bool is_used(const coll_t& cid, uint64_t offs) const {
+ ceph_assert(granularity); // initialized
+ ceph_assert(!was_filtered_out);
+ auto &bf = collections_bfs[offs / granularity];
+ if (bf.contains(get_hash(cid))) {
+ return true;
+ }
+ return false;
+ }
+ // determines if object's present before filtering-out
+ inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
+ ceph_assert(granularity); // initialized
+ ceph_assert(!was_filtered_out);
+ auto &bf = objects_bfs[offs / granularity];
+ if (bf.contains(oid.hobj.get_hash())) {
+ return true;
+ }
+ return false;
+ }
+ };
+public:
+
+ bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
+ bool fix_shared_blob(KeyValueDB *db,
+ uint64_t sbid,
+ const bufferlist* bl);
+ bool fix_statfs(KeyValueDB *db, const string& key,
+ const store_statfs_t& new_statfs);
+
+ bool fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len);
+ bool fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len);
+ bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag);
+ KeyValueDB::Transaction fix_spanning_blobs(KeyValueDB* db);
+
+ void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
+
+ bool preprocess_misreference(KeyValueDB *db);
+
+ unsigned apply(KeyValueDB* db);
+
+ void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
+ misreferenced_extents.union_insert(offs, len);
+ if (inc_error) {
+ ++to_repair_cnt;
+ }
+ }
+ void inc_repaired() {
+ ++to_repair_cnt;
+ }
+
+ StoreSpaceTracker& get_space_usage_tracker() {
+ return space_usage_tracker;
+ }
+ const fsck_interval& get_misreferences() const {
+ return misreferenced_extents;
+ }
+ KeyValueDB::Transaction get_fix_misreferences_txn() {
+ return fix_misreferences_txn;
+ }
+
+private:
+ unsigned to_repair_cnt = 0;
+ KeyValueDB::Transaction fix_fm_leaked_txn;
+ KeyValueDB::Transaction fix_fm_false_free_txn;
+ KeyValueDB::Transaction remove_key_txn;
+ KeyValueDB::Transaction fix_statfs_txn;
+ KeyValueDB::Transaction fix_shared_blob_txn;
+
+ KeyValueDB::Transaction fix_misreferences_txn;
+ KeyValueDB::Transaction fix_onode_txn;
+
+ StoreSpaceTracker space_usage_tracker;
+
+ // non-shared extents with multiple references
+ fsck_interval misreferenced_extents;
+
+};
+
+class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
+{
+ template <class T, size_t MaxX, size_t MaxY>
+ class matrix_2d {
+ T values[MaxX][MaxY];
+ public:
+ matrix_2d() {
+ clear();
+ }
+ T& at(size_t x, size_t y) {
+ ceph_assert(x < MaxX);
+ ceph_assert(y < MaxY);
+
+ return values[x][y];
+ }
+ size_t get_max_x() const {
+ return MaxX;
+ }
+ size_t get_max_y() const {
+ return MaxY;
+ }
+ void clear() {
+ memset(values, 0, sizeof(values));
+ }
+ };
+
+ enum {
+ // use 0/nullptr as unset indication
+ LEVEL_FIRST = 1,
+ LEVEL_WAL = LEVEL_FIRST,
+ LEVEL_DB,
+ LEVEL_SLOW,
+ LEVEL_MAX
+ };
+ // add +1 row for corresponding per-device totals
+ // add +1 column for per-level actual (taken from file size) total
+ typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
+
+ per_level_per_dev_usage_t per_level_per_dev_usage;
+
+ // Note: maximum per-device totals below might be smaller than corresponding
+ // perf counters by up to a single alloc unit (1M) due to superblock extent.
+ // The later is not accounted here.
+ per_level_per_dev_usage_t per_level_per_dev_max;
+
+ uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
+ uint64_t db_avail4slow = 0;
+ enum {
+ OLD_POLICY,
+ USE_SOME_EXTRA
+ };
+
+public:
+ RocksDBBlueFSVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total,
+ uint64_t _level0_size,
+ uint64_t _level_base,
+ uint64_t _level_multiplier,
+ double reserved_factor,
+ uint64_t reserved,
+ bool new_pol)
+ {
+ l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
+ l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
+ l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
+
+ if (!new_pol) {
+ return;
+ }
+
+ // Calculating how much extra space is available at DB volume.
+ // Depending on the presence of explicit reserved size specification it might be either
+ // * DB volume size - reserved
+ // or
+ // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
+ if (!reserved) {
+ uint64_t prev_levels = _level0_size;
+ uint64_t cur_level = _level_base;
+ uint64_t cur_threshold = 0;
+ do {
+ uint64_t next_level = cur_level * _level_multiplier;
+ uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+ if (_db_total <= next_threshold) {
+ db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+ break;
+ } else {
+ prev_levels += cur_level;
+ cur_level = next_level;
+ cur_threshold = next_threshold;
+ }
+ } while (true);
+ } else {
+ db_avail4slow = _db_total - reserved;
+ }
+ }
+
+ void* get_hint_by_device(uint8_t dev) const override {
+ ceph_assert(dev == BlueFS::BDEV_WAL); // others aren't used atm
+ return reinterpret_cast<void*>(LEVEL_WAL);
+ }
+ void* get_hint_by_dir(const string& dirname) const override;
+
+ void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ for (auto& p : fnode.extents) {
+ auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+ auto& max = per_level_per_dev_max.at(p.bdev, pos);
+ cur += p.length;
+ if (cur > max) {
+ max = cur;
+ }
+ {
+ //update per-device totals
+ auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ cur += p.length;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ }
+ {
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ cur += fnode.size;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ }
+ void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ for (auto& p : fnode.extents) {
+ auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+ ceph_assert(cur >= p.length);
+ cur -= p.length;
+
+ //update per-device totals
+ auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ ceph_assert(cur2 >= p.length);
+ cur2 -= p.length;
+ }
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= fnode.size);
+ cur -= fnode.size;
+ }
+ void add_usage(void* hint, uint64_t fsize) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ cur += fsize;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ void sub_usage(void* hint, uint64_t fsize) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= fsize);
+ per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize;
+ }
+
+ uint8_t select_prefer_bdev(void* h) override;
+ void get_paths(
+ const std::string& base,
+ BlueFSVolumeSelector::paths& res) const override;
+
+ void dump(ostream& sout) override;
+};
+
+#endif
diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc
new file mode 100644
index 00000000..8aeb4526
--- /dev/null
+++ b/src/os/bluestore/FreelistManager.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "FreelistManager.h"
+#include "BitmapFreelistManager.h"
+
+FreelistManager *FreelistManager::create(
+ CephContext* cct,
+ string type,
+ string prefix)
+{
+ // a bit of a hack... we hard-code the prefixes here. we need to
+ // put the freelistmanagers in different prefixes because the merge
+ // op is per prefix, has to done pre-db-open, and we don't know the
+ // freelist type until after we open the db.
+ ceph_assert(prefix == "B");
+ if (type == "bitmap")
+ return new BitmapFreelistManager(cct, "B", "b");
+ return NULL;
+}
+
+void FreelistManager::setup_merge_operators(KeyValueDB *db)
+{
+ BitmapFreelistManager::setup_merge_operator(db, "b");
+}
diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h
new file mode 100644
index 00000000..56e05d14
--- /dev/null
+++ b/src/os/bluestore/FreelistManager.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_FREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_FREELISTMANAGER_H
+
+#include <string>
+#include <map>
+#include <mutex>
+#include <ostream>
+#include "kv/KeyValueDB.h"
+
+class FreelistManager {
+public:
+ CephContext* cct;
+ FreelistManager(CephContext* cct) : cct(cct) {}
+ virtual ~FreelistManager() {}
+
+ static FreelistManager *create(
+ CephContext* cct,
+ string type,
+ string prefix);
+
+ static void setup_merge_operators(KeyValueDB *db);
+
+ virtual int create(uint64_t size, uint64_t granularity,
+ KeyValueDB::Transaction txn) = 0;
+
+ virtual int expand(uint64_t new_size,
+ KeyValueDB::Transaction txn) = 0;
+
+ virtual int init(KeyValueDB *kvdb) = 0;
+ virtual void shutdown() = 0;
+
+ virtual void dump(KeyValueDB *kvdb) = 0;
+
+ virtual void enumerate_reset() = 0;
+ virtual bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) = 0;
+
+ virtual void allocate(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) = 0;
+ virtual void release(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) = 0;
+
+ virtual uint64_t get_size() const = 0;
+ virtual uint64_t get_alloc_units() const = 0;
+ virtual uint64_t get_alloc_size() const = 0;
+
+};
+
+
+#endif
diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc
new file mode 100644
index 00000000..6caf5c6d
--- /dev/null
+++ b/src/os/bluestore/HybridAllocator.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "HybridAllocator.h"
+
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "HybridAllocator "
+
+
+int64_t HybridAllocator::allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " want 0x" << want
+ << " unit 0x" << unit
+ << " max_alloc_size 0x" << max_alloc_size
+ << " hint 0x" << hint
+ << std::dec << dendl;
+ ceph_assert(isp2(unit));
+ ceph_assert(want % unit == 0);
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want;
+ }
+ if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+ max_alloc_size >= cap) {
+ max_alloc_size = p2align(uint64_t(cap), (uint64_t)get_block_size());
+ }
+
+ std::lock_guard l(lock);
+
+ int64_t res;
+ PExtentVector local_extents;
+
+ // preserve original 'extents' vector state
+ auto orig_size = extents->size();
+ auto orig_pos = extents->end();
+ if (orig_size) {
+ --orig_pos;
+ }
+
+ // try bitmap first to avoid unneeded contiguous extents split if
+ // desired amount is less than shortes range in AVL
+ if (bmap_alloc && bmap_alloc->get_free() &&
+ want < _lowest_size_available()) {
+ res = bmap_alloc->allocate(want, unit, max_alloc_size, hint, extents);
+ if (res < 0) {
+ // got a failure, release already allocated and
+ // start over allocation from avl
+ if (orig_size) {
+ local_extents.insert(
+ local_extents.end(), ++orig_pos, extents->end());
+ extents->resize(orig_size);
+ } else {
+ extents->swap(local_extents);
+ }
+ bmap_alloc->release(local_extents);
+ res = 0;
+ }
+ if ((uint64_t)res < want) {
+ auto res2 = _allocate(want - res, unit, max_alloc_size, hint, extents);
+ if (res2 < 0) {
+ res = res2; // caller to do the release
+ } else {
+ res += res2;
+ }
+ }
+ } else {
+ res = _allocate(want, unit, max_alloc_size, hint, extents);
+ if (res < 0) {
+ // got a failure, release already allocated and
+ // start over allocation from bitmap
+ if (orig_size) {
+ local_extents.insert(
+ local_extents.end(), ++orig_pos, extents->end());
+ extents->resize(orig_size);
+ } else {
+ extents->swap(local_extents);
+ }
+ _release(local_extents);
+ res = 0;
+ }
+ if ((uint64_t)res < want ) {
+ auto res2 = bmap_alloc ?
+ bmap_alloc->allocate(want - res, unit, max_alloc_size, hint, extents) :
+ 0;
+ if (res2 < 0 ) {
+ res = res2; // caller to do the release
+ } else {
+ res += res2;
+ }
+ }
+ }
+ return res ? res : -ENOSPC;
+}
+
+void HybridAllocator::release(const interval_set<uint64_t>& release_set) {
+ std::lock_guard l(lock);
+ // this will attempt to put free ranges into AvlAllocator first and
+ // fallback to bitmap one via _try_insert_range call
+ _release(release_set);
+}
+
+uint64_t HybridAllocator::get_free()
+{
+ std::lock_guard l(lock);
+ return (bmap_alloc ? bmap_alloc->get_free() : 0) + _get_free();
+}
+
+double HybridAllocator::get_fragmentation()
+{
+ std::lock_guard l(lock);
+ auto f = AvlAllocator::_get_fragmentation();
+ auto bmap_free = bmap_alloc ? bmap_alloc->get_free() : 0;
+ if (bmap_free) {
+ auto _free = _get_free() + bmap_free;
+ auto bf = bmap_alloc->get_fragmentation();
+
+ f = f * _get_free() / _free + bf * bmap_free / _free;
+ }
+ return f;
+}
+
+void HybridAllocator::dump()
+{
+ std::lock_guard l(lock);
+ AvlAllocator::_dump();
+ if (bmap_alloc) {
+ bmap_alloc->dump();
+ }
+ ldout(cct, 0) << __func__
+ << " avl_free: " << _get_free()
+ << " bmap_free: " << (bmap_alloc ? bmap_alloc->get_free() : 0)
+ << dendl;
+}
+
+void HybridAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ AvlAllocator::dump(notify);
+ if (bmap_alloc) {
+ bmap_alloc->dump(notify);
+ }
+}
+
+void HybridAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _try_remove_from_tree(offset, length,
+ [&](uint64_t o, uint64_t l, bool found) {
+ if (!found) {
+ if (bmap_alloc) {
+ bmap_alloc->init_rm_free(o, l);
+ } else {
+ lderr(cct) << "init_rm_free lambda" << std::hex
+ << "Uexpected extent: "
+ << " 0x" << o << "~" << l
+ << std::dec << dendl;
+ ceph_assert(false);
+ }
+ }
+ });
+}
+
+void HybridAllocator::shutdown()
+{
+ std::lock_guard l(lock);
+ _shutdown();
+ if (bmap_alloc) {
+ bmap_alloc->shutdown();
+ delete bmap_alloc;
+ bmap_alloc = nullptr;
+ }
+}
+
+void HybridAllocator::_spillover_range(uint64_t start, uint64_t end)
+{
+ auto size = end - start;
+ dout(20) << __func__
+ << std::hex << " "
+ << start << "~" << size
+ << std::dec
+ << dendl;
+ ceph_assert(size);
+ if (!bmap_alloc) {
+ dout(1) << __func__
+ << std::hex
+ << " constructing fallback allocator"
+ << dendl;
+ bmap_alloc = new BitmapAllocator(cct,
+ get_capacity(),
+ get_block_size(),
+ get_name() + ".fallback");
+ }
+ bmap_alloc->init_add_free(start, size);
+}
+
+void HybridAllocator::_add_to_tree(uint64_t start, uint64_t size)
+{
+ if (bmap_alloc) {
+ uint64_t head = bmap_alloc->claim_free_to_left(start);
+ uint64_t tail = bmap_alloc->claim_free_to_right(start + size);
+ ceph_assert(head <= start);
+ start -= head;
+ size += head + tail;
+ }
+ AvlAllocator::_add_to_tree(start, size);
+}
diff --git a/src/os/bluestore/HybridAllocator.h b/src/os/bluestore/HybridAllocator.h
new file mode 100644
index 00000000..e8246cf4
--- /dev/null
+++ b/src/os/bluestore/HybridAllocator.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+
+#include "AvlAllocator.h"
+#include "BitmapAllocator.h"
+
+class HybridAllocator : public AvlAllocator {
+ BitmapAllocator* bmap_alloc = nullptr;
+public:
+ HybridAllocator(CephContext* cct, int64_t device_size, int64_t _block_size,
+ uint64_t max_mem,
+ const std::string& name) :
+ AvlAllocator(cct, device_size, _block_size, max_mem, name) {
+ }
+ int64_t allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents) override;
+ void release(const interval_set<uint64_t>& release_set) override;
+ uint64_t get_free() override;
+ double get_fragmentation() override;
+
+ void dump() override;
+ void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+ void shutdown() override;
+
+protected:
+ // intended primarily for UT
+ BitmapAllocator* get_bmap() {
+ return bmap_alloc;
+ }
+ const BitmapAllocator* get_bmap() const {
+ return bmap_alloc;
+ }
+private:
+
+ void _spillover_range(uint64_t start, uint64_t end) override;
+
+ // called when extent to be released/marked free
+ void _add_to_tree(uint64_t start, uint64_t size) override;
+};
diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc
new file mode 100644
index 00000000..2a20f209
--- /dev/null
+++ b/src/os/bluestore/KernelDevice.cc
@@ -0,0 +1,1185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "KernelDevice.h"
+#include "include/types.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/blkdev.h"
+#include "common/errno.h"
+#if defined(__FreeBSD__)
+#include "bsm/audit_errno.h"
+#endif
+#include "common/debug.h"
+#include "common/align.h"
+#include "common/numa.h"
+
+#include "global/global_context.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev(" << this << " " << path << ") "
+
+KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
+ : BlockDevice(cct, cb, cbpriv),
+ aio(false), dio(false),
+ aio_queue(cct->_conf->bdev_aio_max_queue_depth),
+ discard_callback(d_cb),
+ discard_callback_priv(d_cbpriv),
+ aio_stop(false),
+ discard_started(false),
+ discard_stop(false),
+ aio_thread(this),
+ discard_thread(this),
+ injecting_crash(0)
+{
+ fd_directs.resize(WRITE_LIFE_MAX, -1);
+ fd_buffereds.resize(WRITE_LIFE_MAX, -1);
+}
+
+int KernelDevice::_lock()
+{
+ dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl;
+ // When the block changes, systemd-udevd will open the block,
+ // read some information and close it. Then a failure occurs here.
+ // So we need to try again here.
+ int fd = fd_directs[WRITE_LIFE_NOT_SET];
+ uint64_t nr_tries = 0;
+ for (;;) {
+ struct flock fl = { F_WRLCK,
+ SEEK_SET };
+ int r = ::fcntl(fd, F_OFD_SETLK, &fl);
+ if (r < 0) {
+ if (errno == EINVAL) {
+ r = ::flock(fd, LOCK_EX | LOCK_NB);
+ }
+ }
+ if (r == 0) {
+ return 0;
+ }
+ if (errno != EAGAIN) {
+ return -errno;
+ }
+ dout(1) << __func__ << " flock busy on " << path << dendl;
+ if (const uint64_t max_retry =
+ cct->_conf.get_val<uint64_t>("bdev_flock_retry");
+ max_retry > 0 && nr_tries++ == max_retry) {
+ return -EAGAIN;
+ }
+ double retry_interval =
+ cct->_conf.get_val<double>("bdev_flock_retry_interval");
+ std::this_thread::sleep_for(ceph::make_timespan(retry_interval));
+ }
+}
+
+int KernelDevice::open(const string& p)
+{
+ path = p;
+ int r = 0, i = 0;
+ dout(1) << __func__ << " path " << path << dendl;
+
+ for (i = 0; i < WRITE_LIFE_MAX; i++) {
+ int fd = ::open(path.c_str(), O_RDWR | O_DIRECT);
+ if (fd < 0) {
+ r = -errno;
+ break;
+ }
+ fd_directs[i] = fd;
+
+ fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ break;
+ }
+ fd_buffereds[i] = fd;
+ }
+
+ if (i != WRITE_LIFE_MAX) {
+ derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
+ goto out_fail;
+ }
+
+#if defined(F_SET_FILE_RW_HINT)
+ for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) {
+ if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) {
+ r = -errno;
+ break;
+ }
+ if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) {
+ r = -errno;
+ break;
+ }
+ }
+ if (i != WRITE_LIFE_MAX) {
+ enable_wrt = false;
+ dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl;
+ }
+#endif
+
+ dio = true;
+ aio = cct->_conf->bdev_aio;
+ if (!aio) {
+ ceph_abort_msg("non-aio not supported");
+ }
+
+ // disable readahead as it will wreak havoc on our mix of
+ // directio/aio and buffered io.
+ r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM);
+ if (r) {
+ r = -r;
+ derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
+ goto out_fail;
+ }
+
+ if (lock_exclusive) {
+ r = _lock();
+ if (r < 0) {
+ derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
+ << dendl;
+ goto out_fail;
+ }
+ }
+
+ struct stat st;
+ r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
+ goto out_fail;
+ }
+
+ // Operate as though the block size is 4 KB. The backing file
+ // blksize doesn't strictly matter except that some file systems may
+ // require a read/modify/write if we write something smaller than
+ // it.
+ block_size = cct->_conf->bdev_block_size;
+ if (block_size != (unsigned)st.st_blksize) {
+ dout(1) << __func__ << " backing device/file reports st_blksize "
+ << st.st_blksize << ", using bdev_block_size "
+ << block_size << " anyway" << dendl;
+ }
+
+
+ {
+ BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]);
+ BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]);
+
+ if (S_ISBLK(st.st_mode)) {
+ int64_t s;
+ r = blkdev_direct.get_size(&s);
+ if (r < 0) {
+ goto out_fail;
+ }
+ size = s;
+ } else {
+ size = st.st_size;
+ }
+
+ char partition[PATH_MAX], devname[PATH_MAX];
+ if ((r = blkdev_buffered.partition(partition, PATH_MAX)) ||
+ (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) {
+ derr << "unable to get device name for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ rotational = true;
+ } else {
+ dout(20) << __func__ << " devname " << devname << dendl;
+ rotational = blkdev_buffered.is_rotational();
+ support_discard = blkdev_buffered.support_discard();
+ this->devname = devname;
+ _detect_vdo();
+ }
+ }
+
+ r = _aio_start();
+ if (r < 0) {
+ goto out_fail;
+ }
+ _discard_start();
+
+ // round size down to an even block
+ size &= ~(block_size - 1);
+
+ dout(1) << __func__
+ << " size " << size
+ << " (0x" << std::hex << size << std::dec << ", "
+ << byte_u_t(size) << ")"
+ << " block_size " << block_size
+ << " (" << byte_u_t(block_size) << ")"
+ << " " << (rotational ? "rotational" : "non-rotational")
+ << " discard " << (support_discard ? "supported" : "not supported")
+ << dendl;
+ return 0;
+
+out_fail:
+ for (i = 0; i < WRITE_LIFE_MAX; i++) {
+ if (fd_directs[i] >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
+ fd_directs[i] = -1;
+ } else {
+ break;
+ }
+ if (fd_buffereds[i] >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
+ fd_buffereds[i] = -1;
+ } else {
+ break;
+ }
+ }
+ return r;
+}
+
+int KernelDevice::get_devices(std::set<std::string> *ls)
+{
+ if (devname.empty()) {
+ return 0;
+ }
+ get_raw_devices(devname, ls);
+ return 0;
+}
+
+void KernelDevice::close()
+{
+ dout(1) << __func__ << dendl;
+ _aio_stop();
+ _discard_stop();
+
+ if (vdo_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
+ vdo_fd = -1;
+ }
+
+ for (int i = 0; i < WRITE_LIFE_MAX; i++) {
+ assert(fd_directs[i] >= 0);
+ VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
+ fd_directs[i] = -1;
+
+ assert(fd_buffereds[i] >= 0);
+ VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
+ fd_buffereds[i] = -1;
+ }
+ path.clear();
+}
+
+int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
+{
+ (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard);
+ (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
+ (*pm)[prefix + "size"] = stringify(get_size());
+ (*pm)[prefix + "block_size"] = stringify(get_block_size());
+ (*pm)[prefix + "driver"] = "KernelDevice";
+ if (rotational) {
+ (*pm)[prefix + "type"] = "hdd";
+ } else {
+ (*pm)[prefix + "type"] = "ssd";
+ }
+ if (vdo_fd >= 0) {
+ (*pm)[prefix + "vdo"] = "true";
+ uint64_t total, avail;
+ get_vdo_utilization(vdo_fd, &total, &avail);
+ (*pm)[prefix + "vdo_physical_size"] = stringify(total);
+ }
+
+ struct stat st;
+ int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st);
+ if (r < 0)
+ return -errno;
+ if (S_ISBLK(st.st_mode)) {
+ (*pm)[prefix + "access_mode"] = "blk";
+
+ char buffer[1024] = {0};
+ BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]};
+ if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
+ (*pm)[prefix + "partition_path"] = "unknown";
+ } else {
+ (*pm)[prefix + "partition_path"] = buffer;
+ }
+ buffer[0] = '\0';
+ if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
+ (*pm)[prefix + "dev_node"] = "unknown";
+ } else {
+ (*pm)[prefix + "dev_node"] = buffer;
+ }
+ if (!r) {
+ return 0;
+ }
+ buffer[0] = '\0';
+ blkdev.model(buffer, sizeof(buffer));
+ (*pm)[prefix + "model"] = buffer;
+
+ buffer[0] = '\0';
+ blkdev.dev(buffer, sizeof(buffer));
+ (*pm)[prefix + "dev"] = buffer;
+
+ // nvme exposes a serial number
+ buffer[0] = '\0';
+ blkdev.serial(buffer, sizeof(buffer));
+ (*pm)[prefix + "serial"] = buffer;
+
+ if (blkdev.is_nvme())
+ (*pm)[prefix + "type"] = "nvme";
+
+ // numa
+ int node;
+ r = blkdev.get_numa_node(&node);
+ if (r >= 0) {
+ (*pm)[prefix + "numa_node"] = stringify(node);
+ }
+ } else {
+ (*pm)[prefix + "access_mode"] = "file";
+ (*pm)[prefix + "path"] = path;
+ }
+ return 0;
+}
+
+void KernelDevice::_detect_vdo()
+{
+ vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name);
+ if (vdo_fd >= 0) {
+ dout(1) << __func__ << " VDO volume " << vdo_name
+ << " maps to " << devname << dendl;
+ } else {
+ dout(20) << __func__ << " no VDO volume maps to " << devname << dendl;
+ }
+ return;
+}
+
+bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const
+{
+ if (vdo_fd < 0) {
+ return false;
+ }
+ return get_vdo_utilization(vdo_fd, total, avail);
+}
+
+int KernelDevice::choose_fd(bool buffered, int write_hint) const
+{
+ assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX);
+ if (!enable_wrt)
+ write_hint = WRITE_LIFE_NOT_SET;
+ return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint];
+}
+
+int KernelDevice::flush()
+{
+ // protect flush with a mutex. note that we are not really protecting
+ // data here. instead, we're ensuring that if any flush() caller
+ // sees that io_since_flush is true, they block any racing callers
+ // until the flush is observed. that allows racing threads to be
+ // calling flush while still ensuring that *any* of them that got an
+ // aio completion notification will not return before that aio is
+ // stable on disk: whichever thread sees the flag first will block
+ // followers until the aio is stable.
+ std::lock_guard l(flush_mutex);
+
+ bool expect = true;
+ if (!io_since_flush.compare_exchange_strong(expect, false)) {
+ dout(10) << __func__ << " no-op (no ios since last flush), flag is "
+ << (int)io_since_flush.load() << dendl;
+ return 0;
+ }
+
+ dout(10) << __func__ << " start" << dendl;
+ if (cct->_conf->bdev_inject_crash) {
+ ++injecting_crash;
+ // sleep for a moment to give other threads a chance to submit or
+ // wait on io that races with a flush.
+ derr << __func__ << " injecting crash. first we sleep..." << dendl;
+ sleep(cct->_conf->bdev_inject_crash_flush_delay);
+ derr << __func__ << " and now we die" << dendl;
+ cct->_log->flush();
+ _exit(1);
+ }
+ utime_t start = ceph_clock_now();
+ int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]);
+ utime_t end = ceph_clock_now();
+ utime_t dur = end - start;
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ dout(5) << __func__ << " in " << dur << dendl;;
+ return r;
+}
+
+int KernelDevice::_aio_start()
+{
+ if (aio) {
+ dout(10) << __func__ << dendl;
+ int r = aio_queue.init();
+ if (r < 0) {
+ if (r == -EAGAIN) {
+ derr << __func__ << " io_setup(2) failed with EAGAIN; "
+ << "try increasing /proc/sys/fs/aio-max-nr" << dendl;
+ } else {
+ derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ aio_thread.create("bstore_aio");
+ }
+ return 0;
+}
+
+void KernelDevice::_aio_stop()
+{
+ if (aio) {
+ dout(10) << __func__ << dendl;
+ aio_stop = true;
+ aio_thread.join();
+ aio_stop = false;
+ aio_queue.shutdown();
+ }
+}
+
+int KernelDevice::_discard_start()
+{
+ discard_thread.create("bstore_discard");
+ return 0;
+}
+
+void KernelDevice::_discard_stop()
+{
+ dout(10) << __func__ << dendl;
+ {
+ std::unique_lock l(discard_lock);
+ while (!discard_started) {
+ discard_cond.wait(l);
+ }
+ discard_stop = true;
+ discard_cond.notify_all();
+ }
+ discard_thread.join();
+ {
+ std::lock_guard l(discard_lock);
+ discard_stop = false;
+ }
+ dout(10) << __func__ << " stopped" << dendl;
+}
+
+void KernelDevice::discard_drain()
+{
+ dout(10) << __func__ << dendl;
+ std::unique_lock l(discard_lock);
+ while (!discard_queued.empty() || discard_running) {
+ discard_cond.wait(l);
+ }
+}
+
+static bool is_expected_ioerr(const int r)
+{
+ // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
+ return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
+ r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO ||
+ r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
+#if defined(__linux__)
+ r == -EREMCHG || r == -EBADE
+#elif defined(__FreeBSD__)
+ r == - BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE
+#endif
+ );
+}
+
+void KernelDevice::_aio_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ int inject_crash_count = 0;
+ while (!aio_stop) {
+ dout(40) << __func__ << " polling" << dendl;
+ int max = cct->_conf->bdev_aio_reap_max;
+ aio_t *aio[max];
+ int r = aio_queue.get_next_completed(cct->_conf->bdev_aio_poll_ms,
+ aio, max);
+ if (r < 0) {
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("got unexpected error from io_getevents");
+ }
+ if (r > 0) {
+ dout(30) << __func__ << " got " << r << " completed aios" << dendl;
+ for (int i = 0; i < r; ++i) {
+ IOContext *ioc = static_cast<IOContext*>(aio[i]->priv);
+ _aio_log_finish(ioc, aio[i]->offset, aio[i]->length);
+ if (aio[i]->queue_item.is_linked()) {
+ std::lock_guard l(debug_queue_lock);
+ debug_aio_unlink(*aio[i]);
+ }
+
+ // set flag indicating new ios have completed. we do this *before*
+ // any completion or notifications so that any user flush() that
+ // follows the observed io completion will include this io. Note
+ // that an earlier, racing flush() could observe and clear this
+ // flag, but that also ensures that the IO will be stable before the
+ // later flush() occurs.
+ io_since_flush.store(true);
+
+ long r = aio[i]->get_return_value();
+ if (r < 0) {
+ derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
+ << dendl;
+ if (ioc->allow_eio && is_expected_ioerr(r)) {
+ derr << __func__ << " translating the error to EIO for upper layer"
+ << dendl;
+ ioc->set_return_value(-EIO);
+ } else {
+ if (is_expected_ioerr(r)) {
+ note_io_error_event(
+ devname.c_str(),
+ path.c_str(),
+ r,
+#if defined(HAVE_POSIXAIO)
+ aio[i]->aio.aiocb.aio_lio_opcode,
+#else
+ aio[i]->iocb.aio_lio_opcode,
+#endif
+ aio[i]->offset,
+ aio[i]->length);
+ ceph_abort_msg(
+ "Unexpected IO error. "
+ "This may suggest a hardware issue. "
+ "Please check your kernel log!");
+ }
+ ceph_abort_msg(
+ "Unexpected IO error. "
+ "This may suggest HW issue. Please check your dmesg!");
+ }
+ } else if (aio[i]->length != (uint64_t)r) {
+ derr << "aio to 0x" << std::hex << aio[i]->offset
+ << "~" << aio[i]->length << std::dec
+ << " but returned: " << r << dendl;
+ ceph_abort_msg("unexpected aio return value: does not match length");
+ }
+
+ dout(10) << __func__ << " finished aio " << aio[i] << " r " << r
+ << " ioc " << ioc
+ << " with " << (ioc->num_running.load() - 1)
+ << " aios left" << dendl;
+
+ // NOTE: once num_running and we either call the callback or
+ // call aio_wake we cannot touch ioc or aio[] as the caller
+ // may free it.
+ if (ioc->priv) {
+ if (--ioc->num_running == 0) {
+ aio_callback(aio_callback_priv, ioc->priv);
+ }
+ } else {
+ ioc->try_aio_wake();
+ }
+ }
+ }
+ if (cct->_conf->bdev_debug_aio) {
+ utime_t now = ceph_clock_now();
+ std::lock_guard l(debug_queue_lock);
+ if (debug_oldest) {
+ if (debug_stall_since == utime_t()) {
+ debug_stall_since = now;
+ } else {
+ if (cct->_conf->bdev_debug_aio_suicide_timeout) {
+ utime_t cutoff = now;
+ cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout;
+ if (debug_stall_since < cutoff) {
+ derr << __func__ << " stalled aio " << debug_oldest
+ << " since " << debug_stall_since << ", timeout is "
+ << cct->_conf->bdev_debug_aio_suicide_timeout
+ << "s, suicide" << dendl;
+ ceph_abort_msg("stalled aio... buggy kernel or bad device?");
+ }
+ }
+ }
+ }
+ }
+ reap_ioc();
+ if (cct->_conf->bdev_inject_crash) {
+ ++inject_crash_count;
+ if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 >
+ cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) {
+ derr << __func__ << " bdev_inject_crash trigger from aio thread"
+ << dendl;
+ cct->_log->flush();
+ _exit(1);
+ }
+ }
+ }
+ reap_ioc();
+ dout(10) << __func__ << " end" << dendl;
+}
+
+void KernelDevice::_discard_thread()
+{
+ std::unique_lock l(discard_lock);
+ ceph_assert(!discard_started);
+ discard_started = true;
+ discard_cond.notify_all();
+ while (true) {
+ ceph_assert(discard_finishing.empty());
+ if (discard_queued.empty()) {
+ if (discard_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ discard_cond.notify_all(); // for the thread trying to drain...
+ discard_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ discard_finishing.swap(discard_queued);
+ discard_running = true;
+ l.unlock();
+ dout(20) << __func__ << " finishing" << dendl;
+ for (auto p = discard_finishing.begin();p != discard_finishing.end(); ++p) {
+ discard(p.get_start(), p.get_len());
+ }
+
+ discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing));
+ discard_finishing.clear();
+ l.lock();
+ discard_running = false;
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ discard_started = false;
+}
+
+int KernelDevice::queue_discard(interval_set<uint64_t> &to_release)
+{
+ if (!support_discard)
+ return -1;
+
+ if (to_release.empty())
+ return 0;
+
+ std::lock_guard l(discard_lock);
+ discard_queued.insert(to_release);
+ discard_cond.notify_all();
+ return 0;
+}
+
+void KernelDevice::_aio_log_start(
+ IOContext *ioc,
+ uint64_t offset,
+ uint64_t length)
+{
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ if (cct->_conf->bdev_debug_inflight_ios) {
+ std::lock_guard l(debug_lock);
+ if (debug_inflight.intersects(offset, length)) {
+ derr << __func__ << " inflight overlap of 0x"
+ << std::hex
+ << offset << "~" << length << std::dec
+ << " with " << debug_inflight << dendl;
+ ceph_abort();
+ }
+ debug_inflight.insert(offset, length);
+ }
+}
+
+void KernelDevice::debug_aio_link(aio_t& aio)
+{
+ if (debug_queue.empty()) {
+ debug_oldest = &aio;
+ }
+ debug_queue.push_back(aio);
+}
+
+void KernelDevice::debug_aio_unlink(aio_t& aio)
+{
+ if (aio.queue_item.is_linked()) {
+ debug_queue.erase(debug_queue.iterator_to(aio));
+ if (debug_oldest == &aio) {
+ auto age = cct->_conf->bdev_debug_aio_log_age;
+ if (age && debug_stall_since != utime_t()) {
+ utime_t cutoff = ceph_clock_now();
+ cutoff -= age;
+ if (debug_stall_since < cutoff) {
+ derr << __func__ << " stalled aio " << debug_oldest
+ << " since " << debug_stall_since << ", timeout is "
+ << age
+ << "s" << dendl;
+ }
+ }
+
+ if (debug_queue.empty()) {
+ debug_oldest = nullptr;
+ } else {
+ debug_oldest = &debug_queue.front();
+ }
+ debug_stall_since = utime_t();
+ }
+ }
+}
+
+void KernelDevice::_aio_log_finish(
+ IOContext *ioc,
+ uint64_t offset,
+ uint64_t length)
+{
+ dout(20) << __func__ << " " << aio << " 0x"
+ << std::hex << offset << "~" << length << std::dec << dendl;
+ if (cct->_conf->bdev_debug_inflight_ios) {
+ std::lock_guard l(debug_lock);
+ debug_inflight.erase(offset, length);
+ }
+}
+
+void KernelDevice::aio_submit(IOContext *ioc)
+{
+ dout(20) << __func__ << " ioc " << ioc
+ << " pending " << ioc->num_pending.load()
+ << " running " << ioc->num_running.load()
+ << dendl;
+
+ if (ioc->num_pending.load() == 0) {
+ return;
+ }
+
+ // move these aside, and get our end iterator position now, as the
+ // aios might complete as soon as they are submitted and queue more
+ // wal aio's.
+ list<aio_t>::iterator e = ioc->running_aios.begin();
+ ioc->running_aios.splice(e, ioc->pending_aios);
+
+ int pending = ioc->num_pending.load();
+ ioc->num_running += pending;
+ ioc->num_pending -= pending;
+ ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this
+ ceph_assert(ioc->pending_aios.size() == 0);
+
+ if (cct->_conf->bdev_debug_aio) {
+ list<aio_t>::iterator p = ioc->running_aios.begin();
+ while (p != e) {
+ dout(30) << __func__ << " " << *p << dendl;
+ std::lock_guard l(debug_queue_lock);
+ debug_aio_link(*p++);
+ }
+ }
+
+ void *priv = static_cast<void*>(ioc);
+ int r, retries = 0;
+ r = aio_queue.submit_batch(ioc->running_aios.begin(), e,
+ pending, priv, &retries);
+
+ if (retries)
+ derr << __func__ << " retries " << retries << dendl;
+ if (r < 0) {
+ derr << " aio submit got " << cpp_strerror(r) << dendl;
+ ceph_assert(r == 0);
+ }
+}
+
+int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint)
+{
+ uint64_t len = bl.length();
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
+ << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl;
+ if (cct->_conf->bdev_inject_crash &&
+ rand() % cct->_conf->bdev_inject_crash == 0) {
+ derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
+ << off << "~" << len << std::dec << dendl;
+ ++injecting_crash;
+ return 0;
+ }
+ vector<iovec> iov;
+ bl.prepare_iov(&iov);
+
+ auto left = len;
+ auto o = off;
+ size_t idx = 0;
+ do {
+ auto r = ::pwritev(choose_fd(buffered, write_hint),
+ &iov[idx], iov.size() - idx, o);
+
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ o += r;
+ left -= r;
+ if (left) {
+ // skip fully processed IOVs
+ while (idx < iov.size() && (size_t)r >= iov[idx].iov_len) {
+ r -= iov[idx++].iov_len;
+ }
+ // update partially processed one if any
+ if (r) {
+ ceph_assert(idx < iov.size());
+ ceph_assert((size_t)r < iov[idx].iov_len);
+ iov[idx].iov_base = static_cast<char*>(iov[idx].iov_base) + r;
+ iov[idx].iov_len -= r;
+ r = 0;
+ }
+ ceph_assert(r == 0);
+ }
+ } while (left);
+
+#ifdef HAVE_SYNC_FILE_RANGE
+ if (buffered) {
+ // initiate IO and wait till it completes
+ auto r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+#endif
+
+ io_since_flush.store(true);
+
+ return 0;
+}
+
+int KernelDevice::write(
+ uint64_t off,
+ bufferlist &bl,
+ bool buffered,
+ int write_hint)
+{
+ uint64_t len = bl.length();
+ dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " (buffered)" : " (direct)")
+ << dendl;
+ ceph_assert(is_valid_io(off, len));
+ if (cct->_conf->objectstore_blackhole) {
+ lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
+ << dendl;
+ return 0;
+ }
+
+ if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
+ bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
+ dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
+ }
+ dout(40) << "data: ";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ return _sync_write(off, bl, buffered, write_hint);
+}
+
+int KernelDevice::aio_write(
+ uint64_t off,
+ bufferlist &bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint)
+{
+ uint64_t len = bl.length();
+ dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " (buffered)" : " (direct)")
+ << dendl;
+ ceph_assert(is_valid_io(off, len));
+ if (cct->_conf->objectstore_blackhole) {
+ lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
+ << dendl;
+ return 0;
+ }
+
+ if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
+ bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
+ dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
+ }
+ dout(40) << "data: ";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ _aio_log_start(ioc, off, len);
+
+#ifdef HAVE_LIBAIO
+ if (aio && dio && !buffered) {
+ if (cct->_conf->bdev_inject_crash &&
+ rand() % cct->_conf->bdev_inject_crash == 0) {
+ derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
+ << off << "~" << len << std::dec
+ << dendl;
+ // generate a real io so that aio_wait behaves properly, but make it
+ // a read instead of write, and toss the result.
+ ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
+ ++ioc->num_pending;
+ auto& aio = ioc->pending_aios.back();
+ aio.pread(off, len);
+ ++injecting_crash;
+ } else {
+ if (bl.length() <= RW_IO_MAX) {
+ // fast path (non-huge write)
+ ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
+ ++ioc->num_pending;
+ auto& aio = ioc->pending_aios.back();
+ bl.prepare_iov(&aio.iov);
+ aio.bl.claim_append(bl);
+ aio.pwritev(off, len);
+ dout(30) << aio << dendl;
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
+ << std::dec << " aio " << &aio << dendl;
+ } else {
+ // write in RW_IO_MAX-sized chunks
+ uint64_t prev_len = 0;
+ while (prev_len < bl.length()) {
+ bufferlist tmp;
+ if (prev_len + RW_IO_MAX < bl.length()) {
+ tmp.substr_of(bl, prev_len, RW_IO_MAX);
+ } else {
+ tmp.substr_of(bl, prev_len, bl.length() - prev_len);
+ }
+ auto len = tmp.length();
+ ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
+ ++ioc->num_pending;
+ auto& aio = ioc->pending_aios.back();
+ tmp.prepare_iov(&aio.iov);
+ aio.bl.claim_append(tmp);
+ aio.pwritev(off + prev_len, len);
+ dout(30) << aio << dendl;
+ dout(5) << __func__ << " 0x" << std::hex << off + prev_len
+ << "~" << len
+ << std::dec << " aio " << &aio << " (piece)" << dendl;
+ prev_len += len;
+ }
+ }
+ }
+ } else
+#endif
+ {
+ int r = _sync_write(off, bl, buffered, write_hint);
+ _aio_log_finish(ioc, off, len);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+int KernelDevice::discard(uint64_t offset, uint64_t len)
+{
+ int r = 0;
+ if (cct->_conf->objectstore_blackhole) {
+ lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
+ << dendl;
+ return 0;
+ }
+ if (support_discard) {
+ dout(10) << __func__
+ << " 0x" << std::hex << offset << "~" << len << std::dec
+ << dendl;
+
+ r = BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len);
+ }
+ return r;
+}
+
+int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered)
+{
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " (buffered)" : " (direct)")
+ << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ _aio_log_start(ioc, off, len);
+
+ auto start1 = mono_clock::now();
+
+ auto p = buffer::ptr_node::create(buffer::create_small_page_aligned(len));
+ int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET],
+ p->c_str(), len, off);
+ auto age = cct->_conf->bdev_debug_aio_log_age;
+ if (mono_clock::now() - start1 >= make_timespan(age)) {
+ derr << __func__ << " stalled read "
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " (buffered)" : " (direct)")
+ << " since " << start1 << ", timeout is "
+ << age
+ << "s" << dendl;
+ }
+
+ if (r < 0) {
+ if (ioc->allow_eio && is_expected_ioerr(r)) {
+ r = -EIO;
+ } else {
+ r = -errno;
+ }
+ goto out;
+ }
+ ceph_assert((uint64_t)r == len);
+ pbl->push_back(std::move(p));
+
+ dout(40) << "data: ";
+ pbl->hexdump(*_dout);
+ *_dout << dendl;
+
+ out:
+ _aio_log_finish(ioc, off, len);
+ return r < 0 ? r : 0;
+}
+
+int KernelDevice::aio_read(
+ uint64_t off,
+ uint64_t len,
+ bufferlist *pbl,
+ IOContext *ioc)
+{
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << dendl;
+
+ int r = 0;
+#ifdef HAVE_LIBAIO
+ if (aio && dio) {
+ ceph_assert(is_valid_io(off, len));
+ _aio_log_start(ioc, off, len);
+ ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET]));
+ ++ioc->num_pending;
+ aio_t& aio = ioc->pending_aios.back();
+ aio.pread(off, len);
+ dout(30) << aio << dendl;
+ pbl->append(aio.bl);
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
+ << std::dec << " aio " << &aio << dendl;
+ } else
+#endif
+ {
+ r = read(off, len, pbl, ioc, false);
+ }
+
+ return r;
+}
+
+int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
+{
+ uint64_t aligned_off = align_down(off, block_size);
+ uint64_t aligned_len = align_up(off+len, block_size) - aligned_off;
+ bufferptr p = buffer::create_small_page_aligned(aligned_len);
+ int r = 0;
+
+ auto start1 = mono_clock::now();
+ r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off);
+ auto age = cct->_conf->bdev_debug_aio_log_age;
+ if (mono_clock::now() - start1 >= make_timespan(age)) {
+ derr << __func__ << " stalled read "
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " since " << start1 << ", timeout is "
+ << age
+ << "s" << dendl;
+ }
+
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " error: " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ ceph_assert((uint64_t)r == aligned_len);
+ memcpy(buf, p.c_str() + (off - aligned_off), len);
+
+ dout(40) << __func__ << " data: ";
+ bufferlist bl;
+ bl.append(buf, len);
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ out:
+ return r < 0 ? r : 0;
+}
+
+int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
+ bool buffered)
+{
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << "buffered " << buffered
+ << dendl;
+ ceph_assert(len > 0);
+ ceph_assert(off < size);
+ ceph_assert(off + len <= size);
+ int r = 0;
+ auto age = cct->_conf->bdev_debug_aio_log_age;
+
+ //if it's direct io and unaligned, we have to use a internal buffer
+ if (!buffered && ((off % block_size != 0)
+ || (len % block_size != 0)
+ || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0)))
+ return direct_read_unaligned(off, len, buf);
+
+ auto start1 = mono_clock::now();
+ if (buffered) {
+ //buffered read
+ auto off0 = off;
+ char *t = buf;
+ uint64_t left = len;
+ while (left > 0) {
+ r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " 0x" << std::hex << off << "~" << left
+ << std::dec << " error: " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ off += r;
+ t += r;
+ left -= r;
+ }
+ if (mono_clock::now() - start1 >= make_timespan(age)) {
+ derr << __func__ << " stalled read "
+ << " 0x" << std::hex << off0 << "~" << len << std::dec
+ << " (buffered) since " << start1 << ", timeout is "
+ << age
+ << "s" << dendl;
+ }
+ } else {
+ //direct and aligned read
+ r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off);
+ if (mono_clock::now() - start1 >= make_timespan(age)) {
+ derr << __func__ << " stalled read "
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " (direct) since " << start1 << ", timeout is "
+ << age
+ << "s" << dendl;
+ }
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " direct_aligned_read" << " 0x" << std::hex
+ << off << "~" << left << std::dec << " error: " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+ ceph_assert((uint64_t)r == len);
+ }
+
+ dout(40) << __func__ << " data: ";
+ bufferlist bl;
+ bl.append(buf, len);
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ out:
+ return r < 0 ? r : 0;
+}
+
+int KernelDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << dendl;
+ ceph_assert(off % block_size == 0);
+ ceph_assert(len % block_size == 0);
+ int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED);
+ if (r) {
+ r = -r;
+ derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " error: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+}
diff --git a/src/os/bluestore/KernelDevice.h b/src/os/bluestore/KernelDevice.h
new file mode 100644
index 00000000..19b52abd
--- /dev/null
+++ b/src/os/bluestore/KernelDevice.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_BLUESTORE_KERNELDEVICE_H
+#define CEPH_OS_BLUESTORE_KERNELDEVICE_H
+
+#include <atomic>
+
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "common/Thread.h"
+#include "include/utime.h"
+
+#include "ceph_aio.h"
+#include "BlockDevice.h"
+
+#define RW_IO_MAX (INT_MAX & CEPH_PAGE_MASK)
+
+
+class KernelDevice : public BlockDevice {
+ std::vector<int> fd_directs, fd_buffereds;
+ bool enable_wrt = true;
+ std::string path;
+ bool aio, dio;
+
+ int vdo_fd = -1; ///< fd for vdo sysfs directory
+ string vdo_name;
+
+ std::string devname; ///< kernel dev name (/sys/block/$devname), if any
+
+ ceph::mutex debug_lock = ceph::make_mutex("KernelDevice::debug_lock");
+ interval_set<uint64_t> debug_inflight;
+
+ std::atomic<bool> io_since_flush = {false};
+ ceph::mutex flush_mutex = ceph::make_mutex("KernelDevice::flush_mutex");
+
+ aio_queue_t aio_queue;
+ aio_callback_t discard_callback;
+ void *discard_callback_priv;
+ bool aio_stop;
+ bool discard_started;
+ bool discard_stop;
+
+ ceph::mutex discard_lock = ceph::make_mutex("KernelDevice::discard_lock");
+ ceph::condition_variable discard_cond;
+ bool discard_running = false;
+ interval_set<uint64_t> discard_queued;
+ interval_set<uint64_t> discard_finishing;
+
+ struct AioCompletionThread : public Thread {
+ KernelDevice *bdev;
+ explicit AioCompletionThread(KernelDevice *b) : bdev(b) {}
+ void *entry() override {
+ bdev->_aio_thread();
+ return NULL;
+ }
+ } aio_thread;
+
+ struct DiscardThread : public Thread {
+ KernelDevice *bdev;
+ explicit DiscardThread(KernelDevice *b) : bdev(b) {}
+ void *entry() override {
+ bdev->_discard_thread();
+ return NULL;
+ }
+ } discard_thread;
+
+ std::atomic_int injecting_crash;
+
+ void _aio_thread();
+ void _discard_thread();
+ int queue_discard(interval_set<uint64_t> &to_release) override;
+
+ int _aio_start();
+ void _aio_stop();
+
+ int _discard_start();
+ void _discard_stop();
+
+ void _aio_log_start(IOContext *ioc, uint64_t offset, uint64_t length);
+ void _aio_log_finish(IOContext *ioc, uint64_t offset, uint64_t length);
+
+ int _sync_write(uint64_t off, bufferlist& bl, bool buffered, int write_hint);
+
+ int _lock();
+
+ int direct_read_unaligned(uint64_t off, uint64_t len, char *buf);
+
+ // stalled aio debugging
+ aio_list_t debug_queue;
+ ceph::mutex debug_queue_lock = ceph::make_mutex("KernelDevice::debug_queue_lock");
+ aio_t *debug_oldest = nullptr;
+ utime_t debug_stall_since;
+ void debug_aio_link(aio_t& aio);
+ void debug_aio_unlink(aio_t& aio);
+
+ void _detect_vdo();
+ int choose_fd(bool buffered, int write_hint) const;
+
+public:
+ KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
+
+ void aio_submit(IOContext *ioc) override;
+ void discard_drain() override;
+
+ int collect_metadata(const std::string& prefix, map<std::string,std::string> *pm) const override;
+ int get_devname(std::string *s) override {
+ if (devname.empty()) {
+ return -ENOENT;
+ }
+ *s = devname;
+ return 0;
+ }
+ int get_devices(std::set<std::string> *ls) override;
+
+ bool get_thin_utilization(uint64_t *total, uint64_t *avail) const override;
+
+ int read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered) override;
+ int aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc) override;
+ int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+
+ int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override;
+ int aio_write(uint64_t off, bufferlist& bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint = WRITE_LIFE_NOT_SET) override;
+ int flush() override;
+ int discard(uint64_t offset, uint64_t len) override;
+
+ // for managing buffered readers/writers
+ int invalidate_cache(uint64_t off, uint64_t len) override;
+ int open(const std::string& path) override;
+ void close() override;
+};
+
+#endif
diff --git a/src/os/bluestore/NVMEDevice.cc b/src/os/bluestore/NVMEDevice.cc
new file mode 100644
index 00000000..acd9eb03
--- /dev/null
+++ b/src/os/bluestore/NVMEDevice.cc
@@ -0,0 +1,952 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+//
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <fstream>
+#include <functional>
+#include <map>
+#include <thread>
+
+#include <spdk/nvme.h>
+
+#include "include/stringify.h"
+#include "include/types.h"
+#include "include/compat.h"
+#include "common/align.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/perf_counters.h"
+
+#include "NVMEDevice.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev(" << sn << ") "
+
+thread_local SharedDriverQueueData *queue_t;
+
+static constexpr uint16_t data_buffer_default_num = 1024;
+
+static constexpr uint32_t data_buffer_size = 8192;
+
+static constexpr uint16_t inline_segment_num = 32;
+
+enum {
+ l_bluestore_nvmedevice_first = 632430,
+ l_bluestore_nvmedevice_write_lat,
+ l_bluestore_nvmedevice_read_lat,
+ l_bluestore_nvmedevice_flush_lat,
+ l_bluestore_nvmedevice_write_queue_lat,
+ l_bluestore_nvmedevice_read_queue_lat,
+ l_bluestore_nvmedevice_flush_queue_lat,
+ l_bluestore_nvmedevice_queue_ops,
+ l_bluestore_nvmedevice_polling_lat,
+ l_bluestore_nvmedevice_buffer_alloc_failed,
+ l_bluestore_nvmedevice_last
+};
+
+static void io_complete(void *t, const struct spdk_nvme_cpl *completion);
+
+struct IORequest {
+ uint16_t cur_seg_idx = 0;
+ uint16_t nseg;
+ uint32_t cur_seg_left = 0;
+ void *inline_segs[inline_segment_num];
+ void **extra_segs = nullptr;
+};
+
+struct Task;
+
+class SharedDriverData {
+ unsigned id;
+ spdk_nvme_transport_id trid;
+ spdk_nvme_ctrlr *ctrlr;
+ spdk_nvme_ns *ns;
+ uint32_t block_size = 0;
+ uint64_t size = 0;
+
+ public:
+ std::vector<NVMEDevice*> registered_devices;
+ friend class SharedDriverQueueData;
+ SharedDriverData(unsigned id_, const spdk_nvme_transport_id& trid_,
+ spdk_nvme_ctrlr *c, spdk_nvme_ns *ns_)
+ : id(id_),
+ trid(trid_),
+ ctrlr(c),
+ ns(ns_) {
+ block_size = spdk_nvme_ns_get_extended_sector_size(ns);
+ size = spdk_nvme_ns_get_size(ns);
+ }
+
+ bool is_equal(const spdk_nvme_transport_id& trid2) const {
+ return spdk_nvme_transport_id_compare(&trid, &trid2) == 0;
+ }
+ ~SharedDriverData() {
+ }
+
+ void register_device(NVMEDevice *device) {
+ registered_devices.push_back(device);
+ }
+
+ void remove_device(NVMEDevice *device) {
+ std::vector<NVMEDevice*> new_devices;
+ for (auto &&it : registered_devices) {
+ if (it != device)
+ new_devices.push_back(it);
+ }
+ registered_devices.swap(new_devices);
+ }
+
+ uint32_t get_block_size() {
+ return block_size;
+ }
+ uint64_t get_size() {
+ return size;
+ }
+};
+
+class SharedDriverQueueData {
+ NVMEDevice *bdev;
+ SharedDriverData *driver;
+ spdk_nvme_ctrlr *ctrlr;
+ spdk_nvme_ns *ns;
+ std::string sn;
+ uint32_t block_size;
+ uint32_t max_queue_depth;
+ struct spdk_nvme_qpair *qpair;
+ bool reap_io = false;
+ int alloc_buf_from_pool(Task *t, bool write);
+
+ public:
+ uint32_t current_queue_depth = 0;
+ std::atomic_ulong completed_op_seq, queue_op_seq;
+ std::vector<void*> data_buf_mempool;
+ PerfCounters *logger = nullptr;
+ void _aio_handle(Task *t, IOContext *ioc);
+
+ SharedDriverQueueData(NVMEDevice *bdev, SharedDriverData *driver)
+ : bdev(bdev),
+ driver(driver) {
+ ctrlr = driver->ctrlr;
+ ns = driver->ns;
+ block_size = driver->block_size;
+
+ struct spdk_nvme_io_qpair_opts opts = {};
+ spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
+ opts.qprio = SPDK_NVME_QPRIO_URGENT;
+ // usable queue depth should minus 1 to aovid overflow.
+ max_queue_depth = opts.io_queue_size - 1;
+ qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
+ ceph_assert(qpair != NULL);
+
+ // allocate spdk dma memory
+ for (uint16_t i = 0; i < data_buffer_default_num; i++) {
+ void *b = spdk_dma_zmalloc(data_buffer_size, CEPH_PAGE_SIZE, NULL);
+ if (!b) {
+ derr << __func__ << " failed to create memory pool for nvme data buffer" << dendl;
+ ceph_assert(b);
+ }
+ data_buf_mempool.push_back(b);
+ }
+
+ PerfCountersBuilder b(g_ceph_context, string("NVMEDevice-AIOThread-"+stringify(this)),
+ l_bluestore_nvmedevice_first, l_bluestore_nvmedevice_last);
+ b.add_time_avg(l_bluestore_nvmedevice_write_lat, "write_lat", "Average write completing latency");
+ b.add_time_avg(l_bluestore_nvmedevice_read_lat, "read_lat", "Average read completing latency");
+ b.add_time_avg(l_bluestore_nvmedevice_flush_lat, "flush_lat", "Average flush completing latency");
+ b.add_u64(l_bluestore_nvmedevice_queue_ops, "queue_ops", "Operations in nvme queue");
+ b.add_time_avg(l_bluestore_nvmedevice_polling_lat, "polling_lat", "Average polling latency");
+ b.add_time_avg(l_bluestore_nvmedevice_write_queue_lat, "write_queue_lat", "Average queue write request latency");
+ b.add_time_avg(l_bluestore_nvmedevice_read_queue_lat, "read_queue_lat", "Average queue read request latency");
+ b.add_time_avg(l_bluestore_nvmedevice_flush_queue_lat, "flush_queue_lat", "Average queue flush request latency");
+ b.add_u64_counter(l_bluestore_nvmedevice_buffer_alloc_failed, "buffer_alloc_failed", "Alloc data buffer failed count");
+ logger = b.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(logger);
+ bdev->queue_number++;
+ if (bdev->queue_number.load() == 1)
+ reap_io = true;
+ }
+
+ ~SharedDriverQueueData() {
+ g_ceph_context->get_perfcounters_collection()->remove(logger);
+ if (qpair) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ bdev->queue_number--;
+ }
+
+ // free all spdk dma memory;
+ if (!data_buf_mempool.empty()) {
+ for (uint16_t i = 0; i < data_buffer_default_num; i++) {
+ void *b = data_buf_mempool[i];
+ ceph_assert(b);
+ spdk_dma_free(b);
+ }
+ data_buf_mempool.clear();
+ }
+
+ delete logger;
+ }
+};
+
+struct Task {
+ NVMEDevice *device;
+ IOContext *ctx = nullptr;
+ IOCommand command;
+ uint64_t offset;
+ uint64_t len;
+ bufferlist bl;
+ std::function<void()> fill_cb;
+ Task *next = nullptr;
+ int64_t return_code;
+ ceph::coarse_real_clock::time_point start;
+ IORequest io_request;
+ ceph::mutex lock = ceph::make_mutex("Task::lock");
+ ceph::condition_variable cond;
+ SharedDriverQueueData *queue = nullptr;
+ Task(NVMEDevice *dev, IOCommand c, uint64_t off, uint64_t l, int64_t rc = 0)
+ : device(dev), command(c), offset(off), len(l),
+ return_code(rc),
+ start(ceph::coarse_real_clock::now()) {}
+ ~Task() {
+ ceph_assert(!io_request.nseg);
+ }
+ void release_segs(SharedDriverQueueData *queue_data) {
+ if (io_request.extra_segs) {
+ for (uint16_t i = 0; i < io_request.nseg; i++)
+ queue_data->data_buf_mempool.push_back(io_request.extra_segs[i]);
+ delete io_request.extra_segs;
+ } else if (io_request.nseg) {
+ for (uint16_t i = 0; i < io_request.nseg; i++)
+ queue_data->data_buf_mempool.push_back(io_request.inline_segs[i]);
+ }
+ ctx->total_nseg -= io_request.nseg;
+ io_request.nseg = 0;
+ }
+
+ void copy_to_buf(char *buf, uint64_t off, uint64_t len) {
+ uint64_t copied = 0;
+ uint64_t left = len;
+ void **segs = io_request.extra_segs ? io_request.extra_segs : io_request.inline_segs;
+ uint16_t i = 0;
+ while (left > 0) {
+ char *src = static_cast<char*>(segs[i++]);
+ uint64_t need_copy = std::min(left, data_buffer_size-off);
+ memcpy(buf+copied, src+off, need_copy);
+ off = 0;
+ left -= need_copy;
+ copied += need_copy;
+ }
+ }
+};
+
+static void data_buf_reset_sgl(void *cb_arg, uint32_t sgl_offset)
+{
+ Task *t = static_cast<Task*>(cb_arg);
+ uint32_t i = sgl_offset / data_buffer_size;
+ uint32_t offset = i * data_buffer_size;
+ ceph_assert(i <= t->io_request.nseg);
+
+ for (; i < t->io_request.nseg; i++) {
+ offset += data_buffer_size;
+ if (offset > sgl_offset) {
+ if (offset > t->len)
+ offset = t->len;
+ break;
+ }
+ }
+
+ t->io_request.cur_seg_idx = i;
+ t->io_request.cur_seg_left = offset - sgl_offset;
+ return ;
+}
+
+static int data_buf_next_sge(void *cb_arg, void **address, uint32_t *length)
+{
+ uint32_t size;
+ void *addr;
+ Task *t = static_cast<Task*>(cb_arg);
+ if (t->io_request.cur_seg_idx >= t->io_request.nseg) {
+ *length = 0;
+ *address = 0;
+ return 0;
+ }
+
+ addr = t->io_request.extra_segs ? t->io_request.extra_segs[t->io_request.cur_seg_idx] : t->io_request.inline_segs[t->io_request.cur_seg_idx];
+
+ size = data_buffer_size;
+ if (t->io_request.cur_seg_idx == t->io_request.nseg - 1) {
+ uint64_t tail = t->len % data_buffer_size;
+ if (tail) {
+ size = (uint32_t) tail;
+ }
+ }
+
+ if (t->io_request.cur_seg_left) {
+ *address = (void *)((uint64_t)addr + size - t->io_request.cur_seg_left);
+ *length = t->io_request.cur_seg_left;
+ t->io_request.cur_seg_left = 0;
+ } else {
+ *address = addr;
+ *length = size;
+ }
+
+ t->io_request.cur_seg_idx++;
+ return 0;
+}
+
+int SharedDriverQueueData::alloc_buf_from_pool(Task *t, bool write)
+{
+ uint64_t count = t->len / data_buffer_size;
+ if (t->len % data_buffer_size)
+ ++count;
+ void **segs;
+ if (count > data_buf_mempool.size())
+ return -ENOMEM;
+ if (count <= inline_segment_num) {
+ segs = t->io_request.inline_segs;
+ } else {
+ t->io_request.extra_segs = new void*[count];
+ segs = t->io_request.extra_segs;
+ }
+ for (uint16_t i = 0; i < count; i++) {
+ segs[i] = data_buf_mempool.back();
+ data_buf_mempool.pop_back();
+ }
+ t->io_request.nseg = count;
+ t->ctx->total_nseg += count;
+ if (write) {
+ auto blp = t->bl.begin();
+ uint32_t len = 0;
+ uint16_t i = 0;
+ for (; i < count - 1; ++i) {
+ blp.copy(data_buffer_size, static_cast<char*>(segs[i]));
+ len += data_buffer_size;
+ }
+ blp.copy(t->bl.length() - len, static_cast<char*>(segs[i]));
+ }
+
+ return 0;
+}
+
+void SharedDriverQueueData::_aio_handle(Task *t, IOContext *ioc)
+{
+ dout(20) << __func__ << " start" << dendl;
+
+ int r = 0;
+ uint64_t lba_off, lba_count;
+ uint32_t max_io_completion = (uint32_t)g_conf().get_val<uint64_t>("bluestore_spdk_max_io_completion");
+ uint64_t io_sleep_in_us = g_conf().get_val<uint64_t>("bluestore_spdk_io_sleep");
+
+ ceph::coarse_real_clock::time_point cur, start
+ = ceph::coarse_real_clock::now();
+ while (ioc->num_running) {
+ again:
+ dout(40) << __func__ << " polling" << dendl;
+ if (current_queue_depth) {
+ r = spdk_nvme_qpair_process_completions(qpair, max_io_completion);
+ if (r < 0) {
+ ceph_abort();
+ } else if (r == 0) {
+ usleep(io_sleep_in_us);
+ }
+ }
+
+ for (; t; t = t->next) {
+ if (current_queue_depth == max_queue_depth) {
+ // no slots
+ goto again;
+ }
+
+ t->queue = this;
+ lba_off = t->offset / block_size;
+ lba_count = t->len / block_size;
+ switch (t->command) {
+ case IOCommand::WRITE_COMMAND:
+ {
+ dout(20) << __func__ << " write command issued " << lba_off << "~" << lba_count << dendl;
+ r = alloc_buf_from_pool(t, true);
+ if (r < 0) {
+ logger->inc(l_bluestore_nvmedevice_buffer_alloc_failed);
+ goto again;
+ }
+
+ r = spdk_nvme_ns_cmd_writev(
+ ns, qpair, lba_off, lba_count, io_complete, t, 0,
+ data_buf_reset_sgl, data_buf_next_sge);
+ if (r < 0) {
+ derr << __func__ << " failed to do write command" << dendl;
+ t->ctx->nvme_task_first = t->ctx->nvme_task_last = nullptr;
+ t->release_segs(this);
+ delete t;
+ ceph_abort();
+ }
+ cur = ceph::coarse_real_clock::now();
+ auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - t->start);
+ logger->tinc(l_bluestore_nvmedevice_write_queue_lat, dur);
+ break;
+ }
+ case IOCommand::READ_COMMAND:
+ {
+ dout(20) << __func__ << " read command issued " << lba_off << "~" << lba_count << dendl;
+ r = alloc_buf_from_pool(t, false);
+ if (r < 0) {
+ logger->inc(l_bluestore_nvmedevice_buffer_alloc_failed);
+ goto again;
+ }
+
+ r = spdk_nvme_ns_cmd_readv(
+ ns, qpair, lba_off, lba_count, io_complete, t, 0,
+ data_buf_reset_sgl, data_buf_next_sge);
+ if (r < 0) {
+ derr << __func__ << " failed to read" << dendl;
+ t->release_segs(this);
+ delete t;
+ ceph_abort();
+ } else {
+ cur = ceph::coarse_real_clock::now();
+ auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - t->start);
+ logger->tinc(l_bluestore_nvmedevice_read_queue_lat, dur);
+ }
+ break;
+ }
+ case IOCommand::FLUSH_COMMAND:
+ {
+ dout(20) << __func__ << " flush command issueed " << dendl;
+ r = spdk_nvme_ns_cmd_flush(ns, qpair, io_complete, t);
+ if (r < 0) {
+ derr << __func__ << " failed to flush" << dendl;
+ t->release_segs(this);
+ delete t;
+ ceph_abort();
+ } else {
+ cur = ceph::coarse_real_clock::now();
+ auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - t->start);
+ logger->tinc(l_bluestore_nvmedevice_flush_queue_lat, dur);
+ }
+ break;
+ }
+ }
+ current_queue_depth++;
+ }
+ cur = ceph::coarse_real_clock::now();
+ auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - start);
+ logger->tinc(l_bluestore_nvmedevice_polling_lat, dur);
+ start = ceph::coarse_real_clock::now();
+ }
+
+ if (reap_io)
+ bdev->reap_ioc();
+ dout(20) << __func__ << " end" << dendl;
+}
+
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev "
+
+class NVMEManager {
+ public:
+ struct ProbeContext {
+ spdk_nvme_transport_id trid;
+ NVMEManager *manager;
+ SharedDriverData *driver;
+ bool done;
+ };
+
+ private:
+ ceph::mutex lock = ceph::make_mutex("NVMEManager::lock");
+ bool stopping = false;
+ std::vector<SharedDriverData*> shared_driver_datas;
+ std::thread dpdk_thread;
+ ceph::mutex probe_queue_lock = ceph::make_mutex("NVMEManager::probe_queue_lock");
+ ceph::condition_variable probe_queue_cond;
+ std::list<ProbeContext*> probe_queue;
+
+ public:
+ NVMEManager() {}
+ ~NVMEManager() {
+ if (!dpdk_thread.joinable())
+ return;
+ {
+ std::lock_guard guard(probe_queue_lock);
+ stopping = true;
+ probe_queue_cond.notify_all();
+ }
+ dpdk_thread.join();
+ }
+
+ int try_get(const spdk_nvme_transport_id& trid, SharedDriverData **driver);
+ void register_ctrlr(const spdk_nvme_transport_id& trid, spdk_nvme_ctrlr *c, SharedDriverData **driver) {
+ ceph_assert(ceph_mutex_is_locked(lock));
+ spdk_nvme_ns *ns;
+ int num_ns = spdk_nvme_ctrlr_get_num_ns(c);
+ ceph_assert(num_ns >= 1);
+ if (num_ns > 1) {
+ dout(0) << __func__ << " namespace count larger than 1, currently only use the first namespace" << dendl;
+ }
+ ns = spdk_nvme_ctrlr_get_ns(c, 1);
+ if (!ns) {
+ derr << __func__ << " failed to get namespace at 1" << dendl;
+ ceph_abort();
+ }
+ dout(1) << __func__ << " successfully attach nvme device at" << trid.traddr << dendl;
+
+ // only support one device per osd now!
+ ceph_assert(shared_driver_datas.empty());
+ // index 0 is occurred by master thread
+ shared_driver_datas.push_back(new SharedDriverData(shared_driver_datas.size()+1, trid, c, ns));
+ *driver = shared_driver_datas.back();
+ }
+};
+
+static NVMEManager manager;
+
+static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts)
+{
+ NVMEManager::ProbeContext *ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx);
+
+ if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ dout(0) << __func__ << " only probe local nvme device" << dendl;
+ return false;
+ }
+
+ dout(0) << __func__ << " found device at: "
+ << "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", "
+ << "traddr=" << trid->traddr << dendl;
+ if (spdk_nvme_transport_id_compare(&ctx->trid, trid)) {
+ dout(0) << __func__ << " device traddr (" << ctx->trid.traddr << ") not match " << trid->traddr << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
+{
+ auto ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx);
+ ctx->manager->register_ctrlr(ctx->trid, ctrlr, &ctx->driver);
+}
+
+int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData **driver)
+{
+ std::lock_guard l(lock);
+ for (auto &&it : shared_driver_datas) {
+ if (it->is_equal(trid)) {
+ *driver = it;
+ return 0;
+ }
+ }
+
+ auto coremask_arg = g_conf().get_val<std::string>("bluestore_spdk_coremask");
+ int m_core_arg = -1;
+ try {
+ auto core_value = stoull(coremask_arg, nullptr, 16);
+ m_core_arg = ffsll(core_value);
+ } catch (const std::logic_error& e) {
+ derr << __func__ << " invalid bluestore_spdk_coremask: "
+ << coremask_arg << dendl;
+ return -EINVAL;
+ }
+ // at least one core is needed for using spdk
+ if (m_core_arg == 0) {
+ derr << __func__ << " invalid bluestore_spdk_coremask, "
+ << "at least one core is needed" << dendl;
+ return -ENOENT;
+ }
+ m_core_arg -= 1;
+
+ uint32_t mem_size_arg = (uint32_t)g_conf().get_val<Option::size_t>("bluestore_spdk_mem");
+
+ if (!dpdk_thread.joinable()) {
+ dpdk_thread = std::thread(
+ [this, coremask_arg, m_core_arg, mem_size_arg]() {
+ static struct spdk_env_opts opts;
+ int r;
+
+ spdk_env_opts_init(&opts);
+ opts.name = "nvme-device-manager";
+ opts.core_mask = coremask_arg.c_str();
+ opts.master_core = m_core_arg;
+ opts.mem_size = mem_size_arg;
+ spdk_env_init(&opts);
+ spdk_unaffinitize_thread();
+
+ spdk_nvme_retry_count = g_ceph_context->_conf->bdev_nvme_retry_count;
+ if (spdk_nvme_retry_count < 0)
+ spdk_nvme_retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT;
+
+ std::unique_lock l(probe_queue_lock);
+ while (!stopping) {
+ if (!probe_queue.empty()) {
+ ProbeContext* ctxt = probe_queue.front();
+ probe_queue.pop_front();
+ r = spdk_nvme_probe(NULL, ctxt, probe_cb, attach_cb, NULL);
+ if (r < 0) {
+ ceph_assert(!ctxt->driver);
+ derr << __func__ << " device probe nvme failed" << dendl;
+ }
+ ctxt->done = true;
+ probe_queue_cond.notify_all();
+ } else {
+ probe_queue_cond.wait(l);
+ }
+ }
+ for (auto p : probe_queue)
+ p->done = true;
+ probe_queue_cond.notify_all();
+ }
+ );
+ }
+
+ ProbeContext ctx{trid, this, nullptr, false};
+ {
+ std::unique_lock l(probe_queue_lock);
+ probe_queue.push_back(&ctx);
+ while (!ctx.done)
+ probe_queue_cond.wait(l);
+ }
+ if (!ctx.driver)
+ return -1;
+ *driver = ctx.driver;
+
+ return 0;
+}
+
+void io_complete(void *t, const struct spdk_nvme_cpl *completion)
+{
+ Task *task = static_cast<Task*>(t);
+ IOContext *ctx = task->ctx;
+ SharedDriverQueueData *queue = task->queue;
+
+ ceph_assert(queue != NULL);
+ ceph_assert(ctx != NULL);
+ --queue->current_queue_depth;
+ auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ ceph::coarse_real_clock::now() - task->start);
+ if (task->command == IOCommand::WRITE_COMMAND) {
+ queue->logger->tinc(l_bluestore_nvmedevice_write_lat, dur);
+ ceph_assert(!spdk_nvme_cpl_is_error(completion));
+ dout(20) << __func__ << " write/zero op successfully, left "
+ << queue->queue_op_seq - queue->completed_op_seq << dendl;
+ // check waiting count before doing callback (which may
+ // destroy this ioc).
+ if (ctx->priv) {
+ if (!--ctx->num_running) {
+ task->device->aio_callback(task->device->aio_callback_priv, ctx->priv);
+ }
+ } else {
+ ctx->try_aio_wake();
+ }
+ task->release_segs(queue);
+ delete task;
+ } else if (task->command == IOCommand::READ_COMMAND) {
+ queue->logger->tinc(l_bluestore_nvmedevice_read_lat, dur);
+ ceph_assert(!spdk_nvme_cpl_is_error(completion));
+ dout(20) << __func__ << " read op successfully" << dendl;
+ task->fill_cb();
+ task->release_segs(queue);
+ // read submitted by AIO
+ if (!task->return_code) {
+ if (ctx->priv) {
+ if (!--ctx->num_running) {
+ task->device->aio_callback(task->device->aio_callback_priv, ctx->priv);
+ }
+ } else {
+ ctx->try_aio_wake();
+ }
+ delete task;
+ } else {
+ task->return_code = 0;
+ ctx->try_aio_wake();
+ }
+ } else {
+ ceph_assert(task->command == IOCommand::FLUSH_COMMAND);
+ ceph_assert(!spdk_nvme_cpl_is_error(completion));
+ queue->logger->tinc(l_bluestore_nvmedevice_flush_lat, dur);
+ dout(20) << __func__ << " flush op successfully" << dendl;
+ task->return_code = 0;
+ }
+}
+
+// ----------------
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev(" << name << ") "
+
+NVMEDevice::NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
+ : BlockDevice(cct, cb, cbpriv),
+ driver(nullptr)
+{
+}
+
+int NVMEDevice::open(const string& p)
+{
+ dout(1) << __func__ << " path " << p << dendl;
+
+ std::ifstream ifs(p);
+ if (!ifs) {
+ derr << __func__ << " unable to open " << p << dendl;
+ return -1;
+ }
+ string val;
+ std::getline(ifs, val);
+ spdk_nvme_transport_id trid;
+ if (int r = spdk_nvme_transport_id_parse(&trid, val.c_str()); r) {
+ derr << __func__ << " unable to read " << p << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ if (int r = manager.try_get(trid, &driver); r < 0) {
+ derr << __func__ << " failed to get nvme device with transport address " << trid.traddr << dendl;
+ return r;
+ }
+
+ driver->register_device(this);
+ block_size = driver->get_block_size();
+ size = driver->get_size();
+ name = trid.traddr;
+
+ //nvme is non-rotational device.
+ rotational = false;
+
+ // round size down to an even block
+ size &= ~(block_size - 1);
+
+ dout(1) << __func__ << " size " << size << " (" << byte_u_t(size) << ")"
+ << " block_size " << block_size << " (" << byte_u_t(block_size)
+ << ")" << dendl;
+
+
+ return 0;
+}
+
+void NVMEDevice::close()
+{
+ dout(1) << __func__ << dendl;
+
+ delete queue_t;
+ queue_t = nullptr;
+ name.clear();
+ driver->remove_device(this);
+
+ dout(1) << __func__ << " end" << dendl;
+}
+
+int NVMEDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
+{
+ (*pm)[prefix + "rotational"] = "0";
+ (*pm)[prefix + "size"] = stringify(get_size());
+ (*pm)[prefix + "block_size"] = stringify(get_block_size());
+ (*pm)[prefix + "driver"] = "NVMEDevice";
+ (*pm)[prefix + "type"] = "nvme";
+ (*pm)[prefix + "access_mode"] = "spdk";
+ (*pm)[prefix + "nvme_serial_number"] = name;
+
+ return 0;
+}
+
+int NVMEDevice::flush()
+{
+ return 0;
+}
+
+void NVMEDevice::aio_submit(IOContext *ioc)
+{
+ dout(20) << __func__ << " ioc " << ioc << " pending "
+ << ioc->num_pending.load() << " running "
+ << ioc->num_running.load() << dendl;
+ int pending = ioc->num_pending.load();
+ Task *t = static_cast<Task*>(ioc->nvme_task_first);
+ if (pending && t) {
+ ioc->num_running += pending;
+ ioc->num_pending -= pending;
+ ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this
+ // Only need to push the first entry
+ ioc->nvme_task_first = ioc->nvme_task_last = nullptr;
+ if (!queue_t)
+ queue_t = new SharedDriverQueueData(this, driver);
+ queue_t->_aio_handle(t, ioc);
+ }
+}
+
+static void write_split(
+ NVMEDevice *dev,
+ uint64_t off,
+ bufferlist &bl,
+ IOContext *ioc)
+{
+ uint64_t remain_len = bl.length(), begin = 0, write_size;
+ Task *t, *first, *last;
+ // This value may need to be got from configuration later.
+ uint64_t split_size = 131072; // 128KB.
+
+ while (remain_len > 0) {
+ write_size = std::min(remain_len, split_size);
+ t = new Task(dev, IOCommand::WRITE_COMMAND, off + begin, write_size);
+ // TODO: if upper layer alloc memory with known physical address,
+ // we can reduce this copy
+ bl.splice(0, write_size, &t->bl);
+ remain_len -= write_size;
+ t->ctx = ioc;
+ first = static_cast<Task*>(ioc->nvme_task_first);
+ last = static_cast<Task*>(ioc->nvme_task_last);
+ if (last)
+ last->next = t;
+ if (!first)
+ ioc->nvme_task_first = t;
+ ioc->nvme_task_last = t;
+ ++ioc->num_pending;
+ begin += write_size;
+ }
+}
+
+int NVMEDevice::aio_write(
+ uint64_t off,
+ bufferlist &bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint)
+{
+ uint64_t len = bl.length();
+ dout(20) << __func__ << " " << off << "~" << len << " ioc " << ioc
+ << " buffered " << buffered << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ write_split(this, off, bl, ioc);
+ dout(5) << __func__ << " " << off << "~" << len << dendl;
+
+ return 0;
+}
+
+int NVMEDevice::write(uint64_t off, bufferlist &bl, bool buffered, int write_hint)
+{
+ uint64_t len = bl.length();
+ dout(20) << __func__ << " " << off << "~" << len << " buffered "
+ << buffered << dendl;
+ ceph_assert(off % block_size == 0);
+ ceph_assert(len % block_size == 0);
+ ceph_assert(len > 0);
+ ceph_assert(off < size);
+ ceph_assert(off + len <= size);
+
+ IOContext ioc(cct, NULL);
+ write_split(this, off, bl, &ioc);
+ dout(5) << __func__ << " " << off << "~" << len << dendl;
+ aio_submit(&ioc);
+ ioc.aio_wait();
+ return 0;
+}
+
+int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered)
+{
+ dout(5) << __func__ << " " << off << "~" << len << " ioc " << ioc << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ Task *t = new Task(this, IOCommand::READ_COMMAND, off, len, 1);
+ bufferptr p = buffer::create_small_page_aligned(len);
+ int r = 0;
+ t->ctx = ioc;
+ char *buf = p.c_str();
+ t->fill_cb = [buf, t]() {
+ t->copy_to_buf(buf, 0, t->len);
+ };
+
+ ++ioc->num_pending;
+ ioc->nvme_task_first = t;
+ aio_submit(ioc);
+ ioc->aio_wait();
+
+ pbl->push_back(std::move(p));
+ r = t->return_code;
+ delete t;
+ return r;
+}
+
+int NVMEDevice::aio_read(
+ uint64_t off,
+ uint64_t len,
+ bufferlist *pbl,
+ IOContext *ioc)
+{
+ dout(20) << __func__ << " " << off << "~" << len << " ioc " << ioc << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ Task *t = new Task(this, IOCommand::READ_COMMAND, off, len);
+
+ bufferptr p = buffer::create_small_page_aligned(len);
+ pbl->append(p);
+ t->ctx = ioc;
+ char* buf = p.c_str();
+ t->fill_cb = [buf, t]() {
+ t->copy_to_buf(buf, 0, t->len);
+ };
+
+ Task *first = static_cast<Task*>(ioc->nvme_task_first);
+ Task *last = static_cast<Task*>(ioc->nvme_task_last);
+ if (last)
+ last->next = t;
+ if (!first)
+ ioc->nvme_task_first = t;
+ ioc->nvme_task_last = t;
+ ++ioc->num_pending;
+
+ return 0;
+}
+
+int NVMEDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+ ceph_assert(len > 0);
+ ceph_assert(off < size);
+ ceph_assert(off + len <= size);
+
+ uint64_t aligned_off = align_down(off, block_size);
+ uint64_t aligned_len = align_up(off+len, block_size) - aligned_off;
+ dout(5) << __func__ << " " << off << "~" << len
+ << " aligned " << aligned_off << "~" << aligned_len << dendl;
+ IOContext ioc(g_ceph_context, nullptr);
+ Task *t = new Task(this, IOCommand::READ_COMMAND, aligned_off, aligned_len, 1);
+ int r = 0;
+ t->ctx = &ioc;
+ t->fill_cb = [buf, t, off, len]() {
+ t->copy_to_buf(buf, off-t->offset, len);
+ };
+
+ ++ioc.num_pending;
+ ioc.nvme_task_first = t;
+ aio_submit(&ioc);
+ ioc.aio_wait();
+
+ r = t->return_code;
+ delete t;
+ return r;
+}
+
+int NVMEDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+ dout(5) << __func__ << " " << off << "~" << len << dendl;
+ return 0;
+}
diff --git a/src/os/bluestore/NVMEDevice.h b/src/os/bluestore/NVMEDevice.h
new file mode 100644
index 00000000..f44aeb59
--- /dev/null
+++ b/src/os/bluestore/NVMEDevice.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_BLUESTORE_NVMEDEVICE
+#define CEPH_OS_BLUESTORE_NVMEDEVICE
+
+#include <queue>
+#include <map>
+#include <limits>
+
+// since _Static_assert introduced in c11
+#define _Static_assert static_assert
+
+
+#include "include/interval_set.h"
+#include "common/ceph_time.h"
+#include "BlockDevice.h"
+
+enum class IOCommand {
+ READ_COMMAND,
+ WRITE_COMMAND,
+ FLUSH_COMMAND
+};
+
+class SharedDriverData;
+class SharedDriverQueueData;
+
+class NVMEDevice : public BlockDevice {
+ /**
+ * points to pinned, physically contiguous memory region;
+ * contains 4KB IDENTIFY structure for controller which is
+ * target for CONTROLLER IDENTIFY command during initialization
+ */
+ SharedDriverData *driver;
+ string name;
+
+ public:
+ std::atomic_int queue_number = {0};
+ SharedDriverData *get_driver() { return driver; }
+
+ NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);
+
+ bool supported_bdev_label() override { return false; }
+
+ void aio_submit(IOContext *ioc) override;
+
+ int read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered) override;
+ int aio_read(
+ uint64_t off,
+ uint64_t len,
+ bufferlist *pbl,
+ IOContext *ioc) override;
+ int aio_write(uint64_t off, bufferlist& bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint = WRITE_LIFE_NOT_SET) override;
+ int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override;
+ int flush() override;
+ int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+
+ // for managing buffered readers/writers
+ int invalidate_cache(uint64_t off, uint64_t len) override;
+ int open(const string& path) override;
+ void close() override;
+ int collect_metadata(const string& prefix, map<string,string> *pm) const override;
+};
+
+#endif
diff --git a/src/os/bluestore/PMEMDevice.cc b/src/os/bluestore/PMEMDevice.cc
new file mode 100644
index 00000000..1f9d9599
--- /dev/null
+++ b/src/os/bluestore/PMEMDevice.cc
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
+ *
+ * Author: Jianpeng Ma <jianpeng.ma@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "PMEMDevice.h"
+#include "libpmem.h"
+#include "include/types.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/blkdev.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev-PMEM(" << path << ") "
+
+PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv)
+ : BlockDevice(cct, cb, cbpriv),
+ fd(-1), addr(0),
+ injecting_crash(0)
+{
+}
+
+int PMEMDevice::_lock()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ l.l_start = 0;
+ l.l_len = 0;
+ int r = ::fcntl(fd, F_SETLK, &l);
+ if (r < 0)
+ return -errno;
+ return 0;
+}
+
+int PMEMDevice::open(const string& p)
+{
+ path = p;
+ int r = 0;
+ dout(1) << __func__ << " path " << path << dendl;
+
+ fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = _lock();
+ if (r < 0) {
+ derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
+ << dendl;
+ goto out_fail;
+ }
+
+ struct stat st;
+ r = ::fstat(fd, &st);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
+ goto out_fail;
+ }
+
+ size_t map_len;
+ addr = (char *)pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL);
+ if (addr == NULL) {
+ derr << __func__ << " pmem_map_file failed: " << pmem_errormsg() << dendl;
+ goto out_fail;
+ }
+ size = map_len;
+
+ // Operate as though the block size is 4 KB. The backing file
+ // blksize doesn't strictly matter except that some file systems may
+ // require a read/modify/write if we write something smaller than
+ // it.
+ block_size = g_conf()->bdev_block_size;
+ if (block_size != (unsigned)st.st_blksize) {
+ dout(1) << __func__ << " backing device/file reports st_blksize "
+ << st.st_blksize << ", using bdev_block_size "
+ << block_size << " anyway" << dendl;
+ }
+
+ dout(1) << __func__
+ << " size " << size
+ << " (" << byte_u_t(size) << ")"
+ << " block_size " << block_size
+ << " (" << byte_u_t(block_size) << ")"
+ << dendl;
+ return 0;
+
+ out_fail:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ fd = -1;
+ return r;
+}
+
+void PMEMDevice::close()
+{
+ dout(1) << __func__ << dendl;
+
+ ceph_assert(addr != NULL);
+ pmem_unmap(addr, size);
+ ceph_assert(fd >= 0);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ fd = -1;
+
+ path.clear();
+}
+
+int PMEMDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
+{
+ (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
+ (*pm)[prefix + "size"] = stringify(get_size());
+ (*pm)[prefix + "block_size"] = stringify(get_block_size());
+ (*pm)[prefix + "driver"] = "PMEMDevice";
+ (*pm)[prefix + "type"] = "ssd";
+
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r < 0)
+ return -errno;
+ if (S_ISBLK(st.st_mode)) {
+ (*pm)[prefix + "access_mode"] = "blk";
+ char buffer[1024] = {0};
+ BlkDev blkdev(fd_buffered);
+
+ blkdev.model(buffer, sizeof(buffer));
+ (*pm)[prefix + "model"] = buffer;
+
+ buffer[0] = '\0';
+ blkdev.dev(buffer, sizeof(buffer));
+ (*pm)[prefix + "dev"] = buffer;
+
+ // nvme exposes a serial number
+ buffer[0] = '\0';
+ blkdev.serial(buffer, sizeof(buffer));
+ (*pm)[prefix + "serial"] = buffer;
+
+ if (blkdev.is_nvme())
+ (*pm)[prefix + "type"] = "nvme";
+ } else {
+ (*pm)[prefix + "access_mode"] = "file";
+ (*pm)[prefix + "path"] = path;
+ }
+ return 0;
+}
+
+int PMEMDevice::flush()
+{
+ //Because all write is persist. So no need
+ return 0;
+}
+
+
+void PMEMDevice::aio_submit(IOContext *ioc)
+{
+ if (ioc->priv) {
+ ceph_assert(ioc->num_running == 0);
+ aio_callback(aio_callback_priv, ioc->priv);
+ } else {
+ ioc->try_aio_wake();
+ }
+ return;
+}
+
+int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET)
+{
+ uint64_t len = bl.length();
+ dout(20) << __func__ << " " << off << "~" << len << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ dout(40) << "data: ";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ if (g_conf()->bdev_inject_crash &&
+ rand() % g_conf()->bdev_inject_crash == 0) {
+ derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
+ << dendl;
+ ++injecting_crash;
+ return 0;
+ }
+
+ bufferlist::iterator p = bl.begin();
+ uint32_t off1 = off;
+ while (len) {
+ const char *data;
+ uint32_t l = p.get_ptr_and_advance(len, &data);
+ pmem_memcpy_persist(addr + off1, data, l);
+ len -= l;
+ off1 += l;
+ }
+ return 0;
+}
+
+int PMEMDevice::aio_write(
+ uint64_t off,
+ bufferlist &bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint = WRITE_LIFE_NOT_SET)
+{
+ return write(off, bl, buffered);
+}
+
+
+int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered)
+{
+ dout(5) << __func__ << " " << off << "~" << len << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ bufferptr p = buffer::create_small_page_aligned(len);
+ memcpy(p.c_str(), addr + off, len);
+
+ pbl->clear();
+ pbl->push_back(std::move(p));
+
+ dout(40) << "data: ";
+ pbl->hexdump(*_dout);
+ *_dout << dendl;
+
+ return 0;
+}
+
+int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc)
+{
+ return read(off, len, pbl, ioc, false);
+}
+
+int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+ dout(5) << __func__ << " " << off << "~" << len << dendl;
+ ceph_assert(is_valid_io(off, len));
+
+ memcpy(buf, addr + off, len);
+ return 0;
+}
+
+
+int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+ dout(5) << __func__ << " " << off << "~" << len << dendl;
+ return 0;
+}
+
+
diff --git a/src/os/bluestore/PMEMDevice.h b/src/os/bluestore/PMEMDevice.h
new file mode 100644
index 00000000..3077375a
--- /dev/null
+++ b/src/os/bluestore/PMEMDevice.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
+ *
+ * Author: Jianpeng Ma <jianpeng.ma@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_BLUESTORE_PMEMDEVICE_H
+#define CEPH_OS_BLUESTORE_PMEMDEVICE_H
+
+#include <atomic>
+
+#include "os/fs/FS.h"
+#include "include/interval_set.h"
+#include "ceph_aio.h"
+#include "BlockDevice.h"
+
+class PMEMDevice : public BlockDevice {
+ int fd;
+ char *addr; //the address of mmap
+ std::string path;
+
+ ceph::mutex debug_lock = ceph::make_mutex("PMEMDevice::debug_lock");
+ interval_set<uint64_t> debug_inflight;
+
+ std::atomic_int injecting_crash;
+ int _lock();
+
+public:
+ PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv);
+
+
+ void aio_submit(IOContext *ioc) override;
+
+ int collect_metadata(const std::string& prefix, map<std::string,std::string> *pm) const override;
+
+ int read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered) override;
+ int aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc) override;
+
+ int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+ int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override;
+ int aio_write(uint64_t off, bufferlist& bl,
+ IOContext *ioc,
+ bool buffered,
+ int write_hint = WRITE_LIFE_NOT_SET) override;
+ int flush() override;
+
+ // for managing buffered readers/writers
+ int invalidate_cache(uint64_t off, uint64_t len) override;
+ int open(const std::string &path) override;
+ void close() override;
+
+private:
+ bool is_valid_io(uint64_t off, uint64_t len) const {
+ return (len > 0 &&
+ off < size &&
+ off + len <= size);
+ }
+};
+
+#endif
diff --git a/src/os/bluestore/StupidAllocator.cc b/src/os/bluestore/StupidAllocator.cc
new file mode 100644
index 00000000..f75f7446
--- /dev/null
+++ b/src/os/bluestore/StupidAllocator.cc
@@ -0,0 +1,364 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "StupidAllocator.h"
+#include "bluestore_types.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "stupidalloc 0x" << this << " "
+
+StupidAllocator::StupidAllocator(CephContext* cct,
+ const std::string& name,
+ int64_t _block_size)
+ : Allocator(name), cct(cct), num_free(0),
+ free(10),
+ last_alloc(0),
+ block_size(_block_size)
+{
+}
+
+StupidAllocator::~StupidAllocator()
+{
+}
+
+unsigned StupidAllocator::_choose_bin(uint64_t orig_len)
+{
+ uint64_t len = orig_len / cct->_conf->bdev_block_size;
+ int bin = std::min((int)cbits(len), (int)free.size() - 1);
+ ldout(cct, 30) << __func__ << " len 0x" << std::hex << orig_len
+ << std::dec << " -> " << bin << dendl;
+ return bin;
+}
+
+void StupidAllocator::_insert_free(uint64_t off, uint64_t len)
+{
+ unsigned bin = _choose_bin(len);
+ ldout(cct, 30) << __func__ << " 0x" << std::hex << off << "~" << len
+ << std::dec << " in bin " << bin << dendl;
+ while (true) {
+ free[bin].insert(off, len, &off, &len);
+ unsigned newbin = _choose_bin(len);
+ if (newbin == bin)
+ break;
+ ldout(cct, 30) << __func__ << " promoting 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ free[bin].erase(off, len);
+ bin = newbin;
+ }
+}
+
+/// return the effective length of the extent if we align to alloc_unit
+uint64_t StupidAllocator::_aligned_len(
+ StupidAllocator::interval_set_t::iterator p,
+ uint64_t alloc_unit)
+{
+ uint64_t skew = p.get_start() % alloc_unit;
+ if (skew)
+ skew = alloc_unit - skew;
+ if (skew > p.get_len())
+ return 0;
+ else
+ return p.get_len() - skew;
+}
+
+int64_t StupidAllocator::allocate_int(
+ uint64_t want_size, uint64_t alloc_unit, int64_t hint,
+ uint64_t *offset, uint32_t *length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << " want_size 0x" << std::hex << want_size
+ << " alloc_unit 0x" << alloc_unit
+ << " hint 0x" << hint << std::dec
+ << dendl;
+ uint64_t want = std::max(alloc_unit, want_size);
+ int bin = _choose_bin(want);
+ int orig_bin = bin;
+
+ auto p = free[0].begin();
+
+ if (!hint)
+ hint = last_alloc;
+
+ // search up (from hint)
+ if (hint) {
+ for (bin = orig_bin; bin < (int)free.size(); ++bin) {
+ p = free[bin].lower_bound(hint);
+ while (p != free[bin].end()) {
+ if (_aligned_len(p, alloc_unit) >= want_size) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+ }
+
+ // search up (from origin, and skip searched extents by hint)
+ for (bin = orig_bin; bin < (int)free.size(); ++bin) {
+ p = free[bin].begin();
+ auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
+ while (p != end) {
+ if (_aligned_len(p, alloc_unit) >= want_size) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+
+ // search down (hint)
+ if (hint) {
+ for (bin = orig_bin; bin >= 0; --bin) {
+ p = free[bin].lower_bound(hint);
+ while (p != free[bin].end()) {
+ if (_aligned_len(p, alloc_unit) >= alloc_unit) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+ }
+
+ // search down (from origin, and skip searched extents by hint)
+ for (bin = orig_bin; bin >= 0; --bin) {
+ p = free[bin].begin();
+ auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
+ while (p != end) {
+ if (_aligned_len(p, alloc_unit) >= alloc_unit) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+
+ return -ENOSPC;
+
+ found:
+ uint64_t skew = p.get_start() % alloc_unit;
+ if (skew)
+ skew = alloc_unit - skew;
+ *offset = p.get_start() + skew;
+ *length = std::min(std::max(alloc_unit, want_size), p2align((p.get_len() - skew), alloc_unit));
+ if (cct->_conf->bluestore_debug_small_allocations) {
+ uint64_t max =
+ alloc_unit * (rand() % cct->_conf->bluestore_debug_small_allocations);
+ if (max && *length > max) {
+ ldout(cct, 10) << __func__ << " shortening allocation of 0x" << std::hex
+ << *length << " -> 0x"
+ << max << " due to debug_small_allocations" << std::dec
+ << dendl;
+ *length = max;
+ }
+ }
+ ldout(cct, 30) << __func__ << " got 0x" << std::hex << *offset << "~" << *length
+ << " from bin " << std::dec << bin << dendl;
+
+ free[bin].erase(*offset, *length);
+ uint64_t off, len;
+ if (*offset && free[bin].contains(*offset - skew - 1, &off, &len)) {
+ int newbin = _choose_bin(len);
+ if (newbin != bin) {
+ ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ free[bin].erase(off, len);
+ _insert_free(off, len);
+ }
+ }
+ if (free[bin].contains(*offset + *length, &off, &len)) {
+ int newbin = _choose_bin(len);
+ if (newbin != bin) {
+ ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ free[bin].erase(off, len);
+ _insert_free(off, len);
+ }
+ }
+
+ num_free -= *length;
+ ceph_assert(num_free >= 0);
+ last_alloc = *offset + *length;
+ return 0;
+}
+
+int64_t StupidAllocator::allocate(
+ uint64_t want_size,
+ uint64_t alloc_unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents)
+{
+ uint64_t allocated_size = 0;
+ uint64_t offset = 0;
+ uint32_t length = 0;
+ int res = 0;
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want_size;
+ }
+ // cap with 32-bit val
+ max_alloc_size = std::min(max_alloc_size, 0x10000000 - alloc_unit);
+
+ while (allocated_size < want_size) {
+ res = allocate_int(std::min(max_alloc_size, (want_size - allocated_size)),
+ alloc_unit, hint, &offset, &length);
+ if (res != 0) {
+ /*
+ * Allocation failed.
+ */
+ break;
+ }
+ bool can_append = true;
+ if (!extents->empty()) {
+ bluestore_pextent_t &last_extent = extents->back();
+ if (last_extent.end() == offset) {
+ uint64_t l64 = last_extent.length;
+ l64 += length;
+ if (l64 < 0x100000000 && l64 <= max_alloc_size) {
+ can_append = false;
+ last_extent.length += length;
+ }
+ }
+ }
+ if (can_append) {
+ extents->emplace_back(bluestore_pextent_t(offset, length));
+ }
+
+ allocated_size += length;
+ hint = offset + length;
+ }
+
+ if (allocated_size == 0) {
+ return -ENOSPC;
+ }
+ return allocated_size;
+}
+
+void StupidAllocator::release(
+ const interval_set<uint64_t>& release_set)
+{
+ std::lock_guard l(lock);
+ for (interval_set<uint64_t>::const_iterator p = release_set.begin();
+ p != release_set.end();
+ ++p) {
+ const auto offset = p.get_start();
+ const auto length = p.get_len();
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _insert_free(offset, length);
+ num_free += length;
+ }
+}
+
+uint64_t StupidAllocator::get_free()
+{
+ std::lock_guard l(lock);
+ return num_free;
+}
+
+double StupidAllocator::get_fragmentation()
+{
+ ceph_assert(block_size);
+ double res;
+ uint64_t max_intervals = 0;
+ uint64_t intervals = 0;
+ {
+ std::lock_guard l(lock);
+ max_intervals = p2roundup<uint64_t>(num_free, block_size) / block_size;
+ for (unsigned bin = 0; bin < free.size(); ++bin) {
+ intervals += free[bin].num_intervals();
+ }
+ }
+ ldout(cct, 30) << __func__ << " " << intervals << "/" << max_intervals
+ << dendl;
+ ceph_assert(intervals <= max_intervals);
+ if (!intervals || max_intervals <= 1) {
+ return 0.0;
+ }
+ intervals--;
+ max_intervals--;
+ res = (double)intervals / max_intervals;
+ return res;
+}
+
+void StupidAllocator::dump()
+{
+ std::lock_guard l(lock);
+ for (unsigned bin = 0; bin < free.size(); ++bin) {
+ ldout(cct, 0) << __func__ << " free bin " << bin << ": "
+ << free[bin].num_intervals() << " extents" << dendl;
+ for (auto p = free[bin].begin();
+ p != free[bin].end();
+ ++p) {
+ ldout(cct, 0) << __func__ << " 0x" << std::hex << p.get_start() << "~"
+ << p.get_len() << std::dec << dendl;
+ }
+ }
+}
+
+void StupidAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ std::lock_guard l(lock);
+ for (unsigned bin = 0; bin < free.size(); ++bin) {
+ for (auto p = free[bin].begin(); p != free[bin].end(); ++p) {
+ notify(p.get_start(), p.get_len());
+ }
+ }
+}
+
+void StupidAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _insert_free(offset, length);
+ num_free += length;
+}
+
+void StupidAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ interval_set_t rm;
+ rm.insert(offset, length);
+ for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) {
+ interval_set_t overlap;
+ overlap.intersection_of(rm, free[i]);
+ if (!overlap.empty()) {
+ ldout(cct, 20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap
+ << std::dec << dendl;
+ auto it = overlap.begin();
+ auto it_end = overlap.end();
+ while (it != it_end) {
+ auto o = it.get_start();
+ auto l = it.get_len();
+
+ free[i].erase(o, l,
+ [&](uint64_t off, uint64_t len) {
+ unsigned newbin = _choose_bin(len);
+ if (newbin != i) {
+ ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ _insert_free(off, len);
+ return true;
+ }
+ return false;
+ });
+ ++it;
+ }
+
+ rm.subtract(overlap);
+ }
+ }
+ ceph_assert(rm.empty());
+ num_free -= length;
+ ceph_assert(num_free >= 0);
+}
+
+
+void StupidAllocator::shutdown()
+{
+ ldout(cct, 1) << __func__ << dendl;
+}
+
diff --git a/src/os/bluestore/StupidAllocator.h b/src/os/bluestore/StupidAllocator.h
new file mode 100644
index 00000000..d9c4a447
--- /dev/null
+++ b/src/os/bluestore/StupidAllocator.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H
+#define CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "include/btree_map.h"
+#include "include/interval_set.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+class StupidAllocator : public Allocator {
+ CephContext* cct;
+ ceph::mutex lock = ceph::make_mutex("StupidAllocator::lock");
+
+ int64_t num_free; ///< total bytes in freelist
+ int64_t block_size;
+
+ typedef mempool::bluestore_alloc::pool_allocator<
+ pair<const uint64_t,uint64_t>> allocator_t;
+ typedef btree::btree_map<uint64_t,uint64_t,std::less<uint64_t>,allocator_t> interval_set_map_t;
+ typedef interval_set<uint64_t,interval_set_map_t> interval_set_t;
+ std::vector<interval_set_t> free; ///< leading-edge copy
+
+ uint64_t last_alloc;
+
+ unsigned _choose_bin(uint64_t len);
+ void _insert_free(uint64_t offset, uint64_t len);
+
+ uint64_t _aligned_len(
+ interval_set_t::iterator p,
+ uint64_t alloc_unit);
+
+public:
+ StupidAllocator(CephContext* cct, const std::string& name, int64_t block_size);
+ ~StupidAllocator() override;
+
+ int64_t allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents) override;
+
+ int64_t allocate_int(
+ uint64_t want_size, uint64_t alloc_unit, int64_t hint,
+ uint64_t *offset, uint32_t *length);
+
+ void release(
+ const interval_set<uint64_t>& release_set) override;
+
+ uint64_t get_free() override;
+ double get_fragmentation() override;
+
+ void dump() override;
+ void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
+
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+
+ void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/aio.cc b/src/os/bluestore/aio.cc
new file mode 100644
index 00000000..eb0c13fe
--- /dev/null
+++ b/src/os/bluestore/aio.cc
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include "ceph_aio.h"
+
+std::ostream& operator<<(std::ostream& os, const aio_t& aio)
+{
+ unsigned i = 0;
+ os << "aio: ";
+ for (auto& iov : aio.iov) {
+ os << "\n [" << i++ << "] 0x"
+ << std::hex << iov.iov_base << "~" << iov.iov_len << std::dec;
+ }
+ return os;
+}
+
+int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
+ uint16_t aios_size, void *priv,
+ int *retries)
+{
+ // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+ int attempts = 16;
+ int delay = 125;
+ int r;
+
+ aio_iter cur = begin;
+ struct aio_t *piocb[aios_size];
+ int left = 0;
+ while (cur != end) {
+ cur->priv = priv;
+ *(piocb+left) = &(*cur);
+ ++left;
+ ++cur;
+ }
+ ceph_assert(aios_size >= left);
+ int done = 0;
+ while (left > 0) {
+#if defined(HAVE_LIBAIO)
+ r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done));
+#elif defined(HAVE_POSIXAIO)
+ if (piocb[done]->n_aiocb == 1) {
+ // TODO: consider batching multiple reads together with lio_listio
+ piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
+ piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx;
+ piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done];
+ r = aio_read(&piocb[done]->aio.aiocb);
+ } else {
+ struct sigevent sev;
+ sev.sigev_notify = SIGEV_KEVENT;
+ sev.sigev_notify_kqueue = ctx;
+ sev.sigev_value.sival_ptr = piocb[done];
+ r = lio_listio(LIO_NOWAIT, &piocb[done]->aio.aiocbp, piocb[done]->n_aiocb, &sev);
+ }
+#endif
+ if (r < 0) {
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ (*retries)++;
+ continue;
+ }
+ return r;
+ }
+ ceph_assert(r > 0);
+ done += r;
+ left -= r;
+ attempts = 16;
+ delay = 125;
+ }
+ return done;
+}
+
+int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
+{
+#if defined(HAVE_LIBAIO)
+ io_event events[max];
+#elif defined(HAVE_POSIXAIO)
+ struct kevent events[max];
+#endif
+ struct timespec t = {
+ timeout_ms / 1000,
+ (timeout_ms % 1000) * 1000 * 1000
+ };
+
+ int r = 0;
+ do {
+#if defined(HAVE_LIBAIO)
+ r = io_getevents(ctx, 1, max, events, &t);
+#elif defined(HAVE_POSIXAIO)
+ r = kevent(ctx, NULL, 0, events, max, &t);
+ if (r < 0)
+ r = -errno;
+#endif
+ } while (r == -EINTR);
+
+ for (int i=0; i<r; ++i) {
+#if defined(HAVE_LIBAIO)
+ paio[i] = (aio_t *)events[i].obj;
+ paio[i]->rval = events[i].res;
+#else
+ paio[i] = (aio_t*)events[i].udata;
+ if (paio[i]->n_aiocb == 1) {
+ paio[i]->rval = aio_return(&paio[i]->aio.aiocb);
+ } else {
+ // Emulate the return value of pwritev. I can't find any documentation
+ // for what the value of io_event.res is supposed to be. I'm going to
+ // assume that it's just like pwritev/preadv/pwrite/pread.
+ paio[i]->rval = 0;
+ for (int j = 0; j < paio[i]->n_aiocb; j++) {
+ int res = aio_return(&paio[i]->aio.aiocbp[j]);
+ if (res < 0) {
+ paio[i]->rval = res;
+ break;
+ } else {
+ paio[i]->rval += res;
+ }
+ }
+ free(paio[i]->aio.aiocbp);
+ }
+#endif
+ }
+ return r;
+}
diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
new file mode 100644
index 00000000..c565f43b
--- /dev/null
+++ b/src/os/bluestore/bluefs_types.cc
@@ -0,0 +1,213 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include "bluefs_types.h"
+#include "common/Formatter.h"
+#include "include/uuid.h"
+#include "include/stringify.h"
+
+// bluefs_extent_t
+void bluefs_extent_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("bdev", bdev);
+}
+
+void bluefs_extent_t::generate_test_instances(list<bluefs_extent_t*>& ls)
+{
+ ls.push_back(new bluefs_extent_t);
+ ls.push_back(new bluefs_extent_t);
+ ls.back()->offset = 1;
+ ls.back()->length = 2;
+ ls.back()->bdev = 1;
+}
+
+ostream& operator<<(ostream& out, const bluefs_extent_t& e)
+{
+ return out << (int)e.bdev << ":0x" << std::hex << e.offset << "~" << e.length
+ << std::dec;
+}
+
+// bluefs_super_t
+
+void bluefs_super_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(uuid, bl);
+ encode(osd_uuid, bl);
+ encode(version, bl);
+ encode(block_size, bl);
+ encode(log_fnode, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluefs_super_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(uuid, p);
+ decode(osd_uuid, p);
+ decode(version, p);
+ decode(block_size, p);
+ decode(log_fnode, p);
+ DECODE_FINISH(p);
+}
+
+void bluefs_super_t::dump(Formatter *f) const
+{
+ f->dump_stream("uuid") << uuid;
+ f->dump_stream("osd_uuid") << osd_uuid;
+ f->dump_unsigned("version", version);
+ f->dump_unsigned("block_size", block_size);
+ f->dump_object("log_fnode", log_fnode);
+}
+
+void bluefs_super_t::generate_test_instances(list<bluefs_super_t*>& ls)
+{
+ ls.push_back(new bluefs_super_t);
+ ls.push_back(new bluefs_super_t);
+ ls.back()->version = 1;
+ ls.back()->block_size = 4096;
+}
+
+ostream& operator<<(ostream& out, const bluefs_super_t& s)
+{
+ return out << "super(uuid " << s.uuid
+ << " osd " << s.osd_uuid
+ << " v " << s.version
+ << " block_size 0x" << std::hex << s.block_size
+ << " log_fnode 0x" << s.log_fnode
+ << std::dec << ")";
+}
+
+// bluefs_fnode_t
+
+mempool::bluefs::vector<bluefs_extent_t>::iterator bluefs_fnode_t::seek(
+ uint64_t offset, uint64_t *x_off)
+{
+ auto p = extents.begin();
+
+ if (extents_index.size() > 4) {
+ auto it = std::upper_bound(extents_index.begin(), extents_index.end(),
+ offset);
+ assert(it != extents_index.begin());
+ --it;
+ assert(offset >= *it);
+ p += it - extents_index.begin();
+ offset -= *it;
+ }
+
+ while (p != extents.end()) {
+ if ((int64_t) offset >= p->length) {
+ offset -= p->length;
+ ++p;
+ } else {
+ break;
+ }
+ }
+ *x_off = offset;
+ return p;
+}
+
+void bluefs_fnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("ino", ino);
+ f->dump_unsigned("size", size);
+ f->dump_stream("mtime") << mtime;
+ f->open_array_section("extents");
+ for (auto& p : extents)
+ f->dump_object("extent", p);
+ f->close_section();
+}
+
+void bluefs_fnode_t::generate_test_instances(list<bluefs_fnode_t*>& ls)
+{
+ ls.push_back(new bluefs_fnode_t);
+ ls.push_back(new bluefs_fnode_t);
+ ls.back()->ino = 123;
+ ls.back()->size = 1048576;
+ ls.back()->mtime = utime_t(123,45);
+ ls.back()->extents.push_back(bluefs_extent_t(0, 1048576, 4096));
+ ls.back()->__unused__ = 1;
+}
+
+ostream& operator<<(ostream& out, const bluefs_fnode_t& file)
+{
+ return out << "file(ino " << file.ino
+ << " size 0x" << std::hex << file.size << std::dec
+ << " mtime " << file.mtime
+ << " allocated " << std::hex << file.allocated << std::dec
+ << " extents " << file.extents
+ << ")";
+}
+
+
+// bluefs_transaction_t
+
+void bluefs_transaction_t::encode(bufferlist& bl) const
+{
+ uint32_t crc = op_bl.crc32c(-1);
+ ENCODE_START(1, 1, bl);
+ encode(uuid, bl);
+ encode(seq, bl);
+ // not using bufferlist encode method, as it merely copies the bufferptr and not
+ // contents, meaning we're left with fragmented target bl
+ __u32 len = op_bl.length();
+ encode(len, bl);
+ for (auto& it : op_bl.buffers()) {
+ bl.append(it.c_str(), it.length());
+ }
+ encode(crc, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluefs_transaction_t::decode(bufferlist::const_iterator& p)
+{
+ uint32_t crc;
+ DECODE_START(1, p);
+ decode(uuid, p);
+ decode(seq, p);
+ decode(op_bl, p);
+ decode(crc, p);
+ DECODE_FINISH(p);
+ uint32_t actual = op_bl.crc32c(-1);
+ if (actual != crc)
+ throw buffer::malformed_input("bad crc " + stringify(actual)
+ + " expected " + stringify(crc));
+}
+
+void bluefs_transaction_t::dump(Formatter *f) const
+{
+ f->dump_stream("uuid") << uuid;
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("op_bl_length", op_bl.length());
+ f->dump_unsigned("crc", op_bl.crc32c(-1));
+}
+
+void bluefs_transaction_t::generate_test_instances(
+ list<bluefs_transaction_t*>& ls)
+{
+ ls.push_back(new bluefs_transaction_t);
+ ls.push_back(new bluefs_transaction_t);
+ ls.back()->op_init();
+ ls.back()->op_alloc_add(0, 0, 123123211);
+ ls.back()->op_alloc_rm(1, 0, 123);
+ ls.back()->op_dir_create("dir");
+ ls.back()->op_dir_create("dir2");
+ bluefs_fnode_t fnode;
+ fnode.ino = 2;
+ ls.back()->op_file_update(fnode);
+ ls.back()->op_dir_link("dir", "file1", 2);
+ ls.back()->op_dir_unlink("dir", "file1");
+ ls.back()->op_file_remove(2);
+ ls.back()->op_dir_remove("dir2");
+}
+
+ostream& operator<<(ostream& out, const bluefs_transaction_t& t)
+{
+ return out << "txn(seq " << t.seq
+ << " len 0x" << std::hex << t.op_bl.length()
+ << " crc 0x" << t.op_bl.crc32c(-1)
+ << std::dec << ")";
+}
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
new file mode 100644
index 00000000..9ac27fab
--- /dev/null
+++ b/src/os/bluestore/bluefs_types.h
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
+#define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
+
+#include "bluestore_types.h"
+#include "include/utime.h"
+#include "include/encoding.h"
+#include "include/denc.h"
+
+class bluefs_extent_t {
+public:
+ uint64_t offset = 0;
+ uint32_t length = 0;
+ uint8_t bdev;
+
+ bluefs_extent_t(uint8_t b = 0, uint64_t o = 0, uint32_t l = 0)
+ : offset(o), length(l), bdev(b) {}
+
+ uint64_t end() const { return offset + length; }
+ DENC(bluefs_extent_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_lba(v.offset, p);
+ denc_varint_lowz(v.length, p);
+ denc(v.bdev, p);
+ DENC_FINISH(p);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluefs_extent_t*>&);
+};
+WRITE_CLASS_DENC(bluefs_extent_t)
+
+ostream& operator<<(ostream& out, const bluefs_extent_t& e);
+
+struct bluefs_fnode_t {
+ uint64_t ino;
+ uint64_t size;
+ utime_t mtime;
+ uint8_t __unused__; // was prefer_bdev
+ mempool::bluefs::vector<bluefs_extent_t> extents;
+
+ // precalculated logical offsets for extents vector entries
+ // allows fast lookup for extent index by the offset value via upper_bound()
+ mempool::bluefs::vector<uint64_t> extents_index;
+
+ uint64_t allocated;
+
+ bluefs_fnode_t() : ino(0), size(0), __unused__(0), allocated(0) {}
+
+ uint64_t get_allocated() const {
+ return allocated;
+ }
+
+ void recalc_allocated() {
+ allocated = 0;
+ extents_index.reserve(extents.size());
+ for (auto& p : extents) {
+ extents_index.emplace_back(allocated);
+ allocated += p.length;
+ }
+ }
+
+ DENC_HELPERS
+ void bound_encode(size_t& p) const {
+ _denc_friend(*this, p);
+ }
+ void encode(bufferlist::contiguous_appender& p) const {
+ DENC_DUMP_PRE(bluefs_fnode_t);
+ _denc_friend(*this, p);
+ DENC_DUMP_POST(bluefs_fnode_t);
+ }
+ void decode(buffer::ptr::const_iterator& p) {
+ _denc_friend(*this, p);
+ recalc_allocated();
+ }
+ template<typename T, typename P>
+ friend std::enable_if_t<std::is_same_v<bluefs_fnode_t, std::remove_const_t<T>>>
+ _denc_friend(T& v, P& p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.ino, p);
+ denc_varint(v.size, p);
+ denc(v.mtime, p);
+ denc(v.__unused__, p);
+ denc(v.extents, p);
+ DENC_FINISH(p);
+ }
+
+ void append_extent(const bluefs_extent_t& ext) {
+ if (!extents.empty() &&
+ extents.back().end() == ext.offset &&
+ extents.back().bdev == ext.bdev &&
+ (uint64_t)extents.back().length + (uint64_t)ext.length < 0xffffffff) {
+ extents.back().length += ext.length;
+ } else {
+ extents_index.emplace_back(allocated);
+ extents.push_back(ext);
+ }
+ allocated += ext.length;
+ }
+
+ void pop_front_extent() {
+ auto it = extents.begin();
+ allocated -= it->length;
+ extents_index.erase(extents_index.begin());
+ for (auto& i: extents_index) {
+ i -= it->length;
+ }
+ extents.erase(it);
+ }
+
+ void swap_extents(bluefs_fnode_t& other) {
+ other.extents.swap(extents);
+ other.extents_index.swap(extents_index);
+ std::swap(allocated, other.allocated);
+ }
+ void clear_extents() {
+ extents_index.clear();
+ extents.clear();
+ allocated = 0;
+ }
+
+ mempool::bluefs::vector<bluefs_extent_t>::iterator seek(
+ uint64_t off, uint64_t *x_off);
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluefs_fnode_t*>& ls);
+
+};
+WRITE_CLASS_DENC(bluefs_fnode_t)
+
+ostream& operator<<(ostream& out, const bluefs_fnode_t& file);
+
+
+struct bluefs_super_t {
+ uuid_d uuid; ///< unique to this bluefs instance
+ uuid_d osd_uuid; ///< matches the osd that owns us
+ uint64_t version;
+ uint32_t block_size;
+
+ bluefs_fnode_t log_fnode;
+
+ bluefs_super_t()
+ : version(0),
+ block_size(4096) { }
+
+ uint64_t block_mask() const {
+ return ~((uint64_t)block_size - 1);
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluefs_super_t*>& ls);
+};
+WRITE_CLASS_ENCODER(bluefs_super_t)
+
+ostream& operator<<(ostream&, const bluefs_super_t& s);
+
+
+struct bluefs_transaction_t {
+ typedef enum {
+ OP_NONE = 0,
+ OP_INIT, ///< initial (empty) file system marker
+ OP_ALLOC_ADD, ///< add extent to available block storage (extent)
+ OP_ALLOC_RM, ///< remove extent from available block storage (extent)
+ OP_DIR_LINK, ///< (re)set a dir entry (dirname, filename, ino)
+ OP_DIR_UNLINK, ///< remove a dir entry (dirname, filename)
+ OP_DIR_CREATE, ///< create a dir (dirname)
+ OP_DIR_REMOVE, ///< remove a dir (dirname)
+ OP_FILE_UPDATE, ///< set/update file metadata (file)
+ OP_FILE_REMOVE, ///< remove file (ino)
+ OP_JUMP, ///< jump the seq # and offset
+ OP_JUMP_SEQ, ///< jump the seq #
+ } op_t;
+
+ uuid_d uuid; ///< fs uuid
+ uint64_t seq; ///< sequence number
+ bufferlist op_bl; ///< encoded transaction ops
+
+ bluefs_transaction_t() : seq(0) {}
+
+ void clear() {
+ *this = bluefs_transaction_t();
+ }
+ bool empty() const {
+ return op_bl.length() == 0;
+ }
+
+ void op_init() {
+ using ceph::encode;
+ encode((__u8)OP_INIT, op_bl);
+ }
+ void op_alloc_add(uint8_t id, uint64_t offset, uint64_t length) {
+ using ceph::encode;
+ encode((__u8)OP_ALLOC_ADD, op_bl);
+ encode(id, op_bl);
+ encode(offset, op_bl);
+ encode(length, op_bl);
+ }
+ void op_alloc_rm(uint8_t id, uint64_t offset, uint64_t length) {
+ using ceph::encode;
+ encode((__u8)OP_ALLOC_RM, op_bl);
+ encode(id, op_bl);
+ encode(offset, op_bl);
+ encode(length, op_bl);
+ }
+ void op_dir_create(const string& dir) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_CREATE, op_bl);
+ encode(dir, op_bl);
+ }
+ void op_dir_remove(const string& dir) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_REMOVE, op_bl);
+ encode(dir, op_bl);
+ }
+ void op_dir_link(const string& dir, const string& file, uint64_t ino) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_LINK, op_bl);
+ encode(dir, op_bl);
+ encode(file, op_bl);
+ encode(ino, op_bl);
+ }
+ void op_dir_unlink(const string& dir, const string& file) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_UNLINK, op_bl);
+ encode(dir, op_bl);
+ encode(file, op_bl);
+ }
+ void op_file_update(const bluefs_fnode_t& file) {
+ using ceph::encode;
+ encode((__u8)OP_FILE_UPDATE, op_bl);
+ encode(file, op_bl);
+ }
+ void op_file_remove(uint64_t ino) {
+ using ceph::encode;
+ encode((__u8)OP_FILE_REMOVE, op_bl);
+ encode(ino, op_bl);
+ }
+ void op_jump(uint64_t next_seq, uint64_t offset) {
+ using ceph::encode;
+ encode((__u8)OP_JUMP, op_bl);
+ encode(next_seq, op_bl);
+ encode(offset, op_bl);
+ }
+ void op_jump_seq(uint64_t next_seq) {
+ using ceph::encode;
+ encode((__u8)OP_JUMP_SEQ, op_bl);
+ encode(next_seq, op_bl);
+ }
+ void claim_ops(bluefs_transaction_t& from) {
+ op_bl.claim_append(from.op_bl);
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluefs_transaction_t*>& ls);
+};
+WRITE_CLASS_ENCODER(bluefs_transaction_t)
+
+ostream& operator<<(ostream& out, const bluefs_transaction_t& t);
+
+#endif
diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc
new file mode 100644
index 00000000..fc33289b
--- /dev/null
+++ b/src/os/bluestore/bluestore_tool.cc
@@ -0,0 +1,864 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "global/global_init.h"
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+
+#include "os/bluestore/BlueFS.h"
+#include "os/bluestore/BlueStore.h"
+#include "common/admin_socket.h"
+
+namespace po = boost::program_options;
+
+void usage(po::options_description &desc)
+{
+ cout << desc << std::endl;
+}
+
+void validate_path(CephContext *cct, const string& path, bool bluefs)
+{
+ BlueStore bluestore(cct, path);
+ string type;
+ int r = bluestore.read_meta("type", &type);
+ if (r < 0) {
+ cerr << "failed to load os-type: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (type != "bluestore") {
+ cerr << "expected bluestore, but type is " << type << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (!bluefs) {
+ return;
+ }
+
+ string kv_backend;
+ r = bluestore.read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ cerr << "failed to load kv_backend: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (kv_backend != "rocksdb") {
+ cerr << "expect kv_backend to be rocksdb, but is " << kv_backend
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string bluefs_enabled;
+ r = bluestore.read_meta("bluefs", &bluefs_enabled);
+ if (r < 0) {
+ cerr << "failed to load do_bluefs: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (bluefs_enabled != "1") {
+ cerr << "bluefs not enabled for rocksdb" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+}
+
+const char* find_device_path(
+ int id,
+ CephContext *cct,
+ const vector<string>& devs)
+{
+ for (auto& i : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct, i, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << i << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if ((id == BlueFS::BDEV_SLOW && label.description == "main") ||
+ (id == BlueFS::BDEV_DB && label.description == "bluefs db") ||
+ (id == BlueFS::BDEV_WAL && label.description == "bluefs wal")) {
+ return i.c_str();
+ }
+ }
+ return nullptr;
+}
+
+void parse_devices(
+ CephContext *cct,
+ const vector<string>& devs,
+ map<string, int>* got,
+ bool* has_db,
+ bool* has_wal)
+{
+ string main;
+ bool was_db = false;
+ if (has_wal) {
+ *has_wal = false;
+ }
+ if (has_db) {
+ *has_db = false;
+ }
+ for (auto& d : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct, d, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << d << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int id = -1;
+ if (label.description == "main")
+ main = d;
+ else if (label.description == "bluefs db") {
+ id = BlueFS::BDEV_DB;
+ was_db = true;
+ if (has_db) {
+ *has_db = true;
+ }
+ }
+ else if (label.description == "bluefs wal") {
+ id = BlueFS::BDEV_WAL;
+ if (has_wal) {
+ *has_wal = true;
+ }
+ }
+ if (id >= 0) {
+ got->emplace(d, id);
+ }
+ }
+ if (main.length()) {
+ int id = was_db ? BlueFS::BDEV_SLOW : BlueFS::BDEV_DB;
+ got->emplace(main, id);
+ }
+}
+
+void add_devices(
+ BlueFS *fs,
+ CephContext *cct,
+ const vector<string>& devs)
+{
+ map<string, int> got;
+ parse_devices(cct, devs, &got, nullptr, nullptr);
+ for(auto e : got) {
+ char target_path[PATH_MAX] = "";
+ if(!e.first.empty()) {
+ if (realpath(e.first.c_str(), target_path) == nullptr) {
+ cerr << "failed to retrieve absolute path for " << e.first
+ << ": " << cpp_strerror(errno)
+ << std::endl;
+ }
+ }
+
+ cout << " slot " << e.second << " " << e.first;
+ if (target_path[0]) {
+ cout << " -> " << target_path;
+ }
+ cout << std::endl;
+ int r = fs->add_block_device(e.second, e.first, false);
+ if (r < 0) {
+ cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+BlueFS *open_bluefs(
+ CephContext *cct,
+ const string& path,
+ const vector<string>& devs)
+{
+ validate_path(cct, path, true);
+ BlueFS *fs = new BlueFS(cct);
+
+ add_devices(fs, cct, devs);
+
+ int r = fs->mount();
+ if (r < 0) {
+ cerr << "unable to mount bluefs: " << cpp_strerror(r)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ return fs;
+}
+
+void log_dump(
+ CephContext *cct,
+ const string& path,
+ const vector<string>& devs)
+{
+ BlueFS* fs = open_bluefs(cct, path, devs);
+ int r = fs->log_dump();
+ if (r < 0) {
+ cerr << "log_dump failed" << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ delete fs;
+}
+
+void inferring_bluefs_devices(vector<string>& devs, std::string& path)
+{
+ cout << "inferring bluefs devices from bluestore path" << std::endl;
+ for (auto fn : {"block", "block.wal", "block.db"}) {
+ string p = path + "/" + fn;
+ struct stat st;
+ if (::stat(p.c_str(), &st) == 0) {
+ devs.push_back(p);
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ string out_dir;
+ vector<string> devs;
+ vector<string> devs_source;
+ string dev_target;
+ string path;
+ string action;
+ string log_file;
+ string key, value;
+ vector<string> allocs_name;
+ int log_level = 30;
+ bool fsck_deep = false;
+ po::options_description po_options("Options");
+ po_options.add_options()
+ ("help,h", "produce help message")
+ ("path", po::value<string>(&path), "bluestore path")
+ ("out-dir", po::value<string>(&out_dir), "output directory")
+ ("log-file,l", po::value<string>(&log_file), "log file")
+ ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
+ ("dev", po::value<vector<string>>(&devs), "device(s)")
+ ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-dev-migrate source device(s)")
+ ("dev-target", po::value<string>(&dev_target), "target/resulting device")
+ ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
+ ("key,k", po::value<string>(&key), "label metadata key name")
+ ("value,v", po::value<string>(&value), "label metadata value")
+ ("allocator", po::value<vector<string>>(&allocs_name), "allocator to inspect: 'block'/'bluefs-wal'/'bluefs-db'/'bluefs-slow'")
+ ;
+ po::options_description po_positional("Positional options");
+ po_positional.add_options()
+ ("command", po::value<string>(&action),
+ "fsck, "
+ "repair, "
+ "quick-fix, "
+ "bluefs-export, "
+ "bluefs-bdev-sizes, "
+ "bluefs-bdev-expand, "
+ "bluefs-bdev-new-db, "
+ "bluefs-bdev-new-wal, "
+ "bluefs-bdev-migrate, "
+ "show-label, "
+ "set-label-key, "
+ "rm-label-key, "
+ "prime-osd-dir, "
+ "bluefs-log-dump, "
+ "free-dump, "
+ "free-score")
+ ;
+ po::options_description po_all("All options");
+ po_all.add(po_options).add(po_positional);
+ po::positional_options_description pd;
+ pd.add("command", 1);
+
+ vector<string> ceph_option_strings;
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).options(po_all).allow_unregistered().positional(pd).run();
+ po::store( parsed, vm);
+ po::notify(vm);
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+ } catch(po::error &e) {
+ std::cerr << e.what() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ // normalize path (remove ending '/' if any)
+ if (path.size() > 1 && *(path.end() - 1) == '/') {
+ path.resize(path.size() - 1);
+ }
+ if (vm.count("help")) {
+ usage(po_all);
+ exit(EXIT_SUCCESS);
+ }
+ if (action.empty()) {
+ cerr << "must specify an action; --help for help" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if (action == "fsck" || action == "repair" || action == "quick-fix") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "prime-osd-dir") {
+ if (devs.size() != 1) {
+ cerr << "must specify the main bluestore device" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (path.empty()) {
+ cerr << "must specify osd dir to prime" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "set-label-key" ||
+ action == "rm-label-key") {
+ if (devs.size() != 1) {
+ cerr << "must specify the main bluestore device" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (key.size() == 0) {
+ cerr << "must specify a key name with -k" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (action == "set-label-key" && value.size() == 0) {
+ cerr << "must specify a value with -v" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "show-label") {
+ if (devs.empty() && path.empty()) {
+ cerr << "must specify bluestore path *or* raw device(s)" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (devs.empty())
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-export" || action == "bluefs-log-dump") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if ((action == "bluefs-export") && out_dir.empty()) {
+ cerr << "must specify out-dir to export bluefs" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-bdev-sizes" || action == "bluefs-bdev-expand") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (dev_target.empty()) {
+ cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl;
+ }
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-bdev-migrate") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ inferring_bluefs_devices(devs, path);
+ if (devs_source.size() == 0) {
+ cerr << "must specify source devices with --devs-source" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (dev_target.empty()) {
+ cerr << "must specify target device with --dev-target" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "free-score" || action == "free-dump") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ for (auto name : allocs_name) {
+ if (!name.empty() &&
+ name != "block" &&
+ name != "bluefs-db" &&
+ name != "bluefs-wal" &&
+ name != "bluefs-slow") {
+ cerr << "unknown allocator '" << name << "'" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (allocs_name.empty())
+ allocs_name = vector<string>{"block", "bluefs-db", "bluefs-wal", "bluefs-slow"};
+ }
+ vector<const char*> args;
+ if (log_file.size()) {
+ args.push_back("--log-file");
+ args.push_back(log_file.c_str());
+ static char ll[10];
+ snprintf(ll, sizeof(ll), "%d", log_level);
+ args.push_back("--debug-bluestore");
+ args.push_back(ll);
+ args.push_back("--debug-bluefs");
+ args.push_back(ll);
+ }
+ args.push_back("--no-log-to-stderr");
+ args.push_back("--err-to-stderr");
+
+ for (auto& i : ceph_option_strings) {
+ args.push_back(i.c_str());
+ }
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+ common_init_finish(cct.get());
+
+ if (action == "fsck" ||
+ action == "repair" ||
+ action == "quick-fix") {
+ validate_path(cct.get(), path, false);
+ BlueStore bluestore(cct.get(), path);
+ int r;
+ if (action == "fsck") {
+ r = bluestore.fsck(fsck_deep);
+ } else if (action == "repair") {
+ r = bluestore.repair(fsck_deep);
+ } else {
+ r = bluestore.quick_fix();
+ }
+ if (r < 0) {
+ cerr << "error from fsck: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ } else if (r > 0) {
+ cerr << action << " found " << r << " error(s)" << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ cout << action << " success" << std::endl;
+ }
+ }
+ else if (action == "prime-osd-dir") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "failed to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // kludge some things into the map that we want to populate into
+ // target dir
+ label.meta["path_block"] = devs.front();
+ label.meta["type"] = "bluestore";
+ label.meta["fsid"] = stringify(label.osd_uuid);
+
+ for (auto kk : {
+ "whoami",
+ "osd_key",
+ "ceph_fsid",
+ "fsid",
+ "type",
+ "ready" }) {
+ string k = kk;
+ auto i = label.meta.find(k);
+ if (i == label.meta.end()) {
+ continue;
+ }
+ string p = path + "/" + k;
+ string v = i->second;
+ if (k == "osd_key") {
+ p = path + "/keyring";
+ v = "[osd.";
+ v += label.meta["whoami"];
+ v += "]\nkey = " + i->second;
+ }
+ v += "\n";
+ int fd = ::open(p.c_str(), O_CREAT|O_TRUNC|O_WRONLY|O_CLOEXEC, 0600);
+ if (fd < 0) {
+ cerr << "error writing " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int r = safe_write(fd, v.c_str(), v.size());
+ if (r < 0) {
+ cerr << "error writing to " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ ::close(fd);
+ }
+ }
+ else if (action == "show-label") {
+ JSONFormatter jf(true);
+ jf.open_object_section("devices");
+ for (auto& i : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << i << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ jf.open_object_section(i.c_str());
+ label.dump(&jf);
+ jf.close_section();
+ }
+ jf.close_section();
+ jf.flush(cout);
+ }
+ else if (action == "set-label-key") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (key == "size") {
+ label.size = strtoull(value.c_str(), nullptr, 10);
+ } else if (key =="osd_uuid") {
+ label.osd_uuid.parse(value.c_str());
+ } else if (key =="btime") {
+ uint64_t epoch;
+ uint64_t nsec;
+ int r = utime_t::parse_date(value.c_str(), &epoch, &nsec);
+ if (r == 0) {
+ label.btime = utime_t(epoch, nsec);
+ }
+ } else if (key =="description") {
+ label.description = value;
+ } else {
+ label.meta[key] = value;
+ }
+ r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+ if (r < 0) {
+ cerr << "unable to write label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (action == "rm-label-key") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (!label.meta.count(key)) {
+ cerr << "key '" << key << "' not present" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ label.meta.erase(key);
+ r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+ if (r < 0) {
+ cerr << "unable to write label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (action == "bluefs-bdev-sizes") {
+ BlueStore bluestore(cct.get(), path);
+ bluestore.dump_bluefs_sizes(cout);
+ }
+ else if (action == "bluefs-bdev-expand") {
+ BlueStore bluestore(cct.get(), path);
+ auto r = bluestore.expand_devices(cout);
+ if (r <0) {
+ cerr << "failed to expand bluestore devices: "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (action == "bluefs-export") {
+ BlueFS *fs = open_bluefs(cct.get(), path, devs);
+
+ vector<string> dirs;
+ int r = fs->readdir("", &dirs);
+ if (r < 0) {
+ cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if (::access(out_dir.c_str(), F_OK)) {
+ r = ::mkdir(out_dir.c_str(), 0755);
+ if (r < 0) {
+ r = -errno;
+ cerr << "mkdir " << out_dir << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ for (auto& dir : dirs) {
+ if (dir[0] == '.')
+ continue;
+ cout << dir << "/" << std::endl;
+ vector<string> ls;
+ r = fs->readdir(dir, &ls);
+ if (r < 0) {
+ cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string full = out_dir + "/" + dir;
+ if (::access(full.c_str(), F_OK)) {
+ r = ::mkdir(full.c_str(), 0755);
+ if (r < 0) {
+ r = -errno;
+ cerr << "mkdir " << full << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ for (auto& file : ls) {
+ if (file[0] == '.')
+ continue;
+ cout << dir << "/" << file << std::endl;
+ uint64_t size;
+ utime_t mtime;
+ r = fs->stat(dir, file, &size, &mtime);
+ if (r < 0) {
+ cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string path = out_dir + "/" + dir + "/" + file;
+ int fd = ::open(path.c_str(), O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ r = -errno;
+ cerr << "open " << path << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (size > 0) {
+ BlueFS::FileReader *h;
+ r = fs->open_for_read(dir, file, &h, false);
+ if (r < 0) {
+ cerr << "open_for_read " << dir << "/" << file << " failed: "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int pos = 0;
+ int left = size;
+ while (left) {
+ bufferlist bl;
+ r = fs->read(h, &h->buf, pos, left, &bl, NULL);
+ if (r <= 0) {
+ cerr << "read " << dir << "/" << file << " from " << pos
+ << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int rc = bl.write_fd(fd);
+ if (rc < 0) {
+ cerr << "write to " << path << " failed: "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ pos += r;
+ left -= r;
+ }
+ delete h;
+ }
+ ::close(fd);
+ }
+ }
+ fs->umount();
+ delete fs;
+ } else if (action == "bluefs-log-dump") {
+ log_dump(cct.get(), path, devs);
+ } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+ map<string, int> cur_devs_map;
+ bool need_db = action == "bluefs-bdev-new-db";
+
+ bool has_wal = false;
+ bool has_db = false;
+ char target_path[PATH_MAX] = "";
+
+ parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal);
+
+ if (has_db && has_wal) {
+ cerr << "can't allocate new device, both WAL and DB exist"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ } else if (need_db && has_db) {
+ cerr << "can't allocate new DB device, already exists"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ } else if (!need_db && has_wal) {
+ cerr << "can't allocate new WAL device, already exists"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ } else if(!dev_target.empty() &&
+ realpath(dev_target.c_str(), target_path) == nullptr) {
+ cerr << "failed to retrieve absolute path for " << dev_target
+ << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // Create either DB or WAL volume
+ int r = EXIT_FAILURE;
+ if (need_db && cct->_conf->bluestore_block_db_size == 0) {
+ cerr << "DB size isn't specified, "
+ "please set Ceph bluestore-block-db-size config parameter "
+ << std::endl;
+ } else if (!need_db && cct->_conf->bluestore_block_wal_size == 0) {
+ cerr << "WAL size isn't specified, "
+ "please set Ceph bluestore-block-wal-size config parameter "
+ << std::endl;
+ } else {
+ BlueStore bluestore(cct.get(), path);
+ r = bluestore.add_new_bluefs_device(
+ need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL,
+ target_path);
+ if (r == 0) {
+ cout << (need_db ? "DB" : "WAL") << " device added " << target_path
+ << std::endl;
+ } else {
+ cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:"
+ << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ } else if (action == "bluefs-bdev-migrate") {
+ map<string, int> cur_devs_map;
+ set<int> src_dev_ids;
+ map<string, int> src_devs;
+
+ parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr);
+ for (auto& s : devs_source) {
+ auto i = cur_devs_map.find(s);
+ if (i != cur_devs_map.end()) {
+ if (s == dev_target) {
+ cerr << "Device " << dev_target
+ << " is present in both source and target lists, omitted."
+ << std::endl;
+ } else {
+ src_devs.emplace(*i);
+ src_dev_ids.emplace(i->second);
+ }
+ } else {
+ cerr << "can't migrate " << s << ", not a valid bluefs volume "
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ auto i = cur_devs_map.find(dev_target);
+
+ if (i != cur_devs_map.end()) {
+ // Migrate to an existing BlueFS volume
+
+ auto dev_target_id = i->second;
+ if (dev_target_id == BlueFS::BDEV_WAL) {
+ // currently we're unable to migrate to WAL device since there is no space
+ // reserved for superblock
+ cerr << "Migrate to WAL device isn't supported." << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ BlueStore bluestore(cct.get(), path);
+ int r = bluestore.migrate_to_existing_bluefs_device(
+ src_dev_ids,
+ dev_target_id);
+ if (r == 0) {
+ for(auto src : src_devs) {
+ if (src.second != BlueFS::BDEV_SLOW) {
+ cout << " device removed:" << src.second << " " << src.first
+ << std::endl;
+ }
+ }
+ } else {
+ bool need_db = dev_target_id == BlueFS::BDEV_DB;
+ cerr << "failed to migrate to existing BlueFS device: "
+ << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+ << " " << dev_target
+ << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ } else {
+ // Migrate to a new BlueFS volume
+ // via creating either DB or WAL volume
+ char target_path[PATH_MAX] = "";
+ int dev_target_id;
+ if (src_dev_ids.count(BlueFS::BDEV_DB)) {
+ // if we have DB device in the source list - we create DB device
+ // (and may be remove WAL).
+ dev_target_id = BlueFS::BDEV_NEWDB;
+ } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) {
+ dev_target_id = BlueFS::BDEV_NEWWAL;
+ } else {
+ cerr << "Unable to migrate Slow volume to new location, "
+ "please allocate new DB or WAL with "
+ "--bluefs-bdev-new-db(wal) command"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if(!dev_target.empty() &&
+ realpath(dev_target.c_str(), target_path) == nullptr) {
+ cerr << "failed to retrieve absolute path for " << dev_target
+ << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ BlueStore bluestore(cct.get(), path);
+
+ bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+ int r = bluestore.migrate_to_new_bluefs_device(
+ src_dev_ids,
+ dev_target_id,
+ target_path);
+ if (r == 0) {
+ for(auto src : src_devs) {
+ if (src.second != BlueFS::BDEV_SLOW) {
+ cout << " device removed:" << src.second << " " << src.first
+ << std::endl;
+ }
+ }
+ cout << " device added: "
+ << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+ << " " << target_path
+ << std::endl;
+ } else {
+ cerr << "failed to migrate to new BlueFS device: "
+ << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+ << " " << target_path
+ << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ } else if (action == "free-dump" || action == "free-score") {
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+ std::string action_name = action == "free-dump" ? "dump" : "score";
+ validate_path(cct.get(), path, false);
+ BlueStore bluestore(cct.get(), path);
+ int r = bluestore.cold_open();
+ if (r < 0) {
+ cerr << "error from cold_open: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ for (auto alloc_name : allocs_name) {
+ ceph::bufferlist out;
+ bool b = admin_socket->execute_command(
+ "{\"prefix\": \"bluestore allocator " + action_name + " " + alloc_name + "\"}", out);
+ if (!b) {
+ cerr << "failure querying '" << alloc_name << "'" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ cout << alloc_name << ":" << std::endl;
+ cout << std::string(out.c_str(),out.length()) << std::endl;
+ }
+
+ bluestore.cold_close();
+ } else {
+ cerr << "unrecognized action " << action << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc
new file mode 100644
index 00000000..134eed5b
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.cc
@@ -0,0 +1,1138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "bluestore_types.h"
+#include "common/Formatter.h"
+#include "common/Checksummer.h"
+#include "include/stringify.h"
+
+// bluestore_bdev_label_t
+
+void bluestore_bdev_label_t::encode(bufferlist& bl) const
+{
+ // be slightly friendly to someone who looks at the device
+ bl.append("bluestore block device\n");
+ bl.append(stringify(osd_uuid));
+ bl.append("\n");
+ ENCODE_START(2, 1, bl);
+ encode(osd_uuid, bl);
+ encode(size, bl);
+ encode(btime, bl);
+ encode(description, bl);
+ encode(meta, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluestore_bdev_label_t::decode(bufferlist::const_iterator& p)
+{
+ p.advance(60u); // see above
+ DECODE_START(2, p);
+ decode(osd_uuid, p);
+ decode(size, p);
+ decode(btime, p);
+ decode(description, p);
+ if (struct_v >= 2) {
+ decode(meta, p);
+ }
+ DECODE_FINISH(p);
+}
+
+void bluestore_bdev_label_t::dump(Formatter *f) const
+{
+ f->dump_stream("osd_uuid") << osd_uuid;
+ f->dump_unsigned("size", size);
+ f->dump_stream("btime") << btime;
+ f->dump_string("description", description);
+ for (auto& i : meta) {
+ f->dump_string(i.first.c_str(), i.second);
+ }
+}
+
+void bluestore_bdev_label_t::generate_test_instances(
+ list<bluestore_bdev_label_t*>& o)
+{
+ o.push_back(new bluestore_bdev_label_t);
+ o.push_back(new bluestore_bdev_label_t);
+ o.back()->size = 123;
+ o.back()->btime = utime_t(4, 5);
+ o.back()->description = "fakey";
+ o.back()->meta["foo"] = "bar";
+}
+
+ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l)
+{
+ return out << "bdev(osd_uuid " << l.osd_uuid
+ << ", size 0x" << std::hex << l.size << std::dec
+ << ", btime " << l.btime
+ << ", desc " << l.description
+ << ", " << l.meta.size() << " meta"
+ << ")";
+}
+
+// cnode_t
+
+void bluestore_cnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("bits", bits);
+}
+
+void bluestore_cnode_t::generate_test_instances(list<bluestore_cnode_t*>& o)
+{
+ o.push_back(new bluestore_cnode_t());
+ o.push_back(new bluestore_cnode_t(0));
+ o.push_back(new bluestore_cnode_t(123));
+}
+
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l)
+{
+ return out << "cnode(bits " << l.bits << ")";
+}
+
+// bluestore_extent_ref_map_t
+
+void bluestore_extent_ref_map_t::_check() const
+{
+ uint64_t pos = 0;
+ unsigned refs = 0;
+ for (const auto &p : ref_map) {
+ if (p.first < pos)
+ ceph_abort_msg("overlap");
+ if (p.first == pos && p.second.refs == refs)
+ ceph_abort_msg("unmerged");
+ pos = p.first + p.second.length;
+ refs = p.second.refs;
+ }
+}
+
+void bluestore_extent_ref_map_t::_maybe_merge_left(
+ map<uint64_t,record_t>::iterator& p)
+{
+ if (p == ref_map.begin())
+ return;
+ auto q = p;
+ --q;
+ if (q->second.refs == p->second.refs &&
+ q->first + q->second.length == p->first) {
+ q->second.length += p->second.length;
+ ref_map.erase(p);
+ p = q;
+ }
+}
+
+void bluestore_extent_ref_map_t::get(uint64_t offset, uint32_t length)
+{
+ auto p = ref_map.lower_bound(offset);
+ if (p != ref_map.begin()) {
+ --p;
+ if (p->first + p->second.length <= offset) {
+ ++p;
+ }
+ }
+ while (length > 0) {
+ if (p == ref_map.end()) {
+ // nothing after offset; add the whole thing.
+ p = ref_map.insert(
+ map<uint64_t,record_t>::value_type(offset, record_t(length, 1))).first;
+ break;
+ }
+ if (p->first > offset) {
+ // gap
+ uint64_t newlen = std::min<uint64_t>(p->first - offset, length);
+ p = ref_map.insert(
+ map<uint64_t,record_t>::value_type(offset,
+ record_t(newlen, 1))).first;
+ offset += newlen;
+ length -= newlen;
+ _maybe_merge_left(p);
+ ++p;
+ continue;
+ }
+ if (p->first < offset) {
+ // split off the portion before offset
+ ceph_assert(p->first + p->second.length > offset);
+ uint64_t left = p->first + p->second.length - offset;
+ p->second.length = offset - p->first;
+ p = ref_map.insert(map<uint64_t,record_t>::value_type(
+ offset, record_t(left, p->second.refs))).first;
+ // continue below
+ }
+ ceph_assert(p->first == offset);
+ if (length < p->second.length) {
+ ref_map.insert(make_pair(offset + length,
+ record_t(p->second.length - length,
+ p->second.refs)));
+ p->second.length = length;
+ ++p->second.refs;
+ break;
+ }
+ ++p->second.refs;
+ offset += p->second.length;
+ length -= p->second.length;
+ _maybe_merge_left(p);
+ ++p;
+ }
+ if (p != ref_map.end())
+ _maybe_merge_left(p);
+ //_check();
+}
+
+void bluestore_extent_ref_map_t::put(
+ uint64_t offset, uint32_t length,
+ PExtentVector *release,
+ bool *maybe_unshared)
+{
+ //NB: existing entries in 'release' container must be preserved!
+ bool unshared = true;
+ auto p = ref_map.lower_bound(offset);
+ if (p == ref_map.end() || p->first > offset) {
+ if (p == ref_map.begin()) {
+ ceph_abort_msg("put on missing extent (nothing before)");
+ }
+ --p;
+ if (p->first + p->second.length <= offset) {
+ ceph_abort_msg("put on missing extent (gap)");
+ }
+ }
+ if (p->first < offset) {
+ uint64_t left = p->first + p->second.length - offset;
+ p->second.length = offset - p->first;
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ p = ref_map.insert(map<uint64_t,record_t>::value_type(
+ offset, record_t(left, p->second.refs))).first;
+ }
+ while (length > 0) {
+ ceph_assert(p->first == offset);
+ if (length < p->second.length) {
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ ref_map.insert(make_pair(offset + length,
+ record_t(p->second.length - length,
+ p->second.refs)));
+ if (p->second.refs > 1) {
+ p->second.length = length;
+ --p->second.refs;
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ _maybe_merge_left(p);
+ } else {
+ if (release)
+ release->push_back(bluestore_pextent_t(p->first, length));
+ ref_map.erase(p);
+ }
+ goto out;
+ }
+ offset += p->second.length;
+ length -= p->second.length;
+ if (p->second.refs > 1) {
+ --p->second.refs;
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ _maybe_merge_left(p);
+ ++p;
+ } else {
+ if (release)
+ release->push_back(bluestore_pextent_t(p->first, p->second.length));
+ ref_map.erase(p++);
+ }
+ }
+ if (p != ref_map.end())
+ _maybe_merge_left(p);
+ //_check();
+out:
+ if (maybe_unshared) {
+ if (unshared) {
+ // we haven't seen a ref != 1 yet; check the whole map.
+ for (auto& p : ref_map) {
+ if (p.second.refs != 1) {
+ unshared = false;
+ break;
+ }
+ }
+ }
+ *maybe_unshared = unshared;
+ }
+}
+
+bool bluestore_extent_ref_map_t::contains(uint64_t offset, uint32_t length) const
+{
+ auto p = ref_map.lower_bound(offset);
+ if (p == ref_map.end() || p->first > offset) {
+ if (p == ref_map.begin()) {
+ return false; // nothing before
+ }
+ --p;
+ if (p->first + p->second.length <= offset) {
+ return false; // gap
+ }
+ }
+ while (length > 0) {
+ if (p == ref_map.end())
+ return false;
+ if (p->first > offset)
+ return false;
+ if (p->first + p->second.length >= offset + length)
+ return true;
+ uint64_t overlap = p->first + p->second.length - offset;
+ offset += overlap;
+ length -= overlap;
+ ++p;
+ }
+ return true;
+}
+
+bool bluestore_extent_ref_map_t::intersects(
+ uint64_t offset,
+ uint32_t length) const
+{
+ auto p = ref_map.lower_bound(offset);
+ if (p != ref_map.begin()) {
+ --p;
+ if (p->first + p->second.length <= offset) {
+ ++p;
+ }
+ }
+ if (p == ref_map.end())
+ return false;
+ if (p->first >= offset + length)
+ return false;
+ return true; // intersects p!
+}
+
+void bluestore_extent_ref_map_t::dump(Formatter *f) const
+{
+ f->open_array_section("ref_map");
+ for (auto& p : ref_map) {
+ f->open_object_section("ref");
+ f->dump_unsigned("offset", p.first);
+ f->dump_unsigned("length", p.second.length);
+ f->dump_unsigned("refs", p.second.refs);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void bluestore_extent_ref_map_t::generate_test_instances(
+ list<bluestore_extent_ref_map_t*>& o)
+{
+ o.push_back(new bluestore_extent_ref_map_t);
+ o.push_back(new bluestore_extent_ref_map_t);
+ o.back()->get(10, 10);
+ o.back()->get(18, 22);
+ o.back()->get(20, 20);
+ o.back()->get(10, 25);
+ o.back()->get(15, 20);
+}
+
+ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& m)
+{
+ out << "ref_map(";
+ for (auto p = m.ref_map.begin(); p != m.ref_map.end(); ++p) {
+ if (p != m.ref_map.begin())
+ out << ",";
+ out << std::hex << "0x" << p->first << "~" << p->second.length << std::dec
+ << "=" << p->second.refs;
+ }
+ out << ")";
+ return out;
+}
+
+// bluestore_blob_use_tracker_t
+
+void bluestore_blob_use_tracker_t::allocate()
+{
+ ceph_assert(num_au != 0);
+ bytes_per_au = new uint32_t[num_au];
+ mempool::get_pool(
+ mempool::pool_index_t(mempool::mempool_bluestore_cache_other)).
+ adjust_count(1, sizeof(uint32_t) * num_au);
+
+ for (uint32_t i = 0; i < num_au; ++i) {
+ bytes_per_au[i] = 0;
+ }
+}
+
+void bluestore_blob_use_tracker_t::init(
+ uint32_t full_length, uint32_t _au_size) {
+ ceph_assert(!au_size || is_empty());
+ ceph_assert(_au_size > 0);
+ ceph_assert(full_length > 0);
+ clear();
+ uint32_t _num_au = round_up_to(full_length, _au_size) / _au_size;
+ au_size = _au_size;
+ if ( _num_au > 1 ) {
+ num_au = _num_au;
+ allocate();
+ }
+}
+
+void bluestore_blob_use_tracker_t::get(
+ uint32_t offset, uint32_t length)
+{
+ ceph_assert(au_size);
+ if (!num_au) {
+ total_bytes += length;
+ } else {
+ auto end = offset + length;
+
+ while (offset < end) {
+ auto phase = offset % au_size;
+ bytes_per_au[offset / au_size] +=
+ std::min(au_size - phase, end - offset);
+ offset += (phase ? au_size - phase : au_size);
+ }
+ }
+}
+
+bool bluestore_blob_use_tracker_t::put(
+ uint32_t offset, uint32_t length,
+ PExtentVector *release_units)
+{
+ ceph_assert(au_size);
+ if (release_units) {
+ release_units->clear();
+ }
+ bool maybe_empty = true;
+ if (!num_au) {
+ ceph_assert(total_bytes >= length);
+ total_bytes -= length;
+ } else {
+ auto end = offset + length;
+ uint64_t next_offs = 0;
+ while (offset < end) {
+ auto phase = offset % au_size;
+ size_t pos = offset / au_size;
+ auto diff = std::min(au_size - phase, end - offset);
+ ceph_assert(diff <= bytes_per_au[pos]);
+ bytes_per_au[pos] -= diff;
+ offset += (phase ? au_size - phase : au_size);
+ if (bytes_per_au[pos] == 0) {
+ if (release_units) {
+ if (release_units->empty() || next_offs != pos * au_size) {
+ release_units->emplace_back(pos * au_size, au_size);
+ } else {
+ release_units->back().length += au_size;
+ }
+ next_offs += au_size;
+ }
+ } else {
+ maybe_empty = false; // micro optimization detecting we aren't empty
+ // even in the affected extent
+ }
+ }
+ }
+ bool empty = maybe_empty ? !is_not_empty() : false;
+ if (empty && release_units) {
+ release_units->clear();
+ }
+ return empty;
+}
+
+bool bluestore_blob_use_tracker_t::can_split() const
+{
+ return num_au > 0;
+}
+
+bool bluestore_blob_use_tracker_t::can_split_at(uint32_t blob_offset) const
+{
+ ceph_assert(au_size);
+ return (blob_offset % au_size) == 0 &&
+ blob_offset < num_au * au_size;
+}
+
+void bluestore_blob_use_tracker_t::split(
+ uint32_t blob_offset,
+ bluestore_blob_use_tracker_t* r)
+{
+ ceph_assert(au_size);
+ ceph_assert(can_split());
+ ceph_assert(can_split_at(blob_offset));
+ ceph_assert(r->is_empty());
+
+ uint32_t new_num_au = blob_offset / au_size;
+ r->init( (num_au - new_num_au) * au_size, au_size);
+
+ for (auto i = new_num_au; i < num_au; i++) {
+ r->get((i - new_num_au) * au_size, bytes_per_au[i]);
+ bytes_per_au[i] = 0;
+ }
+ if (new_num_au == 0) {
+ clear();
+ } else if (new_num_au == 1) {
+ uint32_t tmp = bytes_per_au[0];
+ uint32_t _au_size = au_size;
+ clear();
+ au_size = _au_size;
+ total_bytes = tmp;
+ } else {
+ num_au = new_num_au;
+ }
+}
+
+bool bluestore_blob_use_tracker_t::equal(
+ const bluestore_blob_use_tracker_t& other) const
+{
+ if (!num_au && !other.num_au) {
+ return total_bytes == other.total_bytes && au_size == other.au_size;
+ } else if (num_au && other.num_au) {
+ if (num_au != other.num_au || au_size != other.au_size) {
+ return false;
+ }
+ for (size_t i = 0; i < num_au; i++) {
+ if (bytes_per_au[i] != other.bytes_per_au[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ uint32_t n = num_au ? num_au : other.num_au;
+ uint32_t referenced =
+ num_au ? other.get_referenced_bytes() : get_referenced_bytes();
+ auto bytes_per_au_tmp = num_au ? bytes_per_au : other.bytes_per_au;
+ uint32_t my_referenced = 0;
+ for (size_t i = 0; i < n; i++) {
+ my_referenced += bytes_per_au_tmp[i];
+ if (my_referenced > referenced) {
+ return false;
+ }
+ }
+ return my_referenced == referenced;
+}
+
+void bluestore_blob_use_tracker_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("num_au", num_au);
+ f->dump_unsigned("au_size", au_size);
+ if (!num_au) {
+ f->dump_unsigned("total_bytes", total_bytes);
+ } else {
+ f->open_array_section("bytes_per_au");
+ for (size_t i = 0; i < num_au; ++i) {
+ f->dump_unsigned("", bytes_per_au[i]);
+ }
+ f->close_section();
+ }
+}
+
+void bluestore_blob_use_tracker_t::generate_test_instances(
+ list<bluestore_blob_use_tracker_t*>& o)
+{
+ o.push_back(new bluestore_blob_use_tracker_t());
+ o.back()->init(16, 16);
+ o.back()->get(10, 10);
+ o.back()->get(10, 5);
+ o.push_back(new bluestore_blob_use_tracker_t());
+ o.back()->init(60, 16);
+ o.back()->get(18, 22);
+ o.back()->get(20, 20);
+ o.back()->get(15, 20);
+}
+
+ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& m)
+{
+ out << "use_tracker(" << std::hex;
+ if (!m.num_au) {
+ out << "0x" << m.au_size
+ << " "
+ << "0x" << m.total_bytes;
+ } else {
+ out << "0x" << m.num_au
+ << "*0x" << m.au_size
+ << " 0x[";
+ for (size_t i = 0; i < m.num_au; ++i) {
+ if (i != 0)
+ out << ",";
+ out << m.bytes_per_au[i];
+ }
+ out << "]";
+ }
+ out << std::dec << ")";
+ return out;
+}
+
+// bluestore_pextent_t
+
+void bluestore_pextent_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+ostream& operator<<(ostream& out, const bluestore_pextent_t& o) {
+ if (o.is_valid())
+ return out << "0x" << std::hex << o.offset << "~" << o.length << std::dec;
+ else
+ return out << "!~" << std::hex << o.length << std::dec;
+}
+
+void bluestore_pextent_t::generate_test_instances(list<bluestore_pextent_t*>& ls)
+{
+ ls.push_back(new bluestore_pextent_t);
+ ls.push_back(new bluestore_pextent_t(1, 2));
+}
+
+// bluestore_blob_t
+
+string bluestore_blob_t::get_flags_string(unsigned flags)
+{
+ string s;
+ if (flags & FLAG_COMPRESSED) {
+ if (s.length())
+ s += '+';
+ s += "compressed";
+ }
+ if (flags & FLAG_CSUM) {
+ if (s.length())
+ s += '+';
+ s += "csum";
+ }
+ if (flags & FLAG_HAS_UNUSED) {
+ if (s.length())
+ s += '+';
+ s += "has_unused";
+ }
+ if (flags & FLAG_SHARED) {
+ if (s.length())
+ s += '+';
+ s += "shared";
+ }
+
+ return s;
+}
+
+size_t bluestore_blob_t::get_csum_value_size() const
+{
+ return Checksummer::get_csum_value_size(csum_type);
+}
+
+void bluestore_blob_t::dump(Formatter *f) const
+{
+ f->open_array_section("extents");
+ for (auto& p : extents) {
+ f->dump_object("extent", p);
+ }
+ f->close_section();
+ f->dump_unsigned("logical_length", logical_length);
+ f->dump_unsigned("compressed_length", compressed_length);
+ f->dump_unsigned("flags", flags);
+ f->dump_unsigned("csum_type", csum_type);
+ f->dump_unsigned("csum_chunk_order", csum_chunk_order);
+ f->open_array_section("csum_data");
+ size_t n = get_csum_count();
+ for (unsigned i = 0; i < n; ++i)
+ f->dump_unsigned("csum", get_csum_item(i));
+ f->close_section();
+ f->dump_unsigned("unused", unused);
+}
+
+void bluestore_blob_t::generate_test_instances(list<bluestore_blob_t*>& ls)
+{
+ ls.push_back(new bluestore_blob_t);
+ ls.push_back(new bluestore_blob_t(0));
+ ls.push_back(new bluestore_blob_t);
+ ls.back()->allocated_test(bluestore_pextent_t(111, 222));
+ ls.push_back(new bluestore_blob_t);
+ ls.back()->init_csum(Checksummer::CSUM_XXHASH32, 16, 65536);
+ ls.back()->csum_data = buffer::claim_malloc(4, strdup("abcd"));
+ ls.back()->add_unused(0, 3);
+ ls.back()->add_unused(8, 8);
+ ls.back()->allocated_test(bluestore_pextent_t(0x40100000, 0x10000));
+ ls.back()->allocated_test(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x1000));
+ ls.back()->allocated_test(bluestore_pextent_t(0x40120000, 0x10000));
+}
+
+ostream& operator<<(ostream& out, const bluestore_blob_t& o)
+{
+ out << "blob(" << o.get_extents();
+ if (o.is_compressed()) {
+ out << " clen 0x" << std::hex
+ << o.get_logical_length()
+ << " -> 0x"
+ << o.get_compressed_payload_length()
+ << std::dec;
+ }
+ if (o.flags) {
+ out << " " << o.get_flags_string();
+ }
+ if (o.has_csum()) {
+ out << " " << Checksummer::get_csum_type_string(o.csum_type)
+ << "/0x" << std::hex << (1ull << o.csum_chunk_order) << std::dec;
+ }
+ if (o.has_unused())
+ out << " unused=0x" << std::hex << o.unused << std::dec;
+ out << ")";
+ return out;
+}
+
+void bluestore_blob_t::calc_csum(uint64_t b_off, const bufferlist& bl)
+{
+ switch (csum_type) {
+ case Checksummer::CSUM_XXHASH32:
+ Checksummer::calculate<Checksummer::xxhash32>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_XXHASH64:
+ Checksummer::calculate<Checksummer::xxhash64>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;;
+ case Checksummer::CSUM_CRC32C:
+ Checksummer::calculate<Checksummer::crc32c>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_CRC32C_16:
+ Checksummer::calculate<Checksummer::crc32c_16>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_CRC32C_8:
+ Checksummer::calculate<Checksummer::crc32c_8>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ }
+}
+
+int bluestore_blob_t::verify_csum(uint64_t b_off, const bufferlist& bl,
+ int* b_bad_off, uint64_t *bad_csum) const
+{
+ int r = 0;
+
+ *b_bad_off = -1;
+ switch (csum_type) {
+ case Checksummer::CSUM_NONE:
+ break;
+ case Checksummer::CSUM_XXHASH32:
+ *b_bad_off = Checksummer::verify<Checksummer::xxhash32>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_XXHASH64:
+ *b_bad_off = Checksummer::verify<Checksummer::xxhash64>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_CRC32C:
+ *b_bad_off = Checksummer::verify<Checksummer::crc32c>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_CRC32C_16:
+ *b_bad_off = Checksummer::verify<Checksummer::crc32c_16>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_CRC32C_8:
+ *b_bad_off = Checksummer::verify<Checksummer::crc32c_8>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ default:
+ r = -EOPNOTSUPP;
+ break;
+ }
+
+ if (r < 0)
+ return r;
+ else if (*b_bad_off >= 0)
+ return -1; // bad checksum
+ else
+ return 0;
+}
+
+void bluestore_blob_t::allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs)
+{
+ if (extents.size() == 0) {
+ // if blob is compressed then logical length to be already configured
+ // otherwise - to be unset.
+ ceph_assert((is_compressed() && logical_length != 0) ||
+ (!is_compressed() && logical_length == 0));
+
+ extents.reserve(allocs.size() + (b_off ? 1 : 0));
+ if (b_off) {
+ extents.emplace_back(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, b_off));
+
+ }
+ uint32_t new_len = b_off;
+ for (auto& a : allocs) {
+ extents.emplace_back(a.offset, a.length);
+ new_len += a.length;
+ }
+ if (!is_compressed()) {
+ logical_length = new_len;
+ }
+ } else {
+ ceph_assert(!is_compressed()); // partial allocations are forbidden when
+ // compressed
+ ceph_assert(b_off < logical_length);
+ uint32_t cur_offs = 0;
+ auto start_it = extents.begin();
+ size_t pos = 0;
+ while (true) {
+ ceph_assert(start_it != extents.end());
+ if (cur_offs + start_it->length > b_off) {
+ break;
+ }
+ cur_offs += start_it->length;
+ ++start_it;
+ ++pos;
+ }
+ uint32_t head = b_off - cur_offs;
+ uint32_t end_off = b_off + length;
+ auto end_it = start_it;
+
+ while (true) {
+ ceph_assert(end_it != extents.end());
+ ceph_assert(!end_it->is_valid());
+ if (cur_offs + end_it->length >= end_off) {
+ break;
+ }
+ cur_offs += end_it->length;
+ ++end_it;
+ }
+ ceph_assert(cur_offs + end_it->length >= end_off);
+ uint32_t tail = cur_offs + end_it->length - end_off;
+
+ start_it = extents.erase(start_it, end_it + 1);
+ size_t count = allocs.size();
+ count += head ? 1 : 0;
+ count += tail ? 1 : 0;
+ extents.insert(start_it,
+ count,
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET, 0));
+
+ // Workaround to resolve lack of proper iterator return in vector::insert
+ // Looks like some gcc/stl implementations still lack it despite c++11
+ // support claim
+ start_it = extents.begin() + pos;
+
+ if (head) {
+ start_it->length = head;
+ ++start_it;
+ }
+ for(auto& e : allocs) {
+ *start_it = e;
+ ++start_it;
+ }
+ if (tail) {
+ start_it->length = tail;
+ }
+ }
+}
+
+// cut it out of extents
+struct vecbuilder {
+ PExtentVector v;
+ uint64_t invalid = 0;
+
+ void add_invalid(uint64_t length) {
+ invalid += length;
+ }
+ void flush() {
+ if (invalid) {
+ v.emplace_back(bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET,
+ invalid));
+
+ invalid = 0;
+ }
+ }
+ void add(uint64_t offset, uint64_t length) {
+ if (offset == bluestore_pextent_t::INVALID_OFFSET) {
+ add_invalid(length);
+ }
+ else {
+ flush();
+ v.emplace_back(offset, length);
+ }
+ }
+};
+
+void bluestore_blob_t::allocated_test(const bluestore_pextent_t& alloc)
+{
+ extents.emplace_back(alloc);
+ if (!is_compressed()) {
+ logical_length += alloc.length;
+ }
+}
+
+bool bluestore_blob_t::release_extents(bool all,
+ const PExtentVector& logical,
+ PExtentVector* r)
+{
+ // common case: all of it?
+ if (all) {
+ uint64_t pos = 0;
+ for (auto& e : extents) {
+ if (e.is_valid()) {
+ r->push_back(e);
+ }
+ pos += e.length;
+ }
+ ceph_assert(is_compressed() || get_logical_length() == pos);
+ extents.resize(1);
+ extents[0].offset = bluestore_pextent_t::INVALID_OFFSET;
+ extents[0].length = pos;
+ return true;
+ }
+ // remove from pextents according to logical release list
+ vecbuilder vb;
+ auto loffs_it = logical.begin();
+ auto lend = logical.end();
+ uint32_t pext_loffs_start = 0; //starting loffset of the current pextent
+ uint32_t pext_loffs = 0; //current loffset
+ auto pext_it = extents.begin();
+ auto pext_end = extents.end();
+ while (pext_it != pext_end) {
+ if (loffs_it == lend ||
+ pext_loffs_start + pext_it->length <= loffs_it->offset) {
+ int delta0 = pext_loffs - pext_loffs_start;
+ ceph_assert(delta0 >= 0);
+ if ((uint32_t)delta0 < pext_it->length) {
+ vb.add(pext_it->offset + delta0, pext_it->length - delta0);
+ }
+ pext_loffs_start += pext_it->length;
+ pext_loffs = pext_loffs_start;
+ ++pext_it;
+ }
+ else {
+ //assert(pext_loffs == pext_loffs_start);
+ int delta0 = pext_loffs - pext_loffs_start;
+ ceph_assert(delta0 >= 0);
+
+ int delta = loffs_it->offset - pext_loffs;
+ ceph_assert(delta >= 0);
+ if (delta > 0) {
+ vb.add(pext_it->offset + delta0, delta);
+ pext_loffs += delta;
+ }
+
+ PExtentVector::iterator last_r = r->end();
+ if (r->begin() != last_r) {
+ --last_r;
+ }
+ uint32_t to_release = loffs_it->length;
+ do {
+ uint32_t to_release_part =
+ std::min(pext_it->length - delta0 - delta, to_release);
+ auto o = pext_it->offset + delta0 + delta;
+ if (last_r != r->end() && last_r->offset + last_r->length == o) {
+ last_r->length += to_release_part;
+ }
+ else {
+ last_r = r->emplace(r->end(), o, to_release_part);
+ }
+ to_release -= to_release_part;
+ pext_loffs += to_release_part;
+ if (pext_loffs == pext_loffs_start + pext_it->length) {
+ pext_loffs_start += pext_it->length;
+ pext_loffs = pext_loffs_start;
+ pext_it++;
+ delta0 = delta = 0;
+ }
+ } while (to_release > 0 && pext_it != pext_end);
+ vb.add_invalid(loffs_it->length - to_release);
+ ++loffs_it;
+ }
+ }
+ vb.flush();
+ extents.swap(vb.v);
+ return false;
+}
+
+void bluestore_blob_t::split(uint32_t blob_offset, bluestore_blob_t& rb)
+{
+ size_t left = blob_offset;
+ uint32_t llen_lb = 0;
+ uint32_t llen_rb = 0;
+ unsigned i = 0;
+ for (auto p = extents.begin(); p != extents.end(); ++p, ++i) {
+ if (p->length <= left) {
+ left -= p->length;
+ llen_lb += p->length;
+ continue;
+ }
+ if (left) {
+ if (p->is_valid()) {
+ rb.extents.emplace_back(bluestore_pextent_t(p->offset + left,
+ p->length - left));
+ }
+ else {
+ rb.extents.emplace_back(bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ p->length - left));
+ }
+ llen_rb += p->length - left;
+ llen_lb += left;
+ p->length = left;
+ ++i;
+ ++p;
+ }
+ while (p != extents.end()) {
+ llen_rb += p->length;
+ rb.extents.push_back(*p++);
+ }
+ extents.resize(i);
+ logical_length = llen_lb;
+ rb.logical_length = llen_rb;
+ break;
+ }
+ rb.flags = flags;
+
+ if (has_csum()) {
+ rb.csum_type = csum_type;
+ rb.csum_chunk_order = csum_chunk_order;
+ size_t csum_order = get_csum_chunk_size();
+ ceph_assert(blob_offset % csum_order == 0);
+ size_t pos = (blob_offset / csum_order) * get_csum_value_size();
+ // deep copy csum data
+ bufferptr old;
+ old.swap(csum_data);
+ rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos);
+ csum_data = bufferptr(old.c_str(), pos);
+ }
+}
+
+// bluestore_shared_blob_t
+MEMPOOL_DEFINE_OBJECT_FACTORY(bluestore_shared_blob_t, bluestore_shared_blob_t,
+ bluestore_cache_other);
+
+void bluestore_shared_blob_t::dump(Formatter *f) const
+{
+ f->dump_int("sbid", sbid);
+ f->dump_object("ref_map", ref_map);
+}
+
+void bluestore_shared_blob_t::generate_test_instances(
+ list<bluestore_shared_blob_t*>& ls)
+{
+ ls.push_back(new bluestore_shared_blob_t(1));
+}
+
+ostream& operator<<(ostream& out, const bluestore_shared_blob_t& sb)
+{
+ out << "(sbid 0x" << std::hex << sb.sbid << std::dec;
+ out << " " << sb.ref_map << ")";
+ return out;
+}
+
+// bluestore_onode_t
+
+void bluestore_onode_t::shard_info::dump(Formatter *f) const
+{
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("bytes", bytes);
+}
+
+ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si)
+{
+ return out << std::hex << "0x" << si.offset << "(0x" << si.bytes << " bytes"
+ << std::dec << ")";
+}
+
+void bluestore_onode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("nid", nid);
+ f->dump_unsigned("size", size);
+ f->open_object_section("attrs");
+ for (auto p = attrs.begin(); p != attrs.end(); ++p) {
+ f->open_object_section("attr");
+ f->dump_string("name", p->first.c_str()); // it's not quite std::string
+ f->dump_unsigned("len", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_string("flags", get_flags_string());
+ f->open_array_section("extent_map_shards");
+ for (auto si : extent_map_shards) {
+ f->dump_object("shard", si);
+ }
+ f->close_section();
+ f->dump_unsigned("expected_object_size", expected_object_size);
+ f->dump_unsigned("expected_write_size", expected_write_size);
+ f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+}
+
+void bluestore_onode_t::generate_test_instances(list<bluestore_onode_t*>& o)
+{
+ o.push_back(new bluestore_onode_t());
+ // FIXME
+}
+
+// bluestore_deferred_op_t
+
+void bluestore_deferred_op_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("op", (int)op);
+ f->dump_unsigned("data_len", data.length());
+ f->open_array_section("extents");
+ for (auto& e : extents) {
+ f->dump_object("extent", e);
+ }
+ f->close_section();
+}
+
+void bluestore_deferred_op_t::generate_test_instances(list<bluestore_deferred_op_t*>& o)
+{
+ o.push_back(new bluestore_deferred_op_t);
+ o.push_back(new bluestore_deferred_op_t);
+ o.back()->op = OP_WRITE;
+ o.back()->extents.push_back(bluestore_pextent_t(1, 2));
+ o.back()->extents.push_back(bluestore_pextent_t(100, 5));
+ o.back()->data.append("my data");
+}
+
+void bluestore_deferred_transaction_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("seq", seq);
+ f->open_array_section("ops");
+ for (list<bluestore_deferred_op_t>::const_iterator p = ops.begin(); p != ops.end(); ++p) {
+ f->dump_object("op", *p);
+ }
+ f->close_section();
+
+ f->open_array_section("released extents");
+ for (interval_set<uint64_t>::const_iterator p = released.begin(); p != released.end(); ++p) {
+ f->open_object_section("extent");
+ f->dump_unsigned("offset", p.get_start());
+ f->dump_unsigned("length", p.get_len());
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void bluestore_deferred_transaction_t::generate_test_instances(list<bluestore_deferred_transaction_t*>& o)
+{
+ o.push_back(new bluestore_deferred_transaction_t());
+ o.push_back(new bluestore_deferred_transaction_t());
+ o.back()->seq = 123;
+ o.back()->ops.push_back(bluestore_deferred_op_t());
+ o.back()->ops.push_back(bluestore_deferred_op_t());
+ o.back()->ops.back().op = bluestore_deferred_op_t::OP_WRITE;
+ o.back()->ops.back().extents.push_back(bluestore_pextent_t(1,7));
+ o.back()->ops.back().data.append("foodata");
+}
+
+void bluestore_compression_header_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("type", type);
+ f->dump_unsigned("length", length);
+}
+
+void bluestore_compression_header_t::generate_test_instances(
+ list<bluestore_compression_header_t*>& o)
+{
+ o.push_back(new bluestore_compression_header_t);
+ o.push_back(new bluestore_compression_header_t(1));
+ o.back()->length = 1234;
+}
diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h
new file mode 100644
index 00000000..8232801c
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.h
@@ -0,0 +1,1044 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+
+#include <ostream>
+#include <bitset>
+#include <type_traits>
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+#include "include/mempool.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+/// label for block device
+struct bluestore_bdev_label_t {
+ uuid_d osd_uuid; ///< osd uuid
+ uint64_t size = 0; ///< device size
+ utime_t btime; ///< birth time
+ string description; ///< device description
+
+ map<string,string> meta; ///< {read,write}_meta() content from ObjectStore
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_bdev_label_t*>& o);
+};
+WRITE_CLASS_ENCODER(bluestore_bdev_label_t)
+
+ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l);
+
+/// collection metadata
+struct bluestore_cnode_t {
+ uint32_t bits; ///< how many bits of coll pgid are significant
+
+ explicit bluestore_cnode_t(int b=0) : bits(b) {}
+
+ DENC(bluestore_cnode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.bits, p);
+ DENC_FINISH(p);
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_cnode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_cnode_t)
+
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l);
+
+template <typename OFFS_TYPE, typename LEN_TYPE>
+struct bluestore_interval_t
+{
+ static const uint64_t INVALID_OFFSET = ~0ull;
+
+ OFFS_TYPE offset = 0;
+ LEN_TYPE length = 0;
+
+ bluestore_interval_t(){}
+ bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+
+ bool is_valid() const {
+ return offset != INVALID_OFFSET;
+ }
+ uint64_t end() const {
+ return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
+ }
+
+ bool operator==(const bluestore_interval_t& other) const {
+ return offset == other.offset && length == other.length;
+ }
+
+};
+
+/// pextent: physical extent
+struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
+{
+ bluestore_pextent_t() {}
+ bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
+ bluestore_pextent_t(const bluestore_interval_t &ext) :
+ bluestore_interval_t(ext.offset, ext.length) {}
+
+ DENC(bluestore_pextent_t, v, p) {
+ denc_lba(v.offset, p);
+ denc_varint_lowz(v.length, p);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_pextent_t*>& ls);
+};
+WRITE_CLASS_DENC(bluestore_pextent_t)
+
+ostream& operator<<(ostream& out, const bluestore_pextent_t& o);
+
+typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
+
+template<>
+struct denc_traits<PExtentVector> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const PExtentVector& v, size_t& p) {
+ p += sizeof(uint32_t);
+ const auto size = v.size();
+ if (size) {
+ size_t per = 0;
+ denc(v.front(), per);
+ p += per * size;
+ }
+ }
+ static void encode(const PExtentVector& v,
+ bufferlist::contiguous_appender& p) {
+ denc_varint(v.size(), p);
+ for (auto& i : v) {
+ denc(i, p);
+ }
+ }
+ static void decode(PExtentVector& v, bufferptr::const_iterator& p) {
+ unsigned num;
+ denc_varint(num, p);
+ v.clear();
+ v.resize(num);
+ for (unsigned i=0; i<num; ++i) {
+ denc(v[i], p);
+ }
+ }
+};
+
+/// extent_map: a map of reference counted extents
+struct bluestore_extent_ref_map_t {
+ struct record_t {
+ uint32_t length;
+ uint32_t refs;
+ record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
+ DENC(bluestore_extent_ref_map_t::record_t, v, p) {
+ denc_varint_lowz(v.length, p);
+ denc_varint(v.refs, p);
+ }
+ };
+
+ typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
+ map_t ref_map;
+
+ void _check() const;
+ void _maybe_merge_left(map_t::iterator& p);
+
+ void clear() {
+ ref_map.clear();
+ }
+ bool empty() const {
+ return ref_map.empty();
+ }
+
+ void get(uint64_t offset, uint32_t len);
+ void put(uint64_t offset, uint32_t len, PExtentVector *release,
+ bool *maybe_unshared);
+
+ bool contains(uint64_t offset, uint32_t len) const;
+ bool intersects(uint64_t offset, uint32_t len) const;
+
+ void bound_encode(size_t& p) const {
+ denc_varint((uint32_t)0, p);
+ if (!ref_map.empty()) {
+ size_t elem_size = 0;
+ denc_varint_lowz((uint64_t)0, elem_size);
+ ref_map.begin()->second.bound_encode(elem_size);
+ p += elem_size * ref_map.size();
+ }
+ }
+ void encode(bufferlist::contiguous_appender& p) const {
+ const uint32_t n = ref_map.size();
+ denc_varint(n, p);
+ if (n) {
+ auto i = ref_map.begin();
+ denc_varint_lowz(i->first, p);
+ i->second.encode(p);
+ int64_t pos = i->first;
+ while (++i != ref_map.end()) {
+ denc_varint_lowz((int64_t)i->first - pos, p);
+ i->second.encode(p);
+ pos = i->first;
+ }
+ }
+ }
+ void decode(bufferptr::const_iterator& p) {
+ uint32_t n;
+ denc_varint(n, p);
+ if (n) {
+ int64_t pos;
+ denc_varint_lowz(pos, p);
+ ref_map[pos].decode(p);
+ while (--n) {
+ int64_t delta;
+ denc_varint_lowz(delta, p);
+ pos += delta;
+ ref_map[pos].decode(p);
+ }
+ }
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_extent_ref_map_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
+
+
+ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& rm);
+static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
+ const bluestore_extent_ref_map_t::record_t& r) {
+ return l.length == r.length && l.refs == r.refs;
+}
+static inline bool operator==(const bluestore_extent_ref_map_t& l,
+ const bluestore_extent_ref_map_t& r) {
+ return l.ref_map == r.ref_map;
+}
+static inline bool operator!=(const bluestore_extent_ref_map_t& l,
+ const bluestore_extent_ref_map_t& r) {
+ return !(l == r);
+}
+
+/// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage
+struct bluestore_blob_use_tracker_t {
+ // N.B.: There is no need to minimize au_size/num_au
+ // as much as possible (e.g. have just a single byte for au_size) since:
+ // 1) Struct isn't packed hence it's padded. And even if it's packed see 2)
+ // 2) Mem manager has its own granularity, most probably >= 8 bytes
+ //
+ uint32_t au_size; // Allocation (=tracking) unit size,
+ // == 0 if uninitialized
+ uint32_t num_au; // Amount of allocation units tracked
+ // == 0 if single unit or the whole blob is tracked
+
+ union {
+ uint32_t* bytes_per_au;
+ uint32_t total_bytes;
+ };
+
+ bluestore_blob_use_tracker_t()
+ : au_size(0), num_au(0), bytes_per_au(nullptr) {
+ }
+ ~bluestore_blob_use_tracker_t() {
+ clear();
+ }
+
+ void clear() {
+ if (num_au != 0) {
+ delete[] bytes_per_au;
+ mempool::get_pool(
+ mempool::pool_index_t(mempool::mempool_bluestore_cache_other)).
+ adjust_count(-1, -sizeof(uint32_t) * num_au);
+ }
+ bytes_per_au = 0;
+ au_size = 0;
+ num_au = 0;
+ }
+
+ uint32_t get_referenced_bytes() const {
+ uint32_t total = 0;
+ if (!num_au) {
+ total = total_bytes;
+ } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ total += bytes_per_au[i];
+ }
+ }
+ return total;
+ }
+ bool is_not_empty() const {
+ if (!num_au) {
+ return total_bytes != 0;
+ } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ if (bytes_per_au[i]) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+ bool is_empty() const {
+ return !is_not_empty();
+ }
+ void prune_tail(uint32_t new_len) {
+ if (num_au) {
+ new_len = round_up_to(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ ceph_assert(_num_au <= num_au);
+ if (_num_au) {
+ num_au = _num_au; // bytes_per_au array is left unmodified
+
+ } else {
+ clear();
+ }
+ }
+ }
+ void add_tail(uint32_t new_len, uint32_t _au_size) {
+ auto full_size = au_size * (num_au ? num_au : 1);
+ ceph_assert(new_len >= full_size);
+ if (new_len == full_size) {
+ return;
+ }
+ if (!num_au) {
+ uint32_t old_total = total_bytes;
+ total_bytes = 0;
+ init(new_len, _au_size);
+ ceph_assert(num_au);
+ bytes_per_au[0] = old_total;
+ } else {
+ ceph_assert(_au_size == au_size);
+ new_len = round_up_to(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ ceph_assert(_num_au >= num_au);
+ if (_num_au > num_au) {
+ auto old_bytes = bytes_per_au;
+ auto old_num_au = num_au;
+ num_au = _num_au;
+ allocate();
+ for (size_t i = 0; i < old_num_au; i++) {
+ bytes_per_au[i] = old_bytes[i];
+ }
+ for (size_t i = old_num_au; i < num_au; i++) {
+ bytes_per_au[i] = 0;
+ }
+ delete[] old_bytes;
+ }
+ }
+ }
+
+ void init(
+ uint32_t full_length,
+ uint32_t _au_size);
+
+ void get(
+ uint32_t offset,
+ uint32_t len);
+
+ /// put: return true if the blob has no references any more after the call,
+ /// no release_units is filled for the sake of performance.
+ /// return false if there are some references to the blob,
+ /// in this case release_units contains pextents
+ /// (identified by their offsets relative to the blob start)
+ /// that are not used any more and can be safely deallocated.
+ bool put(
+ uint32_t offset,
+ uint32_t len,
+ PExtentVector *release);
+
+ bool can_split() const;
+ bool can_split_at(uint32_t blob_offset) const;
+ void split(
+ uint32_t blob_offset,
+ bluestore_blob_use_tracker_t* r);
+
+ bool equal(
+ const bluestore_blob_use_tracker_t& other) const;
+
+ void bound_encode(size_t& p) const {
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+ } else {
+ size_t elem_size = 0;
+ denc_varint((uint32_t)0, elem_size);
+ p += elem_size * num_au;
+ }
+ }
+ }
+ void encode(bufferlist::contiguous_appender& p) const {
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+ } else {
+ size_t elem_size = 0;
+ denc_varint((uint32_t)0, elem_size);
+ for (size_t i = 0; i < num_au; ++i) {
+ denc_varint(bytes_per_au[i], p);
+ }
+ }
+ }
+ }
+ void decode(bufferptr::const_iterator& p) {
+ clear();
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+ } else {
+ allocate();
+ for (size_t i = 0; i < num_au; ++i) {
+ denc_varint(bytes_per_au[i], p);
+ }
+ }
+ }
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_blob_use_tracker_t*>& o);
+private:
+ void allocate();
+};
+WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
+ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& rm);
+
+/// blob: a piece of data on disk
+struct bluestore_blob_t {
+private:
+ PExtentVector extents; ///< raw data position on device
+ uint32_t logical_length = 0; ///< original length of data stored in the blob
+ uint32_t compressed_length = 0; ///< compressed length if any
+
+public:
+ enum {
+ LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split
+ FLAG_COMPRESSED = 2, ///< blob is compressed
+ FLAG_CSUM = 4, ///< blob has checksums
+ FLAG_HAS_UNUSED = 8, ///< blob has unused map
+ FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob
+ };
+ static string get_flags_string(unsigned flags);
+
+ uint32_t flags = 0; ///< FLAG_*
+
+ typedef uint16_t unused_t;
+ unused_t unused = 0; ///< portion that has never been written to (bitmap)
+
+ uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_*
+ uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes
+
+ bufferptr csum_data; ///< opaque vector of csum data
+
+ bluestore_blob_t(uint32_t f = 0) : flags(f) {}
+
+ const PExtentVector& get_extents() const {
+ return extents;
+ }
+ PExtentVector& dirty_extents() {
+ return extents;
+ }
+
+ DENC_HELPERS;
+ void bound_encode(size_t& p, uint64_t struct_v) const {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ denc_varint(csum_data.length(), p);
+ p += csum_data.length();
+ p += sizeof(unused_t);
+ }
+
+ void encode(bufferlist::contiguous_appender& p, uint64_t struct_v) const {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ if (is_compressed()) {
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ }
+ if (has_csum()) {
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ denc_varint(csum_data.length(), p);
+ memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
+ csum_data.length());
+ }
+ if (has_unused()) {
+ denc(unused, p);
+ }
+ }
+
+ void decode(bufferptr::const_iterator& p, uint64_t struct_v) {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ if (is_compressed()) {
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ } else {
+ logical_length = get_ondisk_length();
+ }
+ if (has_csum()) {
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ int len;
+ denc_varint(len, p);
+ csum_data = p.get_ptr(len);
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
+ if (has_unused()) {
+ denc(unused, p);
+ }
+ }
+
+ bool can_split() const {
+ return
+ !has_flag(FLAG_SHARED) &&
+ !has_flag(FLAG_COMPRESSED) &&
+ !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
+ }
+ bool can_split_at(uint32_t blob_offset) const {
+ return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_blob_t*>& ls);
+
+ bool has_flag(unsigned f) const {
+ return flags & f;
+ }
+ void set_flag(unsigned f) {
+ flags |= f;
+ }
+ void clear_flag(unsigned f) {
+ flags &= ~f;
+ }
+ string get_flags_string() const {
+ return get_flags_string(flags);
+ }
+
+ void set_compressed(uint64_t clen_orig, uint64_t clen) {
+ set_flag(FLAG_COMPRESSED);
+ logical_length = clen_orig;
+ compressed_length = clen;
+ }
+ bool is_mutable() const {
+ return !is_compressed() && !is_shared();
+ }
+ bool is_compressed() const {
+ return has_flag(FLAG_COMPRESSED);
+ }
+ bool has_csum() const {
+ return has_flag(FLAG_CSUM);
+ }
+ bool has_unused() const {
+ return has_flag(FLAG_HAS_UNUSED);
+ }
+ bool is_shared() const {
+ return has_flag(FLAG_SHARED);
+ }
+
+ /// return chunk (i.e. min readable block) size for the blob
+ uint64_t get_chunk_size(uint64_t dev_block_size) const {
+ return has_csum() ?
+ std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
+ }
+ uint32_t get_csum_chunk_size() const {
+ return 1 << csum_chunk_order;
+ }
+ uint32_t get_compressed_payload_length() const {
+ return is_compressed() ? compressed_length : 0;
+ }
+ uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ if (plen)
+ *plen = p->length - x_off;
+ return p->offset + x_off;
+ }
+
+ // validate whether or not the status of pextents within the given range
+ // meets the requirement(allocated or unallocated).
+ bool _validate_range(uint64_t b_off, uint64_t b_len,
+ bool require_allocated) const {
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (b_off >= p->length) {
+ b_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ b_len += b_off;
+ while (b_len) {
+ ceph_assert(p != extents.end());
+ if (require_allocated != p->is_valid()) {
+ return false;
+ }
+
+ if (p->length >= b_len) {
+ return true;
+ }
+ b_len -= p->length;
+ ++p;
+ }
+ ceph_abort_msg("we should not get here");
+ return false;
+ }
+
+ /// return true if the entire range is allocated
+ /// (mapped to extents on disk)
+ bool is_allocated(uint64_t b_off, uint64_t b_len) const {
+ return _validate_range(b_off, b_len, true);
+ }
+
+ /// return true if the entire range is unallocated
+ /// (not mapped to extents on disk)
+ bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
+ return _validate_range(b_off, b_len, false);
+ }
+
+ /// return true if the logical range has never been used
+ bool is_unused(uint64_t offset, uint64_t length) const {
+ if (!has_unused()) {
+ return false;
+ }
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = offset / chunk_size;
+ uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
+ auto i = start;
+ while (i < end && (unused & (1u << i))) {
+ i++;
+ }
+ return i >= end;
+ }
+
+ /// mark a range that has never been used
+ void add_unused(uint64_t offset, uint64_t length) {
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
+ uint64_t end = (offset + length) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ unused |= (1u << i);
+ }
+ if (start != end) {
+ set_flag(FLAG_HAS_UNUSED);
+ }
+ }
+
+ /// indicate that a range has (now) been used.
+ void mark_used(uint64_t offset, uint64_t length) {
+ if (has_unused()) {
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = offset / chunk_size;
+ uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ unused &= ~(1u << i);
+ }
+ if (unused == 0) {
+ clear_flag(FLAG_HAS_UNUSED);
+ }
+ }
+ }
+
+ template<class F>
+ int map(uint64_t x_off, uint64_t x_len, F&& f) const {
+ static_assert(std::is_invocable_r_v<int, F, uint64_t, uint64_t>);
+
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ while (x_len > 0) {
+ ceph_assert(p != extents.end());
+ uint64_t l = std::min(p->length - x_off, x_len);
+ int r = f(p->offset + x_off, l);
+ if (r < 0)
+ return r;
+ x_off = 0;
+ x_len -= l;
+ ++p;
+ }
+ return 0;
+ }
+ template<class F>
+ void map_bl(uint64_t x_off,
+ bufferlist& bl,
+ F&& f) const {
+ static_assert(std::is_invocable_v<F, uint64_t, bufferlist&>);
+
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ bufferlist::iterator it = bl.begin();
+ uint64_t x_len = bl.length();
+ while (x_len > 0) {
+ ceph_assert(p != extents.end());
+ uint64_t l = std::min(p->length - x_off, x_len);
+ bufferlist t;
+ it.copy(l, t);
+ f(p->offset + x_off, t);
+ x_off = 0;
+ x_len -= l;
+ ++p;
+ }
+ }
+
+ uint32_t get_ondisk_length() const {
+ uint32_t len = 0;
+ for (auto &p : extents) {
+ len += p.length;
+ }
+ return len;
+ }
+
+ uint32_t get_logical_length() const {
+ return logical_length;
+ }
+ size_t get_csum_value_size() const;
+
+ size_t get_csum_count() const {
+ size_t vs = get_csum_value_size();
+ if (!vs)
+ return 0;
+ return csum_data.length() / vs;
+ }
+ uint64_t get_csum_item(unsigned i) const {
+ size_t cs = get_csum_value_size();
+ const char *p = csum_data.c_str();
+ switch (cs) {
+ case 0:
+ ceph_abort_msg("no csum data, bad index");
+ case 1:
+ return reinterpret_cast<const uint8_t*>(p)[i];
+ case 2:
+ return reinterpret_cast<const ceph_le16*>(p)[i];
+ case 4:
+ return reinterpret_cast<const ceph_le32*>(p)[i];
+ case 8:
+ return reinterpret_cast<const ceph_le64*>(p)[i];
+ default:
+ ceph_abort_msg("unrecognized csum word size");
+ }
+ }
+ const char *get_csum_item_ptr(unsigned i) const {
+ size_t cs = get_csum_value_size();
+ return csum_data.c_str() + (cs * i);
+ }
+ char *get_csum_item_ptr(unsigned i) {
+ size_t cs = get_csum_value_size();
+ return csum_data.c_str() + (cs * i);
+ }
+
+ void init_csum(unsigned type, unsigned order, unsigned len) {
+ flags |= FLAG_CSUM;
+ csum_type = type;
+ csum_chunk_order = order;
+ csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
+ csum_data.zero();
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
+
+ /// calculate csum for the buffer at the given b_off
+ void calc_csum(uint64_t b_off, const bufferlist& bl);
+
+ /// verify csum: return -EOPNOTSUPP for unsupported checksum type;
+ /// return -1 and valid(nonnegative) b_bad_off for checksum error;
+ /// return 0 if all is well.
+ int verify_csum(uint64_t b_off, const bufferlist& bl, int* b_bad_off,
+ uint64_t *bad_csum) const;
+
+ bool can_prune_tail() const {
+ return
+ extents.size() > 1 && // if it's all invalid it's not pruning.
+ !extents.back().is_valid() &&
+ !has_unused();
+ }
+ void prune_tail() {
+ const auto &p = extents.back();
+ logical_length -= p.length;
+ extents.pop_back();
+ if (has_csum()) {
+ bufferptr t;
+ t.swap(csum_data);
+ csum_data = bufferptr(t.c_str(),
+ get_logical_length() / get_csum_chunk_size() *
+ get_csum_value_size());
+ }
+ }
+ void add_tail(uint32_t new_len) {
+ ceph_assert(is_mutable());
+ ceph_assert(!has_unused());
+ ceph_assert(new_len > logical_length);
+ extents.emplace_back(
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ new_len - logical_length));
+ logical_length = new_len;
+ if (has_csum()) {
+ bufferptr t;
+ t.swap(csum_data);
+ csum_data = buffer::create(
+ get_csum_value_size() * logical_length / get_csum_chunk_size());
+ csum_data.copy_in(0, t.length(), t.c_str());
+ csum_data.zero(t.length(), csum_data.length() - t.length());
+ }
+ }
+ uint32_t get_release_size(uint32_t min_alloc_size) const {
+ if (is_compressed()) {
+ return get_logical_length();
+ }
+ uint32_t res = get_csum_chunk_size();
+ if (!has_csum() || res < min_alloc_size) {
+ res = min_alloc_size;
+ }
+ return res;
+ }
+
+ void split(uint32_t blob_offset, bluestore_blob_t& rb);
+ void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
+ void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
+
+ /// updates blob's pextents container and return unused pextents eligible
+ /// for release.
+ /// all - indicates that the whole blob to be released.
+ /// logical - specifies set of logical extents within blob's
+ /// to be released
+ /// Returns true if blob has no more valid pextents
+ bool release_extents(
+ bool all,
+ const PExtentVector& logical,
+ PExtentVector* r);
+};
+WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)
+
+ostream& operator<<(ostream& out, const bluestore_blob_t& o);
+
+
+/// shared blob state
+struct bluestore_shared_blob_t {
+ MEMPOOL_CLASS_HELPERS();
+ uint64_t sbid; ///> shared blob id
+ bluestore_extent_ref_map_t ref_map; ///< shared blob extents
+
+ bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
+ bluestore_shared_blob_t(uint64_t _sbid,
+ bluestore_extent_ref_map_t&& _ref_map )
+ : sbid(_sbid), ref_map(std::move(_ref_map)) {}
+
+ DENC(bluestore_shared_blob_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.ref_map, p);
+ DENC_FINISH(p);
+ }
+
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_shared_blob_t*>& ls);
+
+ bool empty() const {
+ return ref_map.empty();
+ }
+};
+WRITE_CLASS_DENC(bluestore_shared_blob_t)
+
+ostream& operator<<(ostream& out, const bluestore_shared_blob_t& o);
+
+/// onode: per-object metadata
+struct bluestore_onode_t {
+ uint64_t nid = 0; ///< numeric id (locally unique)
+ uint64_t size = 0; ///< object size
+ // mempool to be assigned to buffer::ptr manually
+ std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;
+
+ struct shard_info {
+ uint32_t offset = 0; ///< logical offset for start of shard
+ uint32_t bytes = 0; ///< encoded bytes
+ DENC(shard_info, v, p) {
+ denc_varint(v.offset, p);
+ denc_varint(v.bytes, p);
+ }
+ void dump(Formatter *f) const;
+ };
+ vector<shard_info> extent_map_shards; ///< extent map shards (if any)
+
+ uint32_t expected_object_size = 0;
+ uint32_t expected_write_size = 0;
+ uint32_t alloc_hint_flags = 0;
+
+ uint8_t flags = 0;
+
+ enum {
+ FLAG_OMAP = 1, ///< object may have omap data
+ FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix
+ };
+
+ string get_flags_string() const {
+ string s;
+ if (flags & FLAG_OMAP) {
+ s = "omap";
+ }
+ return s;
+ }
+
+ bool has_flag(unsigned f) const {
+ return flags & f;
+ }
+
+ void set_flag(unsigned f) {
+ flags |= f;
+ }
+
+ void clear_flag(unsigned f) {
+ flags &= ~f;
+ }
+
+ bool has_omap() const {
+ return has_flag(FLAG_OMAP);
+ }
+ bool is_pgmeta_omap() const {
+ return has_flag(FLAG_PGMETA_OMAP);
+ }
+
+ void set_omap_flag() {
+ set_flag(FLAG_OMAP);
+ }
+
+ void clear_omap_flag() {
+ clear_flag(FLAG_OMAP);
+ }
+
+ DENC(bluestore_onode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.nid, p);
+ denc_varint(v.size, p);
+ denc(v.attrs, p);
+ denc(v.flags, p);
+ denc(v.extent_map_shards, p);
+ denc_varint(v.expected_object_size, p);
+ denc_varint(v.expected_write_size, p);
+ denc_varint(v.alloc_hint_flags, p);
+ DENC_FINISH(p);
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_onode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
+WRITE_CLASS_DENC(bluestore_onode_t)
+
+ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si);
+
+/// writeahead-logged op
+struct bluestore_deferred_op_t {
+ typedef enum {
+ OP_WRITE = 1,
+ } type_t;
+ __u8 op = 0;
+
+ PExtentVector extents;
+ bufferlist data;
+
+ DENC(bluestore_deferred_op_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.extents, p);
+ denc(v.data, p);
+ DENC_FINISH(p);
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_deferred_op_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_op_t)
+
+
+/// writeahead-logged transaction
+struct bluestore_deferred_transaction_t {
+ uint64_t seq = 0;
+ list<bluestore_deferred_op_t> ops;
+ interval_set<uint64_t> released; ///< allocations to release after tx
+
+ bluestore_deferred_transaction_t() : seq(0) {}
+
+ DENC(bluestore_deferred_transaction_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.seq, p);
+ denc(v.ops, p);
+ denc(v.released, p);
+ DENC_FINISH(p);
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
+
+struct bluestore_compression_header_t {
+ uint8_t type = Compressor::COMP_ALG_NONE;
+ uint32_t length = 0;
+
+ bluestore_compression_header_t() {}
+ bluestore_compression_header_t(uint8_t _type)
+ : type(_type) {}
+
+ DENC(bluestore_compression_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.length, p);
+ DENC_FINISH(p);
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<bluestore_compression_header_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_compression_header_t)
+
+
+#endif
diff --git a/src/os/bluestore/ceph_aio.h b/src/os/bluestore/ceph_aio.h
new file mode 100644
index 00000000..ab033886
--- /dev/null
+++ b/src/os/bluestore/ceph_aio.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "acconfig.h"
+
+#if defined(HAVE_LIBAIO)
+#include <libaio.h>
+#elif defined(HAVE_POSIXAIO)
+#include <aio.h>
+#include <sys/event.h>
+#endif
+
+#include <boost/intrusive/list.hpp>
+#include <boost/container/small_vector.hpp>
+
+#include "include/buffer.h"
+#include "include/types.h"
+
+struct aio_t {
+#if defined(HAVE_LIBAIO)
+ struct iocb iocb{}; // must be first element; see shenanigans in aio_queue_t
+#elif defined(HAVE_POSIXAIO)
+ // static long aio_listio_max = -1;
+ union {
+ struct aiocb aiocb;
+ struct aiocb *aiocbp;
+ } aio;
+ int n_aiocb;
+#endif
+ void *priv;
+ int fd;
+ boost::container::small_vector<iovec,4> iov;
+ uint64_t offset, length;
+ long rval;
+ bufferlist bl; ///< write payload (so that it remains stable for duration)
+
+ boost::intrusive::list_member_hook<> queue_item;
+
+ aio_t(void *p, int f) : priv(p), fd(f), offset(0), length(0), rval(-1000) {
+ }
+
+ void pwritev(uint64_t _offset, uint64_t len) {
+ offset = _offset;
+ length = len;
+#if defined(HAVE_LIBAIO)
+ io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
+#elif defined(HAVE_POSIXAIO)
+ n_aiocb = iov.size();
+ aio.aiocbp = (struct aiocb*)calloc(iov.size(), sizeof(struct aiocb));
+ for (int i = 0; i < iov.size(); i++) {
+ aio.aiocbp[i].aio_fildes = fd;
+ aio.aiocbp[i].aio_offset = offset;
+ aio.aiocbp[i].aio_buf = iov[i].iov_base;
+ aio.aiocbp[i].aio_nbytes = iov[i].iov_len;
+ aio.aiocbp[i].aio_lio_opcode = LIO_WRITE;
+ offset += iov[i].iov_len;
+ }
+#endif
+ }
+ void pread(uint64_t _offset, uint64_t len) {
+ offset = _offset;
+ length = len;
+ bufferptr p = buffer::create_small_page_aligned(length);
+#if defined(HAVE_LIBAIO)
+ io_prep_pread(&iocb, fd, p.c_str(), length, offset);
+#elif defined(HAVE_POSIXAIO)
+ n_aiocb = 1;
+ aio.aiocb.aio_fildes = fd;
+ aio.aiocb.aio_buf = p.c_str();
+ aio.aiocb.aio_nbytes = length;
+ aio.aiocb.aio_offset = offset;
+#endif
+ bl.append(std::move(p));
+ }
+
+ long get_return_value() {
+ return rval;
+ }
+};
+
+std::ostream& operator<<(std::ostream& os, const aio_t& aio);
+
+typedef boost::intrusive::list<
+ aio_t,
+ boost::intrusive::member_hook<
+ aio_t,
+ boost::intrusive::list_member_hook<>,
+ &aio_t::queue_item> > aio_list_t;
+
+struct aio_queue_t {
+ int max_iodepth;
+#if defined(HAVE_LIBAIO)
+ io_context_t ctx;
+#elif defined(HAVE_POSIXAIO)
+ int ctx;
+#endif
+
+ typedef list<aio_t>::iterator aio_iter;
+
+ explicit aio_queue_t(unsigned max_iodepth)
+ : max_iodepth(max_iodepth),
+ ctx(0) {
+ }
+ ~aio_queue_t() {
+ ceph_assert(ctx == 0);
+ }
+
+ int init() {
+ ceph_assert(ctx == 0);
+#if defined(HAVE_LIBAIO)
+ int r = io_setup(max_iodepth, &ctx);
+ if (r < 0) {
+ if (ctx) {
+ io_destroy(ctx);
+ ctx = 0;
+ }
+ }
+ return r;
+#elif defined(HAVE_POSIXAIO)
+ ctx = kqueue();
+ if (ctx < 0)
+ return -errno;
+ else
+ return 0;
+#endif
+ }
+ void shutdown() {
+ if (ctx) {
+#if defined(HAVE_LIBAIO)
+ int r = io_destroy(ctx);
+#elif defined(HAVE_POSIXAIO)
+ int r = close(ctx);
+#endif
+ ceph_assert(r == 0);
+ ctx = 0;
+ }
+ }
+
+ int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size,
+ void *priv, int *retries);
+ int get_next_completed(int timeout_ms, aio_t **paio, int max);
+};
diff --git a/src/os/bluestore/fastbmap_allocator_impl.cc b/src/os/bluestore/fastbmap_allocator_impl.cc
new file mode 100755
index 00000000..c8909655
--- /dev/null
+++ b/src/os/bluestore/fastbmap_allocator_impl.cc
@@ -0,0 +1,717 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator implementation.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ *
+ */
+
+#include "fastbmap_allocator_impl.h"
+
+uint64_t AllocatorLevel::l0_dives = 0;
+uint64_t AllocatorLevel::l0_iterations = 0;
+uint64_t AllocatorLevel::l0_inner_iterations = 0;
+uint64_t AllocatorLevel::alloc_fragments = 0;
+uint64_t AllocatorLevel::alloc_fragments_fast = 0;
+uint64_t AllocatorLevel::l2_allocs = 0;
+
+inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length)
+{
+ interval_t res;
+ if (len >= min_length) {
+ res.offset = p2roundup(offset, min_length);
+ auto delta_off = res.offset - offset;
+ if (len > delta_off) {
+ res.length = len - delta_off;
+ res.length = p2align<uint64_t>(res.length, min_length);
+ if (res.length) {
+ return res;
+ }
+ }
+ }
+ return interval_t();
+}
+
+interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0,
+ uint64_t pos1, uint64_t min_length, interval_t* tail) const
+{
+ interval_t res;
+ if (pos0 >= pos1) {
+ return res;
+ }
+ auto pos = pos0;
+
+ interval_t res_candidate;
+ if (tail->length != 0) {
+ ceph_assert((tail->offset % l0_granularity) == 0);
+ ceph_assert((tail->length % l0_granularity) == 0);
+ res_candidate.offset = tail->offset / l0_granularity;
+ res_candidate.length = tail->length / l0_granularity;
+ }
+ *tail = interval_t();
+
+ auto d = bits_per_slot;
+ slot_t bits = l0[pos / d];
+ bits >>= pos % d;
+ bool end_loop = false;
+ auto min_granules = min_length / l0_granularity;
+
+ do {
+ if ((pos % d) == 0) {
+ bits = l0[pos / d];
+ if (pos1 - pos >= d) {
+ switch(bits) {
+ case all_slot_set:
+ // slot is totally free
+ if (!res_candidate.length) {
+ res_candidate.offset = pos;
+ }
+ res_candidate.length += d;
+ pos += d;
+ end_loop = pos >= pos1;
+ if (end_loop) {
+ *tail = res_candidate;
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if(res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ }
+ continue;
+ case all_slot_clear:
+ // slot is totally allocated
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if (res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ res_candidate = interval_t();
+ pos += d;
+ end_loop = pos >= pos1;
+ continue;
+ }
+ }
+ } //if ((pos % d) == 0)
+
+ end_loop = ++pos >= pos1;
+ if (bits & 1) {
+ // item is free
+ if (!res_candidate.length) {
+ res_candidate.offset = pos - 1;
+ }
+ ++res_candidate.length;
+ if (end_loop) {
+ *tail = res_candidate;
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if (res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ }
+ } else {
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if (res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ res_candidate = interval_t();
+ }
+ bits >>= 1;
+ } while (!end_loop);
+ res.offset *= l0_granularity;
+ res.length *= l0_granularity;
+ tail->offset *= l0_granularity;
+ tail->length *= l0_granularity;
+ return res;
+}
+
+void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start,
+ uint64_t pos_end, uint64_t length, uint64_t min_length, int mode,
+ search_ctx_t* ctx)
+{
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert((pos_start % d) == 0);
+ ceph_assert((pos_end % d) == 0);
+
+ uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+
+ uint64_t l1_pos = pos_start;
+ const interval_t empty_tail;
+ interval_t prev_tail;
+
+ uint64_t next_free_l1_pos = 0;
+ for (auto pos = pos_start / d; pos < pos_end / d; ++pos) {
+ slot_t slot_val = l1[pos];
+ // FIXME minor: code below can be optimized to check slot_val against
+ // all_slot_set(_clear) value
+
+ for (auto c = 0; c < d; c++) {
+ switch (slot_val & L1_ENTRY_MASK) {
+ case L1_ENTRY_FREE:
+ prev_tail = empty_tail;
+ if (!ctx->free_count) {
+ ctx->free_l1_pos = l1_pos;
+ } else if (l1_pos != next_free_l1_pos){
+ auto o = ctx->free_l1_pos * l1_granularity;
+ auto l = ctx->free_count * l1_granularity;
+ // check if already found extent fits min_length after alignment
+ if (_align2units(o, l, min_length).length >= min_length) {
+ break;
+ }
+ // if not - proceed with the next one
+ ctx->free_l1_pos = l1_pos;
+ ctx->free_count = 0;
+ }
+ next_free_l1_pos = l1_pos + 1;
+ ++ctx->free_count;
+ if (mode == STOP_ON_EMPTY) {
+ return;
+ }
+ break;
+ case L1_ENTRY_FULL:
+ prev_tail = empty_tail;
+ break;
+ case L1_ENTRY_PARTIAL:
+ interval_t longest;
+ ++ctx->partial_count;
+
+ longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail);
+
+ if (longest.length >= length) {
+ if ((ctx->affordable_len == 0) ||
+ ((ctx->affordable_len != 0) &&
+ (longest.length < ctx->affordable_len))) {
+ ctx->affordable_len = longest.length;
+ ctx->affordable_offs = longest.offset;
+ }
+ }
+ if (longest.length >= min_length &&
+ (ctx->min_affordable_len == 0 ||
+ (longest.length < ctx->min_affordable_len))) {
+
+ ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length);
+ ctx->min_affordable_offs = longest.offset;
+ }
+ if (mode == STOP_ON_PARTIAL) {
+ return;
+ }
+ break;
+ }
+ slot_val >>= L1_ENTRY_WIDTH;
+ ++l1_pos;
+ }
+ }
+ ctx->fully_processed = true;
+}
+
+void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end)
+{
+ if (l0_pos == l0_pos_end) {
+ return;
+ }
+ auto d0 = bits_per_slotset;
+ uint64_t l1_w = L1_ENTRIES_PER_SLOT;
+ // this should be aligned with slotset boundaries
+ ceph_assert(0 == (l0_pos % d0));
+ ceph_assert(0 == (l0_pos_end % d0));
+
+ int64_t idx = l0_pos / bits_per_slot;
+ int64_t idx_end = l0_pos_end / bits_per_slot;
+ slot_t mask_to_apply = L1_ENTRY_NOT_USED;
+
+ auto l1_pos = l0_pos / d0;
+
+ while (idx < idx_end) {
+ if (l0[idx] == all_slot_clear) {
+ // if not all prev slots are allocated then no need to check the
+ // current slot set, it's partial
+ ++idx;
+ if (mask_to_apply == L1_ENTRY_NOT_USED) {
+ mask_to_apply = L1_ENTRY_FULL;
+ } else if (mask_to_apply != L1_ENTRY_FULL) {
+ idx = p2roundup(idx, int64_t(slots_per_slotset));
+ mask_to_apply = L1_ENTRY_PARTIAL;
+ }
+ } else if (l0[idx] == all_slot_set) {
+ // if not all prev slots are free then no need to check the
+ // current slot set, it's partial
+ ++idx;
+ if (mask_to_apply == L1_ENTRY_NOT_USED) {
+ mask_to_apply = L1_ENTRY_FREE;
+ } else if (mask_to_apply != L1_ENTRY_FREE) {
+ idx = p2roundup(idx, int64_t(slots_per_slotset));
+ mask_to_apply = L1_ENTRY_PARTIAL;
+ }
+ } else {
+ // no need to check the current slot set, it's partial
+ mask_to_apply = L1_ENTRY_PARTIAL;
+ ++idx;
+ idx = p2roundup(idx, int64_t(slots_per_slotset));
+ }
+ if ((idx % slots_per_slotset) == 0) {
+ ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED);
+ uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH;
+ slot_t& slot_val = l1[l1_pos / l1_w];
+ auto mask = slot_t(L1_ENTRY_MASK) << shift;
+
+ slot_t old_mask = (slot_val & mask) >> shift;
+ switch(old_mask) {
+ case L1_ENTRY_FREE:
+ unalloc_l1_count--;
+ break;
+ case L1_ENTRY_PARTIAL:
+ partial_l1_count--;
+ break;
+ }
+ slot_val &= ~mask;
+ slot_val |= slot_t(mask_to_apply) << shift;
+ switch(mask_to_apply) {
+ case L1_ENTRY_FREE:
+ unalloc_l1_count++;
+ break;
+ case L1_ENTRY_PARTIAL:
+ partial_l1_count++;
+ break;
+ }
+ mask_to_apply = L1_ENTRY_NOT_USED;
+ ++l1_pos;
+ }
+ }
+}
+
+void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start,
+ int64_t l0_pos_end)
+{
+ auto d0 = L0_ENTRIES_PER_SLOT;
+
+ int64_t pos = l0_pos_start;
+ slot_t bits = (slot_t)1 << (l0_pos_start % d0);
+ slot_t* val_s = &l0[pos / d0];
+ int64_t pos_e = std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start + 1, d0));
+ while (pos < pos_e) {
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ pos++;
+ }
+ pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
+ while (pos < pos_e) {
+ *(++val_s) = all_slot_clear;
+ pos += d0;
+ }
+ bits = 1;
+ ++val_s;
+ while (pos < l0_pos_end) {
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ pos++;
+ }
+}
+
+interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t pos_start, uint64_t pos_end)
+{
+ interval_t res = { 0, 0 };
+ uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+
+ if (unlikely(length <= l0_granularity)) {
+ search_ctx_t ctx;
+ _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity,
+ STOP_ON_PARTIAL, &ctx);
+
+ // check partially free slot sets first (including neighboring),
+ // full length match required.
+ if (ctx.affordable_len) {
+ // allocate as specified
+ ceph_assert(ctx.affordable_len >= length);
+ auto pos = ctx.affordable_offs / l0_granularity;
+ _mark_alloc_l1_l0(pos, pos + 1);
+ res = interval_t(ctx.affordable_offs, length);
+ return res;
+ }
+
+ // allocate from free slot sets
+ if (ctx.free_count) {
+ auto l = std::min(length, ctx.free_count * l1_granularity);
+ ceph_assert((l % l0_granularity) == 0);
+ auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;
+
+ _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
+ res = interval_t(ctx.free_l1_pos * l1_granularity, l);
+ return res;
+ }
+ } else if (unlikely(length == l1_granularity)) {
+ search_ctx_t ctx;
+ _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx);
+
+ // allocate using contiguous extent found at l1 if any
+ if (ctx.free_count) {
+
+ auto l = std::min(length, ctx.free_count * l1_granularity);
+ ceph_assert((l % l0_granularity) == 0);
+ auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;
+
+ _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
+ res = interval_t(ctx.free_l1_pos * l1_granularity, l);
+
+ return res;
+ }
+
+ // we can terminate earlier on free entry only
+ ceph_assert(ctx.fully_processed);
+
+ // check partially free slot sets first (including neighboring),
+ // full length match required.
+ if (ctx.affordable_len) {
+ ceph_assert(ctx.affordable_len >= length);
+ ceph_assert((length % l0_granularity) == 0);
+ auto pos_start = ctx.affordable_offs / l0_granularity;
+ auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ res = interval_t(ctx.affordable_offs, length);
+ return res;
+ }
+ if (ctx.min_affordable_len) {
+ auto pos_start = ctx.min_affordable_offs / l0_granularity;
+ auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
+ }
+ } else {
+ search_ctx_t ctx;
+ _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx);
+ ceph_assert(ctx.fully_processed);
+ // check partially free slot sets first (including neighboring),
+ // full length match required.
+ if (ctx.affordable_len) {
+ ceph_assert(ctx.affordable_len >= length);
+ ceph_assert((length % l0_granularity) == 0);
+ auto pos_start = ctx.affordable_offs / l0_granularity;
+ auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ res = interval_t(ctx.affordable_offs, length);
+ return res;
+ }
+ // allocate using contiguous extent found at l1 if affordable
+ // align allocated extent with min_length
+ if (ctx.free_count) {
+ auto o = ctx.free_l1_pos * l1_granularity;
+ auto l = ctx.free_count * l1_granularity;
+ interval_t aligned_extent = _align2units(o, l, min_length);
+ if (aligned_extent.length > 0) {
+ aligned_extent.length = std::min(length,
+ uint64_t(aligned_extent.length));
+ ceph_assert((aligned_extent.offset % l0_granularity) == 0);
+ ceph_assert((aligned_extent.length % l0_granularity) == 0);
+
+ auto pos_start = aligned_extent.offset / l0_granularity;
+ auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity;
+
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ return aligned_extent;
+ }
+ }
+ if (ctx.min_affordable_len) {
+ auto pos_start = ctx.min_affordable_offs / l0_granularity;
+ auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
+ }
+ }
+ return res;
+}
+
+bool AllocatorLevel01Loose::_allocate_l1(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t l1_pos_start, uint64_t l1_pos_end,
+ uint64_t* allocated,
+ interval_vector_t* res)
+{
+ uint64_t d0 = L0_ENTRIES_PER_SLOT;
+ uint64_t d1 = L1_ENTRIES_PER_SLOT;
+
+ ceph_assert(0 == (l1_pos_start % (slots_per_slotset * d1)));
+ ceph_assert(0 == (l1_pos_end % (slots_per_slotset * d1)));
+ if (min_length != l0_granularity) {
+ // probably not the most effecient way but
+ // don't care much about that at the moment
+ bool has_space = true;
+ while (length > *allocated && has_space) {
+ interval_t i =
+ _allocate_l1_contiguous(length - *allocated, min_length, max_length,
+ l1_pos_start, l1_pos_end);
+ if (i.length == 0) {
+ has_space = false;
+ } else {
+ _fragment_and_emplace(max_length, i.offset, i.length, res);
+ *allocated += i.length;
+ }
+ }
+ } else {
+ uint64_t l0_w = slots_per_slotset * d0;
+
+ for (auto idx = l1_pos_start / d1;
+ idx < l1_pos_end / d1 && length > *allocated;
+ ++idx) {
+ slot_t& slot_val = l1[idx];
+ if (slot_val == all_slot_clear) {
+ continue;
+ } else if (slot_val == all_slot_set) {
+ uint64_t to_alloc = std::min(length - *allocated,
+ l1_granularity * d1);
+ *allocated += to_alloc;
+ ++alloc_fragments_fast;
+ _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc,
+ res);
+ _mark_alloc_l1_l0(idx * d1 * bits_per_slotset,
+ idx * d1 * bits_per_slotset + to_alloc / l0_granularity);
+ continue;
+ }
+ auto free_pos = find_next_set_bit(slot_val, 0);
+ ceph_assert(free_pos < bits_per_slot);
+ do {
+ ceph_assert(length > *allocated);
+
+ bool empty;
+ empty = _allocate_l0(length, max_length,
+ (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w,
+ (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w,
+ allocated,
+ res);
+
+ auto mask = slot_t(L1_ENTRY_MASK) << free_pos;
+
+ slot_t old_mask = (slot_val & mask) >> free_pos;
+ switch(old_mask) {
+ case L1_ENTRY_FREE:
+ unalloc_l1_count--;
+ break;
+ case L1_ENTRY_PARTIAL:
+ partial_l1_count--;
+ break;
+ }
+ slot_val &= ~mask;
+ if (empty) {
+ // the next line is no op with the current L1_ENTRY_FULL but left
+ // as-is for the sake of uniformity and to avoid potential errors
+ // in future
+ slot_val |= slot_t(L1_ENTRY_FULL) << free_pos;
+ } else {
+ slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos;
+ partial_l1_count++;
+ }
+ if (length <= *allocated || slot_val == all_slot_clear) {
+ break;
+ }
+ free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH);
+ } while (free_pos < bits_per_slot);
+ }
+ }
+ return _is_empty_l1(l1_pos_start, l1_pos_end);
+}
+
+void AllocatorLevel01Loose::collect_stats(
+ std::map<size_t, size_t>& bins_overall)
+{
+ size_t free_seq_cnt = 0;
+ for (auto slot : l0) {
+ if (slot == all_slot_set) {
+ free_seq_cnt += L0_ENTRIES_PER_SLOT;
+ } else if(slot != all_slot_clear) {
+ size_t pos = 0;
+ do {
+ auto pos1 = find_next_set_bit(slot, pos);
+ if (pos1 == pos) {
+ free_seq_cnt++;
+ pos = pos1 + 1;
+ } else {
+ if (free_seq_cnt) {
+ bins_overall[cbits(free_seq_cnt) - 1]++;
+ free_seq_cnt = 0;
+ }
+ if (pos1 < bits_per_slot) {
+ free_seq_cnt = 1;
+ }
+ pos = pos1 + 1;
+ }
+ } while (pos < bits_per_slot);
+ } else if (free_seq_cnt) {
+ bins_overall[cbits(free_seq_cnt) - 1]++;
+ free_seq_cnt = 0;
+ }
+ }
+ if (free_seq_cnt) {
+ bins_overall[cbits(free_seq_cnt) - 1]++;
+ }
+}
+
+inline ssize_t AllocatorLevel01Loose::count_0s(slot_t slot_val, size_t start_pos)
+ {
+ #ifdef __GNUC__
+ size_t pos = __builtin_ffsll(slot_val >> start_pos);
+ if (pos == 0)
+ return sizeof(slot_t)*8 - start_pos;
+ return pos - 1;
+ #else
+ size_t pos = start_pos;
+ slot_t mask = slot_t(1) << pos;
+ while (pos < bits_per_slot && (slot_val & mask) == 0) {
+ mask <<= 1;
+ pos++;
+ }
+ return pos - start_pos;
+ #endif
+ }
+
+ inline ssize_t AllocatorLevel01Loose::count_1s(slot_t slot_val, size_t start_pos)
+ {
+ return count_0s(~slot_val, start_pos);
+ }
+void AllocatorLevel01Loose::dump(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ size_t len = 0;
+ size_t off = 0;
+ for (size_t i = 0; i < l1.size(); i++)
+ {
+ for (size_t j = 0; j < L1_ENTRIES_PER_SLOT * L1_ENTRY_WIDTH; j += L1_ENTRY_WIDTH)
+ {
+ size_t w = (l1[i] >> j) & L1_ENTRY_MASK;
+ switch (w) {
+ case L1_ENTRY_FULL:
+ if (len > 0) {
+ notify(off, len);
+ len = 0;
+ }
+ break;
+ case L1_ENTRY_FREE:
+ if (len == 0)
+ off = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset * bits_per_slot;
+ len += bits_per_slotset;
+ break;
+ case L1_ENTRY_PARTIAL:
+ size_t pos = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset;
+ for (size_t t = 0; t < slots_per_slotset; t++) {
+ size_t p = 0;
+ slot_t allocation_pattern = l0[pos + t];
+ while (p < bits_per_slot) {
+ if (len == 0) {
+ //continue to skip allocated space, meaning bits set to 0
+ ssize_t alloc_count = count_0s(allocation_pattern, p);
+ p += alloc_count;
+ //now we are switched to expecting free space
+ if (p < bits_per_slot) {
+ //now @p are 1s
+ ssize_t free_count = count_1s(allocation_pattern, p);
+ assert(free_count > 0);
+ len = free_count;
+ off = (pos + t) * bits_per_slot + p;
+ p += free_count;
+ }
+ } else {
+ //continue free region
+ ssize_t free_count = count_1s(allocation_pattern, p);
+ if (free_count == 0) {
+ notify(off, len);
+ len = 0;
+ } else {
+ p += free_count;
+ len += free_count;
+ }
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ if (len > 0)
+ notify(off, len);
+}
+
+uint64_t AllocatorLevel01Loose::_claim_free_to_left_l0(int64_t l0_pos_start)
+{
+ int64_t d0 = L0_ENTRIES_PER_SLOT;
+
+ int64_t pos = l0_pos_start - 1;
+ slot_t bits = (slot_t)1 << (pos % d0);
+ int64_t idx = pos / d0;
+ slot_t* val_s = l0.data() + idx;
+
+ int64_t pos_e = p2align<int64_t>(pos, d0);
+
+ while (pos >= pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos + 1;
+ (*val_s) &= ~bits;
+ bits >>= 1;
+ --pos;
+ }
+ --idx;
+ val_s = l0.data() + idx;
+ while (idx >= 0 && (*val_s) == all_slot_set) {
+ *val_s = all_slot_clear;
+ --idx;
+ pos -= d0;
+ val_s = l0.data() + idx;
+ }
+
+ if (idx >= 0 &&
+ (*val_s) != all_slot_set && (*val_s) != all_slot_clear) {
+ int64_t pos_e = p2align<int64_t>(pos, d0);
+ slot_t bits = (slot_t)1 << (pos % d0);
+ while (pos >= pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos + 1;
+ (*val_s) &= ~bits;
+ bits >>= 1;
+ --pos;
+ }
+ }
+ return pos + 1;
+}
+
+uint64_t AllocatorLevel01Loose::_claim_free_to_right_l0(int64_t l0_pos_start)
+{
+ auto d0 = L0_ENTRIES_PER_SLOT;
+
+ int64_t pos = l0_pos_start;
+ slot_t bits = (slot_t)1 << (pos % d0);
+ size_t idx = pos / d0;
+ if (idx >= l0.size()) {
+ return pos;
+ }
+ slot_t* val_s = l0.data() + idx;
+
+ int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
+
+ while (pos < pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos;
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ ++pos;
+ }
+ ++idx;
+ val_s = l0.data() + idx;
+ while (idx < l0.size() && (*val_s) == all_slot_set) {
+ *val_s = all_slot_clear;
+ ++idx;
+ pos += d0;
+ val_s = l0.data() + idx;
+ }
+
+ if (idx < l0.size() &&
+ (*val_s) != all_slot_set && (*val_s) != all_slot_clear) {
+ int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
+ slot_t bits = (slot_t)1 << (pos % d0);
+ while (pos < pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos;
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ ++pos;
+ }
+ }
+ return pos;
+}
diff --git a/src/os/bluestore/fastbmap_allocator_impl.h b/src/os/bluestore/fastbmap_allocator_impl.h
new file mode 100755
index 00000000..52a1edee
--- /dev/null
+++ b/src/os/bluestore/fastbmap_allocator_impl.h
@@ -0,0 +1,833 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator implementation.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ *
+ */
+
+#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H
+#define __FAST_BITMAP_ALLOCATOR_IMPL_H
+#include "include/intarith.h"
+
+#include <vector>
+#include <algorithm>
+#include <mutex>
+
+typedef uint64_t slot_t;
+
+#ifdef NON_CEPH_BUILD
+#include <assert.h>
+struct interval_t
+{
+ uint64_t offset = 0;
+ uint64_t length = 0;
+
+ interval_t() {}
+ interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+ interval_t(const interval_t &ext) :
+ offset(ext.offset), length(ext.length) {}
+};
+typedef std::vector<interval_t> interval_vector_t;
+typedef std::vector<slot_t> slot_vector_t;
+#else
+#include "include/ceph_assert.h"
+#include "common/likely.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+typedef bluestore_interval_t<uint64_t, uint64_t> interval_t;
+typedef PExtentVector interval_vector_t;
+
+typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t;
+
+#endif
+
+// fitting into cache line on x86_64
+static const size_t slots_per_slotset = 8; // 8 slots per set
+static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset;
+static const size_t bits_per_slot = sizeof(slot_t) * 8;
+static const size_t bits_per_slotset = slotset_bytes * 8;
+static const slot_t all_slot_set = 0xffffffffffffffff;
+static const slot_t all_slot_clear = 0;
+
+inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos)
+{
+#ifdef __GNUC__
+ if (start_pos == 0) {
+ start_pos = __builtin_ffsll(slot_val);
+ return start_pos ? start_pos - 1 : bits_per_slot;
+ }
+#endif
+ slot_t mask = slot_t(1) << start_pos;
+ while (start_pos < bits_per_slot && !(slot_val & mask)) {
+ mask <<= 1;
+ ++start_pos;
+ }
+ return start_pos;
+}
+
+
+class AllocatorLevel
+{
+protected:
+
+ virtual uint64_t _children_per_slot() const = 0;
+ virtual uint64_t _level_granularity() const = 0;
+
+public:
+ static uint64_t l0_dives;
+ static uint64_t l0_iterations;
+ static uint64_t l0_inner_iterations;
+ static uint64_t alloc_fragments;
+ static uint64_t alloc_fragments_fast;
+ static uint64_t l2_allocs;
+
+ virtual ~AllocatorLevel()
+ {}
+
+ virtual void collect_stats(
+ std::map<size_t, size_t>& bins_overall) = 0;
+
+};
+
+class AllocatorLevel01 : public AllocatorLevel
+{
+protected:
+ slot_vector_t l0; // set bit means free entry
+ slot_vector_t l1;
+ uint64_t l0_granularity = 0; // space per entry
+ uint64_t l1_granularity = 0; // space per entry
+
+ size_t partial_l1_count = 0;
+ size_t unalloc_l1_count = 0;
+
+ double get_fragmentation() const {
+ double res = 0.0;
+ auto total = unalloc_l1_count + partial_l1_count;
+ if (total) {
+ res = double(partial_l1_count) / double(total);
+ }
+ return res;
+ }
+
+ uint64_t _level_granularity() const override
+ {
+ return l1_granularity;
+ }
+
+ inline bool _is_slot_fully_allocated(uint64_t idx) const {
+ return l1[idx] == all_slot_clear;
+ }
+public:
+ inline uint64_t get_min_alloc_size() const
+ {
+ return l0_granularity;
+ }
+
+};
+
+template <class T>
+class AllocatorLevel02;
+
+class AllocatorLevel01Loose : public AllocatorLevel01
+{
+ enum {
+ L1_ENTRY_WIDTH = 2,
+ L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1,
+ L1_ENTRY_FULL = 0x00,
+ L1_ENTRY_PARTIAL = 0x01,
+ L1_ENTRY_NOT_USED = 0x02,
+ L1_ENTRY_FREE = 0x03,
+ L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, //32
+ L0_ENTRIES_PER_SLOT = bits_per_slot, // 64
+ };
+ uint64_t _children_per_slot() const override
+ {
+ return L1_ENTRIES_PER_SLOT;
+ }
+
+ interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1,
+ uint64_t min_length, interval_t* tail) const;
+
+ inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset,
+ uint64_t len,
+ interval_vector_t* res)
+ {
+ auto it = res->rbegin();
+ if (max_length) {
+ if (it != res->rend() && it->offset + it->length == offset) {
+ auto l = max_length - it->length;
+ if (l >= len) {
+ it->length += len;
+ return;
+ } else {
+ offset += l;
+ len -= l;
+ it->length += l;
+ }
+ }
+
+ while (len > max_length) {
+ res->emplace_back(offset, max_length);
+ offset += max_length;
+ len -= max_length;
+ }
+ res->emplace_back(offset, len);
+ return;
+ }
+
+ if (it != res->rend() && it->offset + it->length == offset) {
+ it->length += len;
+ } else {
+ res->emplace_back(offset, len);
+ }
+ }
+
+ bool _allocate_l0(uint64_t length,
+ uint64_t max_length,
+ uint64_t l0_pos0, uint64_t l0_pos1,
+ uint64_t* allocated,
+ interval_vector_t* res)
+ {
+ uint64_t d0 = L0_ENTRIES_PER_SLOT;
+
+ ++l0_dives;
+
+ ceph_assert(l0_pos0 < l0_pos1);
+ ceph_assert(length > *allocated);
+ ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0)));
+ ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0)));
+ ceph_assert(((length - *allocated) % l0_granularity) == 0);
+
+ uint64_t need_entries = (length - *allocated) / l0_granularity;
+
+ for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated);
+ ++idx) {
+ ++l0_iterations;
+ slot_t& slot_val = l0[idx];
+ auto base = idx * d0;
+ if (slot_val == all_slot_clear) {
+ continue;
+ } else if (slot_val == all_slot_set) {
+ uint64_t to_alloc = std::min(need_entries, d0);
+ *allocated += to_alloc * l0_granularity;
+ ++alloc_fragments;
+ need_entries -= to_alloc;
+
+ _fragment_and_emplace(max_length, base * l0_granularity,
+ to_alloc * l0_granularity, res);
+
+ if (to_alloc == d0) {
+ slot_val = all_slot_clear;
+ } else {
+ _mark_alloc_l0(base, base + to_alloc);
+ }
+ continue;
+ }
+
+ auto free_pos = find_next_set_bit(slot_val, 0);
+ ceph_assert(free_pos < bits_per_slot);
+ auto next_pos = free_pos + 1;
+ while (next_pos < bits_per_slot &&
+ (next_pos - free_pos) < need_entries) {
+ ++l0_inner_iterations;
+
+ if (0 == (slot_val & (slot_t(1) << next_pos))) {
+ auto to_alloc = (next_pos - free_pos);
+ *allocated += to_alloc * l0_granularity;
+ ++alloc_fragments;
+ need_entries -= to_alloc;
+ _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
+ to_alloc * l0_granularity, res);
+ _mark_alloc_l0(base + free_pos, base + next_pos);
+ free_pos = find_next_set_bit(slot_val, next_pos + 1);
+ next_pos = free_pos + 1;
+ } else {
+ ++next_pos;
+ }
+ }
+ if (need_entries && free_pos < bits_per_slot) {
+ auto to_alloc = std::min(need_entries, d0 - free_pos);
+ *allocated += to_alloc * l0_granularity;
+ ++alloc_fragments;
+ need_entries -= to_alloc;
+ _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
+ to_alloc * l0_granularity, res);
+ _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc);
+ }
+ }
+ return _is_empty_l0(l0_pos0, l0_pos1);
+ }
+
+protected:
+
+ friend class AllocatorLevel02<AllocatorLevel01Loose>;
+
+ void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
+ {
+ l0_granularity = _alloc_unit;
+ // 512 bits at L0 mapped to L1 entry
+ l1_granularity = l0_granularity * bits_per_slotset;
+
+ // capacity to have slot alignment at l1
+ auto aligned_capacity =
+ p2roundup((int64_t)capacity,
+ int64_t(l1_granularity * slots_per_slotset * _children_per_slot()));
+ size_t slot_count =
+ aligned_capacity / l1_granularity / _children_per_slot();
+ // we use set bit(s) as a marker for (partially) free entry
+ l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear);
+
+ // l0 slot count
+ size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot;
+ // we use set bit(s) as a marker for (partially) free entry
+ l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear);
+
+ partial_l1_count = unalloc_l1_count = 0;
+ if (mark_as_free) {
+ unalloc_l1_count = slot_count * _children_per_slot();
+ auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity;
+ _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity);
+ }
+ }
+
+ struct search_ctx_t
+ {
+ size_t partial_count = 0;
+ size_t free_count = 0;
+ uint64_t free_l1_pos = 0;
+
+ uint64_t min_affordable_len = 0;
+ uint64_t min_affordable_offs = 0;
+ uint64_t affordable_len = 0;
+ uint64_t affordable_offs = 0;
+
+ bool fully_processed = false;
+
+ void reset()
+ {
+ *this = search_ctx_t();
+ }
+ };
+ enum {
+ NO_STOP,
+ STOP_ON_EMPTY,
+ STOP_ON_PARTIAL,
+ };
+ void _analyze_partials(uint64_t pos_start, uint64_t pos_end,
+ uint64_t length, uint64_t min_length, int mode,
+ search_ctx_t* ctx);
+
+ void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end);
+ void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end);
+ uint64_t _claim_free_to_left_l0(int64_t l0_pos_start);
+ uint64_t _claim_free_to_right_l0(int64_t l0_pos_start);
+
+
+ void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+ {
+ _mark_alloc_l0(l0_pos_start, l0_pos_end);
+ l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
+ l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
+ _mark_l1_on_l0(l0_pos_start, l0_pos_end);
+ }
+
+ void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+ {
+ auto d0 = L0_ENTRIES_PER_SLOT;
+
+ auto pos = l0_pos_start;
+ slot_t bits = (slot_t)1 << (l0_pos_start % d0);
+ slot_t* val_s = &l0[pos / d0];
+ int64_t pos_e = std::min(l0_pos_end,
+ p2roundup<int64_t>(l0_pos_start + 1, d0));
+ while (pos < pos_e) {
+ *val_s |= bits;
+ bits <<= 1;
+ pos++;
+ }
+ pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
+ while (pos < pos_e) {
+ *(++val_s) = all_slot_set;
+ pos += d0;
+ }
+ bits = 1;
+ ++val_s;
+ while (pos < l0_pos_end) {
+ *val_s |= bits;
+ bits <<= 1;
+ pos++;
+ }
+ }
+
+ void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+ {
+ _mark_free_l0(l0_pos_start, l0_pos_end);
+ l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
+ l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
+ _mark_l1_on_l0(l0_pos_start, l0_pos_end);
+ }
+
+ bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end)
+ {
+ bool no_free = true;
+ uint64_t d = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+ ceph_assert(0 == (l0_pos % d));
+ ceph_assert(0 == (l0_pos_end % d));
+
+ auto idx = l0_pos / L0_ENTRIES_PER_SLOT;
+ auto idx_end = l0_pos_end / L0_ENTRIES_PER_SLOT;
+ while (idx < idx_end && no_free) {
+ no_free = l0[idx] == all_slot_clear;
+ ++idx;
+ }
+ return no_free;
+ }
+ bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end)
+ {
+ bool no_free = true;
+ uint64_t d = slots_per_slotset * _children_per_slot();
+ ceph_assert(0 == (l1_pos % d));
+ ceph_assert(0 == (l1_pos_end % d));
+
+ auto idx = l1_pos / L1_ENTRIES_PER_SLOT;
+ auto idx_end = l1_pos_end / L1_ENTRIES_PER_SLOT;
+ while (idx < idx_end && no_free) {
+ no_free = _is_slot_fully_allocated(idx);
+ ++idx;
+ }
+ return no_free;
+ }
+
+ interval_t _allocate_l1_contiguous(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t pos_start, uint64_t pos_end);
+
+ bool _allocate_l1(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t l1_pos_start, uint64_t l1_pos_end,
+ uint64_t* allocated,
+ interval_vector_t* res);
+
+ uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length)
+ {
+ uint64_t l0_pos_start = offset / l0_granularity;
+ uint64_t l0_pos_end = p2roundup(offset + length, l0_granularity) / l0_granularity;
+ _mark_alloc_l1_l0(l0_pos_start, l0_pos_end);
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+
+ uint64_t _free_l1(uint64_t offs, uint64_t len)
+ {
+ uint64_t l0_pos_start = offs / l0_granularity;
+ uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity;
+ _mark_free_l1_l0(l0_pos_start, l0_pos_end);
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+
+ uint64_t claim_free_to_left_l1(uint64_t offs)
+ {
+ uint64_t l0_pos_end = offs / l0_granularity;
+ uint64_t l0_pos_start = _claim_free_to_left_l0(l0_pos_end);
+ if (l0_pos_start < l0_pos_end) {
+ _mark_l1_on_l0(
+ p2align(l0_pos_start, uint64_t(bits_per_slotset)),
+ p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+ return 0;
+ }
+
+ uint64_t claim_free_to_right_l1(uint64_t offs)
+ {
+ uint64_t l0_pos_start = offs / l0_granularity;
+ uint64_t l0_pos_end = _claim_free_to_right_l0(l0_pos_start);
+
+ if (l0_pos_start < l0_pos_end) {
+ _mark_l1_on_l0(
+ p2align(l0_pos_start, uint64_t(bits_per_slotset)),
+ p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+ return 0;
+ }
+
+
+public:
+ uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
+ {
+ if (pos1 == 0) {
+ pos1 = l1.size() * L1_ENTRIES_PER_SLOT;
+ }
+ auto avail = debug_get_free(pos0, pos1);
+ return (pos1 - pos0) * l1_granularity - avail;
+ }
+
+ uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0)
+ {
+ ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT));
+ ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT));
+
+ auto idx0 = l1_pos0 * slots_per_slotset;
+ auto idx1 = l1_pos1 * slots_per_slotset;
+
+ if (idx1 == 0) {
+ idx1 = l0.size();
+ }
+
+ uint64_t res = 0;
+ for (uint64_t i = idx0; i < idx1; ++i) {
+ auto v = l0[i];
+ if (v == all_slot_set) {
+ res += L0_ENTRIES_PER_SLOT;
+ } else if (v != all_slot_clear) {
+ size_t cnt = 0;
+#ifdef __GNUC__
+ cnt = __builtin_popcountll(v);
+#else
+ // Kernighan's Alg to count set bits
+ while (v) {
+ v &= (v - 1);
+ cnt++;
+ }
+#endif
+ res += cnt;
+ }
+ }
+ return res * l0_granularity;
+ }
+ void collect_stats(
+ std::map<size_t, size_t>& bins_overall) override;
+
+ static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
+ static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
+ void dump(std::function<void(uint64_t offset, uint64_t length)> notify);
+};
+
+
+class AllocatorLevel01Compact : public AllocatorLevel01
+{
+ uint64_t _children_per_slot() const override
+ {
+ return 8;
+ }
+public:
+ void collect_stats(
+ std::map<size_t, size_t>& bins_overall) override
+ {
+ // not implemented
+ }
+};
+
+template <class L1>
+class AllocatorLevel02 : public AllocatorLevel
+{
+public:
+ uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0)
+ {
+ std::lock_guard l(lock);
+ return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot,
+ pos1 * l1._children_per_slot() * bits_per_slot);
+ }
+ uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
+ {
+ std::lock_guard l(lock);
+ return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot,
+ pos1 * l1._children_per_slot() * bits_per_slot);
+ }
+
+ uint64_t get_available()
+ {
+ std::lock_guard l(lock);
+ return available;
+ }
+ inline uint64_t get_min_alloc_size() const
+ {
+ return l1.get_min_alloc_size();
+ }
+ void collect_stats(
+ std::map<size_t, size_t>& bins_overall) override {
+
+ std::lock_guard l(lock);
+ l1.collect_stats(bins_overall);
+ }
+ uint64_t claim_free_to_left(uint64_t offset) {
+ std::lock_guard l(lock);
+ auto allocated = l1.claim_free_to_left_l1(offset);
+ ceph_assert(available >= allocated);
+ available -= allocated;
+
+ uint64_t l2_pos = (offset - allocated) / l2_granularity;
+ uint64_t l2_pos_end =
+ p2roundup(int64_t(offset), int64_t(l2_granularity)) / l2_granularity;
+ _mark_l2_on_l1(l2_pos, l2_pos_end);
+ return allocated;
+ }
+
+ uint64_t claim_free_to_right(uint64_t offset) {
+ std::lock_guard l(lock);
+ auto allocated = l1.claim_free_to_right_l1(offset);
+ ceph_assert(available >= allocated);
+ available -= allocated;
+
+ uint64_t l2_pos = (offset) / l2_granularity;
+ int64_t end = offset + allocated;
+ uint64_t l2_pos_end = p2roundup(end, int64_t(l2_granularity)) / l2_granularity;
+ _mark_l2_on_l1(l2_pos, l2_pos_end);
+ return allocated;
+ }
+protected:
+ ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock");
+ L1 l1;
+ slot_vector_t l2;
+ uint64_t l2_granularity = 0; // space per entry
+ uint64_t available = 0;
+ uint64_t last_pos = 0;
+
+ enum {
+ L1_ENTRIES_PER_SLOT = bits_per_slot, // 64
+ };
+
+ uint64_t _children_per_slot() const override
+ {
+ return L1_ENTRIES_PER_SLOT;
+ }
+ uint64_t _level_granularity() const override
+ {
+ return l2_granularity;
+ }
+
+ void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
+ {
+ ceph_assert(isp2(_alloc_unit));
+ l1._init(capacity, _alloc_unit, mark_as_free);
+
+ l2_granularity =
+ l1._level_granularity() * l1._children_per_slot() * slots_per_slotset;
+
+ // capacity to have slot alignment at l2
+ auto aligned_capacity =
+ p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT);
+ size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT;
+ // we use set bit(s) as a marker for (partially) free entry
+ l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear);
+
+ if (mark_as_free) {
+ // capacity to have slotset alignment at l1
+ auto l2_pos_no_use =
+ p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity;
+ _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity);
+ available = p2align(capacity, _alloc_unit);
+ } else {
+ available = 0;
+ }
+ }
+
+ void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end)
+ {
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(0 <= l2_pos_end);
+ ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+ while (l2_pos < l2_pos_end) {
+ l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
+ ++l2_pos;
+ }
+ }
+
+ void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end)
+ {
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(0 <= l2_pos_end);
+ ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+ while (l2_pos < l2_pos_end) {
+ l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
+ ++l2_pos;
+ }
+ }
+
+ void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end)
+ {
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(0 <= l2_pos_end);
+ ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+ auto idx = l2_pos * slots_per_slotset;
+ auto idx_end = l2_pos_end * slots_per_slotset;
+ bool all_allocated = true;
+ while (idx < idx_end) {
+ if (!l1._is_slot_fully_allocated(idx)) {
+ all_allocated = false;
+ idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset));
+ }
+ else {
+ ++idx;
+ }
+ if ((idx % slots_per_slotset) == 0) {
+ if (all_allocated) {
+ l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
+ }
+ else {
+ l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
+ }
+ all_allocated = true;
+ ++l2_pos;
+ }
+ }
+ }
+
+ void _allocate_l2(uint64_t length,
+ uint64_t min_length,
+ uint64_t max_length,
+ uint64_t hint,
+
+ uint64_t* allocated,
+ interval_vector_t* res)
+ {
+ uint64_t prev_allocated = *allocated;
+ uint64_t d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(min_length <= l2_granularity);
+ ceph_assert(max_length == 0 || max_length >= min_length);
+ ceph_assert(max_length == 0 || (max_length % min_length) == 0);
+ ceph_assert(length >= min_length);
+ ceph_assert((length % min_length) == 0);
+
+ uint64_t cap = 1ull << 31;
+ if (max_length == 0 || max_length >= cap) {
+ max_length = cap;
+ }
+
+ uint64_t l1_w = slots_per_slotset * l1._children_per_slot();
+
+ std::lock_guard l(lock);
+
+ if (available < min_length) {
+ return;
+ }
+ if (hint != 0) {
+ last_pos = (hint / (d * l2_granularity)) < l2.size() ? p2align(hint / l2_granularity, d) : 0;
+ }
+ auto l2_pos = last_pos;
+ auto last_pos0 = last_pos;
+ auto pos = last_pos / d;
+ auto pos_end = l2.size();
+ // outer loop below is intended to optimize the performance by
+ // avoiding 'modulo' operations inside the internal loop.
+ // Looks like they have negative impact on the performance
+ for (auto i = 0; i < 2; ++i) {
+ for(; length > *allocated && pos < pos_end; ++pos) {
+ slot_t& slot_val = l2[pos];
+ size_t free_pos = 0;
+ bool all_set = false;
+ if (slot_val == all_slot_clear) {
+ l2_pos += d;
+ last_pos = l2_pos;
+ continue;
+ } else if (slot_val == all_slot_set) {
+ free_pos = 0;
+ all_set = true;
+ } else {
+ free_pos = find_next_set_bit(slot_val, 0);
+ ceph_assert(free_pos < bits_per_slot);
+ }
+ do {
+ ceph_assert(length > *allocated);
+ bool empty = l1._allocate_l1(length,
+ min_length,
+ max_length,
+ (l2_pos + free_pos) * l1_w,
+ (l2_pos + free_pos + 1) * l1_w,
+ allocated,
+ res);
+ if (empty) {
+ slot_val &= ~(slot_t(1) << free_pos);
+ }
+ if (length <= *allocated || slot_val == all_slot_clear) {
+ break;
+ }
+ ++free_pos;
+ if (!all_set) {
+ free_pos = find_next_set_bit(slot_val, free_pos);
+ }
+ } while (free_pos < bits_per_slot);
+ last_pos = l2_pos;
+ l2_pos += d;
+ }
+ l2_pos = 0;
+ pos = 0;
+ pos_end = last_pos0 / d;
+ }
+
+ ++l2_allocs;
+ auto allocated_here = *allocated - prev_allocated;
+ ceph_assert(available >= allocated_here);
+ available -= allocated_here;
+ }
+
+#ifndef NON_CEPH_BUILD
+ // to provide compatibility with BlueStore's allocator interface
+ void _free_l2(const interval_set<uint64_t> & rr)
+ {
+ uint64_t released = 0;
+ std::lock_guard l(lock);
+ for (auto r : rr) {
+ released += l1._free_l1(r.first, r.second);
+ uint64_t l2_pos = r.first / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity;
+
+ _mark_l2_free(l2_pos, l2_pos_end);
+ }
+ available += released;
+ }
+#endif
+
+ template <typename T>
+ void _free_l2(const T& rr)
+ {
+ uint64_t released = 0;
+ std::lock_guard l(lock);
+ for (auto r : rr) {
+ released += l1._free_l1(r.offset, r.length);
+ uint64_t l2_pos = r.offset / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity;
+
+ _mark_l2_free(l2_pos, l2_pos_end);
+ }
+ available += released;
+ }
+
+ void _mark_allocated(uint64_t o, uint64_t len)
+ {
+ uint64_t l2_pos = o / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;
+
+ std::lock_guard l(lock);
+ auto allocated = l1._mark_alloc_l1(o, len);
+ ceph_assert(available >= allocated);
+ available -= allocated;
+ _mark_l2_on_l1(l2_pos, l2_pos_end);
+ }
+
+ void _mark_free(uint64_t o, uint64_t len)
+ {
+ uint64_t l2_pos = o / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;
+
+ std::lock_guard l(lock);
+ available += l1._free_l1(o, len);
+ _mark_l2_free(l2_pos, l2_pos_end);
+ }
+ void _shutdown()
+ {
+ last_pos = 0;
+ }
+ double _get_fragmentation() {
+ std::lock_guard l(lock);
+ return l1.get_fragmentation();
+ }
+};
+
+#endif
diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc
new file mode 100644
index 00000000..2ff2000d
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.cc
@@ -0,0 +1,575 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#ifndef __CYGWIN__
+#include "os/fs/btrfs_ioctl.h"
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "BtrfsFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+
+#if defined(__linux__)
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), has_clone_range(false),
+ has_snap_create(false), has_snap_destroy(false),
+ has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
+ m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range),
+ m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { }
+
+int BtrfsFileStoreBackend::detect_features()
+{
+ int r;
+
+ r = GenericFileStoreBackend::detect_features();
+ if (r < 0)
+ return r;
+
+ // clone_range?
+ if (m_filestore_btrfs_clone_range) {
+ int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600);
+ if (fd >= 0) {
+ if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ btrfs_ioctl_clone_range_args clone_args;
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.src_fd = -1;
+ r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
+ if (r < 0 && errno == EBADF) {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
+ has_clone_range = true;
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ } else {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
+ }
+
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+
+ // create test source volume
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
+ if (r != 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+ int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC);
+ if (srcfd < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ // snap_create and snap_destroy?
+ vol_args.fd = srcfd;
+ strcpy(vol_args.name, "sync_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ int err = errno;
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
+ has_snap_create = true;
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r == 0) {
+ dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
+ has_snap_destroy = true;
+ } else {
+ err = -errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+
+ if (err == -EPERM && getuid() != 0) {
+ dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
+ cerr << TEXT_YELLOW
+ << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
+ << TEXT_NORMAL << std::endl;
+ } else if (err == -EOPNOTSUPP) {
+ derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
+ }
+ }
+ } else {
+ dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap) {
+ if (has_snap_destroy)
+ stable_commits = true;
+ else
+ dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
+ }
+
+ // start_sync?
+ __u64 transid = 0;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
+ }
+ if (r == 0 && transid > 0) {
+ dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
+
+ // do we have wait_sync too?
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (r == 0 || errno == ERANGE) {
+ dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
+ has_wait_sync = true;
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+
+ if (has_wait_sync) {
+ // async snap creation?
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = srcfd;
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+ strcpy(async_args.name, "async_snap_test");
+
+ // remove old one, first
+ struct stat st;
+ strcpy(vol_args.name, async_args.name);
+ if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
+ dout(0) << "detect_feature: removing old async_snap_test" << dendl;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
+ has_snap_create_v2 = true;
+
+ // clean up
+ strcpy(vol_args.name, "async_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ // clean up test subvol
+ if (srcfd >= 0)
+ TEMP_FAILURE_RETRY(::close(srcfd));
+
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap && !has_snap_create_v2) {
+ dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
+ << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
+ << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
+ << " performance.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+bool BtrfsFileStoreBackend::can_checkpoint()
+{
+ return stable_commits;
+}
+
+int BtrfsFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -EINVAL;
+ }
+
+ struct stat basest;
+ struct statfs currentfs;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::statfs(get_current_path().c_str(), &currentfs);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
+ dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
+ stable_commits = true;
+ }
+ return 0;
+ }
+
+ struct btrfs_ioctl_vol_args volargs;
+ memset(&volargs, 0, sizeof(volargs));
+
+ volargs.fd = 0;
+ strcpy(volargs.name, "current");
+ if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
+ if (::chmod(get_current_path().c_str(), 0755) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ stable_commits = true;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ int ret, err = 0;
+
+ struct stat basest;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // get snap list
+ DIR *dir = ::opendir(get_basedir_path().c_str());
+ if (!dir) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ list<string> snaps;
+ char path[PATH_MAX];
+ struct dirent *de;
+ while ((de = ::readdir(dir))) {
+ snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
+
+ struct stat st;
+ ret = ::stat(path, &st);
+ if (ret < 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: stat '" << path << "' failed: "
+ << cpp_strerror(err) << dendl;
+ break;
+ }
+
+ if (!S_ISDIR(st.st_mode))
+ continue;
+
+ struct statfs fs;
+ ret = ::statfs(path, &fs);
+ if (ret < 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
+ << cpp_strerror(err) << dendl;
+ break;
+ }
+
+ if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
+ snaps.push_back(string(de->d_name));
+ }
+
+ if (::closedir(dir) < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
+ if (!err)
+ err = ret;
+ }
+
+ if (err)
+ return err;
+
+ ls.swap(snaps);
+ return 0;
+}
+
+int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (has_snap_create_v2 && transid) {
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = get_current_fd();
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+
+ size_t name_size = sizeof(async_args.name);
+ strncpy(async_args.name, name.c_str(), name_size);
+ async_args.name[name_size-1] = '\0';
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
+ *transid = async_args.transid;
+ } else {
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = get_current_fd();
+
+ size_t name_size = sizeof(vol_args.name);
+ strncpy(vol_args.name, name.c_str(), name_size);
+ vol_args.name[name_size-1] = '\0';
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (transid)
+ *transid = 0;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
+{
+ // wait for commit
+ dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
+ return -errno;
+ }
+ dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: to '" << name << "'" << dendl;
+ char s[PATH_MAX];
+ btrfs_ioctl_vol_args vol_args;
+
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "current");
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret && errno != ENOENT) {
+ dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
+ snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
+ if (::rename(get_current_path().c_str(), s)) {
+ ret = -errno;
+ dout(0) << "rollback_to: error renaming old current subvol: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
+
+ // roll back
+ vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC);
+ if (vol_args.fd < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (ret < 0 ) {
+ ret = -errno;
+ dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(vol_args.fd));
+ return ret;
+}
+
+int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name));
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret) {
+ ret = -errno;
+ dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::syncfs()
+{
+ dout(15) << "syncfs" << dendl;
+ // do a full btrfs commit
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
+
+int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
+ size_t blk_size = get_blksize();
+ if (!has_clone_range ||
+ srcoff % blk_size != dstoff % blk_size) {
+ dout(20) << "clone_range: using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ int err = 0;
+ int r = 0;
+
+ uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
+ uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
+ if (srcoffclone >= srcoff + len) {
+ dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ uint64_t lenclone = len - (srcoffclone - srcoff);
+ if (!ALIGNED(lenclone, blk_size)) {
+ struct stat from_stat, to_stat;
+ err = ::fstat(from, &from_stat);
+ if (err) return -errno;
+ err = ::fstat(to , &to_stat);
+ if (err) return -errno;
+
+ if (srcoff + len != (uint64_t)from_stat.st_size ||
+ dstoff + len < (uint64_t)to_stat.st_size) {
+ // Not to the end of the file, need to align length as well
+ lenclone = ALIGN_DOWN(lenclone, blk_size);
+ }
+ }
+ if (lenclone == 0) {
+ // too short
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
+ << " to " << dstoffclone << " = " << r << dendl;
+ btrfs_ioctl_clone_range_args a;
+ a.src_fd = from;
+ a.src_offset = srcoffclone;
+ a.src_length = lenclone;
+ a.dest_offset = dstoffclone;
+ err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
+ if (err >= 0) {
+ r += err;
+ } else if (errno == EINVAL) {
+ // Still failed, might be compressed
+ dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return -errno;
+ }
+
+ // Take care any trimmed from front
+ if (srcoffclone != srcoff) {
+ err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+
+ // Copy end
+ if (srcoffclone + lenclone != srcoff + len) {
+ err = _copy_range(from, to,
+ srcoffclone + lenclone,
+ (srcoff + len) - (srcoffclone + lenclone),
+ dstoffclone + lenclone);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+ dout(20) << "clone_range: finished " << srcoff << "~" << len
+ << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+#endif
diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h
new file mode 100644
index 00000000..0794be2d
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BTRFSFILESTOREBACKEDN_H
+#define CEPH_BTRFSFILESTOREBACKEDN_H
+
+#if defined(__linux__)
+#include "GenericFileStoreBackend.h"
+
+class BtrfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool has_clone_range; ///< clone range ioctl is supported
+ bool has_snap_create; ///< snap create ioctl is supported
+ bool has_snap_destroy; ///< snap destroy ioctl is supported
+ bool has_snap_create_v2; ///< snap create v2 ioctl (async!) is supported
+ bool has_wait_sync; ///< wait sync ioctl is supported
+ bool stable_commits;
+ bool m_filestore_btrfs_clone_range;
+ bool m_filestore_btrfs_snap;
+public:
+ explicit BtrfsFileStoreBackend(FileStore *fs);
+ ~BtrfsFileStoreBackend() override {}
+ const char *get_name() override {
+ return "btrfs";
+ }
+ int detect_features() override;
+ bool can_checkpoint() override;
+ int create_current() override;
+ int list_checkpoints(list<string>& ls) override;
+ int create_checkpoint(const string& name, uint64_t *cid) override;
+ int sync_checkpoint(uint64_t cid) override;
+ int rollback_to(const string& name) override;
+ int destroy_checkpoint(const string& name) override;
+ int syncfs() override;
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override;
+};
+#endif
+#endif
diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h
new file mode 100644
index 00000000..eb43e47d
--- /dev/null
+++ b/src/os/filestore/CollectionIndex.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OS_COLLECTIONINDEX_H
+#define OS_COLLECTIONINDEX_H
+
+#include <string>
+#include <vector>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/RWLock.h"
+
+/**
+ CollectionIndex provides an interface for manipulating indexed collections
+ */
+class CollectionIndex {
+public:
+ CephContext* cct;
+protected:
+ /**
+ * Object encapsulating a returned path.
+ *
+ * A path to an object (existent or non-existent) becomes invalid
+ * when a different object is created in the index. Path stores
+ * a shared_ptr to the CollectionIndex to keep the index alive
+ * during its lifetime.
+ * @see IndexManager
+ * @see self_ref
+ * @see set_ref
+ */
+ class Path {
+ public:
+ /// Returned path
+ string full_path;
+ /// Ref to parent Index
+ CollectionIndex* parent_ref;
+ /// coll_t for parent Index
+ coll_t parent_coll;
+
+ /// Normal Constructor
+ Path(
+ string path, ///< [in] Path to return.
+ CollectionIndex* ref)
+ : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {}
+
+ /// Debugging Constructor
+ Path(
+ string path, ///< [in] Path to return.
+ const coll_t& coll) ///< [in] collection
+ : full_path(path), parent_coll(coll) {}
+
+ /// Getter for the stored path.
+ const char *path() const { return full_path.c_str(); }
+
+ /// Getter for collection
+ const coll_t& coll() const { return parent_coll; }
+
+ /// Getter for parent
+ CollectionIndex* get_index() const {
+ return parent_ref;
+ }
+ };
+ public:
+
+ RWLock access_lock;
+ /// Type of returned paths
+ typedef std::shared_ptr<Path> IndexedPath;
+
+ static IndexedPath get_testing_path(string path, coll_t collection) {
+ return std::make_shared<Path>(path, collection);
+ }
+
+ static const uint32_t FLAT_INDEX_TAG = 0;
+ static const uint32_t HASH_INDEX_TAG = 1;
+ static const uint32_t HASH_INDEX_TAG_2 = 2;
+ static const uint32_t HOBJECT_WITH_POOL = 3;
+ /**
+ * For tracking Filestore collection versions.
+ *
+ * @return Collection version represented by the Index implementation
+ */
+ virtual uint32_t collection_version() = 0;
+
+ /**
+ * Returns the collection managed by this CollectionIndex
+ */
+ virtual coll_t coll() const = 0;
+
+
+ /**
+ * Initializes the index.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int init() = 0;
+
+ /**
+ * Cleanup before replaying journal
+ *
+ * Index implementations may need to perform compound operations
+ * which may leave the collection unstable if interrupted. cleanup
+ * is called on mount to allow the CollectionIndex implementation
+ * to stabilize.
+ *
+ * @see HashIndex
+ * @return Error Code, 0 for success
+ */
+ virtual int cleanup() = 0;
+
+ /**
+ * Call when a file is created using a path returned from lookup.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int created(
+ const ghobject_t &oid, ///< [in] Created object.
+ const char *path ///< [in] Path to created object.
+ ) = 0;
+
+ /**
+ * Removes oid from the collection
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int unlink(
+ const ghobject_t &oid ///< [in] Object to remove
+ ) = 0;
+
+ /**
+ * Gets the IndexedPath for oid.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int lookup(
+ const ghobject_t &oid, ///< [in] Object to lookup
+ IndexedPath *path, ///< [out] Path to object
+ int *hardlink ///< [out] number of hard links of this object. *hardlink=0 mean object no-exist.
+ ) = 0;
+
+ /**
+ * Moves objects matching @e match in the lsb @e bits
+ *
+ * dest and this must be the same subclass
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ CollectionIndex* dest //< [in] destination index
+ ) { ceph_abort(); return 0; }
+
+ virtual int merge(
+ uint32_t bits, //< [in] common (target) bits
+ CollectionIndex* dest //< [in] destination index
+ ) { ceph_abort(); return 0; }
+
+
+ /// List contents of collection by hash
+ virtual int collection_list_partial(
+ const ghobject_t &start, ///< [in] object at which to start
+ const ghobject_t &end, ///< [in] list only objects < end
+ int max_count, ///< [in] return at most max_count objects
+ vector<ghobject_t> *ls, ///< [out] Listed objects
+ ghobject_t *next ///< [out] Next object to list
+ ) = 0;
+
+ /// Call prior to removing directory
+ virtual int prep_delete() { return 0; }
+
+ CollectionIndex(CephContext* cct, const coll_t& collection)
+ : cct(cct), access_lock("CollectionIndex::access_lock", true, false) {}
+
+ /*
+ * Pre-hash the collection, this collection should map to a PG folder.
+ *
+ * @param pg_num - pg number of the pool this collection belongs to.
+ * @param expected_num_objs - expected number of objects in this collection.
+ * @Return 0 on success, an error code otherwise.
+ */
+ virtual int pre_hash_collection(
+ uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to
+ uint64_t expected_num_objs ///< [in] expected number of objects this collection has
+ ) { ceph_abort(); return 0; }
+
+ virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; }
+
+ /// Read index-wide settings (should be called after construction)
+ virtual int read_settings() { return 0; }
+
+ /// Virtual destructor
+ virtual ~CollectionIndex() {}
+};
+
+#endif
diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc
new file mode 100644
index 00000000..5a057014
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.cc
@@ -0,0 +1,1415 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "DBObjectMap.h"
+#include <errno.h>
+
+#include "common/debug.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore "
+
+const string DBObjectMap::USER_PREFIX = "_USER_";
+const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";
+const string DBObjectMap::SYS_PREFIX = "_SYS_";
+const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";
+const string DBObjectMap::HEADER_KEY = "HEADER";
+const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";
+const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";
+const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";
+
+// Legacy
+const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
+const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
+
+static void append_escaped(const string &in, string *out)
+{
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if (*i == '%') {
+ out->push_back('%');
+ out->push_back('p');
+ } else if (*i == '.') {
+ out->push_back('%');
+ out->push_back('e');
+ } else if (*i == '_') {
+ out->push_back('%');
+ out->push_back('u');
+ } else {
+ out->push_back(*i);
+ }
+ }
+}
+
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
+{
+ int errors = 0, comp_errors = 0;
+ bool repaired = false;
+ map<uint64_t, uint64_t> parent_to_num_children;
+ map<uint64_t, uint64_t> parent_to_actual_num_children;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ _Header header;
+ bufferlist bl = iter->value();
+ while (true) {
+ auto bliter = bl.cbegin();
+ header.decode(bliter);
+ if (header.seq != 0)
+ parent_to_actual_num_children[header.seq] = header.num_children;
+
+ if (state.v == 2 || force) {
+ // Check complete table
+ bool complete_error = false;
+ boost::optional<string> prev;
+ KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ if (prev && prev >= complete_iter->key()) {
+ out << "Bad complete for " << header.oid << std::endl;
+ complete_error = true;
+ break;
+ }
+ prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
+ }
+ if (complete_error) {
+ out << "Complete mapping for " << header.seq << " :" << std::endl;
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+ }
+ if (repair) {
+ repaired = true;
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ db->submit_transaction(t);
+ out << "Cleared complete mapping to repair" << std::endl;
+ } else {
+ errors++; // Only count when not repaired
+ comp_errors++; // Track errors here for version update
+ }
+ }
+ }
+
+ if (header.parent == 0)
+ break;
+
+ if (!parent_to_num_children.count(header.parent))
+ parent_to_num_children[header.parent] = 0;
+ parent_to_num_children[header.parent]++;
+ if (parent_to_actual_num_children.count(header.parent))
+ break;
+
+ set<string> to_get;
+ map<string, bufferlist> got;
+ to_get.insert(HEADER_KEY);
+ db->get(sys_parent_prefix(header), to_get, &got);
+ if (got.empty()) {
+ out << "Missing: seq " << header.parent << std::endl;
+ errors++;
+ break;
+ } else {
+ bl = got.begin()->second;
+ }
+ }
+ }
+
+ for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
+ i != parent_to_num_children.end();
+ parent_to_num_children.erase(i++)) {
+ if (!parent_to_actual_num_children.count(i->first))
+ continue;
+ if (parent_to_actual_num_children[i->first] != i->second) {
+ out << "Invalid: seq " << i->first << " recorded children: "
+ << parent_to_actual_num_children[i->first] << " found: "
+ << i->second << std::endl;
+ errors++;
+ }
+ parent_to_actual_num_children.erase(i->first);
+ }
+
+ // Only advance the version from 2 to 3 here
+ // Mark as legacy because there are still older structures
+ // we don't update. The value of legacy is only used
+ // for internal assertions.
+ if (comp_errors == 0 && state.v == 2 && repair) {
+ state.v = 3;
+ state.legacy = true;
+ set_state();
+ }
+
+ if (errors == 0 && repaired)
+ return -1;
+ return errors;
+}
+
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
+{
+ string out;
+ append_escaped(oid.hobj.oid.name, &out);
+ out.push_back('.');
+ append_escaped(oid.hobj.get_key(), &out);
+ out.push_back('.');
+ append_escaped(oid.hobj.nspace, &out);
+ out.push_back('.');
+
+ char snap_with_hash[1000];
+ char *t = snap_with_hash;
+ char *end = t + sizeof(snap_with_hash);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+
+ if (oid.hobj.pool == -1)
+ t += snprintf(t, end - t, ".none");
+ else
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
+ t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash());
+
+ if (oid.generation != ghobject_t::NO_GEN ||
+ oid.shard_id != shard_id_t::NO_SHARD) {
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
+ t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
+ }
+ out += string(snap_with_hash);
+ return out;
+}
+
+// ok: pglog%u3%efs1...0.none.0017B237
+// bad: plana8923501-10...4c.3.ffffffffffffffff.2
+// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
+// returns 0 for false, 1 for true, negative for error
+int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct,
+ const string &in)
+{
+ int dots = 5; // skip 5 .'s
+ const char *s = in.c_str();
+ do {
+ while (*s && *s != '.')
+ ++s;
+ if (!*s) {
+ derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ ++s;
+ } while (*s && --dots);
+ if (!*s) {
+ derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ // we are now either at a hash value (32 bits, 8 chars) or a generation
+ // value (64 bits) '.' and shard id. count the dots!
+ int len = 0;
+ while (*s && *s != '.') {
+ ++s;
+ ++len;
+ }
+ if (*s == '\0') {
+ if (len != 8) {
+ derr << "hash value is not 8 chars" << dendl;
+ return -EINVAL; // the hash value is always 8 chars.
+ }
+ return 0;
+ }
+ if (*s != '.') { // the shard follows.
+ derr << "missing final . and shard id at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ return 1;
+}
+
+
+string DBObjectMap::map_header_key(const ghobject_t &oid)
+{
+ return ghobject_key(oid);
+}
+
+string DBObjectMap::header_key(uint64_t seq)
+{
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq);
+ return string(buf);
+}
+
+string DBObjectMap::complete_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX;
+}
+
+string DBObjectMap::user_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + USER_PREFIX;
+}
+
+string DBObjectMap::sys_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + SYS_PREFIX;
+}
+
+string DBObjectMap::xattr_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX;
+}
+
+string DBObjectMap::sys_parent_prefix(_Header header)
+{
+ return USER_PREFIX + header_key(header.parent) + SYS_PREFIX;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::init()
+{
+ invalid = false;
+ if (ready) {
+ return 0;
+ }
+ ceph_assert(!parent_iter);
+ if (header->parent) {
+ Header parent = map->lookup_parent(header);
+ if (!parent) {
+ ceph_abort();
+ return -EINVAL;
+ }
+ parent_iter = std::make_shared<DBObjectMapIteratorImpl>(map, parent);
+ }
+ key_iter = map->db->get_iterator(map->user_prefix(header));
+ ceph_assert(key_iter);
+ complete_iter = map->db->get_iterator(map->complete_prefix(header));
+ ceph_assert(complete_iter);
+ cur_iter = key_iter;
+ ceph_assert(cur_iter);
+ ready = true;
+ return 0;
+}
+
+ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
+ const ghobject_t &oid)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return ObjectMapIterator(new EmptyIteratorImpl());
+ DBObjectMapIterator iter = _get_iterator(header);
+ iter->hlock.swap(hl);
+ return iter;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->seek_to_first();
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->seek_to_first();
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->seek_to_last();
+ if (r < 0)
+ return r;
+ if (parent_iter->valid())
+ r = parent_iter->next();
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->seek_to_last();
+ if (r < 0)
+ return r;
+ if (key_iter->valid())
+ r = key_iter->next();
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->lower_bound(to);
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->lower_bound(to);
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to)
+{
+ int r = lower_bound(to);
+ if (r < 0)
+ return r;
+ if (valid() && !on_parent())
+ return next_parent();
+ else
+ return r;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->upper_bound(after);
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->upper_bound(after);
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+bool DBObjectMap::DBObjectMapIteratorImpl::valid()
+{
+ bool valid = !invalid && ready;
+ ceph_assert(!valid || cur_iter->valid());
+ return valid;
+}
+
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
+{
+ if (parent_iter && parent_iter->valid() &&
+ (!key_iter->valid() || key_iter->key() > parent_iter->key()))
+ return true;
+ return false;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::next()
+{
+ ceph_assert(cur_iter->valid());
+ ceph_assert(valid());
+ cur_iter->next();
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
+{
+ r = next();
+ if (r < 0)
+ return r;
+ while (parent_iter && parent_iter->valid() && !on_parent()) {
+ ceph_assert(valid());
+ r = lower_bound(parent_iter->key());
+ if (r < 0)
+ return r;
+ }
+
+ if (!parent_iter || !parent_iter->valid()) {
+ invalid = true;
+ }
+ return 0;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+ string *begin,
+ string *end)
+{
+ /* This is clumsy because one cannot call prev() on end(), nor can one
+ * test for == begin().
+ */
+ complete_iter->upper_bound(to_test);
+ if (complete_iter->valid()) {
+ complete_iter->prev();
+ if (!complete_iter->valid()) {
+ complete_iter->upper_bound(to_test);
+ return false;
+ }
+ } else {
+ complete_iter->seek_to_last();
+ if (!complete_iter->valid())
+ return false;
+ }
+
+ ceph_assert(complete_iter->key() <= to_test);
+ ceph_assert(complete_iter->value().length() >= 1);
+ string _end(complete_iter->value().c_str(),
+ complete_iter->value().length() - 1);
+ if (_end.empty() || _end > to_test) {
+ if (begin)
+ *begin = complete_iter->key();
+ if (end)
+ *end = _end;
+ return true;
+ } else {
+ complete_iter->next();
+ ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test);
+ return false;
+ }
+}
+
+/**
+ * Moves parent_iter to the next position both out of the complete_region and
+ * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and
+ * less than key_iter and key_iter otherwise.
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::adjust()
+{
+ string begin, end;
+ while (parent_iter && parent_iter->valid()) {
+ if (in_complete_region(parent_iter->key(), &begin, &end)) {
+ if (end.size() == 0) {
+ parent_iter->seek_to_last();
+ if (parent_iter->valid())
+ parent_iter->next();
+ } else
+ parent_iter->lower_bound(end);
+ } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) {
+ parent_iter->next();
+ } else {
+ break;
+ }
+ }
+ if (valid_parent()) {
+ cur_iter = parent_iter;
+ } else if (key_iter->valid()) {
+ cur_iter = key_iter;
+ } else {
+ invalid = true;
+ }
+ ceph_assert(invalid || cur_iter->valid());
+ return 0;
+}
+
+
+string DBObjectMap::DBObjectMapIteratorImpl::key()
+{
+ return cur_iter->key();
+}
+
+bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
+{
+ return cur_iter->value();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::status()
+{
+ return r;
+}
+
+int DBObjectMap::set_keys(const ghobject_t &oid,
+ const map<string, bufferlist> &set,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ t->set(user_prefix(header), set);
+
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::set_header(const ghobject_t &oid,
+ const bufferlist &bl,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+ _set_header(header, bl, t);
+ return db->submit_transaction(t);
+}
+
+void DBObjectMap::_set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t)
+{
+ map<string, bufferlist> to_set;
+ to_set[USER_HEADER_KEY] = bl;
+ t->set(sys_prefix(header), to_set);
+}
+
+int DBObjectMap::get_header(const ghobject_t &oid,
+ bufferlist *bl)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header) {
+ return 0;
+ }
+ return _get_header(header, bl);
+}
+
+int DBObjectMap::_get_header(Header header,
+ bufferlist *bl)
+{
+ map<string, bufferlist> out;
+ while (true) {
+ out.clear();
+ set<string> to_get;
+ to_get.insert(USER_HEADER_KEY);
+ int r = db->get(sys_prefix(header), to_get, &out);
+ if (r == 0 && !out.empty())
+ break;
+ if (r < 0)
+ return r;
+ Header current(header);
+ if (!current->parent)
+ break;
+ header = lookup_parent(current);
+ }
+
+ if (!out.empty())
+ bl->swap(out.begin()->second);
+ return 0;
+}
+
+int DBObjectMap::clear(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+ remove_map_header(hl, oid, header, t);
+ ceph_assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::_clear(Header header,
+ KeyValueDB::Transaction t)
+{
+ while (1) {
+ if (header->num_children) {
+ set_header(header, t);
+ break;
+ }
+ clear_header(header, t);
+ if (!header->parent)
+ break;
+ Header parent = lookup_parent(header);
+ if (!parent) {
+ return -EINVAL;
+ }
+ ceph_assert(parent->num_children > 0);
+ parent->num_children--;
+ header.swap(parent);
+ }
+ return 0;
+}
+
+int DBObjectMap::copy_up_header(Header header,
+ KeyValueDB::Transaction t)
+{
+ bufferlist bl;
+ int r = _get_header(header, &bl);
+ if (r < 0)
+ return r;
+
+ _set_header(header, bl, t);
+ return 0;
+}
+
+int DBObjectMap::rm_keys(const ghobject_t &oid,
+ const set<string> &to_clear,
+ const SequencerPosition *spos)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->rmkeys(user_prefix(header), to_clear);
+ if (!header->parent) {
+ return db->submit_transaction(t);
+ }
+
+ ceph_assert(state.legacy);
+
+ {
+ // We only get here for legacy (v2) stores
+ // Copy up all keys from parent excluding to_clear
+ // and remove parent
+ // This eliminates a v2 format use of complete for this oid only
+ map<string, bufferlist> to_write;
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+ if (iter->status())
+ return iter->status();
+ if (!to_clear.count(iter->key()))
+ to_write[iter->key()] = iter->value();
+ }
+ t->set(user_prefix(header), to_write);
+ } // destruct iter which has parent in_use
+
+ copy_up_header(header, t);
+ Header parent = lookup_parent(header);
+ if (!parent)
+ return -EINVAL;
+ parent->num_children--;
+ _clear(parent, t);
+ header->parent = 0;
+ set_map_header(hl, oid, *header, t);
+ t->rmkeys_by_prefix(complete_prefix(header));
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::clear_keys_header(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ // save old attrs
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ map<string, bufferlist> attrs;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ attrs.insert(make_pair(iter->key(), iter->value()));
+ if (iter->status())
+ return iter->status();
+
+ // remove current header
+ remove_map_header(hl, oid, header, t);
+ ceph_assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+
+ // create new header
+ Header newheader = generate_new_header(oid, Header());
+ set_map_header(hl, oid, *newheader, t);
+ if (!attrs.empty())
+ t->set(xattr_prefix(newheader), attrs);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::get(const ghobject_t &oid,
+ bufferlist *_header,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ _get_header(header, _header);
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ out->insert(make_pair(iter->key(), iter->value()));
+ }
+ return 0;
+}
+
+int DBObjectMap::get_keys(const ghobject_t &oid,
+ set<string> *keys)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ keys->insert(iter->key());
+ }
+ return 0;
+}
+
+int DBObjectMap::scan(Header header,
+ const set<string> &in_keys,
+ set<string> *out_keys,
+ map<string, bufferlist> *out_values)
+{
+ ObjectMapIterator db_iter = _get_iterator(header);
+ for (set<string>::const_iterator key_iter = in_keys.begin();
+ key_iter != in_keys.end();
+ ++key_iter) {
+ db_iter->lower_bound(*key_iter);
+ if (db_iter->status())
+ return db_iter->status();
+ if (db_iter->valid() && db_iter->key() == *key_iter) {
+ if (out_keys)
+ out_keys->insert(*key_iter);
+ if (out_values)
+ out_values->insert(make_pair(db_iter->key(), db_iter->value()));
+ }
+ }
+ return 0;
+}
+
+int DBObjectMap::get_values(const ghobject_t &oid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return scan(header, keys, 0, out);
+}
+
+int DBObjectMap::check_keys(const ghobject_t &oid,
+ const set<string> &keys,
+ set<string> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return scan(header, keys, out, 0);
+}
+
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
+ const set<string> &to_get,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return db->get(xattr_prefix(header), to_get, out);
+}
+
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
+ set<string> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ out->insert(iter->key());
+ return iter->status();
+}
+
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
+ const map<string, bufferlist> &to_set,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->set(xattr_prefix(header), to_set);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
+ const set<string> &to_remove,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->rmkeys(xattr_prefix(header), to_remove);
+ return db->submit_transaction(t);
+}
+
+// ONLY USED FOR TESTING
+// Set version to 2 to avoid asserts
+int DBObjectMap::legacy_clone(const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos)
+{
+ state.legacy = true;
+
+ if (oid == target)
+ return 0;
+
+ MapHeaderLock _l1(this, std::min(oid, target));
+ MapHeaderLock _l2(this, std::max(oid, target));
+ MapHeaderLock *lsource, *ltarget;
+ if (oid > target) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, target);
+ if (destination) {
+ if (check_spos(target, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, target, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ Header parent = lookup_map_header(*lsource, oid);
+ if (!parent)
+ return db->submit_transaction(t);
+
+ Header source = generate_new_header(oid, parent);
+ Header destination = generate_new_header(target, parent);
+ if (spos)
+ destination->spos = *spos;
+
+ parent->num_children = 2;
+ set_header(parent, t);
+ set_map_header(*lsource, oid, *source, t);
+ set_map_header(*ltarget, target, *destination, t);
+
+ map<string, bufferlist> to_set;
+ KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
+ for (xattr_iter->seek_to_first();
+ xattr_iter->valid();
+ xattr_iter->next())
+ to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+ t->set(xattr_prefix(source), to_set);
+ t->set(xattr_prefix(destination), to_set);
+ t->rmkeys_by_prefix(xattr_prefix(parent));
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::clone(const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos)
+{
+ if (oid == target)
+ return 0;
+
+ MapHeaderLock _l1(this, std::min(oid, target));
+ MapHeaderLock _l2(this, std::max(oid, target));
+ MapHeaderLock *lsource, *ltarget;
+ if (oid > target) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, target);
+ if (destination) {
+ if (check_spos(target, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, target, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ Header source = lookup_map_header(*lsource, oid);
+ if (!source)
+ return db->submit_transaction(t);
+
+ Header destination = generate_new_header(target, Header());
+ if (spos)
+ destination->spos = *spos;
+
+ set_map_header(*ltarget, target, *destination, t);
+
+ bufferlist bl;
+ int r = _get_header(source, &bl);
+ if (r < 0)
+ return r;
+ _set_header(destination, bl, t);
+
+ map<string, bufferlist> to_set;
+ KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source));
+ for (xattr_iter->seek_to_first();
+ xattr_iter->valid();
+ xattr_iter->next())
+ to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+ t->set(xattr_prefix(destination), to_set);
+
+ map<string, bufferlist> to_write;
+ ObjectMapIterator iter = _get_iterator(source);
+ for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+ if (iter->status())
+ return iter->status();
+ to_write[iter->key()] = iter->value();
+ }
+ t->set(user_prefix(destination), to_write);
+
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::upgrade_to_v2()
+{
+ dout(1) << __func__ << " start" << dendl;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ iter->seek_to_first();
+ while (iter->valid()) {
+ unsigned count = 0;
+ KeyValueDB::Transaction t = db->get_transaction();
+ set<string> remove;
+ map<string, bufferlist> add;
+ for (;
+ iter->valid() && count < 300;
+ iter->next()) {
+ dout(20) << __func__ << " key is " << iter->key() << dendl;
+ int r = is_buggy_ghobject_key_v1(cct, iter->key());
+ if (r < 0) {
+ derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
+ return r;
+ }
+ if (!r) {
+ dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
+ continue;
+ }
+
+ // decode header to get oid
+ _Header hdr;
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ hdr.decode(bliter);
+
+ string newkey(ghobject_key(hdr.oid));
+ dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
+ add[newkey] = iter->value();
+ remove.insert(iter->key());
+ ++count;
+ }
+
+ if (!remove.empty()) {
+ dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
+ t->rmkeys(HOBJECT_TO_SEQ, remove);
+ t->set(HOBJECT_TO_SEQ, add);
+ int r = db->submit_transaction(t);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ state.v = 2;
+
+ set_state();
+ return 0;
+}
+
+void DBObjectMap::set_state()
+{
+ Mutex::Locker l(header_lock);
+ KeyValueDB::Transaction t = db->get_transaction();
+ write_state(t);
+ int ret = db->submit_transaction_sync(t);
+ ceph_assert(ret == 0);
+ dout(1) << __func__ << " done" << dendl;
+ return;
+}
+
+int DBObjectMap::get_state()
+{
+ map<string, bufferlist> result;
+ set<string> to_get;
+ to_get.insert(GLOBAL_STATE_KEY);
+ int r = db->get(SYS_PREFIX, to_get, &result);
+ if (r < 0)
+ return r;
+ if (!result.empty()) {
+ auto bliter = result.begin()->second.cbegin();
+ state.decode(bliter);
+ } else {
+ // New store
+ state.v = State::CUR_VERSION;
+ state.seq = 1;
+ state.legacy = false;
+ }
+ return 0;
+}
+
+int DBObjectMap::init(bool do_upgrade)
+{
+ int ret = get_state();
+ if (ret < 0)
+ return ret;
+ if (state.v < 1) {
+ dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+ << dendl;
+ return -ENOTSUP;
+ }
+ if (state.v < 2) { // Needs upgrade
+ if (!do_upgrade) {
+ dout(1) << "DOBjbectMap requires an upgrade,"
+ << " set filestore_update_to"
+ << dendl;
+ return -ENOTSUP;
+ } else {
+ int r = upgrade_to_v2();
+ if (r < 0)
+ return r;
+ }
+ }
+ ostringstream ss;
+ int errors = check(ss, true);
+ if (errors) {
+ derr << ss.str() << dendl;
+ if (errors > 0)
+ return -EINVAL;
+ }
+ dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
+ return 0;
+}
+
+int DBObjectMap::sync(const ghobject_t *oid,
+ const SequencerPosition *spos) {
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (oid) {
+ ceph_assert(spos);
+ MapHeaderLock hl(this, *oid);
+ Header header = lookup_map_header(hl, *oid);
+ if (header) {
+ dout(10) << "oid: " << *oid << " setting spos to "
+ << *spos << dendl;
+ header->spos = *spos;
+ set_map_header(hl, *oid, *header, t);
+ }
+ /* It may appear that this and the identical portion of the else
+ * block can combined below, but in this block, the transaction
+ * must be submitted under *both* the MapHeaderLock and the full
+ * header_lock.
+ *
+ * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+ */
+ Mutex::Locker l(header_lock);
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ } else {
+ Mutex::Locker l(header_lock);
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ }
+}
+
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+ ceph_assert(header_lock.is_locked_by_me());
+ dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+ KeyValueDB::Transaction t = _t ? _t : db->get_transaction();
+ bufferlist bl;
+ state.encode(bl);
+ map<string, bufferlist> to_write;
+ to_write[GLOBAL_STATE_KEY] = bl;
+ t->set(SYS_PREFIX, to_write);
+ return _t ? 0 : db->submit_transaction(t);
+}
+
+
+DBObjectMap::Header DBObjectMap::_lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid)
+{
+ ceph_assert(l.get_locked() == oid);
+
+ _Header *header = new _Header();
+ {
+ Mutex::Locker l(cache_lock);
+ if (caches.lookup(oid, header)) {
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return Header(header, RemoveOnDelete(this));
+ }
+ }
+
+ bufferlist out;
+ int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
+ if (r < 0 || out.length()==0) {
+ delete header;
+ return Header();
+ }
+
+ Header ret(header, RemoveOnDelete(this));
+ auto iter = out.cbegin();
+ ret->decode(iter);
+ {
+ Mutex::Locker l(cache_lock);
+ caches.add(oid, *ret);
+ }
+
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return ret;
+}
+
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
+ Header parent)
+{
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ header->seq = state.seq++;
+ if (parent) {
+ header->parent = parent->seq;
+ header->spos = parent->spos;
+ }
+ header->num_children = 1;
+ header->oid = oid;
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+
+ write_state();
+ return header;
+}
+
+DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
+{
+ Mutex::Locker l(header_lock);
+ while (in_use.count(input->parent))
+ header_cond.Wait(header_lock);
+ map<string, bufferlist> out;
+ set<string> keys;
+ keys.insert(HEADER_KEY);
+
+ dout(20) << "lookup_parent: parent " << input->parent
+ << " for seq " << input->seq << dendl;
+ int r = db->get(sys_parent_prefix(input), keys, &out);
+ if (r < 0) {
+ ceph_abort();
+ return Header();
+ }
+ if (out.empty()) {
+ ceph_abort();
+ return Header();
+ }
+
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ auto iter = out.begin()->second.cbegin();
+ header->decode(iter);
+ ceph_assert(header->seq == input->parent);
+ dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
+ << header->parent << dendl;
+ in_use.insert(header->seq);
+ return header;
+}
+
+DBObjectMap::Header DBObjectMap::lookup_create_map_header(
+ const MapHeaderLock &hl,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t)
+{
+ Mutex::Locker l(header_lock);
+ Header header = _lookup_map_header(hl, oid);
+ if (!header) {
+ header = _generate_new_header(oid, Header());
+ set_map_header(hl, oid, *header, t);
+ }
+ return header;
+}
+
+void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
+{
+ dout(20) << "clear_header: clearing seq " << header->seq << dendl;
+ t->rmkeys_by_prefix(user_prefix(header));
+ t->rmkeys_by_prefix(sys_prefix(header));
+ if (state.legacy)
+ t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0
+ t->rmkeys_by_prefix(xattr_prefix(header));
+ set<string> keys;
+ keys.insert(header_key(header->seq));
+ t->rmkeys(USER_PREFIX, keys);
+}
+
+void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
+{
+ dout(20) << "set_header: setting seq " << header->seq << dendl;
+ map<string, bufferlist> to_write;
+ header->encode(to_write[HEADER_KEY]);
+ t->set(sys_prefix(header), to_write);
+}
+
+void DBObjectMap::remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t)
+{
+ ceph_assert(l.get_locked() == oid);
+ dout(20) << "remove_map_header: removing " << header->seq
+ << " oid " << oid << dendl;
+ set<string> to_remove;
+ to_remove.insert(map_header_key(oid));
+ t->rmkeys(HOBJECT_TO_SEQ, to_remove);
+ {
+ Mutex::Locker l(cache_lock);
+ caches.clear(oid);
+ }
+}
+
+void DBObjectMap::set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t)
+{
+ ceph_assert(l.get_locked() == oid);
+ dout(20) << "set_map_header: setting " << header.seq
+ << " oid " << oid << " parent seq "
+ << header.parent << dendl;
+ map<string, bufferlist> to_set;
+ header.encode(to_set[map_header_key(oid)]);
+ t->set(HOBJECT_TO_SEQ, to_set);
+ {
+ Mutex::Locker l(cache_lock);
+ caches.add(oid, header);
+ }
+}
+
+bool DBObjectMap::check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos)
+{
+ if (!spos || *spos > header->spos) {
+ stringstream out;
+ if (spos)
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
+ << *spos << dendl;
+ else
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
+ << "empty" << dendl;
+ dout(10) << " > header.spos " << header->spos << dendl;
+ return false;
+ } else {
+ dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
+ << " <= header.spos " << header->spos << dendl;
+ return true;
+ }
+}
+
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
+{
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ _Header header;
+ header.decode(bliter);
+ out->push_back(header.oid);
+ }
+ return 0;
+}
+
+int DBObjectMap::list_object_headers(vector<_Header> *out)
+{
+ int error = 0;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ _Header header;
+ header.decode(bliter);
+ out->push_back(header);
+ while (header.parent) {
+ set<string> to_get;
+ map<string, bufferlist> got;
+ to_get.insert(HEADER_KEY);
+ db->get(sys_parent_prefix(header), to_get, &got);
+ if (got.empty()) {
+ dout(0) << "Missing: seq " << header.parent << dendl;
+ error = -ENOENT;
+ break;
+ } else {
+ bl = got.begin()->second;
+ auto bliter = bl.cbegin();
+ header.decode(bliter);
+ out->push_back(header);
+ }
+ }
+ }
+ return error;
+}
+
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h)
+{
+ out << "seq=" << h.seq << " parent=" << h.parent
+ << " num_children=" << h.num_children
+ << " ghobject=" << h.oid;
+ return out;
+}
+
+int DBObjectMap::rename(const ghobject_t &from,
+ const ghobject_t &to,
+ const SequencerPosition *spos)
+{
+ if (from == to)
+ return 0;
+
+ MapHeaderLock _l1(this, std::min(from, to));
+ MapHeaderLock _l2(this, std::max(from, to));
+ MapHeaderLock *lsource, *ltarget;
+ if (from > to) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, to);
+ if (destination) {
+ if (check_spos(to, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, to, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ Header hdr = lookup_map_header(*lsource, from);
+ if (!hdr)
+ return db->submit_transaction(t);
+
+ remove_map_header(*lsource, from, hdr, t);
+ hdr->oid = to;
+ set_map_header(*ltarget, to, *hdr, t);
+
+ return db->submit_transaction(t);
+}
diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h
new file mode 100644
index 00000000..e288df83
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.h
@@ -0,0 +1,585 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef DBOBJECTMAP_DB_H
+#define DBOBJECTMAP_DB_H
+
+#include "include/buffer_fwd.h"
+#include <set>
+#include <map>
+#include <string>
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "osd/osd_types.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/simple_cache.hpp"
+#include <boost/optional/optional_io.hpp>
+
+#include "SequencerPosition.h"
+
+/**
+ * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
+ *
+ * Prefix space structure:
+ *
+ * @see complete_prefix
+ * @see user_prefix
+ * @see sys_prefix
+ *
+ * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
+ * corresponding omap header
+ * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
+ * @see State
+ * @see write_state
+ * @see init
+ * @see generate_new_header
+ * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
+ * : key->value for header->seq
+ * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
+ * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
+ * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
+ * : USER_HEADER_KEY - omap header for header->seq
+ * : HEADER_KEY - encoding of header for header->seq
+ *
+ * For each node (represented by a header), we
+ * store three mappings: the key mapping, the complete mapping, and the parent.
+ * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
+ * this mapping indicates that the key mapping contains all entries on [x,y).
+ * Note, max string is represented by "", so ""->"" indicates that the parent
+ * is unnecessary (@see rm_keys). When looking up a key not contained in the
+ * the complete set, we have to check the parent if we don't find it in the
+ * key set. During rm_keys, we copy keys from the parent and update the
+ * complete set to reflect the change @see rm_keys.
+ */
+class DBObjectMap : public ObjectMap {
+public:
+
+ KeyValueDB *get_db() override { return db.get(); }
+
+ /**
+ * Serializes access to next_seq as well as the in_use set
+ */
+ Mutex header_lock;
+ Cond header_cond;
+ Cond map_header_cond;
+
+ /**
+ * Set of headers currently in use
+ */
+ set<uint64_t> in_use;
+ set<ghobject_t> map_header_in_use;
+
+ /**
+ * Takes the map_header_in_use entry in constructor, releases in
+ * destructor
+ */
+ class MapHeaderLock {
+ DBObjectMap *db;
+ boost::optional<ghobject_t> locked;
+
+ MapHeaderLock(const MapHeaderLock &);
+ MapHeaderLock &operator=(const MapHeaderLock &);
+ public:
+ explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
+ MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
+ Mutex::Locker l(db->header_lock);
+ while (db->map_header_in_use.count(*locked))
+ db->map_header_cond.Wait(db->header_lock);
+ db->map_header_in_use.insert(*locked);
+ }
+
+ const ghobject_t &get_locked() const {
+ ceph_assert(locked);
+ return *locked;
+ }
+
+ void swap(MapHeaderLock &o) {
+ ceph_assert(db == o.db);
+
+ // centos6's boost optional doesn't seem to have swap :(
+ boost::optional<ghobject_t> _locked = o.locked;
+ o.locked = locked;
+ locked = _locked;
+ }
+
+ ~MapHeaderLock() {
+ if (locked) {
+ Mutex::Locker l(db->header_lock);
+ ceph_assert(db->map_header_in_use.count(*locked));
+ db->map_header_cond.Signal();
+ db->map_header_in_use.erase(*locked);
+ }
+ }
+ };
+
+ DBObjectMap(CephContext* cct, KeyValueDB *db)
+ : ObjectMap(cct, db), header_lock("DBOBjectMap"),
+ cache_lock("DBObjectMap::CacheLock"),
+ caches(cct->_conf->filestore_omap_header_cache_size)
+ {}
+
+ int set_keys(
+ const ghobject_t &oid,
+ const map<string, bufferlist> &set,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int set_header(
+ const ghobject_t &oid,
+ const bufferlist &bl,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int get_header(
+ const ghobject_t &oid,
+ bufferlist *bl
+ ) override;
+
+ int clear(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int clear_keys_header(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int rm_keys(
+ const ghobject_t &oid,
+ const set<string> &to_clear,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int get(
+ const ghobject_t &oid,
+ bufferlist *header,
+ map<string, bufferlist> *out
+ ) override;
+
+ int get_keys(
+ const ghobject_t &oid,
+ set<string> *keys
+ ) override;
+
+ int get_values(
+ const ghobject_t &oid,
+ const set<string> &keys,
+ map<string, bufferlist> *out
+ ) override;
+
+ int check_keys(
+ const ghobject_t &oid,
+ const set<string> &keys,
+ set<string> *out
+ ) override;
+
+ int get_xattrs(
+ const ghobject_t &oid,
+ const set<string> &to_get,
+ map<string, bufferlist> *out
+ ) override;
+
+ int get_all_xattrs(
+ const ghobject_t &oid,
+ set<string> *out
+ ) override;
+
+ int set_xattrs(
+ const ghobject_t &oid,
+ const map<string, bufferlist> &to_set,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int remove_xattrs(
+ const ghobject_t &oid,
+ const set<string> &to_remove,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int rename(
+ const ghobject_t &from,
+ const ghobject_t &to,
+ const SequencerPosition *spos=0
+ );
+
+ int legacy_clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ );
+
+ /// Read initial state from backing store
+ int get_state();
+ /// Write current state settings to DB
+ void set_state();
+ /// Read initial state and upgrade or initialize state
+ int init(bool upgrade = false);
+
+ /// Upgrade store to current version
+ int upgrade_to_v2();
+
+ /// Consistency check, debug, there must be no parallel writes
+ int check(std::ostream &out, bool repair = false, bool force = false) override;
+
+ /// Ensure that all previous operations are durable
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
+
+ void compact() override {
+ ceph_assert(db);
+ db->compact();
+ }
+
+ /// Util, get all objects, there must be no other concurrent access
+ int list_objects(vector<ghobject_t> *objs ///< [out] objects
+ );
+
+ struct _Header;
+ // Util, get all object headers, there must be no other concurrent access
+ int list_object_headers(vector<_Header> *out ///< [out] headers
+ );
+
+ ObjectMapIterator get_iterator(const ghobject_t &oid) override;
+
+ static const string USER_PREFIX;
+ static const string XATTR_PREFIX;
+ static const string SYS_PREFIX;
+ static const string COMPLETE_PREFIX;
+ static const string HEADER_KEY;
+ static const string USER_HEADER_KEY;
+ static const string GLOBAL_STATE_KEY;
+ static const string HOBJECT_TO_SEQ;
+
+ /// Legacy
+ static const string LEAF_PREFIX;
+ static const string REVERSE_LEAF_PREFIX;
+
+ /// persistent state for store @see generate_header
+ struct State {
+ static const __u8 CUR_VERSION = 3;
+ __u8 v;
+ uint64_t seq;
+ // legacy is false when complete regions never used
+ bool legacy;
+ State() : v(0), seq(1), legacy(false) {}
+ explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(v, bl);
+ encode(seq, bl);
+ encode(legacy, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl) {
+ DECODE_START(3, bl);
+ if (struct_v >= 2)
+ decode(v, bl);
+ else
+ v = 0;
+ decode(seq, bl);
+ if (struct_v >= 3)
+ decode(legacy, bl);
+ else
+ legacy = false;
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_unsigned("v", v);
+ f->dump_unsigned("seq", seq);
+ f->dump_bool("legacy", legacy);
+ }
+
+ static void generate_test_instances(list<State*> &o) {
+ o.push_back(new State(0));
+ o.push_back(new State(20));
+ }
+ } state;
+
+ struct _Header {
+ uint64_t seq;
+ uint64_t parent;
+ uint64_t num_children;
+
+ ghobject_t oid;
+
+ SequencerPosition spos;
+
+ void encode(bufferlist &bl) const {
+ coll_t unused;
+ ENCODE_START(2, 1, bl);
+ encode(seq, bl);
+ encode(parent, bl);
+ encode(num_children, bl);
+ encode(unused, bl);
+ encode(oid, bl);
+ encode(spos, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl) {
+ coll_t unused;
+ DECODE_START(2, bl);
+ decode(seq, bl);
+ decode(parent, bl);
+ decode(num_children, bl);
+ decode(unused, bl);
+ decode(oid, bl);
+ if (struct_v >= 2)
+ decode(spos, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("parent", parent);
+ f->dump_unsigned("num_children", num_children);
+ f->dump_stream("oid") << oid;
+ }
+
+ static void generate_test_instances(list<_Header*> &o) {
+ o.push_back(new _Header);
+ o.push_back(new _Header);
+ o.back()->parent = 20;
+ o.back()->seq = 30;
+ }
+
+ size_t length() {
+ return sizeof(_Header);
+ }
+
+ _Header() : seq(0), parent(0), num_children(1) {}
+ };
+
+ /// String munging (public for testing)
+ static string ghobject_key(const ghobject_t &oid);
+ static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static int is_buggy_ghobject_key_v1(CephContext* cct,
+ const string &in);
+private:
+ /// Implicit lock on Header->seq
+ typedef std::shared_ptr<_Header> Header;
+ Mutex cache_lock;
+ SimpleLRU<ghobject_t, _Header> caches;
+
+ string map_header_key(const ghobject_t &oid);
+ string header_key(uint64_t seq);
+ string complete_prefix(Header header);
+ string user_prefix(Header header);
+ string sys_prefix(Header header);
+ string xattr_prefix(Header header);
+ string sys_parent_prefix(_Header header);
+ string sys_parent_prefix(Header header) {
+ return sys_parent_prefix(*header);
+ }
+
+ class EmptyIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ int seek_to_first() override { return 0; }
+ int seek_to_last() { return 0; }
+ int upper_bound(const string &after) override { return 0; }
+ int lower_bound(const string &to) override { return 0; }
+ bool valid() override { return false; }
+ int next() override { ceph_abort(); return 0; }
+ string key() override { ceph_abort(); return ""; }
+ bufferlist value() override { ceph_abort(); return bufferlist(); }
+ int status() override { return 0; }
+ };
+
+
+ /// Iterator
+ class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ DBObjectMap *map;
+
+ /// NOTE: implicit lock hlock->get_locked() when returned out of the class
+ MapHeaderLock hlock;
+ /// NOTE: implicit lock on header->seq AND for all ancestors
+ Header header;
+
+ /// parent_iter == NULL iff no parent
+ std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
+ KeyValueDB::Iterator key_iter;
+ KeyValueDB::Iterator complete_iter;
+
+ /// cur_iter points to currently valid iterator
+ std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
+ int r;
+
+ /// init() called, key_iter, complete_iter, parent_iter filled in
+ bool ready;
+ /// past end
+ bool invalid;
+
+ DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
+ map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
+ int seek_to_first() override;
+ int seek_to_last();
+ int upper_bound(const string &after) override;
+ int lower_bound(const string &to) override;
+ bool valid() override;
+ int next() override;
+ string key() override;
+ bufferlist value() override;
+ int status() override;
+
+ bool on_parent() {
+ return cur_iter == parent_iter;
+ }
+
+ /// skips to next valid parent entry
+ int next_parent();
+
+ /// first parent() >= to
+ int lower_bound_parent(const string &to);
+
+ /**
+ * Tests whether to_test is in complete region
+ *
+ * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
+ */
+ int in_complete_region(const string &to_test, ///< [in] key to test
+ string *begin, ///< [out] beginning of region
+ string *end ///< [out] end of region
+ ); ///< @returns true if to_test is in the complete region, else false
+
+ private:
+ int init();
+ bool valid_parent();
+ int adjust();
+ };
+
+ typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
+ DBObjectMapIterator _get_iterator(Header header) {
+ return std::make_shared<DBObjectMapIteratorImpl>(this, header);
+ }
+
+ /// sys
+
+ /// Removes node corresponding to header
+ void clear_header(Header header, KeyValueDB::Transaction t);
+
+ /// Set node containing input to new contents
+ void set_header(Header input, KeyValueDB::Transaction t);
+
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t);
+
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t);
+
+ /// Set leaf node for c and oid to the value of header
+ bool check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos);
+
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Generate new header for c oid with new seq number
+ *
+ * Has the side effect of synchronously saving the new DBObjectMap state
+ */
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
+ Mutex::Locker l(header_lock);
+ return _generate_new_header(oid, parent);
+ }
+
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid);
+ Header lookup_map_header(
+ const MapHeaderLock &l2,
+ const ghobject_t &oid) {
+ Mutex::Locker l(header_lock);
+ return _lookup_map_header(l2, oid);
+ }
+
+ /// Lookup header node for input
+ Header lookup_parent(Header input);
+
+
+ /// Helpers
+ int _get_header(Header header, bufferlist *bl);
+
+ /// Scan keys in header into out_keys and out_values (if nonnull)
+ int scan(Header header,
+ const set<string> &in_keys,
+ set<string> *out_keys,
+ map<string, bufferlist> *out_values);
+
+ /// Remove header and all related prefixes
+ int _clear(Header header,
+ KeyValueDB::Transaction t);
+
+ /* Scan complete region bumping *begin to the beginning of any
+ * containing region and adding all complete region keys between
+ * the updated begin and end to the complete_keys_to_remove set */
+ int merge_new_complete(DBObjectMapIterator &iter,
+ string *begin,
+ const string &end,
+ set<string> *complete_keys_to_remove);
+
+ /// Writes out State (mainly next_seq)
+ int write_state(KeyValueDB::Transaction _t =
+ KeyValueDB::Transaction());
+
+ /// Copies header entry from parent @see rm_keys
+ int copy_up_header(Header header,
+ KeyValueDB::Transaction t);
+
+ /// Sets header @see set_header
+ void _set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Removes header seq lock and possibly object lock
+ * once Header is out of scope
+ * @see lookup_parent
+ * @see generate_new_header
+ */
+ class RemoveOnDelete {
+ public:
+ DBObjectMap *db;
+ explicit RemoveOnDelete(DBObjectMap *db) :
+ db(db) {}
+ void operator() (_Header *header) {
+ Mutex::Locker l(db->header_lock);
+ ceph_assert(db->in_use.count(header->seq));
+ db->in_use.erase(header->seq);
+ db->header_cond.Signal();
+ delete header;
+ }
+ };
+ friend class RemoveOnDelete;
+};
+WRITE_CLASS_ENCODER(DBObjectMap::_Header)
+WRITE_CLASS_ENCODER(DBObjectMap::State)
+
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h);
+
+#endif
diff --git a/src/os/filestore/FDCache.h b/src/os/filestore/FDCache.h
new file mode 100644
index 00000000..ee8c4fb0
--- /dev/null
+++ b/src/os/filestore/FDCache.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FDCACHE_H
+#define CEPH_FDCACHE_H
+
+#include <memory>
+#include <errno.h>
+#include <cstdio>
+#include "common/config_obs.h"
+#include "common/hobject.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/shared_cache.hpp"
+#include "include/compat.h"
+#include "include/intarith.h"
+
+/**
+ * FD Cache
+ */
+class FDCache : public md_config_obs_t {
+public:
+ /**
+ * FD
+ *
+ * Wrapper for an fd. Destructor closes the fd.
+ */
+ class FD {
+ public:
+ const int fd;
+ explicit FD(int _fd) : fd(_fd) {
+ ceph_assert(_fd >= 0);
+ }
+ int operator*() const {
+ return fd;
+ }
+ ~FD() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ };
+
+private:
+ CephContext *cct;
+ const int registry_shards;
+ SharedLRU<ghobject_t, FD> *registry;
+
+public:
+ explicit FDCache(CephContext *cct) : cct(cct),
+ registry_shards(std::max<int64_t>(cct->_conf->filestore_fd_cache_shards, 1)) {
+ ceph_assert(cct);
+ cct->_conf.add_observer(this);
+ registry = new SharedLRU<ghobject_t, FD>[registry_shards];
+ for (int i = 0; i < registry_shards; ++i) {
+ registry[i].set_cct(cct);
+ registry[i].set_size(
+ std::max<int64_t>((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
+ }
+ }
+ ~FDCache() override {
+ cct->_conf.remove_observer(this);
+ delete[] registry;
+ }
+ typedef std::shared_ptr<FD> FDRef;
+
+ FDRef lookup(const ghobject_t &hoid) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ return registry[registry_id].lookup(hoid);
+ }
+
+ FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ return registry[registry_id].add(hoid, new FD(fd), existed);
+ }
+
+ /// clear cached fd for hoid, subsequent lookups will get an empty FD
+ void clear(const ghobject_t &hoid) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ registry[registry_id].purge(hoid);
+ }
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const override {
+ static const char* KEYS[] = {
+ "filestore_fd_cache_size",
+ NULL
+ };
+ return KEYS;
+ }
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override {
+ if (changed.count("filestore_fd_cache_size")) {
+ for (int i = 0; i < registry_shards; ++i)
+ registry[i].set_size(
+ std::max<int64_t>((conf->filestore_fd_cache_size / registry_shards), 1));
+ }
+ }
+
+};
+typedef FDCache::FDRef FDRef;
+
+#endif
diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc
new file mode 100644
index 00000000..f0351fe4
--- /dev/null
+++ b/src/os/filestore/FileJournal.cc
@@ -0,0 +1,2216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "FileJournal.h"
+#include "include/color.h"
+#include "common/perf_counters.h"
+#include "FileStore.h"
+
+#include "include/compat.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#include "common/blkdev.h"
+#if defined(__linux__)
+#include "common/linux_version.h"
+#endif
+
+#if defined(__FreeBSD__)
+#define O_DSYNC O_SYNC
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+const static int64_t ONE_MEG(1 << 20);
+const static int CEPH_DIRECTIO_ALIGNMENT(4096);
+
+
+int FileJournal::_open(bool forwrite, bool create)
+{
+ int flags, ret;
+
+ if (forwrite) {
+ flags = O_RDWR;
+ if (directio)
+ flags |= O_DIRECT | O_DSYNC;
+ } else {
+ flags = O_RDONLY;
+ }
+ if (create)
+ flags |= O_CREAT;
+
+ if (fd >= 0) {
+ if (TEMP_FAILURE_RETRY(::close(fd))) {
+ int err = errno;
+ derr << "FileJournal::_open: error closing old fd: "
+ << cpp_strerror(err) << dendl;
+ }
+ }
+ fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags|O_CLOEXEC, 0644));
+ if (fd < 0) {
+ int err = errno;
+ dout(2) << "FileJournal::_open unable to open journal "
+ << fn << ": " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ struct stat st;
+ ret = ::fstat(fd, &st);
+ if (ret) {
+ ret = errno;
+ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
+ ret = -ret;
+ goto out_fd;
+ }
+
+ if (S_ISBLK(st.st_mode)) {
+ ret = _open_block_device();
+ } else if (S_ISREG(st.st_mode)) {
+ if (aio && !force_aio) {
+ derr << "FileJournal::_open: disabling aio for non-block journal. Use "
+ << "journal_force_aio to force use of aio anyway" << dendl;
+ aio = false;
+ }
+ ret = _open_file(st.st_size, st.st_blksize, create);
+ } else {
+ derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
+ << dendl;
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ goto out_fd;
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ aio_ctx = 0;
+ ret = io_setup(128, &aio_ctx);
+ if (ret < 0) {
+ switch (ret) {
+ // Contrary to naive expectations -EAGIAN means ...
+ case -EAGAIN:
+ derr << "FileJournal::_open: user's limit of aio events exceeded. "
+ << "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
+ break;
+ default:
+ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
+ break;
+ }
+ goto out_fd;
+ }
+ }
+#endif
+
+ /* We really want max_size to be a multiple of block_size. */
+ max_size -= max_size % block_size;
+
+ dout(1) << "_open " << fn << " fd " << fd
+ << ": " << max_size
+ << " bytes, block size " << block_size
+ << " bytes, directio = " << directio
+ << ", aio = " << aio
+ << dendl;
+ return 0;
+
+ out_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ fd = -1;
+ return ret;
+}
+
+int FileJournal::_open_block_device()
+{
+ int64_t bdev_sz = 0;
+ BlkDev blkdev(fd);
+ int ret = blkdev.get_size(&bdev_sz);
+ if (ret) {
+ dout(0) << __func__ << ": failed to read block device size." << dendl;
+ return -EIO;
+ }
+
+ /* Check for bdev_sz too small */
+ if (bdev_sz < ONE_MEG) {
+ dout(0) << __func__ << ": your block device must be at least "
+ << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
+ return -EINVAL;
+ }
+
+ dout(10) << __func__ << ": ignoring osd journal size. "
+ << "We'll use the entire block device (size: " << bdev_sz << ")"
+ << dendl;
+ max_size = bdev_sz;
+
+ block_size = cct->_conf->journal_block_size;
+
+ if (cct->_conf->journal_discard) {
+ discard = blkdev.support_discard();
+ dout(10) << fn << " support discard: " << (int)discard << dendl;
+ }
+
+ return 0;
+}
+
+int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
+ bool create)
+{
+ int ret;
+ int64_t conf_journal_sz(cct->_conf->osd_journal_size);
+ conf_journal_sz <<= 20;
+
+ if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
+ derr << "I'm sorry, I don't know how large of a journal to create."
+ << "Please specify a block device to use as the journal OR "
+ << "set osd_journal_size in your ceph.conf" << dendl;
+ return -EINVAL;
+ }
+
+ if (create && (oldsize < conf_journal_sz)) {
+ uint64_t newsize(conf_journal_sz);
+ dout(10) << __func__ << " _open extending to " << newsize << " bytes" << dendl;
+ ret = ::ftruncate(fd, newsize);
+ if (ret < 0) {
+ int err = errno;
+ derr << "FileJournal::_open_file : unable to extend journal to "
+ << newsize << " bytes: " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ ret = ceph_posix_fallocate(fd, 0, newsize);
+ if (ret) {
+ derr << "FileJournal::_open_file : unable to preallocation journal to "
+ << newsize << " bytes: " << cpp_strerror(ret) << dendl;
+ return -ret;
+ }
+ max_size = newsize;
+ }
+ else {
+ max_size = oldsize;
+ }
+ block_size = cct->_conf->journal_block_size;
+
+ if (create && cct->_conf->journal_zero_on_create) {
+ derr << "FileJournal::_open_file : zeroing journal" << dendl;
+ uint64_t write_size = 1 << 20;
+ char *buf;
+ ret = ::posix_memalign((void **)&buf, block_size, write_size);
+ if (ret != 0) {
+ return -ret;
+ }
+ memset(static_cast<void*>(buf), 0, write_size);
+ uint64_t i = 0;
+ for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
+ ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
+ if (ret < 0) {
+ free(buf);
+ return -errno;
+ }
+ }
+ if (i < (uint64_t)max_size) {
+ ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
+ if (ret < 0) {
+ free(buf);
+ return -errno;
+ }
+ }
+ free(buf);
+ }
+
+
+ dout(10) << "_open journal is not a block device, NOT checking disk "
+ << "write cache on '" << fn << "'" << dendl;
+
+ return 0;
+}
+
+// This can not be used on an active journal
+int FileJournal::check()
+{
+ int ret;
+
+ ceph_assert(fd == -1);
+ ret = _open(false, false);
+ if (ret)
+ return ret;
+
+ ret = read_header(&header);
+ if (ret < 0)
+ goto done;
+
+ if (header.fsid != fsid) {
+ derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ dout(1) << "check: header looks ok" << dendl;
+ ret = 0;
+
+ done:
+ close();
+ return ret;
+}
+
+
+int FileJournal::create()
+{
+ void *buf = 0;
+ int64_t needed_space;
+ int ret;
+ buffer::ptr bp;
+ dout(2) << "create " << fn << " fsid " << fsid << dendl;
+
+ ret = _open(true, true);
+ if (ret)
+ goto done;
+
+ // write empty header
+ header = header_t();
+ header.flags = header_t::FLAG_CRC; // enable crcs on any new journal.
+ header.fsid = fsid;
+ header.max_size = max_size;
+ header.block_size = block_size;
+ if (cct->_conf->journal_block_align || directio)
+ header.alignment = block_size;
+ else
+ header.alignment = 16; // at least stay word aligned on 64bit machines...
+
+ header.start = get_top();
+ header.start_seq = 0;
+
+ print_header(header);
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ bp = prepare_header();
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create : create write header error "
+ << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+
+ // zero first little bit, too.
+ ret = posix_memalign(&buf, block_size, block_size);
+ if (ret) {
+ ret = -ret;
+ derr << "FileJournal::create: failed to allocate " << block_size
+ << " bytes of memory: " << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+ memset(buf, 0, block_size);
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error zeroing first " << block_size
+ << " bytes " << cpp_strerror(ret) << dendl;
+ goto free_buf;
+ }
+
+ needed_space = cct->_conf->osd_max_write_size << 20;
+ needed_space += (2 * sizeof(entry_header_t)) + get_top();
+ if (header.max_size - header.start < needed_space) {
+ derr << "FileJournal::create: OSD journal is not large enough to hold "
+ << "osd_max_write_size bytes!" << dendl;
+ ret = -ENOSPC;
+ goto free_buf;
+ }
+
+ dout(2) << "create done" << dendl;
+ ret = 0;
+
+free_buf:
+ free(buf);
+ buf = 0;
+close_fd:
+ if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
+ << dendl;
+ }
+done:
+ fd = -1;
+ return ret;
+}
+
+// This can not be used on an active journal
+int FileJournal::peek_fsid(uuid_d& fsid)
+{
+ ceph_assert(fd == -1);
+ int r = _open(false, false);
+ if (r)
+ return r;
+ r = read_header(&header);
+ if (r < 0)
+ goto out;
+ fsid = header.fsid;
+out:
+ close();
+ return r;
+}
+
+int FileJournal::open(uint64_t fs_op_seq)
+{
+ dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl;
+
+ uint64_t next_seq = fs_op_seq + 1;
+ uint64_t seq = -1;
+
+ int err = _open(false);
+ if (err)
+ return err;
+
+ // assume writeable, unless...
+ read_pos = 0;
+ write_pos = get_top();
+
+ // read header?
+ err = read_header(&header);
+ if (err < 0)
+ goto out;
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ dout(10) << "open header.fsid = " << header.fsid
+ //<< " vs expected fsid = " << fsid
+ << dendl;
+ if (header.fsid != fsid) {
+ derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.max_size > max_size) {
+ dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.block_size != block_size) {
+ dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.max_size % header.block_size) {
+ dout(2) << "open journal max size " << header.max_size
+ << " not a multiple of block size " << header.block_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.alignment != block_size && directio) {
+ dout(0) << "open journal alignment " << header.alignment << " does not match block size "
+ << block_size << " (required for direct_io journal mode)" << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if ((header.alignment % CEPH_DIRECTIO_ALIGNMENT) && directio) {
+ dout(0) << "open journal alignment " << header.alignment
+ << " is not multiple of minimum directio alignment "
+ << CEPH_DIRECTIO_ALIGNMENT << " (required for direct_io journal mode)"
+ << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+
+ // looks like a valid header.
+ write_pos = 0; // not writeable yet
+
+ journaled_seq = header.committed_up_to;
+
+ // find next entry
+ read_pos = header.start;
+ seq = header.start_seq;
+
+ while (1) {
+ bufferlist bl;
+ off64_t old_pos = read_pos;
+ if (!read_entry(bl, seq)) {
+ dout(10) << "open reached end of journal." << dendl;
+ break;
+ }
+ if (seq > next_seq) {
+ dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq
+ << ", ignoring journal contents"
+ << dendl;
+ read_pos = -1;
+ last_committed_seq = 0;
+ return 0;
+ }
+ if (seq == next_seq) {
+ dout(10) << "open reached seq " << seq << dendl;
+ read_pos = old_pos;
+ break;
+ }
+ seq++; // next event should follow.
+ }
+
+ return 0;
+out:
+ close();
+ return err;
+}
+
+void FileJournal::_close(int fd) const
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+void FileJournal::close()
+{
+ dout(1) << "close " << fn << dendl;
+
+ // stop writer thread
+ stop_writer();
+
+ // close
+ ceph_assert(writeq_empty());
+ ceph_assert(!must_write_header);
+ ceph_assert(fd >= 0);
+ _close(fd);
+ fd = -1;
+}
+
+
+int FileJournal::dump(ostream& out)
+{
+ return _dump(out, false);
+}
+
+int FileJournal::simple_dump(ostream& out)
+{
+ return _dump(out, true);
+}
+
+int FileJournal::_dump(ostream& out, bool simple)
+{
+ JSONFormatter f(true);
+ int ret = _fdump(f, simple);
+ f.flush(out);
+ return ret;
+}
+
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+ dout(10) << "_fdump" << dendl;
+
+ ceph_assert(fd == -1);
+ int err = _open(false, false);
+ if (err)
+ return err;
+
+ err = read_header(&header);
+ if (err < 0) {
+ close();
+ return err;
+ }
+
+ off64_t next_pos = header.start;
+
+ f.open_object_section("journal");
+
+ f.open_object_section("header");
+ f.dump_unsigned("flags", header.flags);
+ ostringstream os;
+ os << header.fsid;
+ f.dump_string("fsid", os.str());
+ f.dump_unsigned("block_size", header.block_size);
+ f.dump_unsigned("alignment", header.alignment);
+ f.dump_int("max_size", header.max_size);
+ f.dump_int("start", header.start);
+ f.dump_unsigned("committed_up_to", header.committed_up_to);
+ f.dump_unsigned("start_seq", header.start_seq);
+ f.close_section();
+
+ f.open_array_section("entries");
+ uint64_t seq = header.start_seq;
+ while (1) {
+ bufferlist bl;
+ off64_t pos = next_pos;
+
+ if (!pos) {
+ dout(2) << "_dump -- not readable" << dendl;
+ err = -EINVAL;
+ break;
+ }
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result != SUCCESS) {
+ if (seq < header.committed_up_to) {
+ dout(2) << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ err = -EINVAL;
+ }
+ dout(25) << ss.str() << dendl;
+ dout(25) << "No further valid entries found, journal is most likely valid"
+ << dendl;
+ break;
+ }
+
+ f.open_object_section("entry");
+ f.dump_unsigned("offset", pos);
+ f.dump_unsigned("seq", seq);
+ if (simple) {
+ f.dump_unsigned("bl.length", bl.length());
+ } else {
+ f.open_array_section("transactions");
+ auto p = bl.cbegin();
+ int trans_num = 0;
+ while (!p.end()) {
+ ObjectStore::Transaction t(p);
+ f.open_object_section("transaction");
+ f.dump_unsigned("trans_num", trans_num);
+ t.dump(&f);
+ f.close_section();
+ trans_num++;
+ }
+ f.close_section();
+ }
+ f.close_section();
+ }
+
+ f.close_section();
+ f.close_section();
+ dout(10) << "dump finish" << dendl;
+
+ close();
+ return err;
+}
+
+
+void FileJournal::start_writer()
+{
+ write_stop = false;
+ aio_stop = false;
+ write_thread.create("journal_write");
+#ifdef HAVE_LIBAIO
+ if (aio)
+ write_finish_thread.create("journal_wrt_fin");
+#endif
+}
+
+void FileJournal::stop_writer()
+{
+ // Do nothing if writer already stopped or never started
+ if (!write_stop)
+ {
+ {
+ Mutex::Locker l(write_lock);
+ Mutex::Locker p(writeq_lock);
+ write_stop = true;
+ writeq_cond.Signal();
+ // Doesn't hurt to signal commit_cond in case thread is waiting there
+ // and caller didn't use committed_thru() first.
+ commit_cond.Signal();
+ }
+ write_thread.join();
+
+ // write journal header now so that we have less to replay on remount
+ write_header_sync();
+ }
+
+#ifdef HAVE_LIBAIO
+ // stop aio completeion thread *after* writer thread has stopped
+ // and has submitted all of its io
+ if (aio && !aio_stop) {
+ aio_lock.Lock();
+ aio_stop = true;
+ aio_cond.Signal();
+ write_finish_cond.Signal();
+ aio_lock.Unlock();
+ write_finish_thread.join();
+ }
+#endif
+}
+
+
+
+void FileJournal::print_header(const header_t &header) const
+{
+ dout(10) << "header: block_size " << header.block_size
+ << " alignment " << header.alignment
+ << " max_size " << header.max_size
+ << dendl;
+ dout(10) << "header: start " << header.start << dendl;
+ dout(10) << " write_pos " << write_pos << dendl;
+}
+
+int FileJournal::read_header(header_t *hdr) const
+{
+ dout(10) << "read_header" << dendl;
+ bufferlist bl;
+
+ buffer::ptr bp = buffer::create_small_page_aligned(block_size);
+ char* bpdata = bp.c_str();
+ int r = ::pread(fd, bpdata, bp.length(), 0);
+
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "read_header got " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ // don't use bp.zero() here, because it also invalidates
+ // crc cache (which is not yet populated anyway)
+ if (bp.length() != (size_t)r) {
+ // r will be always less or equal than bp.length
+ bpdata += r;
+ memset(bpdata, 0, bp.length() - r);
+ }
+
+ bl.push_back(std::move(bp));
+
+ try {
+ auto p = bl.cbegin();
+ decode(*hdr, p);
+ }
+ catch (buffer::error& e) {
+ derr << "read_header error decoding journal header" << dendl;
+ return -EINVAL;
+ }
+
+
+ /*
+ * Unfortunately we weren't initializing the flags field for new
+ * journals! Aie. This is safe(ish) now that we have only one
+ * flag. Probably around when we add the next flag we need to
+ * remove this or else this (eventually old) code will clobber newer
+ * code's flags.
+ */
+ if (hdr->flags > 3) {
+ derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
+ hdr->flags = 0;
+ }
+
+ print_header(*hdr);
+
+ return 0;
+}
+
+bufferptr FileJournal::prepare_header()
+{
+ bufferlist bl;
+ {
+ Mutex::Locker l(finisher_lock);
+ header.committed_up_to = journaled_seq;
+ }
+ encode(header, bl);
+ bufferptr bp = buffer::create_small_page_aligned(get_top());
+ // don't use bp.zero() here, because it also invalidates
+ // crc cache (which is not yet populated anyway)
+ char* data = bp.c_str();
+ memcpy(data, bl.c_str(), bl.length());
+ data += bl.length();
+ memset(data, 0, bp.length()-bl.length());
+ return bp;
+}
+
+void FileJournal::write_header_sync()
+{
+ Mutex::Locker locker(write_lock);
+ must_write_header = true;
+ bufferlist bl;
+ do_write(bl);
+ dout(20) << __func__ << " finish" << dendl;
+}
+
+int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
+{
+ // already full?
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
+ // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL.
+ off64_t room;
+ if (pos >= header.start)
+ room = (header.max_size - pos) + (header.start - get_top()) - 1;
+ else
+ room = header.start - pos - 1;
+ dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
+ << " top " << get_top() << dendl;
+
+ if (do_sync_cond) {
+ if (room >= (header.max_size >> 1) &&
+ room - size < (header.max_size >> 1)) {
+ dout(10) << " passing half full mark, triggering commit" << dendl;
+ do_sync_cond->SloppySignal(); // initiate a real commit so we can trim
+ }
+ }
+
+ if (room >= size) {
+ dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
+ if (pos + size > header.max_size)
+ must_write_header = true;
+ return 0;
+ }
+
+ // full
+ dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
+ << pos << " >= " << room
+ << " (max_size " << header.max_size << " start " << header.start << ")"
+ << dendl;
+
+ off64_t max = header.max_size - get_top();
+ if (size > max)
+ dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;
+
+ return -ENOSPC;
+}
+
+int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ // gather queued writes
+ off64_t queue_pos = write_pos;
+
+ int eleft = cct->_conf->journal_max_write_entries;
+ unsigned bmax = cct->_conf->journal_max_write_bytes;
+
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
+ while (!writeq_empty()) {
+ list<write_item> items;
+ batch_pop_write(items);
+ list<write_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ uint64_t bytes = it->bl.length();
+ int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
+ if (r == 0) { // prepare ok, delete it
+ items.erase(it++);
+#ifdef HAVE_LIBAIO
+ {
+ Mutex::Locker locker(aio_lock);
+ ceph_assert(aio_write_queue_ops > 0);
+ aio_write_queue_ops--;
+ ceph_assert(aio_write_queue_bytes >= bytes);
+ aio_write_queue_bytes -= bytes;
+ }
+#else
+ (void)bytes;
+#endif
+ }
+ if (r == -ENOSPC) {
+ // the journal maybe full, insert the left item to writeq
+ batch_unpop_write(items);
+ if (orig_ops)
+ goto out; // commit what we have
+
+ if (logger)
+ logger->inc(l_filestore_journal_full);
+
+ if (wait_on_full) {
+ dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
+ } else {
+ dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;
+
+ // throw out what we have so far
+ full_state = FULL_FULL;
+ while (!writeq_empty()) {
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ }
+
+ return -ENOSPC; // hrm, full on first op
+ }
+ if (eleft) {
+ if (--eleft == 0) {
+ dout(20) << "prepare_multi_write hit max events per write "
+ << cct->_conf->journal_max_write_entries << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ if (bmax) {
+ if (bl.length() >= bmax) {
+ dout(20) << "prepare_multi_write hit max write size "
+ << cct->_conf->journal_max_write_bytes << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
+ ceph_assert((write_pos + bl.length() == queue_pos) ||
+ (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
+ return 0;
+}
+
+/*
+void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
+{
+ writing_seq.push_back(seq);
+ if (!waiting_for_notfull.empty()) {
+ // make sure previously unjournaled stuff waiting for UNFULL triggers
+ // _before_ newly journaled stuff does
+ dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
+ << " until after UNFULL" << dendl;
+ C_Gather *g = new C_Gather(writeq.front().fin);
+ writing_fin.push_back(g->new_sub());
+ waiting_for_notfull.push_back(g->new_sub());
+ } else {
+ writing_fin.push_back(writeq.front().fin);
+ dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
+ }
+}
+*/
+
+void FileJournal::queue_completions_thru(uint64_t seq)
+{
+ ceph_assert(finisher_lock.is_locked());
+ utime_t now = ceph_clock_now();
+ list<completion_item> items;
+ batch_pop_completions(items);
+ list<completion_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ completion_item& next = *it;
+ if (next.seq > seq)
+ break;
+ utime_t lat = now;
+ lat -= next.start;
+ dout(10) << "queue_completions_thru seq " << seq
+ << " queueing seq " << next.seq
+ << " " << next.finish
+ << " lat " << lat << dendl;
+ if (logger) {
+ logger->tinc(l_filestore_journal_latency, lat);
+ }
+ if (next.finish)
+ finisher->queue(next.finish);
+ if (next.tracked_op) {
+ next.tracked_op->mark_event("journaled_completion_queued");
+ next.tracked_op->journal_trace.event("queued completion");
+ next.tracked_op->journal_trace.keyval("completed through", seq);
+ }
+ items.erase(it++);
+ }
+ batch_unpop_completions(items);
+ finisher_cond.Signal();
+}
+
+
+int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ uint64_t seq = next_write.seq;
+ bufferlist &ebl = next_write.bl;
+ off64_t size = ebl.length();
+
+ int r = check_for_full(seq, queue_pos, size);
+ if (r < 0)
+ return r; // ENOSPC or EAGAIN
+
+ uint32_t orig_len = next_write.orig_len;
+ orig_bytes += orig_len;
+ orig_ops++;
+
+ // add to write buffer
+ dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
+ << " len " << orig_len << " -> " << size << dendl;
+
+ unsigned seq_offset = offsetof(entry_header_t, seq);
+ unsigned magic1_offset = offsetof(entry_header_t, magic1);
+ unsigned magic2_offset = offsetof(entry_header_t, magic2);
+
+ bufferptr headerptr = ebl.buffers().front();
+ uint64_t _seq = seq;
+ uint64_t _queue_pos = queue_pos;
+ uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
+ headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq);
+ headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+ headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+ bufferptr footerptr = ebl.buffers().back();
+ unsigned post_offset = footerptr.length() - sizeof(entry_header_t);
+ footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq);
+ footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+ footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+ bl.claim_append(ebl);
+ if (next_write.tracked_op) {
+ next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
+ next_write.tracked_op->journal_trace.event("prepare_single_write");
+ }
+
+ journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
+ writing_seq = seq;
+
+ queue_pos += size;
+ if (queue_pos >= header.max_size)
+ queue_pos = queue_pos + get_top() - header.max_size;
+
+ return 0;
+}
+
+void FileJournal::check_align(off64_t pos, bufferlist& bl)
+{
+ // make sure list segments are page aligned
+ if (directio && !bl.is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT)) {
+ ceph_assert((bl.length() & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+ ceph_assert((pos & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+ ceph_abort_msg("bl was not aligned");
+ }
+}
+
+int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
+{
+ int ret;
+
+ off64_t spos = ::lseek64(fd, pos, SEEK_SET);
+ if (spos < 0) {
+ ret = -errno;
+ derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = bl.write_fd(fd);
+ if (ret) {
+ derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ pos += bl.length();
+ if (pos == header.max_size)
+ pos = get_top();
+ return 0;
+}
+
+void FileJournal::do_write(bufferlist& bl)
+{
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (cct->_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ cct->_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ dout(15) << "do_write writing " << write_pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ utime_t from = ceph_clock_now();
+
+ // entry
+ off64_t pos = write_pos;
+
+ // Adjust write_pos
+ write_pos += bl.length();
+ if (write_pos >= header.max_size)
+ write_pos = write_pos - header.max_size + get_top();
+
+ write_lock.Unlock();
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ ceph_assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
+ << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
+
+ //Save pos to write first piece second
+ off64_t first_pos = pos;
+ off64_t orig_pos;
+ pos = get_top();
+ // header too?
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ bufferlist tmp;
+ tmp.push_back(hbp);
+ tmp.claim_append(second);
+ second.swap(tmp);
+ pos = 0; // we included the header
+ }
+ // Write the second portion first possible with the header, so
+ // do_read_entry() won't even get a valid entry_header_t if there
+ // is a crash between the two writes.
+ orig_pos = pos;
+ if (write_bl(pos, second)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ check_align(pos, second);
+ ceph_abort();
+ }
+ orig_pos = first_pos;
+ if (write_bl(first_pos, first)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ check_align(first_pos, first);
+ ceph_abort();
+ }
+ ceph_assert(first_pos == get_top());
+ } else {
+ // header too?
+ if (hbp.length()) {
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
+ int err = errno;
+ derr << "FileJournal::do_write: pwrite(fd=" << fd
+ << ", hbp.length=" << hbp.length() << ") failed :"
+ << cpp_strerror(err) << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_bl(pos, bl)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << pos
+ << ") failed" << dendl;
+ check_align(pos, bl);
+ ceph_abort();
+ }
+ }
+
+ if (!directio) {
+ dout(20) << "do_write fsync" << dendl;
+
+ /*
+ * We'd really love to have a fsync_range or fdatasync_range and do a:
+ *
+ * if (split) {
+ * ::fsync_range(fd, header.max_size - split, split)l
+ * ::fsync_range(fd, get_top(), bl.length() - split);
+ * else
+ * ::fsync_range(fd, write_pos, bl.length())
+ *
+ * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
+ * too hard given all the underlying infrastructure already exist.
+ *
+ * NOTE: using sync_file_range here would not be safe as it does not
+ * flush disk caches or commits any sort of metadata.
+ */
+ int ret = 0;
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = ::fsync(fd);
+#else
+ ret = ::fdatasync(fd);
+#endif
+ if (ret < 0) {
+ derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise)
+ posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+
+ utime_t lat = ceph_clock_now() - from;
+ dout(20) << "do_write latency " << lat << dendl;
+
+ write_lock.Lock();
+
+ ceph_assert(write_pos == pos);
+ ceph_assert(write_pos % header.alignment == 0);
+
+ {
+ Mutex::Locker locker(finisher_lock);
+ journaled_seq = writing_seq;
+
+ // kick finisher?
+ // only if we haven't filled up recently!
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+}
+
+void FileJournal::flush()
+{
+ dout(10) << "waiting for completions to empty" << dendl;
+ {
+ Mutex::Locker l(finisher_lock);
+ while (!completions_empty())
+ finisher_cond.Wait(finisher_lock);
+ }
+ dout(10) << "flush waiting for finisher" << dendl;
+ finisher->wait_for_empty();
+ dout(10) << "flush done" << dendl;
+}
+
+
+void FileJournal::write_thread_entry()
+{
+ dout(10) << "write_thread_entry start" << dendl;
+ while (1) {
+ {
+ Mutex::Locker locker(writeq_lock);
+ if (writeq.empty() && !must_write_header) {
+ if (write_stop)
+ break;
+ dout(20) << "write_thread_entry going to sleep" << dendl;
+ writeq_cond.Wait(writeq_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ Mutex::Locker locker(aio_lock);
+ // should we back off to limit aios in flight? try to do this
+ // adaptively so that we submit larger aios once we have lots of
+ // them in flight.
+ //
+ // NOTE: our condition here is based on aio_num (protected by
+ // aio_lock) and throttle_bytes (part of the write queue). when
+ // we sleep, we *only* wait for aio_num to change, and do not
+ // wake when more data is queued. this is not strictly correct,
+ // but should be fine given that we will have plenty of aios in
+ // flight if we hit this limit to ensure we keep the device
+ // saturated.
+ while (aio_num > 0) {
+ int exp = std::min<int>(aio_num * 2, 24);
+ long unsigned min_new = 1ull << exp;
+ uint64_t cur = aio_write_queue_bytes;
+ dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
+ << " ... exp " << exp << " min_new " << min_new
+ << " ... pending " << cur << dendl;
+ if (cur >= min_new)
+ break;
+ dout(20) << "write_thread_entry deferring until more aios complete: "
+ << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
+ << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
+ aio_cond.Wait(aio_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ }
+ }
+#endif
+
+ Mutex::Locker locker(write_lock);
+ uint64_t orig_ops = 0;
+ uint64_t orig_bytes = 0;
+
+ bufferlist bl;
+ int r = prepare_multi_write(bl, orig_ops, orig_bytes);
+ // Don't care about journal full if stoppping, so drop queue and
+ // possibly let header get written and loop above to notice stop
+ if (r == -ENOSPC) {
+ if (write_stop) {
+ dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
+ while (!writeq_empty()) {
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ r = 0;
+ } else {
+ dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
+ commit_cond.Wait(write_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+ ceph_assert(r == 0);
+
+ if (logger) {
+ logger->inc(l_filestore_journal_wr);
+ logger->inc(l_filestore_journal_wr_bytes, bl.length());
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio)
+ do_aio_write(bl);
+ else
+ do_write(bl);
+#else
+ do_write(bl);
+#endif
+ complete_write(orig_ops, orig_bytes);
+ }
+
+ dout(10) << "write_thread_entry finish" << dendl;
+}
+
+#ifdef HAVE_LIBAIO
+void FileJournal::do_aio_write(bufferlist& bl)
+{
+
+ if (cct->_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ cct->_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ // entry
+ off64_t pos = write_pos;
+
+ dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ ceph_assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;
+
+ if (write_aio_bl(pos, first, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ ceph_assert(pos == header.max_size);
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ bufferlist tmp;
+ tmp.push_back(hbp);
+ tmp.claim_append(second);
+ second.swap(tmp);
+ pos = 0; // we included the header
+ } else
+ pos = get_top(); // no header, start after that
+ if (write_aio_bl(pos, second, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ } else {
+ // header too?
+ if (hbp.length()) {
+ bufferlist hbl;
+ hbl.push_back(hbp);
+ loff_t pos = 0;
+ if (write_aio_bl(pos, hbl, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_aio_bl(pos, bl, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ write_pos = pos;
+ if (write_pos == header.max_size)
+ write_pos = get_top();
+ ceph_assert(write_pos % header.alignment == 0);
+}
+
+/**
+ * write a buffer using aio
+ *
+ * @param seq seq to trigger when this aio completes. if 0, do not update any state
+ * on completion.
+ */
+int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
+{
+ dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
+
+ while (bl.length() > 0) {
+ int max = std::min<int>(bl.get_num_buffers(), IOV_MAX-1);
+ iovec *iov = new iovec[max];
+ int n = 0;
+ unsigned len = 0;
+ for (auto p = std::cbegin(bl.buffers()); n < max; ++p, ++n) {
+ ceph_assert(p != std::cend(bl.buffers()));
+ iov[n].iov_base = const_cast<void*>(static_cast<const void*>(p->c_str()));
+ iov[n].iov_len = p->length();
+ len += p->length();
+ }
+
+ bufferlist tbl;
+ bl.splice(0, len, &tbl); // move bytes from bl -> tbl
+
+ // lock only aio_queue, current aio, aio_num, aio_bytes, which may be
+ // modified in check_aio_completion
+ aio_lock.Lock();
+ aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
+ aio_info& aio = aio_queue.back();
+ aio.iov = iov;
+
+ io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
+
+ dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
+ << " in " << n << dendl;
+
+ aio_num++;
+ aio_bytes += aio.len;
+
+ // need to save current aio len to update write_pos later because current
+ // aio could be ereased from aio_queue once it is done
+ uint64_t cur_len = aio.len;
+ // unlock aio_lock because following io_submit might take time to return
+ aio_lock.Unlock();
+
+ iocb *piocb = &aio.iocb;
+
+ // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+ int attempts = 16;
+ int delay = 125;
+ do {
+ int r = io_submit(aio_ctx, 1, &piocb);
+ dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
+ if (r < 0) {
+ derr << "io_submit to " << aio.off << "~" << cur_len
+ << " got " << cpp_strerror(r) << dendl;
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ continue;
+ }
+ check_align(pos, tbl);
+ ceph_abort_msg("io_submit got unexpected error");
+ } else {
+ break;
+ }
+ } while (true);
+ pos += cur_len;
+ }
+ aio_lock.Lock();
+ write_finish_cond.Signal();
+ aio_lock.Unlock();
+ return 0;
+}
+#endif
+
+void FileJournal::write_finish_thread_entry()
+{
+#ifdef HAVE_LIBAIO
+ dout(10) << __func__ << " enter" << dendl;
+ while (true) {
+ {
+ Mutex::Locker locker(aio_lock);
+ if (aio_queue.empty()) {
+ if (aio_stop)
+ break;
+ dout(20) << __func__ << " sleeping" << dendl;
+ write_finish_cond.Wait(aio_lock);
+ continue;
+ }
+ }
+
+ dout(20) << __func__ << " waiting for aio(s)" << dendl;
+ io_event event[16];
+ int r = io_getevents(aio_ctx, 1, 16, event, NULL);
+ if (r < 0) {
+ if (r == -EINTR) {
+ dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
+ continue;
+ }
+ derr << "io_getevents got " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ note_io_error_event(devname.c_str(), fn.c_str(), -EIO, 0, 0, 0);
+ }
+ ceph_abort_msg("got unexpected error from io_getevents");
+ }
+
+ {
+ Mutex::Locker locker(aio_lock);
+ for (int i=0; i<r; i++) {
+ aio_info *ai = (aio_info *)event[i].obj;
+ if (event[i].res != ai->len) {
+ derr << "aio to " << ai->off << "~" << ai->len
+ << " returned: " << (int)event[i].res << dendl;
+ ceph_abort_msg("unexpected aio error");
+ }
+ dout(10) << __func__ << " aio " << ai->off
+ << "~" << ai->len << " done" << dendl;
+ ai->done = true;
+ }
+ check_aio_completion();
+ }
+ }
+ dout(10) << __func__ << " exit" << dendl;
+#endif
+}
+
+#ifdef HAVE_LIBAIO
+/**
+ * check aio_wait for completed aio, and update state appropriately.
+ */
+void FileJournal::check_aio_completion()
+{
+ ceph_assert(aio_lock.is_locked());
+ dout(20) << "check_aio_completion" << dendl;
+
+ bool completed_something = false, signal = false;
+ uint64_t new_journaled_seq = 0;
+
+ list<aio_info>::iterator p = aio_queue.begin();
+ while (p != aio_queue.end() && p->done) {
+ dout(20) << "check_aio_completion completed seq " << p->seq << " "
+ << p->off << "~" << p->len << dendl;
+ if (p->seq) {
+ new_journaled_seq = p->seq;
+ completed_something = true;
+ }
+ aio_num--;
+ aio_bytes -= p->len;
+ aio_queue.erase(p++);
+ signal = true;
+ }
+
+ if (completed_something) {
+ // kick finisher?
+ // only if we haven't filled up recently!
+ Mutex::Locker locker(finisher_lock);
+ journaled_seq = new_journaled_seq;
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+ if (signal) {
+ // maybe write queue was waiting for aio count to drop?
+ aio_cond.Signal();
+ }
+}
+#endif
+
+int FileJournal::prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) {
+ dout(10) << "prepare_entry " << tls << dendl;
+ int data_len = cct->_conf->journal_align_min_size - 1;
+ int data_align = -1; // -1 indicates that we don't care about the alignment
+ bufferlist bl;
+ for (vector<ObjectStore::Transaction>::iterator p = tls.begin();
+ p != tls.end(); ++p) {
+ if ((int)(*p).get_data_length() > data_len) {
+ data_len = (*p).get_data_length();
+ data_align = ((*p).get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
+ }
+ encode(*p, bl);
+ }
+ if (tbl->length()) {
+ bl.claim_append(*tbl);
+ }
+ // add it this entry
+ entry_header_t h;
+ unsigned head_size = sizeof(entry_header_t);
+ off64_t base_size = 2*head_size + bl.length();
+ memset(&h, 0, sizeof(h));
+ if (data_align >= 0)
+ h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
+ off64_t size = round_up_to(base_size + h.pre_pad, header.alignment);
+ unsigned post_pad = size - base_size - h.pre_pad;
+ h.len = bl.length();
+ h.post_pad = post_pad;
+ h.crc32c = bl.crc32c(0);
+ dout(10) << " len " << bl.length() << " -> " << size
+ << " (head " << head_size << " pre_pad " << h.pre_pad
+ << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
+ << " (bl alignment " << data_align << ")"
+ << dendl;
+ bufferlist ebl;
+ // header
+ ebl.append((const char*)&h, sizeof(h));
+ if (h.pre_pad) {
+ ebl.push_back(buffer::create_static(h.pre_pad, zero_buf));
+ }
+ // payload
+ ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ if (h.post_pad) {
+ ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
+ }
+ // footer
+ ebl.append((const char*)&h, sizeof(h));
+ if (directio)
+ ebl.rebuild_aligned(CEPH_DIRECTIO_ALIGNMENT);
+ tbl->claim(ebl);
+ return h.len;
+}
+
+void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+ Context *oncommit, TrackedOpRef osd_op)
+{
+ // dump on queue
+ dout(5) << "submit_entry seq " << seq
+ << " len " << e.length()
+ << " (" << oncommit << ")" << dendl;
+ ceph_assert(e.length() > 0);
+ ceph_assert(e.length() < header.max_size);
+
+ if (logger) {
+ logger->inc(l_filestore_journal_queue_bytes, orig_len);
+ logger->inc(l_filestore_journal_queue_ops, 1);
+ }
+
+ throttle.register_throttle_seq(seq, e.length());
+ if (logger) {
+ logger->inc(l_filestore_journal_ops, 1);
+ logger->inc(l_filestore_journal_bytes, e.length());
+ }
+
+ if (osd_op) {
+ osd_op->mark_event("commit_queued_for_journal_write");
+ if (osd_op->store_trace) {
+ osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace);
+ osd_op->journal_trace.event("submit_entry");
+ osd_op->journal_trace.keyval("seq", seq);
+ }
+ }
+ {
+ Mutex::Locker l1(writeq_lock);
+#ifdef HAVE_LIBAIO
+ Mutex::Locker l2(aio_lock);
+#endif
+ Mutex::Locker l3(completions_lock);
+
+#ifdef HAVE_LIBAIO
+ aio_write_queue_ops++;
+ aio_write_queue_bytes += e.length();
+ aio_cond.Signal();
+#endif
+
+ completions.push_back(
+ completion_item(
+ seq, oncommit, ceph_clock_now(), osd_op));
+ if (writeq.empty())
+ writeq_cond.Signal();
+ writeq.push_back(write_item(seq, e, orig_len, osd_op));
+ if (osd_op)
+ osd_op->journal_trace.keyval("queue depth", writeq.size());
+ }
+}
+
+bool FileJournal::writeq_empty()
+{
+ Mutex::Locker locker(writeq_lock);
+ return writeq.empty();
+}
+
+FileJournal::write_item &FileJournal::peek_write()
+{
+ ceph_assert(write_lock.is_locked());
+ Mutex::Locker locker(writeq_lock);
+ return writeq.front();
+}
+
+void FileJournal::pop_write()
+{
+ ceph_assert(write_lock.is_locked());
+ Mutex::Locker locker(writeq_lock);
+ if (logger) {
+ logger->dec(l_filestore_journal_queue_bytes, writeq.front().orig_len);
+ logger->dec(l_filestore_journal_queue_ops, 1);
+ }
+ writeq.pop_front();
+}
+
+void FileJournal::batch_pop_write(list<write_item> &items)
+{
+ ceph_assert(write_lock.is_locked());
+ {
+ Mutex::Locker locker(writeq_lock);
+ writeq.swap(items);
+ }
+ for (auto &&i : items) {
+ if (logger) {
+ logger->dec(l_filestore_journal_queue_bytes, i.orig_len);
+ logger->dec(l_filestore_journal_queue_ops, 1);
+ }
+ }
+}
+
+void FileJournal::batch_unpop_write(list<write_item> &items)
+{
+ ceph_assert(write_lock.is_locked());
+ for (auto &&i : items) {
+ if (logger) {
+ logger->inc(l_filestore_journal_queue_bytes, i.orig_len);
+ logger->inc(l_filestore_journal_queue_ops, 1);
+ }
+ }
+ Mutex::Locker locker(writeq_lock);
+ writeq.splice(writeq.begin(), items);
+}
+
+void FileJournal::commit_start(uint64_t seq)
+{
+ dout(10) << "commit_start" << dendl;
+
+ // was full?
+ switch (full_state) {
+ case FULL_NOTFULL:
+ break; // all good
+
+ case FULL_FULL:
+ if (seq >= journaled_seq) {
+ dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq "
+ << seq << " > journaled_seq " << journaled_seq
+ << ", moving to FULL_WAIT."
+ << dendl;
+ full_state = FULL_WAIT;
+ } else {
+ dout(1) << "FULL_FULL commit_start on seq "
+ << seq << " < journaled_seq " << journaled_seq
+ << ", remaining in FULL_FULL"
+ << dendl;
+ }
+ break;
+
+ case FULL_WAIT:
+ dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl;
+ full_state = FULL_NOTFULL;
+ plug_journal_completions = true;
+ break;
+ }
+}
+
+/*
+ *send discard command to joural block deivce
+ */
+void FileJournal::do_discard(int64_t offset, int64_t end)
+{
+ dout(10) << __func__ << " trim(" << offset << ", " << end << dendl;
+
+ offset = round_up_to(offset, block_size);
+ if (offset >= end)
+ return;
+ end = round_up_to(end - block_size, block_size);
+ ceph_assert(end >= offset);
+ if (offset < end) {
+ BlkDev blkdev(fd);
+ if (blkdev.discard(offset, end - offset) < 0) {
+ dout(1) << __func__ << "ioctl(BLKDISCARD) error:" << cpp_strerror(errno) << dendl;
+ }
+ }
+}
+
+void FileJournal::committed_thru(uint64_t seq)
+{
+ Mutex::Locker locker(write_lock);
+
+ auto released = throttle.flush(seq);
+ if (logger) {
+ logger->dec(l_filestore_journal_ops, released.first);
+ logger->dec(l_filestore_journal_bytes, released.second);
+ }
+
+ if (seq < last_committed_seq) {
+ dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
+ ceph_assert(seq >= last_committed_seq);
+ return;
+ }
+ if (seq == last_committed_seq) {
+ dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
+ return;
+ }
+
+ dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
+ last_committed_seq = seq;
+
+ // completions!
+ {
+ Mutex::Locker locker(finisher_lock);
+ queue_completions_thru(seq);
+ if (plug_journal_completions && seq >= header.start_seq) {
+ dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
+ plug_journal_completions = false;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+
+ // adjust start pointer
+ while (!journalq.empty() && journalq.front().first <= seq) {
+ journalq.pop_front();
+ }
+
+ int64_t old_start = header.start;
+ if (!journalq.empty()) {
+ header.start = journalq.front().second;
+ header.start_seq = journalq.front().first;
+ } else {
+ header.start = write_pos;
+ header.start_seq = seq + 1;
+ }
+
+ if (discard) {
+ dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl;
+ if (old_start < header.start)
+ do_discard(old_start, header.start - 1);
+ else {
+ do_discard(old_start, header.max_size - 1);
+ do_discard(get_top(), header.start - 1);
+ }
+ }
+
+ must_write_header = true;
+ print_header(header);
+
+ // committed but unjournaled items
+ while (!writeq_empty() && peek_write().seq <= seq) {
+ dout(15) << " dropping committed but unwritten seq " << peek_write().seq
+ << " len " << peek_write().bl.length()
+ << dendl;
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+
+ commit_cond.Signal();
+
+ dout(10) << "committed_thru done" << dendl;
+}
+
+
+void FileJournal::complete_write(uint64_t ops, uint64_t bytes)
+{
+ dout(5) << __func__ << " finished " << ops << " ops and "
+ << bytes << " bytes" << dendl;
+}
+
+int FileJournal::make_writeable()
+{
+ dout(10) << __func__ << dendl;
+ int r = set_throttle_params();
+ if (r < 0)
+ return r;
+
+ r = _open(true);
+ if (r < 0)
+ return r;
+
+ if (read_pos > 0)
+ write_pos = read_pos;
+ else
+ write_pos = get_top();
+ read_pos = 0;
+
+ must_write_header = true;
+
+ start_writer();
+ return 0;
+}
+
+int FileJournal::set_throttle_params()
+{
+ stringstream ss;
+ bool valid = throttle.set_params(
+ cct->_conf->journal_throttle_low_threshhold,
+ cct->_conf->journal_throttle_high_threshhold,
+ cct->_conf->filestore_expected_throughput_bytes,
+ cct->_conf->journal_throttle_high_multiple,
+ cct->_conf->journal_throttle_max_multiple,
+ header.max_size - get_top(),
+ &ss);
+
+ if (!valid) {
+ derr << "tried to set invalid params: "
+ << ss.str()
+ << dendl;
+ }
+ return valid ? 0 : -EINVAL;
+}
+
+const char** FileJournal::get_tracked_conf_keys() const
+{
+ static const char *KEYS[] = {
+ "journal_throttle_low_threshhold",
+ "journal_throttle_high_threshhold",
+ "journal_throttle_high_multiple",
+ "journal_throttle_max_multiple",
+ "filestore_expected_throughput_bytes",
+ NULL};
+ return KEYS;
+}
+
+void FileJournal::wrap_read_bl(
+ off64_t pos,
+ int64_t olen,
+ bufferlist* bl,
+ off64_t *out_pos
+ ) const
+{
+ while (olen > 0) {
+ while (pos >= header.max_size)
+ pos = pos + get_top() - header.max_size;
+
+ int64_t len;
+ if (pos + olen > header.max_size)
+ len = header.max_size - pos; // partial
+ else
+ len = olen; // rest
+
+ int64_t actual = ::lseek64(fd, pos, SEEK_SET);
+ ceph_assert(actual == pos);
+
+ bufferptr bp = buffer::create(len);
+ int r = safe_read_exact(fd, bp.c_str(), len);
+ if (r) {
+ derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned "
+ << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ bl->push_back(std::move(bp));
+ pos += len;
+ olen -= len;
+ }
+ if (pos >= header.max_size)
+ pos = pos + get_top() - header.max_size;
+ if (out_pos)
+ *out_pos = pos;
+}
+
+bool FileJournal::read_entry(
+ bufferlist &bl,
+ uint64_t &next_seq,
+ bool *corrupt)
+{
+ if (corrupt)
+ *corrupt = false;
+ uint64_t seq = next_seq;
+
+ if (!read_pos) {
+ dout(2) << "read_entry -- not readable" << dendl;
+ return false;
+ }
+
+ off64_t pos = read_pos;
+ off64_t next_pos = pos;
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result == SUCCESS) {
+ journalq.push_back( pair<uint64_t,off64_t>(seq, pos));
+ uint64_t amount_to_take =
+ next_pos > pos ?
+ next_pos - pos :
+ (header.max_size - pos) + (next_pos - get_top());
+ throttle.take(amount_to_take);
+ throttle.register_throttle_seq(next_seq, amount_to_take);
+ if (logger) {
+ logger->inc(l_filestore_journal_ops, 1);
+ logger->inc(l_filestore_journal_bytes, amount_to_take);
+ }
+ if (next_seq > seq) {
+ return false;
+ } else {
+ read_pos = next_pos;
+ next_seq = seq;
+ if (seq > journaled_seq)
+ journaled_seq = seq;
+ return true;
+ }
+ } else {
+ derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
+ }
+
+ if (seq && seq < header.committed_up_to) {
+ derr << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ if (cct->_conf->journal_ignore_corruption) {
+ if (corrupt)
+ *corrupt = true;
+ return false;
+ } else {
+ ceph_abort();
+ }
+ }
+
+ dout(2) << "No further valid entries found, journal is most likely valid"
+ << dendl;
+ return false;
+}
+
+FileJournal::read_entry_result FileJournal::do_read_entry(
+ off64_t init_pos,
+ off64_t *next_pos,
+ bufferlist *bl,
+ uint64_t *seq,
+ ostream *ss,
+ entry_header_t *_h) const
+{
+ off64_t cur_pos = init_pos;
+ bufferlist _bl;
+ if (!bl)
+ bl = &_bl;
+
+ // header
+ entry_header_t *h;
+ bufferlist hbl;
+ off64_t _next_pos;
+ wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos);
+ h = reinterpret_cast<entry_header_t *>(hbl.c_str());
+
+ if (!h->check_magic(cur_pos, header.get_fsid64())) {
+ dout(25) << "read_entry " << init_pos
+ << " : bad header magic, end of journal" << dendl;
+ if (ss)
+ *ss << "bad header magic";
+ if (next_pos)
+ *next_pos = init_pos + (4<<10); // check 4k ahead
+ return MAYBE_CORRUPT;
+ }
+ cur_pos = _next_pos;
+
+ // pad + body + pad
+ if (h->pre_pad)
+ cur_pos += h->pre_pad;
+
+ bl->clear();
+ wrap_read_bl(cur_pos, h->len, bl, &cur_pos);
+
+ if (h->post_pad)
+ cur_pos += h->post_pad;
+
+ // footer
+ entry_header_t *f;
+ bufferlist fbl;
+ wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos);
+ f = reinterpret_cast<entry_header_t *>(fbl.c_str());
+ if (memcmp(f, h, sizeof(*f))) {
+ if (ss)
+ *ss << "bad footer magic, partial entry";
+ if (next_pos)
+ *next_pos = cur_pos;
+ return MAYBE_CORRUPT;
+ }
+
+ if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal)
+ h->crc32c != 0) { // newer entry in old journal
+ uint32_t actual_crc = bl->crc32c(0);
+ if (actual_crc != h->crc32c) {
+ if (ss)
+ *ss << "header crc (" << h->crc32c
+ << ") doesn't match body crc (" << actual_crc << ")";
+ if (next_pos)
+ *next_pos = cur_pos;
+ return MAYBE_CORRUPT;
+ }
+ }
+
+ // yay!
+ dout(2) << "read_entry " << init_pos << " : seq " << h->seq
+ << " " << h->len << " bytes"
+ << dendl;
+
+ // ok!
+ if (seq)
+ *seq = h->seq;
+
+
+ if (next_pos)
+ *next_pos = cur_pos;
+
+ if (_h)
+ *_h = *h;
+
+ ceph_assert(cur_pos % header.alignment == 0);
+ return SUCCESS;
+}
+
+void FileJournal::reserve_throttle_and_backoff(uint64_t count)
+{
+ throttle.get(count);
+}
+
+void FileJournal::get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h)
+{
+ off64_t pos = header.start;
+ off64_t next_pos = pos;
+ bufferlist bl;
+ uint64_t seq = 0;
+ dout(2) << __func__ << dendl;
+ while (1) {
+ bl.clear();
+ pos = next_pos;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ 0,
+ h);
+ if (result == FAILURE || result == MAYBE_CORRUPT)
+ ceph_abort();
+ if (seq == wanted_seq) {
+ if (_pos)
+ *_pos = pos;
+ return;
+ }
+ }
+ ceph_abort(); // not reachable
+}
+
+void FileJournal::corrupt(
+ int wfd,
+ off64_t corrupt_at)
+{
+ dout(2) << __func__ << dendl;
+ if (corrupt_at >= header.max_size)
+ corrupt_at = corrupt_at + get_top() - header.max_size;
+
+ int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET);
+ ceph_assert(actual == corrupt_at);
+
+ char buf[10];
+ int r = safe_read_exact(fd, buf, 1);
+ ceph_assert(r == 0);
+
+ actual = ::lseek64(wfd, corrupt_at, SEEK_SET);
+ ceph_assert(actual == corrupt_at);
+
+ buf[0]++;
+ r = safe_write(wfd, buf, 1);
+ ceph_assert(r == 0);
+}
+
+void FileJournal::corrupt_payload(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos + sizeof(entry_header_t) + h.pre_pad;
+ corrupt(wfd, corrupt_at);
+}
+
+
+void FileJournal::corrupt_footer_magic(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos + sizeof(entry_header_t) + h.pre_pad +
+ h.len + h.post_pad +
+ (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
+ corrupt(wfd, corrupt_at);
+}
+
+
+void FileJournal::corrupt_header_magic(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos +
+ (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
+ corrupt(wfd, corrupt_at);
+}
+
+off64_t FileJournal::get_journal_size_estimate()
+{
+ off64_t size, start = header.start;
+ if (write_pos < start) {
+ size = (max_size - start) + write_pos;
+ } else {
+ size = write_pos - start;
+ }
+ dout(20) << __func__ << " journal size=" << size << dendl;
+ return size;
+}
+
+void FileJournal::get_devices(set<string> *ls)
+{
+ string dev_node;
+ BlkDev blkdev(fd);
+ if (int rc = blkdev.wholedisk(&dev_node); rc) {
+ return;
+ }
+ get_raw_devices(dev_node, ls);
+}
+
+void FileJournal::collect_metadata(map<string,string> *pm)
+{
+ BlkDev blkdev(fd);
+ char partition_path[PATH_MAX];
+ char dev_node[PATH_MAX];
+ if (blkdev.partition(partition_path, PATH_MAX)) {
+ (*pm)["backend_filestore_journal_partition_path"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_journal_partition_path"] = string(partition_path);
+ }
+ if (blkdev.wholedisk(dev_node, PATH_MAX)) {
+ (*pm)["backend_filestore_journal_dev_node"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_journal_dev_node"] = string(dev_node);
+ devname = dev_node;
+ }
+}
diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h
new file mode 100644
index 00000000..2313b4b8
--- /dev/null
+++ b/src/os/filestore/FileJournal.h
@@ -0,0 +1,556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
+
+#include <stdlib.h>
+#include <deque>
+using std::deque;
+
+#include "Journal.h"
+#include "common/config_fwd.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "JournalThrottle.h"
+#include "common/zipkin_trace.h"
+
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+/**
+ * Implements journaling on top of block device or file.
+ *
+ * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock)
+ */
+class FileJournal :
+ public Journal,
+ public md_config_obs_t {
+public:
+ /// Protected by finisher_lock
+ struct completion_item {
+ uint64_t seq;
+ Context *finish;
+ utime_t start;
+ TrackedOpRef tracked_op;
+ completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref)
+ : seq(o), finish(c), start(s), tracked_op(opref) {}
+ completion_item() : seq(0), finish(0), start(0) {}
+ };
+ struct write_item {
+ uint64_t seq;
+ bufferlist bl;
+ uint32_t orig_len;
+ TrackedOpRef tracked_op;
+ ZTracer::Trace trace;
+ write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
+ seq(s), orig_len(ol), tracked_op(opref) {
+ bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ }
+ write_item() : seq(0), orig_len(0) {}
+ };
+
+ Mutex finisher_lock;
+ Cond finisher_cond;
+ uint64_t journaled_seq;
+ bool plug_journal_completions;
+
+ Mutex writeq_lock;
+ Cond writeq_cond;
+ list<write_item> writeq;
+ bool writeq_empty();
+ write_item &peek_write();
+ void pop_write();
+ void batch_pop_write(list<write_item> &items);
+ void batch_unpop_write(list<write_item> &items);
+
+ Mutex completions_lock;
+ list<completion_item> completions;
+ bool completions_empty() {
+ Mutex::Locker l(completions_lock);
+ return completions.empty();
+ }
+ void batch_pop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.swap(items);
+ }
+ void batch_unpop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.splice(completions.begin(), items);
+ }
+ completion_item completion_peek_front() {
+ Mutex::Locker l(completions_lock);
+ ceph_assert(!completions.empty());
+ return completions.front();
+ }
+ void completion_pop_front() {
+ Mutex::Locker l(completions_lock);
+ ceph_assert(!completions.empty());
+ completions.pop_front();
+ }
+
+ int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) override;
+
+ void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) override;
+ /// End protected by finisher_lock
+
+ /*
+ * journal header
+ */
+ struct header_t {
+ enum {
+ FLAG_CRC = (1<<0),
+ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
+ };
+
+ uint64_t flags;
+ uuid_d fsid;
+ __u32 block_size;
+ __u32 alignment;
+ int64_t max_size; // max size of journal ring buffer
+ int64_t start; // offset of first entry
+ uint64_t committed_up_to; // committed up to
+
+ /**
+ * start_seq
+ *
+ * entry at header.start has sequence >= start_seq
+ *
+ * Generally, the entry at header.start will have sequence
+ * start_seq if it exists. The only exception is immediately
+ * after journal creation since the first sequence number is
+ * not known.
+ *
+ * If the first read on open fails, we can assume corruption
+ * if start_seq > committed_up_to because the entry would have
+ * a sequence >= start_seq and therefore > committed_up_to.
+ */
+ uint64_t start_seq;
+
+ header_t() :
+ flags(0), block_size(0), alignment(0), max_size(0), start(0),
+ committed_up_to(0), start_seq(0) {}
+
+ void clear() {
+ start = block_size;
+ }
+
+ uint64_t get_fsid64() const {
+ return *(uint64_t*)fsid.bytes();
+ }
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ __u32 v = 4;
+ encode(v, bl);
+ bufferlist em;
+ {
+ encode(flags, em);
+ encode(fsid, em);
+ encode(block_size, em);
+ encode(alignment, em);
+ encode(max_size, em);
+ encode(start, em);
+ encode(committed_up_to, em);
+ encode(start_seq, em);
+ }
+ encode(em, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ __u32 v;
+ decode(v, bl);
+ if (v < 2) { // normally 0, but conceivably 1
+ // decode old header_t struct (pre v0.40).
+ bl.advance(4u); // skip __u32 flags (it was unused by any old code)
+ flags = 0;
+ uint64_t tfsid;
+ decode(tfsid, bl);
+ *(uint64_t*)&fsid.bytes()[0] = tfsid;
+ *(uint64_t*)&fsid.bytes()[8] = tfsid;
+ decode(block_size, bl);
+ decode(alignment, bl);
+ decode(max_size, bl);
+ decode(start, bl);
+ committed_up_to = 0;
+ start_seq = 0;
+ return;
+ }
+ bufferlist em;
+ decode(em, bl);
+ auto t = em.cbegin();
+ decode(flags, t);
+ decode(fsid, t);
+ decode(block_size, t);
+ decode(alignment, t);
+ decode(max_size, t);
+ decode(start, t);
+
+ if (v > 2)
+ decode(committed_up_to, t);
+ else
+ committed_up_to = 0;
+
+ if (v > 3)
+ decode(start_seq, t);
+ else
+ start_seq = 0;
+ }
+ } header;
+
+ struct entry_header_t {
+ uint64_t seq; // fs op seq #
+ uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer.
+ uint32_t len;
+ uint32_t pre_pad, post_pad;
+ uint64_t magic1;
+ uint64_t magic2;
+
+ static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+ return (fsid ^ seq ^ len);
+ }
+ bool check_magic(off64_t pos, uint64_t fsid) {
+ return
+ magic1 == (uint64_t)pos &&
+ magic2 == (fsid ^ seq ^ len);
+ }
+ } __attribute__((__packed__, aligned(4)));
+
+ bool journalq_empty() { return journalq.empty(); }
+
+private:
+ string fn;
+
+ char *zero_buf;
+ off64_t max_size;
+ size_t block_size;
+ bool directio, aio, force_aio;
+ bool must_write_header;
+ off64_t write_pos; // byte where the next entry to be written will go
+ off64_t read_pos; //
+ bool discard; //for block journal whether support discard
+
+#ifdef HAVE_LIBAIO
+ /// state associated with an in-flight aio request
+ /// Protected by aio_lock
+ struct aio_info {
+ struct iocb iocb {};
+ bufferlist bl;
+ struct iovec *iov;
+ bool done;
+ uint64_t off, len; ///< these are for debug only
+ uint64_t seq; ///< seq number to complete on aio completion, if non-zero
+
+ aio_info(bufferlist& b, uint64_t o, uint64_t s)
+ : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
+ bl.claim(b);
+ }
+ ~aio_info() {
+ delete[] iov;
+ }
+ };
+ Mutex aio_lock;
+ Cond aio_cond;
+ Cond write_finish_cond;
+ io_context_t aio_ctx;
+ list<aio_info> aio_queue;
+ int aio_num, aio_bytes;
+ uint64_t aio_write_queue_ops;
+ uint64_t aio_write_queue_bytes;
+ /// End protected by aio_lock
+#endif
+
+ uint64_t last_committed_seq;
+ uint64_t journaled_since_start;
+
+ string devname;
+
+ /*
+ * full states cycle at the beginnging of each commit epoch, when commit_start()
+ * is called.
+ * FULL - we just filled up during this epoch.
+ * WAIT - we filled up last epoch; now we have to wait until everything during
+ * that epoch commits to the fs before we can start writing over it.
+ * NOTFULL - all good, journal away.
+ */
+ enum {
+ FULL_NOTFULL = 0,
+ FULL_FULL = 1,
+ FULL_WAIT = 2,
+ } full_state;
+
+ int fd;
+
+ // in journal
+ deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later.
+ uint64_t writing_seq;
+
+
+ // throttle
+ int set_throttle_params();
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed) override {
+ for (const char **i = get_tracked_conf_keys();
+ *i;
+ ++i) {
+ if (changed.count(string(*i))) {
+ set_throttle_params();
+ return;
+ }
+ }
+ }
+
+ void complete_write(uint64_t ops, uint64_t bytes);
+ JournalThrottle throttle;
+
+ // write thread
+ Mutex write_lock;
+ bool write_stop;
+ bool aio_stop;
+
+ Cond commit_cond;
+
+ int _open(bool wr, bool create=false);
+ int _open_block_device();
+ void _close(int fd) const;
+ int _open_file(int64_t oldsize, blksize_t blksize, bool create);
+ int _dump(ostream& out, bool simple);
+ void print_header(const header_t &hdr) const;
+ int read_header(header_t *hdr) const;
+ bufferptr prepare_header();
+ void start_writer();
+ void stop_writer();
+ void write_thread_entry();
+
+ void queue_completions_thru(uint64_t seq);
+
+ int check_for_full(uint64_t seq, off64_t pos, off64_t size);
+ int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee);
+ int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos,
+ uint64_t& orig_ops, uint64_t& orig_bytes);
+ void do_write(bufferlist& bl);
+
+ void write_finish_thread_entry();
+ void check_aio_completion();
+ void do_aio_write(bufferlist& bl);
+ int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq);
+
+
+ void check_align(off64_t pos, bufferlist& bl);
+ int write_bl(off64_t& pos, bufferlist& bl);
+
+ /// read len from journal starting at in_pos and wrapping up to len
+ void wrap_read_bl(
+ off64_t in_pos, ///< [in] start position
+ int64_t len, ///< [in] length to read
+ bufferlist* bl, ///< [out] result
+ off64_t *out_pos ///< [out] next position to read, will be wrapped
+ ) const;
+
+ void do_discard(int64_t offset, int64_t end);
+
+ class Writer : public Thread {
+ FileJournal *journal;
+ public:
+ explicit Writer(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_thread_entry();
+ return 0;
+ }
+ } write_thread;
+
+ class WriteFinisher : public Thread {
+ FileJournal *journal;
+ public:
+ explicit WriteFinisher(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_finish_thread_entry();
+ return 0;
+ }
+ } write_finish_thread;
+
+ off64_t get_top() const {
+ return round_up_to(sizeof(header), block_size);
+ }
+
+ ZTracer::Endpoint trace_endpoint;
+
+ public:
+ FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond,
+ const char *f, bool dio=false, bool ai=true, bool faio=false) :
+ Journal(cct, fsid, fin, sync_cond),
+ finisher_lock("FileJournal::finisher_lock", false, true, false),
+ journaled_seq(0),
+ plug_journal_completions(false),
+ writeq_lock("FileJournal::writeq_lock", false, true, false),
+ completions_lock(
+ "FileJournal::completions_lock", false, true, false),
+ fn(f),
+ zero_buf(NULL),
+ max_size(0), block_size(0),
+ directio(dio), aio(ai), force_aio(faio),
+ must_write_header(false),
+ write_pos(0), read_pos(0),
+ discard(false),
+#ifdef HAVE_LIBAIO
+ aio_lock("FileJournal::aio_lock"),
+ aio_ctx(0),
+ aio_num(0), aio_bytes(0),
+ aio_write_queue_ops(0),
+ aio_write_queue_bytes(0),
+#endif
+ last_committed_seq(0),
+ journaled_since_start(0),
+ full_state(FULL_NOTFULL),
+ fd(-1),
+ writing_seq(0),
+ throttle(cct->_conf->filestore_caller_concurrency),
+ write_lock("FileJournal::write_lock", false, true, false),
+ write_stop(true),
+ aio_stop(true),
+ write_thread(this),
+ write_finish_thread(this),
+ trace_endpoint("0.0.0.0", 0, "FileJournal") {
+
+ if (aio && !directio) {
+ lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+ aio = false;
+ }
+#ifndef HAVE_LIBAIO
+ if (aio && ::getenv("CEPH_DEV") == NULL) {
+ lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+ aio = false;
+ }
+#endif
+
+ cct->_conf.add_observer(this);
+ }
+ ~FileJournal() override {
+ ceph_assert(fd == -1);
+ delete[] zero_buf;
+ cct->_conf.remove_observer(this);
+ }
+
+ int check() override;
+ int create() override;
+ int open(uint64_t fs_op_seq) override;
+ void close() override;
+ int peek_fsid(uuid_d& fsid);
+
+ int dump(ostream& out) override;
+ int simple_dump(ostream& out);
+ int _fdump(Formatter &f, bool simple);
+
+ void flush() override;
+
+ void get_devices(set<string> *ls) override;
+ void collect_metadata(map<string,string> *pm) override;
+
+ void reserve_throttle_and_backoff(uint64_t count) override;
+
+ bool is_writeable() override {
+ return read_pos == 0;
+ }
+ int make_writeable() override;
+
+ // writes
+ void commit_start(uint64_t seq) override;
+ void committed_thru(uint64_t seq) override;
+ bool should_commit_now() override {
+ return full_state != FULL_NOTFULL && !write_stop;
+ }
+
+ void write_header_sync();
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ off64_t get_journal_size_estimate();
+
+ // reads
+
+ /// Result code for read_entry
+ enum read_entry_result {
+ SUCCESS,
+ FAILURE,
+ MAYBE_CORRUPT
+ };
+
+ /**
+ * read_entry
+ *
+ * Reads next entry starting at pos. If the entry appears
+ * clean, *bl will contain the payload, *seq will contain
+ * the sequence number, and *out_pos will reflect the next
+ * read position. If the entry is invalid *ss will contain
+ * debug text, while *seq, *out_pos, and *bl will be unchanged.
+ *
+ * If the entry suggests a corrupt log, *ss will contain debug
+ * text, *out_pos will contain the next index to check. If
+ * we find an entry in this way that returns SUCCESS, the journal
+ * is most likely corrupt.
+ */
+ read_entry_result do_read_entry(
+ off64_t pos, ///< [in] position to read
+ off64_t *next_pos, ///< [out] next position to read
+ bufferlist* bl, ///< [out] payload for successful read
+ uint64_t *seq, ///< [out] seq of successful read
+ ostream *ss, ///< [out] error output
+ entry_header_t *h = 0 ///< [out] header
+ ) const; ///< @return result code
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq,
+ bool *corrupt
+ );
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq) override {
+ return read_entry(bl, last_seq, 0);
+ }
+
+ // Debug/Testing
+ void get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h);
+ void corrupt(
+ int wfd,
+ off64_t corrupt_at);
+ void corrupt_payload(
+ int wfd,
+ uint64_t seq);
+ void corrupt_footer_magic(
+ int wfd,
+ uint64_t seq);
+ void corrupt_header_magic(
+ int wfd,
+ uint64_t seq);
+};
+
+WRITE_CLASS_ENCODER(FileJournal::header_t)
+
+#endif
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
new file mode 100644
index 00000000..d387947e
--- /dev/null
+++ b/src/os/filestore/FileStore.cc
@@ -0,0 +1,6425 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "boost/tuple/tuple.hpp"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#include <linux/falloc.h>
+#endif
+
+#include <iostream>
+#include <map>
+
+#include "include/linux_fiemap.h"
+
+#include "common/xattr.h"
+#include "chain_xattr.h"
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+
+#include <fstream>
+#include <sstream>
+
+#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
+#include "XfsFileStoreBackend.h"
+#include "ZFSFileStoreBackend.h"
+#include "common/BackTrace.h"
+#include "include/types.h"
+#include "FileJournal.h"
+
+#include "osd/osd_types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/run_cmd.h"
+#include "common/safe_io.h"
+#include "common/perf_counters.h"
+#include "common/sync_filesystem.h"
+#include "common/fd.h"
+#include "HashIndex.h"
+#include "DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+#include "common/ceph_crypto.h"
+using ceph::crypto::SHA1;
+
+#include "include/ceph_assert.h"
+
+#include "common/config.h"
+#include "common/blkdev.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore(" << basedir << ") "
+
+#define COMMIT_SNAP_ITEM "snap_%llu"
+#define CLUSTER_SNAP_ITEM "clustersnap_%s"
+
+#define REPLAY_GUARD_XATTR "user.cephos.seq"
+#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+
+// XATTR_SPILL_OUT_NAME as a xattr is used to maintain that indicates whether
+// xattrs spill over into DBObjectMap, if XATTR_SPILL_OUT_NAME exists in file
+// xattrs and the value is "no", it indicates no xattrs in DBObjectMap
+#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
+#define XATTR_NO_SPILL_OUT "0"
+#define XATTR_SPILL_OUT "1"
+#define __FUNC__ __func__ << "(" << __LINE__ << ")"
+
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+//Features are added here that this FileStore supports.
+static CompatSet get_fs_supported_compat_set() {
+ CompatSet compat = get_fs_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
+int FileStore::validate_hobject_key(const hobject_t &obj) const
+{
+ unsigned len = LFNIndex::get_max_escaped_name_len(obj);
+ return len > m_filestore_max_xattr_value_size ? -ENAMETOOLONG : 0;
+}
+
+int FileStore::get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid)
+{
+ // make sure we don't try to use aio or direct_io (and get annoying
+ // error messages from failing to do so); performance implications
+ // should be irrelevant for this use
+ FileJournal j(cct, *fsid, 0, 0, path.c_str(), false, false);
+ return j.peek_fsid(*fsid);
+}
+
+void FileStore::FSPerfTracker::update_from_perfcounters(
+ PerfCounters &logger)
+{
+ os_commit_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_filestore_journal_latency));
+ os_apply_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_filestore_apply_latency));
+}
+
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s)
+{
+ return out << "osr(" << s.cid << ")";
+}
+
+int FileStore::get_cdir(const coll_t& cid, char *s, int len)
+{
+ const string &cid_str(cid.to_str());
+ return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str());
+}
+
+void FileStore::handle_eio()
+{
+ // don't try to map this back to an offset; too hard since there is
+ // a file system in between. we also don't really know whether this
+ // was a read or a write, since we have so many layers beneath us.
+ // don't even try.
+ note_io_error_event(devname.c_str(), basedir.c_str(), -EIO, 0, 0, 0);
+ ceph_abort_msg("unexpected eio error");
+}
+
+int FileStore::get_index(const coll_t& cid, Index *index)
+{
+ int r = index_manager.get_index(cid, basedir, index);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::init_index(const coll_t& cid)
+{
+ char path[PATH_MAX];
+ get_cdir(cid, path, sizeof(path));
+ int r = index_manager.init_index(cid, path, target_version);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path)
+{
+ IndexedPath path2;
+ if (!path)
+ path = &path2;
+ int r, exist;
+ ceph_assert(index.index);
+ r = (index.index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+ return 0;
+}
+
+int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length)
+{
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0)
+ return r;
+ r = ::ftruncate(**fd, length);
+ if (r < 0)
+ r = -errno;
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_truncate(**fd, length);
+ ceph_assert(rc >= 0);
+ }
+ lfn_close(fd);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf)
+{
+ IndexedPath path;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ r = lfn_find(oid, index, &path);
+ if (r < 0)
+ return r;
+ r = ::stat(path->path(), buf);
+ if (r < 0)
+ r = -errno;
+ return r;
+}
+
+int FileStore::lfn_open(const coll_t& cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index)
+{
+ ceph_assert(outfd);
+ int r = 0;
+ bool need_lock = true;
+ int flags = O_RDWR;
+
+ if (create)
+ flags |= O_CREAT;
+ if (cct->_conf->filestore_odsync_write) {
+ flags |= O_DSYNC;
+ }
+
+ Index index2;
+ if (!index) {
+ index = &index2;
+ }
+ if (!((*index).index)) {
+ r = get_index(cid, index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ return r;
+ }
+ } else {
+ need_lock = false;
+ }
+
+ int fd, exist;
+ ceph_assert((*index).index);
+ if (need_lock) {
+ ((*index).index)->access_lock.get_write();
+ }
+ if (!replaying) {
+ *outfd = fdcache.lookup(oid);
+ if (*outfd) {
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+ return 0;
+ }
+ }
+
+
+ IndexedPath path2;
+ IndexedPath *path = &path2;
+
+ r = (*index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ derr << "could not find " << oid << " in index: "
+ << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+
+ r = ::open((*path)->path(), flags|O_CLOEXEC, 0644);
+ if (r < 0) {
+ r = -errno;
+ dout(10) << "error opening file " << (*path)->path() << " with flags="
+ << flags << ": " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ fd = r;
+ if (create && (!exist)) {
+ r = (*index)->created(oid, (*path)->path());
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error creating " << oid << " (" << (*path)->path()
+ << ") in index: " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ r = chain_fsetxattr<true, true>(
+ fd, XATTR_SPILL_OUT_NAME,
+ XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT));
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
+ << "):" << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ }
+
+ if (!replaying) {
+ bool existed;
+ *outfd = fdcache.add(oid, fd, &existed);
+ if (existed) {
+ TEMP_FAILURE_RETRY(::close(fd));
+ }
+ } else {
+ *outfd = std::make_shared<FDCache::FD>(fd);
+ }
+
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+
+ return 0;
+
+ fail:
+
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+void FileStore::lfn_close(FDRef fd)
+{
+}
+
+int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid)
+{
+ Index index_new, index_old;
+ IndexedPath path_new, path_old;
+ int exist;
+ int r;
+ bool index_same = false;
+ if (c < newcid) {
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ } else if (c == newcid) {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ index_new = index_old;
+ index_same = true;
+ } else {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ }
+
+ ceph_assert(index_old.index);
+ ceph_assert(index_new.index);
+
+ if (!index_same) {
+
+ RWLock::RLocker l1((index_old.index)->access_lock);
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ RWLock::WLocker l2((index_new.index)->access_lock);
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+ dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ } else {
+ RWLock::WLocker l1((index_old.index)->access_lock);
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+ dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ // make sure old fd for unlinked/overwritten file is gone
+ fdcache.clear(newoid);
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ }
+ return 0;
+}
+
+int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o,
+ const SequencerPosition &spos,
+ bool force_clear_omap)
+{
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ceph_assert(index.index);
+ RWLock::WLocker l((index.index)->access_lock);
+
+ {
+ IndexedPath path;
+ int hardlink;
+ r = index->lookup(o, &path, &hardlink);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+
+ if (!force_clear_omap) {
+ if (hardlink == 0 || hardlink == 1) {
+ force_clear_omap = true;
+ }
+ }
+ if (force_clear_omap) {
+ dout(20) << __FUNC__ << ": clearing omap on " << o
+ << " in cid " << cid << dendl;
+ r = object_map->clear(o, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (cct->_conf->filestore_debug_inject_read_err) {
+ debug_obj_on_delete(o);
+ }
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ }
+ fdcache.clear(o);
+ } else {
+ /* Ensure that replay of this op doesn't result in the object_map
+ * going away.
+ */
+ if (!backend->can_checkpoint())
+ object_map->sync(&o, &spos);
+ }
+ if (hardlink == 0) {
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ }
+ return 0;
+ }
+ }
+ r = index->unlink(o);
+ if (r < 0) {
+ dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+FileStore::FileStore(CephContext* cct, const std::string &base,
+ const std::string &jdev, osflagbits_t flags,
+ const char *name, bool do_update) :
+ JournalingObjectStore(cct, base),
+ internal_name(name),
+ basedir(base), journalpath(jdev),
+ generic_flags(flags),
+ blk_size(0),
+ fsid_fd(-1), op_fd(-1),
+ basedir_fd(-1), current_fd(-1),
+ backend(nullptr),
+ index_manager(cct, do_update),
+ lock("FileStore::lock"),
+ force_sync(false),
+ sync_entry_timeo_lock("FileStore::sync_entry_timeo_lock"),
+ timer(cct, sync_entry_timeo_lock),
+ stop(false), sync_thread(this),
+ coll_lock("FileStore::coll_lock"),
+ fdcache(cct),
+ wbthrottle(cct),
+ next_osr_id(0),
+ m_disable_wbthrottle(cct->_conf->filestore_odsync_write ||
+ !cct->_conf->filestore_wbthrottle_enable),
+ throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency),
+ throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency),
+ m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads),
+ m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads),
+ op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"),
+ op_wq(this, cct->_conf->filestore_op_thread_timeout,
+ cct->_conf->filestore_op_thread_suicide_timeout, &op_tp),
+ logger(nullptr),
+ trace_endpoint("0.0.0.0", 0, "FileStore"),
+ read_error_lock("FileStore::read_error_lock"),
+ m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout),
+ m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ),
+ m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing),
+ m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead),
+ m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold),
+ m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval),
+ m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval),
+ m_filestore_fail_eio(cct->_conf->filestore_fail_eio),
+ m_filestore_fadvise(cct->_conf->filestore_fadvise),
+ do_update(do_update),
+ m_journal_dio(cct->_conf->journal_dio),
+ m_journal_aio(cct->_conf->journal_aio),
+ m_journal_force_aio(cct->_conf->journal_force_aio),
+ m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap),
+ m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap),
+ m_filestore_do_dump(false),
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size),
+ m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size),
+ m_fs_type(0),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0),
+ m_filestore_max_xattr_value_size(0)
+{
+ m_filestore_kill_at = cct->_conf->filestore_kill_at;
+ for (int i = 0; i < m_ondisk_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-ondisk-" << i;
+ Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore");
+ ondisk_finishers.push_back(f);
+ }
+ for (int i = 0; i < m_apply_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-apply-" << i;
+ Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore");
+ apply_finishers.push_back(f);
+ }
+
+ ostringstream oss;
+ oss << basedir << "/current";
+ current_fn = oss.str();
+
+ ostringstream sss;
+ sss << basedir << "/current/commit_op_seq";
+ current_op_seq_fn = sss.str();
+
+ ostringstream omss;
+ if (cct->_conf->filestore_omap_backend_path != "") {
+ omap_dir = cct->_conf->filestore_omap_backend_path;
+ } else {
+ omss << basedir << "/current/omap";
+ omap_dir = omss.str();
+ }
+
+ // initialize logger
+ PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last);
+
+ plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue");
+ plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied");
+ plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue");
+ plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied");
+ plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency",
+ NULL, PerfCountersBuilder::PRIO_USEFUL);
+ plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs");
+ plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written");
+ plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
+ plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue");
+ plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store");
+ plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
+ plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue");
+ plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store");
+ plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency");
+ plb.add_u64(l_filestore_committing, "committing", "Is currently committing");
+
+ plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles");
+ plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits");
+ plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit");
+ plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full");
+ plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg",
+ "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL);
+ plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs");
+
+ logger = plb.create_perf_counters();
+
+ cct->get_perfcounters_collection()->add(logger);
+ cct->_conf.add_observer(this);
+
+ superblock.compat_features = get_fs_initial_compat_set();
+}
+
+FileStore::~FileStore()
+{
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ delete *it;
+ *it = nullptr;
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ delete *it;
+ *it = nullptr;
+ }
+ cct->_conf.remove_observer(this);
+ cct->get_perfcounters_collection()->remove(logger);
+
+ if (journal)
+ journal->logger = nullptr;
+ delete logger;
+ logger = nullptr;
+
+ if (m_filestore_do_dump) {
+ dump_stop();
+ }
+}
+
+static void get_attrname(const char *name, char *buf, int len)
+{
+ snprintf(buf, len, "user.ceph.%s", name);
+}
+
+bool parse_attrname(char **name)
+{
+ if (strncmp(*name, "user.ceph.", 10) == 0) {
+ *name += 10;
+ return true;
+ }
+ return false;
+}
+
+void FileStore::collect_metadata(map<string,string> *pm)
+{
+ char partition_path[PATH_MAX];
+ char dev_node[PATH_MAX];
+
+ (*pm)["filestore_backend"] = backend->get_name();
+ ostringstream ss;
+ ss << "0x" << std::hex << m_fs_type << std::dec;
+ (*pm)["filestore_f_type"] = ss.str();
+
+ if (cct->_conf->filestore_collect_device_partition_information) {
+ int rc = 0;
+ BlkDev blkdev(fsid_fd);
+ if (rc = blkdev.partition(partition_path, PATH_MAX); rc) {
+ (*pm)["backend_filestore_partition_path"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_partition_path"] = string(partition_path);
+ }
+ if (rc = blkdev.wholedisk(dev_node, PATH_MAX); rc) {
+ (*pm)["backend_filestore_dev_node"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_dev_node"] = string(dev_node);
+ devname = dev_node;
+ }
+ if (rc == 0 && vdo_fd >= 0) {
+ (*pm)["vdo"] = "true";
+ (*pm)["vdo_physical_size"] =
+ stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks"));
+ }
+ if (journal) {
+ journal->collect_metadata(pm);
+ }
+ }
+}
+
+int FileStore::get_devices(set<string> *ls)
+{
+ string dev_node;
+ BlkDev blkdev(fsid_fd);
+ if (int rc = blkdev.wholedisk(&dev_node); rc) {
+ return rc;
+ }
+ get_raw_devices(dev_node, ls);
+ if (journal) {
+ journal->get_devices(ls);
+ }
+ return 0;
+}
+
+int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts)
+{
+ struct statfs buf;
+ buf0->reset();
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ if (::statfs(basedir.c_str(), &buf) < 0) {
+ int r = -errno;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ ceph_assert(r != -ENOENT);
+ return r;
+ }
+
+ uint64_t bfree = buf.f_bavail * buf.f_bsize;
+
+ // assume all of leveldb/rocksdb is omap.
+ {
+ map<string,uint64_t> kv_usage;
+ buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage);
+ }
+
+ uint64_t thin_total, thin_avail;
+ if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) {
+ buf0->total = thin_total;
+ bfree = std::min(bfree, thin_avail);
+ buf0->allocated = thin_total - thin_avail;
+ buf0->data_stored = bfree;
+ } else {
+ buf0->total = buf.f_blocks * buf.f_bsize;
+ buf0->allocated = bfree;
+ buf0->data_stored = bfree;
+ }
+ buf0->available = bfree;
+
+ // FIXME: we don't know how to populate buf->internal_metadata; XFS doesn't
+ // tell us what its internal overhead is.
+
+ // Adjust for writes pending in the journal
+ if (journal) {
+ uint64_t estimate = journal->get_journal_size_estimate();
+ buf0->internally_reserved = estimate;
+ if (buf0->available > estimate)
+ buf0->available -= estimate;
+ else
+ buf0->available = 0;
+ }
+
+ return 0;
+}
+
+int FileStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
+{
+ return -ENOTSUP;
+}
+
+void FileStore::new_journal()
+{
+ if (journalpath.length()) {
+ dout(10) << "open_journal at " << journalpath << dendl;
+ journal = new FileJournal(cct, fsid, &finisher, &sync_cond,
+ journalpath.c_str(),
+ m_journal_dio, m_journal_aio,
+ m_journal_force_aio);
+ if (journal)
+ journal->logger = logger;
+ }
+ return;
+}
+
+int FileStore::dump_journal(ostream& out)
+{
+ int r;
+
+ if (!journalpath.length())
+ return -EINVAL;
+
+ FileJournal *journal = new FileJournal(cct, fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio);
+ r = journal->dump(out);
+ delete journal;
+ journal = nullptr;
+ return r;
+}
+
+FileStoreBackend *FileStoreBackend::create(unsigned long f_type, FileStore *fs)
+{
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ return new BtrfsFileStoreBackend(fs);
+# ifdef HAVE_LIBXFS
+ case XFS_SUPER_MAGIC:
+ return new XfsFileStoreBackend(fs);
+# endif
+#endif
+#ifdef HAVE_LIBZFS
+ case ZFS_SUPER_MAGIC:
+ return new ZFSFileStoreBackend(fs);
+#endif
+ default:
+ return new GenericFileStoreBackend(fs);
+ }
+}
+
+void FileStore::create_backend(unsigned long f_type)
+{
+ m_fs_type = f_type;
+
+ ceph_assert(!backend);
+ backend = FileStoreBackend::create(f_type, this);
+
+ dout(0) << "backend " << backend->get_name()
+ << " (magic 0x" << std::hex << f_type << std::dec << ")"
+ << dendl;
+
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ if (!m_disable_wbthrottle){
+ wbthrottle.set_fs(WBThrottle::BTRFS);
+ }
+ break;
+
+ case XFS_SUPER_MAGIC:
+ // wbthrottle is constructed with fs(WBThrottle::XFS)
+ break;
+#endif
+ }
+
+ set_xattr_limits_via_conf();
+}
+
+int FileStore::mkfs()
+{
+ int ret = 0;
+ char fsid_fn[PATH_MAX];
+ char fsid_str[40];
+ uuid_d old_fsid;
+ uuid_d old_omap_fsid;
+
+ dout(1) << "mkfs in " << basedir << dendl;
+ basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+ if (basedir_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // open+lock fsid
+ snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str());
+ fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+ if (fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ if (lock_fsid() < 0) {
+ ret = -EBUSY;
+ goto close_fsid_fd;
+ }
+
+ if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl;
+ }
+
+ fsid.print(fsid_str);
+ strcat(fsid_str, "\n");
+ ret = ::ftruncate(fsid_fd, 0);
+ if (ret < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to truncate fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str));
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ if (::fsync(fsid_fd) < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": close failed: can't write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ dout(10) << __FUNC__ << ": fsid is " << fsid << dendl;
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+ fsid = old_fsid;
+ dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl;
+ }
+
+ // version stamp
+ ret = write_version_stamp();
+ if (ret < 0) {
+ derr << __FUNC__ << ": write_version_stamp() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ // superblock
+ superblock.omap_backend = cct->_conf->filestore_omap_backend;
+ ret = write_superblock();
+ if (ret < 0) {
+ derr << __FUNC__ << ": write_superblock() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ struct statfs basefs;
+ ret = ::fstatfs(basedir_fd, &basefs);
+ if (ret < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": cannot fstatfs basedir "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+#if defined(__linux__)
+ if (basefs.f_type == BTRFS_SUPER_MAGIC &&
+ !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+ derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+ goto close_fsid_fd;
+ }
+#endif
+
+ create_backend(basefs.f_type);
+
+ ret = backend->create_current();
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ // write initial op_seq
+ {
+ uint64_t initial_seq = 0;
+ int fd = read_op_seq(&initial_seq);
+ if (fd < 0) {
+ ret = fd;
+ derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ if (initial_seq == 0) {
+ ret = write_op_seq(fd, 1);
+ if (ret < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (backend->can_checkpoint()) {
+ // create snap_1 too
+ current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+ ceph_assert(current_fd >= 0);
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
+ ret = backend->create_checkpoint(s, nullptr);
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ if (ret < 0 && ret != -EEXIST) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << __FUNC__ << ": failed to create snap_1: " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ }
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir);
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl;
+ goto close_fsid_fd;
+ }
+ // create fsid under omap
+ // open+lock fsid
+ int omap_fsid_fd;
+ char omap_fsid_fn[PATH_MAX];
+ snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str());
+ omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+ if (omap_fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) {
+ ceph_assert(!fsid.is_zero());
+ fsid.print(fsid_str);
+ strcat(fsid_str, "\n");
+ ret = ::ftruncate(omap_fsid_fd, 0);
+ if (ret < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to truncate fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_omap_fsid_fd;
+ }
+ ret = safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str));
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_omap_fsid_fd;
+ }
+ dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl;
+ if (::fsync(omap_fsid_fd) < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": close failed: can't write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_omap_fsid_fd;
+ }
+ dout(10) << "mkfs omap fsid is " << fsid << dendl;
+ } else {
+ if (fsid != old_omap_fsid) {
+ derr << __FUNC__ << ": " << omap_fsid_fn
+ << " has existed omap fsid " << old_omap_fsid
+ << " != expected osd fsid " << fsid
+ << dendl;
+ ret = -EINVAL;
+ goto close_omap_fsid_fd;
+ }
+ dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl;
+ }
+
+ dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl;
+
+ // journal?
+ ret = mkjournal();
+ if (ret)
+ goto close_omap_fsid_fd;
+
+ ret = write_meta("type", "filestore");
+ if (ret)
+ goto close_omap_fsid_fd;
+
+ dout(1) << "mkfs done in " << basedir << dendl;
+ ret = 0;
+
+ close_omap_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+ close_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ close_basedir_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ delete backend;
+ backend = nullptr;
+ return ret;
+}
+
+int FileStore::mkjournal()
+{
+ // read fsid
+ int ret;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ ret = read_fsid(fd, &fsid);
+ if (ret < 0) {
+ derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+ ret = 0;
+
+ new_journal();
+ if (journal) {
+ ret = journal->check();
+ if (ret < 0) {
+ ret = journal->create();
+ if (ret)
+ derr << __FUNC__ << ": error creating journal on " << journalpath
+ << ": " << cpp_strerror(ret) << dendl;
+ else
+ dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl;
+ }
+ delete journal;
+ journal = nullptr;
+ }
+ return ret;
+}
+
+int FileStore::read_fsid(int fd, uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0)
+ return ret;
+ if (ret == 8) {
+ // old 64-bit fsid... mirror it.
+ *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str;
+ *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str;
+ return 0;
+ }
+
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str))
+ return -EINVAL;
+ return 0;
+}
+
+int FileStore::lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ l.l_start = 0;
+ l.l_len = 0;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? "
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool FileStore::test_mount_in_use()
+{
+ dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
+
+ // verify fs isn't in use
+
+ fsid_fd = ::open(fn, O_RDWR|O_CLOEXEC, 0644);
+ if (fsid_fd < 0)
+ return 0; // no fsid, ok.
+ bool inuse = lock_fsid() < 0;
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ return inuse;
+}
+
+bool FileStore::is_rotational()
+{
+ bool rotational;
+ if (backend) {
+ rotational = backend->is_rotational();
+ } else {
+ int fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return true;
+ struct statfs st;
+ int r = ::fstatfs(fd, &st);
+ ::close(fd);
+ if (r < 0) {
+ return true;
+ }
+ create_backend(st.f_type);
+ rotational = backend->is_rotational();
+ delete backend;
+ backend = nullptr;
+ }
+ dout(10) << __func__ << " " << (int)rotational << dendl;
+ return rotational;
+}
+
+bool FileStore::is_journal_rotational()
+{
+ bool journal_rotational;
+ if (backend) {
+ journal_rotational = backend->is_journal_rotational();
+ } else {
+ int fd = ::open(journalpath.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return true;
+ struct statfs st;
+ int r = ::fstatfs(fd, &st);
+ ::close(fd);
+ if (r < 0) {
+ return true;
+ }
+ create_backend(st.f_type);
+ journal_rotational = backend->is_journal_rotational();
+ delete backend;
+ backend = nullptr;
+ }
+ dout(10) << __func__ << " " << (int)journal_rotational << dendl;
+ return journal_rotational;
+}
+
+int FileStore::_detect_fs()
+{
+ struct statfs st;
+ int r = ::fstatfs(basedir_fd, &st);
+ if (r < 0)
+ return -errno;
+
+ blk_size = st.f_bsize;
+
+#if defined(__linux__)
+ if (st.f_type == BTRFS_SUPER_MAGIC &&
+ !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+ derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+ return -EPERM;
+ }
+#endif
+
+ create_backend(st.f_type);
+
+ r = backend->detect_features();
+ if (r < 0) {
+ derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // vdo
+ {
+ char dev_node[PATH_MAX];
+ if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) {
+ vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name);
+ if (vdo_fd >= 0) {
+ dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node
+ << dendl;
+ }
+ }
+ }
+
+ // test xattrs
+ char fn[PATH_MAX];
+ int x = rand();
+ int y = x+1;
+ snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str());
+ int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0700);
+ if (tmpfd < 0) {
+ int ret = -errno;
+ derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x));
+ if (ret >= 0)
+ ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y));
+ if ((ret < 0) || (x != y)) {
+ derr << "Extended attributes don't appear to work. ";
+ if (ret)
+ *_dout << "Got error " + cpp_strerror(ret) + ". ";
+ *_dout << "If you are using ext3 or ext4, be sure to mount the underlying "
+ << "file system with the 'user_xattr' option." << dendl;
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+ return -ENOTSUP;
+ }
+
+ char buf[1000];
+ memset(buf, 0, sizeof(buf)); // shut up valgrind
+ chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
+ ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
+ if (ret == -ENOSPC) {
+ dout(0) << "limited size xattrs" << dendl;
+ }
+ chain_fremovexattr(tmpfd, "user.test");
+ chain_fremovexattr(tmpfd, "user.test2");
+ chain_fremovexattr(tmpfd, "user.test3");
+ chain_fremovexattr(tmpfd, "user.test4");
+ chain_fremovexattr(tmpfd, "user.test5");
+
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+
+ return 0;
+}
+
+int FileStore::_sanity_check_fs()
+{
+ // sanity check(s)
+
+ if (((int)m_filestore_journal_writeahead +
+ (int)m_filestore_journal_parallel +
+ (int)m_filestore_journal_trailing) > 1) {
+ dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl;
+ cerr << TEXT_RED
+ << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n"
+ << " is enabled in ceph.conf. You must choose a single journal mode."
+ << TEXT_NORMAL << std::endl;
+ return -EINVAL;
+ }
+
+ if (!backend->can_checkpoint()) {
+ if (!journal || !m_filestore_journal_writeahead) {
+ dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
+ cerr << TEXT_RED
+ << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n"
+ << " For non-btrfs volumes, a writeahead journal is required to\n"
+ << " maintain on-disk consistency in the event of a crash. Your conf\n"
+ << " should include something like:\n"
+ << " osd journal = /path/to/journal_device_or_file\n"
+ << " filestore journal writeahead = true\n"
+ << TEXT_NORMAL;
+ }
+ }
+
+ if (!journal) {
+ dout(0) << "mount WARNING: no journal" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: No osd journal is configured: write latency may be high.\n"
+ << " If you will not be using an osd journal, write latency may be\n"
+ << " relatively high. It can be reduced somewhat by lowering\n"
+ << " filestore_max_sync_interval, but lower values mean lower write\n"
+ << " throughput, especially with spinning disks.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+int FileStore::write_superblock()
+{
+ bufferlist bl;
+ encode(superblock, bl);
+ return safe_write_file(basedir.c_str(), "superblock",
+ bl.c_str(), bl.length(), 0600);
+}
+
+int FileStore::read_superblock()
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "superblock",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ // If the file doesn't exist write initial CompatSet
+ return write_superblock();
+ }
+ return ret;
+ }
+
+ bufferlist bl;
+ bl.push_back(std::move(bp));
+ auto i = bl.cbegin();
+ decode(superblock, i);
+ return 0;
+}
+
+int FileStore::update_version_stamp()
+{
+ return write_version_stamp();
+}
+
+int FileStore::version_stamp_is_valid(uint32_t *version)
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ return ret;
+ }
+ bufferlist bl;
+ bl.push_back(std::move(bp));
+ auto i = bl.cbegin();
+ decode(*version, i);
+ dout(10) << __FUNC__ << ": was " << *version << " vs target "
+ << target_version << dendl;
+ if (*version == target_version)
+ return 1;
+ else
+ return 0;
+}
+
+int FileStore::flush_cache(ostream *os)
+{
+ string drop_caches_file = "/proc/sys/vm/drop_caches";
+ int drop_caches_fd = ::open(drop_caches_file.c_str(), O_WRONLY|O_CLOEXEC), ret = 0;
+ char buf[2] = "3";
+ size_t len = strlen(buf);
+
+ if (drop_caches_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+ if (os) {
+ *os << "FileStore flush_cache: failed to open " << drop_caches_file << ": " << cpp_strerror(ret);
+ }
+ return ret;
+ }
+
+ if (::write(drop_caches_fd, buf, len) < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to write to " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+ if (os) {
+ *os << "FileStore flush_cache: failed to write to " << drop_caches_file << ": " << cpp_strerror(ret);
+ }
+ goto out;
+ }
+
+out:
+ ::close(drop_caches_fd);
+ return ret;
+}
+
+int FileStore::write_version_stamp()
+{
+ dout(1) << __FUNC__ << ": " << target_version << dendl;
+ bufferlist bl;
+ encode(target_version, bl);
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length(), 0600);
+}
+
+int FileStore::upgrade()
+{
+ dout(1) << __FUNC__ << dendl;
+ uint32_t version;
+ int r = version_stamp_is_valid(&version);
+
+ if (r == -ENOENT) {
+ derr << "The store_version file doesn't exist." << dendl;
+ return -EINVAL;
+ }
+ if (r < 0)
+ return r;
+ if (r == 1)
+ return 0;
+
+ if (version < 3) {
+ derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl;
+ return -EINVAL;
+ }
+
+ // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to
+ // open up DBObjectMap with the do_upgrade flag, which we already did.
+ update_version_stamp();
+ return 0;
+}
+
+int FileStore::read_op_seq(uint64_t *seq)
+{
+ int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR|O_CLOEXEC, 0644);
+ if (op_fd < 0) {
+ int r = -errno;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ char s[40];
+ memset(s, 0, sizeof(s));
+ int ret = safe_read(op_fd, s, sizeof(s) - 1);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+ }
+ *seq = atoll(s);
+ return op_fd;
+}
+
+int FileStore::write_op_seq(int fd, uint64_t seq)
+{
+ char s[30];
+ snprintf(s, sizeof(s), "%" PRId64 "\n", seq);
+ int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0));
+ if (ret < 0) {
+ ret = -errno;
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ }
+ return ret;
+}
+
+int FileStore::mount()
+{
+ int ret;
+ char buf[PATH_MAX];
+ uint64_t initial_op_seq;
+ uuid_d omap_fsid;
+ set<string> cluster_snaps;
+ CompatSet supported_compat_set = get_fs_supported_compat_set();
+
+ dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
+
+ ret = set_throttle_params();
+ if (ret != 0)
+ goto done;
+
+ // make sure global base dir exists
+ if (::access(basedir.c_str(), R_OK | W_OK)) {
+ ret = -errno;
+ derr << __FUNC__ << ": unable to access basedir '" << basedir << "': "
+ << cpp_strerror(ret) << dendl;
+ goto done;
+ }
+
+ // get fsid
+ snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
+ fsid_fd = ::open(buf, O_RDWR|O_CLOEXEC, 0644);
+ if (fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": error opening '" << buf << "': "
+ << cpp_strerror(ret) << dendl;
+ goto done;
+ }
+
+ ret = read_fsid(fsid_fd, &fsid);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret)
+ << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (lock_fsid() < 0) {
+ derr << __FUNC__ << ": lock_fsid failed" << dendl;
+ ret = -EBUSY;
+ goto close_fsid_fd;
+ }
+
+ dout(10) << "mount fsid is " << fsid << dendl;
+
+
+ uint32_t version_stamp;
+ ret = version_stamp_is_valid(&version_stamp);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error in version_stamp_is_valid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ } else if (ret == 0) {
+ if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) {
+ derr << __FUNC__ << ": stale version stamp detected: "
+ << version_stamp
+ << ". Proceeding, do_update "
+ << "is set, performing disk format upgrade."
+ << dendl;
+ do_update = true;
+ } else {
+ ret = -EINVAL;
+ derr << __FUNC__ << ": stale version stamp " << version_stamp
+ << ". Please run the FileStore update script before starting the "
+ << "OSD, or set filestore_update_to to " << target_version
+ << " (currently " << cct->_conf->filestore_update_to << ")"
+ << dendl;
+ goto close_fsid_fd;
+ }
+ }
+
+ ret = read_superblock();
+ if (ret < 0) {
+ goto close_fsid_fd;
+ }
+
+ // Check if this FileStore supports all the necessary features to mount
+ if (supported_compat_set.compare(superblock.compat_features) == -1) {
+ derr << __FUNC__ << ": Incompatible features set "
+ << superblock.compat_features << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // open some dir handles
+ basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+ if (basedir_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << basedir << ": "
+ << cpp_strerror(ret) << dendl;
+ basedir_fd = -1;
+ goto close_fsid_fd;
+ }
+
+ // test for btrfs, xattrs, etc.
+ ret = _detect_fs();
+ if (ret < 0) {
+ derr << __FUNC__ << ": error in _detect_fs: "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ {
+ list<string> ls;
+ ret = backend->list_checkpoints(ls);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ long long unsigned c, prev = 0;
+ char clustersnap[NAME_MAX];
+ for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
+ if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
+ ceph_assert(c > prev);
+ prev = c;
+ snaps.push_back(c);
+ } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
+ cluster_snaps.insert(*it);
+ }
+ }
+
+ if (m_osd_rollback_to_cluster_snap.length() &&
+ cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
+ derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
+ ret = -ENOENT;
+ goto close_basedir_fd;
+ }
+
+ char nosnapfn[200];
+ snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
+
+ if (backend->can_checkpoint()) {
+ if (snaps.empty()) {
+ dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
+ } else {
+ char s[NAME_MAX];
+ uint64_t curr_seq = 0;
+
+ if (m_osd_rollback_to_cluster_snap.length()) {
+ derr << TEXT_RED
+ << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
+ << TEXT_NORMAL
+ << dendl;
+ ceph_assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
+ } else {
+ {
+ int fd = read_op_seq(&curr_seq);
+ if (fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ }
+ if (curr_seq)
+ dout(10) << " current/ seq was " << curr_seq << dendl;
+ else
+ dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
+
+ uint64_t cp = snaps.back();
+ dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
+
+ // if current/ is marked as non-snapshotted, refuse to roll
+ // back (without clear direction) to avoid throwing out new
+ // data.
+ struct stat st;
+ if (::stat(nosnapfn, &st) == 0) {
+ if (!m_osd_use_stale_snap) {
+ derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
+ derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
+ derr << "config option for --osd-use-stale-snap startup argument." << dendl;
+ ret = -ENOTSUP;
+ goto close_basedir_fd;
+ }
+ derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
+ << ", newest snap is " << cp << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: forcing the use of stale snapshot data **"
+ << TEXT_NORMAL << std::endl;
+ }
+
+ dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl;
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ }
+
+ // drop current?
+ ret = backend->rollback_to(s);
+ if (ret) {
+ derr << __FUNC__ << ": error rolling back to " << s << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+ }
+ }
+ initial_op_seq = 0;
+
+ current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (current_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ ceph_assert(current_fd >= 0);
+
+ op_fd = read_op_seq(&initial_op_seq);
+ if (op_fd < 0) {
+ ret = op_fd;
+ derr << __FUNC__ << ": read_op_seq failed" << dendl;
+ goto close_current_fd;
+ }
+
+ dout(5) << "mount op_seq is " << initial_op_seq << dendl;
+ if (initial_op_seq == 0) {
+ derr << "mount initial op seq is 0; something is wrong" << dendl;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+
+ if (!backend->can_checkpoint()) {
+ // mark current/ as non-snapshotted so that we don't rollback away
+ // from it.
+ int r = ::creat(nosnapfn, 0644);
+ if (r < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to create current/nosnap" << dendl;
+ goto close_current_fd;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(r));
+ } else {
+ // clear nosnap marker, if present.
+ ::unlink(nosnapfn);
+ }
+
+ // check fsid with omap
+ // get omap fsid
+ char omap_fsid_buf[PATH_MAX];
+ struct ::stat omap_fsid_stat;
+ snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str());
+ // if osd_uuid not exists, assume as this omap matchs corresponding osd
+ if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0){
+ dout(10) << __FUNC__ << ": osd_uuid not found under omap, "
+ << "assume as matched."
+ << dendl;
+ } else {
+ int omap_fsid_fd;
+ // if osd_uuid exists, compares osd_uuid with fsid
+ omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY|O_CLOEXEC, 0644);
+ if (omap_fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': "
+ << cpp_strerror(ret)
+ << dendl;
+ goto close_current_fd;
+ }
+ ret = read_fsid(omap_fsid_fd, &omap_fsid);
+ VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+ if (ret < 0) {
+ derr << __FUNC__ << ": error reading omap_fsid_fd"
+ << ", omap_fsid = " << omap_fsid
+ << cpp_strerror(ret)
+ << dendl;
+ goto close_current_fd;
+ }
+ if (fsid != omap_fsid) {
+ derr << __FUNC__ << ": " << omap_fsid_buf
+ << " has existed omap fsid " << omap_fsid
+ << " != expected osd fsid " << fsid
+ << dendl;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+ }
+
+ dout(0) << "start omap initiation" << dendl;
+ if (!(generic_flags & SKIP_MOUNT_OMAP)) {
+ KeyValueDB * omap_store = KeyValueDB::create(cct,
+ superblock.omap_backend,
+ omap_dir);
+ if (!omap_store)
+ {
+ derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl;
+ ret = -1;
+ goto close_current_fd;
+ }
+
+ if (superblock.omap_backend == "rocksdb")
+ ret = omap_store->init(cct->_conf->filestore_rocksdb_options);
+ else
+ ret = omap_store->init();
+
+ if (ret < 0) {
+ derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl;
+ goto close_current_fd;
+ }
+
+ stringstream err;
+ if (omap_store->create_and_open(err)) {
+ delete omap_store;
+ omap_store = nullptr;
+ derr << __FUNC__ << ": Error initializing " << superblock.omap_backend
+ << " : " << err.str() << dendl;
+ ret = -1;
+ goto close_current_fd;
+ }
+
+ DBObjectMap *dbomap = new DBObjectMap(cct, omap_store);
+ ret = dbomap->init(do_update);
+ if (ret < 0) {
+ delete dbomap;
+ dbomap = nullptr;
+ derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl;
+ goto close_current_fd;
+ }
+ stringstream err2;
+
+ if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
+ derr << err2.str() << dendl;
+ delete dbomap;
+ dbomap = nullptr;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+ object_map.reset(dbomap);
+ }
+
+ // journal
+ new_journal();
+
+ // select journal mode?
+ if (journal) {
+ if (!m_filestore_journal_writeahead &&
+ !m_filestore_journal_parallel &&
+ !m_filestore_journal_trailing) {
+ if (!backend->can_checkpoint()) {
+ m_filestore_journal_writeahead = true;
+ dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
+ } else {
+ m_filestore_journal_parallel = true;
+ dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
+ }
+ } else {
+ if (m_filestore_journal_writeahead)
+ dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
+ if (m_filestore_journal_parallel)
+ dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl;
+ if (m_filestore_journal_trailing)
+ dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl;
+ }
+ if (m_filestore_journal_writeahead)
+ journal->set_wait_on_full(true);
+ } else {
+ dout(0) << __FUNC__ << ": no journal" << dendl;
+ }
+
+ ret = _sanity_check_fs();
+ if (ret) {
+ derr << __FUNC__ << ": _sanity_check_fs failed with error "
+ << ret << dendl;
+ goto close_current_fd;
+ }
+
+ // Cleanup possibly invalid collections
+ {
+ vector<coll_t> collections;
+ ret = list_collections(collections, true);
+ if (ret < 0) {
+ derr << "Error " << ret << " while listing collections" << dendl;
+ goto close_current_fd;
+ }
+ for (vector<coll_t>::iterator i = collections.begin();
+ i != collections.end();
+ ++i) {
+ Index index;
+ ret = get_index(*i, &index);
+ if (ret < 0) {
+ derr << "Unable to mount index " << *i
+ << " with error: " << ret << dendl;
+ goto close_current_fd;
+ }
+ ceph_assert(index.index);
+ RWLock::WLocker l((index.index)->access_lock);
+
+ index->cleanup();
+ }
+ }
+ if (!m_disable_wbthrottle) {
+ wbthrottle.start();
+ } else {
+ dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl;
+ if (cct->_conf->filestore_odsync_write) {
+ dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl;
+ }
+ }
+ sync_thread.create("filestore_sync");
+
+ if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
+ ret = journal_replay(initial_op_seq);
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
+ if (ret == -ENOTTY) {
+ derr << "maybe journal is not pointing to a block device and its size "
+ << "wasn't configured?" << dendl;
+ }
+
+ goto stop_sync;
+ }
+ }
+
+ {
+ stringstream err2;
+ if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) {
+ derr << err2.str() << dendl;
+ ret = -EINVAL;
+ goto stop_sync;
+ }
+ }
+
+ init_temp_collections();
+
+ journal_start();
+
+ op_tp.start();
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->start();
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->start();
+ }
+
+ timer.init();
+
+ // upgrade?
+ if (cct->_conf->filestore_update_to >= (int)get_target_version()) {
+ int err = upgrade();
+ if (err < 0) {
+ derr << "error converting store" << dendl;
+ umount();
+ return err;
+ }
+ }
+
+ // all okay.
+ return 0;
+
+stop_sync:
+ // stop sync thread
+ lock.Lock();
+ stop = true;
+ sync_cond.Signal();
+ lock.Unlock();
+ sync_thread.join();
+ if (!m_disable_wbthrottle) {
+ wbthrottle.stop();
+ }
+close_current_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+close_basedir_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ basedir_fd = -1;
+close_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+done:
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ delete backend;
+ backend = nullptr;
+ object_map.reset();
+ return ret;
+}
+
+void FileStore::init_temp_collections()
+{
+ dout(10) << __FUNC__ << dendl;
+ vector<coll_t> ls;
+ int r = list_collections(ls, true);
+ ceph_assert(r >= 0);
+
+ dout(20) << " ls " << ls << dendl;
+
+ SequencerPosition spos;
+
+ set<coll_t> temps;
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p)
+ if (p->is_temp())
+ temps.insert(*p);
+ dout(20) << " temps " << temps << dendl;
+
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ if (p->is_temp())
+ continue;
+ coll_map[*p] = new OpSequencer(cct, ++next_osr_id, *p);
+ if (p->is_meta())
+ continue;
+ coll_t temp = p->get_temp();
+ if (temps.count(temp)) {
+ temps.erase(temp);
+ } else {
+ dout(10) << __FUNC__ << ": creating " << temp << dendl;
+ r = _create_collection(temp, 0, spos);
+ ceph_assert(r == 0);
+ }
+ }
+
+ for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) {
+ dout(10) << __FUNC__ << ": removing stray " << *p << dendl;
+ r = _collection_remove_recursive(*p, spos);
+ ceph_assert(r == 0);
+ }
+}
+
+int FileStore::umount()
+{
+ dout(5) << __FUNC__ << ": " << basedir << dendl;
+
+ flush();
+ sync();
+ do_force_sync();
+
+ {
+ Mutex::Locker l(coll_lock);
+ coll_map.clear();
+ }
+
+ lock.Lock();
+ stop = true;
+ sync_cond.Signal();
+ lock.Unlock();
+ sync_thread.join();
+ if (!m_disable_wbthrottle){
+ wbthrottle.stop();
+ }
+ op_tp.stop();
+
+ journal_stop();
+ if (!(generic_flags & SKIP_JOURNAL_REPLAY))
+ journal_write_close();
+
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->stop();
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->stop();
+ }
+
+ if (vdo_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
+ vdo_fd = -1;
+ }
+ if (fsid_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ }
+ if (op_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+ op_fd = -1;
+ }
+ if (current_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+ }
+ if (basedir_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ basedir_fd = -1;
+ }
+
+ force_sync = false;
+
+ delete backend;
+ backend = nullptr;
+
+ object_map.reset();
+
+ {
+ Mutex::Locker l(sync_entry_timeo_lock);
+ timer.shutdown();
+ }
+
+ // nothing
+ return 0;
+}
+
+
+/// -----------------------------
+
+// keep OpSequencer handles alive for all time so that a sequence
+// that removes a collection and creates a new one will not allow
+// two sequencers for the same collection to be alive at once.
+
+ObjectStore::CollectionHandle FileStore::open_collection(const coll_t& c)
+{
+ Mutex::Locker l(coll_lock);
+ auto p = coll_map.find(c);
+ if (p == coll_map.end()) {
+ return CollectionHandle();
+ }
+ return p->second;
+}
+
+ObjectStore::CollectionHandle FileStore::create_new_collection(const coll_t& c)
+{
+ Mutex::Locker l(coll_lock);
+ auto p = coll_map.find(c);
+ if (p == coll_map.end()) {
+ auto *r = new OpSequencer(cct, ++next_osr_id, c);
+ coll_map[c] = r;
+ return r;
+ } else {
+ return p->second;
+ }
+}
+
+
+/// -----------------------------
+
+FileStore::Op *FileStore::build_op(vector<Transaction>& tls,
+ Context *onreadable,
+ Context *onreadable_sync,
+ TrackedOpRef osd_op)
+{
+ uint64_t bytes = 0, ops = 0;
+ for (vector<Transaction>::iterator p = tls.begin();
+ p != tls.end();
+ ++p) {
+ bytes += (*p).get_num_bytes();
+ ops += (*p).get_num_ops();
+ }
+
+ Op *o = new Op;
+ o->start = ceph_clock_now();
+ o->tls = std::move(tls);
+ o->onreadable = onreadable;
+ o->onreadable_sync = onreadable_sync;
+ o->ops = ops;
+ o->bytes = bytes;
+ o->osd_op = osd_op;
+ return o;
+}
+
+
+
+void FileStore::queue_op(OpSequencer *osr, Op *o)
+{
+ // queue op on sequencer, then queue sequencer for the threadpool,
+ // so that regardless of which order the threads pick up the
+ // sequencer, the op order will be preserved.
+
+ osr->queue(o);
+ o->trace.event("queued");
+
+ logger->inc(l_filestore_ops);
+ logger->inc(l_filestore_bytes, o->bytes);
+
+ dout(5) << __FUNC__ << ": " << o << " seq " << o->op
+ << " " << *osr
+ << " " << o->bytes << " bytes"
+ << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
+ << dendl;
+ op_wq.queue(osr);
+}
+
+void FileStore::op_queue_reserve_throttle(Op *o)
+{
+ throttle_ops.get();
+ throttle_bytes.get(o->bytes);
+
+ logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
+ logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
+}
+
+void FileStore::op_queue_release_throttle(Op *o)
+{
+ throttle_ops.put();
+ throttle_bytes.put(o->bytes);
+ logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
+ logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
+}
+
+void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
+{
+ if (!m_disable_wbthrottle) {
+ wbthrottle.throttle();
+ }
+ // inject a stall?
+ if (cct->_conf->filestore_inject_stall) {
+ int orig = cct->_conf->filestore_inject_stall;
+ dout(5) << __FUNC__ << ": filestore_inject_stall " << orig << ", sleeping" << dendl;
+ sleep(orig);
+ cct->_conf.set_val("filestore_inject_stall", "0");
+ dout(5) << __FUNC__ << ": done stalling" << dendl;
+ }
+
+ osr->apply_lock.Lock();
+ Op *o = osr->peek_queue();
+ o->trace.event("op_apply_start");
+ apply_manager.op_apply_start(o->op);
+ dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " start" << dendl;
+ o->trace.event("_do_transactions start");
+ int r = _do_transactions(o->tls, o->op, &handle, osr->osr_name);
+ o->trace.event("op_apply_finish");
+ apply_manager.op_apply_finish(o->op);
+ dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " r = " << r
+ << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl;
+}
+
+void FileStore::_finish_op(OpSequencer *osr)
+{
+ list<Context*> to_queue;
+ Op *o = osr->dequeue(&to_queue);
+
+ o->tls.clear();
+
+ utime_t lat = ceph_clock_now();
+ lat -= o->start;
+
+ dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " lat " << lat << dendl;
+ osr->apply_lock.Unlock(); // locked in _do_op
+ o->trace.event("_finish_op");
+
+ // called with tp lock held
+ op_queue_release_throttle(o);
+
+ logger->tinc(l_filestore_apply_latency, lat);
+
+ if (o->onreadable_sync) {
+ o->onreadable_sync->complete(0);
+ }
+ if (o->onreadable) {
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
+ }
+ if (!to_queue.empty()) {
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
+ }
+ delete o;
+ o = nullptr;
+}
+
+struct C_JournaledAhead : public Context {
+ FileStore *fs;
+ FileStore::OpSequencer *osr;
+ FileStore::Op *o;
+ Context *ondisk;
+
+ C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk):
+ fs(f), osr(os), o(o), ondisk(ondisk) { }
+ void finish(int r) override {
+ fs->_journaled_ahead(osr, o, ondisk);
+ }
+};
+
+int FileStore::queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+ TrackedOpRef osd_op,
+ ThreadPool::TPHandle *handle)
+{
+ Context *onreadable;
+ Context *ondisk;
+ Context *onreadable_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &onreadable, &ondisk, &onreadable_sync);
+
+ if (cct->_conf->objectstore_blackhole) {
+ dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction"
+ << dendl;
+ delete ondisk;
+ ondisk = nullptr;
+ delete onreadable;
+ onreadable = nullptr;
+ delete onreadable_sync;
+ onreadable_sync = nullptr;
+ return 0;
+ }
+
+ utime_t start = ceph_clock_now();
+
+ OpSequencer *osr = static_cast<OpSequencer*>(ch.get());
+ dout(5) << __FUNC__ << ": osr " << osr << " " << *osr << dendl;
+
+ ZTracer::Trace trace;
+ if (osd_op && osd_op->pg_trace) {
+ osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace);
+ trace = osd_op->store_trace;
+ }
+
+ if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+
+ //prepare and encode transactions data out of lock
+ bufferlist tbl;
+ int orig_len = journal->prepare_entry(o->tls, &tbl);
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ op_queue_reserve_throttle(o);
+ journal->reserve_throttle_and_backoff(tbl.length());
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+ trace.keyval("opnum", op_num);
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ if (m_filestore_journal_parallel) {
+ dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl;
+
+ trace.keyval("journal mode", "parallel");
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
+
+ // queue inside submit_manager op submission lock
+ queue_op(osr, o);
+ trace.event("op queued");
+ } else if (m_filestore_journal_writeahead) {
+ dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl;
+
+ osr->queue_journal(o);
+
+ trace.keyval("journal mode", "writeahead");
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, o->op,
+ new C_JournaledAhead(this, osr, o, ondisk),
+ osd_op);
+ } else {
+ ceph_abort();
+ }
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return 0;
+ }
+
+ if (!journal) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+ dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl;
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ op_queue_reserve_throttle(o);
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ queue_op(osr, o);
+ trace.keyval("opnum", op_num);
+ trace.keyval("journal mode", "none");
+ trace.event("op queued");
+
+ if (ondisk)
+ apply_manager.add_waiter(op_num, ondisk);
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return 0;
+ }
+
+ ceph_assert(journal);
+ //prepare and encode transactions data out of lock
+ bufferlist tbl;
+ int orig_len = -1;
+ if (journal->is_writeable()) {
+ orig_len = journal->prepare_entry(tls, &tbl);
+ }
+ uint64_t op = submit_manager.op_submit_start();
+ dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl;
+
+ if (m_filestore_do_dump)
+ dump_transactions(tls, op, osr);
+
+ trace.event("op_apply_start");
+ trace.keyval("opnum", op);
+ trace.keyval("journal mode", "trailing");
+ apply_manager.op_apply_start(op);
+ trace.event("do_transactions");
+ int r = do_transactions(tls, op);
+
+ if (r >= 0) {
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
+ } else {
+ delete ondisk;
+ ondisk = nullptr;
+ }
+
+ // start on_readable finisher after we queue journal item, as on_readable callback
+ // is allowed to delete the Transaction
+ if (onreadable_sync) {
+ onreadable_sync->complete(r);
+ }
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
+
+ submit_manager.op_submit_finish(op);
+ trace.event("op_apply_finish");
+ apply_manager.op_apply_finish(op);
+
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return r;
+}
+
+void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
+{
+ dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
+
+ o->trace.event("writeahead journal finished");
+
+ // this should queue in order because the journal does it's completions in order.
+ queue_op(osr, o);
+
+ list<Context*> to_queue;
+ osr->dequeue_journal(&to_queue);
+
+ // do ondisk completions async, to prevent any onreadable_sync completions
+ // getting blocked behind an ondisk completion.
+ if (ondisk) {
+ dout(10) << " queueing ondisk " << ondisk << dendl;
+ ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
+ }
+ if (!to_queue.empty()) {
+ ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue);
+ }
+}
+
+int FileStore::_do_transactions(
+ vector<Transaction> &tls,
+ uint64_t op_seq,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name)
+{
+ int trans_num = 0;
+
+ for (vector<Transaction>::iterator p = tls.begin();
+ p != tls.end();
+ ++p, trans_num++) {
+ _do_transaction(*p, op_seq, trans_num, handle, osr_name);
+ if (handle)
+ handle->reset_tp_timeout();
+ }
+
+ return 0;
+}
+
+void FileStore::_set_global_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ // sync all previous operations on this sequencer
+ int ret = object_map->sync();
+ if (ret < 0) {
+ derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+ ret = sync_filesystem(basedir_fd);
+ if (ret < 0) {
+ derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v;
+ encode(spos, v);
+ int r = chain_fsetxattr<true, true>(
+ fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
+ << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+int FileStore::_check_global_replay_guard(const coll_t& cid,
+ const SequencerPosition& spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+ return 1; // if collection does not exist, there is no guard, and we can replay.
+ }
+
+ char buf[100];
+ int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf));
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": no xattr" << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return 1; // no xattr
+ }
+ bufferlist bl;
+ bl.append(buf, r);
+
+ SequencerPosition opos;
+ auto p = bl.cbegin();
+ decode(opos, p);
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return spos >= opos ? 1 : -1;
+}
+
+
+void FileStore::_set_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos,
+ bool in_progress=false)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_set_replay_guard failed");
+ }
+ _set_replay_guard(fd, spos, 0, in_progress);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+
+void FileStore::_set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *hoid,
+ bool in_progress)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl;
+
+ _inject_failure();
+
+ // first make sure the previous operation commits
+ int r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ if (!in_progress) {
+ // sync object_map too. even if this object has a header or keys,
+ // it have had them in the past and then removed them, so always
+ // sync.
+ object_map->sync(hoid, &spos);
+ }
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v(40);
+ encode(spos, v);
+ encode(in_progress, v);
+ r = chain_fsetxattr<true, true>(
+ fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+void FileStore::_close_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_close_replay_guard failed");
+ }
+ _close_replay_guard(fd, spos);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos,
+ const ghobject_t *hoid)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << __FUNC__ << ": " << spos << dendl;
+
+ _inject_failure();
+
+ // sync object_map too. even if this object has a header or keys,
+ // it have had them in the past and then removed them, so always
+ // sync.
+ object_map->sync(hoid, &spos);
+
+ // then record that we are done with this operation
+ bufferlist v(40);
+ encode(spos, v);
+ bool in_progress = false;
+ encode(in_progress, v);
+ int r = chain_fsetxattr<true, true>(
+ fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid,
+ const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ int r = _check_global_replay_guard(cid, spos);
+ if (r < 0)
+ return r;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl;
+ return 1; // if file does not exist, there is no guard, and we can replay.
+ }
+ int ret = _check_replay_guard(**fd, spos);
+ lfn_close(fd);
+ return ret;
+}
+
+int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+ return 1; // if collection does not exist, there is no guard, and we can replay.
+ }
+ int ret = _check_replay_guard(fd, spos);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+}
+
+int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ char buf[100];
+ int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf));
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": no xattr" << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return 1; // no xattr
+ }
+ bufferlist bl;
+ bl.append(buf, r);
+
+ SequencerPosition opos;
+ auto p = bl.cbegin();
+ decode(opos, p);
+ bool in_progress = false;
+ if (!p.end()) // older journals don't have this
+ decode(in_progress, p);
+ if (opos > spos) {
+ dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos
+ << ", now or in future, SKIPPING REPLAY" << dendl;
+ return -1;
+ } else if (opos == spos) {
+ if (in_progress) {
+ dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+ << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
+ return 0;
+ } else {
+ dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+ << ", in_progress=false, SKIPPING REPLAY" << dendl;
+ return -1;
+ }
+ } else {
+ dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos
+ << ", in past, will replay" << dendl;
+ return 1;
+ }
+}
+
+void FileStore::_do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name)
+{
+ dout(10) << __FUNC__ << ": on " << &t << dendl;
+
+ Transaction::iterator i = t.begin();
+
+ SequencerPosition spos(op_seq, trans_num, 0);
+ while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ _inject_failure();
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, touch_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _touch(cid, oid);
+ tracepoint(objectstore, touch_exit, r);
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, write_enter, osr_name, off, len);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ tracepoint(objectstore, write_exit, r);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ tracepoint(objectstore, zero_enter, osr_name, off, len);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _zero(cid, oid, off, len);
+ tracepoint(objectstore, zero_exit, r);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ tracepoint(objectstore, truncate_enter, osr_name, off);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _truncate(cid, oid, off);
+ tracepoint(objectstore, truncate_exit, r);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, remove_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _remove(cid, oid, spos);
+ tracepoint(objectstore, remove_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, setattr_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0) {
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set, spos);
+ if (r == -ENOSPC)
+ dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid
+ << " name " << name << " size " << bl.length() << dendl;
+ }
+ tracepoint(objectstore, setattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ tracepoint(objectstore, setattrs_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _setattrs(cid, oid, aset, spos);
+ tracepoint(objectstore, setattrs_exit, r);
+ if (r == -ENOSPC)
+ dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl;
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ string name = i.decode_string();
+ tracepoint(objectstore, rmattr_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _rmattr(cid, oid, name.c_str(), spos);
+ tracepoint(objectstore, rmattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, rmattrs_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _rmattrs(cid, oid, spos);
+ tracepoint(objectstore, rmattrs_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ const ghobject_t &noid = i.get_oid(op->dest_oid);
+ tracepoint(objectstore, clone_enter, osr_name);
+ r = _clone(cid, oid, noid, spos);
+ tracepoint(objectstore, clone_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const ghobject_t &noid = i.get_oid(op->dest_oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ tracepoint(objectstore, clone_range_enter, osr_name, len);
+ r = _clone_range(cid, oid, ncid, noid, off, len, off, spos);
+ tracepoint(objectstore, clone_range_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const ghobject_t &noid = i.get_oid(op->dest_oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+ _cid : _cid.get_temp();
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ tracepoint(objectstore, clone_range2_enter, osr_name, len);
+ r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos);
+ tracepoint(objectstore, clone_range2_exit, r);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ tracepoint(objectstore, mkcoll_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _create_collection(cid, op->split_bits, spos);
+ tracepoint(objectstore, mkcoll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_SET_BITS:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ int bits = op->split_bits;
+ r = _collection_set_bits(cid, bits);
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ uint32_t type = op->hint_type;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ if (_check_replay_guard(cid, spos) > 0) {
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+ }
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ tracepoint(objectstore, rmcoll_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _destroy_collection(cid);
+ tracepoint(objectstore, rmcoll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ const coll_t &ocid = i.get_cid(op->cid);
+ const coll_t &ncid = i.get_cid(op->dest_cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+
+ ceph_assert(oid.hobj.pool >= -1);
+
+ // always followed by OP_COLL_REMOVE
+ Transaction::Op *op2 = i.decode_op();
+ const coll_t &ocid2 = i.get_cid(op2->cid);
+ const ghobject_t &oid2 = i.get_oid(op2->oid);
+ ceph_assert(op2->op == Transaction::OP_COLL_REMOVE);
+ ceph_assert(ocid2 == ocid);
+ ceph_assert(oid2 == oid);
+
+ tracepoint(objectstore, coll_add_enter);
+ r = _collection_add(ncid, ocid, oid, spos);
+ tracepoint(objectstore, coll_add_exit, r);
+ spos.op++;
+ if (r < 0)
+ break;
+ tracepoint(objectstore, coll_remove_enter, osr_name);
+ if (_check_replay_guard(ocid, oid, spos) > 0)
+ r = _remove(ocid, oid, spos);
+ tracepoint(objectstore, coll_remove_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ {
+ // WARNING: this is deprecated and buggy; only here to replay old journals.
+ const coll_t &ocid = i.get_cid(op->cid);
+ const coll_t &ncid = i.get_cid(op->dest_cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ tracepoint(objectstore, coll_move_enter);
+ r = _collection_add(ocid, ncid, oid, spos);
+ if (r == 0 &&
+ (_check_replay_guard(ocid, oid, spos) > 0))
+ r = _remove(ocid, oid, spos);
+ tracepoint(objectstore, coll_move_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ const coll_t &_oldcid = i.get_cid(op->cid);
+ const ghobject_t &oldoid = i.get_oid(op->oid);
+ const coll_t &_newcid = i.get_cid(op->dest_cid);
+ const ghobject_t &newoid = i.get_oid(op->dest_oid);
+ const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ?
+ _oldcid : _oldcid.get_temp();
+ const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ?
+ _oldcid : _newcid.get_temp();
+ tracepoint(objectstore, coll_move_rename_enter);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+ tracepoint(objectstore, coll_move_rename_exit, r);
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oldoid = i.get_oid(op->oid);
+ const ghobject_t &newoid = i.get_oid(op->dest_oid);
+ const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ?
+ _cid : _cid.get_temp();
+ const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, coll_try_rename_enter);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true);
+ tracepoint(objectstore, coll_try_rename_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ case Transaction::OP_COLL_RMATTR:
+ ceph_abort_msg("collection attr methods no longer implemented");
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ r = -EOPNOTSUPP;
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, omap_clear_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_clear(cid, oid, spos);
+ tracepoint(objectstore, omap_clear_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ map<string, bufferlist> aset;
+ i.decode_attrset(aset);
+ tracepoint(objectstore, omap_setkeys_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_setkeys(cid, oid, aset, spos);
+ tracepoint(objectstore, omap_setkeys_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ set<string> keys;
+ i.decode_keyset(keys);
+ tracepoint(objectstore, omap_rmkeys_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_rmkeys(cid, oid, keys, spos);
+ tracepoint(objectstore, omap_rmkeys_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_rmkeyrange(cid, oid, first, last, spos);
+ tracepoint(objectstore, omap_rmkeyrange_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, omap_setheader_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_setheader(cid, oid, bl, spos);
+ tracepoint(objectstore, omap_setheader_exit, r);
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ ceph_abort_msg("not legacy journal; upgrade to firefly first");
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ tracepoint(objectstore, split_coll2_enter, osr_name);
+ r = _split_collection(cid, bits, rem, dest, spos);
+ tracepoint(objectstore, split_coll2_exit, r);
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ coll_t dest = i.get_cid(op->dest_cid);
+ tracepoint(objectstore, merge_coll_enter, osr_name);
+ r = _merge_collection(cid, bits, dest, spos);
+ tracepoint(objectstore, merge_coll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ tracepoint(objectstore, setallochint_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _set_alloc_hint(cid, oid, expected_object_size,
+ expected_write_size);
+ tracepoint(objectstore, setallochint_exit, r);
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD ||
+ op->op == Transaction::OP_SETATTR ||
+ op->op == Transaction::OP_SETATTRS ||
+ op->op == Transaction::OP_RMATTR ||
+ op->op == Transaction::OP_OMAP_SETKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+ op->op == Transaction::OP_OMAP_SETHEADER))
+ // -ENOENT is normally okay
+ // ...including on a replayed OP_RMCOLL with checkpoint mode
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (op->op == Transaction::OP_SETALLOCHINT)
+ // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most
+ // cases means invalid hint size (e.g. too big, not a multiple
+ // of block size, etc) or, at least on xfs, an attempt to set
+ // or change it when the file is not empty. However,
+ // OP_SETALLOCHINT is advisory, so ignore all errors.
+ ok = true;
+
+ if (replaying && !backend->can_checkpoint()) {
+ if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -ERANGE) {
+ dout(10) << "tolerating ERANGE on replay" << dendl;
+ ok = true;
+ }
+ if (r == -ENOENT) {
+ dout(10) << "tolerating ENOENT on replay" << dendl;
+ ok = true;
+ }
+ }
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2)) {
+ msg = "ENOENT on clone suggests osd bug";
+ } else if (r == -ENOSPC) {
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from disk filesystem, misconfigured cluster";
+ } else if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ } else if (r == -EPERM) {
+ msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption";
+ }
+
+ derr << " error " << cpp_strerror(r) << " not handled on operation " << op
+ << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ if (r == -EMFILE) {
+ dump_open_fds(cct);
+ }
+
+ ceph_abort_msg("unexpected error");
+ }
+ }
+
+ spos.op++;
+ }
+
+ _inject_failure();
+}
+
+ /*********************************************/
+
+
+
+// --------------------
+// objects
+
+bool FileStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+ tracepoint(objectstore, exists_enter, ch->cid.c_str());
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+ struct stat st;
+ bool retval = stat(ch, oid, &st) == 0;
+ tracepoint(objectstore, exists_exit, retval);
+ return retval;
+}
+
+int FileStore::stat(
+ CollectionHandle& ch, const ghobject_t& oid, struct stat *st, bool allow_eio)
+{
+ tracepoint(objectstore, stat_enter, ch->cid.c_str());
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ int r = lfn_stat(cid, oid, st);
+ ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+ << " = " << r << dendl;
+ } else {
+ dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+ << " = " << r
+ << " (size " << st->st_size << ")" << dendl;
+ }
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, stat_exit, r);
+ return r;
+ }
+}
+
+int FileStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ return -EOPNOTSUPP;
+}
+
+int FileStore::read(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ int got;
+ tracepoint(objectstore, read_enter, ch->cid.c_str(), offset, len);
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (offset == 0 && len == 0) {
+ struct stat st;
+ memset(&st, 0, sizeof(struct stat));
+ int r = ::fstat(**fd, &st);
+ ceph_assert(r == 0);
+ len = st.st_size;
+ }
+
+#ifdef HAVE_POSIX_FADVISE
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM);
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL);
+#endif
+
+ bufferptr bptr(len); // prealloc space for entire read
+ got = safe_pread(**fd, bptr.c_str(), len, offset);
+ if (got < 0) {
+ dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl;
+ lfn_close(fd);
+ return got;
+ }
+ bptr.set_length(got); // properly size the buffer
+ bl.clear();
+ bl.push_back(std::move(bptr)); // put it in the target bufferlist
+
+#ifdef HAVE_POSIX_FADVISE
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
+ if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL))
+ posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
+#endif
+
+ if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+ ostringstream ss;
+ int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+ if (errors != 0) {
+ dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+ << got << " ... BAD CRC:\n" << ss.str() << dendl;
+ ceph_abort_msg("bad crc on read");
+ }
+ }
+
+ lfn_close(fd);
+
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+ << got << "/" << len << dendl;
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_data_eio(oid)) {
+ return -EIO;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->filestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->filestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ return -EIO;
+ } else {
+ tracepoint(objectstore, read_exit, got);
+ return got;
+ }
+}
+
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+ uint64_t i;
+ struct fiemap_extent *extent = nullptr;
+ struct fiemap *fiemap = nullptr;
+ int r = 0;
+
+more:
+ r = backend->do_fiemap(fd, offset, len, &fiemap);
+ if (r < 0)
+ return r;
+
+ if (fiemap->fm_mapped_extents == 0) {
+ free(fiemap);
+ return r;
+ }
+
+ extent = &fiemap->fm_extents[0];
+
+ /* start where we were asked to start */
+ if (extent->fe_logical < offset) {
+ extent->fe_length -= offset - extent->fe_logical;
+ extent->fe_logical = offset;
+ }
+
+ i = 0;
+
+ struct fiemap_extent *last = nullptr;
+ while (i < fiemap->fm_mapped_extents) {
+ struct fiemap_extent *next = extent + 1;
+
+ dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents
+ << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+
+ /* try to merge extents */
+ while ((i < fiemap->fm_mapped_extents - 1) &&
+ (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+ next->fe_length += extent->fe_length;
+ next->fe_logical = extent->fe_logical;
+ extent = next;
+ next = extent + 1;
+ i++;
+ }
+
+ if (extent->fe_logical + extent->fe_length > offset + len)
+ extent->fe_length = offset + len - extent->fe_logical;
+ (*m)[extent->fe_logical] = extent->fe_length;
+ i++;
+ last = extent++;
+ }
+ uint64_t xoffset = last->fe_logical + last->fe_length - offset;
+ offset = last->fe_logical + last->fe_length;
+ len -= xoffset;
+ const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0);
+ free(fiemap);
+ if (!is_last) {
+ goto more;
+ }
+
+ return r;
+}
+
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ off_t hole_pos, data_pos;
+ int r = 0;
+
+ // If lseek fails with errno setting to be ENXIO, this means the current
+ // file offset is beyond the end of the file.
+ off_t start = offset;
+ while(start < (off_t)(offset + len)) {
+ data_pos = lseek(fd, start, SEEK_DATA);
+ if (data_pos < 0) {
+ if (errno == ENXIO)
+ break;
+ else {
+ r = -errno;
+ dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else if (data_pos > (off_t)(offset + len)) {
+ break;
+ }
+
+ hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == ENXIO) {
+ break;
+ } else {
+ r = -errno;
+ dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (hole_pos >= (off_t)(offset + len)) {
+ (*m)[data_pos] = offset + len - data_pos;
+ break;
+ }
+ (*m)[data_pos] = hole_pos - data_pos;
+ start = hole_pos;
+ }
+
+ return r;
+#else
+ (*m)[offset] = len;
+ return 0;
+#endif
+}
+
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ bufferlist& bl)
+{
+ map<uint64_t, uint64_t> exomap;
+ int r = fiemap(ch, oid, offset, len, exomap);
+ if (r >= 0) {
+ encode(exomap, bl);
+ }
+ return r;
+}
+
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ map<uint64_t, uint64_t>& destmap)
+{
+ tracepoint(objectstore, fiemap_enter, ch->cid.c_str(), offset, len);
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ destmap.clear();
+
+ if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) ||
+ len <= (size_t)m_filestore_fiemap_threshold) {
+ destmap[offset] = len;
+ return 0;
+ }
+
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
+ goto done;
+ }
+
+ if (backend->has_seek_data_hole()) {
+ dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ r = _do_seek_hole_data(**fd, offset, len, &destmap);
+ } else if (backend->has_fiemap()) {
+ dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ r = _do_fiemap(**fd, offset, len, &destmap);
+ }
+
+ lfn_close(fd);
+
+done:
+
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ tracepoint(objectstore, fiemap_exit, r);
+ return r;
+}
+
+int FileStore::_remove(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+ int r = lfn_unlink(cid, oid, spos);
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl;
+ int r = lfn_truncate(cid, oid, size);
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ return r;
+ } else {
+ lfn_close(fd);
+ }
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int r;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ dout(0) << __FUNC__ << ": couldn't open " << cid << "/"
+ << oid << ": "
+ << cpp_strerror(r) << dendl;
+ goto out;
+ }
+
+ // write
+ r = bl.write_fd(**fd, offset);
+ if (r < 0) {
+ derr << __FUNC__ << ": write_fd on " << cid << "/" << oid
+ << " error: " << cpp_strerror(r) << dendl;
+ lfn_close(fd);
+ goto out;
+ }
+ r = bl.length();
+
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ ceph_assert(rc >= 0);
+ }
+
+ if (replaying || m_disable_wbthrottle) {
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) {
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+ } else {
+ wbthrottle.queue_wb(fd, oid, offset, len,
+ fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ }
+
+ lfn_close(fd);
+
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int ret = 0;
+
+ if (cct->_conf->filestore_punch_hole) {
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ // first try to punch a hole.
+ FDRef fd;
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0) {
+ goto out;
+ }
+
+ struct stat st;
+ ret = ::fstat(**fd, &st);
+ if (ret < 0) {
+ ret = -errno;
+ lfn_close(fd);
+ goto out;
+ }
+
+ // first try fallocate
+ ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ offset, len);
+ if (ret < 0) {
+ ret = -errno;
+ } else {
+ // ensure we extend file size, if needed
+ if (len > 0 && offset + len > (uint64_t)st.st_size) {
+ ret = ::ftruncate(**fd, offset + len);
+ if (ret < 0) {
+ ret = -errno;
+ lfn_close(fd);
+ goto out;
+ }
+ }
+ }
+ lfn_close(fd);
+
+ if (ret >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_zero(**fd, offset, len);
+ ceph_assert(rc >= 0);
+ }
+
+ if (ret == 0)
+ goto out; // yay!
+ if (ret != -EOPNOTSUPP)
+ goto out; // some other error
+# endif
+# endif
+#endif
+ }
+
+ // lame, kernel is old and doesn't support it.
+ // write zeros.. yuck!
+ dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl;
+ {
+ bufferlist bl;
+ bl.append_zero(len);
+ ret = _write(cid, oid, offset, len, bl);
+ }
+
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ out:
+# endif
+# endif
+#endif
+ dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+ return ret;
+}
+
+int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
+
+ if (_check_replay_guard(cid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n;
+ {
+ Index index;
+ r = lfn_open(cid, oldoid, false, &o, &index);
+ if (r < 0) {
+ goto out2;
+ }
+ ceph_assert(index.index);
+ RWLock::WLocker l((index.index)->access_lock);
+
+ r = lfn_open(cid, newoid, true, &n, &index);
+ if (r < 0) {
+ goto out;
+ }
+ r = ::ftruncate(**n, 0);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+ struct stat st;
+ r = ::fstat(**o, &st);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+
+ r = _do_clone_range(**o, **n, 0, st.st_size, 0);
+ if (r < 0) {
+ goto out3;
+ }
+
+ dout(20) << "objectmap clone" << dendl;
+ r = object_map->clone(oldoid, newoid, &spos);
+ if (r < 0 && r != -ENOENT)
+ goto out3;
+ }
+
+ {
+ char buf[2];
+ map<string, bufferptr> aset;
+ r = _fgetattrs(**o, aset);
+ if (r < 0)
+ goto out3;
+
+ r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT));
+ } else {
+ r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT));
+ }
+ if (r < 0)
+ goto out3;
+
+ r = _fsetattrs(**n, aset);
+ if (r < 0)
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+ lfn_close(n);
+ out:
+ lfn_close(o);
+ out2:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+ return backend->clone_range(from, to, srcoff, len, dstoff);
+}
+
+int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+ int r = 0;
+ map<uint64_t, uint64_t> exomap;
+ // fiemap doesn't allow zero length
+ if (len == 0)
+ return 0;
+
+ if (backend->has_seek_data_hole()) {
+ dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+ r = _do_seek_hole_data(from, srcoff, len, &exomap);
+ } else if (backend->has_fiemap()) {
+ dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl;
+ r = _do_fiemap(from, srcoff, len, &exomap);
+ }
+
+
+ int64_t written = 0;
+ if (r < 0)
+ goto out;
+
+ for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
+ uint64_t it_off = miter->first - srcoff + dstoff;
+ r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
+ if (r < 0) {
+ derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second
+ << " to " << it_off << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ written += miter->second;
+ }
+
+ if (r >= 0) {
+ if (m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ ceph_assert(rc >= 0);
+ }
+ struct stat st;
+ r = ::fstat(to, &st);
+ if (r < 0) {
+ r = -errno;
+ derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ if (st.st_size < (int)(dstoff + len)) {
+ r = ::ftruncate(to, dstoff + len);
+ if (r < 0) {
+ r = -errno;
+ derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ }
+ r = written;
+ }
+
+ out:
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
+{
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+ int r = 0;
+ loff_t pos = srcoff;
+ loff_t end = srcoff + len;
+ int buflen = 4096 * 16; //limit by pipe max size.see fcntl
+
+#ifdef CEPH_HAVE_SPLICE
+ if (backend->has_splice()) {
+ int pipefd[2];
+ if (pipe_cloexec(pipefd) < 0) {
+ int e = errno;
+ derr << " pipe " << " got " << cpp_strerror(e) << dendl;
+ return -e;
+ }
+
+ loff_t dstpos = dstoff;
+ while (pos < end) {
+ int l = std::min<int>(end-pos, buflen);
+ r = safe_splice(from, &pos, pipefd[1], nullptr, l, SPLICE_F_NONBLOCK);
+ dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl;
+ if (r < 0) {
+ derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len
+ << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ if (r == 0) {
+ // hrm, bad source range, wtf.
+ r = -ERANGE;
+ derr << __FUNC__ << ": got short read result at " << pos
+ << " of fd " << from << " len " << len << dendl;
+ break;
+ }
+
+ r = safe_splice(pipefd[0], nullptr, to, &dstpos, r, 0);
+ dout(10) << " safe_splice write to " << to << " len " << r
+ << " got " << r << dendl;
+ if (r < 0) {
+ derr << __FUNC__ << ": write error at " << pos << "~"
+ << r << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+ close(pipefd[0]);
+ close(pipefd[1]);
+ } else
+#endif
+ {
+ int64_t actual;
+
+ actual = ::lseek64(from, srcoff, SEEK_SET);
+ if (actual != (int64_t)srcoff) {
+ if (actual < 0)
+ r = -errno;
+ else
+ r = -EINVAL;
+ derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ actual = ::lseek64(to, dstoff, SEEK_SET);
+ if (actual != (int64_t)dstoff) {
+ if (actual < 0)
+ r = -errno;
+ else
+ r = -EINVAL;
+ derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ char buf[buflen];
+ while (pos < end) {
+ int l = std::min<int>(end-pos, buflen);
+ r = ::read(from, buf, l);
+ dout(25) << " read from " << pos << "~" << l << " got " << r << dendl;
+ if (r < 0) {
+ if (errno == EINTR) {
+ continue;
+ } else {
+ r = -errno;
+ derr << __FUNC__ << ": read error at " << pos << "~" << len
+ << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+ if (r == 0) {
+ // hrm, bad source range, wtf.
+ r = -ERANGE;
+ derr << __FUNC__ << ": got short read result at " << pos
+ << " of fd " << from << " len " << len << dendl;
+ break;
+ }
+ int op = 0;
+ while (op < r) {
+ int r2 = safe_write(to, buf+op, r-op);
+ dout(25) << " write to " << to << " len " << (r-op)
+ << " got " << r2 << dendl;
+ if (r2 < 0) {
+ r = r2;
+ derr << __FUNC__ << ": write error at " << pos << "~"
+ << r-op << ", " << cpp_strerror(r) << dendl;
+
+ break;
+ }
+ op += (r-op);
+ }
+ if (r < 0)
+ break;
+ pos += r;
+ }
+ }
+
+ if (r < 0 && replaying) {
+ ceph_assert(r == -ERANGE);
+ derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl;
+ r = len;
+ }
+ ceph_assert(replaying || pos == end);
+ if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ ceph_assert(rc >= 0);
+ }
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl;
+
+ if (_check_replay_guard(newcid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n;
+ r = lfn_open(oldcid, oldoid, false, &o);
+ if (r < 0) {
+ goto out2;
+ }
+ r = lfn_open(newcid, newoid, true, &n);
+ if (r < 0) {
+ goto out;
+ }
+ r = _do_clone_range(**o, **n, srcoff, len, dstoff);
+ if (r < 0) {
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+ lfn_close(n);
+ out:
+ lfn_close(o);
+ out2:
+ dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " "
+ << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+class SyncEntryTimeout : public Context {
+public:
+ CephContext* cct;
+ explicit SyncEntryTimeout(CephContext* cct, int commit_timeo)
+ : cct(cct), m_commit_timeo(commit_timeo)
+ {
+ }
+
+ void finish(int r) override {
+ BackTrace *bt = new BackTrace(1);
+ generic_dout(-1) << "FileStore: sync_entry timed out after "
+ << m_commit_timeo << " seconds.\n";
+ bt->print(*_dout);
+ *_dout << dendl;
+ delete bt;
+ bt = nullptr;
+ ceph_abort();
+ }
+private:
+ int m_commit_timeo;
+};
+
+void FileStore::sync_entry()
+{
+ lock.Lock();
+ while (!stop) {
+ utime_t max_interval;
+ max_interval.set_from_double(m_filestore_max_sync_interval);
+ utime_t min_interval;
+ min_interval.set_from_double(m_filestore_min_sync_interval);
+
+ utime_t startwait = ceph_clock_now();
+ if (!force_sync) {
+ dout(20) << __FUNC__ << ": waiting for max_interval " << max_interval << dendl;
+ sync_cond.WaitInterval(lock, max_interval);
+ } else {
+ dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl;
+ }
+
+ if (force_sync) {
+ dout(20) << __FUNC__ << ": force_sync set" << dendl;
+ force_sync = false;
+ } else if (stop) {
+ dout(20) << __FUNC__ << ": stop set" << dendl;
+ break;
+ } else {
+ // wait for at least the min interval
+ utime_t woke = ceph_clock_now();
+ woke -= startwait;
+ dout(20) << __FUNC__ << ": woke after " << woke << dendl;
+ if (woke < min_interval) {
+ utime_t t = min_interval;
+ t -= woke;
+ dout(20) << __FUNC__ << ": waiting for another " << t
+ << " to reach min interval " << min_interval << dendl;
+ sync_cond.WaitInterval(lock, t);
+ }
+ }
+
+ list<Context*> fin;
+ again:
+ fin.swap(sync_waiters);
+ lock.Unlock();
+
+ op_tp.pause();
+ if (apply_manager.commit_start()) {
+ utime_t start = ceph_clock_now();
+ uint64_t cp = apply_manager.get_committing_seq();
+
+ sync_entry_timeo_lock.Lock();
+ SyncEntryTimeout *sync_entry_timeo =
+ new SyncEntryTimeout(cct, m_filestore_commit_timeout);
+ if (!timer.add_event_after(m_filestore_commit_timeout,
+ sync_entry_timeo)) {
+ sync_entry_timeo = nullptr;
+ }
+ sync_entry_timeo_lock.Unlock();
+
+ logger->set(l_filestore_committing, 1);
+
+ dout(15) << __FUNC__ << ": committing " << cp << dendl;
+ stringstream errstream;
+ if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) {
+ derr << errstream.str() << dendl;
+ ceph_abort();
+ }
+
+ if (backend->can_checkpoint()) {
+ int err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during write_op_seq");
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ uint64_t cid = 0;
+ err = backend->create_checkpoint(s, &cid);
+ if (err < 0) {
+ int err = errno;
+ derr << "snap create '" << s << "' got error " << err << dendl;
+ ceph_assert(err == 0);
+ }
+
+ snaps.push_back(cp);
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ if (cid > 0) {
+ dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
+ err = backend->sync_checkpoint(cid);
+ if (err < 0) {
+ derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("wait_sync got error");
+ }
+ dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl;
+ }
+ } else {
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ int err = object_map->sync();
+ if (err < 0) {
+ derr << "object_map sync got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("object_map sync returned error");
+ }
+
+ err = backend->syncfs();
+ if (err < 0) {
+ derr << "syncfs got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("syncfs returned error");
+ }
+
+ err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during write_op_seq");
+ }
+ err = ::fsync(op_fd);
+ if (err < 0) {
+ derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during fsync of op_seq");
+ }
+ }
+
+ utime_t done = ceph_clock_now();
+ utime_t lat = done - start;
+ utime_t dur = done - startwait;
+ dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl;
+ utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat);
+ if (max_pause_lat < dur - lat) {
+ logger->tinc(l_filestore_sync_pause_max_lat, dur - lat);
+ }
+
+ logger->inc(l_filestore_commitcycle);
+ logger->tinc(l_filestore_commitcycle_latency, lat);
+ logger->tinc(l_filestore_commitcycle_interval, dur);
+
+ apply_manager.commit_finish();
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear();
+ }
+
+ logger->set(l_filestore_committing, 0);
+
+ // remove old snaps?
+ if (backend->can_checkpoint()) {
+ char s[NAME_MAX];
+ while (snaps.size() > 2) {
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
+ snaps.pop_front();
+ dout(10) << "removing snap '" << s << "'" << dendl;
+ int r = backend->destroy_checkpoint(s);
+ if (r) {
+ int err = errno;
+ derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
+ }
+ }
+ }
+
+ dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl;
+
+ if (sync_entry_timeo) {
+ Mutex::Locker lock(sync_entry_timeo_lock);
+ timer.cancel_event(sync_entry_timeo);
+ }
+ } else {
+ op_tp.unpause();
+ }
+
+ lock.Lock();
+ finish_contexts(cct, fin, 0);
+ fin.clear();
+ if (!sync_waiters.empty()) {
+ dout(10) << __FUNC__ << ": more waiters, committing again" << dendl;
+ goto again;
+ }
+ if (!stop && journal && journal->should_commit_now()) {
+ dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl;
+ goto again;
+ }
+ }
+ stop = false;
+ lock.Unlock();
+}
+
+void FileStore::do_force_sync()
+{
+ dout(10) << __FUNC__ << dendl;
+ Mutex::Locker l(lock);
+ force_sync = true;
+ sync_cond.Signal();
+}
+
+void FileStore::start_sync(Context *onsafe)
+{
+ Mutex::Locker l(lock);
+ sync_waiters.push_back(onsafe);
+ sync_cond.Signal();
+ force_sync = true;
+ dout(10) << __FUNC__ << dendl;
+}
+
+void FileStore::sync()
+{
+ Mutex l("FileStore::sync");
+ Cond c;
+ bool done;
+ C_SafeCond *fin = new C_SafeCond(&l, &c, &done);
+
+ start_sync(fin);
+
+ l.Lock();
+ while (!done) {
+ dout(10) << "sync waiting" << dendl;
+ c.Wait(l);
+ }
+ l.Unlock();
+ dout(10) << "sync done" << dendl;
+}
+
+void FileStore::_flush_op_queue()
+{
+ dout(10) << __FUNC__ << ": draining op tp" << dendl;
+ op_wq.drain();
+ dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl;
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->wait_for_empty();
+ }
+}
+
+/*
+ * flush - make every queued write readable
+ */
+void FileStore::flush()
+{
+ dout(10) << __FUNC__ << dendl;
+
+ if (cct->_conf->filestore_blackhole) {
+ // wait forever
+ Mutex lock("FileStore::flush::lock");
+ Cond cond;
+ lock.Lock();
+ while (true)
+ cond.Wait(lock);
+ ceph_abort();
+ }
+
+ if (m_filestore_journal_writeahead) {
+ if (journal)
+ journal->flush();
+ dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl;
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->wait_for_empty();
+ }
+ }
+
+ _flush_op_queue();
+ dout(10) << __FUNC__ << ": complete" << dendl;
+}
+
+/*
+ * sync_and_flush - make every queued write readable AND committed to disk
+ */
+void FileStore::sync_and_flush()
+{
+ dout(10) << __FUNC__ << dendl;
+
+ if (m_filestore_journal_writeahead) {
+ if (journal)
+ journal->flush();
+ _flush_op_queue();
+ } else {
+ // includes m_filestore_journal_parallel
+ _flush_op_queue();
+ sync();
+ }
+ dout(10) << __FUNC__ << ": done" << dendl;
+}
+
+int FileStore::flush_journal()
+{
+ dout(10) << __FUNC__ << dendl;
+ sync_and_flush();
+ sync();
+ return 0;
+}
+
+int FileStore::snapshot(const string& name)
+{
+ dout(10) << __FUNC__ << ": " << name << dendl;
+ sync_and_flush();
+
+ if (!backend->can_checkpoint()) {
+ dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl;
+ return -EOPNOTSUPP;
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str());
+
+ int r = backend->create_checkpoint(s, nullptr);
+ if (r) {
+ derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(r) << dendl;
+ }
+
+ return r;
+}
+
+// -------------------------------
+// attributes
+
+int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
+{
+ char val[CHAIN_XATTR_MAX_BLOCK_LEN];
+ int l = chain_fgetxattr(fd, name, val, sizeof(val));
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), val, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, name, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, name, bp.c_str(), l);
+ }
+ }
+ ceph_assert(!m_filestore_fail_eio || l != -EIO);
+ return l;
+}
+
+int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
+{
+ // get attr list
+ char names1[100];
+ int len = chain_flistxattr(fd, names1, sizeof(names1)-1);
+ char *names2 = 0;
+ char *name = 0;
+ if (len == -ERANGE) {
+ len = chain_flistxattr(fd, 0, 0);
+ if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ }
+ dout(10) << " -ERANGE, len is " << len << dendl;
+ names2 = new char[len+1];
+ len = chain_flistxattr(fd, names2, len);
+ dout(10) << " -ERANGE, got " << len << dendl;
+ if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ delete[] names2;
+ return len;
+ }
+ name = names2;
+ } else if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ } else {
+ name = names1;
+ }
+ name[len] = 0;
+
+ char *end = name + len;
+ while (name < end) {
+ char *attrname = name;
+ if (parse_attrname(&name)) {
+ if (*name) {
+ dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl;
+ int r = _fgetattr(fd, attrname, aset[name]);
+ if (r < 0) {
+ delete[] names2;
+ return r;
+ }
+ }
+ }
+ name += strlen(name) + 1;
+ }
+
+ delete[] names2;
+ return 0;
+}
+
+int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
+{
+ for (map<string, bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ const char *val;
+ if (p->second.length())
+ val = p->second.c_str();
+ else
+ val = "";
+ // ??? Why do we skip setting all the other attrs if one fails?
+ int r = chain_fsetxattr(fd, n, val, p->second.length());
+ if (r < 0) {
+ derr << __FUNC__ << ": chain_setxattr returned " << r << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+// debug EIO injection
+void FileStore::inject_data_error(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+ data_error_set.insert(oid);
+}
+void FileStore::inject_mdata_error(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+ mdata_error_set.insert(oid);
+}
+
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __FUNC__ << ": clear error on " << oid << dendl;
+ data_error_set.erase(oid);
+ mdata_error_set.erase(oid);
+}
+bool FileStore::debug_data_eio(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ if (data_error_set.count(oid)) {
+ dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+ return true;
+ } else {
+ return false;
+ }
+}
+bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ if (mdata_error_set.count(oid)) {
+ dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+
+// objects
+
+int FileStore::getattr(CollectionHandle& ch, const ghobject_t& oid, const char *name, bufferptr &bp)
+{
+ tracepoint(objectstore, getattr_enter, ch->cid.c_str());
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = _fgetattr(**fd, n, bp);
+ lfn_close(fd);
+ if (r == -ENODATA) {
+ map<string, bufferlist> got;
+ set<string> to_get;
+ to_get.insert(string(name));
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, to_get, &got);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl;
+ goto out;
+ }
+ if (got.empty()) {
+ dout(10) << __FUNC__ << ": got.size() is 0" << dendl;
+ return -ENODATA;
+ }
+ bp = bufferptr(got.begin()->second.c_str(),
+ got.begin()->second.length());
+ r = bp.length();
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattr_exit, r);
+ return r < 0 ? r : 0;
+ }
+}
+
+int FileStore::getattrs(CollectionHandle& ch, const ghobject_t& oid, map<string,bufferptr>& aset)
+{
+ tracepoint(objectstore, getattrs_enter, ch->cid.c_str());
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+ bool spill_out = true;
+ char buf[2];
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = false;
+
+ r = _fgetattrs(**fd, aset);
+ lfn_close(fd);
+ fd = FDRef(); // defensive
+ if (r < 0) {
+ goto out;
+ }
+
+ if (!spill_out) {
+ dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+ goto out;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ }
+ ceph_assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ i != omap_aset.end();
+ ++i) {
+ string key(i->first);
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattrs_exit, r);
+ return r;
+ }
+}
+
+int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos)
+{
+ map<string, bufferlist> omap_set;
+ set<string> omap_remove;
+ map<string, bufferptr> inline_set;
+ map<string, bufferptr> inline_to_set;
+ FDRef fd;
+ int spill_out = -1;
+ bool incomplete_inline = false;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = 0;
+ else
+ spill_out = 1;
+
+ r = _fgetattrs(**fd, inline_set);
+ incomplete_inline = (r == -E2BIG);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid
+ << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "")
+ << dendl;
+
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+
+ if (incomplete_inline) {
+ chain_fremovexattr(**fd, n); // ignore any error
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
+ if (inline_set.count(p->first)) {
+ inline_set.erase(p->first);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0)
+ goto out_close;
+ }
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
+
+ inline_to_set.insert(*p);
+ }
+
+ if (spill_out != 1 && !omap_set.empty()) {
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT));
+ }
+
+ r = _fsetattrs(**fd, inline_to_set);
+ if (r < 0)
+ goto out_close;
+
+ if (spill_out && !omap_remove.empty()) {
+ r = object_map->remove_xattrs(oid, omap_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove_xattrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
+ }
+ }
+
+ if (!omap_set.empty()) {
+ r = object_map->set_xattrs(oid, omap_set, &spos);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+ FDRef fd;
+ bool spill_out = true;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r == -ENODATA && spill_out) {
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ set<string> to_remove;
+ to_remove.insert(string(name));
+ r = object_map->remove_xattrs(oid, to_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ return r;
+}
+
+int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ map<string,bufferptr> aset;
+ FDRef fd;
+ set<string> omap_attrs;
+ Index index;
+ bool spill_out = true;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ r = _fgetattrs(**fd, aset);
+ if (r >= 0) {
+ for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl;
+ goto out_close;
+ }
+ }
+ }
+
+ if (!spill_out) {
+ dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+ goto out_close;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl;
+ goto out_close;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT));
+ }
+
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+
+
+int FileStore::_collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos)
+{
+ struct stat st;
+ int r = collection_stat(cid, &st);
+ if (r < 0) {
+ if (r == -ENOENT)
+ return 0;
+ return r;
+ }
+
+ vector<ghobject_t> objects;
+ ghobject_t max;
+ while (!max.is_max()) {
+ r = collection_list(cid, max, ghobject_t::get_max(),
+ 300, &objects, &max);
+ if (r < 0)
+ return r;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ ceph_assert(_check_replay_guard(cid, *i, spos));
+ r = _remove(cid, *i, spos);
+ if (r < 0)
+ return r;
+ }
+ objects.clear();
+ }
+ return _destroy_collection(cid);
+}
+
+// --------------------------
+// collections
+
+int FileStore::list_collections(vector<coll_t>& ls)
+{
+ return list_collections(ls, false);
+}
+
+int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
+{
+ tracepoint(objectstore, list_collections_enter);
+ dout(10) << __FUNC__ << dendl;
+
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/current", basedir.c_str());
+
+ int r = 0;
+ DIR *dir = ::opendir(fn);
+ if (!dir) {
+ r = -errno;
+ derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+
+ struct dirent *de = nullptr;
+ while ((de = ::readdir(dir))) {
+ if (de->d_type == DT_UNKNOWN) {
+ // d_type not supported (non-ext[234], btrfs), must stat
+ struct stat sb;
+ char filename[PATH_MAX];
+ if (int n = snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name);
+ n >= static_cast<int>(sizeof(filename))) {
+ derr << __func__ << " path length overrun: " << n << dendl;
+ ceph_abort();
+ }
+
+ r = ::stat(filename, &sb);
+ if (r < 0) {
+ r = -errno;
+ derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ break;
+ }
+ if (!S_ISDIR(sb.st_mode)) {
+ continue;
+ }
+ } else if (de->d_type != DT_DIR) {
+ continue;
+ }
+ if (strcmp(de->d_name, "omap") == 0) {
+ continue;
+ }
+ if (de->d_name[0] == '.' &&
+ (de->d_name[1] == '\0' ||
+ (de->d_name[1] == '.' &&
+ de->d_name[2] == '\0')))
+ continue;
+ coll_t cid;
+ if (!cid.parse(de->d_name)) {
+ derr << "ignoring invalid collection '" << de->d_name << "'" << dendl;
+ continue;
+ }
+ if (!cid.is_temp() || include_temp)
+ ls.push_back(cid);
+ }
+
+ if (r > 0) {
+ derr << "trying readdir " << fn << ": " << cpp_strerror(r) << dendl;
+ r = -r;
+ }
+
+ ::closedir(dir);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ tracepoint(objectstore, list_collections_exit, r);
+ return r;
+}
+
+int FileStore::collection_stat(const coll_t& c, struct stat *st)
+{
+ tracepoint(objectstore, collection_stat_enter, c.c_str());
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ int r = ::stat(fn, st);
+ if (r < 0)
+ r = -errno;
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ tracepoint(objectstore, collection_stat_exit, r);
+ return r;
+}
+
+bool FileStore::collection_exists(const coll_t& c)
+{
+ tracepoint(objectstore, collection_exists_enter, c.c_str());
+ struct stat st;
+ bool ret = collection_stat(c, &st) == 0;
+ tracepoint(objectstore, collection_exists_exit, ret);
+ return ret;
+}
+
+int FileStore::collection_empty(const coll_t& cid, bool *empty)
+{
+ tracepoint(objectstore, collection_empty_enter, cid.c_str());
+ dout(15) << __FUNC__ << ": " << cid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ derr << __FUNC__ << ": get_index returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ vector<ghobject_t> ls;
+ r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(),
+ 1, &ls, nullptr);
+ if (r < 0) {
+ derr << __FUNC__ << ": collection_list_partial returned: "
+ << cpp_strerror(r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ *empty = ls.empty();
+ tracepoint(objectstore, collection_empty_exit, *empty);
+ return 0;
+}
+
+int FileStore::_collection_set_bits(const coll_t& c, int bits)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(10) << __FUNC__ << ": " << fn << " " << bits << dendl;
+ char n[PATH_MAX];
+ int r;
+ int32_t v = bits;
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ get_attrname("bits", n, PATH_MAX);
+ r = chain_fsetxattr(fd, n, (char*)&v, sizeof(v));
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << __FUNC__ << ": " << fn << " " << bits << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::collection_bits(CollectionHandle& ch)
+{
+ char fn[PATH_MAX];
+ get_cdir(ch->cid, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ int r;
+ char n[PATH_MAX];
+ int32_t bits;
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ bits = r = -errno;
+ goto out;
+ }
+ get_attrname("bits", n, PATH_MAX);
+ r = chain_fgetxattr(fd, n, (char*)&bits, sizeof(bits));
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ if (r < 0) {
+ bits = r;
+ goto out;
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << fn << " = " << bits << dendl;
+ return bits;
+}
+
+int FileStore::collection_list(const coll_t& c,
+ const ghobject_t& orig_start,
+ const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next)
+{
+ ghobject_t start = orig_start;
+ if (start.is_max())
+ return 0;
+
+ ghobject_t temp_next;
+ if (!next)
+ next = &temp_next;
+ // figure out the pool id. we need this in order to generate a
+ // meaningful 'next' value.
+ int64_t pool = -1;
+ shard_id_t shard;
+ {
+ spg_t pgid;
+ if (c.is_temp(&pgid)) {
+ pool = -2 - pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_pg(&pgid)) {
+ pool = pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_meta()) {
+ pool = -1;
+ shard = shard_id_t::NO_SHARD;
+ } else {
+ // hrm, the caller is test code! we should get kill it off. for now,
+ // tolerate it.
+ pool = 0;
+ shard = shard_id_t::NO_SHARD;
+ }
+ dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard
+ << " pgid " << pgid << dendl;
+ }
+ ghobject_t sep;
+ sep.hobj.pool = -1;
+ sep.set_shard(shard);
+ if (!c.is_temp() && !c.is_meta()) {
+ if (start < sep) {
+ dout(10) << __FUNC__ << ": first checking temp pool" << dendl;
+ coll_t temp = c.get_temp();
+ int r = collection_list(temp, start, end, max, ls, next);
+ if (r < 0)
+ return r;
+ if (*next != ghobject_t::get_max())
+ return r;
+ start = sep;
+ dout(10) << __FUNC__ << ": fall through to non-temp collection, start "
+ << start << dendl;
+ } else {
+ dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl;
+ }
+ }
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ r = index->collection_list_partial(start, end, max, ls, next);
+
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ dout(20) << "objects: " << *ls << dendl;
+
+ // HashIndex doesn't know the pool when constructing a 'next' value
+ if (!next->is_max()) {
+ next->hobj.pool = pool;
+ next->set_shard(shard);
+ dout(20) << " next " << *next << dendl;
+ }
+
+ return 0;
+}
+
+int FileStore::omap_get(CollectionHandle& ch, const ghobject_t &hoid,
+ bufferlist *header,
+ map<string, bufferlist> *out)
+{
+ tracepoint(objectstore, omap_get_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get(hoid, header, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ tracepoint(objectstore, omap_get_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_header(
+ CollectionHandle& ch,
+ const ghobject_t &hoid,
+ bufferlist *bl,
+ bool allow_eio)
+{
+ tracepoint(objectstore, omap_get_header_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get_header(hoid, bl);
+ if (r < 0 && r != -ENOENT) {
+ ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ tracepoint(objectstore, omap_get_header_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_keys(CollectionHandle& ch, const ghobject_t &hoid, set<string> *keys)
+{
+ tracepoint(objectstore, omap_get_keys_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get_keys(hoid, keys);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ tracepoint(objectstore, omap_get_keys_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_values(CollectionHandle& ch, const ghobject_t &hoid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{
+ tracepoint(objectstore, omap_get_values_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ const char *where = "()";
+ int r = get_index(c, &index);
+ if (r < 0) {
+ where = " (get_index)";
+ goto out;
+ }
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ where = " (lfn_find)";
+ goto out;
+ }
+ }
+ r = object_map->get_values(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ where = " (get_values)";
+ goto out;
+ }
+ r = 0;
+ out:
+ tracepoint(objectstore, omap_get_values_exit, r);
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r
+ << where << dendl;
+ return r;
+}
+
+int FileStore::omap_check_keys(CollectionHandle& ch, const ghobject_t &hoid,
+ const set<string> &keys,
+ set<string> *out)
+{
+ tracepoint(objectstore, omap_check_keys_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->check_keys(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ tracepoint(objectstore, omap_check_keys_exit, 0);
+ return 0;
+}
+
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(
+ CollectionHandle& ch,
+ const ghobject_t &oid)
+{
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+ return get_omap_iterator(ch->cid, oid);
+}
+
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c,
+ const ghobject_t &hoid)
+{
+ tracepoint(objectstore, get_omap_iterator, _c.c_str());
+ const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+ << "(get_index failed with " << cpp_strerror(r) << ")" << dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+ << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ }
+ return object_map->get_iterator(hoid);
+}
+
+int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": collection: " << c << " pg number: "
+ << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+ bool empty;
+ int ret = collection_empty(c, &empty);
+ if (ret < 0)
+ return ret;
+ if (!empty && !replaying) {
+ dout(0) << "Failed to give an expected number of objects hint to collection : "
+ << c << ", only empty collection can take such type of hint. " << dendl;
+ return 0;
+ }
+
+ Index index;
+ ret = get_index(c, &index);
+ if (ret < 0)
+ return ret;
+ // Pre-hash the collection
+ ret = index->pre_hash_collection(pg_num, expected_num_objs);
+ dout(10) << "pre_hash_collection " << c << " = " << ret << dendl;
+ if (ret < 0)
+ return ret;
+ _set_replay_guard(c, spos);
+
+ return 0;
+}
+
+int FileStore::_create_collection(
+ const coll_t& c,
+ int bits,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ int r = ::mkdir(fn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r == -EEXIST && replaying)
+ r = 0;
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+
+ if (r < 0)
+ return r;
+ r = init_index(c);
+ if (r < 0)
+ return r;
+ r = _collection_set_bits(c, bits);
+ if (r < 0)
+ return r;
+ // create parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ r = _create_collection(temp, 0, spos);
+ if (r < 0)
+ return r;
+ }
+
+ _set_replay_guard(c, spos);
+ return 0;
+}
+
+int FileStore::_destroy_collection(const coll_t& c)
+{
+ int r = 0;
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ {
+ Index from;
+ r = get_index(c, &from);
+ if (r < 0)
+ goto out;
+ ceph_assert(from.index);
+ RWLock::WLocker l((from.index)->access_lock);
+
+ r = from->prep_delete();
+ if (r < 0)
+ goto out;
+ }
+ r = ::rmdir(fn);
+ if (r < 0) {
+ r = -errno;
+ goto out;
+ }
+
+ out:
+ // destroy parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ int r2 = _destroy_collection(temp);
+ if (r2 < 0) {
+ r = r2;
+ goto out_final;
+ }
+ }
+
+ out_final:
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
+
+ int dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ int srccmp = _check_replay_guard(oldcid, o, spos);
+ if (srccmp < 0)
+ return 0;
+
+ // open guard on object so we don't any previous operations on the
+ // new name that will modify the source inode.
+ FDRef fd;
+ int r = lfn_open(oldcid, o, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ ceph_assert(replaying);
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << o << " (dne, continue replay) " << dendl;
+ return 0;
+ }
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, o, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ _inject_failure();
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos);
+ }
+ lfn_close(fd);
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos,
+ bool allow_enoent)
+{
+ dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
+ int r = 0;
+ int dstcmp, srccmp;
+
+ if (replaying) {
+ /* If the destination collection doesn't exist during replay,
+ * we need to delete the src object and continue on
+ */
+ if (!collection_exists(c))
+ goto out_rm_src;
+ }
+
+ dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ goto out_rm_src;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ srccmp = _check_replay_guard(oldcid, oldoid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ {
+ // open guard on object so we don't any previous operations on the
+ // new name that will modify the source inode.
+ FDRef fd;
+ r = lfn_open(oldcid, oldoid, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ if (replaying) {
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
+ } else if (allow_enoent) {
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, ignoring enoent)"
+ << dendl;
+ } else {
+ ceph_abort_msg("ERROR: source must exist");
+ }
+
+ if (!replaying) {
+ return 0;
+ }
+ if (allow_enoent && dstcmp > 0) { // if dstcmp == 0, try_rename was started.
+ return 0;
+ }
+
+ r = 0; // don't know if object_map was cloned
+ } else {
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, oldoid, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ lfn_close(fd);
+ fd = FDRef();
+
+ _inject_failure();
+ }
+
+ if (r == 0) {
+ // the name changed; link the omap content
+ r = object_map->rename(oldoid, o, &spos);
+ if (r == -ENOENT)
+ r = 0;
+ }
+
+ _inject_failure();
+
+ if (r == 0)
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+
+ if (r == 0)
+ r = lfn_open(c, o, 0, &fd);
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos, &o);
+ lfn_close(fd);
+ }
+ }
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+
+ out_rm_src:
+ // remove source
+ if (_check_replay_guard(oldcid, oldoid, spos) > 0) {
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+ }
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+}
+
+void FileStore::_inject_failure()
+{
+ if (m_filestore_kill_at) {
+ int final = --m_filestore_kill_at;
+ dout(5) << __FUNC__ << ": " << (final+1) << " -> " << final << dendl;
+ if (final == 0) {
+ derr << __FUNC__ << ": KILLING" << dendl;
+ cct->_log->flush();
+ _exit(1);
+ }
+ }
+}
+
+int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->clear_keys_header(hoid, &spos);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ return 0;
+}
+
+int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid,
+ const map<string, bufferlist> &aset,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r;
+ //treat pgmeta as a logical object, skip to check exist
+ if (hoid.is_pgmeta())
+ goto skip;
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+skip:
+ if (g_conf()->subsys.should_gather<ceph_subsys_filestore, 20>()) {
+ for (auto& p : aset) {
+ dout(20) << __FUNC__ << ": set " << p.first << dendl;
+ }
+ }
+ r = object_map->set_keys(hoid, aset, &spos);
+ dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid,
+ const set<string> &keys,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r;
+ //treat pgmeta as a logical object, skip to check exist
+ if (hoid.is_pgmeta())
+ goto skip;
+
+ r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+skip:
+ r = object_map->rm_keys(hoid, keys, &spos);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ return 0;
+}
+
+int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid,
+ const string& first, const string& last,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
+ set<string> keys;
+ {
+ ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid);
+ if (!iter)
+ return -ENOENT;
+ for (iter->lower_bound(first); iter->valid() && iter->key() < last;
+ iter->next()) {
+ keys.insert(iter->key());
+ }
+ }
+ return _omap_rmkeys(cid, hoid, keys, spos);
+}
+
+int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid,
+ const bufferlist &bl,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ return object_map->set_header(hoid, bl, &spos);
+}
+
+int FileStore::_merge_collection(const coll_t& cid,
+ uint32_t bits,
+ coll_t dest,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << " " << dest
+ << " bits " << bits << dendl;
+ int r = 0;
+
+ if (!collection_exists(cid)) {
+ dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+ if (!collection_exists(dest)) {
+ dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+
+ // set bits
+ if (_check_replay_guard(cid, spos) > 0)
+ _collection_set_bits(dest, bits);
+
+ spg_t pgid;
+ bool is_pg = dest.is_pg(&pgid);
+ ceph_assert(is_pg);
+
+ int dstcmp = _check_replay_guard(dest, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ int srccmp = _check_replay_guard(cid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ _set_global_replay_guard(cid, spos);
+ _set_replay_guard(cid, spos, true);
+ _set_replay_guard(dest, spos, true);
+
+ // main collection
+ {
+ Index from;
+ r = get_index(cid, &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest, &to);
+
+ if (!r) {
+ ceph_assert(from.index);
+ RWLock::WLocker l1((from.index)->access_lock);
+
+ ceph_assert(to.index);
+ RWLock::WLocker l2((to.index)->access_lock);
+
+ r = from->merge(bits, to.index);
+ }
+ }
+
+ // temp too
+ {
+ Index from;
+ r = get_index(cid.get_temp(), &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest.get_temp(), &to);
+
+ if (!r) {
+ ceph_assert(from.index);
+ RWLock::WLocker l1((from.index)->access_lock);
+
+ ceph_assert(to.index);
+ RWLock::WLocker l2((to.index)->access_lock);
+
+ r = from->merge(bits, to.index);
+ }
+ }
+
+ // remove source
+ _destroy_collection(cid);
+
+ _close_replay_guard(dest, spos);
+ _close_replay_guard(dest.get_temp(), spos);
+ // no need to close guards on cid... it's removed.
+
+ if (!r && cct->_conf->filestore_debug_verify_split) {
+ vector<ghobject_t> objects;
+ ghobject_t next;
+ while (1) {
+ collection_list(
+ dest,
+ next, ghobject_t::get_max(),
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (!i->match(bits, pgid.pgid.ps())) {
+ dout(20) << __FUNC__ << ": " << *i << " does not belong in "
+ << cid << dendl;
+ ceph_assert(i->match(bits, pgid.pgid.ps()));
+ }
+ }
+ objects.clear();
+ }
+ }
+
+ dout(15) << __FUNC__ << ": " << cid << " " << dest << " bits " << bits
+ << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_split_collection(const coll_t& cid,
+ uint32_t bits,
+ uint32_t rem,
+ coll_t dest,
+ const SequencerPosition &spos)
+{
+ int r;
+ {
+ dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl;
+ if (!collection_exists(cid)) {
+ dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+ if (!collection_exists(dest)) {
+ dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+
+ int dstcmp = _check_replay_guard(dest, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ int srccmp = _check_replay_guard(cid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ _set_global_replay_guard(cid, spos);
+ _set_replay_guard(cid, spos, true);
+ _set_replay_guard(dest, spos, true);
+
+ Index from;
+ r = get_index(cid, &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest, &to);
+
+ if (!r) {
+ ceph_assert(from.index);
+ RWLock::WLocker l1((from.index)->access_lock);
+
+ ceph_assert(to.index);
+ RWLock::WLocker l2((to.index)->access_lock);
+
+ r = from->split(rem, bits, to.index);
+ }
+
+ _close_replay_guard(cid, spos);
+ _close_replay_guard(dest, spos);
+ }
+ _collection_set_bits(cid, bits);
+ if (!r && cct->_conf->filestore_debug_verify_split) {
+ vector<ghobject_t> objects;
+ ghobject_t next;
+ while (1) {
+ collection_list(
+ cid,
+ next, ghobject_t::get_max(),
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ dout(20) << __FUNC__ << ": " << *i << " still in source "
+ << cid << dendl;
+ ceph_assert(!i->match(bits, rem));
+ }
+ objects.clear();
+ }
+ next = ghobject_t();
+ while (1) {
+ collection_list(
+ dest,
+ next, ghobject_t::get_max(),
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ dout(20) << __FUNC__ << ": " << *i << " now in dest "
+ << *i << dendl;
+ ceph_assert(i->match(bits, rem));
+ }
+ objects.clear();
+ }
+ }
+ return r;
+}
+
+int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl;
+
+ FDRef fd;
+ int ret = 0;
+
+ if (expected_object_size == 0 || expected_write_size == 0)
+ goto out;
+
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0)
+ goto out;
+
+ {
+ // TODO: a more elaborate hint calculation
+ uint64_t hint = std::min<uint64_t>(expected_write_size, m_filestore_max_alloc_hint_size);
+
+ ret = backend->set_alloc_hint(**fd, hint);
+ dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl;
+ }
+
+ lfn_close(fd);
+out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl;
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+}
+
+const char** FileStore::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_max_inline_xattr_size",
+ "filestore_max_inline_xattr_size_xfs",
+ "filestore_max_inline_xattr_size_btrfs",
+ "filestore_max_inline_xattr_size_other",
+ "filestore_max_inline_xattrs",
+ "filestore_max_inline_xattrs_xfs",
+ "filestore_max_inline_xattrs_btrfs",
+ "filestore_max_inline_xattrs_other",
+ "filestore_max_xattr_value_size",
+ "filestore_max_xattr_value_size_xfs",
+ "filestore_max_xattr_value_size_btrfs",
+ "filestore_max_xattr_value_size_other",
+ "filestore_min_sync_interval",
+ "filestore_max_sync_interval",
+ "filestore_queue_max_ops",
+ "filestore_queue_max_bytes",
+ "filestore_expected_throughput_bytes",
+ "filestore_expected_throughput_ops",
+ "filestore_queue_low_threshhold",
+ "filestore_queue_high_threshhold",
+ "filestore_queue_high_delay_multiple",
+ "filestore_queue_max_delay_multiple",
+ "filestore_commit_timeout",
+ "filestore_dump_file",
+ "filestore_kill_at",
+ "filestore_fail_eio",
+ "filestore_fadvise",
+ "filestore_sloppy_crc",
+ "filestore_sloppy_crc_block_size",
+ "filestore_max_alloc_hint_size",
+ NULL
+ };
+ return KEYS;
+}
+
+void FileStore::handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other") ||
+ changed.count("filestore_max_xattr_value_size") ||
+ changed.count("filestore_max_xattr_value_size_xfs") ||
+ changed.count("filestore_max_xattr_value_size_btrfs") ||
+ changed.count("filestore_max_xattr_value_size_other")) {
+ if (backend) {
+ Mutex::Locker l(lock);
+ set_xattr_limits_via_conf();
+ }
+ }
+
+ if (changed.count("filestore_queue_max_bytes") ||
+ changed.count("filestore_queue_max_ops") ||
+ changed.count("filestore_expected_throughput_bytes") ||
+ changed.count("filestore_expected_throughput_ops") ||
+ changed.count("filestore_queue_low_threshhold") ||
+ changed.count("filestore_queue_high_threshhold") ||
+ changed.count("filestore_queue_high_delay_multiple") ||
+ changed.count("filestore_queue_max_delay_multiple")) {
+ Mutex::Locker l(lock);
+ set_throttle_params();
+ }
+
+ if (changed.count("filestore_min_sync_interval") ||
+ changed.count("filestore_max_sync_interval") ||
+ changed.count("filestore_kill_at") ||
+ changed.count("filestore_fail_eio") ||
+ changed.count("filestore_sloppy_crc") ||
+ changed.count("filestore_sloppy_crc_block_size") ||
+ changed.count("filestore_max_alloc_hint_size") ||
+ changed.count("filestore_fadvise")) {
+ Mutex::Locker l(lock);
+ m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
+ m_filestore_max_sync_interval = conf->filestore_max_sync_interval;
+ m_filestore_kill_at = conf->filestore_kill_at;
+ m_filestore_fail_eio = conf->filestore_fail_eio;
+ m_filestore_fadvise = conf->filestore_fadvise;
+ m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+ m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
+ m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
+ }
+ if (changed.count("filestore_commit_timeout")) {
+ Mutex::Locker l(sync_entry_timeo_lock);
+ m_filestore_commit_timeout = conf->filestore_commit_timeout;
+ }
+ if (changed.count("filestore_dump_file")) {
+ if (conf->filestore_dump_file.length() &&
+ conf->filestore_dump_file != "-") {
+ dump_start(conf->filestore_dump_file);
+ } else {
+ dump_stop();
+ }
+ }
+}
+
+int FileStore::set_throttle_params()
+{
+ stringstream ss;
+ bool valid = throttle_bytes.set_params(
+ cct->_conf->filestore_queue_low_threshhold,
+ cct->_conf->filestore_queue_high_threshhold,
+ cct->_conf->filestore_expected_throughput_bytes,
+ cct->_conf->filestore_queue_high_delay_multiple?
+ cct->_conf->filestore_queue_high_delay_multiple:
+ cct->_conf->filestore_queue_high_delay_multiple_bytes,
+ cct->_conf->filestore_queue_max_delay_multiple?
+ cct->_conf->filestore_queue_max_delay_multiple:
+ cct->_conf->filestore_queue_max_delay_multiple_bytes,
+ cct->_conf->filestore_queue_max_bytes,
+ &ss);
+
+ valid &= throttle_ops.set_params(
+ cct->_conf->filestore_queue_low_threshhold,
+ cct->_conf->filestore_queue_high_threshhold,
+ cct->_conf->filestore_expected_throughput_ops,
+ cct->_conf->filestore_queue_high_delay_multiple?
+ cct->_conf->filestore_queue_high_delay_multiple:
+ cct->_conf->filestore_queue_high_delay_multiple_ops,
+ cct->_conf->filestore_queue_max_delay_multiple?
+ cct->_conf->filestore_queue_max_delay_multiple:
+ cct->_conf->filestore_queue_max_delay_multiple_ops,
+ cct->_conf->filestore_queue_max_ops,
+ &ss);
+
+ logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max());
+ logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max());
+
+ if (!valid) {
+ derr << "tried to set invalid params: "
+ << ss.str()
+ << dendl;
+ }
+ return valid ? 0 : -EINVAL;
+}
+
+void FileStore::dump_start(const std::string& file)
+{
+ dout(10) << __FUNC__ << ": " << file << dendl;
+ if (m_filestore_do_dump) {
+ dump_stop();
+ }
+ m_filestore_dump_fmt.reset();
+ m_filestore_dump_fmt.open_array_section("dump");
+ m_filestore_dump.open(file.c_str());
+ m_filestore_do_dump = true;
+}
+
+void FileStore::dump_stop()
+{
+ dout(10) << __FUNC__ << dendl;
+ m_filestore_do_dump = false;
+ if (m_filestore_dump.is_open()) {
+ m_filestore_dump_fmt.close_section();
+ m_filestore_dump_fmt.flush(m_filestore_dump);
+ m_filestore_dump.flush();
+ m_filestore_dump.close();
+ }
+}
+
+void FileStore::dump_transactions(vector<ObjectStore::Transaction>& ls, uint64_t seq, OpSequencer *osr)
+{
+ m_filestore_dump_fmt.open_array_section("transactions");
+ unsigned trans_num = 0;
+ for (vector<ObjectStore::Transaction>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) {
+ m_filestore_dump_fmt.open_object_section("transaction");
+ m_filestore_dump_fmt.dump_stream("osr") << osr->cid;
+ m_filestore_dump_fmt.dump_unsigned("seq", seq);
+ m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num);
+ (*i).dump(&m_filestore_dump_fmt);
+ m_filestore_dump_fmt.close_section();
+ }
+ m_filestore_dump_fmt.close_section();
+ m_filestore_dump_fmt.flush(m_filestore_dump);
+ m_filestore_dump.flush();
+}
+
+void FileStore::get_db_statistics(Formatter* f)
+{
+ object_map->db->get_statistics(f);
+}
+
+void FileStore::set_xattr_limits_via_conf()
+{
+ uint32_t fs_xattr_size;
+ uint32_t fs_xattrs;
+ uint32_t fs_xattr_max_value_size;
+
+ switch (m_fs_type) {
+#if defined(__linux__)
+ case XFS_SUPER_MAGIC:
+ fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_xfs;
+ fs_xattrs = cct->_conf->filestore_max_inline_xattrs_xfs;
+ fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_xfs;
+ break;
+ case BTRFS_SUPER_MAGIC:
+ fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_btrfs;
+ fs_xattrs = cct->_conf->filestore_max_inline_xattrs_btrfs;
+ fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_btrfs;
+ break;
+#endif
+ default:
+ fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_other;
+ fs_xattrs = cct->_conf->filestore_max_inline_xattrs_other;
+ fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_other;
+ break;
+ }
+
+ // Use override value if set
+ if (cct->_conf->filestore_max_inline_xattr_size)
+ m_filestore_max_inline_xattr_size = cct->_conf->filestore_max_inline_xattr_size;
+ else
+ m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+ // Use override value if set
+ if (cct->_conf->filestore_max_inline_xattrs)
+ m_filestore_max_inline_xattrs = cct->_conf->filestore_max_inline_xattrs;
+ else
+ m_filestore_max_inline_xattrs = fs_xattrs;
+
+ // Use override value if set
+ if (cct->_conf->filestore_max_xattr_value_size)
+ m_filestore_max_xattr_value_size = cct->_conf->filestore_max_xattr_value_size;
+ else
+ m_filestore_max_xattr_value_size = fs_xattr_max_value_size;
+
+ if (m_filestore_max_xattr_value_size < cct->_conf->osd_max_object_name_len) {
+ derr << "WARNING: max attr value size ("
+ << m_filestore_max_xattr_value_size
+ << ") is smaller than osd_max_object_name_len ("
+ << cct->_conf->osd_max_object_name_len
+ << "). Your backend filesystem appears to not support attrs large "
+ << "enough to handle the configured max rados name size. You may get "
+ << "unexpected ENAMETOOLONG errors on rados operations or buggy "
+ << "behavior"
+ << dendl;
+ }
+}
+
+uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects)
+{
+ uint64_t res = num_objects * blk_size / 2; //assumes that each object uses ( in average ) additional 1/2 block due to FS allocation granularity.
+ return res;
+}
+
+int FileStore::apply_layout_settings(const coll_t &cid, int target_level)
+{
+ dout(20) << __FUNC__ << ": " << cid << " target level: "
+ << target_level << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return index->apply_layout_settings(target_level);
+}
+
+
+// -- FSSuperblock --
+
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(2, 1, bl);
+ compat_features.encode(bl);
+ encode(omap_backend, bl);
+ ENCODE_FINISH(bl);
+}
+
+void FSSuperblock::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ compat_features.decode(bl);
+ if (struct_v >= 2)
+ decode(omap_backend, bl);
+ else
+ omap_backend = "leveldb";
+ DECODE_FINISH(bl);
+}
+
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->dump_string("omap_backend", omap_backend);
+ f->close_section();
+}
+
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+ FSSuperblock z;
+ o.push_back(new FSSuperblock(z));
+ CompatSet::FeatureSet feature_compat;
+ CompatSet::FeatureSet feature_ro_compat;
+ CompatSet::FeatureSet feature_incompat;
+ feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ z.compat_features = CompatSet(feature_compat, feature_ro_compat,
+ feature_incompat);
+ o.push_back(new FSSuperblock(z));
+ z.omap_backend = "rocksdb";
+ o.push_back(new FSSuperblock(z));
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore.osr(" << this << ") "
+
+void FileStore::OpSequencer::_register_apply(Op *o)
+{
+ if (o->registered_apply) {
+ dout(20) << __func__ << " " << o << " already registered" << dendl;
+ return;
+ }
+ o->registered_apply = true;
+ for (auto& t : o->tls) {
+ for (auto& i : t.get_object_index()) {
+ uint32_t key = i.first.hobj.get_hash();
+ applying.emplace(make_pair(key, &i.first));
+ dout(20) << __func__ << " " << o << " " << i.first << " ("
+ << &i.first << ")" << dendl;
+ }
+ }
+}
+
+void FileStore::OpSequencer::_unregister_apply(Op *o)
+{
+ ceph_assert(o->registered_apply);
+ for (auto& t : o->tls) {
+ for (auto& i : t.get_object_index()) {
+ uint32_t key = i.first.hobj.get_hash();
+ auto p = applying.find(key);
+ bool removed = false;
+ while (p != applying.end() &&
+ p->first == key) {
+ if (p->second == &i.first) {
+ dout(20) << __func__ << " " << o << " " << i.first << " ("
+ << &i.first << ")" << dendl;
+ applying.erase(p);
+ removed = true;
+ break;
+ }
+ ++p;
+ }
+ ceph_assert(removed);
+ }
+ }
+}
+
+void FileStore::OpSequencer::wait_for_apply(const ghobject_t& oid)
+{
+ Mutex::Locker l(qlock);
+ uint32_t key = oid.hobj.get_hash();
+retry:
+ while (true) {
+ // search all items in hash slot for a matching object
+ auto p = applying.find(key);
+ while (p != applying.end() &&
+ p->first == key) {
+ if (*p->second == oid) {
+ dout(20) << __func__ << " " << oid << " waiting on " << p->second
+ << dendl;
+ cond.Wait(qlock);
+ goto retry;
+ }
+ ++p;
+ }
+ break;
+ }
+ dout(20) << __func__ << " " << oid << " done" << dendl;
+}
diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h
new file mode 100644
index 00000000..e09b9e04
--- /dev/null
+++ b/src/os/filestore/FileStore.h
@@ -0,0 +1,938 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILESTORE_H
+#define CEPH_FILESTORE_H
+
+#include "include/types.h"
+
+#include <map>
+#include <deque>
+#include <atomic>
+#include <fstream>
+
+
+#include <boost/scoped_ptr.hpp>
+
+#include "include/unordered_map.h"
+
+#include "include/ceph_assert.h"
+
+#include "os/ObjectStore.h"
+#include "JournalingObjectStore.h"
+
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/perf_counters.h"
+#include "common/zipkin_trace.h"
+
+#include "common/Mutex.h"
+#include "HashIndex.h"
+#include "IndexManager.h"
+#include "os/ObjectMap.h"
+#include "SequencerPosition.h"
+#include "FDCache.h"
+#include "WBThrottle.h"
+
+#include "include/uuid.h"
+
+#if defined(__linux__)
+# ifndef BTRFS_SUPER_MAGIC
+#define BTRFS_SUPER_MAGIC 0x9123683EUL
+# endif
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342UL
+# endif
+# ifndef ZFS_SUPER_MAGIC
+#define ZFS_SUPER_MAGIC 0x2fc12fc1UL
+# endif
+#endif
+
+
+class FileStoreBackend;
+
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+enum {
+ l_filestore_first = 84000,
+ l_filestore_journal_queue_ops,
+ l_filestore_journal_queue_bytes,
+ l_filestore_journal_ops,
+ l_filestore_journal_bytes,
+ l_filestore_journal_latency,
+ l_filestore_journal_wr,
+ l_filestore_journal_wr_bytes,
+ l_filestore_journal_full,
+ l_filestore_committing,
+ l_filestore_commitcycle,
+ l_filestore_commitcycle_interval,
+ l_filestore_commitcycle_latency,
+ l_filestore_op_queue_max_ops,
+ l_filestore_op_queue_ops,
+ l_filestore_ops,
+ l_filestore_op_queue_max_bytes,
+ l_filestore_op_queue_bytes,
+ l_filestore_bytes,
+ l_filestore_apply_latency,
+ l_filestore_queue_transaction_latency_avg,
+ l_filestore_sync_pause_max_lat,
+ l_filestore_last,
+};
+
+class FSSuperblock {
+public:
+ CompatSet compat_features;
+ string omap_backend;
+
+ FSSuperblock() { }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{
+ return out << "sb(" << sb.compat_features << "): "
+ << sb.omap_backend;
+}
+
+class FileStore : public JournalingObjectStore,
+ public md_config_obs_t
+{
+ static const uint32_t target_version = 4;
+public:
+ uint32_t get_target_version() {
+ return target_version;
+ }
+
+ static int get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid);
+ struct FSPerfTracker {
+ PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+ PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+ objectstore_perf_stat_t get_cur_stats() const {
+ objectstore_perf_stat_t ret;
+ ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
+ ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
+ return ret;
+ }
+
+ void update_from_perfcounters(PerfCounters &logger);
+ } perf_tracker;
+ objectstore_perf_stat_t get_cur_stats() override {
+ perf_tracker.update_from_perfcounters(*logger);
+ return perf_tracker.get_cur_stats();
+ }
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+
+private:
+ string internal_name; ///< internal name, used to name the perfcounter instance
+ string basedir, journalpath;
+ osflagbits_t generic_flags;
+ std::string current_fn;
+ std::string current_op_seq_fn;
+ std::string omap_dir;
+ uuid_d fsid;
+
+ size_t blk_size; ///< fs block size
+
+ int fsid_fd, op_fd, basedir_fd, current_fd;
+
+ FileStoreBackend *backend;
+
+ void create_backend(unsigned long f_type);
+
+ string devname;
+
+ int vdo_fd = -1;
+ string vdo_name;
+
+ deque<uint64_t> snaps;
+
+ // Indexed Collections
+ IndexManager index_manager;
+ int get_index(const coll_t& c, Index *index);
+ int init_index(const coll_t& c);
+
+ bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
+ // - normal temp case: cid is pg, object is temp (pool < -1)
+ // - hammer temp case: cid is pg (or already temp), object pool is -1
+ return cid.is_pg() && oid.hobj.pool <= -1;
+ }
+ void init_temp_collections();
+
+ void handle_eio();
+
+ // ObjectMap
+ boost::scoped_ptr<ObjectMap> object_map;
+
+ // helper fns
+ int get_cdir(const coll_t& cid, char *s, int len);
+
+ /// read a uuid from fd
+ int read_fsid(int fd, uuid_d *uuid);
+
+ /// lock fsid_fd
+ int lock_fsid();
+
+ // sync thread
+ Mutex lock;
+ bool force_sync;
+ Cond sync_cond;
+
+ Mutex sync_entry_timeo_lock;
+ SafeTimer timer;
+
+ list<Context*> sync_waiters;
+ bool stop;
+ void sync_entry();
+ struct SyncThread : public Thread {
+ FileStore *fs;
+ explicit SyncThread(FileStore *f) : fs(f) {}
+ void *entry() override {
+ fs->sync_entry();
+ return 0;
+ }
+ } sync_thread;
+
+ // -- op workqueue --
+ struct Op {
+ utime_t start;
+ uint64_t op;
+ vector<Transaction> tls;
+ Context *onreadable, *onreadable_sync;
+ uint64_t ops, bytes;
+ TrackedOpRef osd_op;
+ ZTracer::Trace trace;
+ bool registered_apply = false;
+ };
+ class OpSequencer : public CollectionImpl {
+ CephContext *cct;
+ Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+ list<Op*> q;
+ list<uint64_t> jq;
+ list<pair<uint64_t, Context*> > flush_commit_waiters;
+ Cond cond;
+ string osr_name_str;
+ /// hash of pointers to ghobject_t's for in-flight writes
+ unordered_multimap<uint32_t,const ghobject_t*> applying;
+ public:
+ Mutex apply_lock; // for apply mutual exclusion
+ int id;
+ const char *osr_name;
+
+ /// get_max_uncompleted
+ bool _get_max_uncompleted(
+ uint64_t *seq ///< [out] max uncompleted seq
+ ) {
+ ceph_assert(qlock.is_locked());
+ ceph_assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.back()->op;
+ if (!jq.empty() && jq.back() > *seq)
+ *seq = jq.back();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ /// get_min_uncompleted
+ bool _get_min_uncompleted(
+ uint64_t *seq ///< [out] min uncompleted seq
+ ) {
+ ceph_assert(qlock.is_locked());
+ ceph_assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.front()->op;
+ if (!jq.empty() && jq.front() < *seq)
+ *seq = jq.front();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ void _wake_flush_waiters(list<Context*> *to_queue) {
+ uint64_t seq;
+ if (_get_min_uncompleted(&seq))
+ seq = -1;
+
+ for (list<pair<uint64_t, Context*> >::iterator i =
+ flush_commit_waiters.begin();
+ i != flush_commit_waiters.end() && i->first < seq;
+ flush_commit_waiters.erase(i++)) {
+ to_queue->push_back(i->second);
+ }
+ }
+
+ void queue_journal(Op *o) {
+ Mutex::Locker l(qlock);
+ jq.push_back(o->op);
+ _register_apply(o);
+ }
+ void dequeue_journal(list<Context*> *to_queue) {
+ Mutex::Locker l(qlock);
+ jq.pop_front();
+ cond.Signal();
+ _wake_flush_waiters(to_queue);
+ }
+ void queue(Op *o) {
+ Mutex::Locker l(qlock);
+ q.push_back(o);
+ _register_apply(o);
+ o->trace.keyval("queue depth", q.size());
+ }
+ void _register_apply(Op *o);
+ void _unregister_apply(Op *o);
+ void wait_for_apply(const ghobject_t& oid);
+ Op *peek_queue() {
+ Mutex::Locker l(qlock);
+ ceph_assert(apply_lock.is_locked());
+ return q.front();
+ }
+
+ Op *dequeue(list<Context*> *to_queue) {
+ ceph_assert(to_queue);
+ ceph_assert(apply_lock.is_locked());
+ Mutex::Locker l(qlock);
+ Op *o = q.front();
+ q.pop_front();
+ cond.Signal();
+ _unregister_apply(o);
+ _wake_flush_waiters(to_queue);
+ return o;
+ }
+
+ void flush() override {
+ Mutex::Locker l(qlock);
+
+ while (cct->_conf->filestore_blackhole)
+ cond.Wait(qlock); // wait forever
+
+
+ // get max for journal _or_ op queues
+ uint64_t seq = 0;
+ if (!q.empty())
+ seq = q.back()->op;
+ if (!jq.empty() && jq.back() > seq)
+ seq = jq.back();
+
+ if (seq) {
+ // everything prior to our watermark to drain through either/both queues
+ while ((!q.empty() && q.front()->op <= seq) ||
+ (!jq.empty() && jq.front() <= seq))
+ cond.Wait(qlock);
+ }
+ }
+ bool flush_commit(Context *c) override {
+ Mutex::Locker l(qlock);
+ uint64_t seq = 0;
+ if (_get_max_uncompleted(&seq)) {
+ return true;
+ } else {
+ flush_commit_waiters.push_back(make_pair(seq, c));
+ return false;
+ }
+ }
+
+ OpSequencer(CephContext* cct, int i, coll_t cid)
+ : CollectionImpl(cid),
+ cct(cct),
+ qlock("FileStore::OpSequencer::qlock", false, false),
+ osr_name_str(stringify(cid)),
+ apply_lock("FileStore::OpSequencer::apply_lock", false, false),
+ id(i),
+ osr_name(osr_name_str.c_str()) {}
+ ~OpSequencer() override {
+ ceph_assert(q.empty());
+ }
+ };
+ typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+ Mutex coll_lock;
+ map<coll_t,OpSequencerRef> coll_map;
+
+ friend ostream& operator<<(ostream& out, const OpSequencer& s);
+
+ FDCache fdcache;
+ WBThrottle wbthrottle;
+
+ std::atomic<int64_t> next_osr_id = { 0 };
+ bool m_disable_wbthrottle;
+ deque<OpSequencer*> op_queue;
+ BackoffThrottle throttle_ops, throttle_bytes;
+ const int m_ondisk_finisher_num;
+ const int m_apply_finisher_num;
+ vector<Finisher*> ondisk_finishers;
+ vector<Finisher*> apply_finishers;
+
+ ThreadPool op_tp;
+ struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
+ FileStore *store;
+ OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
+ : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}
+
+ bool _enqueue(OpSequencer *osr) override {
+ store->op_queue.push_back(osr);
+ return true;
+ }
+ void _dequeue(OpSequencer *o) override {
+ ceph_abort();
+ }
+ bool _empty() override {
+ return store->op_queue.empty();
+ }
+ OpSequencer *_dequeue() override {
+ if (store->op_queue.empty())
+ return nullptr;
+ OpSequencer *osr = store->op_queue.front();
+ store->op_queue.pop_front();
+ return osr;
+ }
+ void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
+ store->_do_op(osr, handle);
+ }
+ void _process_finish(OpSequencer *osr) override {
+ store->_finish_op(osr);
+ }
+ void _clear() override {
+ ceph_assert(store->op_queue.empty());
+ }
+ } op_wq;
+
+ void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
+ void _finish_op(OpSequencer *o);
+ Op *build_op(vector<Transaction>& tls,
+ Context *onreadable, Context *onreadable_sync,
+ TrackedOpRef osd_op);
+ void queue_op(OpSequencer *osr, Op *o);
+ void op_queue_reserve_throttle(Op *o);
+ void op_queue_release_throttle(Op *o);
+ void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
+ friend struct C_JournaledAhead;
+
+ void new_journal();
+
+ PerfCounters *logger;
+
+ ZTracer::Endpoint trace_endpoint;
+
+public:
+ int lfn_find(const ghobject_t& oid, const Index& index,
+ IndexedPath *path = nullptr);
+ int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
+ int lfn_open(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index = nullptr);
+
+ void lfn_close(FDRef fd);
+ int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+ int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
+ bool force_clear_omap=false);
+
+public:
+ FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
+ osflagbits_t flags = 0,
+ const char *internal_name = "filestore", bool update_to=false);
+ ~FileStore() override;
+
+ string get_type() override {
+ return "filestore";
+ }
+
+ int _detect_fs();
+ int _sanity_check_fs();
+
+ bool test_mount_in_use() override;
+ int read_op_seq(uint64_t *seq);
+ int write_op_seq(int, uint64_t seq);
+ int mount() override;
+ int umount() override;
+
+ int validate_hobject_key(const hobject_t &obj) const override;
+
+ unsigned get_max_attr_name_length() override {
+ // xattr limit is 128; leave room for our prefixes (user.ceph._),
+ // some margin, and cap at 100
+ return 100;
+ }
+ int mkfs() override;
+ int mkjournal() override;
+ bool wants_journal() override {
+ return true;
+ }
+ bool allows_journal() override {
+ return true;
+ }
+ bool needs_journal() override {
+ return false;
+ }
+
+ bool is_sync_onreadable() const override {
+ return false;
+ }
+
+ bool is_rotational() override;
+ bool is_journal_rotational() override;
+
+ void dump_perf_counters(Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+
+ int flush_cache(ostream *os = NULL) override;
+ int write_version_stamp();
+ int version_stamp_is_valid(uint32_t *version);
+ int update_version_stamp();
+ int upgrade() override;
+
+ bool can_sort_nibblewise() override {
+ return true; // i support legacy sort order
+ }
+
+ void collect_metadata(map<string,string> *pm) override;
+ int get_devices(set<string> *ls) override;
+
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
+
+ int _do_transactions(
+ vector<Transaction> &tls, uint64_t op_seq,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name);
+ int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override {
+ return _do_transactions(tls, op_seq, nullptr, "replay");
+ }
+ void _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle, const char *osr_name);
+
+ CollectionHandle open_collection(const coll_t& c) override;
+ CollectionHandle create_new_collection(const coll_t& c) override;
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ int queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = nullptr) override;
+
+ /**
+ * set replay guard xattr on given file
+ *
+ * This will ensure that we will not replay this (or any previous) operation
+ * against this particular inode/object.
+ *
+ * @param fd open file descriptor for the file/object
+ * @param spos sequencer position of the last operation we should not replay
+ */
+ void _set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *oid=0,
+ bool in_progress=false);
+ void _set_replay_guard(const coll_t& cid,
+ const SequencerPosition& spos,
+ bool in_progress);
+ void _set_global_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos);
+
+ /// close a replay guard opened with in_progress=true
+ void _close_replay_guard(int fd, const SequencerPosition& spos,
+ const ghobject_t *oid=0);
+ void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+ /**
+ * check replay guard xattr on given file
+ *
+ * Check the current position against any marker on the file that
+ * indicates which operations have already been applied. If the
+ * current or a newer operation has been marked as applied, we
+ * should not replay the current operation again.
+ *
+ * If we are not replaying the journal, we already return true. It
+ * is only on replay that we might return false, indicated that the
+ * operation should not be performed (again).
+ *
+ * @param fd open fd on the file/object in question
+ * @param spos sequencerposition for an operation we could apply/replay
+ * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
+ */
+ int _check_replay_guard(int fd, const SequencerPosition& spos);
+ int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+ int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
+ int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+ // ------------------
+ // objects
+ int pick_object_revision_lt(ghobject_t& oid) {
+ return 0;
+ }
+ using ObjectStore::exists;
+ bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+ using ObjectStore::stat;
+ int stat(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override;
+ using ObjectStore::set_collection_opts;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ using ObjectStore::read;
+ int read(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0) override;
+ int _do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m);
+ int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m);
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
+
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags = 0);
+ int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos);
+ int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos);
+ int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
+ int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);
+
+ int _fgetattr(int fd, const char *name, bufferptr& bp);
+ int _fgetattrs(int fd, map<string,bufferptr>& aset);
+ int _fsetattrs(int fd, map<string, bufferptr> &aset);
+
+ void do_force_sync();
+ void start_sync(Context *onsafe);
+ void sync();
+ void _flush_op_queue();
+ void flush();
+ void sync_and_flush();
+
+ int flush_journal() override;
+ int dump_journal(ostream& out) override;
+
+ void set_fsid(uuid_d u) override {
+ fsid = u;
+ }
+ uuid_d get_fsid() override { return fsid; }
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override;
+
+ // DEBUG read error injection, an object is removed from both on delete()
+ Mutex read_error_lock;
+ set<ghobject_t> data_error_set; // read() will return -EIO
+ set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid) override;
+ void inject_mdata_error(const ghobject_t &oid) override;
+
+ void compact() override {
+ ceph_assert(object_map);
+ object_map->compact();
+ }
+
+ bool has_builtin_csum() const override {
+ return false;
+ }
+
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
+
+ int snapshot(const string& name) override;
+
+ // attrs
+ using ObjectStore::getattr;
+ using ObjectStore::getattrs;
+ int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr &bp) override;
+ int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override;
+
+ int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos);
+ int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos);
+ int _rmattrs(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos);
+
+ int _collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos);
+
+ int _collection_set_bits(const coll_t& cid, int bits);
+
+ // collections
+ using ObjectStore::collection_list;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(CollectionHandle& c,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *next) override {
+ c->flush();
+ return collection_list(c->cid, start, end, max, ls, next);
+ }
+ int collection_list(const coll_t& cid,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *next);
+ int list_collections(vector<coll_t>& ls) override;
+ int list_collections(vector<coll_t>& ls, bool include_temp);
+ int collection_stat(const coll_t& c, struct stat *st);
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override {
+ c->flush();
+ return collection_empty(c->cid, empty);
+ }
+ int collection_empty(const coll_t& cid, bool *empty);
+
+ // omap (see ObjectStore.h for documentation)
+ using ObjectStore::omap_get;
+ int omap_get(CollectionHandle& c, const ghobject_t &oid, bufferlist *header,
+ map<string, bufferlist> *out) override;
+ using ObjectStore::omap_get_header;
+ int omap_get_header(
+ CollectionHandle& c,
+ const ghobject_t &oid,
+ bufferlist *out,
+ bool allow_eio = false) override;
+ using ObjectStore::omap_get_keys;
+ int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, set<string> *keys) override;
+ using ObjectStore::omap_get_values;
+ int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys,
+ map<string, bufferlist> *out) override;
+ using ObjectStore::omap_check_keys;
+ int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys,
+ set<string> *out) override;
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override;
+ ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid);
+
+ int _create_collection(const coll_t& c, int bits,
+ const SequencerPosition &spos);
+ int _destroy_collection(const coll_t& c);
+ /**
+ * Give an expected number of objects hint to the collection.
+ *
+ * @param c - collection id.
+ * @param pg_num - pg number of the pool this collection belongs to
+ * @param expected_num_objs - expected number of objects in this collection
+ * @param spos - sequence position
+ *
+ * @return 0 on success, an error code otherwise
+ */
+ int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos);
+ int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
+ const SequencerPosition& spos);
+ int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos,
+ bool ignore_enoent = false);
+
+ int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+ void dump_start(const std::string& file);
+ void dump_stop();
+ void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);
+
+ virtual int apply_layout_settings(const coll_t &cid, int target_level);
+
+ void get_db_statistics(Formatter* f) override;
+
+private:
+ void _inject_failure();
+
+ // omap
+ int _omap_clear(const coll_t& cid, const ghobject_t &oid,
+ const SequencerPosition &spos);
+ int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+ const map<string, bufferlist> &aset,
+ const SequencerPosition &spos);
+ int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys,
+ const SequencerPosition &spos);
+ int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const string& first, const string& last,
+ const SequencerPosition &spos);
+ int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl,
+ const SequencerPosition &spos);
+ int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
+ const SequencerPosition &spos);
+ int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
+ const SequencerPosition &spos);
+
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+ int set_throttle_params();
+ float m_filestore_commit_timeout;
+ bool m_filestore_journal_parallel;
+ bool m_filestore_journal_trailing;
+ bool m_filestore_journal_writeahead;
+ int m_filestore_fiemap_threshold;
+ double m_filestore_max_sync_interval;
+ double m_filestore_min_sync_interval;
+ bool m_filestore_fail_eio;
+ bool m_filestore_fadvise;
+ int do_update;
+ bool m_journal_dio, m_journal_aio, m_journal_force_aio;
+ std::string m_osd_rollback_to_cluster_snap;
+ bool m_osd_use_stale_snap;
+ bool m_filestore_do_dump;
+ std::ofstream m_filestore_dump;
+ JSONFormatter m_filestore_dump_fmt;
+ std::atomic<int64_t> m_filestore_kill_at = { 0 };
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ uint64_t m_filestore_max_alloc_hint_size;
+ unsigned long m_fs_type;
+
+ //Determined xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+ uint32_t m_filestore_max_xattr_value_size;
+
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+ * Write superblock to persisent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
+
+ friend class FileStoreBackend;
+ friend class TestFileStore;
+};
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);
+
+struct fiemap;
+
+class FileStoreBackend {
+private:
+ FileStore *filestore;
+protected:
+ int get_basedir_fd() {
+ return filestore->basedir_fd;
+ }
+ int get_current_fd() {
+ return filestore->current_fd;
+ }
+ int get_op_fd() {
+ return filestore->op_fd;
+ }
+ size_t get_blksize() {
+ return filestore->blk_size;
+ }
+ const string& get_basedir_path() {
+ return filestore->basedir;
+ }
+ const string& get_journal_path() {
+ return filestore->journalpath;
+ }
+ const string& get_current_path() {
+ return filestore->current_fn;
+ }
+ int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
+ if (has_fiemap() || has_seek_data_hole()) {
+ return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
+ }
+ }
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
+
+public:
+ explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
+ virtual ~FileStoreBackend() {}
+
+ CephContext* cct() const {
+ return filestore->cct;
+ }
+
+ static FileStoreBackend *create(unsigned long f_type, FileStore *fs);
+
+ virtual const char *get_name() = 0;
+ virtual int detect_features() = 0;
+ virtual int create_current() = 0;
+ virtual bool can_checkpoint() = 0;
+ virtual int list_checkpoints(list<string>& ls) = 0;
+ virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
+ virtual int sync_checkpoint(uint64_t id) = 0;
+ virtual int rollback_to(const string& name) = 0;
+ virtual int destroy_checkpoint(const string& name) = 0;
+ virtual int syncfs() = 0;
+ virtual bool has_fiemap() = 0;
+ virtual bool has_seek_data_hole() = 0;
+ virtual bool is_rotational() = 0;
+ virtual bool is_journal_rotational() = 0;
+ virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
+ virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+ virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
+ virtual bool has_splice() const = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) = 0;
+};
+
+#endif
diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc
new file mode 100644
index 00000000..a75d501f
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.cc
@@ -0,0 +1,468 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "GenericFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+#include "common/blkdev.h"
+
+#include "common/SloppyCRCMap.h"
+#include "os/filestore/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
+ FileStoreBackend(fs),
+ ioctl_fiemap(false),
+ seek_data_hole(false),
+ use_splice(false),
+ m_filestore_fiemap(cct()->_conf->filestore_fiemap),
+ m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
+ m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
+ m_filestore_splice(cct()->_conf->filestore_splice)
+{
+ // rotational?
+ {
+ // NOTE: the below won't work on btrfs; we'll assume rotational.
+ string fn = get_basedir_path();
+ int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ return;
+ }
+ BlkDev blkdev(fd);
+ m_rotational = blkdev.is_rotational();
+ dout(20) << __func__ << " basedir " << fn
+ << " rotational " << (int)m_rotational << dendl;
+ ::close(fd);
+ }
+ // journal rotational?
+ {
+ // NOTE: the below won't work on btrfs; we'll assume rotational.
+ string fn = get_journal_path();
+ int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ return;
+ }
+ BlkDev blkdev(fd);
+ m_journal_rotational = blkdev.is_rotational();
+ dout(20) << __func__ << " journal filename " << fn.c_str()
+ << " journal rotational " << (int)m_journal_rotational << dendl;
+ ::close(fd);
+ }
+}
+
+int GenericFileStoreBackend::detect_features()
+{
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
+
+ int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ fd = -errno;
+ derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
+ return fd;
+ }
+
+ // ext4 has a bug in older kernels where fiemap will return an empty
+ // result in some cases. this is a file layout that triggers the bug
+ // on 2.6.34-rc5.
+ int v[] = {
+ 0x0000000000016000, 0x0000000000007000,
+ 0x000000000004a000, 0x0000000000007000,
+ 0x0000000000060000, 0x0000000000001000,
+ 0x0000000000061000, 0x0000000000008000,
+ 0x0000000000069000, 0x0000000000007000,
+ 0x00000000000a3000, 0x000000000000c000,
+ 0x000000000024e000, 0x000000000000c000,
+ 0x000000000028b000, 0x0000000000009000,
+ 0x00000000002b1000, 0x0000000000003000,
+ 0, 0
+ };
+ for (int i=0; v[i]; i++) {
+ int off = v[i++];
+ int len = v[i];
+
+ // write a large extent
+ char buf[len];
+ memset(buf, 1, sizeof(buf));
+ int r = ::lseek(fd, off, SEEK_SET);
+ if (r < 0) {
+ r = -errno;
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ r = write(fd, buf, sizeof(buf));
+ if (r < 0) {
+ derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ }
+
+ // fiemap an extent inside that
+ if (!m_filestore_fiemap) {
+ dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ struct fiemap *fiemap;
+ int r = do_fiemap(fd, 2430421, 59284, &fiemap);
+ if (r < 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ if (fiemap->fm_mapped_extents == 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
+ ioctl_fiemap = true;
+ }
+ free(fiemap);
+ }
+ }
+
+ // SEEK_DATA/SEEK_HOLE detection
+ if (!m_filestore_seek_data_hole) {
+ dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
+ seek_data_hole = false;
+ } else {
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
+ // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
+ // Fall back to use fiemap.
+ off_t hole_pos;
+
+ hole_pos = lseek(fd, 0, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == EINVAL) {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
+ seek_data_hole = false;
+ } else {
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return -errno;
+ }
+ } else {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
+ seek_data_hole = true;
+ }
+#endif
+ }
+
+ //splice detection
+#ifdef CEPH_HAVE_SPLICE
+ if (!m_filestore_splice) {
+ dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
+ use_splice = false;
+ } else {
+ int pipefd[2];
+ loff_t off_in = 0;
+ int r;
+ if (pipe_cloexec(pipefd) < 0) {
+ int e = errno;
+ dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl;
+ } else {
+ lseek(fd, 0, SEEK_SET);
+ r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
+ if (!(r < 0 && errno == EINVAL)) {
+ use_splice = true;
+ dout(0) << "detect_features: splice is supported" << dendl;
+ } else
+ dout(0) << "detect_features: splice is NOT supported" << dendl;
+ close(pipefd[0]);
+ close(pipefd[1]);
+ }
+ }
+#endif
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+
+ bool have_syncfs = false;
+#ifdef HAVE_SYS_SYNCFS
+ if (::syncfs(get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
+ }
+#elif defined(SYS_syncfs)
+ if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#elif defined(__NR_syncfs)
+ if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#endif
+ if (!have_syncfs) {
+ dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
+ } else {
+ dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
+ dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int GenericFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
+ ret = -EINVAL;
+ }
+ } else {
+ ret = ::mkdir(get_current_path().c_str(), 0755);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
+ }
+ }
+ return ret;
+}
+
+int GenericFileStoreBackend::syncfs()
+{
+ int ret;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
+ // make the file system's journal commit.
+ // this works with ext3, but NOT ext4
+ ret = ::fsync(get_op_fd());
+ if (ret < 0)
+ ret = -errno;
+ } else {
+ dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
+ ret = sync_filesystem(get_current_fd());
+ }
+ return ret;
+}
+
+int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
+{
+ struct fiemap *fiemap = NULL;
+ struct fiemap *_realloc_fiemap = NULL;
+ int size;
+ int ret;
+
+ fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
+ if (!fiemap)
+ return -ENOMEM;
+ /*
+ * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
+ * the result is (logical=4096, len=4096). It leak the [3990, 4096).
+ * Commit:"xfs: fix rounding error of fiemap length parameter
+ * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
+ * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
+ */
+ fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
+ fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
+ fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+ ret = -errno;
+ goto done_err;
+ }
+#endif
+ size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
+
+ _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
+ if (!_realloc_fiemap) {
+ ret = -ENOMEM;
+ goto done_err;
+ } else {
+ fiemap = _realloc_fiemap;
+ }
+
+ memset(fiemap->fm_extents, 0, size);
+
+ fiemap->fm_extent_count = fiemap->fm_mapped_extents;
+ fiemap->fm_mapped_extents = 0;
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+ ret = -errno;
+ goto done_err;
+ }
+ *pfiemap = fiemap;
+#endif
+ return 0;
+
+done_err:
+ *pfiemap = NULL;
+ free(fiemap);
+ return ret;
+}
+
+
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+ char buf[100];
+ bufferptr bp;
+ int r = 0;
+ int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+ if (l == -ENODATA) {
+ return 0;
+ }
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), buf, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+ }
+ }
+ bufferlist bl;
+ bl.append(std::move(bp));
+ auto p = bl.cbegin();
+ try {
+ decode(*cm, p);
+ }
+ catch (buffer::error &e) {
+ r = -EIO;
+ }
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+ bufferlist bl;
+ encode(*cm, bl);
+ int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm.write(off, len, bl, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.truncate(off);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.zero(off, len);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff)
+{
+ SloppyCRCMap scm_src(get_crc_block_size());
+ SloppyCRCMap scm_dst(get_crc_block_size());
+ int r = _crc_load_or_init(srcfd, &scm_src);
+ if (r < 0)
+ return r;
+ r = _crc_load_or_init(destfd, &scm_dst);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(destfd, &scm_dst);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ return scm.read(off, len, bl, out);
+}
diff --git a/src/os/filestore/GenericFileStoreBackend.h b/src/os/filestore/GenericFileStoreBackend.h
new file mode 100644
index 00000000..207c3d0d
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_GENERICFILESTOREBACKEDN_H
+#define CEPH_GENERICFILESTOREBACKEDN_H
+
+#include "FileStore.h"
+
+class SloppyCRCMap;
+
+class GenericFileStoreBackend : public FileStoreBackend {
+private:
+ bool ioctl_fiemap;
+ bool seek_data_hole;
+ bool use_splice;
+ bool m_filestore_fiemap;
+ bool m_filestore_seek_data_hole;
+ bool m_filestore_fsync_flushes_journal_data;
+ bool m_filestore_splice;
+ bool m_rotational = true;
+ bool m_journal_rotational = true;
+public:
+ explicit GenericFileStoreBackend(FileStore *fs);
+ ~GenericFileStoreBackend() override {}
+
+ const char *get_name() override {
+ return "generic";
+ }
+ int detect_features() override;
+ int create_current() override;
+ bool can_checkpoint() override { return false; }
+ bool is_rotational() override {
+ return m_rotational;
+ }
+ bool is_journal_rotational() override {
+ return m_journal_rotational;
+ }
+ int list_checkpoints(list<string>& ls) override { return 0; }
+ int create_checkpoint(const string& name, uint64_t *cid) override { return -EOPNOTSUPP; }
+ int sync_checkpoint(uint64_t id) override { return -EOPNOTSUPP; }
+ int rollback_to(const string& name) override { return -EOPNOTSUPP; }
+ int destroy_checkpoint(const string& name) override { return -EOPNOTSUPP; }
+ int syncfs() override;
+ bool has_fiemap() override { return ioctl_fiemap; }
+ bool has_seek_data_hole() override { return seek_data_hole; }
+ int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) override;
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override {
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+ int set_alloc_hint(int fd, uint64_t hint) override { return -EOPNOTSUPP; }
+ bool has_splice() const override { return use_splice; }
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+ int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+ int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) override;
+ int _crc_update_truncate(int fd, loff_t off) override;
+ int _crc_update_zero(int fd, loff_t off, size_t len) override;
+ int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) override;
+ int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) override;
+};
+#endif
diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc
new file mode 100644
index 00000000..ab56b43c
--- /dev/null
+++ b/src/os/filestore/HashIndex.cc
@@ -0,0 +1,1195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include <errno.h>
+
+#include "HashIndex.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+
+const string HashIndex::SUBDIR_ATTR = "contents";
+const string HashIndex::SETTINGS_ATTR = "settings";
+const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
+
+/// hex digit to integer value
+int hex_to_int(char c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ ceph_abort();
+}
+
+/// int value to hex digit
+char int_to_hex(int v)
+{
+ ceph_assert(v < 16);
+ if (v < 10)
+ return '0' + v;
+ return 'A' + v - 10;
+}
+
+/// reverse bits in a nibble (0..15)
+int reverse_nibble_bits(int in)
+{
+ ceph_assert(in < 16);
+ return
+ ((in & 8) >> 3) |
+ ((in & 4) >> 1) |
+ ((in & 2) << 1) |
+ ((in & 1) << 3);
+}
+
+/// reverse nibble bits in a hex digit
+char reverse_hexdigit_bits(char c)
+{
+ return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
+}
+
+/// reverse nibble bits in a hex string
+string reverse_hexdigit_bits_string(string s)
+{
+ for (unsigned i=0; i<s.size(); ++i)
+ s[i] = reverse_hexdigit_bits(s[i]);
+ return s;
+}
+
+/// compare hex digit (as length 1 string) bitwise
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+ ceph_assert(l.length() == 1 && r.length() == 1);
+ int lv = hex_to_int(l[0]);
+ int rv = hex_to_int(r[0]);
+ ceph_assert(lv < 16);
+ ceph_assert(rv < 16);
+ return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
+}
+
+/// compare hex digit string bitwise
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+ string ll = reverse_hexdigit_bits_string(l);
+ string rr = reverse_hexdigit_bits_string(r);
+ return ll < rr;
+}
+
+int HashIndex::cleanup() {
+ bufferlist bl;
+ int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0) {
+ // No in progress operations!
+ return 0;
+ }
+ auto i = bl.cbegin();
+ InProgressOp in_progress(i);
+ subdir_info_s info;
+ r = get_info(in_progress.path, &info);
+ if (r == -ENOENT) {
+ return end_split_or_merge(in_progress.path);
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (in_progress.is_split())
+ return complete_split(in_progress.path, info);
+ else if (in_progress.is_merge())
+ return complete_merge(in_progress.path, info);
+ else if (in_progress.is_col_split()) {
+ for (vector<string>::iterator i = in_progress.path.begin();
+ i != in_progress.path.end();
+ ++i) {
+ vector<string> path(in_progress.path.begin(), i);
+ int r = reset_attr(path);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+ }
+ else
+ return -EINVAL;
+}
+
+int HashIndex::reset_attr(
+ const vector<string> &path)
+{
+ int exists = 0;
+ int r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists)
+ return 0;
+ map<string, ghobject_t> objects;
+ vector<string> subdirs;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ subdir_info_s info;
+ info.hash_level = path.size();
+ info.objs = objects.size();
+ info.subdirs = subdirs.size();
+ return set_info(path, info);
+}
+
+int HashIndex::col_split_level(
+ HashIndex &from,
+ HashIndex &to,
+ const vector<string> &path,
+ uint32_t inbits,
+ uint32_t match,
+ unsigned *mkdirred)
+{
+ /* For each subdir, move, recurse, or ignore based on comparing the low order
+ * bits of the hash represented by the subdir path with inbits, match passed
+ * in.
+ */
+ vector<string> subdirs;
+ int r = from.list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+
+ set<string> to_move;
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ uint32_t bits = 0;
+ uint32_t hash = 0;
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(*i);
+ path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+ if (bits < inbits) {
+ if (hobject_t::match_hash(hash, bits, match)) {
+ r = col_split_level(
+ from,
+ to,
+ sub_path,
+ inbits,
+ match,
+ mkdirred);
+ if (r < 0)
+ return r;
+ if (*mkdirred > path.size())
+ *mkdirred = path.size();
+ } // else, skip, doesn't need to be moved or recursed into
+ } else {
+ if (hobject_t::match_hash(hash, inbits, match)) {
+ to_move.insert(*i);
+ }
+ } // else, skip, doesn't need to be moved or recursed into
+ }
+
+ /* Then, do the same for each object */
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->second.match(inbits, match)) {
+ objs_to_move.insert(*i);
+ }
+ }
+
+ if (objs_to_move.empty() && to_move.empty())
+ return 0;
+
+ // Make parent directories as needed
+ while (*mkdirred < path.size()) {
+ ++*mkdirred;
+ int exists = 0;
+ vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+ r = to.path_exists(creating_path, &exists);
+ if (r < 0)
+ return r;
+ if (exists)
+ continue;
+ subdir_info_s info;
+ info.objs = 0;
+ info.subdirs = 0;
+ info.hash_level = creating_path.size();
+ if (*mkdirred < path.size() - 1)
+ info.subdirs = 1;
+ r = to.start_col_split(creating_path);
+ if (r < 0)
+ return r;
+ r = to.create_path(creating_path);
+ if (r < 0)
+ return r;
+ r = to.set_info(creating_path, info);
+ if (r < 0)
+ return r;
+ r = to.end_split_or_merge(creating_path);
+ if (r < 0)
+ return r;
+ }
+
+ subdir_info_s from_info;
+ subdir_info_s to_info;
+ r = from.get_info(path, &from_info);
+ if (r < 0)
+ return r;
+ r = to.get_info(path, &to_info);
+ if (r < 0)
+ return r;
+
+ from.start_col_split(path);
+ to.start_col_split(path);
+
+ // Do subdir moves
+ for (set<string>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ from_info.subdirs--;
+ to_info.subdirs++;
+ r = move_subdir(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
+ i != objs_to_move.end();
+ ++i) {
+ from_info.objs--;
+ to_info.objs++;
+ r = move_object(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+
+ r = to.set_info(path, to_info);
+ if (r < 0)
+ return r;
+ r = from.set_info(path, from_info);
+ if (r < 0)
+ return r;
+ from.end_split_or_merge(path);
+ to.end_split_or_merge(path);
+ return 0;
+}
+
+int HashIndex::_merge(
+ uint32_t bits,
+ CollectionIndex* dest) {
+ dout(20) << __func__ << " bits " << bits << dendl;
+ ceph_assert(collection_version() == dest->collection_version());
+
+ vector<string> emptypath;
+
+ // pre-split to common/target level so that any shared prefix DIR_?
+ // directories already exist at the destination. Since each
+ // directory is a nibble (4 bits),
+ unsigned shared = bits / 4;
+ dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
+ if (shared) {
+ split_dirs(emptypath, shared);
+ ((HashIndex*)dest)->split_dirs(emptypath, shared);
+ }
+
+ // now merge the contents
+ _merge_dirs(*this, *(HashIndex*)dest, emptypath);
+
+ return 0;
+}
+
+int HashIndex::_merge_dirs(
+ HashIndex& from,
+ HashIndex& to,
+ const vector<string>& path)
+{
+ dout(20) << __func__ << " path " << path << dendl;
+ int r;
+
+ vector<string> src_subs, dst_subs;
+ r = from.list_subdirs(path, &src_subs);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "from.list_subdirs"
+ << dendl;
+ return r;
+ }
+ r = to.list_subdirs(path, &dst_subs);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "to.list_subdirs"
+ << dendl;
+ return r;
+ }
+
+ for (auto& i : src_subs) {
+ if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
+ // move it
+ r = move_subdir(from, to, path, i);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "move_subdir(...,"
+ << path << "," << i << ")"
+ << dendl;
+ return r;
+ }
+ } else {
+ // common, recurse!
+ vector<string> nested = path;
+ nested.push_back(i);
+ r = _merge_dirs(from, to, nested);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "rec _merge_dirs"
+ << dendl;
+ return r;
+ }
+
+ // now remove it
+ r = remove_path(nested);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "remove_path "
+ << nested
+ << dendl;
+ return r;
+ }
+ }
+ }
+
+ // objects
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "from.list_objects"
+ << dendl;
+ return r;
+ }
+
+ for (auto& i : objects) {
+ r = move_object(from, to, path, i);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "move_object(...,"
+ << path << "," << i << ")"
+ << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
+int HashIndex::_split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest) {
+ ceph_assert(collection_version() == dest->collection_version());
+ unsigned mkdirred = 0;
+
+ return col_split_level(
+ *this,
+ *static_cast<HashIndex*>(dest),
+ vector<string>(),
+ bits,
+ match,
+ &mkdirred);
+}
+
+int HashIndex::split_dirs(const vector<string> &path, int target_level) {
+ dout(20) << __func__ << " " << path << " target level: "
+ << target_level << dendl;
+ subdir_info_s info;
+ int r = get_info(path, &info);
+ if (r < 0) {
+ dout(10) << "error looking up info for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (must_split(info, target_level)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, " << info.hash_level
+ << " level, starting split in pg " << coll() << "." << dendl;
+ r = initiate_split(path, info);
+ if (r < 0) {
+ dout(10) << "error initiating split on " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = complete_split(path, info);
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+ << dendl;
+ if (r < 0) {
+ dout(10) << "error completing split on " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0) {
+ dout(10) << "error listing subdirs of " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ for (vector<string>::const_iterator it = subdirs.begin();
+ it != subdirs.end(); ++it) {
+ vector<string> subdir_path(path);
+ subdir_path.push_back(*it);
+ r = split_dirs(subdir_path, target_level);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+int HashIndex::apply_layout_settings(int target_level) {
+ vector<string> path;
+ dout(10) << __func__ << " split multiple = " << split_multiplier
+ << " merge threshold = " << merge_threshold
+ << " split rand factor = " << cct->_conf->filestore_split_rand_factor
+ << " target level = " << target_level
+ << dendl;
+ int r = write_settings();
+ if (r < 0)
+ return r;
+ return split_dirs(path, target_level);
+}
+
+int HashIndex::_init() {
+ subdir_info_s info;
+ vector<string> path;
+ int r = set_info(path, info);
+ if (r < 0)
+ return r;
+ return write_settings();
+}
+
+int HashIndex::write_settings() {
+ if (cct->_conf->filestore_split_rand_factor > 0) {
+ settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
+ } else {
+ settings.split_rand_factor = 0;
+ }
+ vector<string> path;
+ bufferlist bl;
+ settings.encode(bl);
+ return add_attr_path(path, SETTINGS_ATTR, bl);
+}
+
+int HashIndex::read_settings() {
+ vector<string> path;
+ bufferlist bl;
+ int r = get_attr_path(path, SETTINGS_ATTR, bl);
+ if (r == -ENODATA)
+ return 0;
+ if (r < 0) {
+ derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ auto it = bl.cbegin();
+ settings.decode(it);
+ dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
+ return 0;
+}
+
+/* LFNIndex virtual method implementations */
+int HashIndex::_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ subdir_info_s info;
+ int r;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs++;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+
+ if (must_split(info)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, starting split in pg " << coll() << "." << dendl;
+ int r = initiate_split(path, info);
+ if (r < 0)
+ return r;
+ r = complete_split(path, info);
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+ << dendl;
+ return r;
+ } else {
+ return 0;
+ }
+}
+
+int HashIndex::_remove(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ int r;
+ r = remove_object(path, oid);
+ if (r < 0)
+ return r;
+ subdir_info_s info;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs--;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+ if (must_merge(info)) {
+ r = initiate_merge(path, info);
+ if (r < 0)
+ return r;
+ return complete_merge(path, info);
+ } else {
+ return 0;
+ }
+}
+
+int HashIndex::_lookup(const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink) {
+ vector<string> path_comp;
+ get_path_components(oid, &path_comp);
+ vector<string>::iterator next = path_comp.begin();
+ int exists;
+ while (1) {
+ int r = path_exists(*path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ if (path->empty())
+ return -ENOENT;
+ path->pop_back();
+ break;
+ }
+ if (next == path_comp.end())
+ break;
+ path->push_back(*(next++));
+ }
+ return get_mangled_name(*path, oid, mangled_name, hardlink);
+}
+
+int HashIndex::_collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
+ vector<string> path;
+ ghobject_t _next;
+ if (!next)
+ next = &_next;
+ *next = start;
+ dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+ return list_by_hash(path, end, max_count, next, ls);
+}
+
+int HashIndex::prep_delete() {
+ return recursive_remove(vector<string>());
+}
+
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+ int ret;
+ vector<string> path;
+ subdir_info_s root_info;
+ // Make sure there is neither objects nor sub-folders
+ // in this collection
+ ret = get_info(path, &root_info);
+ if (ret < 0)
+ return ret;
+
+ // Do the folder splitting first
+ ret = pre_split_folder(pg_num, expected_num_objs);
+ if (ret < 0)
+ return ret;
+ // Initialize the folder info starting from root
+ return init_split_folder(path, 0);
+}
+
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ // If folder merging is enabled (by setting the threshold positive),
+ // no need to split
+ if (merge_threshold > 0)
+ return 0;
+ const coll_t c = coll();
+ // Do not split if the expected number of objects in this collection is zero (by default)
+ if (expected_num_objs == 0)
+ return 0;
+
+ // Calculate the number of leaf folders (which actually store files)
+ // need to be created
+ const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
+ uint64_t leavies = expected_num_objs / objs_per_folder ;
+ // No need to split
+ if (leavies == 0 || expected_num_objs == objs_per_folder)
+ return 0;
+
+ spg_t spgid;
+ if (!c.is_pg_prefix(&spgid))
+ return -EINVAL;
+ const ps_t ps = spgid.pgid.ps();
+
+ // the most significant bits of pg_num
+ const int pg_num_bits = calc_num_bits(pg_num - 1);
+ ps_t tmp_id = ps;
+ // calculate the number of levels we only create one sub folder
+ int num = pg_num_bits / 4;
+ // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+ // so that splitting starts at level 3
+ if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
+ --num;
+ }
+
+ int ret;
+ // Start with creation that only has one subfolder
+ vector<string> paths;
+ int dump_num = num;
+ while (num-- > 0) {
+ ps_t v = tmp_id & 0x0000000f;
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ tmp_id = tmp_id >> 4;
+ }
+
+ // Starting from here, we can split by creating multiple subfolders
+ const int left_bits = pg_num_bits - dump_num * 4;
+ // this variable denotes how many bits (for this level) that can be
+ // used for sub folder splitting
+ int split_bits = 4 - left_bits;
+ // the below logic is inspired by rados.h#ceph_stable_mod,
+ // it basically determines how many sub-folders should we
+ // create for splitting
+ ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
+ if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
+ ++split_bits;
+ }
+ const uint32_t subs = (1 << split_bits);
+ // Calculate how many levels we create starting from here
+ int level = 0;
+ int level_limit = MAX_HASH_LEVEL - dump_num - 1;
+ uint64_t actual_leaves = subs;
+ while (actual_leaves < leavies && level < level_limit) {
+ ++level;
+ actual_leaves <<= 4;
+ }
+ for (uint32_t i = 0; i < subs; ++i) {
+ ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
+ int v = tmp_id | (i << ((4 - split_bits) % 4));
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(paths, level);
+ if (ret < 0)
+ return ret;
+ paths.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+ // Get the number of sub directories for the current path
+ vector<string> subdirs;
+ int ret = list_subdirs(path, &subdirs);
+ if (ret < 0)
+ return ret;
+ subdir_info_s info;
+ info.subdirs = subdirs.size();
+ info.hash_level = hash_level;
+ ret = set_info(path, info);
+ if (ret < 0)
+ return ret;
+ ret = fsync_dir(path);
+ if (ret < 0)
+ return ret;
+
+ // Do the same for subdirs
+ vector<string>::const_iterator iter;
+ for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
+ path.push_back(*iter);
+ ret = init_split_folder(path, hash_level + 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+ if (level == 0)
+ return 0;
+ for (int i = 0; i < 16; ++i) {
+ path.push_back(to_hex(i));
+ int ret = create_path(path);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(path, level - 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_remove(const vector<string> &path) {
+ return _recursive_remove(path, true);
+}
+
+int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
+ vector<string> subdirs;
+ dout(20) << __func__ << " path=" << path << dendl;
+ int r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ if (!objects.empty())
+ return -ENOTEMPTY;
+ vector<string> subdir(path);
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ subdir.push_back(*i);
+ r = _recursive_remove(subdir, false);
+ if (r < 0)
+ return r;
+ subdir.pop_back();
+ }
+ if (top)
+ return 0;
+ else
+ return remove_path(path);
+}
+
+int HashIndex::start_col_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_merge(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::MERGE, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::end_split_or_merge(const vector<string> &path) {
+ return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
+}
+
+int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
+ bufferlist buf;
+ int r = get_attr_path(path, SUBDIR_ATTR, buf);
+ if (r < 0)
+ return r;
+ auto bufiter = buf.cbegin();
+ info->decode(bufiter);
+ ceph_assert(path.size() == (unsigned)info->hash_level);
+ return 0;
+}
+
+int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
+ bufferlist buf;
+ ceph_assert(path.size() == (unsigned)info.hash_level);
+ info.encode(buf);
+ return add_attr_path(path, SUBDIR_ATTR, buf);
+}
+
+bool HashIndex::must_merge(const subdir_info_s &info) {
+ return (info.hash_level > 0 &&
+ merge_threshold > 0 &&
+ info.objs < (unsigned)merge_threshold &&
+ info.subdirs == 0);
+}
+
+bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
+ // target_level is used for ceph-objectstore-tool to split dirs offline.
+ // if it is set (defalult is 0) and current hash level < target_level,
+ // this dir would be split no matters how many objects it has.
+ return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
+ ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
+ (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
+}
+
+int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
+ return start_merge(path);
+}
+
+int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
+ vector<string> dst = path;
+ dst.pop_back();
+ subdir_info_s dstinfo;
+ int r, exists;
+ r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ r = get_info(dst, &dstinfo);
+ if (r < 0)
+ return r;
+ if (exists) {
+ r = move_objects(path, dst);
+ if (r < 0)
+ return r;
+ r = reset_attr(dst);
+ if (r < 0)
+ return r;
+ r = remove_path(path);
+ if (r < 0)
+ return r;
+ }
+ if (must_merge(dstinfo)) {
+ r = initiate_merge(dst, dstinfo);
+ if (r < 0)
+ return r;
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return complete_merge(dst, dstinfo);
+ }
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
+ return start_split(path);
+}
+
+int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
+ int level = info.hash_level;
+ map<string, ghobject_t> objects;
+ vector<string> dst = path;
+ int r;
+ dst.push_back("");
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ vector<string> subdirs_vec;
+ r = list_subdirs(path, &subdirs_vec);
+ if (r < 0)
+ return r;
+ set<string> subdirs;
+ subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
+ int num_moved = 0;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ vector<string> new_path;
+ get_path_components(i->second, &new_path);
+ mapped[new_path[level]][i->first] = i->second;
+ }
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
+ i != mapped.end();
+ ) {
+ dst[level] = i->first;
+ /* If the info already exists, it must be correct,
+ * we may be picking up a partially finished split */
+ subdir_info_s temp;
+ // subdir has already been fully copied
+ if (subdirs.count(i->first) && !get_info(dst, &temp)) {
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ }
+ ++i;
+ continue;
+ }
+
+ subdir_info_s info_new;
+ info_new.objs = i->second.size();
+ info_new.subdirs = 0;
+ info_new.hash_level = level + 1;
+ if (must_merge(info_new) && !subdirs.count(i->first)) {
+ mapped.erase(i++);
+ continue;
+ }
+
+ // Subdir doesn't yet exist
+ if (!subdirs.count(i->first)) {
+ info.subdirs += 1;
+ r = create_path(dst);
+ if (r < 0)
+ return r;
+ } // else subdir has been created but only partially copied
+
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ r = link_object(path, dst, j->second, j->first);
+ // May be a partially finished split
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+ }
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ // Presence of info must imply that all objects have been copied
+ r = set_info(dst, info_new);
+ if (r < 0)
+ return r;
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ ++i;
+ }
+ r = remove_objects(path, moved, &objects);
+ if (r < 0)
+ return r;
+ info.objs = objects.size();
+ r = reset_attr(path);
+ if (r < 0)
+ return r;
+ r = fsync_dir(path);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+void HashIndex::get_path_components(const ghobject_t &oid,
+ vector<string> *path) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
+
+ // Path components are the hex characters of oid.hobj.hash, least
+ // significant first
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ path->push_back(string(&buf[i], 1));
+ }
+}
+
+string HashIndex::get_hash_str(uint32_t hash) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
+ string retval;
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
+ }
+ return retval;
+}
+
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ ceph_assert(!oid.is_max());
+ return get_hash_str(oid.hobj.get_hash());
+}
+
+uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
+ while (prefix.size() < sizeof(uint32_t) * 2) {
+ prefix.push_back('0');
+ }
+ uint32_t hash;
+ sscanf(prefix.c_str(), "%x", &hash);
+ // nibble reverse
+ hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
+ hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
+ hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
+ return hash;
+}
+
+int HashIndex::get_path_contents_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t *next_object,
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
+ map<string, ghobject_t> rev_objects;
+ int r;
+ r = list_objects(path, 0, 0, &rev_objects);
+ if (r < 0)
+ return r;
+ // bitwise sort
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+ i != rev_objects.end();
+ ++i) {
+ if (next_object && i->second < *next_object)
+ continue;
+ string hash_prefix = get_path_str(i->second);
+ hash_prefixes->insert(hash_prefix);
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+ }
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ // sort subdirs bitwise (by reversing hex digit nibbles)
+ std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+ // Local to this function, we will convert the prefix strings
+ // (previously simply the reversed hex digits) to also have each
+ // digit's nibbles reversed. This will make the strings sort
+ // bitwise.
+ string cur_prefix;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ cur_prefix.append(reverse_hexdigit_bits_string(*i));
+ }
+ string next_object_string;
+ if (next_object)
+ next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+ if (next_object) {
+ if (next_object->is_max())
+ continue;
+ if (candidate < next_object_string.substr(0, candidate.size()))
+ continue;
+ }
+ // re-reverse the hex digit nibbles for the caller
+ hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
+ }
+ return 0;
+}
+
+int HashIndex::list_by_hash(const vector<string> &path,
+ const ghobject_t &end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ ceph_assert(out);
+ return list_by_hash_bitwise(path, end, max_count, next, out);
+}
+
+int HashIndex::list_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t& end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ vector<string> next_path = path;
+ next_path.push_back("");
+ set<string, CmpHexdigitStringBitwise> hash_prefixes;
+ set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+ int r = get_path_contents_by_hash_bitwise(path,
+ next,
+ &hash_prefixes,
+ &objects);
+ if (r < 0)
+ return r;
+ for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
+ i != hash_prefixes.end();
+ ++i) {
+ dout(20) << __func__ << " prefix " << *i << dendl;
+ set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
+ if (j == objects.end() || j->first != *i) {
+ *(next_path.rbegin()) = *(i->rbegin());
+ ghobject_t next_recurse;
+ if (next)
+ next_recurse = *next;
+ r = list_by_hash_bitwise(next_path,
+ end,
+ max_count,
+ &next_recurse,
+ out);
+
+ if (r < 0)
+ return r;
+ if (!next_recurse.is_max()) {
+ if (next)
+ *next = next_recurse;
+ return 0;
+ }
+ } else {
+ while (j != objects.end() && j->first == *i) {
+ if (max_count > 0 && out->size() == (unsigned)max_count) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (j->second >= end) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (!next || j->second >= *next) {
+ dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+ out->push_back(j->second);
+ }
+ ++j;
+ }
+ }
+ }
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+}
+
+
diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h
new file mode 100644
index 00000000..7e34d155
--- /dev/null
+++ b/src/os/filestore/HashIndex.h
@@ -0,0 +1,462 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_HASHINDEX_H
+#define CEPH_HASHINDEX_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "LFNIndex.h"
+
+extern string reverse_hexdigit_bits_string(string l);
+
+/**
+ * Implements collection prehashing.
+ *
+ * @verbatim
+ * (root) - 0 - 0
+ * - 1
+ * - E
+ * - 1
+ * - 2 - D - 0
+ * .
+ * .
+ * .
+ * - F - 0
+ * @endverbatim
+ *
+ * A file is located at the longest existing directory from the root
+ * given by the hex characters in the hash beginning with the least
+ * significant.
+ *
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * would be located in (root)/2/D/0/
+ *
+ * Subdirectories are created when the number of objects in a
+ * directory exceed 16 * (abs(merge_threshhold) * split_multiplier +
+ * split_rand_factor). The number of objects in a directory is encoded
+ * as subdir_info_s in an xattr on the directory.
+ */
+class HashIndex : public LFNIndex {
+private:
+ /// Attribute name for storing subdir info @see subdir_info_s
+ static const string SUBDIR_ATTR;
+ /// Attribute name for storing index-wide settings
+ static const string SETTINGS_ATTR;
+ /// Attribute name for storing in progress op tag
+ static const string IN_PROGRESS_OP_TAG;
+ /// Size (bits) in object hash
+ static const int PATH_HASH_LEN = 32;
+ /// Max length of hashed path
+ static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);
+
+ /**
+ * Merges occur when the number of object drops below
+ * merge_threshold and splits occur when the number of objects
+ * exceeds:
+ *
+ * 16 * (abs(merge_threshold) * split_multiplier + split_rand_factor)
+ *
+ * Please note if merge_threshold is less than zero, it will never
+ * do merging
+ */
+ int merge_threshold;
+ int split_multiplier;
+
+ /// Encodes current subdir state for determining when to split/merge.
+ struct subdir_info_s {
+ uint64_t objs; ///< Objects in subdir.
+ uint32_t subdirs; ///< Subdirs in subdir.
+ uint32_t hash_level; ///< Hashlevel of subdir.
+
+ subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}
+
+ void encode(bufferlist &bl) const
+ {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(objs, bl);
+ encode(subdirs, bl);
+ encode(hash_level, bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl)
+ {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(objs, bl);
+ decode(subdirs, bl);
+ decode(hash_level, bl);
+ }
+ };
+
+ struct settings_s {
+ uint32_t split_rand_factor; ///< random factor added to split threshold (only on root of collection)
+ settings_s() : split_rand_factor(0) {}
+ void encode(bufferlist &bl) const
+ {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(split_rand_factor, bl);
+ }
+ void decode(bufferlist::const_iterator &bl)
+ {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ decode(split_rand_factor, bl);
+ }
+ } settings;
+
+ /// Encodes in progress split or merge
+ struct InProgressOp {
+ static const int SPLIT = 0;
+ static const int MERGE = 1;
+ static const int COL_SPLIT = 2;
+ int op;
+ vector<string> path;
+
+ InProgressOp(int op, const vector<string> &path)
+ : op(op), path(path) {}
+
+ explicit InProgressOp(bufferlist::const_iterator &bl) {
+ decode(bl);
+ }
+
+ bool is_split() const { return op == SPLIT; }
+ bool is_col_split() const { return op == COL_SPLIT; }
+ bool is_merge() const { return op == MERGE; }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(op, bl);
+ encode(path, bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(op, bl);
+ decode(path, bl);
+ }
+ };
+
+
+public:
+ /// Constructor.
+ HashIndex(
+ CephContext* cct,
+ coll_t collection, ///< [in] Collection
+ const char *base_path, ///< [in] Path to the index root.
+ int merge_at, ///< [in] Merge threshold.
+ int split_multiple, ///< [in] Split threshold.
+ uint32_t index_version,///< [in] Index version
+ double retry_probability=0) ///< [in] retry probability
+ : LFNIndex(cct, collection, base_path, index_version, retry_probability),
+ merge_threshold(merge_at),
+ split_multiplier(split_multiple)
+ {}
+
+ int read_settings() override;
+
+ /// @see CollectionIndex
+ uint32_t collection_version() override { return index_version; }
+
+ /// @see CollectionIndex
+ int cleanup() override;
+
+ /// @see CollectionIndex
+ int prep_delete() override;
+
+ /// @see CollectionIndex
+ int _split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override;
+
+ /// @see CollectionIndex
+ int _merge(
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override;
+
+ int _merge_dirs(
+ HashIndex& from,
+ HashIndex& to,
+ const vector<string>& path);
+
+ /// @see CollectionIndex
+ int apply_layout_settings(int target_level) override;
+
+protected:
+ int _init() override;
+
+ int _created(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name
+ ) override;
+ int _remove(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name
+ ) override;
+ int _lookup(
+ const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink
+ ) override;
+
+ /**
+ * Pre-hash the collection to create folders according to the expected number
+ * of objects in this collection.
+ */
+ int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) override;
+
+ int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) override;
+private:
+ /// Internal recursively remove path and its subdirs
+ int _recursive_remove(
+ const vector<string> &path, ///< [in] path to remove
+ bool top ///< [in] internal tracking of first caller
+ ); /// @return Error Code, 0 on success
+ /// Recursively remove path and its subdirs
+ int recursive_remove(
+ const vector<string> &path ///< [in] path to remove
+ ); /// @return Error Code, 0 on success
+ /// Tag root directory at beginning of col_split
+ int start_col_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of split
+ int start_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of split
+ int start_merge(
+ const vector<string> &path ///< [in] path to merge
+ ); ///< @return Error Code, 0 on success
+ /// Remove tag at end of split or merge
+ int end_split_or_merge(
+ const vector<string> &path ///< [in] path to split or merged
+ ); ///< @return Error Code, 0 on success
+ /// Gets info from the xattr on the subdir represented by path
+ int get_info(
+ const vector<string> &path, ///< [in] Path from which to read attribute.
+ subdir_info_s *info ///< [out] Attribute value
+ ); /// @return Error Code, 0 on success
+
+ /// Sets info to the xattr on the subdir represented by path
+ int set_info(
+ const vector<string> &path, ///< [in] Path on which to set attribute.
+ const subdir_info_s &info ///< [in] Value to set
+ ); /// @return Error Code, 0 on success
+
+ /// Encapsulates logic for when to split.
+ bool must_merge(
+ const subdir_info_s &info ///< [in] Info to check
+ ); /// @return True if info must be merged, False otherwise
+
+ /// Encapsulates logic for when to merge.
+ bool must_split(
+ const subdir_info_s &info, ///< [in] Info to check
+ int target_level = 0
+ ); /// @return True if info must be split, False otherwise
+
+ /// Initiates merge
+ int initiate_merge(
+ const vector<string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes merge
+ int complete_merge(
+ const vector<string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Resets attr to match actual subdir contents
+ int reset_attr(
+ const vector<string> &path ///< [in] path to cleanup
+ );
+
+ /// Initiate Split
+ int initiate_split(
+ const vector<string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes Split
+ int complete_split(
+ const vector<string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Determine path components from hoid hash
+ void get_path_components(
+ const ghobject_t &oid, ///< [in] Object for which to get path components
+ vector<string> *path ///< [out] Path components for hoid.
+ );
+
+ /// Pre-hash and split folders to avoid runtime splitting
+ /// according to the given expected object number.
+ int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+ /// Initialize the folder (dir info) with the given hash
+ /// level and number of its subdirs.
+ int init_split_folder(vector<string> &path, uint32_t hash_level);
+
+ /// do collection split for path
+ static int col_split_level(
+ HashIndex &from, ///< [in] from index
+ HashIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ uint32_t bits, ///< [in] num bits to match
+ uint32_t match, ///< [in] bits to match
+ unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred
+ );
+
+
+ /**
+ * Get string representation of ghobject_t/hash
+ *
+ * e.g: 0x01234567 -> "76543210"
+ */
+ static string get_path_str(
+ const ghobject_t &oid ///< [in] Object to get hash string for
+ ); ///< @return Hash string for hoid.
+
+ /// Get string from hash, @see get_path_str
+ static string get_hash_str(
+ uint32_t hash ///< [in] Hash to convert to a string.
+ ); ///< @return String representation of hash
+
+ /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00
+ static uint32_t hash_prefix_to_hash(
+ string prefix ///< [in] string to convert
+ ); ///< @return Hash
+
+ /// Get hash mod from path
+ static void path_to_hobject_hash_prefix(
+ const vector<string> &path,///< [in] path to convert
+ uint32_t *bits, ///< [out] bits
+ uint32_t *hash ///< [out] hash
+ ) {
+ string hash_str;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ hash_str.push_back(*i->begin());
+ }
+ uint32_t rev_hash = hash_prefix_to_hash(hash_str);
+ if (hash)
+ *hash = rev_hash;
+ if (bits)
+ *bits = path.size() * 4;
+ }
+
+ /// Calculate the number of bits.
+ static int calc_num_bits(uint64_t n) {
+ int ret = 0;
+ while (n > 0) {
+ n = n >> 1;
+ ret++;
+ }
+ return ret;
+ }
+
+ /// Convert a number to hex string (upper case).
+ static string to_hex(int n) {
+ ceph_assert(n >= 0 && n < 16);
+ char c = (n <= 9 ? ('0' + n) : ('A' + n - 10));
+ string str;
+ str.append(1, c);
+ return str;
+ }
+
+ struct CmpPairBitwise {
+ bool operator()(const pair<string, ghobject_t>& l,
+ const pair<string, ghobject_t>& r) const
+ {
+ if (l.first < r.first)
+ return true;
+ if (l.first > r.first)
+ return false;
+ if (cmp(l.second, r.second) < 0)
+ return true;
+ return false;
+ }
+ };
+
+ struct CmpHexdigitStringBitwise {
+ bool operator()(const string& l, const string& r) const {
+ return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+ }
+ };
+
+ /// Get path contents by hash
+ int get_path_contents_by_hash_bitwise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t *next_object, /// [in] list > *next_object
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
+ );
+
+ /// List objects in collection in ghobject_t order
+ int list_by_hash(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+ /// List objects in collection in ghobject_t order
+ int list_by_hash_bitwise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+
+ /// Create the given levels of sub directories from the given root.
+ /// The contents of *path* is not changed after calling this function.
+ int recursive_create_path(vector<string>& path, int level);
+
+ /// split each dir below the given path
+ int split_dirs(const vector<string> &path, int target_level = 0);
+
+ int write_settings();
+};
+
+#endif
diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc
new file mode 100644
index 00000000..73095026
--- /dev/null
+++ b/src/os/filestore/IndexManager.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/unordered_map.h"
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include <errno.h>
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+
+#include "IndexManager.h"
+#include "HashIndex.h"
+#include "CollectionIndex.h"
+
+#include "chain_xattr.h"
+
+static int set_version(const char *path, uint32_t version) {
+ bufferlist bl;
+ encode(version, bl);
+ return chain_setxattr<true, true>(
+ path, "user.cephos.collection_version", bl.c_str(),
+ bl.length());
+}
+
+static int get_version(const char *path, uint32_t *version) {
+ bufferptr bp(PATH_MAX);
+ int r = chain_getxattr(path, "user.cephos.collection_version",
+ bp.c_str(), bp.length());
+ if (r < 0) {
+ if (r != -ENOENT) {
+ *version = 0;
+ return 0;
+ } else {
+ return r;
+ }
+ }
+ bp.set_length(r);
+ bufferlist bl;
+ bl.push_back(bp);
+ auto i = bl.cbegin();
+ decode(*version, i);
+ return 0;
+}
+
+IndexManager::~IndexManager() {
+
+ for (ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.begin();
+ it != col_indices.end(); ++it) {
+
+ delete it->second;
+ it->second = NULL;
+ }
+ col_indices.clear();
+}
+
+
+int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
+ RWLock::WLocker l(lock);
+ int r = set_version(path, version);
+ if (r < 0)
+ return r;
+ HashIndex index(cct, c, path, cct->_conf->filestore_merge_threshold,
+ cct->_conf->filestore_split_multiple,
+ version,
+ cct->_conf->filestore_index_retry_probability);
+ r = index.init();
+ if (r < 0)
+ return r;
+ return index.read_settings();
+}
+
+int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) {
+ if (upgrade) {
+ // Need to check the collection generation
+ int r;
+ uint32_t version = 0;
+ r = get_version(path, &version);
+ if (r < 0)
+ return r;
+
+ switch (version) {
+ case CollectionIndex::FLAT_INDEX_TAG:
+ case CollectionIndex::HASH_INDEX_TAG: // fall through
+ case CollectionIndex::HASH_INDEX_TAG_2: // fall through
+ case CollectionIndex::HOBJECT_WITH_POOL: {
+ // Must be a HashIndex
+ *index = new HashIndex(cct, c, path,
+ cct->_conf->filestore_merge_threshold,
+ cct->_conf->filestore_split_multiple,
+ version);
+ return (*index)->read_settings();
+ }
+ default: ceph_abort();
+ }
+
+ } else {
+ // No need to check
+ *index = new HashIndex(cct, c, path, cct->_conf->filestore_merge_threshold,
+ cct->_conf->filestore_split_multiple,
+ CollectionIndex::HOBJECT_WITH_POOL,
+ cct->_conf->filestore_index_retry_probability);
+ return (*index)->read_settings();
+ }
+}
+
+bool IndexManager::get_index_optimistic(coll_t c, Index *index) {
+ RWLock::RLocker l(lock);
+ ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+ if (it == col_indices.end())
+ return false;
+ index->index = it->second;
+ return true;
+}
+
+int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) {
+ if (get_index_optimistic(c, index))
+ return 0;
+ RWLock::WLocker l(lock);
+ ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+ if (it == col_indices.end()) {
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str());
+ CollectionIndex* colIndex = NULL;
+ int r = build_index(c, path, &colIndex);
+ if (r < 0)
+ return r;
+ col_indices[c] = colIndex;
+ index->index = colIndex;
+ } else {
+ index->index = it->second;
+ }
+ return 0;
+}
diff --git a/src/os/filestore/IndexManager.h b/src/os/filestore/IndexManager.h
new file mode 100644
index 00000000..19cd2926
--- /dev/null
+++ b/src/os/filestore/IndexManager.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef OS_INDEXMANAGER_H
+#define OS_INDEXMANAGER_H
+
+#include "include/unordered_map.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "CollectionIndex.h"
+#include "HashIndex.h"
+
+
+/// Public type for Index
+struct Index {
+ CollectionIndex *index;
+
+ Index() : index(NULL) {}
+ explicit Index(CollectionIndex* index) : index(index) {}
+
+ CollectionIndex *operator->() { return index; }
+ CollectionIndex &operator*() { return *index; }
+};
+
+
+/**
+ * Encapsulates mutual exclusion for CollectionIndexes.
+ *
+ * Allowing a modification (removal or addition of an object) to occur
+ * while a read is occurring (lookup of an object's path and use of
+ * that path) may result in the path becoming invalid. Thus, during
+ * the lifetime of a CollectionIndex object and any paths returned
+ * by it, no other concurrent accesses may be allowed.
+ * This is enforced by using CollectionIndex::access_lock
+ */
+class IndexManager {
+ CephContext* cct;
+ RWLock lock; ///< Lock for Index Manager
+ bool upgrade;
+ ceph::unordered_map<coll_t, CollectionIndex* > col_indices;
+
+ /**
+ * Index factory
+ *
+ * Encapsulates logic for handling legacy FileStore
+ * layouts
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] path Path to collection
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int build_index(coll_t c, const char *path, CollectionIndex **index);
+ bool get_index_optimistic(coll_t c, Index *index);
+public:
+ /// Constructor
+ explicit IndexManager(CephContext* cct,
+ bool upgrade) : cct(cct),
+ lock("IndexManager lock"),
+ upgrade(upgrade) {}
+
+ ~IndexManager();
+
+ /**
+ * Reserve and return index for c
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] baseDir base directory of collections
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int get_index(coll_t c, const string& baseDir, Index *index);
+
+ /**
+ * Initialize index for collection c at path
+ *
+ * @param [in] c Collection for which to init Index
+ * @param [in] path Path to collection
+ * @param [in] filestore_version version of containing FileStore
+ * @return error code
+ */
+ int init_index(coll_t c, const char *path, uint32_t filestore_version);
+};
+
+#endif
diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h
new file mode 100644
index 00000000..cfb667d8
--- /dev/null
+++ b/src/os/filestore/Journal.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_JOURNAL_H
+#define CEPH_JOURNAL_H
+
+#include <errno.h>
+
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "common/Finisher.h"
+#include "common/TrackedOp.h"
+#include "os/ObjectStore.h"
+#include "common/zipkin_trace.h"
+
+class PerfCounters;
+
+class Journal {
+protected:
+ uuid_d fsid;
+ Finisher *finisher;
+public:
+ CephContext* cct;
+ PerfCounters *logger;
+protected:
+ Cond *do_sync_cond;
+ bool wait_on_full;
+
+public:
+ Journal(CephContext* cct, uuid_d f, Finisher *fin, Cond *c=0) :
+ fsid(f), finisher(fin), cct(cct), logger(NULL),
+ do_sync_cond(c),
+ wait_on_full(false) { }
+ virtual ~Journal() { }
+
+ virtual int check() = 0; ///< check if journal appears valid
+ virtual int create() = 0; ///< create a fresh journal
+ virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal
+ virtual void close() = 0; ///< close an open journal
+
+ virtual void flush() = 0;
+
+ virtual void get_devices(set<string> *ls) {}
+ virtual void collect_metadata(map<string,string> *pm) {}
+ /**
+ * reserve_throttle_and_backoff
+ *
+ * Implementation may throttle or backoff based on ops
+ * reserved here but not yet released using committed_thru.
+ */
+ virtual void reserve_throttle_and_backoff(uint64_t count) = 0;
+
+ virtual int dump(ostream& out) { return -EOPNOTSUPP; }
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ // writes
+ virtual bool is_writeable() = 0;
+ virtual int make_writeable() = 0;
+ virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) = 0;
+ virtual void commit_start(uint64_t seq) = 0;
+ virtual void committed_thru(uint64_t seq) = 0;
+
+ /// Read next journal entry - asserts on invalid journal
+ virtual bool read_entry(
+ bufferlist &bl, ///< [out] payload on successful read
+ uint64_t &seq ///< [in,out] sequence number on last successful read
+ ) = 0; ///< @return true on successful read, false on journal end
+
+ virtual bool should_commit_now() = 0;
+
+ virtual int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) = 0;
+
+ virtual off64_t get_journal_size_estimate() { return 0; }
+
+ // reads/recovery
+
+};
+
+#endif
diff --git a/src/os/filestore/JournalThrottle.cc b/src/os/filestore/JournalThrottle.cc
new file mode 100644
index 00000000..8475bbbf
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "JournalThrottle.h"
+#include "include/ceph_assert.h"
+
+bool JournalThrottle::set_params(
+ double _low_threshhold,
+ double _high_threshhold,
+ double _expected_throughput,
+ double _high_multiple,
+ double _max_multiple,
+ uint64_t _throttle_max,
+ std::ostream *errstream)
+{
+ return throttle.set_params(
+ _low_threshhold,
+ _high_threshhold,
+ _expected_throughput,
+ _high_multiple,
+ _max_multiple,
+ _throttle_max,
+ errstream);
+}
+
+std::chrono::duration<double> JournalThrottle::get(uint64_t c)
+{
+ return throttle.get(c);
+}
+
+uint64_t JournalThrottle::take(uint64_t c)
+{
+ return throttle.take(c);
+}
+
+void JournalThrottle::register_throttle_seq(uint64_t seq, uint64_t c)
+{
+ locker l(lock);
+ journaled_ops.push_back(std::make_pair(seq, c));
+}
+
+std::pair<uint64_t, uint64_t> JournalThrottle::flush(uint64_t mono_id)
+{
+ uint64_t to_put_bytes = 0;
+ uint64_t to_put_ops = 0;
+ {
+ locker l(lock);
+ while (!journaled_ops.empty() &&
+ journaled_ops.front().first <= mono_id) {
+ to_put_bytes += journaled_ops.front().second;
+ to_put_ops++;
+ journaled_ops.pop_front();
+ }
+ }
+ throttle.put(to_put_bytes);
+ return make_pair(to_put_ops, to_put_bytes);
+}
+
+uint64_t JournalThrottle::get_current()
+{
+ return throttle.get_current();
+}
+
+uint64_t JournalThrottle::get_max()
+{
+ return throttle.get_max();
+}
diff --git a/src/os/filestore/JournalThrottle.h b/src/os/filestore/JournalThrottle.h
new file mode 100644
index 00000000..75485d6d
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_THROTTLE_H
+#define CEPH_JOURNAL_THROTTLE_H
+
+#include "common/Throttle.h"
+
+#include <list>
+#include <deque>
+#include <condition_variable>
+#include <thread>
+#include <vector>
+#include <chrono>
+#include <iostream>
+
+/**
+ * JournalThrottle
+ *
+ * Throttle designed to implement dynamic throttling as the journal fills
+ * up. The goal is to not delay ops at all when the journal is relatively
+ * empty, delay ops somewhat as the journal begins to fill (with the delay
+ * getting linearly longer as the journal fills up to a high water mark),
+ * and to delay much more aggressively (though still linearly with usage)
+ * until we hit the max value.
+ *
+ * The implementation simply wraps BackoffThrottle with a queue of
+ * journaled but not synced ops.
+ *
+ * The usage pattern is as follows:
+ * 1) Call get(seq, bytes) before taking the op_queue_throttle
+ * 2) Once the journal is flushed, flush(max_op_id_flushed)
+ */
+class JournalThrottle {
+ BackoffThrottle throttle;
+
+ std::mutex lock;
+ /// deque<id, count>
+ std::deque<std::pair<uint64_t, uint64_t> > journaled_ops;
+ using locker = std::unique_lock<std::mutex>;
+
+public:
+ /**
+ * set_params
+ *
+ * Sets params. If the params are invalid, returns false
+ * and populates errstream (if non-null) with a user compreshensible
+ * explanation.
+ */
+ bool set_params(
+ double low_threshhold,
+ double high_threshhold,
+ double expected_throughput,
+ double high_multiple,
+ double max_multiple,
+ uint64_t throttle_max,
+ std::ostream *errstream);
+
+ /**
+ * gets specified throttle for id mono_id, waiting as necessary
+ *
+ * @param c [in] amount to take
+ * @return duration waited
+ */
+ std::chrono::duration<double> get(uint64_t c);
+
+ /**
+ * take
+ *
+ * Takes specified throttle without waiting
+ */
+ uint64_t take(uint64_t c);
+
+ /**
+ * register_throttle_seq
+ *
+ * Registers a sequence number with an amount of throttle to
+ * release upon flush()
+ *
+ * @param seq [in] seq
+ */
+ void register_throttle_seq(uint64_t seq, uint64_t c);
+
+
+ /**
+ * Releases throttle held by ids <= mono_id
+ *
+ * @param mono_id [in] id up to which to flush
+ * @returns pair<ops_flushed, bytes_flushed>
+ */
+ std::pair<uint64_t, uint64_t> flush(uint64_t mono_id);
+
+ uint64_t get_current();
+ uint64_t get_max();
+
+ JournalThrottle(
+ unsigned expected_concurrency ///< [in] determines size of conds
+ ) : throttle(g_ceph_context, "filestore_journal", expected_concurrency) {}
+};
+
+#endif
diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc
new file mode 100644
index 00000000..714d0935
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "JournalingObjectStore.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+
+
+void JournalingObjectStore::journal_start()
+{
+ dout(10) << "journal_start" << dendl;
+ finisher.start();
+}
+
+void JournalingObjectStore::journal_stop()
+{
+ dout(10) << "journal_stop" << dendl;
+ finisher.wait_for_empty();
+ finisher.stop();
+}
+
+// A journal_replay() makes journal writeable, this closes that out.
+void JournalingObjectStore::journal_write_close()
+{
+ if (journal) {
+ journal->close();
+ delete journal;
+ journal = 0;
+ }
+ apply_manager.reset();
+}
+
+int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
+{
+ dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl;
+
+ if (cct->_conf->journal_replay_from) {
+ dout(0) << "journal_replay forcing replay from "
+ << cct->_conf->journal_replay_from
+ << " instead of " << fs_op_seq << dendl;
+ // the previous op is the last one committed
+ fs_op_seq = cct->_conf->journal_replay_from - 1;
+ }
+
+ uint64_t op_seq = fs_op_seq;
+ apply_manager.init_seq(fs_op_seq);
+
+ if (!journal) {
+ submit_manager.set_op_seq(op_seq);
+ return 0;
+ }
+
+ int err = journal->open(op_seq);
+ if (err < 0) {
+ dout(3) << "journal_replay open failed with "
+ << cpp_strerror(err) << dendl;
+ delete journal;
+ journal = 0;
+ return err;
+ }
+
+ replaying = true;
+
+ int count = 0;
+ while (1) {
+ bufferlist bl;
+ uint64_t seq = op_seq + 1;
+ if (!journal->read_entry(bl, seq)) {
+ dout(3) << "journal_replay: end of journal, done." << dendl;
+ break;
+ }
+
+ if (seq <= op_seq) {
+ dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl;
+ continue;
+ }
+ ceph_assert(op_seq == seq-1);
+
+ dout(3) << "journal_replay: applying op seq " << seq << dendl;
+ auto p = bl.cbegin();
+ vector<ObjectStore::Transaction> tls;
+ while (!p.end()) {
+ tls.emplace_back(Transaction(p));
+ }
+
+ apply_manager.op_apply_start(seq);
+ int r = do_transactions(tls, seq);
+ apply_manager.op_apply_finish(seq);
+
+ op_seq = seq;
+ count++;
+
+ dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl;
+ }
+
+ if (count)
+ dout(3) << "journal_replay: total = " << count << dendl;
+
+ replaying = false;
+
+ submit_manager.set_op_seq(op_seq);
+
+ // done reading, make writeable.
+ err = journal->make_writeable();
+ if (err < 0)
+ return err;
+
+ if (!count)
+ journal->committed_thru(fs_op_seq);
+
+ return count;
+}
+
+
+// ------------------------------------
+
+uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op)
+{
+ Mutex::Locker l(apply_lock);
+ while (blocked) {
+ dout(10) << "op_apply_start blocked, waiting" << dendl;
+ blocked_cond.Wait(apply_lock);
+ }
+ dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> "
+ << (open_ops+1) << dendl;
+ ceph_assert(!blocked);
+ ceph_assert(op > committed_seq);
+ open_ops++;
+ return op;
+}
+
+void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op)
+{
+ Mutex::Locker l(apply_lock);
+ dout(10) << "op_apply_finish " << op << " open_ops " << open_ops << " -> "
+ << (open_ops-1) << ", max_applied_seq " << max_applied_seq << " -> "
+ << std::max(op, max_applied_seq) << dendl;
+ --open_ops;
+ ceph_assert(open_ops >= 0);
+
+ // signal a blocked commit_start
+ if (blocked) {
+ blocked_cond.Signal();
+ }
+
+ // there can be multiple applies in flight; track the max value we
+ // note. note that we can't _read_ this value and learn anything
+ // meaningful unless/until we've quiesced all in-flight applies.
+ if (op > max_applied_seq)
+ max_applied_seq = op;
+}
+
+uint64_t JournalingObjectStore::SubmitManager::op_submit_start()
+{
+ lock.Lock();
+ uint64_t op = ++op_seq;
+ dout(10) << "op_submit_start " << op << dendl;
+ return op;
+}
+
+void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op)
+{
+ dout(10) << "op_submit_finish " << op << dendl;
+ if (op != op_submitted + 1) {
+ dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+ << ", OUT OF ORDER" << dendl;
+ ceph_abort_msg("out of order op_submit_finish");
+ }
+ op_submitted = op;
+ lock.Unlock();
+}
+
+
+// ------------------------------------------
+
+void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c)
+{
+ Mutex::Locker l(com_lock);
+ ceph_assert(c);
+ commit_waiters[op].push_back(c);
+}
+
+bool JournalingObjectStore::ApplyManager::commit_start()
+{
+ bool ret = false;
+
+ {
+ Mutex::Locker l(apply_lock);
+ dout(10) << "commit_start max_applied_seq " << max_applied_seq
+ << ", open_ops " << open_ops << dendl;
+ blocked = true;
+ while (open_ops > 0) {
+ dout(10) << "commit_start waiting for " << open_ops
+ << " open ops to drain" << dendl;
+ blocked_cond.Wait(apply_lock);
+ }
+ ceph_assert(open_ops == 0);
+ dout(10) << "commit_start blocked, all open_ops have completed" << dendl;
+ {
+ Mutex::Locker l(com_lock);
+ if (max_applied_seq == committed_seq) {
+ dout(10) << "commit_start nothing to do" << dendl;
+ blocked = false;
+ ceph_assert(commit_waiters.empty());
+ goto out;
+ }
+
+ committing_seq = max_applied_seq;
+
+ dout(10) << "commit_start committing " << committing_seq
+ << ", still blocked" << dendl;
+ }
+ }
+ ret = true;
+
+ if (journal)
+ journal->commit_start(committing_seq); // tell the journal too
+ out:
+ return ret;
+}
+
+void JournalingObjectStore::ApplyManager::commit_started()
+{
+ Mutex::Locker l(apply_lock);
+ // allow new ops. (underlying fs should now be committing all prior ops)
+ dout(10) << "commit_started committing " << committing_seq << ", unblocking"
+ << dendl;
+ blocked = false;
+ blocked_cond.Signal();
+}
+
+void JournalingObjectStore::ApplyManager::commit_finish()
+{
+ Mutex::Locker l(com_lock);
+ dout(10) << "commit_finish thru " << committing_seq << dendl;
+
+ if (journal)
+ journal->committed_thru(committing_seq);
+
+ committed_seq = committing_seq;
+
+ map<version_t, vector<Context*> >::iterator p = commit_waiters.begin();
+ while (p != commit_waiters.end() &&
+ p->first <= committing_seq) {
+ finisher.queue(p->second);
+ commit_waiters.erase(p++);
+ }
+}
+
+void JournalingObjectStore::_op_journal_transactions(
+ bufferlist& tbl, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op)
+{
+ if (osd_op.get())
+ dout(10) << "op_journal_transactions " << op << " reqid_t "
+ << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
+ else
+ dout(10) << "op_journal_transactions " << op << dendl;
+
+ if (journal && journal->is_writeable()) {
+ journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
+ } else if (onjournal) {
+ apply_manager.add_waiter(op, onjournal);
+ }
+}
diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h
new file mode 100644
index 00000000..a289d0e8
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_JOURNALINGOBJECTSTORE_H
+#define CEPH_JOURNALINGOBJECTSTORE_H
+
+#include "os/ObjectStore.h"
+#include "Journal.h"
+#include "FileJournal.h"
+#include "common/RWLock.h"
+#include "osd/OpRequest.h"
+
+class JournalingObjectStore : public ObjectStore {
+protected:
+ Journal *journal;
+ Finisher finisher;
+
+
+ class SubmitManager {
+ CephContext* cct;
+ Mutex lock;
+ uint64_t op_seq;
+ uint64_t op_submitted;
+ public:
+ SubmitManager(CephContext* cct) :
+ cct(cct), lock("JOS::SubmitManager::lock", false, true, false),
+ op_seq(0), op_submitted(0)
+ {}
+ uint64_t op_submit_start();
+ void op_submit_finish(uint64_t op);
+ void set_op_seq(uint64_t seq) {
+ Mutex::Locker l(lock);
+ op_submitted = op_seq = seq;
+ }
+ uint64_t get_op_seq() {
+ return op_seq;
+ }
+ } submit_manager;
+
+ class ApplyManager {
+ CephContext* cct;
+ Journal *&journal;
+ Finisher &finisher;
+
+ Mutex apply_lock;
+ bool blocked;
+ Cond blocked_cond;
+ int open_ops;
+ uint64_t max_applied_seq;
+
+ Mutex com_lock;
+ map<version_t, vector<Context*> > commit_waiters;
+ uint64_t committing_seq, committed_seq;
+
+ public:
+ ApplyManager(CephContext* cct, Journal *&j, Finisher &f) :
+ cct(cct), journal(j), finisher(f),
+ apply_lock("JOS::ApplyManager::apply_lock", false, true, false),
+ blocked(false),
+ open_ops(0),
+ max_applied_seq(0),
+ com_lock("JOS::ApplyManager::com_lock", false, true, false),
+ committing_seq(0), committed_seq(0) {}
+ void reset() {
+ ceph_assert(open_ops == 0);
+ ceph_assert(blocked == false);
+ max_applied_seq = 0;
+ committing_seq = 0;
+ committed_seq = 0;
+ }
+ void add_waiter(uint64_t, Context*);
+ uint64_t op_apply_start(uint64_t op);
+ void op_apply_finish(uint64_t op);
+ bool commit_start();
+ void commit_started();
+ void commit_finish();
+ bool is_committing() {
+ Mutex::Locker l(com_lock);
+ return committing_seq != committed_seq;
+ }
+ uint64_t get_committed_seq() {
+ Mutex::Locker l(com_lock);
+ return committed_seq;
+ }
+ uint64_t get_committing_seq() {
+ Mutex::Locker l(com_lock);
+ return committing_seq;
+ }
+ void init_seq(uint64_t fs_op_seq) {
+ {
+ Mutex::Locker l(com_lock);
+ committed_seq = fs_op_seq;
+ committing_seq = fs_op_seq;
+ }
+ {
+ Mutex::Locker l(apply_lock);
+ max_applied_seq = fs_op_seq;
+ }
+ }
+ } apply_manager;
+
+ bool replaying;
+
+protected:
+ void journal_start();
+ void journal_stop();
+ void journal_write_close();
+ int journal_replay(uint64_t fs_op_seq);
+
+ void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op);
+
+ virtual int do_transactions(vector<ObjectStore::Transaction>& tls, uint64_t op_seq) = 0;
+
+public:
+ bool is_committing() {
+ return apply_manager.is_committing();
+ }
+ uint64_t get_committed_seq() {
+ return apply_manager.get_committed_seq();
+ }
+
+public:
+ JournalingObjectStore(CephContext* cct, const std::string& path)
+ : ObjectStore(cct, path),
+ journal(NULL),
+ finisher(cct, "JournalObjectStore", "fn_jrn_objstore"),
+ submit_manager(cct),
+ apply_manager(cct, journal, finisher),
+ replaying(false) {}
+
+ ~JournalingObjectStore() override {
+ }
+};
+
+#endif
diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc
new file mode 100644
index 00000000..2451ae8c
--- /dev/null
+++ b/src/os/filestore/LFNIndex.cc
@@ -0,0 +1,1407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <errno.h>
+#include <string.h>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "chain_xattr.h"
+
+#include "LFNIndex.h"
+using ceph::crypto::SHA1;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") "
+
+
+const string LFNIndex::LFN_ATTR = "user.cephos.lfn";
+const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash.";
+const string LFNIndex::SUBDIR_PREFIX = "DIR_";
+const string LFNIndex::FILENAME_COOKIE = "long";
+const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
+ FILENAME_COOKIE.size() -
+ FILENAME_EXTRA;
+void LFNIndex::maybe_inject_failure()
+{
+ if (error_injection_enabled) {
+ if (current_failure > last_failure &&
+ (((double)(rand() % 10000))/((double)(10000))
+ < error_injection_probability)) {
+ last_failure = current_failure;
+ current_failure = 0;
+ throw RetryException();
+ }
+ ++current_failure;
+ }
+}
+
+// Helper to close fd's when we leave scope. This is useful when used
+// in combination with RetryException, thrown by the above.
+struct FDCloser {
+ int fd;
+ explicit FDCloser(int f) : fd(f) {}
+ ~FDCloser() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+};
+
+
+/* Public methods */
+
+uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj)
+{
+ ghobject_t ghobj(obj);
+ ghobj.shard_id = shard_id_t(0);
+ ghobj.generation = 0;
+ ghobj.hobj.snap = 0;
+ return lfn_generate_object_name_current(ghobj).size();
+}
+
+int LFNIndex::init()
+{
+ return _init();
+}
+
+int LFNIndex::created(const ghobject_t &oid, const char *path)
+{
+ WRAP_RETRY(
+ vector<string> path_comp;
+ string short_name;
+ r = decompose_full_path(path, &path_comp, 0, &short_name);
+ if (r < 0)
+ goto out;
+ r = lfn_created(path_comp, oid, short_name);
+ if (r < 0) {
+ if (failed) {
+ /* This is hacky, but the only way we get ENOENT from lfn_created here is
+ * if we did a failure injection in _created below AND actually started the
+ * split or merge. In that case, lfn_created already suceeded, and
+ * WRAP_RETRY already cleaned it up and we are actually done. In a real
+ * failure, the filestore itself would have ended up calling this with
+ * the new path, not the old one, so we'd find it.
+ */
+ r = 0;
+ }
+ goto out;
+ }
+ r = _created(path_comp, oid, short_name);
+ if (r < 0)
+ goto out;
+ );
+}
+
+int LFNIndex::unlink(const ghobject_t &oid)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, NULL);
+ if (r < 0) {
+ goto out;
+ }
+ r = _remove(path, oid, short_name);
+ if (r < 0) {
+ goto out;
+ }
+ );
+}
+
+int LFNIndex::lookup(const ghobject_t &oid,
+ IndexedPath *out_path,
+ int *hardlink)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, hardlink);
+ if (r < 0)
+ goto out;
+ string full_path = get_full_path(path, short_name);
+ *out_path = std::make_shared<Path>(full_path, this);
+ r = 0;
+ );
+}
+
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ return _pre_hash_collection(pg_num, expected_num_objs);
+}
+
+
+int LFNIndex::collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
+{
+ return _collection_list_partial(start, end, max_count, ls, next);
+}
+
+/* Derived class utility methods */
+
+int LFNIndex::fsync_dir(const vector<string> &path)
+{
+ maybe_inject_failure();
+ int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
+ maybe_inject_failure();
+ int r = ::fsync(fd);
+ maybe_inject_failure();
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+ return 0;
+}
+
+int LFNIndex::link_object(const vector<string> &from,
+ const vector<string> &to,
+ const ghobject_t &oid,
+ const string &from_short_name)
+{
+ int r;
+ string from_path = get_full_path(from, from_short_name);
+ string to_path;
+ maybe_inject_failure();
+ r = lfn_get_name(to, oid, 0, &to_path, 0);
+ if (r < 0)
+ return r;
+ maybe_inject_failure();
+ r = ::link(from_path.c_str(), to_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::remove_objects(const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
+ set<string> clean_chains;
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
+ to_clean != to_remove.end();
+ ++to_clean) {
+ if (!lfn_is_hashed_filename(to_clean->first)) {
+ maybe_inject_failure();
+ int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
+ continue;
+ set<int> holes;
+ map<int, pair<string, ghobject_t> > chain;
+ for (int i = 0; ; ++i) {
+ string short_name = lfn_get_short_name(to_clean->second, i);
+ if (remaining->count(short_name)) {
+ chain[i] = *(remaining->find(short_name));
+ } else if (to_remove.count(short_name)) {
+ holes.insert(i);
+ } else {
+ break;
+ }
+ }
+
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
+ for (set<int>::iterator i = holes.begin();
+ i != holes.end();
+ ++i) {
+ if (candidate == chain.rend() || *i > candidate->first) {
+ string remove_path_name =
+ get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
+ maybe_inject_failure();
+ int r = ::unlink(remove_path_name.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ string from = get_full_path(dir, candidate->second.first);
+ string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
+ maybe_inject_failure();
+ int r = ::rename(from.c_str(), to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ remaining->erase(candidate->second.first);
+ remaining->insert(pair<string, ghobject_t>(
+ lfn_get_short_name(candidate->second.second, *i),
+ candidate->second.second));
+ ++candidate;
+ }
+ if (!holes.empty())
+ clean_chains.insert(lfn_get_short_name(to_clean->second, 0));
+ }
+ return 0;
+}
+
+int LFNIndex::move_objects(const vector<string> &from,
+ const vector<string> &to)
+{
+ map<string, ghobject_t> to_move;
+ int r;
+ r = list_objects(from, 0, NULL, &to_move);
+ if (r < 0)
+ return r;
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ string from_path = get_full_path(from, i->first);
+ string to_path, to_name;
+ r = lfn_get_name(to, i->second, &to_name, &to_path, 0);
+ if (r < 0)
+ return r;
+ maybe_inject_failure();
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0 && errno != EEXIST)
+ return -errno;
+ maybe_inject_failure();
+ r = lfn_created(to, i->second, to_name);
+ maybe_inject_failure();
+ if (r < 0)
+ return r;
+ }
+ r = fsync_dir(to);
+ if (r < 0)
+ return r;
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ maybe_inject_failure();
+ r = ::unlink(get_full_path(from, i->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ return fsync_dir(from);
+}
+
+int LFNIndex::remove_object(const vector<string> &from,
+ const ghobject_t &oid)
+{
+ string short_name;
+ int r, exist;
+ maybe_inject_failure();
+ r = get_mangled_name(from, oid, &short_name, &exist);
+ maybe_inject_failure();
+ if (r < 0)
+ return r;
+ if (exist == 0)
+ return -ENOENT;
+ return lfn_unlink(from, oid, short_name);
+}
+
+int LFNIndex::get_mangled_name(const vector<string> &from,
+ const ghobject_t &oid,
+ string *mangled_name, int *hardlink)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, hardlink);
+}
+
+int LFNIndex::move_subdir(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ string dir
+ )
+{
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(dir);
+ string from_path(from.get_full_path_subdir(sub_path));
+ string to_path(dest.get_full_path_subdir(sub_path));
+ int r = ::rename(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return -errno;
+ return 0;
+}
+
+int LFNIndex::move_object(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ const pair<string, ghobject_t> &obj
+ )
+{
+ string from_path(from.get_full_path(path, obj.first));
+ string to_path;
+ string to_name;
+ int exists;
+ int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return r;
+ }
+ r = dest.lfn_created(path, obj.second, to_name);
+ if (r < 0)
+ return r;
+ r = dest.fsync_dir(path);
+ if (r < 0)
+ return r;
+ r = from.remove_object(path, obj.second);
+ if (r < 0)
+ return r;
+ return from.fsync_dir(path);
+}
+
+
+static int get_hobject_from_oinfo(const char *dir, const char *file,
+ ghobject_t *o)
+{
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/%s", dir, file);
+ // Hack, user.ceph._ is the attribute used to store the object info
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ path,
+ "user.ceph._",
+ &bp);
+ if (r < 0)
+ return r;
+ bufferlist bl;
+ if (r > 0)
+ bl.push_back(bp);
+ object_info_t oi(bl);
+ *o = ghobject_t(oi.soid);
+ return 0;
+}
+
+
+int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
+ long *handle, map<string, ghobject_t> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ if (!dir) {
+ return -errno;
+ }
+
+ if (handle && *handle) {
+ seekdir(dir, *handle);
+ }
+
+ struct dirent *de = nullptr;
+ int r = 0;
+ int listed = 0;
+ bool end = true;
+ while ((de = ::readdir(dir))) {
+ end = false;
+ if (max_objs > 0 && listed >= max_objs) {
+ break;
+ }
+ if (de->d_name[0] == '.')
+ continue;
+ string short_name(de->d_name);
+ ghobject_t obj;
+ if (lfn_is_object(short_name)) {
+ r = lfn_translate(to_list, short_name, &obj);
+ if (r == -EINVAL) {
+ continue;
+ } else if (r < 0) {
+ goto cleanup;
+ } else {
+ string long_name = lfn_generate_object_name(obj);
+ if (!lfn_must_hash(long_name)) {
+ ceph_assert(long_name == short_name);
+ }
+ if (index_version == HASH_INDEX_TAG)
+ get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
+
+ out->insert(pair<string, ghobject_t>(short_name, obj));
+ ++listed;
+ }
+ }
+ }
+
+ if (handle && !end) {
+ *handle = telldir(dir);
+ }
+
+ r = 0;
+ cleanup:
+ ::closedir(dir);
+ return r;
+}
+
+int LFNIndex::list_subdirs(const vector<string> &to_list,
+ vector<string> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ if (!dir)
+ return -errno;
+
+ struct dirent *de = nullptr;
+ while ((de = ::readdir(dir))) {
+ string short_name(de->d_name);
+ string demangled_name;
+ if (lfn_is_subdir(short_name, &demangled_name)) {
+ out->push_back(demangled_name);
+ }
+ }
+
+ ::closedir(dir);
+ return 0;
+}
+
+int LFNIndex::create_path(const vector<string> &to_create)
+{
+ maybe_inject_failure();
+ int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::remove_path(const vector<string> &to_remove)
+{
+ maybe_inject_failure();
+ int r = ::rmdir(get_full_path_subdir(to_remove).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
+{
+ string full_path = get_full_path_subdir(to_check);
+ struct stat buf;
+ if (::stat(full_path.c_str(), &buf)) {
+ int r = -errno;
+ if (r == -ENOENT) {
+ *exists = 0;
+ return 0;
+ } else {
+ return r;
+ }
+ } else {
+ *exists = 1;
+ return 0;
+ }
+}
+
+int LFNIndex::add_attr_path(const vector<string> &path,
+ const string &attr_name,
+ bufferlist &attr_value)
+{
+ string full_path = get_full_path_subdir(path);
+ maybe_inject_failure();
+ return chain_setxattr<false, true>(
+ full_path.c_str(), mangle_attr_name(attr_name).c_str(),
+ reinterpret_cast<void *>(attr_value.c_str()),
+ attr_value.length());
+}
+
+int LFNIndex::get_attr_path(const vector<string> &path,
+ const string &attr_name,
+ bufferlist &attr_value)
+{
+ string full_path = get_full_path_subdir(path);
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ full_path.c_str(),
+ mangle_attr_name(attr_name).c_str(),
+ &bp);
+ if (r > 0)
+ attr_value.push_back(bp);
+ return r;
+}
+
+int LFNIndex::remove_attr_path(const vector<string> &path,
+ const string &attr_name)
+{
+ string full_path = get_full_path_subdir(path);
+ string mangled_attr_name = mangle_attr_name(attr_name);
+ maybe_inject_failure();
+ return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
+}
+
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
+{
+ char s[FILENAME_MAX_LEN];
+ char *end = s + sizeof(s);
+ char *t = s;
+
+ ceph_assert(oid.generation == ghobject_t::NO_GEN);
+ const char *i = oid.hobj.oid.name.c_str();
+ // Escape subdir prefix
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ *t++ = '\\';
+ *t++ = 'd';
+ i += 4;
+ }
+ while (*i && t < end) {
+ if (*i == '\\') {
+ *t++ = '\\';
+ *t++ = '\\';
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
+ *t++ = '\\';
+ *t++ = '.';
+ } else if (*i == '/') {
+ *t++ = '\\';
+ *t++ = 's';
+ } else
+ *t++ = *i;
+ i++;
+ }
+
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "_head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "_snapdir");
+ else
+ t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+
+ return string(s);
+}
+
+static void append_escaped(string::const_iterator begin,
+ string::const_iterator end,
+ string *out)
+{
+ for (string::const_iterator i = begin; i != end; ++i) {
+ if (*i == '\\') {
+ out->append("\\\\");
+ } else if (*i == '/') {
+ out->append("\\s");
+ } else if (*i == '_') {
+ out->append("\\u");
+ } else if (*i == '\0') {
+ out->append("\\n");
+ } else {
+ out->append(i, i+1);
+ }
+ }
+}
+
+string LFNIndex::lfn_generate_object_name_current(const ghobject_t &oid)
+{
+ string full_name;
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ full_name.append("\\d");
+ i += 4;
+ } else if (oid.hobj.oid.name[0] == '.') {
+ full_name.append("\\.");
+ ++i;
+ }
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
+ full_name.append("_");
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+ full_name.append("_");
+
+ char buf[PATH_MAX];
+ char *t = buf;
+ const char *end = t + sizeof(buf);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ t += snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+ full_name.append(buf, t);
+ full_name.append("_");
+
+ append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
+ full_name.append("_");
+
+ t = buf;
+ if (oid.hobj.pool == -1)
+ t += snprintf(t, end - t, "none");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
+ full_name.append(buf, t);
+
+ if (oid.generation != ghobject_t::NO_GEN ||
+ oid.shard_id != shard_id_t::NO_SHARD) {
+ full_name.append("_");
+
+ t = buf;
+ t += snprintf(t, end - buf, "%llx", (long long unsigned)oid.generation);
+ full_name.append(buf, t);
+
+ full_name.append("_");
+
+ t = buf;
+ t += snprintf(t, end - buf, "%x", (int)oid.shard_id);
+ full_name.append(buf, t);
+ }
+
+ return full_name;
+}
+
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
+{
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_generate_object_name_keyless(oid);
+
+ ceph_assert(oid.generation == ghobject_t::NO_GEN);
+ string full_name;
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ full_name.append("\\d");
+ i += 4;
+ } else if (oid.hobj.oid.name[0] == '.') {
+ full_name.append("\\.");
+ ++i;
+ }
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
+ full_name.append("_");
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+ full_name.append("_");
+
+ char snap_with_hash[PATH_MAX];
+ char *t = snap_with_hash;
+ char *end = t + sizeof(snap_with_hash);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+ full_name += string(snap_with_hash);
+ return full_name;
+}
+
+int LFNIndex::lfn_get_name(const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name, string *out_path,
+ int *hardlink)
+{
+ string full_name = lfn_generate_object_name(oid);
+ int r;
+
+ if (!lfn_must_hash(full_name)) {
+ if (mangled_name)
+ *mangled_name = full_name;
+ if (out_path)
+ *out_path = get_full_path(path, full_name);
+ if (hardlink) {
+ struct stat buf;
+ string full_path = get_full_path(path, full_name);
+ maybe_inject_failure();
+ r = ::stat(full_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = buf.st_nlink;
+ }
+ }
+ return 0;
+ }
+
+ int i = 0;
+ string candidate;
+ string candidate_path;
+ for ( ; ; ++i) {
+ candidate = lfn_get_short_name(oid, i);
+ candidate_path = get_full_path(path, candidate);
+ bufferptr bp;
+ r = chain_getxattr_buf(
+ candidate_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r < 0) {
+ if (errno != ENODATA && errno != ENOENT)
+ return -errno;
+ if (errno == ENODATA) {
+ // Left over from incomplete transaction, it'll be replayed
+ maybe_inject_failure();
+ r = ::unlink(candidate_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = 0;
+ return 0;
+ }
+ ceph_assert(r > 0);
+ string lfn(bp.c_str(), bp.length());
+ if (lfn == full_name) {
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink) {
+ struct stat st;
+ r = ::stat(candidate_path.c_str(), &st);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = st.st_nlink;
+ }
+ }
+ return 0;
+ }
+ bp = bufferptr();
+ r = chain_getxattr_buf(
+ candidate_path.c_str(),
+ get_alt_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ // only consider alt name if nlink > 1
+ struct stat st;
+ int rc = ::stat(candidate_path.c_str(), &st);
+ if (rc < 0)
+ return -errno;
+ if (st.st_nlink <= 1) {
+ // left over from incomplete unlink, remove
+ maybe_inject_failure();
+ dout(20) << __func__ << " found extra alt attr for " << candidate_path
+ << ", long name " << string(bp.c_str(), bp.length()) << dendl;
+ rc = chain_removexattr(candidate_path.c_str(),
+ get_alt_lfn_attr().c_str());
+ maybe_inject_failure();
+ if (rc < 0)
+ return rc;
+ continue;
+ }
+ string lfn(bp.c_str(), bp.length());
+ if (lfn == full_name) {
+ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = st.st_nlink;
+ return 0;
+ }
+ }
+ }
+ ceph_abort(); // Unreachable
+ return 0;
+}
+
+int LFNIndex::lfn_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name)
+{
+ if (!lfn_is_hashed_filename(mangled_name))
+ return 0;
+ string full_path = get_full_path(path, mangled_name);
+ string full_name = lfn_generate_object_name(oid);
+ maybe_inject_failure();
+
+ // if the main attr exists and is different, move it to the alt attr.
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ full_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ string lfn(bp.c_str(), bp.length());
+ if (lfn != full_name) {
+ dout(20) << __func__ << " " << mangled_name
+ << " moving old name to alt attr "
+ << lfn
+ << ", new name is " << full_name << dendl;
+ r = chain_setxattr<false, true>(
+ full_path.c_str(), get_alt_lfn_attr().c_str(),
+ bp.c_str(), bp.length());
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return chain_setxattr<false, true>(
+ full_path.c_str(), get_lfn_attr().c_str(),
+ full_name.c_str(), full_name.size());
+}
+
+int LFNIndex::lfn_unlink(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name)
+{
+ if (!lfn_is_hashed_filename(mangled_name)) {
+ string full_path = get_full_path(path, mangled_name);
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ return 0;
+ }
+
+ int i = 0;
+ for ( ; ; ++i) {
+ string candidate = lfn_get_short_name(oid, i);
+ if (candidate == mangled_name)
+ break;
+ }
+ int removed_index = i;
+ ++i;
+ for ( ; ; ++i) {
+ struct stat buf;
+ string to_check = lfn_get_short_name(oid, i);
+ string to_check_path = get_full_path(path, to_check);
+ int r = ::stat(to_check_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT) {
+ break;
+ } else {
+ return -errno;
+ }
+ }
+ }
+ string full_path = get_full_path(path, mangled_name);
+ int fd = ::open(full_path.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
+ if (i == removed_index + 1) {
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ } else {
+ string& rename_to = full_path;
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+ maybe_inject_failure();
+ int r = ::rename(rename_from.c_str(), rename_to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 && st.st_nlink > 0) {
+ // remove alt attr
+ dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+ fsync_dir(path);
+ chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+ }
+ return r;
+}
+
+int LFNIndex::lfn_translate(const vector<string> &path,
+ const string &short_name,
+ ghobject_t *out)
+{
+ if (!lfn_is_hashed_filename(short_name)) {
+ return lfn_parse_object_name(short_name, out);
+ }
+ string full_path = get_full_path(path, short_name);
+ // First, check alt attr
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ full_path.c_str(),
+ get_alt_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ // There is an alt attr, does it match?
+ string lfn(bp.c_str(), bp.length());
+ if (short_name_matches(short_name.c_str(), lfn.c_str())) {
+ return lfn_parse_object_name(lfn, out);
+ }
+ }
+
+ // Get lfn_attr
+ bp = bufferptr();
+ r = chain_getxattr_buf(
+ full_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ string long_name(bp.c_str(), bp.length());
+ return lfn_parse_object_name(long_name, out);
+}
+
+bool LFNIndex::lfn_is_object(const string &short_name)
+{
+ return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0);
+}
+
+bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
+{
+ if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) {
+ if (demangled)
+ *demangled = demangle_path_component(name);
+ return 1;
+ }
+ return 0;
+}
+
+static int parse_object(const char *s, ghobject_t& o)
+{
+ const char *hash = s + strlen(s) - 1;
+ while (*hash != '_' &&
+ hash > s)
+ hash--;
+ const char *bar = hash - 1;
+ while (*bar != '_' &&
+ bar > s)
+ bar--;
+ if (*bar == '_') {
+ char buf[bar-s + 1];
+ char *t = buf;
+ const char *i = s;
+ while (i < bar) {
+ if (*i == '\\') {
+ i++;
+ switch (*i) {
+ case '\\': *t++ = '\\'; break;
+ case '.': *t++ = '.'; break;
+ case 's': *t++ = '/'; break;
+ case 'd': {
+ *t++ = 'D';
+ *t++ = 'I';
+ *t++ = 'R';
+ *t++ = '_';
+ break;
+ }
+ default: ceph_abort();
+ }
+ } else {
+ *t++ = *i;
+ }
+ i++;
+ }
+ *t = 0;
+ o.hobj.oid.name = string(buf, t-buf);
+ if (strncmp(bar+1, "head", 4) == 0)
+ o.hobj.snap = CEPH_NOSNAP;
+ else if (strncmp(bar+1, "snapdir", 7) == 0)
+ o.hobj.snap = CEPH_SNAPDIR;
+ else
+ o.hobj.snap = strtoull(bar+1, NULL, 16);
+
+ uint32_t hobject_hash_input;
+ sscanf(hash, "_%X", &hobject_hash_input);
+ o.hobj.set_hash(hobject_hash_input);
+
+ return 1;
+ }
+ return 0;
+}
+
+int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
+{
+ int r = parse_object(long_name.c_str(), *out);
+ int64_t pool = -1;
+ spg_t pg;
+ if (coll().is_pg_prefix(&pg))
+ pool = (int64_t)pg.pgid.pool();
+ out->hobj.pool = pool;
+ if (!r) return -EINVAL;
+ string temp = lfn_generate_object_name(*out);
+ return 0;
+}
+
+static bool append_unescaped(string::const_iterator begin,
+ string::const_iterator end,
+ string *out)
+{
+ for (string::const_iterator i = begin; i != end; ++i) {
+ if (*i == '\\') {
+ ++i;
+ if (*i == '\\')
+ out->append("\\");
+ else if (*i == 's')
+ out->append("/");
+ else if (*i == 'n')
+ (*out) += '\0';
+ else if (*i == 'u')
+ out->append("_");
+ else
+ return false;
+ } else {
+ out->append(i, i+1);
+ }
+ }
+ return true;
+}
+
+int LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
+ ghobject_t *out)
+{
+ string name;
+ string key;
+ uint32_t hash;
+ snapid_t snap;
+
+ string::const_iterator current = long_name.begin();
+ if (*current == '\\') {
+ ++current;
+ if (current == long_name.end()) {
+ return -EINVAL;
+ } else if (*current == 'd') {
+ name.append("DIR_");
+ ++current;
+ } else if (*current == '.') {
+ name.append(".");
+ ++current;
+ } else {
+ --current;
+ }
+ }
+
+ string::const_iterator end = current;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &name))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &key))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ string snap_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return -EINVAL;
+ string hash_str(current, end);
+
+ if (snap_str == "head")
+ snap = CEPH_NOSNAP;
+ else if (snap_str == "snapdir")
+ snap = CEPH_SNAPDIR;
+ else
+ snap = strtoull(snap_str.c_str(), NULL, 16);
+ sscanf(hash_str.c_str(), "%X", &hash);
+
+
+ int64_t pool = -1;
+ spg_t pg;
+ if (coll().is_pg_prefix(&pg))
+ pool = (int64_t)pg.pgid.pool();
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
+ return 0;
+}
+
+
+int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
+{
+ string name;
+ string key;
+ string ns;
+ uint32_t hash;
+ snapid_t snap;
+ uint64_t pool;
+ gen_t generation = ghobject_t::NO_GEN;
+ shard_id_t shard_id = shard_id_t::NO_SHARD;
+
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_parse_object_name_keyless(long_name, out);
+ if (index_version == HASH_INDEX_TAG_2)
+ return lfn_parse_object_name_poolless(long_name, out);
+
+ string::const_iterator current = long_name.begin();
+ if (*current == '\\') {
+ ++current;
+ if (current == long_name.end()) {
+ return -EINVAL;
+ } else if (*current == 'd') {
+ name.append("DIR_");
+ ++current;
+ } else if (*current == '.') {
+ name.append(".");
+ ++current;
+ } else {
+ --current;
+ }
+ }
+
+ string::const_iterator end = current;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &name))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &key))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ string snap_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ string hash_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &ns))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ string pstring(current, end);
+
+ // Optional generation/shard_id
+ string genstring, shardstring;
+ if (end != long_name.end()) {
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ genstring = string(current, end);
+
+ generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return -EINVAL;
+ shardstring = string(current, end);
+
+ shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
+ }
+
+ if (snap_str == "head")
+ snap = CEPH_NOSNAP;
+ else if (snap_str == "snapdir")
+ snap = CEPH_SNAPDIR;
+ else
+ snap = strtoull(snap_str.c_str(), NULL, 16);
+ sscanf(hash_str.c_str(), "%X", &hash);
+
+ if (pstring == "none")
+ pool = (uint64_t)-1;
+ else
+ pool = strtoull(pstring.c_str(), NULL, 16);
+
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
+ return 0;
+}
+
+bool LFNIndex::lfn_is_hashed_filename(const string &name)
+{
+ if (name.size() < (unsigned)FILENAME_SHORT_LEN) {
+ return 0;
+ }
+ if (name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size())
+ == FILENAME_COOKIE) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+bool LFNIndex::lfn_must_hash(const string &long_name)
+{
+ return (int)long_name.size() >= FILENAME_SHORT_LEN;
+}
+
+static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+ int i;
+ str[0] = '\0';
+ for (i = 0; i < len; i++) {
+ sprintf(&str[i*2], "%02x", (int)buf[i]);
+ }
+}
+
+int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len)
+{
+ if (buf_len < FILENAME_HASH_LEN + 1)
+ return -EINVAL;
+
+ char buf[FILENAME_LFN_DIGEST_SIZE];
+ char hex[FILENAME_LFN_DIGEST_SIZE * 2];
+
+ SHA1 h;
+ h.Update((const unsigned char *)filename, strlen(filename));
+ h.Final((unsigned char *)buf);
+
+ buf_to_hex((unsigned char *)buf, (FILENAME_HASH_LEN + 1) / 2, hex);
+ strncpy(hash, hex, FILENAME_HASH_LEN);
+ hash[FILENAME_HASH_LEN] = '\0';
+ return 0;
+}
+
+void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
+{
+ char hash[FILENAME_HASH_LEN + 1];
+
+ ceph_assert(len >= FILENAME_SHORT_LEN + 4);
+
+ strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
+ filename[FILENAME_PREFIX_LEN] = '\0';
+ if ((int)strlen(filename) < FILENAME_PREFIX_LEN)
+ return;
+ if (old_filename[FILENAME_PREFIX_LEN] == '\0')
+ return;
+
+ hash_filename(old_filename, hash, sizeof(hash));
+ int ofs = FILENAME_PREFIX_LEN;
+ while (1) {
+ int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
+ if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
+ break;
+ ofs--;
+ }
+}
+
+bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name)
+{
+ const char *end = short_name;
+ while (*end) ++end;
+ const char *suffix = end;
+ if (suffix > short_name) --suffix; // last char
+ while (suffix > short_name && *suffix != '_') --suffix; // back to first _
+ if (suffix > short_name) --suffix; // one behind that
+ while (suffix > short_name && *suffix != '_') --suffix; // back to second _
+
+ int index = -1;
+ char buf[FILENAME_SHORT_LEN + 4];
+ ceph_assert((end - suffix) < (int)sizeof(buf));
+ int r = sscanf(suffix, "_%d_%s", &index, buf);
+ if (r < 2)
+ return false;
+ if (strcmp(buf, FILENAME_COOKIE.c_str()) != 0)
+ return false;
+ build_filename(cand_long_name, index, buf, sizeof(buf));
+ return strcmp(short_name, buf) == 0;
+}
+
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
+{
+ string long_name = lfn_generate_object_name(oid);
+ ceph_assert(lfn_must_hash(long_name));
+ char buf[FILENAME_SHORT_LEN + 4];
+ build_filename(long_name.c_str(), i, buf, sizeof(buf));
+ return string(buf);
+}
+
+const string &LFNIndex::get_base_path()
+{
+ return base_path;
+}
+
+string LFNIndex::get_full_path_subdir(const vector<string> &rel)
+{
+ string retval = get_base_path();
+ for (vector<string>::const_iterator i = rel.begin();
+ i != rel.end();
+ ++i) {
+ retval += "/";
+ retval += mangle_path_component(*i);
+ }
+ return retval;
+}
+
+string LFNIndex::get_full_path(const vector<string> &rel, const string &name)
+{
+ return get_full_path_subdir(rel) + "/" + name;
+}
+
+string LFNIndex::mangle_path_component(const string &component)
+{
+ return SUBDIR_PREFIX + component;
+}
+
+string LFNIndex::demangle_path_component(const string &component)
+{
+ return component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size());
+}
+
+int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
+ ghobject_t *oid, string *shortname)
+{
+ const char *beginning = in + get_base_path().size();
+ const char *end = beginning;
+ while (1) {
+ end++;
+ beginning = end++;
+ for ( ; *end != '\0' && *end != '/'; ++end) ;
+ if (*end != '\0') {
+ out->push_back(demangle_path_component(string(beginning, end - beginning)));
+ continue;
+ } else {
+ break;
+ }
+ }
+ *shortname = string(beginning, end - beginning);
+ if (oid) {
+ int r = lfn_translate(*out, *shortname, oid);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+string LFNIndex::mangle_attr_name(const string &attr)
+{
+ return PHASH_ATTR_PREFIX + attr;
+}
diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h
new file mode 100644
index 00000000..149ed10f
--- /dev/null
+++ b/src/os/filestore/LFNIndex.h
@@ -0,0 +1,614 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef OS_LFNINDEX_H
+#define OS_LFNINDEX_H
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <exception>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/ceph_crypto.h"
+
+#include "CollectionIndex.h"
+
+/**
+ * LFNIndex also encapsulates logic for manipulating
+ * subdirectories of a collection as well as the long filename
+ * logic.
+ *
+ * The protected methods provide machinery for derived classes to
+ * manipulate subdirectories and objects.
+ *
+ * The virtual methods are to be overridden to provide the actual
+ * hashed layout.
+ *
+ * User must call created when an object is created.
+ *
+ * Synchronization: Calling code must ensure that there are no object
+ * creations or deletions during the lifetime of a Path object (except
+ * of an object at that path).
+ *
+ * Unless otherwise noted, methods which return an int return 0 on success
+ * and a negative error code on failure.
+ */
+#define WRAP_RETRY(x) { \
+ bool failed = false; \
+ int r = 0; \
+ init_inject_failure(); \
+ while (1) { \
+ try { \
+ if (failed) { \
+ r = cleanup(); \
+ ceph_assert(r == 0); \
+ } \
+ { x } \
+ out: \
+ complete_inject_failure(); \
+ return r; \
+ } catch (RetryException&) { \
+ failed = true; \
+ } catch (...) { \
+ ceph_abort(); \
+ } \
+ } \
+ return -1; \
+ } \
+
+
+
+class LFNIndex : public CollectionIndex {
+ /// Hash digest output size.
+ static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
+ /// Length of filename hash.
+ static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE;
+ /// Max filename size.
+ static const int FILENAME_MAX_LEN = 4096;
+ /// Length of hashed filename.
+ static const int FILENAME_SHORT_LEN = 255;
+ /// Length of hashed filename prefix.
+ static const int FILENAME_PREFIX_LEN;
+ /// Length of hashed filename cookie.
+ static const int FILENAME_EXTRA = 4;
+ /// Lfn cookie value.
+ static const string FILENAME_COOKIE;
+ /// Name of LFN attribute for storing full name.
+ static const string LFN_ATTR;
+ /// Prefix for subdir index attributes.
+ static const string PHASH_ATTR_PREFIX;
+ /// Prefix for index subdirectories.
+ static const string SUBDIR_PREFIX;
+
+ /// Path to Index base.
+ const string base_path;
+
+protected:
+ const uint32_t index_version;
+
+ /// true if retry injection is enabled
+ struct RetryException : public exception {};
+ bool error_injection_enabled;
+ bool error_injection_on;
+ double error_injection_probability;
+ uint64_t last_failure;
+ uint64_t current_failure;
+ void init_inject_failure() {
+ if (error_injection_on) {
+ error_injection_enabled = true;
+ last_failure = current_failure = 0;
+ }
+ }
+ void maybe_inject_failure();
+ void complete_inject_failure() {
+ error_injection_enabled = false;
+ }
+
+private:
+ string lfn_attribute, lfn_alt_attribute;
+ coll_t collection;
+
+public:
+ /// Constructor
+ LFNIndex(
+ CephContext* cct,
+ coll_t collection,
+ const char *base_path, ///< [in] path to Index root
+ uint32_t index_version,
+ double _error_injection_probability=0)
+ : CollectionIndex(cct, collection),
+ base_path(base_path),
+ index_version(index_version),
+ error_injection_enabled(false),
+ error_injection_on(_error_injection_probability != 0),
+ error_injection_probability(_error_injection_probability),
+ last_failure(0), current_failure(0),
+ collection(collection) {
+ if (index_version == HASH_INDEX_TAG) {
+ lfn_attribute = LFN_ATTR;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", index_version);
+ lfn_attribute = LFN_ATTR + string(buf);
+ lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
+ }
+ }
+
+ coll_t coll() const override { return collection; }
+
+ /// Virtual destructor
+ ~LFNIndex() override {}
+
+ /// @see CollectionIndex
+ int init() override;
+
+ /// @see CollectionIndex
+ int cleanup() override = 0;
+
+ /// @see CollectionIndex
+ int created(
+ const ghobject_t &oid,
+ const char *path
+ ) override;
+
+ /// @see CollectionIndex
+ int unlink(
+ const ghobject_t &oid
+ ) override;
+
+ /// @see CollectionIndex
+ int lookup(
+ const ghobject_t &oid,
+ IndexedPath *path,
+ int *hardlink
+ ) override;
+
+ /// @see CollectionIndex;
+ int pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) override;
+
+ /// @see CollectionIndex
+ int collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) override;
+
+ virtual int _split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ CollectionIndex* dest //< [in] destination index
+ ) = 0;
+ virtual int _merge(
+ uint32_t bits, //< [in] bits for target
+ CollectionIndex* dest //< [in] destination index
+ ) = 0;
+
+ /// @see CollectionIndex
+ int split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override {
+ WRAP_RETRY(
+ r = _split(match, bits, dest);
+ goto out;
+ );
+ }
+
+ /// @see CollectionIndex
+ int merge(
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override {
+ WRAP_RETRY(
+ r = _merge(bits, dest);
+ goto out;
+ );
+ }
+
+ /**
+ * Returns the length of the longest escaped name which could result
+ * from any clone, shard, or rollback object of this object
+ */
+ static uint64_t get_max_escaped_name_len(const hobject_t &obj);
+
+protected:
+ virtual int _init() = 0;
+
+ /// Will be called upon object creation
+ virtual int _created(
+ const vector<string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object created.
+ const string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Will be called to remove an object
+ virtual int _remove(
+ const vector<string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Return the path and mangled_name for oid.
+ virtual int _lookup(
+ const ghobject_t &oid,///< [in] Object for lookup.
+ vector<string> *path, ///< [out] Path to the object.
+ string *mangled_name, ///< [out] Mangled filename.
+ int *exists ///< [out] True if the object exists.
+ ) = 0;
+
+ /// Pre-hash the collection with the given pg number and
+ /// expected number of objects in the collection.
+ virtual int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) = 0;
+
+ /// @see CollectionIndex
+ virtual int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) = 0;
+
+protected:
+
+ /* Non-virtual utility methods */
+
+ /// Sync a subdirectory
+ int fsync_dir(
+ const vector<string> &path ///< [in] Path to sync
+ ); ///< @return Error Code, 0 on success
+
+ /// Link an object from from into to
+ int link_object(
+ const vector<string> &from, ///< [in] Source subdirectory.
+ const vector<string> &to, ///< [in] Dest subdirectory.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const string &from_short_name ///< [in] Mangled filename of oid.
+ ); ///< @return Error Code, 0 on success
+
+ /**
+ * Efficiently remove objects from a subdirectory
+ *
+ * remove_object invalidates mangled names in the directory requiring
+ * the mangled name of each additional object to be looked up a second
+ * time. remove_objects removes the need for additional lookups
+ *
+ * @param [in] dir Directory from which to remove.
+ * @param [in] map of objects to remove to mangle names
+ * @param [in,out] map of filenames to objects
+ * @return Error Code, 0 on success.
+ */
+ int remove_objects(
+ const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining
+ );
+
+
+ /**
+ * Moves contents of from into to.
+ *
+ * Invalidates mangled names in to. If interrupted, all objects will be
+ * present in to before objects are removed from from. Ignores EEXIST
+ * while linking into to.
+ * @return Error Code, 0 on success
+ */
+ int move_objects(
+ const vector<string> &from, ///< [in] Source subdirectory.
+ const vector<string> &to ///< [in] Dest subdirectory.
+ );
+
+ /**
+ * Remove an object from from.
+ *
+ * Invalidates mangled names in from.
+ * @return Error Code, 0 on success
+ */
+ int remove_object(
+ const vector<string> &from, ///< [in] Directory from which to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
+ );
+
+ /**
+ * Gets the filename corresponding to oid in from.
+ *
+ * The filename may differ between subdirectories. Furthermore,
+ * file creations ore removals in from may invalidate the name.
+ * @return Error code on failure, 0 on success
+ */
+ int get_mangled_name(
+ const vector<string> &from, ///< [in] Subdirectory
+ const ghobject_t &oid, ///< [in] Object
+ string *mangled_name, ///< [out] Filename
+ int *hardlink ///< [out] hardlink for this file, hardlink=0 mean no-exist
+ );
+
+ /// do move subdir from from to dest
+ static int move_subdir(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path containing dir
+ string dir ///< [in] dir to move
+ );
+
+ /// do move object from from to dest
+ static int move_object(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ const pair<string, ghobject_t> &obj ///< [in] obj to move
+ );
+
+ /**
+ * Lists objects in to_list.
+ *
+ * @param [in] to_list Directory to list.
+ * @param [in] max_objects Max number to list.
+ * @param [in,out] handle Cookie for continuing the listing.
+ * Initialize to zero to start at the beginning of the directory.
+ * @param [out] out Mapping of listed object filenames to objects.
+ * @return Error code on failure, 0 on success
+ */
+ int list_objects(
+ const vector<string> &to_list,
+ int max_objects,
+ long *handle,
+ map<string, ghobject_t> *out
+ );
+
+ /// Lists subdirectories.
+ int list_subdirs(
+ const vector<string> &to_list, ///< [in] Directory to list.
+ vector<string> *out ///< [out] Subdirectories listed.
+ );
+
+ /// Create subdirectory.
+ int create_path(
+ const vector<string> &to_create ///< [in] Subdirectory to create.
+ );
+
+ /// Remove subdirectory.
+ int remove_path(
+ const vector<string> &to_remove ///< [in] Subdirectory to remove.
+ );
+
+ /// Check whether to_check exists.
+ int path_exists(
+ const vector<string> &to_check, ///< [in] Subdirectory to check.
+ int *exists ///< [out] 1 if it exists, 0 else
+ );
+
+ /// Save attr_value to attr_name attribute on path.
+ int add_attr_path(
+ const vector<string> &path, ///< [in] Path to modify.
+ const string &attr_name, ///< [in] Name of attribute.
+ bufferlist &attr_value ///< [in] Value to save.
+ );
+
+ /// Read into attr_value attribute attr_name on path.
+ int get_attr_path(
+ const vector<string> &path, ///< [in] Path to read.
+ const string &attr_name, ///< [in] Attribute to read.
+ bufferlist &attr_value ///< [out] Attribute value read.
+ );
+
+ /// Remove attr from path
+ int remove_attr_path(
+ const vector<string> &path, ///< [in] path from which to remove attr
+ const string &attr_name ///< [in] attr to remove
+ ); ///< @return Error code, 0 on success
+
+private:
+ /* lfn translation functions */
+
+ /**
+ * Gets the version specific lfn attribute tag
+ */
+ const string &get_lfn_attr() const {
+ return lfn_attribute;
+ }
+ const string &get_alt_lfn_attr() const {
+ return lfn_alt_attribute;
+ }
+
+ /**
+ * Gets the filename corresponding to oid in path.
+ *
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
+ * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if
+ * not needed
+ * @return Error Code, 0 on success.
+ */
+ int lfn_get_name(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name,
+ string *full_path,
+ int *hardlink
+ );
+
+ /// Adjusts path contents when oid is created at name mangled_name.
+ int lfn_created(
+ const vector<string> &path, ///< [in] Path to adjust.
+ const ghobject_t &oid, ///< [in] Object created.
+ const string &mangled_name ///< [in] Filename of created object.
+ );
+
+ /// Removes oid from path while adjusting path contents
+ int lfn_unlink(
+ const vector<string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const string &mangled_name ///< [in] Filename of object to remove.
+ );
+
+ ///Transate a file into and ghobject_t.
+ int lfn_translate(
+ const vector<string> &path, ///< [in] Path containing the file.
+ const string &short_name, ///< [in] Filename to translate.
+ ghobject_t *out ///< [out] Object found.
+ ); ///< @return Negative error code on error, 0 if not an object, 1 else
+
+ /* manglers/demanglers */
+ /// Filters object filenames
+ bool lfn_is_object(
+ const string &short_name ///< [in] Filename to check
+ ); ///< True if short_name is an object, false otherwise
+
+ /// Filters subdir filenames
+ bool lfn_is_subdir(
+ const string &short_name, ///< [in] Filename to check.
+ string *demangled_name ///< [out] Demangled subdir name.
+ ); ///< @return True if short_name is a subdir, false otherwise
+
+ /// Generate object name
+ string lfn_generate_object_name_keyless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ string lfn_generate_object_name_poolless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ static string lfn_generate_object_name_current(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ string lfn_generate_object_name(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ) {
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_generate_object_name_keyless(oid);
+ if (index_version == HASH_INDEX_TAG_2)
+ return lfn_generate_object_name_poolless(oid);
+ else
+ return lfn_generate_object_name_current(oid);
+ } ///< @return Generated object name.
+
+ /// Parse object name
+ int lfn_parse_object_name_keyless(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return True if successful, False otherwise.
+
+ /// Parse object name
+ int lfn_parse_object_name_poolless(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return True if successful, False otherwise.
+
+ /// Parse object name
+ int lfn_parse_object_name(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return True if successful, False otherwise.
+
+ /// Checks whether short_name is a hashed filename.
+ bool lfn_is_hashed_filename(
+ const string &short_name ///< [in] Name to check.
+ ); ///< @return True if short_name is hashed, False otherwise.
+
+ /// Checks whether long_name must be hashed.
+ bool lfn_must_hash(
+ const string &long_name ///< [in] Name to check.
+ ); ///< @return True if long_name must be hashed, False otherwise.
+
+ /// Generate hashed name.
+ string lfn_get_short_name(
+ const ghobject_t &oid, ///< [in] Object for which to generate.
+ int i ///< [in] Index of hashed name to generate.
+ ); ///< @return Hashed filename.
+
+ /* other common methods */
+ /// Gets the base path
+ const string &get_base_path(); ///< @return Index base_path
+
+ /// Get full path the subdir
+ string get_full_path_subdir(
+ const vector<string> &rel ///< [in] The subdir.
+ ); ///< @return Full path to rel.
+
+ /// Get full path to object
+ string get_full_path(
+ const vector<string> &rel, ///< [in] Path to object.
+ const string &name ///< [in] Filename of object.
+ ); ///< @return Fullpath to object at name in rel.
+
+ /// Get mangled path component
+ string mangle_path_component(
+ const string &component ///< [in] Component to mangle
+ ); /// @return Mangled component
+
+ /// Demangle component
+ string demangle_path_component(
+ const string &component ///< [in] Subdir name to demangle
+ ); ///< @return Demangled path component.
+
+ /// Decompose full path into object name and filename.
+ int decompose_full_path(
+ const char *in, ///< [in] Full path to object.
+ vector<string> *out, ///< [out] Path to object at in.
+ ghobject_t *oid, ///< [out] Object at in.
+ string *shortname ///< [out] Filename of object at in.
+ ); ///< @return Error Code, 0 on success.
+
+ /// Mangle attribute name
+ string mangle_attr_name(
+ const string &attr ///< [in] Attribute to mangle.
+ ); ///< @return Mangled attribute name.
+
+ /// checks whether long_name could hash to short_name
+ bool short_name_matches(
+ const char *short_name, ///< [in] name to check against
+ const char *cand_long_name ///< [in] candidate long name
+ );
+
+ /// Builds hashed filename
+ void build_filename(
+ const char *old_filename, ///< [in] Filename to convert.
+ int i, ///< [in] Index of hash.
+ char *filename, ///< [out] Resulting filename.
+ int len ///< [in] Size of buffer for filename
+ ); ///< @return Error Code, 0 on success
+
+ /// Get hash of filename
+ int hash_filename(
+ const char *filename, ///< [in] Filename to hash.
+ char *hash, ///< [out] Hash of filename.
+ int len ///< [in] Size of hash buffer.
+ ); ///< @return Error Code, 0 on success.
+
+ friend class TestWrapLFNIndex;
+};
+typedef LFNIndex::IndexedPath IndexedPath;
+
+#endif
diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h
new file mode 100644
index 00000000..164112ee
--- /dev/null
+++ b/src/os/filestore/SequencerPosition.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OS_SEQUENCERPOSITION_H
+#define __CEPH_OS_SEQUENCERPOSITION_H
+
+#include "include/types.h"
+#include "include/cmp.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+#include <ostream>
+
+/**
+ * transaction and op offset
+ */
+struct SequencerPosition {
+ uint64_t seq; ///< seq
+ uint32_t trans; ///< transaction in that seq (0-based)
+ uint32_t op; ///< op in that transaction (0-based)
+
+ SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(seq, bl);
+ encode(trans, bl);
+ encode(op, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(seq, p);
+ decode(trans, p);
+ decode(op, p);
+ DECODE_FINISH(p);
+ }
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("trans", trans);
+ f->dump_unsigned("op", op);
+ }
+ static void generate_test_instances(list<SequencerPosition*>& o) {
+ o.push_back(new SequencerPosition);
+ o.push_back(new SequencerPosition(1, 2, 3));
+ o.push_back(new SequencerPosition(4, 5, 6));
+ }
+};
+WRITE_CLASS_ENCODER(SequencerPosition)
+
+inline ostream& operator<<(ostream& out, const SequencerPosition& t) {
+ return out << t.seq << "." << t.trans << "." << t.op;
+}
+
+WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op)
+WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op)
+
+
+#endif
diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc
new file mode 100644
index 00000000..ba2ed131
--- /dev/null
+++ b/src/os/filestore/WBThrottle.cc
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+
+#include "os/filestore/WBThrottle.h"
+#include "common/perf_counters.h"
+#include "common/errno.h"
+
+WBThrottle::WBThrottle(CephContext *cct) :
+ cur_ios(0), cur_size(0),
+ cct(cct),
+ logger(NULL),
+ stopping(true),
+ lock("WBThrottle::lock", false, true, false),
+ fs(XFS)
+{
+ {
+ Mutex::Locker l(lock);
+ set_from_conf();
+ }
+ ceph_assert(cct);
+ PerfCountersBuilder b(
+ cct, string("WBThrottle"),
+ l_wbthrottle_first, l_wbthrottle_last);
+ b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
+ b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
+ b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
+ b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+ for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
+ logger->set(i, 0);
+
+ cct->_conf.add_observer(this);
+}
+
+WBThrottle::~WBThrottle() {
+ ceph_assert(cct);
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ cct->_conf.remove_observer(this);
+}
+
+void WBThrottle::start()
+{
+ {
+ Mutex::Locker l(lock);
+ stopping = false;
+ }
+ create("wb_throttle");
+}
+
+void WBThrottle::stop()
+{
+ {
+ Mutex::Locker l(lock);
+ stopping = true;
+ cond.Signal();
+ }
+
+ join();
+}
+
+const char** WBThrottle::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_wbthrottle_btrfs_bytes_start_flusher",
+ "filestore_wbthrottle_btrfs_bytes_hard_limit",
+ "filestore_wbthrottle_btrfs_ios_start_flusher",
+ "filestore_wbthrottle_btrfs_ios_hard_limit",
+ "filestore_wbthrottle_btrfs_inodes_start_flusher",
+ "filestore_wbthrottle_btrfs_inodes_hard_limit",
+ "filestore_wbthrottle_xfs_bytes_start_flusher",
+ "filestore_wbthrottle_xfs_bytes_hard_limit",
+ "filestore_wbthrottle_xfs_ios_start_flusher",
+ "filestore_wbthrottle_xfs_ios_hard_limit",
+ "filestore_wbthrottle_xfs_inodes_start_flusher",
+ "filestore_wbthrottle_xfs_inodes_hard_limit",
+ NULL
+ };
+ return KEYS;
+}
+
+void WBThrottle::set_from_conf()
+{
+ ceph_assert(lock.is_locked());
+ if (fs == BTRFS) {
+ size_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher;
+ size_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit;
+ io_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher;
+ io_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit;
+ fd_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher;
+ fd_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit;
+ } else if (fs == XFS) {
+ size_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher;
+ size_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit;
+ io_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher;
+ io_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit;
+ fd_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher;
+ fd_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit;
+ } else {
+ ceph_abort_msg("invalid value for fs");
+ }
+ cond.Signal();
+}
+
+void WBThrottle::handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ Mutex::Locker l(lock);
+ for (const char** i = get_tracked_conf_keys(); *i; ++i) {
+ if (changed.count(*i)) {
+ set_from_conf();
+ return;
+ }
+ }
+}
+
+bool WBThrottle::get_next_should_flush(
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next)
+{
+ ceph_assert(lock.is_locked());
+ ceph_assert(next);
+ while (!stopping && (!beyond_limit() || pending_wbs.empty()))
+ cond.Wait(lock);
+ if (stopping)
+ return false;
+ ceph_assert(!pending_wbs.empty());
+ ghobject_t obj(pop_object());
+
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.find(obj);
+ *next = boost::make_tuple(obj, i->second.second, i->second.first);
+ pending_wbs.erase(i);
+ return true;
+}
+
+
+void *WBThrottle::entry()
+{
+ Mutex::Locker l(lock);
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
+ while (get_next_should_flush(&wb)) {
+ clearing = wb.get<0>();
+ cur_ios -= wb.get<2>().ios;
+ logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
+ logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
+ cur_size -= wb.get<2>().size;
+ logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
+ logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+ logger->inc(l_wbthrottle_inodes_wb);
+ lock.Unlock();
+#if defined(HAVE_FDATASYNC)
+ int r = ::fdatasync(**wb.get<1>());
+#else
+ int r = ::fsync(**wb.get<1>());
+#endif
+ if (r < 0) {
+ lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) {
+ int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
+ ceph_assert(fa_r == 0);
+ }
+#endif
+ lock.Lock();
+ clearing = ghobject_t();
+ cond.Signal();
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
+ }
+ return 0;
+}
+
+void WBThrottle::queue_wb(
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
+ bool nocache)
+{
+ Mutex::Locker l(lock);
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ pending_wbs.find(hoid);
+ if (wbiter == pending_wbs.end()) {
+ wbiter = pending_wbs.insert(
+ make_pair(hoid,
+ make_pair(
+ PendingWB(),
+ fd))).first;
+ logger->inc(l_wbthrottle_inodes_dirtied);
+ } else {
+ remove_object(hoid);
+ }
+
+ cur_ios++;
+ logger->inc(l_wbthrottle_ios_dirtied);
+ cur_size += len;
+ logger->inc(l_wbthrottle_bytes_dirtied, len);
+
+ wbiter->second.first.add(nocache, len, 1);
+ insert_object(hoid);
+ if (beyond_limit())
+ cond.Signal();
+}
+
+void WBThrottle::clear()
+{
+ Mutex::Locker l(lock);
+ for (ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.begin();
+ i != pending_wbs.end();
+ ++i) {
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise && i->second.first.nocache) {
+ int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED);
+ ceph_assert(fa_r == 0);
+ }
+#endif
+
+ }
+ cur_ios = cur_size = 0;
+ logger->set(l_wbthrottle_ios_dirtied, 0);
+ logger->set(l_wbthrottle_bytes_dirtied, 0);
+ logger->set(l_wbthrottle_inodes_dirtied, 0);
+ pending_wbs.clear();
+ lru.clear();
+ rev_lru.clear();
+ cond.Signal();
+}
+
+void WBThrottle::clear_object(const ghobject_t &hoid)
+{
+ Mutex::Locker l(lock);
+ while (clearing == hoid)
+ cond.Wait(lock);
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.find(hoid);
+ if (i == pending_wbs.end())
+ return;
+
+ cur_ios -= i->second.first.ios;
+ logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios);
+ cur_size -= i->second.first.size;
+ logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+
+ pending_wbs.erase(i);
+ remove_object(hoid);
+ cond.Signal();
+}
+
+void WBThrottle::throttle()
+{
+ Mutex::Locker l(lock);
+ while (!stopping && need_flush())
+ cond.Wait(lock);
+}
diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h
new file mode 100644
index 00000000..ef809ea4
--- /dev/null
+++ b/src/os/filestore/WBThrottle.h
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WBTHROTTLE_H
+#define WBTHROTTLE_H
+
+#include "include/unordered_map.h"
+#include <boost/tuple/tuple.hpp>
+#include "common/Formatter.h"
+#include "common/hobject.h"
+#include "include/interval_set.h"
+#include "FDCache.h"
+#include "common/Thread.h"
+#include "common/ceph_context.h"
+
+class PerfCounters;
+enum {
+ l_wbthrottle_first = 999090,
+ l_wbthrottle_bytes_dirtied,
+ l_wbthrottle_bytes_wb,
+ l_wbthrottle_ios_dirtied,
+ l_wbthrottle_ios_wb,
+ l_wbthrottle_inodes_dirtied,
+ l_wbthrottle_inodes_wb,
+ l_wbthrottle_last
+};
+
+/**
+ * WBThrottle
+ *
+ * Tracks, throttles, and flushes outstanding IO
+ */
+class WBThrottle : Thread, public md_config_obs_t {
+ ghobject_t clearing;
+ /* *_limits.first is the start_flusher limit and
+ * *_limits.second is the hard limit
+ */
+
+ /// Limits on unflushed bytes
+ pair<uint64_t, uint64_t> size_limits;
+
+ /// Limits on unflushed ios
+ pair<uint64_t, uint64_t> io_limits;
+
+ /// Limits on unflushed objects
+ pair<uint64_t, uint64_t> fd_limits;
+
+ uint64_t cur_ios; /// Currently unflushed IOs
+ uint64_t cur_size; /// Currently unflushed bytes
+
+ /**
+ * PendingWB tracks the ios pending on an object.
+ */
+ class PendingWB {
+ public:
+ bool nocache;
+ uint64_t size;
+ uint64_t ios;
+ PendingWB() : nocache(true), size(0), ios(0) {}
+ void add(bool _nocache, uint64_t _size, uint64_t _ios) {
+ if (!_nocache)
+ nocache = false; // only nocache if all writes are nocache
+ size += _size;
+ ios += _ios;
+ }
+ };
+
+ CephContext *cct;
+ PerfCounters *logger;
+ bool stopping;
+ Mutex lock;
+ Cond cond;
+
+
+ /**
+ * Flush objects in lru order
+ */
+ list<ghobject_t> lru;
+ ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+ void remove_object(const ghobject_t &oid) {
+ ceph_assert(lock.is_locked());
+ ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
+ if (iter == rev_lru.end())
+ return;
+
+ lru.erase(iter->second);
+ rev_lru.erase(iter);
+ }
+ ghobject_t pop_object() {
+ ceph_assert(!lru.empty());
+ ghobject_t oid(lru.front());
+ lru.pop_front();
+ rev_lru.erase(oid);
+ return oid;
+ }
+ void insert_object(const ghobject_t &oid) {
+ ceph_assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
+ }
+
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
+
+ /// get next flush to perform
+ bool get_next_should_flush(
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ ); ///< @return false if we are shutting down
+public:
+ enum FS {
+ BTRFS,
+ XFS
+ };
+
+private:
+ FS fs;
+
+ void set_from_conf();
+ bool beyond_limit() const {
+ if (cur_ios < io_limits.first &&
+ pending_wbs.size() < fd_limits.first &&
+ cur_size < size_limits.first)
+ return false;
+ else
+ return true;
+ }
+ bool need_flush() const {
+ if (cur_ios < io_limits.second &&
+ pending_wbs.size() < fd_limits.second &&
+ cur_size < size_limits.second)
+ return false;
+ else
+ return true;
+ }
+
+public:
+ explicit WBThrottle(CephContext *cct);
+ ~WBThrottle() override;
+
+ void start();
+ void stop();
+ /// Set fs as XFS or BTRFS
+ void set_fs(FS new_fs) {
+ Mutex::Locker l(lock);
+ fs = new_fs;
+ set_from_conf();
+ }
+
+ /// Queue wb on oid, fd taking throttle (does not block)
+ void queue_wb(
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
+ uint64_t offset, ///< [in] offset written
+ uint64_t len, ///< [in] length written
+ bool nocache ///< [in] try to clear out of cache after write
+ );
+
+ /// Clear all wb (probably due to sync)
+ void clear();
+
+ /// Clear object
+ void clear_object(const ghobject_t &oid);
+
+ /// Block until there is throttle available
+ void throttle();
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+
+ /// Thread
+ void *entry() override;
+};
+
+#endif
diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc
new file mode 100644
index 00000000..1081d146
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XfsFileStoreBackend.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/utsname.h>
+
+#include <xfs/xfs.h>
+
+#include "common/errno.h"
+#include "common/linux_version.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") "
+
+XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), m_has_extsize(false) { }
+
+/*
+ * Set extsize attr on a file to val. Should be a free-standing
+ * function, but dout_prefix expanding to a call to get_basedir_path()
+ * protected member function won't let it.
+ */
+int XfsFileStoreBackend::set_extsize(int fd, unsigned int val)
+{
+ struct fsxattr fsx;
+ struct stat sb;
+ int ret;
+
+ if (fstat(fd, &sb) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (!S_ISREG(sb.st_mode)) {
+ dout(0) << "set_extsize: invalid target file type" << dendl;
+ return -EINVAL;
+ }
+
+ if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // already set?
+ if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
+ return 0;
+
+ // xfs won't change extent size if any extents are allocated
+ if (fsx.fsx_nextents != 0)
+ return 0;
+
+ fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+ fsx.fsx_extsize = val;
+
+ if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int XfsFileStoreBackend::detect_features()
+{
+ int ret;
+
+ ret = GenericFileStoreBackend::detect_features();
+ if (ret < 0)
+ return ret;
+
+ // extsize?
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600);
+ if (fd < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to create test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out;
+ }
+ if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out_close;
+ }
+
+ if (cct()->_conf->filestore_xfs_extsize) {
+ ret = set_extsize(fd, 1U << 15); // a few pages
+ if (ret) {
+ ret = 0;
+ dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl;
+ goto out_close;
+ }
+
+ // make sure we have 3.5 or newer, which includes this fix
+ // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
+ // for this set_extsize bug
+ // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
+ int ver = get_linux_version();
+ if (ver == 0) {
+ dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
+ m_has_extsize = false;
+ } else if (ver < KERNEL_VERSION(3, 5, 0)) {
+ dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
+ m_has_extsize = false;
+ } else {
+ dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
+ m_has_extsize = true;
+ }
+ } else {
+ dout(0) << "detect_feature: extsize is disabled by conf" << dendl;
+ }
+
+out_close:
+ TEMP_FAILURE_RETRY(::close(fd));
+out:
+ return ret;
+}
+
+int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint)
+{
+ if (!m_has_extsize)
+ return -EOPNOTSUPP;
+
+ ceph_assert(hint < UINT_MAX);
+ return set_extsize(fd, hint);
+}
diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h
new file mode 100644
index 00000000..e8b81f9a
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XFSFILESTOREBACKEND_H
+#define CEPH_XFSFILESTOREBACKEND_H
+
+#include "GenericFileStoreBackend.h"
+
+#include "include/int_types.h"
+
+class XfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool m_has_extsize;
+ int set_extsize(int fd, unsigned int val);
+public:
+ explicit XfsFileStoreBackend(FileStore *fs);
+ ~XfsFileStoreBackend() override {}
+ const char *get_name() override {
+ return "xfs";
+ }
+ int detect_features() override;
+ int set_alloc_hint(int fd, uint64_t hint) override;
+};
+
+#endif /* CEPH_XFSFILESTOREBACKEND_H */
diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc
new file mode 100644
index 00000000..e85dbd52
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+
+#include "ZFSFileStoreBackend.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") "
+
+ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) :
+ GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL),
+ m_filestore_zfs_snap(cct()->_conf->filestore_zfs_snap)
+{
+ int ret = zfs.init();
+ if (ret < 0) {
+ dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl;
+ return;
+ }
+
+ base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM);
+ if (!base_zh) {
+ dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl;
+ return;
+ }
+
+ update_current_zh();
+}
+
+ZFSFileStoreBackend::~ZFSFileStoreBackend()
+{
+ if (base_zh)
+ zfs.close(base_zh);
+ if (current_zh)
+ zfs.close(current_zh);
+}
+
+int ZFSFileStoreBackend::update_current_zh()
+{
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+ ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM);
+ if (zh) {
+ char *mnt;
+ if (zfs.is_mounted(zh, &mnt)) {
+ int ret = get_current_path() == mnt;
+ free(mnt);
+ if (ret) {
+ current_zh = zh;
+ return 0;
+ }
+ } else {
+ int ret = zfs.mount(zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh)
+ << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+ zfs.close(zh);
+ } else {
+ dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl;
+ return -ENOENT;
+ }
+
+ zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM);
+ if (zh) {
+ if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) {
+ current_zh = zh;
+ return 0;
+ }
+ zfs.close(zh);
+ dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl;
+ } else {
+ dout(0) << "update_current_zh: current/ not exist" << dendl;
+ }
+ return -ENOENT;
+}
+
+int ZFSFileStoreBackend::detect_features()
+{
+ if (!current_zh)
+ dout(0) << "detect_features: null zfs handle for current/" << dendl;
+ return 0;
+}
+
+bool ZFSFileStoreBackend::can_checkpoint()
+{
+ return m_filestore_zfs_snap && current_zh != NULL;
+}
+
+int ZFSFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -ENOTDIR;
+ }
+ return 0;
+ } else if (errno != ENOENT) {
+ ret = -errno;
+ dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+ ret = zfs.create(path, ZFS::TYPE_FILESYSTEM);
+ if (ret < 0 && errno != EEXIST) {
+ ret = -errno;
+ dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ ret = update_current_zh();
+ return ret;
+}
+
+static int list_checkpoints_callback(ZFS::Handle *zh, void *data)
+{
+ list<string> *ls = static_cast<list<string> *>(data);
+ string str = ZFS::get_name(zh);
+ size_t pos = str.find('@');
+ ceph_assert(pos != string::npos && pos + 1 != str.length());
+ ls->push_back(str.substr(pos + 1));
+ return 0;
+}
+
+int ZFSFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ dout(10) << "list_checkpoints:" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ list<string> snaps;
+ int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ls.swap(snaps);
+ return 0;
+}
+
+int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ // looks like zfsonlinux doesn't flush dirty data when taking snapshot
+ int ret = sync_filesystem(get_current_fd());
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+ ret = zfs.snapshot(path, false);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (cid)
+ *cid = 0;
+ return 0;
+}
+
+int ZFSFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ // umount current to avoid triggering online rollback deadlock
+ int ret;
+ if (zfs.is_mounted(current_zh, NULL)) {
+ ret = zfs.umount(current_zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl;
+ }
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+
+ ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT);
+ if (!snap_zh) {
+ dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl;
+ return -ENOENT;
+ }
+
+ ret = zfs.rollback(current_zh, snap_zh, false);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl;
+ }
+
+ if (!zfs.is_mounted(current_zh, NULL)) {
+ int ret = zfs.mount(current_zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ zfs.close(snap_zh);
+ return ret;
+}
+
+int ZFSFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ int ret = zfs.destroy_snaps(current_zh, name.c_str(), true);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
diff --git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h
new file mode 100644
index 00000000..b1fa9887
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFSFILESTOREBACKEND_H
+#define CEPH_ZFSFILESTOREBACKEND_H
+
+#ifdef HAVE_LIBZFS
+#include "GenericFileStoreBackend.h"
+#include "os/fs/ZFS.h"
+
+class ZFSFileStoreBackend : public GenericFileStoreBackend {
+private:
+ ZFS zfs;
+ ZFS::Handle *base_zh;
+ ZFS::Handle *current_zh;
+ bool m_filestore_zfs_snap;
+ int update_current_zh();
+public:
+ explicit ZFSFileStoreBackend(FileStore *fs);
+ ~ZFSFileStoreBackend();
+ const char *get_name() override {
+ return "zfs";
+ }
+ int detect_features();
+ bool can_checkpoint();
+ int create_current();
+ int list_checkpoints(list<string>& ls);
+ int create_checkpoint(const string& name, uint64_t *cid);
+ int rollback_to(const string& name);
+ int destroy_checkpoint(const string& name);
+};
+#endif
+#endif
diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc
new file mode 100644
index 00000000..e4dedd29
--- /dev/null
+++ b/src/os/filestore/chain_xattr.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "chain_xattr.h"
+#include <errno.h> // for ERANGE, ENODATA, ENOMEM
+#include <stdio.h> // for size_t, snprintf
+#include <stdlib.h> // for free, malloc
+#include <string.h> // for strcpy, strlen
+#include "include/ceph_assert.h" // for assert
+#include "include/buffer.h"
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/ceph_assert.h"
+
+/*
+ * chaining xattrs
+ *
+ * In order to support xattrs that are larger than the xattr size limit that some file systems
+ * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys
+ * are set as follows:
+ * The first xattr in the chain, has a key that holds the original xattr name, with any '@' char
+ * being esacped ("@@").
+ * The chained keys will have the first xattr's key (with the escaping), and a suffix: "@<id>"
+ * where <id> marks the num of xattr in the chain.
+ */
+
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
+{
+ int pos = 0;
+
+ while (*name) {
+ switch (*name) {
+ case '@': /* escape it */
+ pos += 2;
+ ceph_assert (pos < raw_len - 1);
+ *raw_name = '@';
+ raw_name++;
+ *raw_name = '@';
+ break;
+ default:
+ pos++;
+ ceph_assert(pos < raw_len - 1);
+ *raw_name = *name;
+ break;
+ }
+ name++;
+ raw_name++;
+ }
+
+ if (!i) {
+ *raw_name = '\0';
+ } else {
+ int r = snprintf(raw_name, raw_len - pos, "@%d", i);
+ ceph_assert(r < raw_len - pos);
+ }
+}
+
+static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
+{
+ int pos = 0;
+
+ *is_first = true;
+ while (*raw_name) {
+ switch (*raw_name) {
+ case '@': /* escape it */
+ raw_name++;
+ if (!*raw_name)
+ break;
+ if (*raw_name != '@') {
+ *is_first = false;
+ goto done;
+ }
+
+ /* fall through */
+ default:
+ *name = *raw_name;
+ break;
+ }
+ pos++;
+ ceph_assert(pos < name_len);
+ name++;
+ raw_name++;
+ }
+done:
+ *name = '\0';
+ return pos;
+}
+
+
+// setxattr
+
+static int getxattr_len(const char *fn, const char *name)
+{
+ int i = 0, total = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (!i && r < 0)
+ return r;
+ if (r < 0)
+ break;
+ total += r;
+ i++;
+ } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN);
+
+ return total;
+}
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ if (!size)
+ return getxattr_len(fn, name);
+
+ do {
+ chunk_size = size;
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ if (r >= 0) {
+ ret = pos;
+ /* is there another chunk? that can happen if the last read size span over
+ exactly one block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp)
+{
+ size_t size = 1024; // Initial
+ while (1) {
+ bufferptr buf(size);
+ int r = chain_getxattr(
+ fn,
+ name,
+ buf.c_str(),
+ size);
+ if (r > 0) {
+ buf.set_length(r);
+ if (bp)
+ bp->swap(buf);
+ return r;
+ } else if (r == 0) {
+ return 0;
+ } else {
+ if (r == -ERANGE) {
+ size *= 2;
+ } else {
+ return r;
+ }
+ }
+ }
+ ceph_abort_msg("unreachable");
+ return 0;
+}
+
+static int chain_fgetxattr_len(int fd, const char *name)
+{
+ int i = 0, total = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (!i && r < 0)
+ return r;
+ if (r < 0)
+ break;
+ total += r;
+ i++;
+ } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN);
+
+ return total;
+}
+
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ if (!size)
+ return chain_fgetxattr_len(fd, name);
+
+ do {
+ chunk_size = size;
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ if (r >= 0) {
+ ret = pos;
+ /* is there another chunk? that can happen if the last read size span over
+ exactly one block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+
+// setxattr
+
+int get_xattr_block_size(size_t size)
+{
+ if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
+ // this may fit in the inode; stripe over short attrs so that XFS
+ // won't kick it out.
+ return CHAIN_XATTR_SHORT_BLOCK_LEN;
+ return CHAIN_XATTR_MAX_BLOCK_LEN;
+}
+
+// removexattr
+
+int chain_removexattr(const char *fn, const char *name)
+{
+ int i = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_removexattr(fn, raw_name);
+ if (!i && r < 0) {
+ return r;
+ }
+ i++;
+ } while (r >= 0);
+ return 0;
+}
+
+int chain_fremovexattr(int fd, const char *name)
+{
+ int i = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fremovexattr(fd, raw_name);
+ if (!i && r < 0) {
+ return r;
+ }
+ i++;
+ } while (r >= 0);
+ return 0;
+}
+
+
+// listxattr
+
+int chain_listxattr(const char *fn, char *names, size_t len) {
+ int r;
+
+ if (!len)
+ return sys_listxattr(fn, names, len) * 2;
+
+ r = sys_listxattr(fn, 0, 0);
+ if (r < 0)
+ return r;
+
+ size_t total_len = r * 2; // should be enough
+ char *full_buf = (char *)malloc(total_len);
+ if (!full_buf)
+ return -ENOMEM;
+
+ r = sys_listxattr(fn, full_buf, total_len);
+ if (r < 0) {
+ free(full_buf);
+ return r;
+ }
+
+ char *p = full_buf;
+ const char *end = full_buf + r;
+ char *dest = names;
+ char *dest_end = names + len;
+
+ while (p < end) {
+ char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int attr_len = strlen(p);
+ bool is_first;
+ int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+ if (is_first) {
+ if (dest + name_len > dest_end) {
+ r = -ERANGE;
+ goto done;
+ }
+ strcpy(dest, name);
+ dest += name_len + 1;
+ }
+ p += attr_len + 1;
+ }
+ r = dest - names;
+
+done:
+ free(full_buf);
+ return r;
+}
+
+int chain_flistxattr(int fd, char *names, size_t len) {
+ int r;
+ char *p;
+ const char * end;
+ char *dest;
+ char *dest_end;
+
+ if (!len)
+ return sys_flistxattr(fd, names, len) * 2;
+
+ r = sys_flistxattr(fd, 0, 0);
+ if (r < 0)
+ return r;
+
+ size_t total_len = r * 2; // should be enough
+ char *full_buf = (char *)malloc(total_len);
+ if (!full_buf)
+ return -ENOMEM;
+
+ r = sys_flistxattr(fd, full_buf, total_len);
+ if (r < 0)
+ goto done;
+
+ p = full_buf;
+ end = full_buf + r;
+ dest = names;
+ dest_end = names + len;
+
+ while (p < end) {
+ char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int attr_len = strlen(p);
+ bool is_first;
+ int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+ if (is_first) {
+ if (dest + name_len > dest_end) {
+ r = -ERANGE;
+ goto done;
+ }
+ strcpy(dest, name);
+ dest += name_len + 1;
+ }
+ p += attr_len + 1;
+ }
+ r = dest - names;
+
+done:
+ free(full_buf);
+ return r;
+}
diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h
new file mode 100644
index 00000000..a2d17fa6
--- /dev/null
+++ b/src/os/filestore/chain_xattr.h
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OSD_CHAIN_XATTR_H
+#define __CEPH_OSD_CHAIN_XATTR_H
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdio.h>
+#include "common/xattr.h"
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+
+#if defined(__linux__)
+#include <linux/limits.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
+#elif defined(__APPLE__)
+#include <sys/xattr.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
+#else
+#define CHAIN_XATTR_MAX_NAME_LEN 128
+#endif
+
+#define CHAIN_XATTR_MAX_BLOCK_LEN 2048
+
+/*
+ * XFS will only inline xattrs < 255 bytes, so for xattrs that are
+ * likely to fit in the inode, stripe over short xattrs.
+ */
+#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
+#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
+
+// wrappers to hide annoying errno handling.
+
+static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int r = ::ceph_os_fgetxattr(fd, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int r = ::ceph_os_getxattr(fn, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+
+static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size)
+{
+ int r = ::ceph_os_setxattr(fn, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size)
+{
+ int r = ::ceph_os_fsetxattr(fd, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+
+static inline int sys_listxattr(const char *fn, char *names, size_t len)
+{
+ int r = ::ceph_os_listxattr(fn, names, len);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_flistxattr(int fd, char *names, size_t len)
+{
+ int r = ::ceph_os_flistxattr(fd, names, len);
+ return (r < 0 ? -errno : r);
+}
+
+static inline int sys_removexattr(const char *fn, const char *name)
+{
+ int r = ::ceph_os_removexattr(fn, name);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_fremovexattr(int fd, const char *name)
+{
+ int r = ::ceph_os_fremovexattr(fd, name);
+ return (r < 0 ? -errno : r);
+}
+
+
+// wrappers to chain large values across multiple xattrs
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp);
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
+
+int get_xattr_block_size(size_t size);
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len);
+
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_setxattr(
+ const char *fn, const char *name, const void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ size_t max_chunk_size =
+ ensure_single_attr ? size : get_xattr_block_size(size);
+
+ static_assert(
+ !skip_chain_cleanup || ensure_single_attr,
+ "skip_chain_cleanup must imply ensure_single_attr");
+
+ do {
+ size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ size -= chunk_size;
+
+ int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+ pos += chunk_size;
+ ret = pos;
+ i++;
+ ceph_assert(size == 0 || !ensure_single_attr);
+ } while (size);
+
+ if (ret >= 0 && !skip_chain_cleanup) {
+ int r;
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_removexattr(fn, raw_name);
+ if (r < 0 && r != -ENODATA)
+ ret = r;
+ i++;
+ } while (r != -ENODATA);
+ }
+
+ return ret;
+}
+
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_fsetxattr(
+ int fd, const char *name, const void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ size_t max_chunk_size =
+ ensure_single_attr ? size : get_xattr_block_size(size);
+
+ static_assert(
+ !skip_chain_cleanup || ensure_single_attr,
+ "skip_chain_cleanup must imply ensure_single_attr");
+
+ do {
+ size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ size -= chunk_size;
+
+ int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+ pos += chunk_size;
+ ret = pos;
+ i++;
+ ceph_assert(size == 0 || !ensure_single_attr);
+ } while (size);
+
+ if (ret >= 0 && !skip_chain_cleanup) {
+ int r;
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fremovexattr(fd, raw_name);
+ if (r < 0 && r != -ENODATA)
+ ret = r;
+ i++;
+ } while (r != -ENODATA);
+ }
+
+ return ret;
+}
+
+int chain_listxattr(const char *fn, char *names, size_t len);
+int chain_flistxattr(int fd, char *names, size_t len);
+int chain_removexattr(const char *fn, const char *name);
+int chain_fremovexattr(int fd, const char *name);
+
+#endif
diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc
new file mode 100644
index 00000000..c40fd0de
--- /dev/null
+++ b/src/os/fs/FS.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef __linux__
+#include <linux/falloc.h>
+#endif
+
+#include "FS.h"
+
+#include "acconfig.h"
+
+#ifdef HAVE_LIBXFS
+#include "XFS.h"
+#endif
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/mount.h>
+#else
+#include <sys/vfs.h>
+#endif
+#include "include/compat.h"
+
+// ---------------
+
+FS *FS::create(uint64_t f_type)
+{
+ switch (f_type) {
+#ifdef HAVE_LIBXFS
+ case XFS_SUPER_MAGIC:
+ return new XFS;
+#endif
+ default:
+ return new FS;
+ }
+}
+
+FS *FS::create_by_fd(int fd)
+{
+ struct statfs st;
+ ::fstatfs(fd, &st);
+ return create(st.f_type);
+}
+
+// ---------------
+
+int FS::set_alloc_hint(int fd, uint64_t hint)
+{
+ return 0; // no-op
+}
+
+#ifdef HAVE_NAME_TO_HANDLE_AT
+int FS::get_handle(int fd, std::string *h)
+{
+ char buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+ struct file_handle *fh = (struct file_handle *)buf;
+ int mount_id;
+
+ fh->handle_bytes = MAX_HANDLE_SZ;
+ int r = name_to_handle_at(fd, "", fh, &mount_id, AT_EMPTY_PATH);
+ if (r < 0) {
+ return -errno;
+ }
+ *h = std::string(buf, fh->handle_bytes + sizeof(struct file_handle));
+ return 0;
+}
+
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+ if (h.length() < sizeof(struct file_handle)) {
+ return -EINVAL;
+ }
+ struct file_handle *fh = (struct file_handle *)h.data();
+ if (fh->handle_bytes > h.length()) {
+ return -ERANGE;
+ }
+ int fd = open_by_handle_at(mount_fd, fh, flags);
+ if (fd < 0)
+ return -errno;
+ return fd;
+}
+
+#else // HAVE_NAME_TO_HANDLE_AT
+
+int FS::get_handle(int fd, std::string *h)
+{
+ return -EOPNOTSUPP;
+}
+
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif // HAVE_NAME_TO_HANDLE_AT
+
+int FS::copy_file_range(int to_fd, uint64_t to_offset,
+ int from_fd,
+ uint64_t from_offset, uint64_t from_len)
+{
+ ceph_abort_msg("write me");
+}
+
+int FS::zero(int fd, uint64_t offset, uint64_t length)
+{
+ int r;
+
+ /*
+
+ From the fallocate(2) man page:
+
+ Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38)
+ in mode deallocates space (i.e., creates a hole) in the byte range
+ starting at offset and continuing for len bytes. Within the specified
+ range, partial filesystem blocks are zeroed, and whole filesystem
+ blocks are removed from the file. After a successful call, subsequent
+ reads from this range will return zeroes.
+
+ The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in
+ mode; in other words, even when punching off the end of the file, the
+ file size (as reported by stat(2)) does not change.
+
+ Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem
+ doesn't support the operation, an error is returned. The operation is
+ supported on at least the following filesystems:
+
+ * XFS (since Linux 2.6.38)
+
+ * ext4 (since Linux 3.0)
+
+ * Btrfs (since Linux 3.7)
+
+ * tmpfs (since Linux 3.5)
+
+ So: we only do this is PUNCH_HOLE *and* KEEP_SIZE are defined.
+
+ */
+#if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef CEPH_HAVE_FALLOCATE
+# ifdef FALLOC_FL_KEEP_SIZE
+ // first try fallocate
+ r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length);
+ if (r < 0) {
+ r = -errno;
+ }
+ if (r != -EOPNOTSUPP) {
+ goto out; // a real error
+ }
+ // if that failed (-EOPNOTSUPP), fall back to writing zeros.
+# endif
+# endif
+#endif
+
+ {
+ // fall back to writing zeros
+ bufferlist bl;
+ bl.append_zero(length);
+ r = ::lseek64(fd, offset, SEEK_SET);
+ if (r < 0) {
+ r = -errno;
+ goto out;
+ }
+ r = bl.write_fd(fd);
+ }
+
+ out:
+ return r;
+}
+
+// ---------------
+
diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h
new file mode 100644
index 00000000..aafa64e5
--- /dev/null
+++ b/src/os/fs/FS.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_FS_H
+#define CEPH_OS_FS_H
+
+#include <errno.h>
+#include <time.h>
+
+#include <string>
+
+#include "include/types.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+
+class FS {
+public:
+ virtual ~FS() { }
+
+ static FS *create(uint64_t f_type);
+ static FS *create_by_fd(int fd);
+
+ virtual const char *get_name() {
+ return "generic";
+ }
+
+ virtual int set_alloc_hint(int fd, uint64_t hint);
+
+ virtual int get_handle(int fd, std::string *h);
+ virtual int open_handle(int mount_fd, const std::string& h, int flags);
+
+ virtual int copy_file_range(int to_fd, uint64_t to_offset,
+ int from_fd,
+ uint64_t from_offset, uint64_t from_len);
+ virtual int zero(int fd, uint64_t offset, uint64_t length);
+
+ // -- aio --
+};
+
+#endif
diff --git a/src/os/fs/XFS.cc b/src/os/fs/XFS.cc
new file mode 100644
index 00000000..c72ee1a0
--- /dev/null
+++ b/src/os/fs/XFS.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XFS.h"
+
+#include <xfs/xfs.h>
+
+int XFS::set_alloc_hint(int fd, uint64_t val)
+{
+ struct fsxattr fsx;
+ struct stat sb;
+ int ret;
+
+ if (fstat(fd, &sb) < 0) {
+ ret = -errno;
+ return ret;
+ }
+ if (!S_ISREG(sb.st_mode)) {
+ return -EINVAL;
+ }
+
+ if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+ ret = -errno;
+ return ret;
+ }
+
+ // already set?
+ if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
+ return 0;
+
+ // xfs won't change extent size if any extents are allocated
+ if (fsx.fsx_nextents != 0)
+ return 0;
+
+ fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+ fsx.fsx_extsize = val;
+
+ if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+ ret = -errno;
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/src/os/fs/XFS.h b/src/os/fs/XFS.h
new file mode 100644
index 00000000..f0ea717e
--- /dev/null
+++ b/src/os/fs/XFS.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_XFS_H
+#define CEPH_OS_XFS_H
+
+#include "FS.h"
+
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342
+# endif
+
+class XFS : public FS {
+ const char *get_name() override {
+ return "xfs";
+ }
+ int set_alloc_hint(int fd, uint64_t hint) override;
+};
+
+#endif
diff --git a/src/os/fs/ZFS.cc b/src/os/fs/ZFS.cc
new file mode 100644
index 00000000..02520796
--- /dev/null
+++ b/src/os/fs/ZFS.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#define HAVE_IOCTL_IN_SYS_IOCTL_H
+#include <libzfs.h>
+#include "ZFS.h"
+
+const int ZFS::TYPE_FILESYSTEM = ZFS_TYPE_FILESYSTEM;
+const int ZFS::TYPE_SNAPSHOT = ZFS_TYPE_SNAPSHOT;
+const int ZFS::TYPE_VOLUME = ZFS_TYPE_VOLUME;
+const int ZFS::TYPE_DATASET = ZFS_TYPE_DATASET;
+
+ZFS::~ZFS()
+{
+ if (g_zfs)
+ ::libzfs_fini((libzfs_handle_t*)g_zfs);
+}
+
+int ZFS::init()
+{
+ g_zfs = ::libzfs_init();
+ return g_zfs ? 0 : -EINVAL;
+}
+
+ZFS::Handle *ZFS::open(const char *n, int t)
+{
+ return (ZFS::Handle*)::zfs_open((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t);
+}
+
+void ZFS::close(ZFS::Handle *h)
+{
+ ::zfs_close((zfs_handle_t*)h);
+}
+
+const char *ZFS::get_name(ZFS::Handle *h)
+{
+ return ::zfs_get_name((zfs_handle_t*)h);
+}
+
+ZFS::Handle *ZFS::path_to_zhandle(const char *p, int t)
+{
+ return ::zfs_path_to_zhandle((libzfs_handle_t*)g_zfs, (char *)p, (zfs_type_t)t);
+}
+
+int ZFS::create(const char *n, int t)
+{
+ return ::zfs_create((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t, NULL);
+}
+
+int ZFS::snapshot(const char *n, bool r)
+{
+ return ::zfs_snapshot((libzfs_handle_t*)g_zfs, n, (boolean_t)r, NULL);
+}
+
+int ZFS::rollback(ZFS::Handle *h, ZFS::Handle *snap, bool f)
+{
+ return ::zfs_rollback((zfs_handle_t*)h, (zfs_handle_t*)snap, (boolean_t)f);
+}
+
+int ZFS::destroy_snaps(ZFS::Handle *h, const char *n, bool d)
+{
+ return ::zfs_destroy_snaps((zfs_handle_t*)h, (char *)n, (boolean_t)d);
+}
+
+bool ZFS::is_mounted(ZFS::Handle *h, char **p)
+{
+ return (bool)::zfs_is_mounted((zfs_handle_t*)h, p);
+}
+
+int ZFS::mount(ZFS::Handle *h, const char *o, int f)
+{
+ return ::zfs_mount((zfs_handle_t*)h, o, f);
+}
+
+int ZFS::umount(ZFS::Handle *h, const char *o, int f)
+{
+ return ::zfs_unmount((zfs_handle_t*)h, o, f);
+}
+
+int ZFS::iter_snapshots_sorted(ZFS::Handle *h, ZFS::iter_func f, void *d)
+{
+ return ::zfs_iter_snapshots_sorted((zfs_handle_t*)h, (zfs_iter_f)f, d);
+}
diff --git a/src/os/fs/ZFS.h b/src/os/fs/ZFS.h
new file mode 100644
index 00000000..3ebe1110
--- /dev/null
+++ b/src/os/fs/ZFS.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFS_H
+#define CEPH_ZFS_H
+
+// Simple wrapper to hide libzfs.h. (it conflicts with standard linux headers)
+class ZFS {
+ void *g_zfs;
+public:
+
+ static const int TYPE_FILESYSTEM;
+ static const int TYPE_SNAPSHOT;
+ static const int TYPE_VOLUME;
+ static const int TYPE_POOL;
+ static const int TYPE_DATASET;
+
+ typedef void Handle;
+ typedef int (*iter_func)(Handle *, void *);
+
+ static const char *get_name(Handle *);
+
+ ZFS() : g_zfs(NULL) {}
+ ~ZFS();
+ int init();
+ Handle *open(const char *, int);
+ void close(Handle *);
+ Handle *path_to_zhandle(const char *, int);
+ int create(const char *, int);
+ int snapshot(const char *, bool);
+ int rollback(Handle *, Handle *, bool);
+ int destroy_snaps(Handle *, const char *, bool);
+ int iter_snapshots_sorted(Handle *, iter_func, void *);
+ int mount(Handle *, const char *, int);
+ int umount(Handle *, const char *, int);
+ bool is_mounted(Handle *, char **);
+};
+
+#endif
diff --git a/src/os/fs/btrfs_ioctl.h b/src/os/fs/btrfs_ioctl.h
new file mode 100644
index 00000000..277498ca
--- /dev/null
+++ b/src/os/fs/btrfs_ioctl.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __IOCTL_
+#define __IOCTL_
+
+#if defined(__linux__)
+#include <linux/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioctl.h>
+#endif
+
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+
+/* this should be 4k */
+#define BTRFS_PATH_NAME_MAX 4087
+struct btrfs_ioctl_vol_args {
+ __s64 fd;
+ char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
+#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
+
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
+ __s64 fd;
+ __u64 transid;
+ __u64 flags;
+ __u64 unused[4];
+ char name[BTRFS_SUBVOL_NAME_MAX + 1];
+};
+
+#define BTRFS_INO_LOOKUP_PATH_MAX 4080
+struct btrfs_ioctl_ino_lookup_args {
+ __u64 treeid;
+ __u64 objectid;
+ char name[BTRFS_INO_LOOKUP_PATH_MAX];
+};
+
+struct btrfs_ioctl_search_key {
+ /* which root are we searching. 0 is the tree of tree roots */
+ __u64 tree_id;
+
+ /* keys returned will be >= min and <= max */
+ __u64 min_objectid;
+ __u64 max_objectid;
+
+ /* keys returned will be >= min and <= max */
+ __u64 min_offset;
+ __u64 max_offset;
+
+ /* max and min transids to search for */
+ __u64 min_transid;
+ __u64 max_transid;
+
+ /* keys returned will be >= min and <= max */
+ __u32 min_type;
+ __u32 max_type;
+
+ /*
+ * how many items did userland ask for, and how many are we
+ * returning
+ */
+ __u32 nr_items;
+
+ /* align to 64 bits */
+ __u32 unused;
+
+ /* some extra for later */
+ __u64 unused1;
+ __u64 unused2;
+ __u64 unused3;
+ __u64 unused4;
+};
+
+struct btrfs_ioctl_search_header {
+ __u64 transid;
+ __u64 objectid;
+ __u64 offset;
+ __u32 type;
+ __u32 len;
+};
+
+#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
+/*
+ * the buf is an array of search headers where
+ * each header is followed by the actual item
+ * the type field is expanded to 32 bits for alignment
+ */
+struct btrfs_ioctl_search_args {
+ struct btrfs_ioctl_search_key key;
+ char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
+};
+
+struct btrfs_ioctl_clone_range_args {
+ __s64 src_fd;
+ __u64 src_offset, src_length;
+ __u64 dest_offset;
+};
+
+/* flags for the defrag range ioctl */
+#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+#define BTRFS_DEFRAG_RANGE_START_IO 2
+
+struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+
+ /* number of bytes to defrag, use (u64)-1 to say all */
+ __u64 len;
+
+ /*
+ * flags for the operation, which can include turning
+ * on compression for this one defrag
+ */
+ __u64 flags;
+
+ /*
+ * any extent bigger than this will be considered
+ * already defragged. Use 0 to take the kernel default
+ * Use 1 to say every single extent must be rewritten
+ */
+ __u32 extent_thresh;
+
+ /* spare for later */
+ __u32 unused[5];
+};
+
+struct btrfs_ioctl_space_info {
+ __u64 flags;
+ __u64 total_bytes;
+ __u64 used_bytes;
+};
+
+struct btrfs_ioctl_space_args {
+ __u64 space_slots;
+ __u64 total_spaces;
+ struct btrfs_ioctl_space_info spaces[0];
+};
+
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+ struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+ struct btrfs_ioctl_vol_args)
+
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+ struct btrfs_ioctl_clone_range_args)
+
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
+ struct btrfs_ioctl_defrag_range_args)
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
+ struct btrfs_ioctl_search_args)
+#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
+ struct btrfs_ioctl_ino_lookup_args)
+#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
+#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
+ struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
+ struct btrfs_ioctl_vol_args_v2)
+#endif
diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc
new file mode 100644
index 00000000..bc352bb0
--- /dev/null
+++ b/src/os/kstore/KStore.cc
@@ -0,0 +1,3436 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+#include "KStore.h"
+#include "osd/osd_types.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/Formatter.h"
+
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_kstore
+
+/*
+
+ TODO:
+
+ * superblock, features
+ * refcounted extents (for efficient clone)
+
+ */
+
+const string PREFIX_SUPER = "S"; // field -> value
+const string PREFIX_COLL = "C"; // collection name -> (nothing)
+const string PREFIX_OBJ = "O"; // object name -> onode
+const string PREFIX_DATA = "D"; // nid + offset -> data
+const string PREFIX_OMAP = "M"; // u64 + keyname -> value
+
+/*
+ * object name key structure
+ *
+ * 2 chars: shard (-- for none, or hex digit, so that we sort properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * 1 char: '.'
+ *
+ * escaped string: namespace
+ *
+ * 1 char: '<', '=', or '>'. if =, then object key == object name, and
+ * we are followed just by the key. otherwise, we are followed by
+ * the key and then the object name.
+ * escaped string: key
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ */
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does. We do this by escaping anything <= to '#' with #
+ * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
+ * hex digits.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ */
+
+static void append_escaped(const string &in, string *out)
+{
+ char hexbyte[8];
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if ((unsigned char)*i <= '#') {
+ snprintf(hexbyte, sizeof(hexbyte), "#%02x", (uint8_t)*i);
+ out->append(hexbyte);
+ } else if ((unsigned char)*i >= '~') {
+ snprintf(hexbyte, sizeof(hexbyte), "~%02x", (uint8_t)*i);
+ out->append(hexbyte);
+ } else {
+ out->push_back(*i);
+ }
+ }
+ out->push_back('!');
+}
+
+static int decode_escaped(const char *p, string *out)
+{
+ const char *orig_p = p;
+ while (*p && *p != '!') {
+ if (*p == '#' || *p == '~') {
+ unsigned hex;
+ int r = sscanf(++p, "%2x", &hex);
+ if (r < 1)
+ return -EINVAL;
+ out->push_back((char)hex);
+ p += 2;
+ } else {
+ out->push_back(*p++);
+ }
+ }
+ return p - orig_p;
+}
+
+// some things we encode in binary (as le32 or le64); print the
+// resulting key strings nicely
+static string pretty_binary_string(const string& in)
+{
+ char buf[10];
+ string out;
+ out.reserve(in.length() * 3);
+ enum { NONE, HEX, STRING } mode = NONE;
+ unsigned from = 0, i;
+ for (i=0; i < in.length(); ++i) {
+ if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
+ (mode == HEX && in.length() - i >= 4 &&
+ ((in[i] < 32 || (unsigned char)in[i] > 126) ||
+ (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
+ (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
+ (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
+ if (mode == STRING) {
+ out.append(in.substr(from, i - from));
+ out.push_back('\'');
+ }
+ if (mode != HEX) {
+ out.append("0x");
+ mode = HEX;
+ }
+ if (in.length() - i >= 4) {
+ // print a whole u32 at once
+ snprintf(buf, sizeof(buf), "%08x",
+ (uint32_t)(((unsigned char)in[i] << 24) |
+ ((unsigned char)in[i+1] << 16) |
+ ((unsigned char)in[i+2] << 8) |
+ ((unsigned char)in[i+3] << 0)));
+ i += 3;
+ } else {
+ snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
+ }
+ out.append(buf);
+ } else {
+ if (mode != STRING) {
+ out.push_back('\'');
+ mode = STRING;
+ from = i;
+ }
+ }
+ }
+ if (mode == STRING) {
+ out.append(in.substr(from, i - from));
+ out.push_back('\'');
+ }
+ return out;
+}
+
+static void _key_encode_shard(shard_id_t shard, string *key)
+{
+ // make field ordering match with ghobject_t compare operations
+ if (shard == shard_id_t::NO_SHARD) {
+ // otherwise ff will sort *after* 0, not before.
+ key->append("--");
+ } else {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%02x", (int)shard);
+ key->append(buf);
+ }
+}
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+ if (key[0] == '-') {
+ *pshard = shard_id_t::NO_SHARD;
+ } else {
+ unsigned shard;
+ int r = sscanf(key, "%x", &shard);
+ if (r < 1)
+ return NULL;
+ *pshard = shard_id_t(shard);
+ }
+ return key + 2;
+}
+
+static void get_coll_key_range(const coll_t& cid, int bits,
+ string *temp_start, string *temp_end,
+ string *start, string *end)
+{
+ temp_start->clear();
+ temp_end->clear();
+ start->clear();
+ end->clear();
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ _key_encode_shard(pgid.shard, start);
+ *end = *start;
+ *temp_start = *start;
+ *temp_end = *start;
+
+ _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
+ _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
+ _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), start);
+ _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), temp_start);
+ start->append(".");
+ temp_start->append(".");
+
+ _key_encode_u64(pgid.pool() + 0x8000000000000000ull, end);
+ _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_end);
+
+ uint64_t end_hash =
+ hobject_t::_reverse_bits(pgid.ps()) + (1ull << (32-bits));
+ if (end_hash <= 0xffffffffull) {
+ _key_encode_u32(end_hash, end);
+ _key_encode_u32(end_hash, temp_end);
+ end->append(".");
+ temp_end->append(".");
+ } else {
+ _key_encode_u32(0xffffffff, end);
+ _key_encode_u32(0xffffffff, temp_end);
+ end->append(":");
+ temp_end->append(":");
+ }
+ } else {
+ _key_encode_shard(shard_id_t::NO_SHARD, start);
+ _key_encode_u64(-1ull + 0x8000000000000000ull, start);
+ *end = *start;
+ _key_encode_u32(0, start);
+ start->append(".");
+ _key_encode_u32(0xffffffff, end);
+ end->append(":");
+
+ // no separate temp section
+ *temp_start = *end;
+ *temp_end = *end;
+ }
+}
+
+static int get_key_object(const string& key, ghobject_t *oid);
+
+static void get_object_key(CephContext* cct, const ghobject_t& oid,
+ string *key)
+{
+ key->clear();
+
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+ key->append(".");
+
+ append_escaped(oid.hobj.nspace, key);
+
+ if (oid.hobj.get_key().length()) {
+ // is a key... could be < = or >.
+ // (ASCII chars < = and > sort in that order, yay)
+ if (oid.hobj.get_key() < oid.hobj.oid.name) {
+ key->append("<");
+ append_escaped(oid.hobj.get_key(), key);
+ append_escaped(oid.hobj.oid.name, key);
+ } else if (oid.hobj.get_key() > oid.hobj.oid.name) {
+ key->append(">");
+ append_escaped(oid.hobj.get_key(), key);
+ append_escaped(oid.hobj.oid.name, key);
+ } else {
+ // same as no key
+ key->append("=");
+ append_escaped(oid.hobj.oid.name, key);
+ }
+ } else {
+ // no key
+ key->append("=");
+ append_escaped(oid.hobj.oid.name, key);
+ }
+
+ _key_encode_u64(oid.hobj.snap, key);
+ _key_encode_u64(oid.generation, key);
+
+ // sanity check
+ if (true) {
+ ghobject_t t;
+ int r = get_key_object(*key, &t);
+ if (r || t != oid) {
+ derr << " r " << r << dendl;
+ derr << "key " << pretty_binary_string(*key) << dendl;
+ derr << "oid " << oid << dendl;
+ derr << " t " << t << dendl;
+ ceph_assert(t == oid);
+ }
+ }
+}
+
+static int get_key_object(const string& key, ghobject_t *oid)
+{
+ int r;
+ const char *p = key.c_str();
+
+ p = _key_decode_shard(p, &oid->shard_id);
+
+ uint64_t pool;
+ p = _key_decode_u64(p, &pool);
+ oid->hobj.pool = pool - 0x8000000000000000ull;
+
+ unsigned hash;
+ p = _key_decode_u32(p, &hash);
+ oid->hobj.set_bitwise_key_u32(hash);
+ if (*p != '.')
+ return -5;
+ ++p;
+
+ r = decode_escaped(p, &oid->hobj.nspace);
+ if (r < 0)
+ return -6;
+ p += r + 1;
+
+ if (*p == '=') {
+ // no key
+ ++p;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -7;
+ p += r + 1;
+ } else if (*p == '<' || *p == '>') {
+ // key + name
+ ++p;
+ string okey;
+ r = decode_escaped(p, &okey);
+ if (r < 0)
+ return -8;
+ p += r + 1;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -9;
+ p += r + 1;
+ oid->hobj.set_key(okey);
+ } else {
+ // malformed
+ return -10;
+ }
+
+ p = _key_decode_u64(p, &oid->hobj.snap.val);
+ p = _key_decode_u64(p, &oid->generation);
+ if (*p) {
+ // if we get something other than a null terminator here,
+ // something goes wrong.
+ return -12;
+ }
+
+ return 0;
+}
+
+
+static void get_data_key(uint64_t nid, uint64_t offset, string *out)
+{
+ _key_encode_u64(nid, out);
+ _key_encode_u64(offset, out);
+}
+
+// '-' < '.' < '~'
+static void get_omap_header(uint64_t id, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('-');
+}
+
+// hmm, I don't think there's any need to escape the user key since we
+// have a clean prefix.
+static void get_omap_key(uint64_t id, const string& key, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('.');
+ out->append(key);
+}
+
+static void rewrite_omap_key(uint64_t id, string old, string *out)
+{
+ _key_encode_u64(id, out);
+ out->append(old.substr(out->length()));
+}
+
+static void decode_omap_key(const string& key, string *user_key)
+{
+ *user_key = key.substr(sizeof(uint64_t) + 1);
+}
+
+static void get_omap_tail(uint64_t id, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('~');
+}
+
+
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore.onode(" << this << ") "
+
+void KStore::Onode::flush()
+{
+ std::unique_lock<std::mutex> l(flush_lock);
+ dout(20) << __func__ << " " << flush_txns << dendl;
+ while (!flush_txns.empty())
+ flush_cond.wait(l);
+ dout(20) << __func__ << " done" << dendl;
+}
+
+// OnodeHashLRU
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore.lru(" << this << ") "
+
+void KStore::OnodeHashLRU::_touch(OnodeRef o)
+{
+ lru_list_t::iterator p = lru.iterator_to(*o);
+ lru.erase(p);
+ lru.push_front(*o);
+}
+
+void KStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(30) << __func__ << " " << oid << " " << o << dendl;
+ ceph_assert(onode_map.count(oid) == 0);
+ onode_map[oid] = o;
+ lru.push_front(*o);
+}
+
+KStore::OnodeRef KStore::OnodeHashLRU::lookup(const ghobject_t& oid)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(30) << __func__ << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
+ if (p == onode_map.end()) {
+ dout(30) << __func__ << " " << oid << " miss" << dendl;
+ return OnodeRef();
+ }
+ dout(30) << __func__ << " " << oid << " hit " << p->second << dendl;
+ _touch(p->second);
+ return p->second;
+}
+
+void KStore::OnodeHashLRU::clear()
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(10) << __func__ << dendl;
+ lru.clear();
+ onode_map.clear();
+}
+
+void KStore::OnodeHashLRU::rename(const ghobject_t& old_oid,
+ const ghobject_t& new_oid)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+ po = onode_map.find(old_oid);
+ pn = onode_map.find(new_oid);
+
+ ceph_assert(po != onode_map.end());
+ if (pn != onode_map.end()) {
+ lru_list_t::iterator p = lru.iterator_to(*pn->second);
+ lru.erase(p);
+ onode_map.erase(pn);
+ }
+ OnodeRef o = po->second;
+
+ // install a non-existent onode it its place
+ po->second.reset(new Onode(cct, old_oid, o->key));
+ lru.push_back(*po->second);
+
+ // fix oid, key
+ onode_map.insert(make_pair(new_oid, o));
+ _touch(o);
+ o->oid = new_oid;
+ get_object_key(cct, new_oid, &o->key);
+}
+
+bool KStore::OnodeHashLRU::get_next(
+ const ghobject_t& after,
+ pair<ghobject_t,OnodeRef> *next)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(20) << __func__ << " after " << after << dendl;
+
+ if (after == ghobject_t()) {
+ if (lru.empty()) {
+ return false;
+ }
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.begin();
+ ceph_assert(p != onode_map.end());
+ next->first = p->first;
+ next->second = p->second;
+ return true;
+ }
+
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(after);
+ ceph_assert(p != onode_map.end()); // for now
+ lru_list_t::iterator pi = lru.iterator_to(*p->second);
+ ++pi;
+ if (pi == lru.end()) {
+ return false;
+ }
+ next->first = pi->oid;
+ next->second = onode_map[pi->oid];
+ return true;
+}
+
+int KStore::OnodeHashLRU::trim(int max)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(20) << __func__ << " max " << max
+ << " size " << onode_map.size() << dendl;
+ int trimmed = 0;
+ int num = onode_map.size() - max;
+ if (onode_map.size() == 0 || num <= 0)
+ return 0; // don't even try
+
+ lru_list_t::iterator p = lru.end();
+ if (num)
+ --p;
+ while (num > 0) {
+ Onode *o = &*p;
+ int refs = o->nref.load();
+ if (refs > 1) {
+ dout(20) << __func__ << " " << o->oid << " has " << refs
+ << " refs; stopping with " << num << " left to trim" << dendl;
+ break;
+ }
+ dout(30) << __func__ << " trim " << o->oid << dendl;
+ if (p != lru.begin()) {
+ lru.erase(p--);
+ } else {
+ lru.erase(p);
+ ceph_assert(num == 1);
+ }
+ o->get(); // paranoia
+ onode_map.erase(o->oid);
+ o->put();
+ --num;
+ ++trimmed;
+ }
+ return trimmed;
+}
+
+// =======================================================
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore(" << store->path << ").collection(" << cid << ") "
+
+KStore::Collection::Collection(KStore *ns, coll_t cid)
+ : CollectionImpl(cid),
+ store(ns),
+ lock("KStore::Collection::lock", true, false),
+ osr(new OpSequencer()),
+ onode_map(store->cct)
+{
+}
+
+void KStore::Collection::flush()
+{
+ osr->flush();
+}
+
+bool KStore::Collection::flush_commit(Context *c)
+{
+ return osr->flush_commit(c);
+}
+
+
+KStore::OnodeRef KStore::Collection::get_onode(
+ const ghobject_t& oid,
+ bool create)
+{
+ ceph_assert(create ? lock.is_wlocked() : lock.is_locked());
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!oid.match(cnode.bits, pgid.ps())) {
+ lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+ << pgid << " bits " << cnode.bits << dendl;
+ ceph_abort();
+ }
+ }
+
+ OnodeRef o = onode_map.lookup(oid);
+ if (o)
+ return o;
+
+ string key;
+ get_object_key(store->cct, oid, &key);
+
+ ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+ << pretty_binary_string(key) << dendl;
+
+ bufferlist v;
+ int r = store->db->get(PREFIX_OBJ, key, &v);
+ ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+ Onode *on;
+ if (v.length() == 0) {
+ ceph_assert(r == -ENOENT);
+ if (!create)
+ return OnodeRef();
+
+ // new
+ on = new Onode(store->cct, oid, key);
+ on->dirty = true;
+ } else {
+ // loaded
+ ceph_assert(r >=0);
+ on = new Onode(store->cct, oid, key);
+ on->exists = true;
+ auto p = v.cbegin();
+ decode(on->onode, p);
+ }
+ o.reset(on);
+ onode_map.add(oid, o);
+ return o;
+}
+
+
+
+// =======================================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore(" << path << ") "
+
+KStore::KStore(CephContext *cct, const string& path)
+ : ObjectStore(cct, path),
+ db(NULL),
+ basedir(path),
+ path_fd(-1),
+ fsid_fd(-1),
+ mounted(false),
+ coll_lock("KStore::coll_lock"),
+ nid_last(0),
+ nid_max(0),
+ throttle_ops(cct, "kstore_max_ops", cct->_conf->kstore_max_ops),
+ throttle_bytes(cct, "kstore_max_bytes", cct->_conf->kstore_max_bytes),
+ finisher(cct),
+ kv_sync_thread(this),
+ kv_stop(false),
+ logger(nullptr)
+{
+ _init_logger();
+}
+
+KStore::~KStore()
+{
+ _shutdown_logger();
+ ceph_assert(!mounted);
+ ceph_assert(db == NULL);
+ ceph_assert(fsid_fd < 0);
+}
+
+void KStore::_init_logger()
+{
+ // XXX
+ PerfCountersBuilder b(cct, "KStore",
+ l_kstore_first, l_kstore_last);
+ b.add_time_avg(l_kstore_state_prepare_lat, "state_prepare_lat", "Average prepare state latency");
+ b.add_time_avg(l_kstore_state_kv_queued_lat, "state_kv_queued_lat", "Average kv_queued state latency");
+ b.add_time_avg(l_kstore_state_kv_done_lat, "state_kv_done_lat", "Average kv_done state latency");
+ b.add_time_avg(l_kstore_state_finishing_lat, "state_finishing_lat", "Average finishing state latency");
+ b.add_time_avg(l_kstore_state_done_lat, "state_done_lat", "Average done state latency");
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+void KStore::_shutdown_logger()
+{
+ // XXX
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+int KStore::_open_path()
+{
+ ceph_assert(path_fd < 0);
+ path_fd = ::open(path.c_str(), O_DIRECTORY|O_CLOEXEC);
+ if (path_fd < 0) {
+ int r = -errno;
+ derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void KStore::_close_path()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(path_fd));
+ path_fd = -1;
+}
+
+int KStore::_open_fsid(bool create)
+{
+ ceph_assert(fsid_fd < 0);
+ int flags = O_RDWR;
+ if (create)
+ flags |= O_CREAT;
+ fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+ if (fsid_fd < 0) {
+ int err = -errno;
+ derr << __func__ << " " << cpp_strerror(err) << dendl;
+ return err;
+ }
+ return 0;
+}
+
+int KStore::_read_fsid(uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0) {
+ derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str)) {
+ derr << __func__ << " unparsable uuid " << fsid_str << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int KStore::_write_fsid()
+{
+ int r = ::ftruncate(fsid_fd, 0);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ string str = stringify(fsid) + "\n";
+ r = safe_write(fsid_fd, str.c_str(), str.length());
+ if (r < 0) {
+ derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = ::fsync(fsid_fd);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void KStore::_close_fsid()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+}
+
+int KStore::_lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ l.l_start = 0;
+ l.l_len = 0;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ derr << __func__ << " failed to lock " << path << "/fsid"
+ << " (is another ceph-osd still running?)"
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool KStore::test_mount_in_use()
+{
+ // most error conditions mean the mount is not in use (e.g., because
+ // it doesn't exist). only if we fail to lock do we conclude it is
+ // in use.
+ bool ret = false;
+ int r = _open_path();
+ if (r < 0)
+ return false;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _lock_fsid();
+ if (r < 0)
+ ret = true; // if we can't lock, it is in use
+ _close_fsid();
+ out_path:
+ _close_path();
+ return ret;
+}
+
+int KStore::_open_db(bool create)
+{
+ int r;
+ ceph_assert(!db);
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/db", path.c_str());
+
+ string kv_backend;
+ if (create) {
+ kv_backend = cct->_conf->kstore_backend;
+ } else {
+ r = read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " uanble to read 'kv_backend' meta" << dendl;
+ return -EIO;
+ }
+ }
+ dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+ if (create) {
+ int r = ::mkdir(fn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // wal_dir, too!
+ char walfn[PATH_MAX];
+ snprintf(walfn, sizeof(walfn), "%s/db.wal", path.c_str());
+ r = ::mkdir(walfn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << walfn
+ << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ db = KeyValueDB::create(cct, kv_backend, fn);
+ if (!db) {
+ derr << __func__ << " error creating db" << dendl;
+ return -EIO;
+ }
+ string options;
+ if (kv_backend == "rocksdb")
+ options = cct->_conf->kstore_rocksdb_options;
+ db->init(options);
+ stringstream err;
+ if (create)
+ r = db->create_and_open(err);
+ else
+ r = db->open(err);
+ if (r) {
+ derr << __func__ << " erroring opening db: " << err.str() << dendl;
+ delete db;
+ db = NULL;
+ return -EIO;
+ }
+ dout(1) << __func__ << " opened " << kv_backend
+ << " path " << fn << " options " << options << dendl;
+ return 0;
+}
+
+void KStore::_close_db()
+{
+ ceph_assert(db);
+ delete db;
+ db = NULL;
+}
+
+int KStore::_open_collections(int *errors)
+{
+ ceph_assert(coll_map.empty());
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (cid.parse(it->key())) {
+ CollectionRef c(new Collection(this, cid));
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(c->cnode, p);
+ } catch (buffer::error& e) {
+ derr << __func__ << " failed to decode cnode, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ return -EIO;
+ }
+ dout(20) << __func__ << " opened " << cid << dendl;
+ coll_map[cid] = c;
+ } else {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ if (errors)
+ (*errors)++;
+ }
+ }
+ return 0;
+}
+
+int KStore::mkfs()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+ int r;
+ uuid_d old_fsid;
+
+ r = _open_path();
+ if (r < 0)
+ return r;
+
+ r = _open_fsid(true);
+ if (r < 0)
+ goto out_path_fd;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = _read_fsid(&old_fsid);
+ if (r < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __func__ << " generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+ }
+ // we'll write it last.
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __func__ << " on-disk fsid " << old_fsid
+ << " != provided " << fsid << dendl;
+ r = -EINVAL;
+ goto out_close_fsid;
+ }
+ fsid = old_fsid;
+ dout(1) << __func__ << " already created, fsid is " << fsid << dendl;
+ goto out_close_fsid;
+ }
+
+ r = _open_db(true);
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = write_meta("kv_backend", cct->_conf->kstore_backend);
+ if (r < 0)
+ goto out_close_db;
+
+ r = write_meta("type", "kstore");
+ if (r < 0)
+ goto out_close_db;
+
+ // indicate mkfs completion/success by writing the fsid file
+ r = _write_fsid();
+ if (r == 0)
+ dout(10) << __func__ << " success" << dendl;
+ else
+ derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+
+ out_close_db:
+ _close_db();
+ out_close_fsid:
+ _close_fsid();
+ out_path_fd:
+ _close_path();
+ return r;
+}
+
+int KStore::mount()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+
+ if (cct->_conf->kstore_fsck_on_mount) {
+ int rc = fsck(cct->_conf->kstore_fsck_on_mount_deep);
+ if (rc < 0)
+ return rc;
+ }
+
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_db(false);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_super_meta();
+ if (r < 0)
+ goto out_db;
+
+ r = _open_collections();
+ if (r < 0)
+ goto out_db;
+
+ finisher.start();
+ kv_sync_thread.create("kstore_kv_sync");
+
+ mounted = true;
+ return 0;
+
+ out_db:
+ _close_db();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ return r;
+}
+
+int KStore::umount()
+{
+ ceph_assert(mounted);
+ dout(1) << __func__ << dendl;
+
+ _sync();
+ _reap_collections();
+ coll_map.clear();
+
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ dout(20) << __func__ << " draining finisher" << dendl;
+ finisher.wait_for_empty();
+ dout(20) << __func__ << " stopping finisher" << dendl;
+ finisher.stop();
+ dout(20) << __func__ << " closing" << dendl;
+
+ mounted = false;
+ _close_db();
+ _close_fsid();
+ _close_path();
+ return 0;
+}
+
+int KStore::fsck(bool deep)
+{
+ dout(1) << __func__ << dendl;
+ int errors = 0;
+ dout(1) << __func__ << " finish with " << errors << " errors" << dendl;
+ return errors;
+}
+
+void KStore::_sync()
+{
+ dout(10) << __func__ << dendl;
+
+ std::unique_lock<std::mutex> l(kv_lock);
+ while (!kv_committing.empty() ||
+ !kv_queue.empty()) {
+ dout(20) << " waiting for kv to commit" << dendl;
+ kv_sync_cond.wait(l);
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+}
+
+int KStore::statfs(struct store_statfs_t* buf0, osd_alert_list_t* alerts)
+{
+ struct statfs buf;
+ buf0->reset();
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ if (::statfs(basedir.c_str(), &buf) < 0) {
+ int r = -errno;
+ ceph_assert(r != -ENOENT);
+ return r;
+ }
+
+ buf0->total = buf.f_blocks * buf.f_bsize;
+ buf0->available = buf.f_bavail * buf.f_bsize;
+
+ return 0;
+}
+
+ObjectStore::CollectionHandle KStore::open_collection(const coll_t& cid)
+{
+ return _get_collection(cid);
+}
+
+ObjectStore::CollectionHandle KStore::create_new_collection(const coll_t& cid)
+{
+ auto *c = new Collection(this, cid);
+ RWLock::WLocker l(coll_lock);
+ new_coll_map[cid] = c;
+ return c;
+}
+
+int KStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
+{
+ return -ENOTSUP;
+}
+
+// ---------------
+// cache
+
+KStore::CollectionRef KStore::_get_collection(coll_t cid)
+{
+ RWLock::RLocker l(coll_lock);
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+void KStore::_queue_reap_collection(CollectionRef& c)
+{
+ dout(10) << __func__ << " " << c->cid << dendl;
+ std::lock_guard<std::mutex> l(reap_lock);
+ removed_collections.push_back(c);
+}
+
+void KStore::_reap_collections()
+{
+ list<CollectionRef> removed_colls;
+ std::lock_guard<std::mutex> l(reap_lock);
+ removed_colls.swap(removed_collections);
+
+ for (list<CollectionRef>::iterator p = removed_colls.begin();
+ p != removed_colls.end();
+ ++p) {
+ CollectionRef c = *p;
+ dout(10) << __func__ << " " << c->cid << dendl;
+ {
+ pair<ghobject_t,OnodeRef> next;
+ while (c->onode_map.get_next(next.first, &next)) {
+ ceph_assert(!next.second->exists);
+ if (!next.second->flush_txns.empty()) {
+ dout(10) << __func__ << " " << c->cid << " " << next.second->oid
+ << " flush_txns " << next.second->flush_txns << dendl;
+ return;
+ }
+ }
+ }
+ c->onode_map.clear();
+ dout(10) << __func__ << " " << c->cid << " done" << dendl;
+ }
+
+ dout(10) << __func__ << " all reaped" << dendl;
+}
+
+// ---------------
+// read operations
+
+bool KStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return false;
+ return true;
+}
+
+int KStore::stat(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return -ENOENT;
+ st->st_size = o->onode.size;
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ return 0;
+}
+
+int KStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ return -EOPNOTSUPP;
+}
+
+int KStore::read(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ dout(15) << __func__ << " " << ch->cid << " " << oid
+ << " " << offset << "~" << length
+ << dendl;
+ bl.clear();
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+
+ int r;
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (offset == length && offset == 0)
+ length = o->onode.size;
+
+ r = _do_read(o, offset, length, bl, op_flags);
+
+ out:
+ dout(10) << __func__ << " " << ch->cid << " " << oid
+ << " " << offset << "~" << length
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_do_read(
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ int r = 0;
+ uint64_t stripe_size = o->onode.stripe_size;
+ uint64_t stripe_off;
+
+ dout(20) << __func__ << " " << offset << "~" << length << " size "
+ << o->onode.size << " nid " << o->onode.nid << dendl;
+ bl.clear();
+
+ if (offset > o->onode.size) {
+ goto out;
+ }
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+ if (stripe_size == 0) {
+ bl.append_zero(length);
+ r = length;
+ goto out;
+ }
+
+ o->flush();
+
+ stripe_off = offset % stripe_size;
+ while (length > 0) {
+ bufferlist stripe;
+ _do_read_stripe(o, offset - stripe_off, &stripe);
+ dout(30) << __func__ << " stripe " << offset - stripe_off << " got "
+ << stripe.length() << dendl;
+ unsigned swant = std::min<unsigned>(stripe_size - stripe_off, length);
+ if (stripe.length()) {
+ if (swant == stripe.length()) {
+ bl.claim_append(stripe);
+ dout(30) << __func__ << " taking full stripe" << dendl;
+ } else {
+ unsigned l = 0;
+ if (stripe_off < stripe.length()) {
+ l = std::min<uint64_t>(stripe.length() - stripe_off, swant);
+ bufferlist t;
+ t.substr_of(stripe, stripe_off, l);
+ bl.claim_append(t);
+ dout(30) << __func__ << " taking " << stripe_off << "~" << l << dendl;
+ }
+ if (l < swant) {
+ bl.append_zero(swant - l);
+ dout(30) << __func__ << " adding " << swant - l << " zeros" << dendl;
+ }
+ }
+ } else {
+ dout(30) << __func__ << " generating " << swant << " zeros" << dendl;
+ bl.append_zero(swant);
+ }
+ offset += swant;
+ length -= swant;
+ stripe_off = 0;
+ }
+ r = bl.length();
+ dout(30) << " result:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ out:
+ return r;
+}
+
+int KStore::fiemap(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl)
+{
+ map<uint64_t, uint64_t> m;
+ int r = fiemap(ch, oid, offset, len, m);
+ if (r >= 0) {
+ encode(m, bl);
+ }
+ return r;
+}
+
+int KStore::fiemap(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ map<uint64_t, uint64_t>& destmap)
+{
+ CollectionRef c = static_cast<Collection*>(ch.get());
+ if (!c)
+ return -ENOENT;
+ RWLock::RLocker l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ return -ENOENT;
+ }
+
+ if (offset > o->onode.size)
+ goto out;
+
+ if (offset + len > o->onode.size) {
+ len = o->onode.size - offset;
+ }
+
+ dout(20) << __func__ << " " << offset << "~" << len << " size "
+ << o->onode.size << dendl;
+
+ // FIXME: do something smarter here
+ destmap[0] = o->onode.size;
+
+ out:
+ dout(20) << __func__ << " " << offset << "~" << len
+ << " size = 0 (" << destmap << ")" << dendl;
+ return 0;
+}
+
+int KStore::getattr(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ const char *name,
+ bufferptr& value)
+{
+ dout(15) << __func__ << " " << ch->cid << " " << oid << " " << name << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r;
+ string k(name);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (!o->onode.attrs.count(k)) {
+ r = -ENODATA;
+ goto out;
+ }
+ value = o->onode.attrs[k];
+ r = 0;
+ out:
+ dout(10) << __func__ << " " << ch->cid << " " << oid << " " << name
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::getattrs(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r;
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ aset = o->onode.attrs;
+ r = 0;
+ out:
+ dout(10) << __func__ << " " << ch->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::list_collections(vector<coll_t>& ls)
+{
+ RWLock::RLocker l(coll_lock);
+ for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p)
+ ls.push_back(p->first);
+ return 0;
+}
+
+bool KStore::collection_exists(const coll_t& c)
+{
+ RWLock::RLocker l(coll_lock);
+ return coll_map.count(c);
+}
+
+int KStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+ &ls, &next);
+ if (r < 0) {
+ derr << __func__ << " collection_list returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ *empty = ls.empty();
+ dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+ return 0;
+}
+
+int KStore::collection_bits(CollectionHandle& ch)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
+ return c->cnode.bits;
+}
+
+int KStore::collection_list(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ RWLock::RLocker l(c->lock);
+ r = _collection_list(c, start, end, max, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int KStore::_collection_list(
+ Collection* c, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ int r = 0;
+ KeyValueDB::Iterator it;
+ string temp_start_key, temp_end_key;
+ string start_key, end_key;
+ bool set_next = false;
+ string pend;
+ bool temp;
+
+ ghobject_t static_next;
+ if (!pnext)
+ pnext = &static_next;
+
+ if (start == ghobject_t::get_max() ||
+ start.hobj.is_max()) {
+ goto out;
+ }
+ get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
+ &start_key, &end_key);
+ dout(20) << __func__
+ << " range " << pretty_binary_string(temp_start_key)
+ << " to " << pretty_binary_string(temp_end_key)
+ << " and " << pretty_binary_string(start_key)
+ << " to " << pretty_binary_string(end_key)
+ << " start " << start << dendl;
+ it = db->get_iterator(PREFIX_OBJ);
+ if (start == ghobject_t() || start == c->cid.get_min_hobj()) {
+ it->upper_bound(temp_start_key);
+ temp = true;
+ } else {
+ string k;
+ get_object_key(cct, start, &k);
+ if (start.hobj.is_temp()) {
+ temp = true;
+ ceph_assert(k >= temp_start_key && k < temp_end_key);
+ } else {
+ temp = false;
+ ceph_assert(k >= start_key && k < end_key);
+ }
+ dout(20) << " start from " << pretty_binary_string(k)
+ << " temp=" << (int)temp << dendl;
+ it->lower_bound(k);
+ }
+ if (end.hobj.is_max()) {
+ pend = temp ? temp_end_key : end_key;
+ } else {
+ if (end.hobj.is_temp()) {
+ if (temp)
+ get_object_key(cct, end, &pend);
+ else
+ goto out;
+ } else {
+ if (temp)
+ pend = temp_end_key;
+ else
+ get_object_key(cct, end, &pend);
+ }
+ }
+ dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ while (true) {
+ if (!it->valid() || it->key() >= pend) {
+ if (!it->valid())
+ dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+ else
+ dout(20) << __func__ << " key " << pretty_binary_string(it->key())
+ << " > " << end << dendl;
+ if (temp) {
+ if (end.hobj.is_temp()) {
+ if (it->valid() && it->key() < temp_end_key) {
+ int r = get_key_object(it->key(), pnext);
+ ceph_assert(r == 0);
+ set_next = true;
+ }
+ break;
+ }
+ dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+ temp = false;
+ it->upper_bound(start_key);
+ if (end.hobj.is_max())
+ pend = end_key;
+ else
+ get_object_key(cct, end, &pend);
+ dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ continue;
+ }
+ if (it->valid() && it->key() < end_key) {
+ int r = get_key_object(it->key(), pnext);
+ ceph_assert(r == 0);
+ set_next = true;
+ }
+ break;
+ }
+ dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ ceph_assert(r == 0);
+ if (ls->size() >= (unsigned)max) {
+ dout(20) << __func__ << " reached max " << max << dendl;
+ *pnext = oid;
+ set_next = true;
+ break;
+ }
+ ls->push_back(oid);
+ it->next();
+ }
+out:
+ if (!set_next) {
+ *pnext = ghobject_t::get_max();
+ }
+ return r;
+}
+
+// omap reads
+
+KStore::OmapIteratorImpl::OmapIteratorImpl(
+ CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+ : c(c), o(o), it(it)
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.omap_head) {
+ get_omap_key(o->onode.omap_head, string(), &head);
+ get_omap_tail(o->onode.omap_head, &tail);
+ it->lower_bound(head);
+ }
+}
+
+int KStore::OmapIteratorImpl::seek_to_first()
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.omap_head) {
+ it->lower_bound(head);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ return 0;
+}
+
+int KStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.omap_head) {
+ string key;
+ get_omap_key(o->onode.omap_head, after, &key);
+ it->upper_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ return 0;
+}
+
+int KStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.omap_head) {
+ string key;
+ get_omap_key(o->onode.omap_head, to, &key);
+ it->lower_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ return 0;
+}
+
+bool KStore::OmapIteratorImpl::valid()
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.omap_head && it->valid() && it->raw_key().second <= tail) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+int KStore::OmapIteratorImpl::next()
+{
+ RWLock::RLocker l(c->lock);
+ if (o->onode.omap_head) {
+ it->next();
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+string KStore::OmapIteratorImpl::key()
+{
+ RWLock::RLocker l(c->lock);
+ ceph_assert(it->valid());
+ string db_key = it->raw_key().second;
+ string user_key;
+ decode_omap_key(db_key, &user_key);
+ return user_key;
+}
+
+bufferlist KStore::OmapIteratorImpl::value()
+{
+ RWLock::RLocker l(c->lock);
+ ceph_assert(it->valid());
+ return it->value();
+}
+
+int KStore::omap_get(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ {
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string head, tail;
+ get_omap_header(o->onode.omap_head, &head);
+ get_omap_tail(o->onode.omap_head, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ *header = it->value();
+ } else if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ string user_key;
+ decode_omap_key(it->key(), &user_key);
+ dout(30) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ ceph_assert(it->key() < tail);
+ (*out)[user_key] = it->value();
+ }
+ it->next();
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_get_header(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ {
+ string head;
+ get_omap_header(o->onode.omap_head, &head);
+ if (db->get(PREFIX_OMAP, head, header) >= 0) {
+ dout(30) << __func__ << " got header" << dendl;
+ } else {
+ dout(30) << __func__ << " no header" << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_get_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ {
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string head, tail;
+ get_omap_key(o->onode.omap_head, string(), &head);
+ get_omap_tail(o->onode.omap_head, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ }
+ string user_key;
+ decode_omap_key(it->key(), &user_key);
+ dout(30) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ ceph_assert(it->key() < tail);
+ keys->insert(user_key);
+ it->next();
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_get_values(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ string key;
+ get_omap_key(o->onode.omap_head, *p, &key);
+ bufferlist val;
+ if (db->get(PREFIX_OMAP, key, &val) >= 0) {
+ dout(30) << __func__ << " got " << pretty_binary_string(key)
+ << " -> " << *p << dendl;
+ out->insert(make_pair(*p, val));
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_check_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ string key;
+ get_omap_key(o->onode.omap_head, *p, &key);
+ bufferlist val;
+ if (db->get(PREFIX_OMAP, key, &val) >= 0) {
+ dout(30) << __func__ << " have " << pretty_binary_string(key)
+ << " -> " << *p << dendl;
+ out->insert(*p);
+ } else {
+ dout(30) << __func__ << " miss " << pretty_binary_string(key)
+ << " -> " << *p << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+ObjectMap::ObjectMapIterator KStore::get_omap_iterator(
+ CollectionHandle& ch, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ )
+{
+
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ o->flush();
+ dout(10) << __func__ << " header = " << o->onode.omap_head <<dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
+}
+
+
+// -----------------
+// write helpers
+
+int KStore::_open_super_meta()
+{
+ // nid
+ {
+ nid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "nid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ decode(nid_max, p);
+ } catch (buffer::error& e) {
+ }
+ dout(10) << __func__ << " old nid_max " << nid_max << dendl;
+ nid_last = nid_max;
+ }
+ return 0;
+}
+
+void KStore::_assign_nid(TransContext *txc, OnodeRef o)
+{
+ if (o->onode.nid)
+ return;
+ std::lock_guard<std::mutex> l(nid_lock);
+ o->onode.nid = ++nid_last;
+ dout(20) << __func__ << " " << o->oid << " nid " << o->onode.nid << dendl;
+ if (nid_last > nid_max) {
+ nid_max += cct->_conf->kstore_nid_prealloc;
+ bufferlist bl;
+ encode(nid_max, bl);
+ txc->t->set(PREFIX_SUPER, "nid_max", bl);
+ dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+ }
+}
+
+KStore::TransContext *KStore::_txc_create(OpSequencer *osr)
+{
+ TransContext *txc = new TransContext(osr);
+ txc->t = db->get_transaction();
+ osr->queue_new(txc);
+ dout(20) << __func__ << " osr " << osr << " = " << txc << dendl;
+ return txc;
+}
+
+void KStore::_txc_state_proc(TransContext *txc)
+{
+ while (true) {
+ dout(10) << __func__ << " txc " << txc
+ << " " << txc->get_state_name() << dendl;
+ switch (txc->state) {
+ case TransContext::STATE_PREPARE:
+ txc->log_state_latency(logger, l_kstore_state_prepare_lat);
+ txc->state = TransContext::STATE_KV_QUEUED;
+ if (!cct->_conf->kstore_sync_transaction) {
+ std::lock_guard<std::mutex> l(kv_lock);
+ if (cct->_conf->kstore_sync_submit_transaction) {
+ int r = db->submit_transaction(txc->t);
+ ceph_assert(r == 0);
+ }
+ kv_queue.push_back(txc);
+ kv_cond.notify_one();
+ return;
+ }
+ {
+ int r = db->submit_transaction_sync(txc->t);
+ ceph_assert(r == 0);
+ }
+ break;
+
+ case TransContext::STATE_KV_QUEUED:
+ txc->log_state_latency(logger, l_kstore_state_kv_queued_lat);
+ txc->state = TransContext::STATE_KV_DONE;
+ _txc_finish_kv(txc);
+ // ** fall-thru **
+
+ case TransContext::STATE_KV_DONE:
+ txc->log_state_latency(logger, l_kstore_state_kv_done_lat);
+ txc->state = TransContext::STATE_FINISHING;
+ // ** fall-thru **
+
+ case TransContext::TransContext::STATE_FINISHING:
+ txc->log_state_latency(logger, l_kstore_state_finishing_lat);
+ _txc_finish(txc);
+ return;
+
+ default:
+ derr << __func__ << " unexpected txc " << txc
+ << " state " << txc->get_state_name() << dendl;
+ ceph_abort_msg("unexpected txc state");
+ return;
+ }
+ }
+}
+
+void KStore::_txc_finalize(OpSequencer *osr, TransContext *txc)
+{
+ dout(20) << __func__ << " osr " << osr << " txc " << txc
+ << " onodes " << txc->onodes << dendl;
+
+ // finalize onodes
+ for (set<OnodeRef>::iterator p = txc->onodes.begin();
+ p != txc->onodes.end();
+ ++p) {
+ bufferlist bl;
+ encode((*p)->onode, bl);
+ dout(20) << " onode size is " << bl.length() << dendl;
+ txc->t->set(PREFIX_OBJ, (*p)->key, bl);
+
+ std::lock_guard<std::mutex> l((*p)->flush_lock);
+ (*p)->flush_txns.insert(txc);
+ }
+}
+
+void KStore::_txc_finish_kv(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << dendl;
+
+ // warning: we're calling onreadable_sync inside the sequencer lock
+ if (txc->onreadable_sync) {
+ txc->onreadable_sync->complete(0);
+ txc->onreadable_sync = NULL;
+ }
+ if (txc->onreadable) {
+ finisher.queue(txc->onreadable);
+ txc->onreadable = NULL;
+ }
+ if (txc->oncommit) {
+ finisher.queue(txc->oncommit);
+ txc->oncommit = NULL;
+ }
+ if (!txc->oncommits.empty()) {
+ finisher.queue(txc->oncommits);
+ }
+
+ throttle_ops.put(txc->ops);
+ throttle_bytes.put(txc->bytes);
+}
+
+void KStore::_txc_finish(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+ ceph_assert(txc->state == TransContext::STATE_FINISHING);
+
+ for (set<OnodeRef>::iterator p = txc->onodes.begin();
+ p != txc->onodes.end();
+ ++p) {
+ std::lock_guard<std::mutex> l((*p)->flush_lock);
+ dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns
+ << dendl;
+ ceph_assert((*p)->flush_txns.count(txc));
+ (*p)->flush_txns.erase(txc);
+ if ((*p)->flush_txns.empty()) {
+ (*p)->flush_cond.notify_all();
+ (*p)->clear_pending_stripes();
+ }
+ }
+
+ // clear out refs
+ txc->onodes.clear();
+
+ while (!txc->removed_collections.empty()) {
+ _queue_reap_collection(txc->removed_collections.front());
+ txc->removed_collections.pop_front();
+ }
+
+ OpSequencerRef osr = txc->osr;
+ {
+ std::lock_guard<std::mutex> l(osr->qlock);
+ txc->state = TransContext::STATE_DONE;
+ }
+
+ _osr_reap_done(osr.get());
+}
+
+void KStore::_osr_reap_done(OpSequencer *osr)
+{
+ std::lock_guard<std::mutex> l(osr->qlock);
+ dout(20) << __func__ << " osr " << osr << dendl;
+ while (!osr->q.empty()) {
+ TransContext *txc = &osr->q.front();
+ dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
+ << dendl;
+ if (txc->state != TransContext::STATE_DONE) {
+ break;
+ }
+
+ if (txc->first_collection) {
+ txc->first_collection->onode_map.trim(cct->_conf->kstore_onode_map_size);
+ }
+
+ osr->q.pop_front();
+ txc->log_state_latency(logger, l_kstore_state_done_lat);
+ delete txc;
+ osr->qcond.notify_all();
+ if (osr->q.empty())
+ dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+ }
+}
+
+void KStore::_kv_sync_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock<std::mutex> l(kv_lock);
+ while (true) {
+ ceph_assert(kv_committing.empty());
+ if (kv_queue.empty()) {
+ if (kv_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ kv_sync_cond.notify_all();
+ kv_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ dout(20) << __func__ << " committing " << kv_queue.size() << dendl;
+ kv_committing.swap(kv_queue);
+ utime_t start = ceph_clock_now();
+ l.unlock();
+
+ dout(30) << __func__ << " committing txc " << kv_committing << dendl;
+
+ // one transaction to force a sync
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (!cct->_conf->kstore_sync_submit_transaction) {
+ for (std::deque<TransContext *>::iterator it = kv_committing.begin();
+ it != kv_committing.end();
+ ++it) {
+ int r = db->submit_transaction((*it)->t);
+ ceph_assert(r == 0);
+ }
+ }
+ int r = db->submit_transaction_sync(t);
+ ceph_assert(r == 0);
+ utime_t finish = ceph_clock_now();
+ utime_t dur = finish - start;
+ dout(20) << __func__ << " committed " << kv_committing.size()
+ << " in " << dur << dendl;
+ while (!kv_committing.empty()) {
+ TransContext *txc = kv_committing.front();
+ _txc_state_proc(txc);
+ kv_committing.pop_front();
+ }
+
+ // this is as good a place as any ...
+ _reap_collections();
+
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+}
+
+
+// ---------------------------
+// transactions
+
+int KStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ Context *onreadable;
+ Context *ondisk;
+ Context *onreadable_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &onreadable, &ondisk, &onreadable_sync);
+
+ // set up the sequencer
+ Collection *c = static_cast<Collection*>(ch.get());
+ OpSequencer *osr = c->osr.get();
+ dout(10) << __func__ << " ch " << ch.get() << " " << c->cid << dendl;
+
+ // prepare
+ TransContext *txc = _txc_create(osr);
+ txc->onreadable = onreadable;
+ txc->onreadable_sync = onreadable_sync;
+ txc->oncommit = ondisk;
+
+ for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
+ txc->ops += (*p).get_num_ops();
+ txc->bytes += (*p).get_num_bytes();
+ _txc_add_transaction(txc, &(*p));
+ }
+
+ _txc_finalize(osr, txc);
+
+ throttle_ops.get(txc->ops);
+ throttle_bytes.get(txc->bytes);
+
+ // execute (start)
+ _txc_state_proc(txc);
+ return 0;
+}
+
+void KStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+ Transaction::iterator i = t->begin();
+
+ dout(30) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ vector<CollectionRef> cvec(i.colls.size());
+ unsigned j = 0;
+ for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+ ++p, ++j) {
+ cvec[j] = _get_collection(*p);
+
+ // note first collection we reference
+ if (!j && !txc->first_collection)
+ txc->first_collection = cvec[j];
+ }
+ vector<OnodeRef> ovec(i.objects.size());
+
+ for (int pos = 0; i.have_op(); ++pos) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ // no coll or obj
+ if (op->op == Transaction::OP_NOP)
+ continue;
+
+ // collection operations
+ CollectionRef &c = cvec[op->cid];
+ switch (op->op) {
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _remove_collection(txc, cid, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ ceph_assert(!c);
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(txc, cid, op->split_bits, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ uint32_t bits = op->split_bits;
+ r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ uint32_t type = op->hint_type;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ dout(10) << __func__ << " collection hint objects is a no-op, "
+ << " pg_num " << pg_num << " num_objects " << num_objs
+ << dendl;
+ } else {
+ // Ignore the hint
+ dout(10) << __func__ << " unknown collection hint " << type << dendl;
+ }
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ ceph_abort_msg("not implemented");
+ break;
+ }
+ if (r < 0) {
+ derr << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+ ceph_abort_msg("unexpected error");
+ }
+
+ // object operations
+ RWLock::WLocker l(c->lock);
+ OnodeRef &o = ovec[op->oid];
+ if (!o) {
+ // these operations implicity create the object
+ bool create = false;
+ if (op->op == Transaction::OP_TOUCH ||
+ op->op == Transaction::OP_WRITE ||
+ op->op == Transaction::OP_ZERO) {
+ create = true;
+ }
+ ghobject_t oid = i.get_oid(op->oid);
+ o = c->get_onode(oid, create);
+ if (!create) {
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+ << oid << dendl;
+ r = -ENOENT;
+ goto endop;
+ }
+ }
+ }
+
+ switch (op->op) {
+ case Transaction::OP_TOUCH:
+ r = _touch(txc, c, o);
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(txc, c, o, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(txc, c, o, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ r = _truncate(txc, c, o, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ r = _remove(txc, c, o);
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(txc, c, o, to_set);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(txc, c, o, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ string name = i.decode_string();
+ r = _rmattr(txc, c, o, name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ r = _rmattrs(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ r = _clone(txc, c, o, no);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ ceph_assert(op->cid == op->dest_cid);
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ r = _rename(txc, c, o, no, noid);
+ o.reset();
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ r = _rename(txc, c, o, no, noid);
+ if (r == -ENOENT)
+ r = 0;
+ o.reset();
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ r = _omap_clear(txc, c, o);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ bufferlist aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(txc, c, o, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ bufferlist keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(txc, c, o, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkey_range(txc, c, o, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(txc, c, o, bl);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ uint32_t flags = op->alloc_hint_flags;
+ r = _setallochint(txc, c, o,
+ expected_object_size,
+ expected_write_size,
+ flags);
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ endop:
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from key value store, misconfigured cluster";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+ ceph_abort_msg("unexpected error");
+ }
+ }
+ }
+}
+
+
+
+// -----------------
+// write operations
+
+int KStore::_touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ o->exists = true;
+ _assign_nid(txc, o);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void KStore::_dump_onode(OnodeRef o)
+{
+ dout(30) << __func__ << " " << o
+ << " nid " << o->onode.nid
+ << " size " << o->onode.size
+ << " expected_object_size " << o->onode.expected_object_size
+ << " expected_write_size " << o->onode.expected_write_size
+ << dendl;
+ for (map<string,bufferptr>::iterator p = o->onode.attrs.begin();
+ p != o->onode.attrs.end();
+ ++p) {
+ dout(30) << __func__ << " attr " << p->first
+ << " len " << p->second.length() << dendl;
+ }
+}
+
+void KStore::_do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl)
+{
+ map<uint64_t,bufferlist>::iterator p = o->pending_stripes.find(offset);
+ if (p == o->pending_stripes.end()) {
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ db->get(PREFIX_DATA, key, pbl);
+ o->pending_stripes[offset] = *pbl;
+ } else {
+ *pbl = p->second;
+ }
+}
+
+void KStore::_do_write_stripe(TransContext *txc, OnodeRef o,
+ uint64_t offset, bufferlist& bl)
+{
+ o->pending_stripes[offset] = bl;
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ txc->t->set(PREFIX_DATA, key, bl);
+}
+
+void KStore::_do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+ o->pending_stripes.erase(offset);
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ txc->t->rmkey(PREFIX_DATA, key);
+}
+
+int KStore::_do_write(TransContext *txc,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist& orig_bl,
+ uint32_t fadvise_flags)
+{
+ int r = 0;
+
+ dout(20) << __func__
+ << " " << o->oid << " " << offset << "~" << length
+ << " - have " << o->onode.size
+ << " bytes, nid " << o->onode.nid << dendl;
+ _dump_onode(o);
+ o->exists = true;
+
+ if (length == 0) {
+ return 0;
+ }
+
+ uint64_t stripe_size = o->onode.stripe_size;
+ if (!stripe_size) {
+ o->onode.stripe_size = cct->_conf->kstore_default_stripe_size;
+ stripe_size = o->onode.stripe_size;
+ }
+
+ unsigned bl_off = 0;
+ while (length > 0) {
+ uint64_t offset_rem = offset % stripe_size;
+ uint64_t end_rem = (offset + length) % stripe_size;
+ if (offset_rem == 0 && end_rem == 0) {
+ bufferlist bl;
+ bl.substr_of(orig_bl, bl_off, stripe_size);
+ dout(30) << __func__ << " full stripe " << offset << dendl;
+ _do_write_stripe(txc, o, offset, bl);
+ offset += stripe_size;
+ length -= stripe_size;
+ bl_off += stripe_size;
+ continue;
+ }
+ uint64_t stripe_off = offset - offset_rem;
+ bufferlist prev;
+ _do_read_stripe(o, stripe_off, &prev);
+ dout(20) << __func__ << " read previous stripe " << stripe_off
+ << ", got " << prev.length() << dendl;
+ bufferlist bl;
+ if (offset_rem) {
+ unsigned p = std::min<uint64_t>(prev.length(), offset_rem);
+ if (p) {
+ dout(20) << __func__ << " reuse leading " << p << " bytes" << dendl;
+ bl.substr_of(prev, 0, p);
+ }
+ if (p < offset_rem) {
+ dout(20) << __func__ << " add leading " << offset_rem - p << " zeros" << dendl;
+ bl.append_zero(offset_rem - p);
+ }
+ }
+ unsigned use = stripe_size - offset_rem;
+ if (use > length)
+ use -= stripe_size - end_rem;
+ dout(20) << __func__ << " using " << use << " for this stripe" << dendl;
+ bufferlist t;
+ t.substr_of(orig_bl, bl_off, use);
+ bl.claim_append(t);
+ bl_off += use;
+ if (end_rem) {
+ if (end_rem < prev.length()) {
+ unsigned l = prev.length() - end_rem;
+ dout(20) << __func__ << " reuse trailing " << l << " bytes" << dendl;
+ bufferlist t;
+ t.substr_of(prev, end_rem, l);
+ bl.claim_append(t);
+ }
+ }
+ dout(30) << " writing:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ _do_write_stripe(txc, o, stripe_off, bl);
+ offset += use;
+ length -= use;
+ }
+
+ if (offset > o->onode.size) {
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ o->onode.size = offset;
+ }
+
+ return r;
+}
+
+int KStore::_write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << dendl;
+ _assign_nid(txc, o);
+ int r = _do_write(txc, o, offset, length, bl, fadvise_flags);
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << dendl;
+ int r = 0;
+ o->exists = true;
+
+ _dump_onode(o);
+ _assign_nid(txc, o);
+
+ uint64_t stripe_size = o->onode.stripe_size;
+ if (stripe_size) {
+ uint64_t end = offset + length;
+ uint64_t pos = offset;
+ uint64_t stripe_off = pos % stripe_size;
+ while (pos < offset + length) {
+ if (stripe_off || end - pos < stripe_size) {
+ bufferlist stripe;
+ _do_read_stripe(o, pos - stripe_off, &stripe);
+ dout(30) << __func__ << " stripe " << pos - stripe_off << " got "
+ << stripe.length() << dendl;
+ bufferlist bl;
+ bl.substr_of(stripe, 0, std::min<uint64_t>(stripe.length(), stripe_off));
+ if (end >= pos - stripe_off + stripe_size ||
+ end >= o->onode.size) {
+ dout(20) << __func__ << " truncated stripe " << pos - stripe_off
+ << " to " << bl.length() << dendl;
+ } else {
+ auto len = end - (pos - stripe_off + bl.length());
+ bl.append_zero(len);
+ dout(20) << __func__ << " adding " << len << " of zeros" << dendl;
+ if (stripe.length() > bl.length()) {
+ unsigned l = stripe.length() - bl.length();
+ bufferlist t;
+ t.substr_of(stripe, stripe.length() - l, l);
+ dout(20) << __func__ << " keeping tail " << l << " of stripe" << dendl;
+ bl.claim_append(t);
+ }
+ }
+ _do_write_stripe(txc, o, pos - stripe_off, bl);
+ pos += stripe_size - stripe_off;
+ stripe_off = 0;
+ } else {
+ dout(20) << __func__ << " rm stripe " << pos << dendl;
+ _do_remove_stripe(txc, o, pos - stripe_off);
+ pos += stripe_size;
+ }
+ }
+ }
+ if (offset + length > o->onode.size) {
+ o->onode.size = offset + length;
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ }
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+ uint64_t stripe_size = o->onode.stripe_size;
+
+ o->flush();
+
+ // trim down stripes
+ if (stripe_size) {
+ uint64_t pos = offset;
+ uint64_t stripe_off = pos % stripe_size;
+ while (pos < o->onode.size) {
+ if (stripe_off) {
+ bufferlist stripe;
+ _do_read_stripe(o, pos - stripe_off, &stripe);
+ dout(30) << __func__ << " stripe " << pos - stripe_off << " got "
+ << stripe.length() << dendl;
+ bufferlist t;
+ t.substr_of(stripe, 0, std::min<uint64_t>(stripe_off, stripe.length()));
+ _do_write_stripe(txc, o, pos - stripe_off, t);
+ dout(20) << __func__ << " truncated stripe " << pos - stripe_off
+ << " to " << t.length() << dendl;
+ pos += stripe_size - stripe_off;
+ stripe_off = 0;
+ } else {
+ dout(20) << __func__ << " rm stripe " << pos << dendl;
+ _do_remove_stripe(txc, o, pos - stripe_off);
+ pos += stripe_size;
+ }
+ }
+
+ // trim down cached tail
+ if (o->tail_bl.length()) {
+ if (offset / stripe_size != o->onode.size / stripe_size) {
+ dout(20) << __func__ << " clear cached tail" << dendl;
+ o->clear_tail();
+ }
+ }
+ }
+
+ o->onode.size = offset;
+ dout(10) << __func__ << " truncate size to " << offset << dendl;
+
+ txc->write_onode(o);
+ return 0;
+}
+
+int KStore::_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset
+ << dendl;
+ int r = _do_truncate(txc, o, offset);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_do_remove(TransContext *txc,
+ OnodeRef o)
+{
+ string key;
+
+ _do_truncate(txc, o, 0);
+
+ o->onode.size = 0;
+ if (o->onode.omap_head) {
+ _do_omap_clear(txc, o->onode.omap_head);
+ }
+ o->exists = false;
+ o->onode = kstore_onode_t();
+ txc->onodes.erase(o);
+ get_object_key(cct, o->oid, &key);
+ txc->t->rmkey(PREFIX_OBJ, key);
+ return 0;
+}
+
+int KStore::_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = _do_remove(txc, o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << dendl;
+ int r = 0;
+ o->onode.attrs[name] = val;
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << dendl;
+ int r = 0;
+ for (map<string,bufferptr>::const_iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ if (p->second.is_partial())
+ o->onode.attrs[p->first] = bufferptr(p->second.c_str(), p->second.length());
+ else
+ o->onode.attrs[p->first] = p->second;
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << " = " << r << dendl;
+ return r;
+}
+
+
+int KStore::_rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << dendl;
+ int r = 0;
+ o->onode.attrs.erase(name);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ o->onode.attrs.clear();
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void KStore::_do_omap_clear(TransContext *txc, uint64_t id)
+{
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string prefix, tail;
+ get_omap_header(id, &prefix);
+ get_omap_tail(id, &tail);
+ it->lower_bound(prefix);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " stop at " << tail << dendl;
+ break;
+ }
+ txc->t->rmkey(PREFIX_OMAP, it->key());
+ dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
+ it->next();
+ }
+}
+
+int KStore::_omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ if (o->onode.omap_head != 0) {
+ _do_omap_clear(txc, o->onode.omap_head);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist &bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ auto p = bl.cbegin();
+ __u32 num;
+ if (!o->onode.omap_head) {
+ o->onode.omap_head = o->onode.nid;
+ txc->write_onode(o);
+ }
+ decode(num, p);
+ while (num--) {
+ string key;
+ bufferlist value;
+ decode(key, p);
+ decode(value, p);
+ string final_key;
+ get_omap_key(o->onode.omap_head, key, &final_key);
+ dout(30) << __func__ << " " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->set(PREFIX_OMAP, final_key, value);
+ }
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ string key;
+ if (!o->onode.omap_head) {
+ o->onode.omap_head = o->onode.nid;
+ txc->write_onode(o);
+ }
+ get_omap_header(o->onode.omap_head, &key);
+ txc->t->set(PREFIX_OMAP, key, bl);
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ auto p = bl.cbegin();
+ __u32 num;
+
+ if (!o->onode.omap_head) {
+ r = 0;
+ goto out;
+ }
+ decode(num, p);
+ while (num--) {
+ string key;
+ decode(key, p);
+ string final_key;
+ get_omap_key(o->onode.omap_head, key, &final_key);
+ dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->rmkey(PREFIX_OMAP, final_key);
+ }
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ KeyValueDB::Iterator it;
+ string key_first, key_last;
+ int r = 0;
+
+ if (!o->onode.omap_head) {
+ goto out;
+ }
+ it = db->get_iterator(PREFIX_OMAP);
+ get_omap_key(o->onode.omap_head, first, &key_first);
+ get_omap_key(o->onode.omap_head, last, &key_last);
+ it->lower_bound(key_first);
+ while (it->valid()) {
+ if (it->key() >= key_last) {
+ dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
+ << dendl;
+ break;
+ }
+ txc->t->rmkey(PREFIX_OMAP, it->key());
+ dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
+ it->next();
+ }
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_setallochint(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << flags
+ << dendl;
+ int r = 0;
+ o->onode.expected_object_size = expected_object_size;
+ o->onode.expected_write_size = expected_write_size;
+ o->onode.alloc_hint_flags = flags;
+
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << dendl;
+ int r = 0;
+ if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+ derr << __func__ << " mismatched hash on " << oldo->oid
+ << " and " << newo->oid << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist bl;
+ newo->exists = true;
+ _assign_nid(txc, newo);
+
+ // data
+ oldo->flush();
+
+ r = _do_read(oldo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+
+ // truncate any old data
+ r = _do_truncate(txc, newo, 0);
+ if (r < 0)
+ goto out;
+
+ r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+
+ newo->onode.attrs = oldo->onode.attrs;
+
+ // clone omap
+ if (newo->onode.omap_head) {
+ dout(20) << __func__ << " clearing old omap data" << dendl;
+ _do_omap_clear(txc, newo->onode.omap_head);
+ }
+ if (oldo->onode.omap_head) {
+ dout(20) << __func__ << " copying omap data" << dendl;
+ if (!newo->onode.omap_head) {
+ newo->onode.omap_head = newo->onode.nid;
+ }
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string head, tail;
+ get_omap_header(oldo->onode.omap_head, &head);
+ get_omap_tail(oldo->onode.omap_head, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ string key;
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ dout(30) << __func__ << " got header/data "
+ << pretty_binary_string(it->key()) << dendl;
+ ceph_assert(it->key() < tail);
+ rewrite_omap_key(newo->onode.omap_head, it->key(), &key);
+ txc->t->set(PREFIX_OMAP, key, it->value());
+ }
+ it->next();
+ }
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from " << srcoff << "~" << length
+ << " to offset " << dstoff << dendl;
+ int r = 0;
+
+ bufferlist bl;
+ newo->exists = true;
+ _assign_nid(txc, newo);
+
+ r = _do_read(oldo, srcoff, length, bl, 0);
+ if (r < 0)
+ goto out;
+
+ r = _do_write(txc, newo, dstoff, bl.length(), bl, 0);
+ if (r < 0)
+ goto out;
+
+ txc->write_onode(newo);
+
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from " << srcoff << "~" << length
+ << " to offset " << dstoff
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << new_oid << dendl;
+ int r;
+ ghobject_t old_oid = oldo->oid;
+ bufferlist bl;
+ string old_key, new_key;
+
+ if (newo && newo->exists) {
+ // destination object already exists, remove it first
+ r = _do_remove(txc, newo);
+ if (r < 0)
+ goto out;
+ }
+
+ txc->t->rmkey(PREFIX_OBJ, oldo->key);
+ txc->write_onode(oldo);
+ c->onode_map.rename(old_oid, new_oid); // this adjusts oldo->{oid,key}
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+ << new_oid << " = " << r << dendl;
+ return r;
+}
+
+// collections
+
+int KStore::_create_collection(
+ TransContext *txc,
+ coll_t cid,
+ unsigned bits,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+ int r;
+ bufferlist bl;
+
+ {
+ RWLock::WLocker l(coll_lock);
+ if (*c) {
+ r = -EEXIST;
+ goto out;
+ }
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ *c = p->second;
+ ceph_assert((*c)->cid == cid);
+ (*c)->cnode.bits = bits;
+ coll_map[cid] = *c;
+ new_coll_map.erase(p);
+ }
+ encode((*c)->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(cid), bl);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_remove_collection(TransContext *txc, coll_t cid,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << dendl;
+ int r;
+
+ {
+ RWLock::WLocker l(coll_lock);
+ if (!*c) {
+ r = -ENOENT;
+ goto out;
+ }
+ size_t nonexistent_count = 0;
+ pair<ghobject_t,OnodeRef> next_onode;
+ while ((*c)->onode_map.get_next(next_onode.first, &next_onode)) {
+ if (next_onode.second->exists) {
+ r = -ENOTEMPTY;
+ goto out;
+ }
+ ++nonexistent_count;
+ }
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ // Enumerate onodes in db, up to nonexistent_count + 1
+ // then check if all of them are marked as non-existent.
+ // Bypass the check if returned number is greater than nonexistent_count
+ r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+ nonexistent_count + 1, &ls, &next);
+ if (r >= 0) {
+ bool exists = false; //ls.size() > nonexistent_count;
+ for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+ dout(10) << __func__ << " oid " << *it << dendl;
+ auto onode = (*c)->onode_map.lookup(*it);
+ exists = !onode || onode->exists;
+ if (exists) {
+ dout(10) << __func__ << " " << *it
+ << " exists in db" << dendl;
+ }
+ }
+ if (!exists) {
+ coll_map.erase(cid);
+ txc->removed_collections.push_back(*c);
+ c->reset();
+ txc->t->rmkey(PREFIX_COLL, stringify(cid));
+ r = 0;
+ } else {
+ dout(10) << __func__ << " " << cid
+ << " is non-empty" << dendl;
+ r = -ENOTEMPTY;
+ }
+ }
+ }
+
+ out:
+ dout(10) << __func__ << " " << cid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem)
+{
+ dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ int r;
+ RWLock::WLocker l(c->lock);
+ RWLock::WLocker l2(d->lock);
+ c->onode_map.clear();
+ d->onode_map.clear();
+ c->cnode.bits = bits;
+ ceph_assert(d->cnode.bits == bits);
+ r = 0;
+
+ bufferlist bl;
+ encode(c->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
+
+ dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_merge_collection(TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits)
+{
+ dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ int r;
+ RWLock::WLocker l((*c)->lock);
+ RWLock::WLocker l2(d->lock);
+ (*c)->onode_map.clear();
+ d->onode_map.clear();
+ d->cnode.bits = bits;
+ r = 0;
+
+ coll_t cid = (*c)->cid;
+
+ bufferlist bl;
+ encode(d->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
+
+ coll_map.erase((*c)->cid);
+ txc->removed_collections.push_back(*c);
+ c->reset();
+ txc->t->rmkey(PREFIX_COLL, stringify(cid));
+
+ dout(10) << __func__ << " " << cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+// ===========================================
diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h
new file mode 100644
index 00000000..227227fb
--- /dev/null
+++ b/src/os/kstore/KStore.h
@@ -0,0 +1,692 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_KSTORE_H
+#define CEPH_OSD_KSTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "include/ceph_assert.h"
+#include "include/unordered_map.h"
+#include "common/Finisher.h"
+#include "common/RWLock.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+#include "os/ObjectStore.h"
+#include "common/perf_counters.h"
+#include "os/fs/FS.h"
+#include "kv/KeyValueDB.h"
+
+#include "kstore_types.h"
+
+#include "boost/intrusive/list.hpp"
+
+enum {
+ l_kstore_first = 832430,
+ l_kstore_state_prepare_lat,
+ l_kstore_state_kv_queued_lat,
+ l_kstore_state_kv_done_lat,
+ l_kstore_state_finishing_lat,
+ l_kstore_state_done_lat,
+ l_kstore_last
+};
+
+class KStore : public ObjectStore {
+ // -----------------------------------------------------
+ // types
+public:
+
+ class TransContext;
+
+ /// an in-memory object
+ struct Onode {
+ CephContext* cct;
+ std::atomic_int nref; ///< reference count
+
+ ghobject_t oid;
+ string key; ///< key under PREFIX_OBJ where we are stored
+ boost::intrusive::list_member_hook<> lru_item;
+
+ kstore_onode_t onode; ///< metadata stored as value in kv store
+ bool dirty; // ???
+ bool exists;
+
+ std::mutex flush_lock; ///< protect flush_txns
+ std::condition_variable flush_cond; ///< wait here for unapplied txns
+ set<TransContext*> flush_txns; ///< committing txns
+
+ uint64_t tail_offset;
+ bufferlist tail_bl;
+
+ map<uint64_t,bufferlist> pending_stripes; ///< unwritten stripes
+
+ Onode(CephContext* cct, const ghobject_t& o, const string& k)
+ : cct(cct),
+ nref(0),
+ oid(o),
+ key(k),
+ dirty(false),
+ exists(false),
+ tail_offset(0) {
+ }
+
+ void flush();
+ void get() {
+ ++nref;
+ }
+ void put() {
+ if (--nref == 0)
+ delete this;
+ }
+
+ void clear_tail() {
+ tail_offset = 0;
+ tail_bl.clear();
+ }
+ void clear_pending_stripes() {
+ pending_stripes.clear();
+ }
+ };
+ typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+ struct OnodeHashLRU {
+ CephContext* cct;
+ typedef boost::intrusive::list<
+ Onode,
+ boost::intrusive::member_hook<
+ Onode,
+ boost::intrusive::list_member_hook<>,
+ &Onode::lru_item> > lru_list_t;
+
+ std::mutex lock;
+ ceph::unordered_map<ghobject_t,OnodeRef> onode_map; ///< forward lookups
+ lru_list_t lru; ///< lru
+
+ OnodeHashLRU(CephContext* cct) : cct(cct) {}
+
+ void add(const ghobject_t& oid, OnodeRef o);
+ void _touch(OnodeRef o);
+ OnodeRef lookup(const ghobject_t& o);
+ void rename(const ghobject_t& old_oid, const ghobject_t& new_oid);
+ void clear();
+ bool get_next(const ghobject_t& after, pair<ghobject_t,OnodeRef> *next);
+ int trim(int max=-1);
+ };
+
+ class OpSequencer;
+ typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+ struct Collection : public CollectionImpl {
+ KStore *store;
+ kstore_cnode_t cnode;
+ RWLock lock;
+
+ OpSequencerRef osr;
+
+ // cache onodes on a per-collection basis to avoid lock
+ // contention.
+ OnodeHashLRU onode_map;
+
+ OnodeRef get_onode(const ghobject_t& oid, bool create);
+
+ bool contains(const ghobject_t& oid) {
+ if (cid.is_meta())
+ return oid.hobj.pool == -1;
+ spg_t spgid;
+ if (cid.is_pg(&spgid))
+ return
+ spgid.pgid.contains(cnode.bits, oid) &&
+ oid.shard_id == spgid.shard;
+ return false;
+ }
+
+ void flush() override;
+ bool flush_commit(Context *c) override;
+
+ Collection(KStore *ns, coll_t c);
+ };
+ typedef boost::intrusive_ptr<Collection> CollectionRef;
+
+ class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+ CollectionRef c;
+ OnodeRef o;
+ KeyValueDB::Iterator it;
+ string head, tail;
+ public:
+ OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
+ int seek_to_first() override;
+ int upper_bound(const string &after) override;
+ int lower_bound(const string &to) override;
+ bool valid() override;
+ int next() override;
+ string key() override;
+ bufferlist value() override;
+ int status() override {
+ return 0;
+ }
+ };
+
+ struct TransContext {
+ typedef enum {
+ STATE_PREPARE,
+ STATE_AIO_WAIT,
+ STATE_IO_DONE,
+ STATE_KV_QUEUED,
+ STATE_KV_COMMITTING,
+ STATE_KV_DONE,
+ STATE_FINISHING,
+ STATE_DONE,
+ } state_t;
+
+ state_t state;
+
+ const char *get_state_name() {
+ switch (state) {
+ case STATE_PREPARE: return "prepare";
+ case STATE_AIO_WAIT: return "aio_wait";
+ case STATE_IO_DONE: return "io_done";
+ case STATE_KV_QUEUED: return "kv_queued";
+ case STATE_KV_COMMITTING: return "kv_committing";
+ case STATE_KV_DONE: return "kv_done";
+ case STATE_FINISHING: return "finishing";
+ case STATE_DONE: return "done";
+ }
+ return "???";
+ }
+
+ void log_state_latency(PerfCounters *logger, int state) {
+ utime_t lat, now = ceph_clock_now();
+ lat = now - start;
+ logger->tinc(state, lat);
+ start = now;
+ }
+
+ CollectionRef ch;
+ OpSequencerRef osr;
+ boost::intrusive::list_member_hook<> sequencer_item;
+
+ uint64_t ops, bytes;
+
+ set<OnodeRef> onodes; ///< these onodes need to be updated/written
+ KeyValueDB::Transaction t; ///< then we will commit this
+ Context *oncommit; ///< signal on commit
+ Context *onreadable; ///< signal on readable
+ Context *onreadable_sync; ///< signal on readable
+ list<Context*> oncommits; ///< more commit completions
+ list<CollectionRef> removed_collections; ///< colls we removed
+
+ CollectionRef first_collection; ///< first referenced collection
+ utime_t start;
+ explicit TransContext(OpSequencer *o)
+ : state(STATE_PREPARE),
+ osr(o),
+ ops(0),
+ bytes(0),
+ oncommit(NULL),
+ onreadable(NULL),
+ onreadable_sync(NULL),
+ start(ceph_clock_now()){
+ //cout << "txc new " << this << std::endl;
+ }
+ ~TransContext() {
+ //cout << "txc del " << this << std::endl;
+ }
+
+ void write_onode(OnodeRef &o) {
+ onodes.insert(o);
+ }
+ };
+
+ class OpSequencer : public RefCountedObject {
+ public:
+ std::mutex qlock;
+ std::condition_variable qcond;
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::sequencer_item> > q_list_t;
+ q_list_t q; ///< transactions
+
+ ~OpSequencer() {
+ ceph_assert(q.empty());
+ }
+
+ void queue_new(TransContext *txc) {
+ std::lock_guard<std::mutex> l(qlock);
+ q.push_back(*txc);
+ }
+
+ void flush() {
+ std::unique_lock<std::mutex> l(qlock);
+ while (!q.empty())
+ qcond.wait(l);
+ }
+
+ bool flush_commit(Context *c) {
+ std::lock_guard<std::mutex> l(qlock);
+ if (q.empty()) {
+ return true;
+ }
+ TransContext *txc = &q.back();
+ if (txc->state >= TransContext::STATE_KV_DONE) {
+ return true;
+ }
+ ceph_assert(txc->state < TransContext::STATE_KV_DONE);
+ txc->oncommits.push_back(c);
+ return false;
+ }
+ };
+
+ struct KVSyncThread : public Thread {
+ KStore *store;
+ explicit KVSyncThread(KStore *s) : store(s) {}
+ void *entry() override {
+ store->_kv_sync_thread();
+ return NULL;
+ }
+ };
+
+ // --------------------------------------------------------
+ // members
+private:
+ KeyValueDB *db;
+ uuid_d fsid;
+ string basedir;
+ int path_fd; ///< open handle to $path
+ int fsid_fd; ///< open handle (locked) to $path/fsid
+ bool mounted;
+
+ RWLock coll_lock; ///< rwlock to protect coll_map
+ ceph::unordered_map<coll_t, CollectionRef> coll_map;
+ map<coll_t,CollectionRef> new_coll_map;
+
+ std::mutex nid_lock;
+ uint64_t nid_last;
+ uint64_t nid_max;
+
+ Throttle throttle_ops, throttle_bytes; ///< submit to commit
+
+ Finisher finisher;
+
+ KVSyncThread kv_sync_thread;
+ std::mutex kv_lock;
+ std::condition_variable kv_cond, kv_sync_cond;
+ bool kv_stop;
+ deque<TransContext*> kv_queue, kv_committing;
+
+ //Logger *logger;
+ PerfCounters *logger;
+ std::mutex reap_lock;
+ list<CollectionRef> removed_collections;
+
+
+ // --------------------------------------------------------
+ // private methods
+
+ void _init_logger();
+ void _shutdown_logger();
+
+ int _open_path();
+ void _close_path();
+ int _open_fsid(bool create);
+ int _lock_fsid();
+ int _read_fsid(uuid_d *f);
+ int _write_fsid();
+ void _close_fsid();
+ int _open_db(bool create);
+ void _close_db();
+ int _open_collections(int *errors=0);
+ void _close_collections();
+
+ int _open_super_meta();
+
+ CollectionRef _get_collection(coll_t cid);
+ void _queue_reap_collection(CollectionRef& c);
+ void _reap_collections();
+
+ void _assign_nid(TransContext *txc, OnodeRef o);
+
+ void _dump_onode(OnodeRef o);
+
+ TransContext *_txc_create(OpSequencer *osr);
+ void _txc_release(TransContext *txc, uint64_t offset, uint64_t length);
+ void _txc_add_transaction(TransContext *txc, Transaction *t);
+ void _txc_finalize(OpSequencer *osr, TransContext *txc);
+ void _txc_state_proc(TransContext *txc);
+ void _txc_finish_kv(TransContext *txc);
+ void _txc_finish(TransContext *txc);
+
+ void _osr_reap_done(OpSequencer *osr);
+
+ void _kv_sync_thread();
+ void _kv_stop() {
+ {
+ std::lock_guard<std::mutex> l(kv_lock);
+ kv_stop = true;
+ kv_cond.notify_all();
+ }
+ kv_sync_thread.join();
+ kv_stop = false;
+ }
+
+ void _do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl);
+ void _do_write_stripe(TransContext *txc, OnodeRef o,
+ uint64_t offset, bufferlist& bl);
+ void _do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset);
+
+ int _collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end,
+ int max, vector<ghobject_t> *ls, ghobject_t *next);
+
+public:
+ KStore(CephContext *cct, const string& path);
+ ~KStore() override;
+
+ string get_type() override {
+ return "kstore";
+ }
+
+ bool needs_journal() override { return false; };
+ bool wants_journal() override { return false; };
+ bool allows_journal() override { return false; };
+
+ static int get_block_device_fsid(const string& path, uuid_d *fsid);
+
+ bool test_mount_in_use() override;
+
+ int mount() override;
+ int umount() override;
+ void _sync();
+
+ int fsck(bool deep) override;
+
+
+ int validate_hobject_key(const hobject_t &obj) const override {
+ return 0;
+ }
+ unsigned get_max_attr_name_length() override {
+ return 256; // arbitrary; there is no real limit internally
+ }
+
+ int mkfs() override;
+ int mkjournal() override {
+ return 0;
+ }
+ void dump_perf_counters(Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+ void get_db_statistics(Formatter *f) override {
+ db->get_statistics(f);
+ }
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
+
+ CollectionHandle open_collection(const coll_t& c) override;
+ CollectionHandle create_new_collection(const coll_t& c) override;
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ using ObjectStore::exists;
+ bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+ using ObjectStore::stat;
+ int stat(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override; // struct stat?
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ using ObjectStore::read;
+ int read(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0) override;
+ int _do_read(
+ OnodeRef o,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0);
+
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& outbl) override;
+ using ObjectStore::getattr;
+ int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr& value) override;
+ using ObjectStore::getattrs;
+ int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override;
+
+ int list_collections(vector<coll_t>& ls) override;
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(
+ CollectionHandle &c, const ghobject_t& start, const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next) override;
+
+ using ObjectStore::omap_get;
+ int omap_get(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ ) override;
+
+ using ObjectStore::omap_get_header;
+ /// Get omap header
+ int omap_get_header(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) override;
+
+ using ObjectStore::omap_get_keys;
+ /// Get keys defined on oid
+ int omap_get_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ ) override;
+
+ using ObjectStore::omap_get_values;
+ /// Get key values
+ int omap_get_values(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ ) override;
+
+ using ObjectStore::omap_check_keys;
+ /// Filters keys into out which are defined on oid
+ int omap_check_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ ) override;
+
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle& c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) override;
+
+ void set_fsid(uuid_d u) override {
+ fsid = u;
+ }
+ uuid_d get_fsid() override {
+ return fsid;
+ }
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+ return num_objects * 300; //assuming per-object overhead is 300 bytes
+ }
+
+ objectstore_perf_stat_t get_cur_stats() override {
+ return objectstore_perf_stat_t();
+ }
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+
+
+ int queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) override;
+
+ void compact () override {
+ ceph_assert(db);
+ db->compact();
+ }
+
+private:
+ // --------------------------------------------------------
+ // write ops
+
+ int _write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len,
+ bufferlist& bl,
+ uint32_t fadvise_flags);
+ int _do_write(TransContext *txc,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags);
+ int _touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len);
+ int _do_truncate(TransContext *txc,
+ OnodeRef o,
+ uint64_t offset);
+ int _truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset);
+ int _remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _do_remove(TransContext *txc,
+ OnodeRef o);
+ int _setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val);
+ int _setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset);
+ int _rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name);
+ int _rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ void _do_omap_clear(TransContext *txc, uint64_t id);
+ int _omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl);
+ int _omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& header);
+ int _omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const bufferlist& bl);
+ int _omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last);
+ int _setallochint(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+ int _clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo);
+ int _clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff);
+ int _rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid);
+ int _create_collection(TransContext *txc, coll_t cid, unsigned bits,
+ CollectionRef *c);
+ int _remove_collection(TransContext *txc, coll_t cid, CollectionRef *c);
+ int _split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem);
+ int _merge_collection(TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits);
+
+};
+
+static inline void intrusive_ptr_add_ref(KStore::Onode *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(KStore::Onode *o) {
+ o->put();
+}
+
+static inline void intrusive_ptr_add_ref(KStore::OpSequencer *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(KStore::OpSequencer *o) {
+ o->put();
+}
+
+#endif
diff --git a/src/os/kstore/kstore_types.cc b/src/os/kstore/kstore_types.cc
new file mode 100644
index 00000000..07270374
--- /dev/null
+++ b/src/os/kstore/kstore_types.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "kstore_types.h"
+#include "common/Formatter.h"
+#include "include/stringify.h"
+
+// cnode_t
+
+void kstore_cnode_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(bits, bl);
+ ENCODE_FINISH(bl);
+}
+
+void kstore_cnode_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(bits, p);
+ DECODE_FINISH(p);
+}
+
+void kstore_cnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("bits", bits);
+}
+
+void kstore_cnode_t::generate_test_instances(list<kstore_cnode_t*>& o)
+{
+ o.push_back(new kstore_cnode_t());
+ o.push_back(new kstore_cnode_t(0));
+ o.push_back(new kstore_cnode_t(123));
+}
+
+
+// kstore_onode_t
+
+void kstore_onode_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(nid, bl);
+ encode(size, bl);
+ encode(attrs, bl);
+ encode(omap_head, bl);
+ encode(stripe_size, bl);
+ encode(expected_object_size, bl);
+ encode(expected_write_size, bl);
+ encode(alloc_hint_flags, bl);
+ ENCODE_FINISH(bl);
+}
+
+void kstore_onode_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(nid, p);
+ decode(size, p);
+ decode(attrs, p);
+ decode(omap_head, p);
+ decode(stripe_size, p);
+ decode(expected_object_size, p);
+ decode(expected_write_size, p);
+ decode(alloc_hint_flags, p);
+ DECODE_FINISH(p);
+}
+
+void kstore_onode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("nid", nid);
+ f->dump_unsigned("size", size);
+ f->open_object_section("attrs");
+ for (map<string,bufferptr>::const_iterator p = attrs.begin();
+ p != attrs.end(); ++p) {
+ f->open_object_section("attr");
+ f->dump_string("name", p->first);
+ f->dump_unsigned("len", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_unsigned("omap_head", omap_head);
+ f->dump_unsigned("stripe_size", stripe_size);
+ f->dump_unsigned("expected_object_size", expected_object_size);
+ f->dump_unsigned("expected_write_size", expected_write_size);
+ f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+}
+
+void kstore_onode_t::generate_test_instances(list<kstore_onode_t*>& o)
+{
+ o.push_back(new kstore_onode_t());
+ // FIXME
+}
diff --git a/src/os/kstore/kstore_types.h b/src/os/kstore/kstore_types.h
new file mode 100644
index 00000000..13c33fb6
--- /dev/null
+++ b/src/os/kstore/kstore_types.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_KSTORE_TYPES_H
+#define CEPH_OSD_KSTORE_TYPES_H
+
+#include <ostream>
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+
+namespace ceph {
+ class Formatter;
+}
+/// collection metadata
+struct kstore_cnode_t {
+ uint32_t bits; ///< how many bits of coll pgid are significant
+
+ explicit kstore_cnode_t(int b=0) : bits(b) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<kstore_cnode_t*>& o);
+};
+WRITE_CLASS_ENCODER(kstore_cnode_t)
+
+/// onode: per-object metadata
+struct kstore_onode_t {
+ uint64_t nid; ///< numeric id (locally unique)
+ uint64_t size; ///< object size
+ map<string, bufferptr> attrs; ///< attrs
+ uint64_t omap_head; ///< id for omap root node
+ uint32_t stripe_size; ///< stripe size
+
+ uint32_t expected_object_size;
+ uint32_t expected_write_size;
+ uint32_t alloc_hint_flags;
+
+ kstore_onode_t()
+ : nid(0),
+ size(0),
+ omap_head(0),
+ stripe_size(0),
+ expected_object_size(0),
+ expected_write_size(0),
+ alloc_hint_flags(0) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<kstore_onode_t*>& o);
+};
+WRITE_CLASS_ENCODER(kstore_onode_t)
+
+#endif
diff --git a/src/os/kv.h b/src/os/kv.h
new file mode 100644
index 00000000..64048b08
--- /dev/null
+++ b/src/os/kv.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_KV_H
+#define CEPH_OS_KV_H
+
+#include <string>
+#include "include/byteorder.h"
+
+// some key encoding helpers
+template<typename T>
+inline static void _key_encode_u32(uint32_t u, T *key) {
+ uint32_t bu;
+#ifdef CEPH_BIG_ENDIAN
+ bu = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ bu = swab(u);
+#else
+# error wtf
+#endif
+ key->append((char*)&bu, 4);
+}
+
+template<typename T>
+inline static void _key_encode_u32(uint32_t u, size_t pos, T *key) {
+ uint32_t bu;
+#ifdef CEPH_BIG_ENDIAN
+ bu = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ bu = swab(u);
+#else
+# error wtf
+#endif
+ key->replace(pos, sizeof(bu), (char*)&bu, sizeof(bu));
+}
+
+inline static const char *_key_decode_u32(const char *key, uint32_t *pu) {
+ uint32_t bu;
+ memcpy(&bu, key, 4);
+#ifdef CEPH_BIG_ENDIAN
+ *pu = bu;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ *pu = swab(bu);
+#else
+# error wtf
+#endif
+ return key + 4;
+}
+
+template<typename T>
+inline static void _key_encode_u64(uint64_t u, T *key) {
+ uint64_t bu;
+#ifdef CEPH_BIG_ENDIAN
+ bu = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ bu = swab(u);
+#else
+# error wtf
+#endif
+ key->append((char*)&bu, 8);
+}
+
+inline static const char *_key_decode_u64(const char *key, uint64_t *pu) {
+ uint64_t bu;
+ memcpy(&bu, key, 8);
+#ifdef CEPH_BIG_ENDIAN
+ *pu = bu;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ *pu = swab(bu);
+#else
+# error wtf
+#endif
+ return key + 8;
+}
+
+#endif
diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc
new file mode 100644
index 00000000..dc1d5ff5
--- /dev/null
+++ b/src/os/memstore/MemStore.cc
@@ -0,0 +1,1801 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#include "include/types.h"
+#include "include/stringify.h"
+#include "include/unordered_map.h"
+#include "common/errno.h"
+#include "MemStore.h"
+#include "include/compat.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "memstore(" << path << ") "
+
+// for comparing collections for lock ordering
+bool operator>(const MemStore::CollectionRef& l,
+ const MemStore::CollectionRef& r)
+{
+ return (unsigned long)l.get() > (unsigned long)r.get();
+}
+
+
+int MemStore::mount()
+{
+ int r = _load();
+ if (r < 0)
+ return r;
+ finisher.start();
+ return 0;
+}
+
+int MemStore::umount()
+{
+ finisher.wait_for_empty();
+ finisher.stop();
+ return _save();
+}
+
+int MemStore::_save()
+{
+ dout(10) << __func__ << dendl;
+ dump_all();
+ set<coll_t> collections;
+ for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p) {
+ dout(20) << __func__ << " coll " << p->first << " " << p->second << dendl;
+ collections.insert(p->first);
+ bufferlist bl;
+ ceph_assert(p->second);
+ p->second->encode(bl);
+ string fn = path + "/" + stringify(p->first);
+ int r = bl.write_file(fn.c_str());
+ if (r < 0)
+ return r;
+ }
+
+ string fn = path + "/collections";
+ bufferlist bl;
+ encode(collections, bl);
+ int r = bl.write_file(fn.c_str());
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+void MemStore::dump_all()
+{
+ Formatter *f = Formatter::create("json-pretty");
+ f->open_object_section("store");
+ dump(f);
+ f->close_section();
+ dout(0) << "dump:";
+ f->flush(*_dout);
+ *_dout << dendl;
+ delete f;
+}
+
+void MemStore::dump(Formatter *f)
+{
+ f->open_array_section("collections");
+ for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p) {
+ f->open_object_section("collection");
+ f->dump_string("name", stringify(p->first));
+
+ f->open_array_section("xattrs");
+ for (map<string,bufferptr>::iterator q = p->second->xattr.begin();
+ q != p->second->xattr.end();
+ ++q) {
+ f->open_object_section("xattr");
+ f->dump_string("name", q->first);
+ f->dump_int("length", q->second.length());
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("objects");
+ for (map<ghobject_t,ObjectRef>::iterator q = p->second->object_map.begin();
+ q != p->second->object_map.end();
+ ++q) {
+ f->open_object_section("object");
+ f->dump_string("name", stringify(q->first));
+ if (q->second)
+ q->second->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->close_section();
+ }
+ f->close_section();
+}
+
+int MemStore::_load()
+{
+ dout(10) << __func__ << dendl;
+ bufferlist bl;
+ string fn = path + "/collections";
+ string err;
+ int r = bl.read_file(fn.c_str(), &err);
+ if (r < 0)
+ return r;
+
+ set<coll_t> collections;
+ auto p = bl.cbegin();
+ decode(collections, p);
+
+ for (set<coll_t>::iterator q = collections.begin();
+ q != collections.end();
+ ++q) {
+ string fn = path + "/" + stringify(*q);
+ bufferlist cbl;
+ int r = cbl.read_file(fn.c_str(), &err);
+ if (r < 0)
+ return r;
+ CollectionRef c(new Collection(cct, *q));
+ auto p = cbl.cbegin();
+ c->decode(p);
+ coll_map[*q] = c;
+ used_bytes += c->used_bytes();
+ }
+
+ dump_all();
+
+ return 0;
+}
+
+void MemStore::set_fsid(uuid_d u)
+{
+ int r = write_meta("fsid", stringify(u));
+ ceph_assert(r >= 0);
+}
+
+uuid_d MemStore::get_fsid()
+{
+ string fsid_str;
+ int r = read_meta("fsid", &fsid_str);
+ ceph_assert(r >= 0);
+ uuid_d uuid;
+ bool b = uuid.parse(fsid_str.c_str());
+ ceph_assert(b);
+ return uuid;
+}
+
+int MemStore::mkfs()
+{
+ string fsid_str;
+ int r = read_meta("fsid", &fsid_str);
+ if (r == -ENOENT) {
+ uuid_d fsid;
+ fsid.generate_random();
+ fsid_str = stringify(fsid);
+ r = write_meta("fsid", fsid_str);
+ if (r < 0)
+ return r;
+ dout(1) << __func__ << " new fsid " << fsid_str << dendl;
+ } else if (r < 0) {
+ return r;
+ } else {
+ dout(1) << __func__ << " had fsid " << fsid_str << dendl;
+ }
+
+ string fn = path + "/collections";
+ derr << path << dendl;
+ bufferlist bl;
+ set<coll_t> collections;
+ encode(collections, bl);
+ r = bl.write_file(fn.c_str());
+ if (r < 0)
+ return r;
+
+ r = write_meta("type", "memstore");
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int MemStore::statfs(struct store_statfs_t *st, osd_alert_list_t* alerts)
+{
+ dout(10) << __func__ << dendl;
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ st->reset();
+ st->total = cct->_conf->memstore_device_bytes;
+ st->available = std::max<int64_t>(st->total - used_bytes, 0);
+ dout(10) << __func__ << ": used_bytes: " << used_bytes
+ << "/" << cct->_conf->memstore_device_bytes << dendl;
+ return 0;
+}
+
+int MemStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
+{
+ return -ENOTSUP;
+}
+
+objectstore_perf_stat_t MemStore::get_cur_stats()
+{
+ // fixme
+ return objectstore_perf_stat_t();
+}
+
+MemStore::CollectionRef MemStore::get_collection(const coll_t& cid)
+{
+ std::shared_lock l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+ObjectStore::CollectionHandle MemStore::create_new_collection(const coll_t& cid)
+{
+ std::lock_guard l{coll_lock};
+ Collection *c = new Collection(cct, cid);
+ new_coll_map[cid] = c;
+ return c;
+}
+
+
+// ---------------
+// read operations
+
+bool MemStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+ if (!c->exists)
+ return false;
+
+ // Perform equivalent of c->get_object_(oid) != NULL. In C++11 the
+ // shared_ptr needs to be compared to nullptr.
+ return (bool)c->get_object(oid);
+}
+
+int MemStore::stat(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ st->st_size = o->get_size();
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ return 0;
+}
+
+int MemStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ return -EOPNOTSUPP;
+}
+
+int MemStore::read(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << " "
+ << offset << "~" << len << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ if (offset >= o->get_size())
+ return 0;
+ size_t l = len;
+ if (l == 0 && offset == 0) // note: len == 0 means read the entire object
+ l = o->get_size();
+ else if (offset + l > o->get_size())
+ l = o->get_size() - offset;
+ bl.clear();
+ return o->read(offset, l, bl);
+}
+
+int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len, bufferlist& bl)
+{
+ map<uint64_t, uint64_t> destmap;
+ int r = fiemap(ch, oid, offset, len, destmap);
+ if (r >= 0)
+ encode(destmap, bl);
+ return r;
+}
+
+int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << " " << offset << "~"
+ << len << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ size_t l = len;
+ if (offset + l > o->get_size())
+ l = o->get_size() - offset;
+ if (offset >= o->get_size())
+ goto out;
+ destmap[offset] = l;
+ out:
+ return 0;
+}
+
+int MemStore::getattr(CollectionHandle &c_, const ghobject_t& oid,
+ const char *name, bufferptr& value)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ string k(name);
+ std::lock_guard lock{o->xattr_mutex};
+ if (!o->xattr.count(k)) {
+ return -ENODATA;
+ }
+ value = o->xattr[k];
+ return 0;
+}
+
+int MemStore::getattrs(CollectionHandle &c_, const ghobject_t& oid,
+ map<string,bufferptr>& aset)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ aset = o->xattr;
+ return 0;
+}
+
+int MemStore::list_collections(vector<coll_t>& ls)
+{
+ dout(10) << __func__ << dendl;
+ std::shared_lock l{coll_lock};
+ for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p) {
+ ls.push_back(p->first);
+ }
+ return 0;
+}
+
+bool MemStore::collection_exists(const coll_t& cid)
+{
+ dout(10) << __func__ << " " << cid << dendl;
+ std::shared_lock l{coll_lock};
+ return coll_map.count(cid);
+}
+
+int MemStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(10) << __func__ << " " << ch->cid << dendl;
+ CollectionRef c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ *empty = c->object_map.empty();
+ return 0;
+}
+
+int MemStore::collection_bits(CollectionHandle& ch)
+{
+ dout(10) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ return c->bits;
+}
+
+int MemStore::collection_list(CollectionHandle& ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next)
+{
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+
+ dout(10) << __func__ << " cid " << ch->cid << " start " << start
+ << " end " << end << dendl;
+ map<ghobject_t,ObjectRef>::iterator p = c->object_map.lower_bound(start);
+ while (p != c->object_map.end() &&
+ ls->size() < (unsigned)max &&
+ p->first < end) {
+ ls->push_back(p->first);
+ ++p;
+ }
+ if (next != NULL) {
+ if (p == c->object_map.end())
+ *next = ghobject_t::get_max();
+ else
+ *next = p->first;
+ }
+ dout(10) << __func__ << " cid " << ch->cid << " got " << ls->size() << dendl;
+ return 0;
+}
+
+int MemStore::omap_get(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ *header = o->omap_header;
+ *out = o->omap;
+ return 0;
+}
+
+int MemStore::omap_get_header(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ *header = o->omap_header;
+ return 0;
+}
+
+int MemStore::omap_get_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ for (map<string,bufferlist>::iterator p = o->omap.begin();
+ p != o->omap.end();
+ ++p)
+ keys->insert(p->first);
+ return 0;
+}
+
+int MemStore::omap_get_values(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ for (set<string>::const_iterator p = keys.begin();
+ p != keys.end();
+ ++p) {
+ map<string,bufferlist>::iterator q = o->omap.find(*p);
+ if (q != o->omap.end())
+ out->insert(*q);
+ }
+ return 0;
+}
+
+int MemStore::omap_check_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ for (set<string>::const_iterator p = keys.begin();
+ p != keys.end();
+ ++p) {
+ map<string,bufferlist>::iterator q = o->omap.find(*p);
+ if (q != o->omap.end())
+ out->insert(*p);
+ }
+ return 0;
+}
+
+class MemStore::OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+ CollectionRef c;
+ ObjectRef o;
+ map<string,bufferlist>::iterator it;
+public:
+ OmapIteratorImpl(CollectionRef c, ObjectRef o)
+ : c(c), o(o), it(o->omap.begin()) {}
+
+ int seek_to_first() override {
+ std::lock_guard lock{o->omap_mutex};
+ it = o->omap.begin();
+ return 0;
+ }
+ int upper_bound(const string &after) override {
+ std::lock_guard lock{o->omap_mutex};
+ it = o->omap.upper_bound(after);
+ return 0;
+ }
+ int lower_bound(const string &to) override {
+ std::lock_guard lock{o->omap_mutex};
+ it = o->omap.lower_bound(to);
+ return 0;
+ }
+ bool valid() override {
+ std::lock_guard lock{o->omap_mutex};
+ return it != o->omap.end();
+ }
+ int next() override {
+ std::lock_guard lock{o->omap_mutex};
+ ++it;
+ return 0;
+ }
+ string key() override {
+ std::lock_guard lock{o->omap_mutex};
+ return it->first;
+ }
+ bufferlist value() override {
+ std::lock_guard lock{o->omap_mutex};
+ return it->second;
+ }
+ int status() override {
+ return 0;
+ }
+};
+
+ObjectMap::ObjectMapIterator MemStore::get_omap_iterator(
+ CollectionHandle& ch,
+ const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return ObjectMap::ObjectMapIterator();
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o));
+}
+
+
+// ---------------
+// write operations
+
+int MemStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ // because memstore operations are synchronous, we can implement the
+ // Sequencer with a mutex. this guarantees ordering on a given sequencer,
+ // while allowing operations on different sequencers to happen in parallel
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::unique_lock lock{c->sequencer_mutex};
+
+ for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
+ // poke the TPHandle heartbeat just to exercise that code path
+ if (handle)
+ handle->reset_tp_timeout();
+
+ _do_transaction(*p);
+ }
+
+ Context *on_apply = NULL, *on_apply_sync = NULL, *on_commit = NULL;
+ ObjectStore::Transaction::collect_contexts(tls, &on_apply, &on_commit,
+ &on_apply_sync);
+ if (on_apply_sync)
+ on_apply_sync->complete(0);
+ if (on_apply)
+ finisher.queue(on_apply);
+ if (on_commit)
+ finisher.queue(on_commit);
+ return 0;
+}
+
+void MemStore::_do_transaction(Transaction& t)
+{
+ Transaction::iterator i = t.begin();
+ int pos = 0;
+
+ while (i.have_op()) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _touch(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(cid, oid, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ r = _truncate(cid, oid, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(cid, oid, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ r = _rmattr(cid, oid, name.c_str());
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _rmattrs(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ r = _clone(cid, oid, noid);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _clone_range(cid, oid, noid, off, len, off);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(cid, oid, noid, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(cid, op->split_bits);
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t type = op->hint_type;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _destroy_collection(cid);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _collection_add(ncid, ocid, oid);
+ }
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ coll_t oldcid = i.get_cid(op->cid);
+ ghobject_t oldoid = i.get_oid(op->oid);
+ coll_t newcid = i.get_cid(op->dest_cid);
+ ghobject_t newoid = i.get_oid(op->dest_oid);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid);
+ if (r == -ENOENT)
+ r = 0;
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oldoid = i.get_oid(op->oid);
+ ghobject_t newoid = i.get_oid(op->dest_oid);
+ r = _collection_move_rename(cid, oldoid, cid, newoid);
+ if (r == -ENOENT)
+ r = 0;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ {
+ ceph_abort_msg("not implemented");
+ }
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ {
+ ceph_abort_msg("not implemented");
+ }
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ ceph_abort_msg("not implemented");
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _omap_clear(cid, oid);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ bufferlist aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(cid, oid, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ bufferlist keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(cid, oid, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkeyrange(cid, oid, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(cid, oid, bl);
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ r = _split_collection(cid, bits, rem, dest);
+ }
+ break;
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ coll_t dest = i.get_cid(op->dest_cid);
+ r = _merge_collection(cid, bits, dest);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ r = 0;
+ }
+ break;
+
+ case Transaction::OP_COLL_SET_BITS:
+ {
+ r = 0;
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from MemStore, misconfigured cluster or insufficient memory";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ dump_all();
+ }
+
+ derr << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+ ceph_abort_msg("unexpected error");
+ }
+ }
+
+ ++pos;
+ }
+}
+
+int MemStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ c->get_or_create_object(oid);
+ return 0;
+}
+
+int MemStore::_write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len, const bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " "
+ << offset << "~" << len << dendl;
+ ceph_assert(len == bl.length());
+
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ if (len > 0) {
+ const ssize_t old_size = o->get_size();
+ o->write(offset, bl);
+ used_bytes += (o->get_size() - old_size);
+ }
+
+ return 0;
+}
+
+int MemStore::_zero(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
+ << len << dendl;
+ bufferlist bl;
+ bl.append_zero(len);
+ return _write(cid, oid, offset, len, bl);
+}
+
+int MemStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << size << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ const ssize_t old_size = o->get_size();
+ int r = o->truncate(size);
+ used_bytes += (o->get_size() - old_size);
+ return r;
+}
+
+int MemStore::_remove(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+ std::lock_guard l{c->lock};
+
+ auto i = c->object_hash.find(oid);
+ if (i == c->object_hash.end())
+ return -ENOENT;
+ used_bytes -= i->second->get_size();
+ c->object_hash.erase(i);
+ c->object_map.erase(oid);
+
+ return 0;
+}
+
+int MemStore::_setattrs(const coll_t& cid, const ghobject_t& oid,
+ map<string,bufferptr>& aset)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ for (map<string,bufferptr>::const_iterator p = aset.begin(); p != aset.end(); ++p)
+ o->xattr[p->first] = p->second;
+ return 0;
+}
+
+int MemStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << name << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ auto i = o->xattr.find(name);
+ if (i == o->xattr.end())
+ return -ENODATA;
+ o->xattr.erase(i);
+ return 0;
+}
+
+int MemStore::_rmattrs(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ o->xattr.clear();
+ return 0;
+}
+
+int MemStore::_clone(const coll_t& cid, const ghobject_t& oldoid,
+ const ghobject_t& newoid)
+{
+ dout(10) << __func__ << " " << cid << " " << oldoid
+ << " -> " << newoid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef oo = c->get_object(oldoid);
+ if (!oo)
+ return -ENOENT;
+ ObjectRef no = c->get_or_create_object(newoid);
+ used_bytes += oo->get_size() - no->get_size();
+ no->clone(oo.get(), 0, oo->get_size(), 0);
+
+ // take xattr and omap locks with std::lock()
+ std::scoped_lock l{oo->xattr_mutex,
+ no->xattr_mutex,
+ oo->omap_mutex,
+ no->omap_mutex};
+
+ no->omap_header = oo->omap_header;
+ no->omap = oo->omap;
+ no->xattr = oo->xattr;
+ return 0;
+}
+
+int MemStore::_clone_range(const coll_t& cid, const ghobject_t& oldoid,
+ const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(10) << __func__ << " " << cid << " "
+ << oldoid << " " << srcoff << "~" << len << " -> "
+ << newoid << " " << dstoff << "~" << len
+ << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef oo = c->get_object(oldoid);
+ if (!oo)
+ return -ENOENT;
+ ObjectRef no = c->get_or_create_object(newoid);
+ if (srcoff >= oo->get_size())
+ return 0;
+ if (srcoff + len >= oo->get_size())
+ len = oo->get_size() - srcoff;
+
+ const ssize_t old_size = no->get_size();
+ no->clone(oo.get(), srcoff, len, dstoff);
+ used_bytes += (no->get_size() - old_size);
+
+ return len;
+}
+
+int MemStore::_omap_clear(const coll_t& cid, const ghobject_t &oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ o->omap.clear();
+ o->omap_header.clear();
+ return 0;
+}
+
+int MemStore::_omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+ bufferlist& aset_bl)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ auto p = aset_bl.cbegin();
+ __u32 num;
+ decode(num, p);
+ while (num--) {
+ string key;
+ decode(key, p);
+ decode(o->omap[key], p);
+ }
+ return 0;
+}
+
+int MemStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &oid,
+ bufferlist& keys_bl)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ auto p = keys_bl.cbegin();
+ __u32 num;
+ decode(num, p);
+ while (num--) {
+ string key;
+ decode(key, p);
+ o->omap.erase(key);
+ }
+ return 0;
+}
+
+int MemStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const string& first, const string& last)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << first
+ << " " << last << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ map<string,bufferlist>::iterator p = o->omap.lower_bound(first);
+ map<string,bufferlist>::iterator e = o->omap.lower_bound(last);
+ o->omap.erase(p, e);
+ return 0;
+}
+
+int MemStore::_omap_setheader(const coll_t& cid, const ghobject_t &oid,
+ const bufferlist &bl)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ o->omap_header = bl;
+ return 0;
+}
+
+int MemStore::_create_collection(const coll_t& cid, int bits)
+{
+ dout(10) << __func__ << " " << cid << dendl;
+ std::lock_guard l{coll_lock};
+ auto result = coll_map.insert(std::make_pair(cid, CollectionRef()));
+ if (!result.second)
+ return -EEXIST;
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ result.first->second = p->second;
+ result.first->second->bits = bits;
+ new_coll_map.erase(p);
+ return 0;
+}
+
+int MemStore::_destroy_collection(const coll_t& cid)
+{
+ dout(10) << __func__ << " " << cid << dendl;
+ std::lock_guard l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return -ENOENT;
+ {
+ std::shared_lock l2{cp->second->lock};
+ if (!cp->second->object_map.empty())
+ return -ENOTEMPTY;
+ cp->second->exists = false;
+ }
+ used_bytes -= cp->second->used_bytes();
+ coll_map.erase(cp);
+ return 0;
+}
+
+int MemStore::_collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << ocid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+ CollectionRef oc = get_collection(ocid);
+ if (!oc)
+ return -ENOENT;
+
+ std::scoped_lock l{std::min(&(*c), &(*oc))->lock,
+ std::max(&(*c), &(*oc))->lock};
+
+ if (c->object_hash.count(oid))
+ return -EEXIST;
+ if (oc->object_hash.count(oid) == 0)
+ return -ENOENT;
+ ObjectRef o = oc->object_hash[oid];
+ c->object_map[oid] = o;
+ c->object_hash[oid] = o;
+ return 0;
+}
+
+int MemStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << oldcid << " " << oldoid << " -> "
+ << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+ CollectionRef oc = get_collection(oldcid);
+ if (!oc)
+ return -ENOENT;
+
+ // note: c and oc may be the same
+ ceph_assert(&(*c) == &(*oc));
+
+ std::lock_guard l{c->lock};
+ if (c->object_hash.count(oid))
+ return -EEXIST;
+ if (oc->object_hash.count(oldoid) == 0)
+ return -ENOENT;
+ {
+ ObjectRef o = oc->object_hash[oldoid];
+ c->object_map[oid] = o;
+ c->object_hash[oid] = o;
+ oc->object_map.erase(oldoid);
+ oc->object_hash.erase(oldoid);
+ }
+ return 0;
+}
+
+int MemStore::_split_collection(const coll_t& cid, uint32_t bits, uint32_t match,
+ coll_t dest)
+{
+ dout(10) << __func__ << " " << cid << " " << bits << " " << match << " "
+ << dest << dendl;
+ CollectionRef sc = get_collection(cid);
+ if (!sc)
+ return -ENOENT;
+ CollectionRef dc = get_collection(dest);
+ if (!dc)
+ return -ENOENT;
+
+ std::scoped_lock l{std::min(&(*sc), &(*dc))->lock,
+ std::max(&(*sc), &(*dc))->lock};
+
+ map<ghobject_t,ObjectRef>::iterator p = sc->object_map.begin();
+ while (p != sc->object_map.end()) {
+ if (p->first.match(bits, match)) {
+ dout(20) << " moving " << p->first << dendl;
+ dc->object_map.insert(make_pair(p->first, p->second));
+ dc->object_hash.insert(make_pair(p->first, p->second));
+ sc->object_hash.erase(p->first);
+ sc->object_map.erase(p++);
+ } else {
+ ++p;
+ }
+ }
+
+ sc->bits = bits;
+ ceph_assert(dc->bits == (int)bits);
+
+ return 0;
+}
+
+int MemStore::_merge_collection(const coll_t& cid, uint32_t bits, coll_t dest)
+{
+ dout(10) << __func__ << " " << cid << " " << bits << " "
+ << dest << dendl;
+ CollectionRef sc = get_collection(cid);
+ if (!sc)
+ return -ENOENT;
+ CollectionRef dc = get_collection(dest);
+ if (!dc)
+ return -ENOENT;
+ {
+ std::scoped_lock l{std::min(&(*sc), &(*dc))->lock,
+ std::max(&(*sc), &(*dc))->lock};
+
+ map<ghobject_t,ObjectRef>::iterator p = sc->object_map.begin();
+ while (p != sc->object_map.end()) {
+ dout(20) << " moving " << p->first << dendl;
+ dc->object_map.insert(make_pair(p->first, p->second));
+ dc->object_hash.insert(make_pair(p->first, p->second));
+ sc->object_hash.erase(p->first);
+ sc->object_map.erase(p++);
+ }
+
+ dc->bits = bits;
+ }
+
+ {
+ std::lock_guard l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ ceph_assert(cp != coll_map.end());
+ used_bytes -= cp->second->used_bytes();
+ coll_map.erase(cp);
+ }
+
+ return 0;
+}
+
+namespace {
+struct BufferlistObject : public MemStore::Object {
+ ceph::spinlock mutex;
+ bufferlist data;
+
+ size_t get_size() const override { return data.length(); }
+
+ int read(uint64_t offset, uint64_t len, bufferlist &bl) override;
+ int write(uint64_t offset, const bufferlist &bl) override;
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff) override;
+ int truncate(uint64_t offset) override;
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ encode_base(bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) override {
+ DECODE_START(1, p);
+ decode(data, p);
+ decode_base(p);
+ DECODE_FINISH(p);
+ }
+};
+}
+// BufferlistObject
+int BufferlistObject::read(uint64_t offset, uint64_t len,
+ bufferlist &bl)
+{
+ std::lock_guard<decltype(mutex)> lock(mutex);
+ bl.substr_of(data, offset, len);
+ return bl.length();
+}
+
+int BufferlistObject::write(uint64_t offset, const bufferlist &src)
+{
+ unsigned len = src.length();
+
+ std::lock_guard<decltype(mutex)> lock(mutex);
+
+ // before
+ bufferlist newdata;
+ if (get_size() >= offset) {
+ newdata.substr_of(data, 0, offset);
+ } else {
+ if (get_size()) {
+ newdata.substr_of(data, 0, get_size());
+ }
+ newdata.append_zero(offset - get_size());
+ }
+
+ newdata.append(src);
+
+ // after
+ if (get_size() > offset + len) {
+ bufferlist tail;
+ tail.substr_of(data, offset + len, get_size() - (offset + len));
+ newdata.append(tail);
+ }
+
+ data.claim(newdata);
+ return 0;
+}
+
+int BufferlistObject::clone(Object *src, uint64_t srcoff,
+ uint64_t len, uint64_t dstoff)
+{
+ auto srcbl = dynamic_cast<BufferlistObject*>(src);
+ if (srcbl == nullptr)
+ return -ENOTSUP;
+
+ bufferlist bl;
+ {
+ std::lock_guard<decltype(srcbl->mutex)> lock(srcbl->mutex);
+ if (srcoff == dstoff && len == src->get_size()) {
+ data = srcbl->data;
+ return 0;
+ }
+ bl.substr_of(srcbl->data, srcoff, len);
+ }
+ return write(dstoff, bl);
+}
+
+int BufferlistObject::truncate(uint64_t size)
+{
+ std::lock_guard<decltype(mutex)> lock(mutex);
+ if (get_size() > size) {
+ bufferlist bl;
+ bl.substr_of(data, 0, size);
+ data.claim(bl);
+ } else if (get_size() == size) {
+ // do nothing
+ } else {
+ data.append_zero(size - get_size());
+ }
+ return 0;
+}
+
+// PageSetObject
+
+struct MemStore::PageSetObject : public Object {
+ PageSet data;
+ uint64_t data_len;
+#if defined(__GLIBCXX__)
+ // use a thread-local vector for the pages returned by PageSet, so we
+ // can avoid allocations in read/write()
+ static thread_local PageSet::page_vector tls_pages;
+#endif
+
+ explicit PageSetObject(size_t page_size) : data(page_size), data_len(0) {}
+
+ size_t get_size() const override { return data_len; }
+
+ int read(uint64_t offset, uint64_t len, bufferlist &bl) override;
+ int write(uint64_t offset, const bufferlist &bl) override;
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff) override;
+ int truncate(uint64_t offset) override;
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(data_len, bl);
+ data.encode(bl);
+ encode_base(bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) override {
+ DECODE_START(1, p);
+ decode(data_len, p);
+ data.decode(p);
+ decode_base(p);
+ DECODE_FINISH(p);
+ }
+};
+
+#if defined(__GLIBCXX__)
+// use a thread-local vector for the pages returned by PageSet, so we
+// can avoid allocations in read/write()
+thread_local PageSet::page_vector MemStore::PageSetObject::tls_pages;
+#define DEFINE_PAGE_VECTOR(name)
+#else
+#define DEFINE_PAGE_VECTOR(name) PageSet::page_vector name;
+#endif
+
+int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, bufferlist& bl)
+{
+ const auto start = offset;
+ const auto end = offset + len;
+ auto remaining = len;
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ data.get_range(offset, len, tls_pages);
+
+ // allocate a buffer for the data
+ buffer::ptr buf(len);
+
+ auto p = tls_pages.begin();
+ while (remaining) {
+ // no more pages in range
+ if (p == tls_pages.end() || (*p)->offset >= end) {
+ buf.zero(offset - start, remaining);
+ break;
+ }
+ auto page = *p;
+
+ // fill any holes between pages with zeroes
+ if (page->offset > offset) {
+ const auto count = std::min(remaining, page->offset - offset);
+ buf.zero(offset - start, count);
+ remaining -= count;
+ offset = page->offset;
+ if (!remaining)
+ break;
+ }
+
+ // read from page
+ const auto page_offset = offset - page->offset;
+ const auto count = min(remaining, data.get_page_size() - page_offset);
+
+ buf.copy_in(offset - start, count, page->data + page_offset);
+
+ remaining -= count;
+ offset += count;
+
+ ++p;
+ }
+
+ tls_pages.clear(); // drop page refs
+
+ bl.append(std::move(buf));
+ return len;
+}
+
+int MemStore::PageSetObject::write(uint64_t offset, const bufferlist &src)
+{
+ unsigned len = src.length();
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ // make sure the page range is allocated
+ data.alloc_range(offset, src.length(), tls_pages);
+
+ auto page = tls_pages.begin();
+
+ auto p = src.begin();
+ while (len > 0) {
+ unsigned page_offset = offset - (*page)->offset;
+ unsigned pageoff = data.get_page_size() - page_offset;
+ unsigned count = min(len, pageoff);
+ p.copy(count, (*page)->data + page_offset);
+ offset += count;
+ len -= count;
+ if (count == pageoff)
+ ++page;
+ }
+ if (data_len < offset)
+ data_len = offset;
+ tls_pages.clear(); // drop page refs
+ return 0;
+}
+
+int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff,
+ uint64_t len, uint64_t dstoff)
+{
+ const int64_t delta = dstoff - srcoff;
+
+ auto &src_data = static_cast<PageSetObject*>(src)->data;
+ const uint64_t src_page_size = src_data.get_page_size();
+
+ auto &dst_data = data;
+ const auto dst_page_size = dst_data.get_page_size();
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ PageSet::page_vector dst_pages;
+
+ while (len) {
+ // limit to 16 pages at a time so tls_pages doesn't balloon in size
+ auto count = std::min(len, (uint64_t)src_page_size * 16);
+ src_data.get_range(srcoff, count, tls_pages);
+
+ // allocate the destination range
+ // TODO: avoid allocating pages for holes in the source range
+ dst_data.alloc_range(srcoff + delta, count, dst_pages);
+ auto dst_iter = dst_pages.begin();
+
+ for (auto &src_page : tls_pages) {
+ auto sbegin = std::max(srcoff, src_page->offset);
+ auto send = std::min(srcoff + count, src_page->offset + src_page_size);
+
+ // zero-fill holes before src_page
+ if (srcoff < sbegin) {
+ while (dst_iter != dst_pages.end()) {
+ auto &dst_page = *dst_iter;
+ auto dbegin = std::max(srcoff + delta, dst_page->offset);
+ auto dend = std::min(sbegin + delta, dst_page->offset + dst_page_size);
+ std::fill(dst_page->data + dbegin - dst_page->offset,
+ dst_page->data + dend - dst_page->offset, 0);
+ if (dend < dst_page->offset + dst_page_size)
+ break;
+ ++dst_iter;
+ }
+ const auto c = sbegin - srcoff;
+ count -= c;
+ len -= c;
+ }
+
+ // copy data from src page to dst pages
+ while (dst_iter != dst_pages.end()) {
+ auto &dst_page = *dst_iter;
+ auto dbegin = std::max(sbegin + delta, dst_page->offset);
+ auto dend = std::min(send + delta, dst_page->offset + dst_page_size);
+
+ std::copy(src_page->data + (dbegin - delta) - src_page->offset,
+ src_page->data + (dend - delta) - src_page->offset,
+ dst_page->data + dbegin - dst_page->offset);
+ if (dend < dst_page->offset + dst_page_size)
+ break;
+ ++dst_iter;
+ }
+
+ const auto c = send - sbegin;
+ count -= c;
+ len -= c;
+ srcoff = send;
+ dstoff = send + delta;
+ }
+ tls_pages.clear(); // drop page refs
+
+ // zero-fill holes after the last src_page
+ if (count > 0) {
+ while (dst_iter != dst_pages.end()) {
+ auto &dst_page = *dst_iter;
+ auto dbegin = std::max(dstoff, dst_page->offset);
+ auto dend = std::min(dstoff + count, dst_page->offset + dst_page_size);
+ std::fill(dst_page->data + dbegin - dst_page->offset,
+ dst_page->data + dend - dst_page->offset, 0);
+ ++dst_iter;
+ }
+ srcoff += count;
+ dstoff += count;
+ len -= count;
+ }
+ dst_pages.clear(); // drop page refs
+ }
+
+ // update object size
+ if (data_len < dstoff)
+ data_len = dstoff;
+ return 0;
+}
+
+int MemStore::PageSetObject::truncate(uint64_t size)
+{
+ data.free_pages_after(size);
+ data_len = size;
+
+ const auto page_size = data.get_page_size();
+ const auto page_offset = size & ~(page_size-1);
+ if (page_offset == size)
+ return 0;
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ // write zeroes to the rest of the last page
+ data.get_range(page_offset, page_size, tls_pages);
+ if (tls_pages.empty())
+ return 0;
+
+ auto page = tls_pages.begin();
+ auto data = (*page)->data;
+ std::fill(data + (size - page_offset), data + page_size, 0);
+ tls_pages.clear(); // drop page ref
+ return 0;
+}
+
+
+MemStore::ObjectRef MemStore::Collection::create_object() const {
+ if (use_page_set)
+ return new PageSetObject(cct->_conf->memstore_page_size);
+ return new BufferlistObject();
+}
diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h
new file mode 100644
index 00000000..3d361631
--- /dev/null
+++ b/src/os/memstore/MemStore.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MEMSTORE_H
+#define CEPH_MEMSTORE_H
+
+#include <mutex>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/unordered_map.h"
+#include "common/Finisher.h"
+#include "common/RefCountedObj.h"
+#include "common/RWLock.h"
+#include "os/ObjectStore.h"
+#include "PageSet.h"
+#include "include/ceph_assert.h"
+
+class MemStore : public ObjectStore {
+public:
+ struct Object : public RefCountedObject {
+ ceph::mutex xattr_mutex{ceph::make_mutex("MemStore::Object::xattr_mutex")};
+ ceph::mutex omap_mutex{ceph::make_mutex("MemStore::Object::omap_mutex")};
+ map<string,bufferptr> xattr;
+ bufferlist omap_header;
+ map<string,bufferlist> omap;
+
+ typedef boost::intrusive_ptr<Object> Ref;
+ friend void intrusive_ptr_add_ref(Object *o) { o->get(); }
+ friend void intrusive_ptr_release(Object *o) { o->put(); }
+
+ Object() : RefCountedObject(nullptr, 0) {}
+ // interface for object data
+ virtual size_t get_size() const = 0;
+ virtual int read(uint64_t offset, uint64_t len, bufferlist &bl) = 0;
+ virtual int write(uint64_t offset, const bufferlist &bl) = 0;
+ virtual int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff) = 0;
+ virtual int truncate(uint64_t offset) = 0;
+ virtual void encode(bufferlist& bl) const = 0;
+ virtual void decode(bufferlist::const_iterator& p) = 0;
+
+ void encode_base(bufferlist& bl) const {
+ using ceph::encode;
+ encode(xattr, bl);
+ encode(omap_header, bl);
+ encode(omap, bl);
+ }
+ void decode_base(bufferlist::const_iterator& p) {
+ using ceph::decode;
+ decode(xattr, p);
+ decode(omap_header, p);
+ decode(omap, p);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("data_len", get_size());
+ f->dump_int("omap_header_len", omap_header.length());
+
+ f->open_array_section("xattrs");
+ for (map<string,bufferptr>::const_iterator p = xattr.begin();
+ p != xattr.end();
+ ++p) {
+ f->open_object_section("xattr");
+ f->dump_string("name", p->first);
+ f->dump_int("length", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("omap");
+ for (map<string,bufferlist>::const_iterator p = omap.begin();
+ p != omap.end();
+ ++p) {
+ f->open_object_section("pair");
+ f->dump_string("key", p->first);
+ f->dump_int("length", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ }
+ };
+ typedef Object::Ref ObjectRef;
+
+ struct PageSetObject;
+ struct Collection : public CollectionImpl {
+ int bits = 0;
+ CephContext *cct;
+ bool use_page_set;
+ ceph::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup
+ map<ghobject_t, ObjectRef> object_map; ///< for iteration
+ map<string,bufferptr> xattr;
+ /// for object_{map,hash}
+ ceph::shared_mutex lock{
+ ceph::make_shared_mutex("MemStore::Collection::lock", true, false)};
+
+ bool exists = true;
+ ceph::mutex sequencer_mutex{
+ ceph::make_mutex("MemStore::Collection::sequencer_mutex")};
+
+ typedef boost::intrusive_ptr<Collection> Ref;
+ friend void intrusive_ptr_add_ref(Collection *c) { c->get(); }
+ friend void intrusive_ptr_release(Collection *c) { c->put(); }
+
+ ObjectRef create_object() const;
+
+ // NOTE: The lock only needs to protect the object_map/hash, not the
+ // contents of individual objects. The osd is already sequencing
+ // reads and writes, so we will never see them concurrently at this
+ // level.
+
+ ObjectRef get_object(ghobject_t oid) {
+ std::shared_lock l{lock};
+ auto o = object_hash.find(oid);
+ if (o == object_hash.end())
+ return ObjectRef();
+ return o->second;
+ }
+
+ ObjectRef get_or_create_object(ghobject_t oid) {
+ std::lock_guard l{lock};
+ auto result = object_hash.emplace(oid, ObjectRef());
+ if (result.second)
+ object_map[oid] = result.first->second = create_object();
+ return result.first->second;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(xattr, bl);
+ encode(use_page_set, bl);
+ uint32_t s = object_map.size();
+ encode(s, bl);
+ for (map<ghobject_t, ObjectRef>::const_iterator p = object_map.begin();
+ p != object_map.end();
+ ++p) {
+ encode(p->first, bl);
+ p->second->encode(bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(xattr, p);
+ decode(use_page_set, p);
+ uint32_t s;
+ decode(s, p);
+ while (s--) {
+ ghobject_t k;
+ decode(k, p);
+ auto o = create_object();
+ o->decode(p);
+ object_map.insert(make_pair(k, o));
+ object_hash.insert(make_pair(k, o));
+ }
+ DECODE_FINISH(p);
+ }
+
+ uint64_t used_bytes() const {
+ uint64_t result = 0;
+ for (map<ghobject_t, ObjectRef>::const_iterator p = object_map.begin();
+ p != object_map.end();
+ ++p) {
+ result += p->second->get_size();
+ }
+
+ return result;
+ }
+
+ void flush() override {
+ }
+ bool flush_commit(Context *c) override {
+ return true;
+ }
+
+ explicit Collection(CephContext *cct, coll_t c)
+ : CollectionImpl(c),
+ cct(cct),
+ use_page_set(cct->_conf->memstore_page_set) {}
+ };
+ typedef Collection::Ref CollectionRef;
+
+private:
+ class OmapIteratorImpl;
+
+
+ ceph::unordered_map<coll_t, CollectionRef> coll_map;
+ /// rwlock to protect coll_map
+ ceph::shared_mutex coll_lock{
+ ceph::make_shared_mutex("MemStore::coll_lock")};
+ map<coll_t,CollectionRef> new_coll_map;
+
+ CollectionRef get_collection(const coll_t& cid);
+
+ Finisher finisher;
+
+ uint64_t used_bytes;
+
+ void _do_transaction(Transaction& t);
+
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags = 0);
+ int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _remove(const coll_t& cid, const ghobject_t& oid);
+ int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset);
+ int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name);
+ int _rmattrs(const coll_t& cid, const ghobject_t& oid);
+ int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid);
+ int _clone_range(const coll_t& cid, const ghobject_t& oldoid,
+ const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _omap_clear(const coll_t& cid, const ghobject_t &oid);
+ int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, bufferlist& aset_bl);
+ int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, bufferlist& keys_bl);
+ int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const string& first, const string& last);
+ int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl);
+
+ int _collection_hint_expected_num_objs(const coll_t& cid, uint32_t pg_num,
+ uint64_t num_objs) const { return 0; }
+ int _create_collection(const coll_t& c, int bits);
+ int _destroy_collection(const coll_t& c);
+ int _collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid);
+ int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& o);
+ int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest);
+ int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest);
+
+ int _save();
+ int _load();
+
+ void dump(Formatter *f);
+ void dump_all();
+
+public:
+ MemStore(CephContext *cct, const string& path)
+ : ObjectStore(cct, path),
+ finisher(cct),
+ used_bytes(0) {}
+ ~MemStore() override { }
+
+ string get_type() override {
+ return "memstore";
+ }
+
+ bool test_mount_in_use() override {
+ return false;
+ }
+
+ int mount() override;
+ int umount() override;
+
+ int fsck(bool deep) override {
+ return 0;
+ }
+
+ int validate_hobject_key(const hobject_t &obj) const override {
+ return 0;
+ }
+ unsigned get_max_attr_name_length() override {
+ return 256; // arbitrary; there is no real limit internally
+ }
+
+ int mkfs() override;
+ int mkjournal() override {
+ return 0;
+ }
+ bool wants_journal() override {
+ return false;
+ }
+ bool allows_journal() override {
+ return false;
+ }
+ bool needs_journal() override {
+ return false;
+ }
+
+ int get_devices(set<string> *ls) override {
+ // no devices for us!
+ return 0;
+ }
+
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
+
+ bool exists(CollectionHandle &c, const ghobject_t& oid) override;
+ int stat(CollectionHandle &c, const ghobject_t& oid,
+ struct stat *st, bool allow_eio = false) override;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ int read(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0) override;
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid,
+ uint64_t offset, size_t len, bufferlist& bl) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset,
+ size_t len, map<uint64_t, uint64_t>& destmap) override;
+ int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
+ bufferptr& value) override;
+ int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ map<string,bufferptr>& aset) override;
+
+ int list_collections(vector<coll_t>& ls) override;
+
+ CollectionHandle open_collection(const coll_t& c) override {
+ return get_collection(c);
+ }
+ CollectionHandle create_new_collection(const coll_t& c) override;
+
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(CollectionHandle& cid,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *next) override;
+
+ using ObjectStore::omap_get;
+ int omap_get(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out /// < [out] Key to value map
+ ) override;
+
+ using ObjectStore::omap_get_header;
+ /// Get omap header
+ int omap_get_header(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) override;
+
+ using ObjectStore::omap_get_keys;
+ /// Get keys defined on oid
+ int omap_get_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ ) override;
+
+ using ObjectStore::omap_get_values;
+ /// Get key values
+ int omap_get_values(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ ) override;
+
+ using ObjectStore::omap_check_keys;
+ /// Filters keys into out which are defined on oid
+ int omap_check_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ ) override;
+
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle& c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) override;
+
+ void set_fsid(uuid_d u) override;
+ uuid_d get_fsid() override;
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+ return 0; //do not care
+ }
+
+ objectstore_perf_stat_t get_cur_stats() override;
+
+ const PerfCounters* get_perf_counters() const override {
+ return nullptr;
+ }
+
+
+ int queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) override;
+};
+
+
+
+
+#endif
diff --git a/src/os/memstore/PageSet.h b/src/os/memstore/PageSet.h
new file mode 100644
index 00000000..8e243281
--- /dev/null
+++ b/src/os/memstore/PageSet.h
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAGESET_H
+#define CEPH_PAGESET_H
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <mutex>
+#include <vector>
+#include <boost/intrusive/avl_set.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/encoding.h"
+
+struct Page {
+ char *const data;
+ boost::intrusive::avl_set_member_hook<> hook;
+ uint64_t offset;
+
+ // avoid RefCountedObject because it has a virtual destructor
+ std::atomic<uint16_t> nrefs;
+ void get() { ++nrefs; }
+ void put() { if (--nrefs == 0) delete this; }
+
+ typedef boost::intrusive_ptr<Page> Ref;
+ friend void intrusive_ptr_add_ref(Page *p) { p->get(); }
+ friend void intrusive_ptr_release(Page *p) { p->put(); }
+
+ // key-value comparison functor for avl
+ struct Less {
+ bool operator()(uint64_t offset, const Page &page) const {
+ return offset < page.offset;
+ }
+ bool operator()(const Page &page, uint64_t offset) const {
+ return page.offset < offset;
+ }
+ bool operator()(const Page &lhs, const Page &rhs) const {
+ return lhs.offset < rhs.offset;
+ }
+ };
+ void encode(bufferlist &bl, size_t page_size) const {
+ using ceph::encode;
+ bl.append(buffer::copy(data, page_size));
+ encode(offset, bl);
+ }
+ void decode(bufferlist::const_iterator &p, size_t page_size) {
+ using ceph::decode;
+ p.copy(page_size, data);
+ decode(offset, p);
+ }
+
+ static Ref create(size_t page_size, uint64_t offset = 0) {
+ // ensure proper alignment of the Page
+ const auto align = alignof(Page);
+ page_size = (page_size + align - 1) & ~(align - 1);
+ // allocate the Page and its data in a single buffer
+ auto buffer = new char[page_size + sizeof(Page)];
+ // place the Page structure at the end of the buffer
+ return new (buffer + page_size) Page(buffer, offset);
+ }
+
+ // copy disabled
+ Page(const Page&) = delete;
+ const Page& operator=(const Page&) = delete;
+
+ private: // private constructor, use create() instead
+ Page(char *data, uint64_t offset) : data(data), offset(offset), nrefs(1) {}
+
+ static void operator delete(void *p) {
+ delete[] reinterpret_cast<Page*>(p)->data;
+ }
+};
+
+class PageSet {
+ public:
+ // alloc_range() and get_range() return page refs in a vector
+ typedef std::vector<Page::Ref> page_vector;
+
+ private:
+ // store pages in a boost intrusive avl_set
+ typedef Page::Less page_cmp;
+ typedef boost::intrusive::member_hook<Page,
+ boost::intrusive::avl_set_member_hook<>,
+ &Page::hook> member_option;
+ typedef boost::intrusive::avl_set<Page,
+ boost::intrusive::compare<page_cmp>, member_option> page_set;
+
+ typedef typename page_set::iterator iterator;
+
+ page_set pages;
+ uint64_t page_size;
+
+ typedef std::mutex lock_type;
+ lock_type mutex;
+
+ void free_pages(iterator cur, iterator end) {
+ while (cur != end) {
+ Page *page = &*cur;
+ cur = pages.erase(cur);
+ page->put();
+ }
+ }
+
+ int count_pages(uint64_t offset, uint64_t len) const {
+ // count the overlapping pages
+ int count = 0;
+ if (offset % page_size) {
+ count++;
+ size_t rem = page_size - offset % page_size;
+ len = len <= rem ? 0 : len - rem;
+ }
+ count += len / page_size;
+ if (len % page_size)
+ count++;
+ return count;
+ }
+
+ public:
+ explicit PageSet(size_t page_size) : page_size(page_size) {}
+ PageSet(PageSet &&rhs)
+ : pages(std::move(rhs.pages)), page_size(rhs.page_size) {}
+ ~PageSet() {
+ free_pages(pages.begin(), pages.end());
+ }
+
+ // disable copy
+ PageSet(const PageSet&) = delete;
+ const PageSet& operator=(const PageSet&) = delete;
+
+ bool empty() const { return pages.empty(); }
+ size_t size() const { return pages.size(); }
+ size_t get_page_size() const { return page_size; }
+
+ // allocate all pages that intersect the range [offset,length)
+ void alloc_range(uint64_t offset, uint64_t length, page_vector &range) {
+ // loop in reverse so we can provide hints to avl_set::insert_check()
+ // and get O(1) insertions after the first
+ uint64_t position = offset + length - 1;
+
+ range.resize(count_pages(offset, length));
+ auto out = range.rbegin();
+
+ std::lock_guard<lock_type> lock(mutex);
+ iterator cur = pages.end();
+ while (length) {
+ const uint64_t page_offset = position & ~(page_size-1);
+
+ typename page_set::insert_commit_data commit;
+ auto insert = pages.insert_check(cur, page_offset, page_cmp(), commit);
+ if (insert.second) {
+ auto page = Page::create(page_size, page_offset);
+ cur = pages.insert_commit(*page, commit);
+
+ // assume that the caller will write to the range [offset,length),
+ // so we only need to zero memory outside of this range
+
+ // zero end of page past offset + length
+ if (offset + length < page->offset + page_size)
+ std::fill(page->data + offset + length - page->offset,
+ page->data + page_size, 0);
+ // zero front of page between page_offset and offset
+ if (offset > page->offset)
+ std::fill(page->data, page->data + offset - page->offset, 0);
+ } else { // exists
+ cur = insert.first;
+ }
+ // add a reference to output vector
+ out->reset(&*cur);
+ ++out;
+
+ auto c = std::min(length, (position & (page_size-1)) + 1);
+ position -= c;
+ length -= c;
+ }
+ // make sure we sized the vector correctly
+ ceph_assert(out == range.rend());
+ }
+
+ // return all allocated pages that intersect the range [offset,length)
+ void get_range(uint64_t offset, uint64_t length, page_vector &range) {
+ auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+ while (cur != pages.end() && cur->offset < offset + length)
+ range.push_back(&*cur++);
+ }
+
+ void free_pages_after(uint64_t offset) {
+ std::lock_guard<lock_type> lock(mutex);
+ auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+ if (cur == pages.end())
+ return;
+ if (cur->offset < offset)
+ cur++;
+ free_pages(cur, pages.end());
+ }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(page_size, bl);
+ unsigned count = pages.size();
+ encode(count, bl);
+ for (auto p = pages.rbegin(); p != pages.rend(); ++p)
+ p->encode(bl, page_size);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ ceph_assert(empty());
+ decode(page_size, p);
+ unsigned count;
+ decode(count, p);
+ auto cur = pages.end();
+ for (unsigned i = 0; i < count; i++) {
+ auto page = Page::create(page_size);
+ page->decode(p, page_size);
+ cur = pages.insert_before(cur, *page);
+ }
+ }
+};
+
+#endif // CEPH_PAGESET_H