Diffstat (limited to 'src/os')
91 files changed, 66558 insertions, 0 deletions
diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt new file mode 100644 index 00000000..8fc8d353 --- /dev/null +++ b/src/os/CMakeLists.txt @@ -0,0 +1,153 @@ +set(libos_srcs + ObjectStore.cc + Transaction.cc + filestore/chain_xattr.cc + filestore/BtrfsFileStoreBackend.cc + filestore/DBObjectMap.cc + filestore/FileJournal.cc + filestore/FileStore.cc + filestore/JournalThrottle.cc + filestore/GenericFileStoreBackend.cc + filestore/JournalingObjectStore.cc + filestore/HashIndex.cc + filestore/IndexManager.cc + filestore/LFNIndex.cc + filestore/WBThrottle.cc + memstore/MemStore.cc + kstore/KStore.cc + kstore/kstore_types.cc + fs/FS.cc) + +if(WITH_BLUESTORE) + list(APPEND libos_srcs + bluestore/Allocator.cc + bluestore/BitmapFreelistManager.cc + bluestore/BlockDevice.cc + bluestore/BlueFS.cc + bluestore/bluefs_types.cc + bluestore/BlueRocksEnv.cc + bluestore/BlueStore.cc + bluestore/bluestore_types.cc + bluestore/fastbmap_allocator_impl.cc + bluestore/FreelistManager.cc + bluestore/StupidAllocator.cc + bluestore/BitmapAllocator.cc + bluestore/AvlAllocator.cc + bluestore/HybridAllocator.cc + ) +endif(WITH_BLUESTORE) + +if(HAVE_LIBAIO OR HAVE_POSIXAIO) + list(APPEND libos_srcs + bluestore/KernelDevice.cc + bluestore/aio.cc) +endif() + +if(WITH_FUSE) + list(APPEND libos_srcs + FuseStore.cc) +endif(WITH_FUSE) + +if(WITH_PMEM) + list(APPEND libos_srcs + bluestore/PMEMDevice.cc) +endif(WITH_PMEM) + +if(HAVE_LIBXFS) + list(APPEND libos_srcs + filestore/XfsFileStoreBackend.cc + fs/XFS.cc) +endif() + +if(HAVE_LIBZFS) + add_library(os_zfs_objs OBJECT + filestore/ZFSFileStoreBackend.cc + fs/ZFS.cc) + target_include_directories(os_zfs_objs SYSTEM PRIVATE + ${ZFS_INCLUDE_DIRS}) + list(APPEND libos_srcs $<TARGET_OBJECTS:os_zfs_objs>) +endif() + +if(WITH_SPDK) + list(APPEND libos_srcs + bluestore/NVMEDevice.cc) +endif() + +add_library(os STATIC ${libos_srcs}) + +target_link_libraries(os heap_profiler kv) + +if(WITH_BLUEFS) + add_library(bluefs SHARED + bluestore/BlueRocksEnv.cc) + target_include_directories(bluefs SYSTEM PUBLIC + $<TARGET_PROPERTY:RocksDB::RocksDB,INTERFACE_INCLUDE_DIRECTORIES>) + target_link_libraries(bluefs global) + install(TARGETS bluefs DESTINATION lib) +endif(WITH_BLUEFS) + +if(HAVE_LIBAIO) + target_link_libraries(os ${AIO_LIBRARIES}) +endif(HAVE_LIBAIO) + +if(WITH_FUSE) + target_link_libraries(os FUSE::FUSE) +endif() + +if(HAVE_LIBZFS) + target_link_libraries(os ${ZFS_LIBRARIES}) +endif() + +if(WITH_SPDK) + target_link_libraries(os + ${SPDK_LIBRARIES}) +endif() + +if(WITH_LTTNG) + add_dependencies(os objectstore-tp) +endif() + +target_link_libraries(os kv) + +add_dependencies(os compressor_plugins) +add_dependencies(os crypto_plugins) + + +if(WITH_BLUESTORE) + add_executable(ceph-bluestore-tool + bluestore/bluestore_tool.cc) + target_link_libraries(ceph-bluestore-tool + os global) + install(TARGETS ceph-bluestore-tool + DESTINATION bin) +endif() + +if(WITH_PMEM) + include(ExternalProject) + ExternalProject_Add(nvml_ext + DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/src/ + GIT_REPOSITORY "https://github.com/ceph/nvml.git" + GIT_TAG "dd622819dd4ee97d3920f913c70be" + SOURCE_DIR ${CMAKE_BINARY_DIR}/src/nvml + CONFIGURE_COMMAND "" + BUILD_COMMAND $(MAKE) + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "true") + + ExternalProject_Add_Step(nvml_ext forcebuild + DEPENDEES configure + DEPENDERS build + COMMAND "true" + ALWAYS 1) + add_library(pmem STATIC IMPORTED GLOBAL) + add_dependencies(pmem nvml_ext) + set_target_properties(pmem PROPERTIES + IMPORTED_LOCATION 
"${CMAKE_BINARY_DIR}/src/nvml/src/nondebug/libpmem.a" + INTERFACE_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) + target_link_libraries(os pmem) + target_include_directories(os SYSTEM PRIVATE "${CMAKE_BINARY_DIR}/src/nvml/src/include") +endif(WITH_PMEM) + +if(WITH_LTTNG AND WITH_EVENTTRACE) + add_dependencies(os eventtrace_tp) +endif() diff --git a/src/os/FuseStore.cc b/src/os/FuseStore.cc new file mode 100644 index 00000000..7ae21f3a --- /dev/null +++ b/src/os/FuseStore.cc @@ -0,0 +1,1281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "FuseStore.h" +#include "os/ObjectStore.h" +#include "include/stringify.h" +#include "common/errno.h" + +#define FUSE_USE_VERSION 30 +#include <fuse.h> +#include <fuse_lowlevel.h> +#include "include/ceph_fuse.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fcntl.h> /* Definition of AT_* constants */ +#include <sys/stat.h> + +#if defined(__APPLE__) || defined(__FreeBSD__) +#include <sys/param.h> +#include <sys/mount.h> +#endif + +#define dout_context store->cct +#define dout_subsys ceph_subsys_fuse +#include "common/debug.h" +#undef dout_prefix +#define dout_prefix *_dout << "fuse " + +// some fuse-y bits of state +struct fs_info { + struct fuse_args args; + struct fuse *f; +#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0) + struct fuse_chan *ch; +#endif + char *mountpoint; +}; + +int FuseStore::open_file(string p, struct fuse_file_info *fi, + std::function<int(bufferlist *bl)> f) +{ + if (open_files.count(p)) { + OpenFile *o = open_files[p]; + fi->fh = reinterpret_cast<uint64_t>(o); + ++o->ref; + return 0; + } + bufferlist bl; + int r = f(&bl); + if (r < 0) { + return r; + } + OpenFile *o = new OpenFile; + o->path = p; + o->bl.claim(bl); + open_files[p] = o; + fi->fh = reinterpret_cast<uint64_t>(o); + ++o->ref; + return 0; +} + +FuseStore::FuseStore(ObjectStore *s, string p) + : store(s), + mount_point(p), + fuse_thread(this) +{ + info = new fs_info(); +} + +FuseStore::~FuseStore() +{ + delete info; +} + +/* + * / - root directory + * $cid/ + * $cid/type - objectstore type + * $cid/bitwise_hash_start = lowest hash value + * $cid/bitwise_hash_end = highest hash value + * $cid/bitwise_hash_bits - how many bits are significant + * $cid/pgmeta/ - pgmeta object + * $cid/all/ - all objects + * $cid/all/$obj/ + * $cid/all/$obj/bitwise_hash + * $cid/all/$obj/data + * $cid/all/$obj/omap/$key + * $cid/all/$obj/attr/$name + * $cid/by_bitwise_hash/$hash/$bits/$obj - all objects with this (bitwise) hash (prefix) + */ +enum { + FN_ROOT = 1, + FN_TYPE, + FN_COLLECTION, + FN_HASH_START, + FN_HASH_END, + FN_HASH_BITS, + FN_OBJECT, + FN_OBJECT_HASH, + FN_OBJECT_DATA, + FN_OBJECT_OMAP_HEADER, + FN_OBJECT_OMAP, + FN_OBJECT_OMAP_VAL, + FN_OBJECT_ATTR, + FN_OBJECT_ATTR_VAL, + FN_ALL, + FN_HASH_DIR, + FN_HASH_VAL, +}; + +static int parse_fn(CephContext* cct, const char *path, coll_t *cid, + ghobject_t *oid, string *key, + uint32_t *hash, uint32_t *hash_bits) +{ + list<string> v; + for (const char *p = path; *p; ++p) { + if (*p == '/') + continue; + const char *e; + for (e = p + 1; *e && *e != '/'; e++) ; + string c(p, e-p); + v.push_back(c); + p = e; + if (!*p) + break; + } + ldout(cct, 10) << __func__ << " path " << path << " -> " << v << dendl; + + if (v.empty()) + return FN_ROOT; + + if (v.front() == "type") + return FN_TYPE; + + if (!cid->parse(v.front())) { + return -ENOENT; + } + if (v.size() == 1) + return FN_COLLECTION; + v.pop_front(); + + if 
(v.front() == "bitwise_hash_start") + return FN_HASH_START; + if (v.front() == "bitwise_hash_end") + return FN_HASH_END; + if (v.front() == "bitwise_hash_bits") + return FN_HASH_BITS; + if (v.front() == "pgmeta") { + spg_t pgid; + if (cid->is_pg(&pgid)) { + *oid = pgid.make_pgmeta_oid(); + v.pop_front(); + if (v.empty()) + return FN_OBJECT; + goto do_object; + } + return -ENOENT; + } + if (v.front() == "all") { + v.pop_front(); + if (v.empty()) + return FN_ALL; + goto do_dir; + } + if (v.front() == "by_bitwise_hash") { + v.pop_front(); + if (v.empty()) + return FN_HASH_DIR; + unsigned long hv, hm; + int r = sscanf(v.front().c_str(), "%lx", &hv); + if (r != 1) + return -ENOENT; + int shift = 32 - v.front().length() * 4; + v.pop_front(); + if (v.empty()) + return FN_HASH_DIR; + r = sscanf(v.front().c_str(), "%ld", &hm); + if (r != 1) + return -ENOENT; + if (hm < 1 || hm > 32) + return -ENOENT; + v.pop_front(); + *hash = hv << shift;//hobject_t::_reverse_bits(hv << shift); + *hash_bits = hm; + if (v.empty()) + return FN_HASH_VAL; + goto do_dir; + } + return -ENOENT; + + do_dir: + { + string o = v.front(); + if (!oid->parse(o)) { + return -ENOENT; + } + v.pop_front(); + if (v.empty()) + return FN_OBJECT; + } + + do_object: + if (v.front() == "data") + return FN_OBJECT_DATA; + if (v.front() == "omap_header") + return FN_OBJECT_OMAP_HEADER; + if (v.front() == "omap") { + v.pop_front(); + if (v.empty()) + return FN_OBJECT_OMAP; + *key = v.front(); + v.pop_front(); + if (v.empty()) + return FN_OBJECT_OMAP_VAL; + return -ENOENT; + } + if (v.front() == "attr") { + v.pop_front(); + if (v.empty()) + return FN_OBJECT_ATTR; + *key = v.front(); + v.pop_front(); + if (v.empty()) + return FN_OBJECT_ATTR_VAL; + return -ENOENT; + } + if (v.front() == "bitwise_hash") + return FN_OBJECT_HASH; + return -ENOENT; +} + + +static int os_getattr(const char *path, struct stat *stbuf +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , struct fuse_file_info *fi +#endif + ) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (t < 0) + return t; + + std::lock_guard<std::mutex> l(fs->lock); + + stbuf->st_size = 0; + stbuf->st_uid = 0; + stbuf->st_gid = 0; + stbuf->st_mode = S_IFREG | 0700; + + auto ch = fs->store->open_collection(cid); + + switch (t) { + case FN_OBJECT_OMAP: + case FN_OBJECT_ATTR: + case FN_OBJECT: + case FN_OBJECT_DATA: + case FN_OBJECT_OMAP_HEADER: + case FN_OBJECT_OMAP_VAL: + { + spg_t pgid; + if (cid.is_pg(&pgid)) { + if (!ch) { + return -ENOENT; + } + int bits = fs->store->collection_bits(ch); + if (bits >= 0 && !oid.match(bits, pgid.ps())) { + // sorry, not part of this PG + return -ENOENT; + } + } + } + break; + } + + switch (t) { + case FN_OBJECT_OMAP: + case FN_OBJECT_ATTR: + case FN_OBJECT: + if (!fs->store->exists(ch, oid)) + return -ENOENT; + // fall-thru + case FN_ALL: + case FN_HASH_DIR: + case FN_HASH_VAL: + case FN_COLLECTION: + if (!fs->store->collection_exists(cid)) + return -ENOENT; + // fall-thru + case FN_ROOT: + stbuf->st_mode = S_IFDIR | 0700; + return 0; + + case FN_TYPE: + stbuf->st_size = fs->store->get_type().length() + 1; + break; + + case FN_OBJECT_HASH: + if (!fs->store->exists(ch, oid)) + return -ENOENT; + stbuf->st_size = 9; + return 0; + + case FN_HASH_END: + if (!ch) + return -ENOENT; + if 
(fs->store->collection_bits(ch) < 0) + return -ENOENT; + // fall-thru + case FN_HASH_START: + stbuf->st_size = 9; + return 0; + + case FN_HASH_BITS: + { + if (!ch) + return -ENOENT; + int bits = fs->store->collection_bits(ch); + if (bits < 0) + return -ENOENT; + char buf[12]; + snprintf(buf, sizeof(buf), "%d\n", bits); + stbuf->st_size = strlen(buf); + } + return 0; + + case FN_OBJECT_DATA: + { + if (!fs->store->exists(ch, oid)) + return -ENOENT; + int r = fs->store->stat(ch, oid, stbuf); + if (r < 0) + return r; + } + break; + + case FN_OBJECT_OMAP_HEADER: + { + if (!fs->store->exists(ch, oid)) + return -ENOENT; + bufferlist bl; + fs->store->omap_get_header(ch, oid, &bl); + stbuf->st_size = bl.length(); + } + break; + + case FN_OBJECT_OMAP_VAL: + { + if (!fs->store->exists(ch, oid)) + return -ENOENT; + set<string> k; + k.insert(key); + map<string,bufferlist> v; + fs->store->omap_get_values(ch, oid, k, &v); + if (!v.count(key)) { + return -ENOENT; + } + stbuf->st_size = v[key].length(); + } + break; + + case FN_OBJECT_ATTR_VAL: + { + if (!fs->store->exists(ch, oid)) + return -ENOENT; + bufferptr v; + int r = fs->store->getattr(ch, oid, key.c_str(), v); + if (r == -ENODATA) + r = -ENOENT; + if (r < 0) + return r; + stbuf->st_size = v.length(); + } + break; + + default: + return -ENOENT; + } + + return 0; +} + +static int os_readdir(const char *path, + void *buf, + fuse_fill_dir_t filler, + off_t offset, + struct fuse_file_info *fi +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , enum fuse_readdir_flags +#endif + ) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset + << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (t < 0) + return t; + + std::lock_guard<std::mutex> l(fs->lock); + + auto ch = fs->store->open_collection(cid); + + // we can't shift 32 bits or else off_t will go negative + const int hash_shift = 31; + + switch (t) { + case FN_ROOT: + { + filler_compat(filler, buf, "type", NULL, 0); + vector<coll_t> cls; + fs->store->list_collections(cls); + for (auto c : cls) { + int r = filler_compat(filler, buf, stringify(c).c_str(), NULL, 0); + if (r > 0) + break; + } + } + break; + + case FN_COLLECTION: + { + if (!ch) { + return -ENOENT; + } + filler_compat(filler, buf, "bitwise_hash_start", NULL, 0); + if (fs->store->collection_bits(ch) >= 0) { + filler_compat(filler, buf, "bitwise_hash_end", NULL, 0); + filler_compat(filler, buf, "bitwise_hash_bits", NULL, 0); + } + filler_compat(filler, buf, "all", NULL, 0); + filler_compat(filler, buf, "by_bitwise_hash", NULL, 0); + spg_t pgid; + if (cid.is_pg(&pgid) && + fs->store->exists(ch, pgid.make_pgmeta_oid())) { + filler_compat(filler, buf, "pgmeta", NULL, 0); + } + } + break; + + case FN_OBJECT: + { + filler_compat(filler, buf, "bitwise_hash", NULL, 0); + filler_compat(filler, buf, "data", NULL, 0); + filler_compat(filler, buf, "omap", NULL, 0); + filler_compat(filler, buf, "attr", NULL, 0); + filler_compat(filler, buf, "omap_header", NULL, 0); + } + break; + + case FN_HASH_VAL: + case FN_ALL: + { + uint32_t bitwise_hash = (offset >> hash_shift) & 0xffffffff; + uint32_t hashoff = offset - (bitwise_hash << hash_shift); + int skip = hashoff; + ghobject_t next = cid.get_min_hobj(); + if (offset) { + // obey the offset + next.hobj.set_hash(hobject_t::_reverse_bits(bitwise_hash)); + } 
else if (t == FN_HASH_VAL) { + next.hobj.set_hash(hobject_t::_reverse_bits(hash_value)); + } + ghobject_t last; + if (t == FN_HASH_VAL) { + last = next; + uint64_t rev_end = (hash_value | (0xffffffff >> hash_bits)) + 1; + if (rev_end >= 0x100000000) + last = ghobject_t::get_max(); + else + last.hobj.set_hash(hobject_t::_reverse_bits(rev_end)); + } else { + last = ghobject_t::get_max(); + } + ldout(fs->store->cct, 10) << __func__ << std::hex + << " offset " << offset << " hash " + << hobject_t::_reverse_bits(hash_value) + << std::dec + << "/" << hash_bits + << " first " << next << " last " << last + << dendl; + while (true) { + vector<ghobject_t> ls; + int r = fs->store->collection_list( + ch, next, last, 1000, &ls, &next); + if (r < 0) + return r; + for (auto p : ls) { + if (skip) { + --skip; + continue; + } + uint32_t cur_bitwise_hash = p.hobj.get_bitwise_key_u32(); + if (cur_bitwise_hash != bitwise_hash) { + bitwise_hash = cur_bitwise_hash; + hashoff = 0; + } + ++hashoff; + uint64_t cur_off = ((uint64_t)bitwise_hash << hash_shift) | + (uint64_t)hashoff; + string s = stringify(p); + r = filler_compat(filler, buf, s.c_str(), NULL, cur_off); + if (r) + break; + } + if (r) + break; + if (next == ghobject_t::get_max() || next == last) + break; + } + } + break; + + case FN_OBJECT_OMAP: + { + set<string> keys; + fs->store->omap_get_keys(ch, oid, &keys); + unsigned skip = offset; + for (auto k : keys) { + if (skip) { + --skip; + continue; + } + ++offset; + int r = filler_compat(filler, buf, k.c_str(), NULL, offset); + if (r) + break; + } + } + break; + + case FN_OBJECT_ATTR: + { + map<string,bufferptr> aset; + fs->store->getattrs(ch, oid, aset); + unsigned skip = offset; + for (auto a : aset) { + if (skip) { + --skip; + continue; + } + ++offset; + int r = filler_compat(filler, buf, a.first.c_str(), NULL, offset); + if (r) + break; + } + } + break; + } + return 0; +} + +static int os_open(const char *path, struct fuse_file_info *fi) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (t < 0) + return t; + + std::lock_guard<std::mutex> l(fs->lock); + + auto ch = fs->store->open_collection(cid); + + bufferlist *pbl = 0; + switch (t) { + case FN_TYPE: + pbl = new bufferlist; + pbl->append(fs->store->get_type()); + pbl->append("\n"); + break; + + case FN_HASH_START: + { + pbl = new bufferlist; + spg_t pgid; + if (cid.is_pg(&pgid)) { + unsigned long h; + h = hobject_t::_reverse_bits(pgid.ps()); + char buf[10]; + snprintf(buf, sizeof(buf), "%08lx\n", h); + pbl->append(buf); + } else { + pbl->append("00000000\n"); + } + } + break; + + case FN_HASH_END: + { + if (!ch) { + return -ENOENT; + } + spg_t pgid; + unsigned long h; + if (cid.is_pg(&pgid)) { + int hash_bits = fs->store->collection_bits(ch); + if (hash_bits >= 0) { + uint64_t rev_start = hobject_t::_reverse_bits(pgid.ps()); + uint64_t rev_end = (rev_start | (0xffffffff >> hash_bits)); + h = rev_end; + } else { + return -ENOENT; + } + } else { + h = 0xffffffff; + } + char buf[10]; + snprintf(buf, sizeof(buf), "%08lx\n", h); + pbl = new bufferlist; + pbl->append(buf); + } + break; + + case FN_HASH_BITS: + { + if (!ch) { + return -ENOENT; + } + int r = fs->store->collection_bits(ch); + if (r < 0) + return r; + char buf[12]; + snprintf(buf, sizeof(buf), "%d\n", r); + 
pbl = new bufferlist; + pbl->append(buf); + } + break; + + case FN_OBJECT_HASH: + { + pbl = new bufferlist; + char buf[10]; + snprintf(buf, sizeof(buf), "%08x\n", + (unsigned)oid.hobj.get_bitwise_key_u32()); + pbl->append(buf); + } + break; + + case FN_OBJECT_DATA: + { + int r = fs->open_file( + path, fi, + [&](bufferlist *pbl) { + return fs->store->read(ch, oid, 0, 0, *pbl); + }); + if (r < 0) { + return r; + } + } + break; + + case FN_OBJECT_ATTR_VAL: + { + int r = fs->open_file( + path, fi, + [&](bufferlist *pbl) { + bufferptr bp; + int r = fs->store->getattr(ch, oid, key.c_str(), bp); + if (r < 0) + return r; + pbl->append(bp); + return 0; + }); + if (r < 0) + return r; + } + break; + + case FN_OBJECT_OMAP_VAL: + { + int r = fs->open_file( + path, fi, + [&](bufferlist *pbl) { + set<string> k; + k.insert(key); + map<string,bufferlist> v; + int r = fs->store->omap_get_values(ch, oid, k, &v); + if (r < 0) + return r; + *pbl = v[key]; + return 0; + }); + if (r < 0) + return r; + } + break; + + case FN_OBJECT_OMAP_HEADER: + { + int r = fs->open_file( + path, fi, + [&](bufferlist *pbl) { + return fs->store->omap_get_header(ch, oid, pbl); + }); + if (r < 0) + return r; + } + break; + } + + if (pbl) { + FuseStore::OpenFile *o = new FuseStore::OpenFile; + o->bl.claim(*pbl); + fi->fh = reinterpret_cast<uint64_t>(o); + } + return 0; +} + +static int os_mkdir(const char *path, mode_t mode) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (f < 0) + return f; + + std::lock_guard<std::mutex> l(fs->lock); + + ObjectStore::CollectionHandle ch; + + ObjectStore::Transaction t; + switch (f) { + case FN_OBJECT: + { + ch = fs->store->open_collection(cid); + if (!ch) { + return -ENOENT; + } + spg_t pgid; + if (cid.is_pg(&pgid)) { + int bits = fs->store->collection_bits(ch); + if (bits >= 0 && !oid.match(bits, pgid.ps())) { + // sorry, not part of this PG + return -EINVAL; + } + } + t.touch(cid, oid); + ch = fs->store->open_collection(cid); + } + break; + + case FN_COLLECTION: + if (cid.is_pg()) { + // use the mode for the bit count. e.g., mkdir --mode=0003 + // mnt/0.7_head will create 0.7 with bits = 3. 
+ mode &= 0777; + if (mode >= 32) + return -EINVAL; + } else { + mode = 0; + } + t.create_collection(cid, mode); + ch = fs->store->create_new_collection(cid); + break; + + default: + return -EPERM; + } + + if (!t.empty()) { + fs->store->queue_transaction(ch, std::move(t)); + } + + return 0; +} + +static int os_chmod(const char *path, mode_t mode +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , struct fuse_file_info *fi +#endif + ) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + return 0; +} + +static int os_create(const char *path, mode_t mode, struct fuse_file_info *fi) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (f < 0) + return f; + + std::lock_guard<std::mutex> l(fs->lock); + + ObjectStore::CollectionHandle ch = fs->store->open_collection(cid); + + ObjectStore::Transaction t; + bufferlist *pbl = 0; + switch (f) { + case FN_OBJECT_DATA: + { + pbl = new bufferlist; + fs->store->read(ch, oid, 0, 0, *pbl); + } + break; + + case FN_OBJECT_ATTR_VAL: + { + pbl = new bufferlist; + bufferptr bp; + int r = fs->store->getattr(ch, oid, key.c_str(), bp); + if (r == -ENODATA) { + bufferlist empty; + t.setattr(cid, oid, key.c_str(), empty); + } + pbl->append(bp); + } + break; + + case FN_OBJECT_OMAP_VAL: + { + pbl = new bufferlist; + set<string> k; + k.insert(key); + map<string,bufferlist> v; + fs->store->omap_get_values(ch, oid, k, &v); + if (v.count(key) == 0) { + map<string,bufferlist> aset; + aset[key] = bufferlist(); + t.omap_setkeys(cid, oid, aset); + } else { + *pbl = v[key]; + } + } + break; + } + + if (!t.empty()) { + fs->store->queue_transaction(ch, std::move(t)); + } + + if (pbl) { + FuseStore::OpenFile *o = new FuseStore::OpenFile; + o->bl.claim(*pbl); + o->dirty = true; + fi->fh = reinterpret_cast<uint64_t>(o); + } + return 0; +} + +static int os_release(const char *path, struct fuse_file_info *fi) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + std::lock_guard<std::mutex> l(fs->lock); + FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh); + if (--o->ref == 0) { + ldout(fs->store->cct, 10) << __func__ << " closing last " << o->path << dendl; + fs->open_files.erase(o->path); + delete o; + } + return 0; +} + +static int os_read(const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset + << " size " << size << dendl; + std::lock_guard<std::mutex> l(fs->lock); + FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh); + if (!o) + return 0; + if (offset >= o->bl.length()) + return 0; + if (offset + size > o->bl.length()) + size = o->bl.length() - offset; + bufferlist r; + r.substr_of(o->bl, offset, size); + memcpy(buf, r.c_str(), r.length()); + return r.length(); +} + +static int os_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + fuse_context *fc = 
fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset + << " size " << size << dendl; + std::lock_guard<std::mutex> l(fs->lock); + FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh); + if (!o) + return 0; + + bufferlist final; + if (offset) { + if (offset > o->bl.length()) { + final.substr_of(o->bl, 0, offset); + } else { + final.claim_append(o->bl); + size_t zlen = offset - final.length(); + final.append_zero(zlen); + } + } + final.append(buf, size); + if (offset + size < o->bl.length()) { + bufferlist rest; + rest.substr_of(o->bl, offset + size, o->bl.length() - offset - size); + final.claim_append(rest); + } + o->bl = final; + o->dirty = true; + return size; +} + +int os_flush(const char *path, struct fuse_file_info *fi) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (f < 0) + return f; + + std::lock_guard<std::mutex> l(fs->lock); + + FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh); + if (!o) + return 0; + if (!o->dirty) + return 0; + + ObjectStore::CollectionHandle ch = fs->store->open_collection(cid); + + ObjectStore::Transaction t; + + switch (f) { + case FN_OBJECT_DATA: + t.write(cid, oid, 0, o->bl.length(), o->bl); + break; + + case FN_OBJECT_ATTR_VAL: + t.setattr(cid, oid, key.c_str(), o->bl); + break; + + case FN_OBJECT_OMAP_VAL: + { + map<string,bufferlist> aset; + aset[key] = o->bl; + t.omap_setkeys(cid, oid, aset); + break; + } + + case FN_OBJECT_OMAP_HEADER: + t.omap_setheader(cid, oid, o->bl); + break; + + default: + return 0; + } + + fs->store->queue_transaction(ch, std::move(t)); + + return 0; +} + +static int os_unlink(const char *path) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (f < 0) + return f; + + std::lock_guard<std::mutex> l(fs->lock); + + ObjectStore::CollectionHandle ch = fs->store->open_collection(cid); + ObjectStore::Transaction t; + + switch (f) { + case FN_OBJECT_OMAP_VAL: + { + set<string> keys; + keys.insert(key); + t.omap_rmkeys(cid, oid, keys); + } + break; + + case FN_OBJECT_ATTR_VAL: + t.rmattr(cid, oid, key.c_str()); + break; + + case FN_OBJECT_OMAP_HEADER: + { + bufferlist empty; + t.omap_setheader(cid, oid, empty); + } + break; + + case FN_OBJECT: + t.remove(cid, oid); + break; + + case FN_COLLECTION: + { + bool empty; + int r = fs->store->collection_empty(ch, &empty); + if (r < 0) + return r; + if (!empty) + return -ENOTEMPTY; + t.remove_collection(cid); + } + break; + + case FN_OBJECT_DATA: + t.truncate(cid, oid, 0); + break; + + default: + return -EPERM; + } + + fs->store->queue_transaction(ch, std::move(t)); + + return 0; +} + +static int os_truncate(const char *path, off_t size +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , struct fuse_file_info *fi +#endif + ) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << 
path << " size " << size << dendl; + coll_t cid; + ghobject_t oid; + string key; + uint32_t hash_value, hash_bits; + int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value, + &hash_bits); + if (f < 0) + return f; + + if (f == FN_OBJECT_OMAP_VAL || + f == FN_OBJECT_ATTR_VAL || + f == FN_OBJECT_OMAP_HEADER) { + if (size) + return -EPERM; + return 0; + } + if (f != FN_OBJECT_DATA) + return -EPERM; + + std::lock_guard<std::mutex> l(fs->lock); + + if (fs->open_files.count(path)) { + FuseStore::OpenFile *o = fs->open_files[path]; + if (o->bl.length() > size) { + bufferlist t; + t.substr_of(o->bl, 0, size); + o->bl.swap(t); + } + } + + ObjectStore::CollectionHandle ch = fs->store->open_collection(cid); + ObjectStore::Transaction t; + t.truncate(cid, oid, size); + fs->store->queue_transaction(ch, std::move(t)); + return 0; +} + +static int os_statfs(const char *path, struct statvfs *stbuf) +{ + fuse_context *fc = fuse_get_context(); + FuseStore *fs = static_cast<FuseStore*>(fc->private_data); + ldout(fs->store->cct, 10) << __func__ << " " << path << dendl; + std::lock_guard<std::mutex> l(fs->lock); + + struct store_statfs_t s; + int r = fs->store->statfs(&s); + if (r < 0) + return r; + stbuf->f_bsize = 4096; // LIES! + stbuf->f_blocks = s.total / 4096; + stbuf->f_bavail = s.available / 4096; + stbuf->f_bfree = stbuf->f_bavail; + + ldout(fs->store->cct, 10) << __func__ << " " << path << ": " + << stbuf->f_bavail << "/" << stbuf->f_blocks << dendl; + return 0; +} + +static struct fuse_operations fs_oper = { + getattr: os_getattr, + readlink: 0, +#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0) + getdir: 0, +#endif + mknod: 0, + mkdir: os_mkdir, + unlink: os_unlink, + rmdir: os_unlink, + symlink: 0, + rename: 0, + link: 0, + chmod: os_chmod, + chown: 0, + truncate: os_truncate, +#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0) + utime: 0, +#endif + open: os_open, + read: os_read, + write: os_write, + statfs: os_statfs, + flush: os_flush, + release: os_release, + fsync: 0, + setxattr: 0, + getxattr: 0, + listxattr: 0, + removexattr: 0, + opendir: 0, + readdir: os_readdir, + releasedir: 0, + fsyncdir: 0, + init: 0, + destroy: 0, + access: 0, + create: os_create, +}; + +int FuseStore::main() +{ + const char *v[] = { + "foo", + mount_point.c_str(), + "-f", + "-d", // debug + }; + int c = 3; + auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug"); + if (fuse_debug) + ++c; + return fuse_main(c, (char**)v, &fs_oper, (void*)this); +} + +int FuseStore::start() +{ + dout(10) << __func__ << dendl; + + memset(&info->args, 0, sizeof(info->args)); + const char *v[] = { + "foo", + mount_point.c_str(), + "-f", // foreground + "-d", // debug + }; + int c = 3; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + int rc; + struct fuse_cmdline_opts opts = {}; +#endif + auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug"); + if (fuse_debug) + ++c; + fuse_args a = FUSE_ARGS_INIT(c, (char**)v); + info->args = a; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + if (fuse_parse_cmdline(&info->args, &opts) == -1) { +#else + if (fuse_parse_cmdline(&info->args, &info->mountpoint, NULL, NULL) == -1) { +#endif + derr << __func__ << " failed to parse args" << dendl; + return -EINVAL; + } + +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + info->mountpoint = opts.mountpoint; + info->f = fuse_new(&info->args, &fs_oper, sizeof(fs_oper), (void*)this); + if (!info->f) { + derr << __func__ << " fuse_new failed" << dendl; + return -EIO; + } + + rc = fuse_mount(info->f, info->mountpoint); + if (rc != 0) { + derr << __func__ << 
" fuse_mount failed" << dendl; + return -EIO; + } +#else + info->ch = fuse_mount(info->mountpoint, &info->args); + if (!info->ch) { + derr << __func__ << " fuse_mount failed" << dendl; + return -EIO; + } + + info->f = fuse_new(info->ch, &info->args, &fs_oper, sizeof(fs_oper), + (void*)this); + if (!info->f) { + fuse_unmount(info->mountpoint, info->ch); + derr << __func__ << " fuse_new failed" << dendl; + return -EIO; + } +#endif + + fuse_thread.create("fusestore"); + dout(10) << __func__ << " done" << dendl; + return 0; +} + +int FuseStore::loop() +{ + dout(10) << __func__ << " enter" << dendl; + int r = fuse_loop(info->f); + if (r) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + dout(10) << __func__ << " exit" << dendl; + return r; +} + +int FuseStore::stop() +{ + dout(10) << __func__ << " enter" << dendl; +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + fuse_unmount(info->f); +#else + fuse_unmount(info->mountpoint, info->ch); +#endif + fuse_thread.join(); + fuse_destroy(info->f); + dout(10) << __func__ << " exit" << dendl; + return 0; +} diff --git a/src/os/FuseStore.h b/src/os/FuseStore.h new file mode 100644 index 00000000..db39ca5e --- /dev/null +++ b/src/os/FuseStore.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_FUSESTORE_H +#define CEPH_OS_FUSESTORE_H + +#include <string> +#include <map> +#include <mutex> +#include <functional> + +#include "common/Thread.h" +#include "include/buffer.h" + +class ObjectStore; + +class FuseStore { +public: + ObjectStore *store; + std::string mount_point; + struct fs_info *info; + std::mutex lock; + + struct OpenFile { + std::string path; + bufferlist bl; + bool dirty = false; + int ref = 0; + }; + std::map<std::string,OpenFile*> open_files; + + int open_file(std::string p, struct fuse_file_info *fi, + std::function<int(bufferlist *bl)> f); + + class FuseThread : public Thread { + FuseStore *fs; + public: + explicit FuseThread(FuseStore *f) : fs(f) {} + void *entry() override { + fs->loop(); + return NULL; + } + } fuse_thread; + + FuseStore(ObjectStore *s, std::string p); + ~FuseStore(); + + int main(); + int start(); + int loop(); + int stop(); +}; + +#endif diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h new file mode 100644 index 00000000..f903333a --- /dev/null +++ b/src/os/ObjectMap.h @@ -0,0 +1,172 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef OS_KEYVALUESTORE_H +#define OS_KEYVALUESTORE_H + +#include <memory> +#include <string> +#include <vector> +#include "kv/KeyValueDB.h" +#include "common/hobject.h" + +class SequencerPosition; + +/** + * Encapsulates the FileStore key value store + * + * Implementations of this interface will be used to implement TMAP + */ +class ObjectMap { +public: + CephContext* cct; + boost::scoped_ptr<KeyValueDB> db; + /// Set keys and values from specified map + virtual int set_keys( + const ghobject_t &oid, ///< [in] object containing map + const map<string, bufferlist> &set, ///< [in] key to value map to set + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + /// Set header + virtual int set_header( + const ghobject_t &oid, ///< [in] object containing map + const bufferlist &bl, ///< [in] header to set + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + /// Retrieve header + virtual int get_header( + const ghobject_t &oid, ///< [in] object containing map + bufferlist *bl ///< [out] header to set + ) = 0; + + /// Clear all map keys and values from oid + virtual int clear( + const ghobject_t &oid, ///< [in] object containing map + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + /// Clear all map keys and values in to_clear from oid + virtual int rm_keys( + const ghobject_t &oid, ///< [in] object containing map + const set<string> &to_clear, ///< [in] Keys to clear + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + /// Clear all omap keys and the header + virtual int clear_keys_header( + const ghobject_t &oid, ///< [in] oid to clear + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + /// Get all keys and values + virtual int get( + const ghobject_t &oid, ///< [in] object containing map + bufferlist *header, ///< [out] Returned Header + map<string, bufferlist> *out ///< [out] Returned keys and values + ) = 0; + + /// Get values for supplied keys + virtual int get_keys( + const ghobject_t &oid, ///< [in] object containing map + set<string> *keys ///< [out] Keys defined on oid + ) = 0; + + /// Get values for supplied keys + virtual int get_values( + const ghobject_t &oid, ///< [in] object containing map + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) = 0; + + /// Check key existence + virtual int check_keys( + const ghobject_t &oid, ///< [in] object containing map + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) = 0; + + /// Get xattrs + virtual int get_xattrs( + const ghobject_t &oid, ///< [in] object + const set<string> &to_get, ///< [in] keys to get + map<string, bufferlist> *out ///< [out] subset of attrs/vals defined + ) = 0; + + /// Get all xattrs + virtual int get_all_xattrs( + const ghobject_t &oid, ///< [in] object + set<string> *out ///< [out] attrs and values + ) = 0; + + /// set xattrs in to_set + virtual int set_xattrs( + const ghobject_t &oid, ///< [in] object + const map<string, bufferlist> &to_set,///< [in] attrs/values to set + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + /// remove xattrs in to_remove + virtual int remove_xattrs( + const ghobject_t &oid, ///< [in] object + const set<string> &to_remove, ///< [in] attrs to remove + const SequencerPosition *spos=0 ///< [in] sequencer position + ) = 0; + + + /// Clone keys from oid map to target map + virtual int clone( + const ghobject_t 
&oid, ///< [in] object containing map + const ghobject_t &target, ///< [in] target of clone + const SequencerPosition *spos=0 ///< [in] sequencer position + ) { return 0; } + + /// Rename map because of name change + virtual int rename( + const ghobject_t &from, ///< [in] object containing map + const ghobject_t &to, ///< [in] new name + const SequencerPosition *spos=0 ///< [in] sequencer position + ) { return 0; } + + /// For testing clone keys from oid map to target map using faster but more complex method + virtual int legacy_clone( + const ghobject_t &oid, ///< [in] object containing map + const ghobject_t &target, ///< [in] target of clone + const SequencerPosition *spos=0 ///< [in] sequencer position + ) { return 0; } + + /// Ensure all previous writes are durable + virtual int sync( + const ghobject_t *oid=0, ///< [in] object + const SequencerPosition *spos=0 ///< [in] Sequencer + ) { return 0; } + + virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; } + + virtual void compact() {} + + typedef KeyValueDB::SimplestIteratorImpl ObjectMapIteratorImpl; + typedef std::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator; + virtual ObjectMapIterator get_iterator(const ghobject_t &oid) { + return ObjectMapIterator(); + } + + virtual KeyValueDB *get_db() { return nullptr; } + + ObjectMap(CephContext* cct, KeyValueDB *db) : cct(cct), db(db) {} + virtual ~ObjectMap() {} +}; + +#endif diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc new file mode 100644 index 00000000..f9c6073d --- /dev/null +++ b/src/os/ObjectStore.cc @@ -0,0 +1,160 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include <ctype.h> +#include <sstream> +#include "ObjectStore.h" +#include "common/Formatter.h" +#include "common/safe_io.h" + +#include "filestore/FileStore.h" +#include "memstore/MemStore.h" +#if defined(WITH_BLUESTORE) +#include "bluestore/BlueStore.h" +#endif +#include "kstore/KStore.h" + +void decode_str_str_map_to_bl(bufferlist::const_iterator& p, + bufferlist *out) +{ + auto start = p; + __u32 n; + decode(n, p); + unsigned len = 4; + while (n--) { + __u32 l; + decode(l, p); + p.advance(l); + len += 4 + l; + decode(l, p); + p.advance(l); + len += 4 + l; + } + start.copy(len, *out); +} + +void decode_str_set_to_bl(bufferlist::const_iterator& p, + bufferlist *out) +{ + auto start = p; + __u32 n; + decode(n, p); + unsigned len = 4; + while (n--) { + __u32 l; + decode(l, p); + p.advance(l); + len += 4 + l; + } + start.copy(len, *out); +} + +ObjectStore *ObjectStore::create(CephContext *cct, + const string& type, + const string& data, + const string& journal, + osflagbits_t flags) +{ + if (type == "filestore") { + return new FileStore(cct, data, journal, flags); + } + if (type == "memstore") { + return new MemStore(cct, data); + } +#if defined(WITH_BLUESTORE) + if (type == "bluestore") { + return new BlueStore(cct, data); + } + if (type == "random") { + if (rand() % 2) { + return new FileStore(cct, data, journal, flags); + } else { + return new BlueStore(cct, data); + } + } +#else + if (type == "random") { + return new FileStore(cct, data, journal, flags); + } +#endif + if (type == "kstore" && + cct->check_experimental_feature_enabled("kstore")) { + return new KStore(cct, data); + } + return NULL; +} + +int ObjectStore::probe_block_device_fsid( + CephContext *cct, + const string& path, + uuid_d *fsid) +{ + int r; + +#if defined(WITH_BLUESTORE) + // first try bluestore -- it has a crc on its header and will fail + // reliably. + r = BlueStore::get_block_device_fsid(cct, path, fsid); + if (r == 0) { + lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, " + << *fsid << dendl; + return r; + } +#endif + + // okay, try FileStore (journal). 
+ r = FileStore::get_block_device_fsid(cct, path, fsid); + if (r == 0) { + lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, " + << *fsid << dendl; + return r; + } + + return -EINVAL; +} + +int ObjectStore::write_meta(const std::string& key, + const std::string& value) +{ + string v = value; + v += "\n"; + int r = safe_write_file(path.c_str(), key.c_str(), + v.c_str(), v.length(), 0600); + if (r < 0) + return r; + return 0; +} + +int ObjectStore::read_meta(const std::string& key, + std::string *value) +{ + char buf[4096]; + int r = safe_read_file(path.c_str(), key.c_str(), + buf, sizeof(buf)); + if (r <= 0) + return r; + // drop trailing newlines + while (r && isspace(buf[r-1])) { + --r; + } + *value = string(buf, r); + return 0; +} + + + + +ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx) { + + return out << "Transaction(" << &tx << ")"; +} diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h new file mode 100644 index 00000000..1c120689 --- /dev/null +++ b/src/os/ObjectStore.h @@ -0,0 +1,1925 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_OBJECTSTORE_H +#define CEPH_OBJECTSTORE_H + +#include "include/Context.h" +#include "include/buffer.h" +#include "include/types.h" +#include "include/stringify.h" +#include "osd/osd_types.h" +#include "common/TrackedOp.h" +#include "common/WorkQueue.h" +#include "ObjectMap.h" + +#include <errno.h> +#include <sys/stat.h> +#include <vector> +#include <map> + +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) +#include <sys/statvfs.h> +#else +#include <sys/vfs.h> /* or <sys/statfs.h> */ +#endif + +#define OPS_PER_PTR 32 + +class CephContext; + +using std::vector; +using std::string; +using std::map; + +namespace ceph { + class Formatter; +} + +/* + * low-level interface to the local OSD file system + */ + +class Logger; +class ContextQueue; + +static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl) { + encode(*attrset, bl); +} + +// this isn't the best place for these, but... +void decode_str_str_map_to_bl(bufferlist::const_iterator& p, bufferlist *out); +void decode_str_set_to_bl(bufferlist::const_iterator& p, bufferlist *out); + +// Flag bits +typedef uint32_t osflagbits_t; +const int SKIP_JOURNAL_REPLAY = 1 << 0; +const int SKIP_MOUNT_OMAP = 1 << 1; + +class ObjectStore { +protected: + string path; + +public: + CephContext* cct; + /** + * create - create an ObjectStore instance. + * + * This is invoked once at initialization time. + * + * @param type type of store. This is a string from the configuration file. 
+ * @param data path (or other descriptor) for data + * @param journal path (or other descriptor) for journal (optional) + * @param flags which filestores should check if applicable + */ + static ObjectStore *create(CephContext *cct, + const string& type, + const string& data, + const string& journal, + osflagbits_t flags = 0); + + /** + * probe a block device to learn the uuid of the owning OSD + * + * @param cct cct + * @param path path to device + * @param fsid [out] osd uuid + */ + static int probe_block_device_fsid( + CephContext *cct, + const string& path, + uuid_d *fsid); + + /** + * Fetch Object Store statistics. + * + * Currently only latency of write and apply times are measured. + * + * This appears to be called with nothing locked. + */ + virtual objectstore_perf_stat_t get_cur_stats() = 0; + + /** + * Fetch Object Store performance counters. + * + * + * This appears to be called with nothing locked. + */ + virtual const PerfCounters* get_perf_counters() const = 0; + + /** + * a collection also orders transactions + * + * Any transactions queued under a given collection will be applied in + * sequence. Transactions queued under different collections may run + * in parallel. + * + * ObjectStore users my get collection handles with open_collection() (or, + * for bootstrapping a new collection, create_new_collection()). + */ + struct CollectionImpl : public RefCountedObject { + const coll_t cid; + + CollectionImpl(const coll_t& c) + : RefCountedObject(NULL, 0), + cid(c) {} + + /// wait for any queued transactions to apply + // block until any previous transactions are visible. specifically, + // collection_list and collection_empty need to reflect prior operations. + virtual void flush() = 0; + + /** + * Async flush_commit + * + * There are two cases: + * 1) collection is currently idle: the method returns true. c is + * not touched. + * 2) collection is not idle: the method returns false and c is + * called asynchronously with a value of 0 once all transactions + * queued on this collection prior to the call have been applied + * and committed. + */ + virtual bool flush_commit(Context *c) = 0; + + const coll_t &get_cid() { + return cid; + } + }; + typedef boost::intrusive_ptr<CollectionImpl> CollectionHandle; + + + /********************************* + * + * Object Contents and semantics + * + * All ObjectStore objects are identified as a named object + * (ghobject_t and hobject_t) in a named collection (coll_t). + * ObjectStore operations support the creation, mutation, deletion + * and enumeration of objects within a collection. Enumeration is + * in sorted key order (where keys are sorted by hash). Object names + * are globally unique. + * + * Each object has four distinct parts: byte data, xattrs, omap_header + * and omap entries. + * + * The data portion of an object is conceptually equivalent to a + * file in a file system. Random and Partial access for both read + * and write operations is required. The ability to have a sparse + * implementation of the data portion of an object is beneficial for + * some workloads, but not required. There is a system-wide limit on + * the maximum size of an object, which is typically around 100 MB. + * + * Xattrs are equivalent to the extended attributes of file + * systems. Xattrs are a set of key/value pairs. Sub-value access + * is not required. It is possible to enumerate the set of xattrs in + * key order. 
At the implementation level, xattrs are used + * exclusively internal to Ceph and the implementer can expect the + * total size of all of the xattrs on an object to be relatively + * small, i.e., less than 64KB. Much of Ceph assumes that accessing + * xattrs on temporally adjacent object accesses (recent past or + * near future) is inexpensive. + * + * omap_header is a single blob of data. It can be read or written + * in total. + * + * Omap entries are conceptually the same as xattrs + * but in a different address space. In other words, you can have + * the same key as an xattr and an omap entry and they have distinct + * values. Enumeration of xattrs doesn't include omap entries and + * vice versa. The size and access characteristics of omap entries + * are very different from xattrs. In particular, the value portion + * of an omap entry can be quite large (MBs). More importantly, the + * interface must support efficient range queries on omap entries even + * when there are a large numbers of entries. + * + *********************************/ + + /******************************* + * + * Collections + * + * A collection is simply a grouping of objects. Collections have + * names (coll_t) and can be enumerated in order. Like an + * individual object, a collection also has a set of xattrs. + * + * + */ + + + /********************************* + * transaction + * + * A Transaction represents a sequence of primitive mutation + * operations. + * + * Three events in the life of a Transaction result in + * callbacks. Any Transaction can contain any number of callback + * objects (Context) for any combination of the three classes of + * callbacks: + * + * on_applied_sync, on_applied, and on_commit. + * + * The "on_applied" and "on_applied_sync" callbacks are invoked when + * the modifications requested by the Transaction are visible to + * subsequent ObjectStore operations, i.e., the results are + * readable. The only conceptual difference between on_applied and + * on_applied_sync is the specific thread and locking environment in + * which the callbacks operate. "on_applied_sync" is called + * directly by an ObjectStore execution thread. It is expected to + * execute quickly and must not acquire any locks of the calling + * environment. Conversely, "on_applied" is called from the separate + * Finisher thread, meaning that it can contend for calling + * environment locks. NB, on_applied and on_applied_sync are + * sometimes called on_readable and on_readable_sync. + * + * The "on_commit" callback is also called from the Finisher thread + * and indicates that all of the mutations have been durably + * committed to stable storage (i.e., are now software/hardware + * crashproof). + * + * At the implementation level, each mutation primitive (and its + * associated data) can be serialized to a single buffer. That + * serialization, however, does not copy any data, but (using the + * bufferlist library) will reference the original buffers. This + * implies that the buffer that contains the data being submitted + * must remain stable until the on_commit callback completes. In + * practice, bufferlist handles all of this for you and this + * subtlety is only relevant if you are referencing an existing + * buffer via buffer::raw_static. + * + * Some implementations of ObjectStore choose to implement their own + * form of journaling that uses the serialized form of a + * Transaction. 
This requires that the encode/decode logic properly + * version itself and handle version upgrades that might change the + * format of the encoded Transaction. This has already happened a + * couple of times and the Transaction object contains some helper + * variables that aid in this legacy decoding: + * + * sobject_encoding detects an older/simpler version of oid + * present in pre-bobtail versions of ceph. use_pool_override + * also detects a situation where the pool of an oid can be + * overridden for legacy operations/buffers. For non-legacy + * implementations of ObjectStore, neither of these fields are + * relevant. + * + * + * TRANSACTION ISOLATION + * + * Except as noted above, isolation is the responsibility of the + * caller. In other words, if any storage element (storage element + * == any of the four portions of an object as described above) is + * altered by a transaction (including deletion), the caller + * promises not to attempt to read that element while the + * transaction is pending (here pending means from the time of + * issuance until the "on_applied_sync" callback has been + * received). Violations of isolation need not be detected by + * ObjectStore and there is no corresponding error mechanism for + * reporting an isolation violation (crashing would be the + * appropriate way to report an isolation violation if detected). + * + * Enumeration operations may violate transaction isolation as + * described above when a storage element is being created or + * deleted as part of a transaction. In this case, ObjectStore is + * allowed to consider the enumeration operation to either precede + * or follow the violating transaction element. In other words, the + * presence/absence of the mutated element in the enumeration is + * entirely at the discretion of ObjectStore. The arbitrary ordering + * applies independently to each transaction element. For example, + * if a transaction contains two mutating elements "create A" and + * "delete B". And an enumeration operation is performed while this + * transaction is pending. It is permissible for ObjectStore to + * report any of the four possible combinations of the existence of + * A and B. 
+ * + */ + class Transaction { + public: + enum { + OP_NOP = 0, + OP_TOUCH = 9, // cid, oid + OP_WRITE = 10, // cid, oid, offset, len, bl + OP_ZERO = 11, // cid, oid, offset, len + OP_TRUNCATE = 12, // cid, oid, len + OP_REMOVE = 13, // cid, oid + OP_SETATTR = 14, // cid, oid, attrname, bl + OP_SETATTRS = 15, // cid, oid, attrset + OP_RMATTR = 16, // cid, oid, attrname + OP_CLONE = 17, // cid, oid, newoid + OP_CLONERANGE = 18, // cid, oid, newoid, offset, len + OP_CLONERANGE2 = 30, // cid, oid, newoid, srcoff, len, dstoff + + OP_TRIMCACHE = 19, // cid, oid, offset, len **DEPRECATED** + + OP_MKCOLL = 20, // cid + OP_RMCOLL = 21, // cid + OP_COLL_ADD = 22, // cid, oldcid, oid + OP_COLL_REMOVE = 23, // cid, oid + OP_COLL_SETATTR = 24, // cid, attrname, bl + OP_COLL_RMATTR = 25, // cid, attrname + OP_COLL_SETATTRS = 26, // cid, attrset + OP_COLL_MOVE = 8, // newcid, oldcid, oid + + OP_RMATTRS = 28, // cid, oid + OP_COLL_RENAME = 29, // cid, newcid + + OP_OMAP_CLEAR = 31, // cid + OP_OMAP_SETKEYS = 32, // cid, attrset + OP_OMAP_RMKEYS = 33, // cid, keyset + OP_OMAP_SETHEADER = 34, // cid, header + OP_SPLIT_COLLECTION = 35, // cid, bits, destination + OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination + doesn't create the destination */ + OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey + OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid + + OP_SETALLOCHINT = 39, // cid, oid, object_size, write_size + OP_COLL_HINT = 40, // cid, type, bl + + OP_TRY_RENAME = 41, // oldcid, oldoid, newoid + + OP_COLL_SET_BITS = 42, // cid, bits + + OP_MERGE_COLLECTION = 43, // cid, destination + }; + + // Transaction hint type + enum { + COLL_HINT_EXPECTED_NUM_OBJECTS = 1, + }; + + struct Op { + ceph_le32 op; + ceph_le32 cid; + ceph_le32 oid; + ceph_le64 off; + ceph_le64 len; + ceph_le32 dest_cid; + ceph_le32 dest_oid; //OP_CLONE, OP_CLONERANGE + ceph_le64 dest_off; //OP_CLONERANGE + union { + struct { + ceph_le32 hint_type; //OP_COLL_HINT + }; + struct { + ceph_le32 alloc_hint_flags; //OP_SETALLOCHINT + }; + }; + ceph_le64 expected_object_size; //OP_SETALLOCHINT + ceph_le64 expected_write_size; //OP_SETALLOCHINT + ceph_le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS, + //OP_MKCOLL + ceph_le32 split_rem; //OP_SPLIT_COLLECTION2 + } __attribute__ ((packed)) ; + + struct TransactionData { + ceph_le64 ops; + ceph_le32 largest_data_len; + ceph_le32 largest_data_off; + ceph_le32 largest_data_off_in_data_bl; + ceph_le32 fadvise_flags; + + TransactionData() noexcept : + ops(init_le64(0)), + largest_data_len(init_le32(0)), + largest_data_off(init_le32(0)), + largest_data_off_in_data_bl(init_le32(0)), + fadvise_flags(init_le32(0)) { } + + // override default move operations to reset default values + TransactionData(TransactionData&& other) noexcept : + ops(other.ops), + largest_data_len(other.largest_data_len), + largest_data_off(other.largest_data_off), + largest_data_off_in_data_bl(other.largest_data_off_in_data_bl), + fadvise_flags(other.fadvise_flags) { + other.ops = 0; + other.largest_data_len = 0; + other.largest_data_off = 0; + other.largest_data_off_in_data_bl = 0; + other.fadvise_flags = 0; + } + TransactionData& operator=(TransactionData&& other) noexcept { + ops = other.ops; + largest_data_len = other.largest_data_len; + largest_data_off = other.largest_data_off; + largest_data_off_in_data_bl = other.largest_data_off_in_data_bl; + fadvise_flags = other.fadvise_flags; + other.ops = 0; + other.largest_data_len = 0; + other.largest_data_off = 0; + other.largest_data_off_in_data_bl = 0; + 
other.fadvise_flags = 0; + return *this; + } + + TransactionData(const TransactionData& other) = default; + TransactionData& operator=(const TransactionData& other) = default; + + void encode(bufferlist& bl) const { + bl.append((char*)this, sizeof(TransactionData)); + } + void decode(bufferlist::const_iterator &bl) { + bl.copy(sizeof(TransactionData), (char*)this); + } + } __attribute__ ((packed)) ; + + private: + TransactionData data; + + map<coll_t, __le32> coll_index; + map<ghobject_t, __le32> object_index; + + __le32 coll_id {0}; + __le32 object_id {0}; + + bufferlist data_bl; + bufferlist op_bl; + + list<Context *> on_applied; + list<Context *> on_commit; + list<Context *> on_applied_sync; + + public: + Transaction() = default; + + explicit Transaction(bufferlist::const_iterator &dp) { + decode(dp); + } + explicit Transaction(bufferlist &nbl) { + auto dp = nbl.cbegin(); + decode(dp); + } + + // override default move operations to reset default values + Transaction(Transaction&& other) noexcept : + data(std::move(other.data)), + coll_index(std::move(other.coll_index)), + object_index(std::move(other.object_index)), + coll_id(other.coll_id), + object_id(other.object_id), + data_bl(std::move(other.data_bl)), + op_bl(std::move(other.op_bl)), + on_applied(std::move(other.on_applied)), + on_commit(std::move(other.on_commit)), + on_applied_sync(std::move(other.on_applied_sync)) { + other.coll_id = 0; + other.object_id = 0; + } + + Transaction& operator=(Transaction&& other) noexcept { + data = std::move(other.data); + coll_index = std::move(other.coll_index); + object_index = std::move(other.object_index); + coll_id = other.coll_id; + object_id = other.object_id; + data_bl = std::move(other.data_bl); + op_bl = std::move(other.op_bl); + on_applied = std::move(other.on_applied); + on_commit = std::move(other.on_commit); + on_applied_sync = std::move(other.on_applied_sync); + other.coll_id = 0; + other.object_id = 0; + return *this; + } + + Transaction(const Transaction& other) = default; + Transaction& operator=(const Transaction& other) = default; + + // expose object_index for FileStore::Op's benefit + const map<ghobject_t, __le32>& get_object_index() const { + return object_index; + } + + /* Operations on callback contexts */ + void register_on_applied(Context *c) { + if (!c) return; + on_applied.push_back(c); + } + void register_on_commit(Context *c) { + if (!c) return; + on_commit.push_back(c); + } + void register_on_applied_sync(Context *c) { + if (!c) return; + on_applied_sync.push_back(c); + } + void register_on_complete(Context *c) { + if (!c) return; + RunOnDeleteRef _complete (std::make_shared<RunOnDelete>(c)); + register_on_applied(new ContainerContext<RunOnDeleteRef>(_complete)); + register_on_commit(new ContainerContext<RunOnDeleteRef>(_complete)); + } + bool has_contexts() const { + return + !on_commit.empty() || + !on_applied.empty() || + !on_applied_sync.empty(); + } + + static void collect_contexts( + vector<Transaction>& t, + Context **out_on_applied, + Context **out_on_commit, + Context **out_on_applied_sync) { + ceph_assert(out_on_applied); + ceph_assert(out_on_commit); + ceph_assert(out_on_applied_sync); + list<Context *> on_applied, on_commit, on_applied_sync; + for (auto& i : t) { + on_applied.splice(on_applied.end(), i.on_applied); + on_commit.splice(on_commit.end(), i.on_commit); + on_applied_sync.splice(on_applied_sync.end(), i.on_applied_sync); + } + *out_on_applied = C_Contexts::list_to_context(on_applied); + *out_on_commit = C_Contexts::list_to_context(on_commit); 
+ *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync); + } + static void collect_contexts( + vector<Transaction>& t, + list<Context*> *out_on_applied, + list<Context*> *out_on_commit, + list<Context*> *out_on_applied_sync) { + ceph_assert(out_on_applied); + ceph_assert(out_on_commit); + ceph_assert(out_on_applied_sync); + for (auto& i : t) { + out_on_applied->splice(out_on_applied->end(), i.on_applied); + out_on_commit->splice(out_on_commit->end(), i.on_commit); + out_on_applied_sync->splice(out_on_applied_sync->end(), + i.on_applied_sync); + } + } + + Context *get_on_applied() { + return C_Contexts::list_to_context(on_applied); + } + Context *get_on_commit() { + return C_Contexts::list_to_context(on_commit); + } + Context *get_on_applied_sync() { + return C_Contexts::list_to_context(on_applied_sync); + } + + void set_fadvise_flags(uint32_t flags) { + data.fadvise_flags = flags; + } + void set_fadvise_flag(uint32_t flag) { + data.fadvise_flags = data.fadvise_flags | flag; + } + uint32_t get_fadvise_flags() { return data.fadvise_flags; } + + void swap(Transaction& other) noexcept { + std::swap(data, other.data); + std::swap(on_applied, other.on_applied); + std::swap(on_commit, other.on_commit); + std::swap(on_applied_sync, other.on_applied_sync); + + std::swap(coll_index, other.coll_index); + std::swap(object_index, other.object_index); + std::swap(coll_id, other.coll_id); + std::swap(object_id, other.object_id); + op_bl.swap(other.op_bl); + data_bl.swap(other.data_bl); + } + + void _update_op(Op* op, + vector<__le32> &cm, + vector<__le32> &om) { + + switch (op->op) { + case OP_NOP: + break; + + case OP_TOUCH: + case OP_REMOVE: + case OP_SETATTR: + case OP_SETATTRS: + case OP_RMATTR: + case OP_RMATTRS: + case OP_COLL_REMOVE: + case OP_OMAP_CLEAR: + case OP_OMAP_SETKEYS: + case OP_OMAP_RMKEYS: + case OP_OMAP_RMKEYRANGE: + case OP_OMAP_SETHEADER: + case OP_WRITE: + case OP_ZERO: + case OP_TRUNCATE: + case OP_SETALLOCHINT: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + break; + + case OP_CLONERANGE2: + case OP_CLONE: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + op->dest_oid = om[op->dest_oid]; + break; + + case OP_MKCOLL: + case OP_RMCOLL: + case OP_COLL_SETATTR: + case OP_COLL_RMATTR: + case OP_COLL_SETATTRS: + case OP_COLL_HINT: + case OP_COLL_SET_BITS: + ceph_assert(op->cid < cm.size()); + op->cid = cm[op->cid]; + break; + + case OP_COLL_ADD: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_cid < om.size()); + op->cid = cm[op->cid]; + op->dest_cid = cm[op->dest_cid]; + op->oid = om[op->oid]; + break; + + case OP_COLL_MOVE_RENAME: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_cid < cm.size()); + ceph_assert(op->dest_oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + op->dest_cid = cm[op->dest_cid]; + op->dest_oid = om[op->dest_oid]; + break; + + case OP_TRY_RENAME: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->oid < om.size()); + ceph_assert(op->dest_oid < om.size()); + op->cid = cm[op->cid]; + op->oid = om[op->oid]; + op->dest_oid = om[op->dest_oid]; + break; + + case OP_SPLIT_COLLECTION2: + ceph_assert(op->cid < cm.size()); + ceph_assert(op->dest_cid < cm.size()); + op->cid = cm[op->cid]; + op->dest_cid = cm[op->dest_cid]; + break; + + case OP_MERGE_COLLECTION: 
+ ceph_assert(op->cid < cm.size()); + ceph_assert(op->dest_cid < cm.size()); + op->cid = cm[op->cid]; + op->dest_cid = cm[op->dest_cid]; + break; + + default: + ceph_abort_msg("Unknown OP"); + } + } + void _update_op_bl( + bufferlist& bl, + vector<__le32> &cm, + vector<__le32> &om) { + for (auto& bp : bl.buffers()) { + ceph_assert(bp.length() % sizeof(Op) == 0); + + char* raw_p = const_cast<char*>(bp.c_str()); + char* raw_end = raw_p + bp.length(); + while (raw_p < raw_end) { + _update_op(reinterpret_cast<Op*>(raw_p), cm, om); + raw_p += sizeof(Op); + } + } + } + /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction + void append(Transaction& other) { + + data.ops = data.ops + other.data.ops; + if (other.data.largest_data_len > data.largest_data_len) { + data.largest_data_len = other.data.largest_data_len; + data.largest_data_off = other.data.largest_data_off; + data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl; + } + data.fadvise_flags = data.fadvise_flags | other.data.fadvise_flags; + on_applied.splice(on_applied.end(), other.on_applied); + on_commit.splice(on_commit.end(), other.on_commit); + on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync); + + //append coll_index & object_index + vector<__le32> cm(other.coll_index.size()); + map<coll_t, __le32>::iterator coll_index_p; + for (coll_index_p = other.coll_index.begin(); + coll_index_p != other.coll_index.end(); + ++coll_index_p) { + cm[coll_index_p->second] = _get_coll_id(coll_index_p->first); + } + + vector<__le32> om(other.object_index.size()); + map<ghobject_t, __le32>::iterator object_index_p; + for (object_index_p = other.object_index.begin(); + object_index_p != other.object_index.end(); + ++object_index_p) { + om[object_index_p->second] = _get_object_id(object_index_p->first); + } + + //the other.op_bl SHOULD NOT be changes during append operation, + //we use additional bufferlist to avoid this problem + bufferlist other_op_bl; + { + bufferptr other_op_bl_ptr(other.op_bl.length()); + other.op_bl.copy(0, other.op_bl.length(), other_op_bl_ptr.c_str()); + other_op_bl.append(std::move(other_op_bl_ptr)); + } + + //update other_op_bl with cm & om + //When the other is appended to current transaction, all coll_index and + //object_index in other.op_buffer should be updated by new index of the + //combined transaction + _update_op_bl(other_op_bl, cm, om); + + //append op_bl + op_bl.append(other_op_bl); + //append data_bl + data_bl.append(other.data_bl); + } + + /** Inquires about the Transaction as a whole. */ + + /// How big is the encoded Transaction buffer? 
+ uint64_t get_encoded_bytes() { + //layout: data_bl + op_bl + coll_index + object_index + data + + // coll_index size, object_index size and sizeof(transaction_data) + // all here, so they may be computed at compile-time + size_t final_size = sizeof(__u32) * 2 + sizeof(data); + + // coll_index second and object_index second + final_size += (coll_index.size() + object_index.size()) * sizeof(__le32); + + // coll_index first + for (auto p = coll_index.begin(); p != coll_index.end(); ++p) { + final_size += p->first.encoded_size(); + } + + // object_index first + for (auto p = object_index.begin(); p != object_index.end(); ++p) { + final_size += p->first.encoded_size(); + } + + return data_bl.length() + + op_bl.length() + + final_size; + } + + /// Retain old version for regression testing purposes + uint64_t get_encoded_bytes_test() { + using ceph::encode; + //layout: data_bl + op_bl + coll_index + object_index + data + bufferlist bl; + encode(coll_index, bl); + encode(object_index, bl); + + return data_bl.length() + + op_bl.length() + + bl.length() + + sizeof(data); + } + + uint64_t get_num_bytes() { + return get_encoded_bytes(); + } + /// Size of largest data buffer to the "write" operation encountered so far + uint32_t get_data_length() { + return data.largest_data_len; + } + /// offset within the encoded buffer to the start of the largest data buffer that's encoded + uint32_t get_data_offset() { + if (data.largest_data_off_in_data_bl) { + return data.largest_data_off_in_data_bl + + sizeof(__u8) + // encode struct_v + sizeof(__u8) + // encode compat_v + sizeof(__u32) + // encode len + sizeof(__u32); // data_bl len + } + return 0; // none + } + /// offset of buffer as aligned to destination within object. + int get_data_alignment() { + if (!data.largest_data_len) + return 0; + return (0 - get_data_offset()) & ~CEPH_PAGE_MASK; + } + /// Is the Transaction empty (no operations) + bool empty() { + return !data.ops; + } + /// Number of operations in the transaction + int get_num_ops() { + return data.ops; + } + + /** + * iterator + * + * Helper object to parse Transactions. + * + * ObjectStore instances use this object to step down the encoded + * buffer decoding operation codes and parameters as we go. 
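+ *
+ * A minimal decode loop, shown here only as an illustrative sketch
+ * (it mirrors the pattern used by Transaction::dump() in
+ * Transaction.cc); `t` is assumed to be a populated Transaction and
+ * only the OP_WRITE branch is spelled out:
+ *
+ *   ObjectStore::Transaction::iterator i = t.begin();
+ *   while (i.have_op()) {
+ *     ObjectStore::Transaction::Op *op = i.decode_op();
+ *     switch (op->op) {
+ *     case ObjectStore::Transaction::OP_WRITE: {
+ *       coll_t cid = i.get_cid(op->cid);
+ *       ghobject_t oid = i.get_oid(op->oid);
+ *       bufferlist bl;
+ *       i.decode_bl(bl);  // data payload follows the fixed-size Op record
+ *       break;
+ *     }
+ *     default:
+ *       break;  // each other op type decodes its own payload analogously
+ *     }
+ *   }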
+ * + */ + class iterator { + Transaction *t; + + uint64_t ops; + char* op_buffer_p; + + bufferlist::const_iterator data_bl_p; + + public: + vector<coll_t> colls; + vector<ghobject_t> objects; + + private: + explicit iterator(Transaction *t) + : t(t), + data_bl_p(t->data_bl.cbegin()), + colls(t->coll_index.size()), + objects(t->object_index.size()) { + + ops = t->data.ops; + op_buffer_p = t->op_bl.c_str(); + + map<coll_t, __le32>::iterator coll_index_p; + for (coll_index_p = t->coll_index.begin(); + coll_index_p != t->coll_index.end(); + ++coll_index_p) { + colls[coll_index_p->second] = coll_index_p->first; + } + + map<ghobject_t, __le32>::iterator object_index_p; + for (object_index_p = t->object_index.begin(); + object_index_p != t->object_index.end(); + ++object_index_p) { + objects[object_index_p->second] = object_index_p->first; + } + } + + friend class Transaction; + + public: + + bool have_op() { + return ops > 0; + } + Op* decode_op() { + ceph_assert(ops > 0); + + Op* op = reinterpret_cast<Op*>(op_buffer_p); + op_buffer_p += sizeof(Op); + ops--; + + return op; + } + string decode_string() { + using ceph::decode; + string s; + decode(s, data_bl_p); + return s; + } + void decode_bp(bufferptr& bp) { + using ceph::decode; + decode(bp, data_bl_p); + } + void decode_bl(bufferlist& bl) { + using ceph::decode; + decode(bl, data_bl_p); + } + void decode_attrset(map<string,bufferptr>& aset) { + using ceph::decode; + decode(aset, data_bl_p); + } + void decode_attrset(map<string,bufferlist>& aset) { + using ceph::decode; + decode(aset, data_bl_p); + } + void decode_attrset_bl(bufferlist *pbl) { + decode_str_str_map_to_bl(data_bl_p, pbl); + } + void decode_keyset(set<string> &keys){ + using ceph::decode; + decode(keys, data_bl_p); + } + void decode_keyset_bl(bufferlist *pbl){ + decode_str_set_to_bl(data_bl_p, pbl); + } + + const ghobject_t &get_oid(__le32 oid_id) { + ceph_assert(oid_id < objects.size()); + return objects[oid_id]; + } + const coll_t &get_cid(__le32 cid_id) { + ceph_assert(cid_id < colls.size()); + return colls[cid_id]; + } + uint32_t get_fadvise_flags() const { + return t->get_fadvise_flags(); + } + }; + + iterator begin() { + return iterator(this); + } + +private: + void _build_actions_from_tbl(); + + /** + * Helper functions to encode the various mutation elements of a + * transaction. These are 1:1 with the operation codes (see + * enumeration above). These routines ensure that the + * encoder/creator of a transaction gets the right data in the + * right place. Sadly, there's no corresponding version nor any + * form of seat belts for the decoder. + */ + Op* _get_next_op() { + if (op_bl.get_append_buffer_unused_tail_length() < sizeof(Op)) { + op_bl.reserve(sizeof(Op) * OPS_PER_PTR); + } + // append_hole ensures bptr merging. Even huge number of ops + // shouldn't result in overpopulating bl::_buffers. + char* const p = op_bl.append_hole(sizeof(Op)).c_str(); + memset(p, 0, sizeof(Op)); + return reinterpret_cast<Op*>(p); + } + __le32 _get_coll_id(const coll_t& coll) { + map<coll_t, __le32>::iterator c = coll_index.find(coll); + if (c != coll_index.end()) + return c->second; + + __le32 index_id = coll_id++; + coll_index[coll] = index_id; + return index_id; + } + __le32 _get_object_id(const ghobject_t& oid) { + map<ghobject_t, __le32>::iterator o = object_index.find(oid); + if (o != object_index.end()) + return o->second; + + __le32 index_id = object_id++; + object_index[oid] = index_id; + return index_id; + } + +public: + /// noop. 
'nuf said + void nop() { + Op* _op = _get_next_op(); + _op->op = OP_NOP; + data.ops = data.ops + 1; + } + /** + * touch + * + * Ensure the existance of an object in a collection. Create an + * empty object if necessary + */ + void touch(const coll_t& cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_TOUCH; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /** + * Write data to an offset within an object. If the object is too + * small, it is expanded as needed. It is possible to specify an + * offset beyond the current end of an object and it will be + * expanded as needed. Simple implementations of ObjectStore will + * just zero the data between the old end of the object and the + * newly provided data. More sophisticated implementations of + * ObjectStore will omit the untouched data and store it as a + * "hole" in the file. + * + * Note that a 0-length write does not affect the size of the object. + */ + void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len, + const bufferlist& write_data, uint32_t flags = 0) { + using ceph::encode; + uint32_t orig_len = data_bl.length(); + Op* _op = _get_next_op(); + _op->op = OP_WRITE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->off = off; + _op->len = len; + encode(write_data, data_bl); + + ceph_assert(len == write_data.length()); + data.fadvise_flags = data.fadvise_flags | flags; + if (write_data.length() > data.largest_data_len) { + data.largest_data_len = write_data.length(); + data.largest_data_off = off; + data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to + } + data.ops = data.ops + 1; + } + /** + * zero out the indicated byte range within an object. Some + * ObjectStore instances may optimize this to release the + * underlying storage space. + * + * If the zero range extends beyond the end of the object, the object + * size is extended, just as if we were writing a buffer full of zeros. + * EXCEPT if the length is 0, in which case (just like a 0-length write) + * we do not adjust the object size. + */ + void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) { + Op* _op = _get_next_op(); + _op->op = OP_ZERO; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->off = off; + _op->len = len; + data.ops = data.ops + 1; + } + /// Discard all data in the object beyond the specified size. + void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) { + Op* _op = _get_next_op(); + _op->op = OP_TRUNCATE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->off = off; + data.ops = data.ops + 1; + } + /// Remove an object. All four parts of the object are removed. 
+ void remove(const coll_t& cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_REMOVE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /// Set an xattr of an object + void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, bufferlist& val) { + string n(name); + setattr(cid, oid, n, val); + } + /// Set an xattr of an object + void setattr(const coll_t& cid, const ghobject_t& oid, const string& s, bufferlist& val) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_SETATTR; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(s, data_bl); + encode(val, data_bl); + data.ops = data.ops + 1; + } + /// Set multiple xattrs of an object + void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferptr>& attrset) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_SETATTRS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(attrset, data_bl); + data.ops = data.ops + 1; + } + /// Set multiple xattrs of an object + void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferlist>& attrset) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_SETATTRS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(attrset, data_bl); + data.ops = data.ops + 1; + } + /// remove an xattr from an object + void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) { + string n(name); + rmattr(cid, oid, n); + } + /// remove an xattr from an object + void rmattr(const coll_t& cid, const ghobject_t& oid, const string& s) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_RMATTR; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(s, data_bl); + data.ops = data.ops + 1; + } + /// remove all xattrs from an object + void rmattrs(const coll_t& cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_RMATTRS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /** + * Clone an object into another object. + * + * Low-cost (e.g., O(1)) cloning (if supported) is best, but + * fallback to an O(n) copy is allowed. All four parts of the + * object are cloned (data, xattrs, omap header, omap + * entries). + * + * The destination named object may already exist, in + * which case its previous contents are discarded. + */ + void clone(const coll_t& cid, const ghobject_t& oid, + const ghobject_t& noid) { + Op* _op = _get_next_op(); + _op->op = OP_CLONE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->dest_oid = _get_object_id(noid); + data.ops = data.ops + 1; + } + /** + * Clone a byte range from one object to another. + * + * The data portion of the destination object receives a copy of a + * portion of the data from the source object. None of the other + * three parts of an object is copied from the source. + * + * The destination object size may be extended to the dstoff + len. + * + * The source range *must* overlap with the source object data. If it does + * not the result is undefined. 
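+ *
+ * Illustrative call (mirroring generate_test_instances() in
+ * Transaction.cc): copy 12 bytes starting at source offset 1 into
+ * the destination object at offset 99, both objects living in
+ * collection `cid`:
+ *
+ *   t.clone_range(cid, src_oid, dst_oid, 1, 12, 99);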
+ */ + void clone_range(const coll_t& cid, const ghobject_t& oid, + const ghobject_t& noid, + uint64_t srcoff, uint64_t srclen, uint64_t dstoff) { + Op* _op = _get_next_op(); + _op->op = OP_CLONERANGE2; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->dest_oid = _get_object_id(noid); + _op->off = srcoff; + _op->len = srclen; + _op->dest_off = dstoff; + data.ops = data.ops + 1; + } + + /// Create the collection + void create_collection(const coll_t& cid, int bits) { + Op* _op = _get_next_op(); + _op->op = OP_MKCOLL; + _op->cid = _get_coll_id(cid); + _op->split_bits = bits; + data.ops = data.ops + 1; + } + + /** + * Give the collection a hint. + * + * @param cid - collection id. + * @param type - hint type. + * @param hint - the hint payload, which contains the customized + * data along with the hint type. + */ + void collection_hint(const coll_t& cid, uint32_t type, const bufferlist& hint) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_COLL_HINT; + _op->cid = _get_coll_id(cid); + _op->hint_type = type; + encode(hint, data_bl); + data.ops = data.ops + 1; + } + + /// remove the collection, the collection must be empty + void remove_collection(const coll_t& cid) { + Op* _op = _get_next_op(); + _op->op = OP_RMCOLL; + _op->cid = _get_coll_id(cid); + data.ops = data.ops + 1; + } + void collection_move(const coll_t& cid, const coll_t &oldcid, const ghobject_t& oid) + __attribute__ ((deprecated)) { + // NOTE: we encode this as a fixed combo of ADD + REMOVE. they + // always appear together, so this is effectively a single MOVE. + Op* _op = _get_next_op(); + _op->op = OP_COLL_ADD; + _op->cid = _get_coll_id(oldcid); + _op->oid = _get_object_id(oid); + _op->dest_cid = _get_coll_id(cid); + data.ops = data.ops + 1; + + _op = _get_next_op(); + _op->op = OP_COLL_REMOVE; + _op->cid = _get_coll_id(oldcid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + const coll_t &cid, const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_COLL_MOVE_RENAME; + _op->cid = _get_coll_id(oldcid); + _op->oid = _get_object_id(oldoid); + _op->dest_cid = _get_coll_id(cid); + _op->dest_oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + void try_rename(const coll_t &cid, const ghobject_t& oldoid, + const ghobject_t& oid) { + Op* _op = _get_next_op(); + _op->op = OP_TRY_RENAME; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oldoid); + _op->dest_oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + + /// Remove omap from oid + void omap_clear( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid ///< [in] Object from which to remove omap + ) { + Op* _op = _get_next_op(); + _op->op = OP_OMAP_CLEAR; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data.ops = data.ops + 1; + } + /// Set keys on oid omap. Replaces duplicate keys. + void omap_setkeys( + const coll_t& cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object to update + const map<string, bufferlist> &attrset ///< [in] Replacement keys and values + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_SETKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(attrset, data_bl); + data.ops = data.ops + 1; + } + + /// Set keys on an oid omap (bufferlist variant). 
+ void omap_setkeys( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object to update + const bufferlist &attrset_bl ///< [in] Replacement keys and values + ) { + Op* _op = _get_next_op(); + _op->op = OP_OMAP_SETKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data_bl.append(attrset_bl); + data.ops = data.ops + 1; + } + + /// Remove keys from oid omap + void omap_rmkeys( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object from which to remove the omap + const set<string> &keys ///< [in] Keys to clear + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_RMKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(keys, data_bl); + data.ops = data.ops + 1; + } + + /// Remove keys from oid omap + void omap_rmkeys( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object from which to remove the omap + const bufferlist &keys_bl ///< [in] Keys to clear + ) { + Op* _op = _get_next_op(); + _op->op = OP_OMAP_RMKEYS; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + data_bl.append(keys_bl); + data.ops = data.ops + 1; + } + + /// Remove key range from oid omap + void omap_rmkeyrange( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object from which to remove the omap keys + const string& first, ///< [in] first key in range + const string& last ///< [in] first key past range, range is [first,last) + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_RMKEYRANGE; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(first, data_bl); + encode(last, data_bl); + data.ops = data.ops + 1; + } + + /// Set omap header + void omap_setheader( + const coll_t &cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object + const bufferlist &bl ///< [in] Header value + ) { + using ceph::encode; + Op* _op = _get_next_op(); + _op->op = OP_OMAP_SETHEADER; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + encode(bl, data_bl); + data.ops = data.ops + 1; + } + + /// Split collection based on given prefixes, objects matching the specified bits/rem are + /// moved to the new collection + void split_collection( + const coll_t &cid, + uint32_t bits, + uint32_t rem, + const coll_t &destination) { + Op* _op = _get_next_op(); + _op->op = OP_SPLIT_COLLECTION2; + _op->cid = _get_coll_id(cid); + _op->dest_cid = _get_coll_id(destination); + _op->split_bits = bits; + _op->split_rem = rem; + data.ops = data.ops + 1; + } + + /// Merge collection into another. 
+ void merge_collection( + coll_t cid, + coll_t destination, + uint32_t bits) { + Op* _op = _get_next_op(); + _op->op = OP_MERGE_COLLECTION; + _op->cid = _get_coll_id(cid); + _op->dest_cid = _get_coll_id(destination); + _op->split_bits = bits; + data.ops = data.ops + 1; + } + + void collection_set_bits( + const coll_t &cid, + int bits) { + Op* _op = _get_next_op(); + _op->op = OP_COLL_SET_BITS; + _op->cid = _get_coll_id(cid); + _op->split_bits = bits; + data.ops = data.ops + 1; + } + + /// Set allocation hint for an object + /// make 0 values(expected_object_size, expected_write_size) noops for all implementations + void set_alloc_hint( + const coll_t &cid, + const ghobject_t &oid, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags + ) { + Op* _op = _get_next_op(); + _op->op = OP_SETALLOCHINT; + _op->cid = _get_coll_id(cid); + _op->oid = _get_object_id(oid); + _op->expected_object_size = expected_object_size; + _op->expected_write_size = expected_write_size; + _op->alloc_hint_flags = flags; + data.ops = data.ops + 1; + } + + void encode(bufferlist& bl) const { + //layout: data_bl + op_bl + coll_index + object_index + data + ENCODE_START(9, 9, bl); + encode(data_bl, bl); + encode(op_bl, bl); + encode(coll_index, bl); + encode(object_index, bl); + data.encode(bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + DECODE_START(9, bl); + DECODE_OLDEST(9); + + decode(data_bl, bl); + decode(op_bl, bl); + decode(coll_index, bl); + decode(object_index, bl); + data.decode(bl); + coll_id = coll_index.size(); + object_id = object_index.size(); + + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f); + static void generate_test_instances(list<Transaction*>& o); + }; + + int queue_transaction(CollectionHandle& ch, + Transaction&& t, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) { + vector<Transaction> tls; + tls.push_back(std::move(t)); + return queue_transactions(ch, tls, op, handle); + } + + virtual int queue_transactions( + CollectionHandle& ch, vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) = 0; + + + public: + ObjectStore(CephContext* cct, + const std::string& path_) : path(path_), cct(cct) {} + virtual ~ObjectStore() {} + + // no copying + explicit ObjectStore(const ObjectStore& o) = delete; + const ObjectStore& operator=(const ObjectStore& o) = delete; + + // versioning + virtual int upgrade() { + return 0; + } + + virtual void get_db_statistics(Formatter *f) { } + virtual void generate_db_histogram(Formatter *f) { } + virtual int flush_cache(ostream *os = NULL) { return -1; } + virtual void dump_perf_counters(Formatter *f) {} + virtual void dump_cache_stats(Formatter *f) {} + virtual void dump_cache_stats(ostream& os) {} + + virtual string get_type() = 0; + + // mgmt + virtual bool test_mount_in_use() = 0; + virtual int mount() = 0; + virtual int umount() = 0; + virtual int fsck(bool deep) { + return -EOPNOTSUPP; + } + virtual int repair(bool deep) { + return -EOPNOTSUPP; + } + virtual int quick_fix() { + return -EOPNOTSUPP; + } + + virtual void set_cache_shards(unsigned num) { } + + /** + * Returns 0 if the hobject is valid, -error otherwise + * + * Errors: + * -ENAMETOOLONG: locator/namespace/name too large + */ + virtual int validate_hobject_key(const hobject_t &obj) const = 0; + + virtual unsigned get_max_attr_name_length() = 0; + virtual int mkfs() = 0; // wipe + virtual int mkjournal() = 0; // journal only + virtual bool needs_journal() = 0; //< 
requires a journal + virtual bool wants_journal() = 0; //< prefers a journal + virtual bool allows_journal() = 0; //< allows a journal + + /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda) + virtual int get_devices(std::set<string> *devls) { + return -EOPNOTSUPP; + } + + /// true if a txn is readable immediately after it is queued. + virtual bool is_sync_onreadable() const { + return true; + } + + /** + * is_rotational + * + * Check whether store is backed by a rotational (HDD) or non-rotational + * (SSD) device. + * + * This must be usable *before* the store is mounted. + * + * @return true for HDD, false for SSD + */ + virtual bool is_rotational() { + return true; + } + + /** + * is_journal_rotational + * + * Check whether journal is backed by a rotational (HDD) or non-rotational + * (SSD) device. + * + * + * @return true for HDD, false for SSD + */ + virtual bool is_journal_rotational() { + return true; + } + + virtual string get_default_device_class() { + return is_rotational() ? "hdd" : "ssd"; + } + + virtual int get_numa_node( + int *numa_node, + set<int> *nodes, + set<string> *failed) { + return -EOPNOTSUPP; + } + + + virtual bool can_sort_nibblewise() { + return false; // assume a backend cannot, unless it says otherwise + } + + virtual int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) = 0; + virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) = 0; + + virtual void collect_metadata(map<string,string> *pm) { } + + /** + * write_meta - write a simple configuration key out-of-band + * + * Write a simple key/value pair for basic store configuration + * (e.g., a uuid or magic number) to an unopened/unmounted store. + * The default implementation writes this to a plaintext file in the + * path. + * + * A newline is appended. + * + * @param key key name (e.g., "fsid") + * @param value value (e.g., a uuid rendered as a string) + * @returns 0 for success, or an error code + */ + virtual int write_meta(const std::string& key, + const std::string& value); + + /** + * read_meta - read a simple configuration key out-of-band + * + * Read a simple key value to an unopened/mounted store. + * + * Trailing whitespace is stripped off. + * + * @param key key name + * @param value pointer to value string + * @returns 0 for success, or an error code + */ + virtual int read_meta(const std::string& key, + std::string *value); + + /** + * get ideal max value for collection_list() + * + * default to some arbitrary values; the implementation will override. + */ + virtual int get_ideal_list_max() { return 64; } + + + /** + * get a collection handle + * + * Provide a trivial handle as a default to avoid converting legacy + * implementations. + */ + virtual CollectionHandle open_collection(const coll_t &cid) = 0; + + /** + * get a collection handle for a soon-to-be-created collection + * + * This handle must be used by queue_transaction that includes a + * create_collection call in order to become valid. It will become the + * reference to the created collection. + */ + virtual CollectionHandle create_new_collection(const coll_t &cid) = 0; + + /** + * set ContextQueue for a collection + * + * After that, oncommits of Transaction will queue into commit_queue. + * And osd ShardThread will call oncommits. 
+ */ + virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0; + + /** + * Synchronous read operations + */ + + /** + * exists -- Test for existance of object + * + * @param cid collection for object + * @param oid oid of object + * @returns true if object exists, false otherwise + */ + virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0; + /** + * set_collection_opts -- set pool options for a collectioninformation for an object + * + * @param cid collection + * @param opts new collection options + * @returns 0 on success, negative error code on failure. + */ + virtual int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) = 0; + + /** + * stat -- get information for an object + * + * @param cid collection for object + * @param oid oid of object + * @param st output information for the object + * @param allow_eio if false, assert on -EIO operation failure + * @returns 0 on success, negative error code on failure. + */ + virtual int stat( + CollectionHandle &c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) = 0; + /** + * read -- read a byte range of data from an object + * + * Note: if reading from an offset past the end of the object, we + * return 0 (not, say, -EINVAL). + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output bufferlist + * @param op_flags is CEPH_OSD_OP_FLAG_* + * @returns number of bytes read on success, or negative error code on failure. + */ + virtual int read( + CollectionHandle &c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) = 0; + + /** + * fiemap -- get extent map of data of an object + * + * Returns an encoded map of the extents of an object's data portion + * (map<offset,size>). + * + * A non-enlightened implementation is free to return the extent (offset, len) + * as the sole extent. + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output bufferlist for extent map information. + * @returns 0 on success, negative error code on failure. + */ + virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, bufferlist& bl) = 0; + virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) = 0; + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. + */ + virtual int getattr(CollectionHandle &c, const ghobject_t& oid, + const char *name, bufferptr& value) = 0; + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. 
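+ *
+ * Illustrative sketch (the handle `ch`, object `oid` and attribute
+ * name "mykey" are hypothetical); this overload fills a bufferlist:
+ *
+ *   bufferlist val;
+ *   int r = store->getattr(ch, oid, "mykey", val);
+ *   if (r < 0) {
+ *     // attribute missing or read error
+ *   }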
+ */ + int getattr( + CollectionHandle &c, const ghobject_t& oid, + const string& name, bufferlist& value) { + bufferptr bp; + int r = getattr(c, oid, name.c_str(), bp); + value.push_back(bp); + return r; + } + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @returns 0 on success, negative error code on failure. + */ + virtual int getattrs(CollectionHandle &c, const ghobject_t& oid, + map<string,bufferptr>& aset) = 0; + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @returns 0 on success, negative error code on failure. + */ + int getattrs(CollectionHandle &c, const ghobject_t& oid, + map<string,bufferlist>& aset) { + map<string,bufferptr> bmap; + int r = getattrs(c, oid, bmap); + for (map<string,bufferptr>::iterator i = bmap.begin(); + i != bmap.end(); + ++i) { + aset[i->first].append(i->second); + } + return r; + } + + + // collections + + /** + * list_collections -- get all of the collections known to this ObjectStore + * + * @param ls list of the collections in sorted order. + * @returns 0 on success, negative error code on failure. + */ + virtual int list_collections(vector<coll_t>& ls) = 0; + + /** + * does a collection exist? + * + * @param c collection + * @returns true if it exists, false otherwise + */ + virtual bool collection_exists(const coll_t& c) = 0; + + /** + * is a collection empty? + * + * @param c collection + * @param empty true if the specified collection is empty, false otherwise + * @returns 0 on success, negative error code on failure. + */ + virtual int collection_empty(CollectionHandle& c, bool *empty) = 0; + + /** + * return the number of significant bits of the coll_t::pgid. + * + * This should return what the last create_collection or split_collection + * set. A legacy backend may return -EAGAIN if the value is unavailable + * (because we upgraded from an older version, e.g., FileStore). 
+ */ + virtual int collection_bits(CollectionHandle& c) = 0; + + + /** + * list contents of a collection that fall in the range [start, end) and no more than a specified many result + * + * @param c collection + * @param start list object that sort >= this value + * @param end list objects that sort < this value + * @param max return no more than this many results + * @param seq return no objects with snap < seq + * @param ls [out] result + * @param next [out] next item sorts >= this value + * @return zero on success, or negative error + */ + virtual int collection_list(CollectionHandle &c, + const ghobject_t& start, const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) = 0; + + virtual int collection_list_legacy(CollectionHandle &c, + const ghobject_t& start, + const ghobject_t& end, int max, + std::vector<ghobject_t> *ls, + ghobject_t *next) { + return collection_list(c, start, end, max, ls, next); + } + + /// OMAP + /// Get omap contents + virtual int omap_get( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) = 0; + + /// Get omap header + virtual int omap_get_header( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ) = 0; + + /// Get keys defined on oid + virtual int omap_get_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) = 0; + + /// Get key values + virtual int omap_get_values( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) = 0; + + /// Filters keys into out which are defined on oid + virtual int omap_check_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) = 0; + + /** + * Returns an object map iterator + * + * Warning! The returned iterator is an implicit lock on filestore + * operations in c. Do not use filestore methods on c while the returned + * iterator is live. (Filling in a transaction is no problem). + * + * @return iterator, null on error + */ + virtual ObjectMap::ObjectMapIterator get_omap_iterator( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) = 0; + + virtual int flush_journal() { return -EOPNOTSUPP; } + + virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; } + + virtual int snapshot(const string& name) { return -EOPNOTSUPP; } + + /** + * Set and get internal fsid for this instance. No external data is modified + */ + virtual void set_fsid(uuid_d u) = 0; + virtual uuid_d get_fsid() = 0; + + /** + * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store + * - num objects - total (including witeouts) object count to measure used space for. 
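+ *
+ * Illustrative use (the object count is hypothetical):
+ *
+ *   uint64_t overhead = store->estimate_objects_overhead(1000000);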
+ */ + virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0; + + + // DEBUG + virtual void inject_data_error(const ghobject_t &oid) {} + virtual void inject_mdata_error(const ghobject_t &oid) {} + + virtual void compact() {} + virtual bool has_builtin_csum() const { + return false; + } +}; +WRITE_CLASS_ENCODER(ObjectStore::Transaction) +WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData) + +ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx); + +#endif diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc new file mode 100644 index 00000000..ad390a1c --- /dev/null +++ b/src/os/Transaction.cc @@ -0,0 +1,516 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ObjectStore.h" +#include "common/Formatter.h" + +void ObjectStore::Transaction::dump(ceph::Formatter *f) +{ + f->open_array_section("ops"); + iterator i = begin(); + int op_num = 0; + bool stop_looping = false; + while (i.have_op() && !stop_looping) { + Transaction::Op *op = i.decode_op(); + f->open_object_section("op"); + f->dump_int("op_num", op_num); + + switch (op->op) { + case Transaction::OP_NOP: + f->dump_string("op_name", "nop"); + break; + case Transaction::OP_TOUCH: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + f->dump_string("op_name", "touch"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + } + break; + + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + bufferlist bl; + i.decode_bl(bl); + f->dump_string("op_name", "write"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_unsigned("length", len); + f->dump_unsigned("offset", off); + f->dump_unsigned("bufferlist length", bl.length()); + } + break; + + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + f->dump_string("op_name", "zero"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_unsigned("offset", off); + f->dump_unsigned("length", len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + f->dump_string("op_name", "trim_cache"); + } + break; + + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + f->dump_string("op_name", "truncate"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_unsigned("offset", off); + } + break; + + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + f->dump_string("op_name", "remove"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + } + break; + + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + f->dump_string("op_name", "setattr"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_string("name", name); + f->dump_unsigned("length", bl.length()); + } + break; + + case Transaction::OP_SETATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + map<string, bufferptr> aset; + i.decode_attrset(aset); + f->dump_string("op_name", "setattrs"); + f->dump_stream("collection") << cid; + 
f->dump_stream("oid") << oid; + f->open_object_section("attr_lens"); + for (map<string,bufferptr>::iterator p = aset.begin(); + p != aset.end(); ++p) { + f->dump_unsigned(p->first.c_str(), p->second.length()); + } + f->close_section(); + } + break; + + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + f->dump_string("op_name", "rmattr"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_string("name", name); + } + break; + + case Transaction::OP_RMATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + f->dump_string("op_name", "rmattrs"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + } + break; + + case Transaction::OP_CLONE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + f->dump_string("op_name", "clone"); + f->dump_stream("collection") << cid; + f->dump_stream("src_oid") << oid; + f->dump_stream("dst_oid") << noid; + } + break; + + case Transaction::OP_CLONERANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + uint64_t off = op->off; + uint64_t len = op->len; + f->dump_string("op_name", "clonerange"); + f->dump_stream("collection") << cid; + f->dump_stream("src_oid") << oid; + f->dump_stream("dst_oid") << noid; + f->dump_unsigned("offset", off); + f->dump_unsigned("len", len); + } + break; + + case Transaction::OP_CLONERANGE2: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + f->dump_string("op_name", "clonerange2"); + f->dump_stream("collection") << cid; + f->dump_stream("src_oid") << oid; + f->dump_stream("dst_oid") << noid; + f->dump_unsigned("src_offset", srcoff); + f->dump_unsigned("len", len); + f->dump_unsigned("dst_offset", dstoff); + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + f->dump_string("op_name", "mkcoll"); + f->dump_stream("collection") << cid; + } + break; + + case Transaction::OP_COLL_HINT: + { + using ceph::decode; + coll_t cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + f->dump_string("op_name", "coll_hint"); + f->dump_stream("collection") << cid; + f->dump_unsigned("type", type); + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + f->dump_unsigned("pg_num", pg_num); + f->dump_unsigned("expected_num_objects", num_objs); + } + } + break; + + case Transaction::OP_COLL_SET_BITS: + { + coll_t cid = i.get_cid(op->cid); + f->dump_string("op_name", "coll_set_bits"); + f->dump_stream("collection") << cid; + f->dump_unsigned("bits", op->split_bits); + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + f->dump_string("op_name", "rmcoll"); + f->dump_stream("collection") << cid; + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + f->dump_string("op_name", "collection_add"); + f->dump_stream("src_collection") << ocid; + f->dump_stream("dst_collection") << ncid; + f->dump_stream("oid") << oid; + } + break; + + case 
Transaction::OP_COLL_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + f->dump_string("op_name", "collection_remove"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + } + break; + + case Transaction::OP_COLL_MOVE: + { + coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + f->open_object_section("collection_move"); + f->dump_stream("src_collection") << ocid; + f->dump_stream("dst_collection") << ncid; + f->dump_stream("oid") << oid; + f->close_section(); + } + break; + + case Transaction::OP_COLL_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + f->dump_string("op_name", "collection_setattr"); + f->dump_stream("collection") << cid; + f->dump_string("name", name); + f->dump_unsigned("length", bl.length()); + } + break; + + case Transaction::OP_COLL_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + string name = i.decode_string(); + f->dump_string("op_name", "collection_rmattr"); + f->dump_stream("collection") << cid; + f->dump_string("name", name); + } + break; + + case Transaction::OP_COLL_RENAME: + { + f->dump_string("op_name", "collection_rename"); + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + f->dump_string("op_name", "omap_clear"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + } + break; + + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + map<string, bufferlist> aset; + i.decode_attrset(aset); + f->dump_string("op_name", "omap_setkeys"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->open_object_section("attr_lens"); + for (map<string, bufferlist>::iterator p = aset.begin(); + p != aset.end(); ++p) { + f->dump_unsigned(p->first.c_str(), p->second.length()); + } + f->close_section(); + } + break; + + case Transaction::OP_OMAP_RMKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + set<string> keys; + i.decode_keyset(keys); + f->dump_string("op_name", "omap_rmkeys"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->open_array_section("attrs"); + for (auto& k : keys) { + f->dump_string("", k.c_str()); + } + f->close_section(); + } + break; + + case Transaction::OP_OMAP_SETHEADER: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + bufferlist bl; + i.decode_bl(bl); + f->dump_string("op_name", "omap_setheader"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_stream("header_length") << bl.length(); + } + break; + + case Transaction::OP_SPLIT_COLLECTION: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + f->dump_string("op_name", "op_split_collection_create"); + f->dump_stream("collection") << cid; + f->dump_stream("bits") << bits; + f->dump_stream("rem") << rem; + f->dump_stream("dest") << dest; + } + break; + + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + f->dump_string("op_name", "op_split_collection"); + f->dump_stream("collection") << cid; + f->dump_stream("bits") << bits; + f->dump_stream("rem") << rem; + f->dump_stream("dest") << dest; + 
} + break; + + case Transaction::OP_MERGE_COLLECTION: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + coll_t dest = i.get_cid(op->dest_cid); + f->dump_string("op_name", "op_merge_collection"); + f->dump_stream("collection") << cid; + f->dump_stream("dest") << dest; + f->dump_stream("bits") << bits; + } + break; + + case Transaction::OP_OMAP_RMKEYRANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + f->dump_string("op_name", "op_omap_rmkeyrange"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_string("first", first); + f->dump_string("last", last); + } + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + coll_t old_cid = i.get_cid(op->cid); + ghobject_t old_oid = i.get_oid(op->oid); + coll_t new_cid = i.get_cid(op->dest_cid); + ghobject_t new_oid = i.get_oid(op->dest_oid); + f->dump_string("op_name", "op_coll_move_rename"); + f->dump_stream("old_collection") << old_cid; + f->dump_stream("old_oid") << old_oid; + f->dump_stream("new_collection") << new_cid; + f->dump_stream("new_oid") << new_oid; + } + break; + + case Transaction::OP_TRY_RENAME: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t old_oid = i.get_oid(op->oid); + ghobject_t new_oid = i.get_oid(op->dest_oid); + f->dump_string("op_name", "op_coll_move_rename"); + f->dump_stream("collection") << cid; + f->dump_stream("old_oid") << old_oid; + f->dump_stream("new_oid") << new_oid; + } + break; + + case Transaction::OP_SETALLOCHINT: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + f->dump_string("op_name", "op_setallochint"); + f->dump_stream("collection") << cid; + f->dump_stream("oid") << oid; + f->dump_stream("expected_object_size") << expected_object_size; + f->dump_stream("expected_write_size") << expected_write_size; + } + break; + + default: + f->dump_string("op_name", "unknown"); + f->dump_unsigned("op_code", op->op); + stop_looping = true; + break; + } + f->close_section(); + op_num++; + } + f->close_section(); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transaction*>& o) +{ + o.push_back(new Transaction); + + Transaction *t = new Transaction; + t->nop(); + o.push_back(t); + + t = new Transaction; + coll_t c(spg_t(pg_t(1,2), shard_id_t::NO_SHARD)); + coll_t c2(spg_t(pg_t(4,5), shard_id_t::NO_SHARD)); + ghobject_t o1(hobject_t("obj", "", 123, 456, -1, "")); + ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, "")); + ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, "")); + t->touch(c, o1); + bufferlist bl; + bl.append("some data"); + t->write(c, o1, 1, bl.length(), bl); + t->zero(c, o1, 22, 33); + t->truncate(c, o1, 99); + t->remove(c, o1); + o.push_back(t); + + t = new Transaction; + t->setattr(c, o1, "key", bl); + map<string,bufferptr> m; + m["a"] = buffer::copy("this", 4); + m["b"] = buffer::copy("that", 4); + t->setattrs(c, o1, m); + t->rmattr(c, o1, "b"); + t->rmattrs(c, o1); + + t->clone(c, o1, o2); + t->clone(c, o1, o3); + t->clone_range(c, o1, o2, 1, 12, 99); + + t->create_collection(c, 12); + t->collection_move_rename(c, o2, c2, o3); + t->remove_collection(c); + o.push_back(t); +} + +#pragma GCC diagnostic pop +#pragma GCC 
diagnostic warning "-Wpragmas" diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc new file mode 100644 index 00000000..0ac9a15a --- /dev/null +++ b/src/os/bluestore/Allocator.cc @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Allocator.h" +#include "StupidAllocator.h" +#include "BitmapAllocator.h" +#include "AvlAllocator.h" +#include "HybridAllocator.h" +#include "common/debug.h" +#include "common/admin_socket.h" +#define dout_subsys ceph_subsys_bluestore + +class Allocator::SocketHook : public AdminSocketHook { + Allocator *alloc; + + friend class Allocator; + std::string name; +public: + explicit SocketHook(Allocator *alloc, + const std::string& _name) : + alloc(alloc), name(_name) + { + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + if (name.empty()) { + name = to_string((uintptr_t)this); + } + if (admin_socket) { + int r = admin_socket->register_command(("bluestore allocator dump " + name).c_str(), + ("bluestore allocator dump " + name).c_str(), + this, + "dump allocator free regions"); + if (r != 0) + alloc = nullptr; //some collision, disable + if (alloc) { + r = admin_socket->register_command(("bluestore allocator score " + name).c_str(), + ("bluestore allocator score " + name).c_str(), + this, + "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)"); + ceph_assert(r == 0); + r = admin_socket->register_command(("bluestore allocator fragmentation " + name).c_str(), + ("bluestore allocator fragmentation " + name).c_str(), + this, + "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)"); + ceph_assert(r == 0); + } + } + } + ~SocketHook() + { + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + if (admin_socket && alloc) { + int r = admin_socket->unregister_command(("bluestore allocator dump " + name).c_str()); + ceph_assert(r == 0); + r = admin_socket->unregister_command(("bluestore allocator score " + name).c_str()); + ceph_assert(r == 0); + r = admin_socket->unregister_command(("bluestore allocator fragmentation " + name).c_str()); + ceph_assert(r == 0); + } + } + + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + stringstream ss; + bool r = true; + if (command == "bluestore allocator dump " + name) { + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + f->open_array_section("free_regions"); + auto iterated_allocation = [&](size_t off, size_t len) { + ceph_assert(len > 0); + f->open_object_section("free"); + char off_hex[30]; + char len_hex[30]; + snprintf(off_hex, sizeof(off_hex) - 1, "0x%lx", off); + snprintf(len_hex, sizeof(len_hex) - 1, "0x%lx", len); + f->dump_string("offset", off_hex); + f->dump_string("length", len_hex); + f->close_section(); + }; + alloc->dump(iterated_allocation); + + + f->close_section(); + f->flush(ss); + } else if (command == "bluestore allocator score " + name) { + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + f->open_object_section("fragmentation_score"); + f->dump_float("fragmentation_rating", alloc->get_fragmentation_score()); + f->close_section(); + f->flush(ss); + delete f; + } else if (command == "bluestore allocator fragmentation " + name) { + Formatter* f = Formatter::create(format, "json-pretty", "json-pretty"); + f->open_object_section("fragmentation"); + f->dump_float("fragmentation_rating", alloc->get_fragmentation()); + 
f->close_section(); + f->flush(ss); + delete f; + } else { + ss << "Invalid command" << std::endl; + r = false; + } + out.append(ss); + return r; + } + +}; +Allocator::Allocator(const std::string& name) +{ + asok_hook = new SocketHook(this, name); +} + + +Allocator::~Allocator() +{ + delete asok_hook; +} + +const string& Allocator::get_name() const { + return asok_hook->name; +} + +Allocator *Allocator::create(CephContext* cct, string type, + int64_t size, int64_t block_size, const std::string& name) +{ + Allocator* alloc = nullptr; + if (type == "stupid") { + alloc = new StupidAllocator(cct, name, block_size); + } else if (type == "bitmap") { + alloc = new BitmapAllocator(cct, size, block_size, name); + } else if (type == "avl") { + return new AvlAllocator(cct, size, block_size, name); + } else if (type == "hybrid") { + return new HybridAllocator(cct, size, block_size, + cct->_conf.get_val<uint64_t>("bluestore_hybrid_alloc_mem_cap"), + name); + } + if (alloc == nullptr) { + lderr(cct) << "Allocator::" << __func__ << " unknown alloc type " + << type << dendl; + } + return alloc; +} + +void Allocator::release(const PExtentVector& release_vec) +{ + interval_set<uint64_t> release_set; + for (auto e : release_vec) { + release_set.insert(e.offset, e.length); + } + release(release_set); +} + +/** + * Gives fragmentation a numeric value. + * + * Following algorithm applies value to each existing free unallocated block. + * Value of single block is a multiply of size and per-byte-value. + * Per-byte-value is greater for larger blocks. + * Assume block size X has value per-byte p; then block size 2*X will have per-byte value 1.1*p. + * + * This could be expressed in logarithms, but for speed this is interpolated inside ranges. + * [1] [2..3] [4..7] [8..15] ... + * ^ ^ ^ ^ + * 1.1 1.1^2 1.1^3 1.1^4 ... + * + * Final score is obtained by proportion between score that would have been obtained + * in condition of absolute fragmentation and score in no fragmentation at all. 
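+ * As a rough illustration, using the 1.1 per-doubling factor from the code
+ * below: 8 free bytes in a single 8-byte chunk score 8 * 1.1^3 ~= 10.65 (the
+ * "ideal" case), eight 1-byte chunks score 8 * 1 = 8 (the "terrible" case),
+ * and two 4-byte chunks score 2 * 4 * 1.1^2 = 9.68, which maps to a final
+ * score of roughly (10.65 - 9.68) / (10.65 - 8) ~= 0.37.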
+ */ +double Allocator::get_fragmentation_score() +{ + // this value represents how much worth is 2X bytes in one chunk then in X + X bytes + static const double double_size_worth = 1.1 ; + std::vector<double> scales{1}; + double score_sum = 0; + size_t sum = 0; + + auto get_score = [&](size_t v) -> double { + size_t sc = sizeof(v) * 8 - clz(v) - 1; //assign to grade depending on log2(len) + while (scales.size() <= sc + 1) { + //unlikely expand scales vector + scales.push_back(scales[scales.size() - 1] * double_size_worth); + } + + size_t sc_shifted = size_t(1) << sc; + double x = double(v - sc_shifted) / sc_shifted; //x is <0,1) in its scale grade + // linear extrapolation in its scale grade + double score = (sc_shifted ) * scales[sc] * (1-x) + + (sc_shifted * 2) * scales[sc+1] * x; + return score; + }; + + auto iterated_allocation = [&](size_t off, size_t len) { + ceph_assert(len > 0); + score_sum += get_score(len); + sum += len; + }; + dump(iterated_allocation); + + + double ideal = get_score(sum); + double terrible = sum * get_score(1); + return (ideal - score_sum) / (ideal - terrible); +} diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h new file mode 100644 index 00000000..7b571395 --- /dev/null +++ b/src/os/bluestore/Allocator.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_OS_BLUESTORE_ALLOCATOR_H +#define CEPH_OS_BLUESTORE_ALLOCATOR_H + +#include <ostream> +#include "include/ceph_assert.h" +#include "os/bluestore/bluestore_types.h" +#include <functional> + +class Allocator { +public: + explicit Allocator(const std::string& name); + virtual ~Allocator(); + + /* + * Allocate required number of blocks in n number of extents. + * Min and Max number of extents are limited by: + * a. alloc unit + * b. max_alloc_size. + * as no extent can be lesser than alloc_unit and greater than max_alloc size. + * Apart from that extents can vary between these lower and higher limits according + * to free block search algorithm and availability of contiguous space. + */ + virtual int64_t allocate(uint64_t want_size, uint64_t alloc_unit, + uint64_t max_alloc_size, int64_t hint, + PExtentVector *extents) = 0; + + int64_t allocate(uint64_t want_size, uint64_t alloc_unit, + int64_t hint, PExtentVector *extents) { + return allocate(want_size, alloc_unit, want_size, hint, extents); + } + + /* Bulk release. Implementations may override this method to handle the whole + * set at once. This could save e.g. unnecessary mutex dance. 
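+ * The PExtentVector overload declared below is a convenience wrapper: as
+ * implemented in Allocator.cc it folds the extents into an interval_set and
+ * forwards to this method.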
*/ + virtual void release(const interval_set<uint64_t>& release_set) = 0; + void release(const PExtentVector& release_set); + + virtual void dump() = 0; + virtual void dump(std::function<void(uint64_t offset, uint64_t length)> notify) = 0; + + virtual void init_add_free(uint64_t offset, uint64_t length) = 0; + virtual void init_rm_free(uint64_t offset, uint64_t length) = 0; + + virtual uint64_t get_free() = 0; + virtual double get_fragmentation() + { + return 0.0; + } + virtual double get_fragmentation_score(); + virtual void shutdown() = 0; + + static Allocator *create(CephContext* cct, string type, int64_t size, + int64_t block_size, const std::string& name = ""); + + const string& get_name() const; + +private: + class SocketHook; + SocketHook* asok_hook = nullptr; +}; + +#endif diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc new file mode 100755 index 00000000..0ac70baa --- /dev/null +++ b/src/os/bluestore/AvlAllocator.cc @@ -0,0 +1,430 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "AvlAllocator.h" + +#include <limits> + +#include "common/config_proxy.h" +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "AvlAllocator " + +MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc); + +namespace { + // a light-weight "range_seg_t", which only used as the key when searching in + // range_tree and range_size_tree + struct range_t { + uint64_t start; + uint64_t end; + }; +} + +/* + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. This will search the specified AVL + * tree looking for a block that matches the specified criteria. + */ +template<class Tree> +uint64_t AvlAllocator::_block_picker(const Tree& t, + uint64_t *cursor, + uint64_t size, + uint64_t align) +{ + const auto compare = t.key_comp(); + for (auto rs = t.lower_bound(range_t{*cursor, size}, compare); + rs != t.end(); ++rs) { + uint64_t offset = p2roundup(rs->start, align); + if (offset + size <= rs->end) { + *cursor = offset + size; + return offset; + } + } + /* + * If we know we've searched the whole tree (*cursor == 0), give up. + * Otherwise, reset the cursor to the beginning and try again. 
+ */ + if (*cursor == 0) { + return -1ULL; + } + *cursor = 0; + return _block_picker(t, cursor, size, align); +} + +void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size) +{ + ceph_assert(size != 0); + + uint64_t end = start + size; + + auto rs_after = range_tree.upper_bound(range_t{start, end}, + range_tree.key_comp()); + + /* Make sure we don't overlap with either of our neighbors */ + auto rs_before = range_tree.end(); + if (rs_after != range_tree.begin()) { + rs_before = std::prev(rs_after); + } + + bool merge_before = (rs_before != range_tree.end() && rs_before->end == start); + bool merge_after = (rs_after != range_tree.end() && rs_after->start == end); + + if (merge_before && merge_after) { + _range_size_tree_rm(*rs_before); + _range_size_tree_rm(*rs_after); + rs_after->start = rs_before->start; + range_tree.erase_and_dispose(rs_before, dispose_rs{}); + _range_size_tree_try_insert(*rs_after); + } else if (merge_before) { + _range_size_tree_rm(*rs_before); + rs_before->end = end; + _range_size_tree_try_insert(*rs_before); + } else if (merge_after) { + _range_size_tree_rm(*rs_after); + rs_after->start = start; + _range_size_tree_try_insert(*rs_after); + } else { + _try_insert_range(start, end, &rs_after); + } +} + +void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end, + AvlAllocator::range_tree_t::iterator& rs) +{ + bool left_over = (rs->start != start); + bool right_over = (rs->end != end); + + _range_size_tree_rm(*rs); + + if (left_over && right_over) { + auto old_right_end = rs->end; + auto insert_pos = rs; + ceph_assert(insert_pos != range_tree.end()); + ++insert_pos; + rs->end = start; + + // Insert tail first to be sure insert_pos hasn't been disposed. + // This woulnd't dispose rs though since it's out of range_size_tree. + // Don't care about a small chance of 'not-the-best-choice-for-removal' case + // which might happen if rs has the lowest size. 
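+ // E.g. removing [start, end) from the middle of this free segment leaves two
+ // pieces: rs itself, already trimmed above to end at 'start', and a new
+ // segment [end, old_right_end) inserted just after it.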
+ _try_insert_range(end, old_right_end, &insert_pos); + _range_size_tree_try_insert(*rs); + + } else if (left_over) { + rs->end = start; + _range_size_tree_try_insert(*rs); + } else if (right_over) { + rs->start = end; + _range_size_tree_try_insert(*rs); + } else { + range_tree.erase_and_dispose(rs, dispose_rs{}); + } +} + +void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size) +{ + uint64_t end = start + size; + + ceph_assert(size != 0); + ceph_assert(size <= num_free); + + auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp()); + /* Make sure we completely overlap with someone */ + ceph_assert(rs != range_tree.end()); + ceph_assert(rs->start <= start); + ceph_assert(rs->end >= end); + + _process_range_removal(start, end, rs); +} + +void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size, + std::function<void(uint64_t, uint64_t, bool)> cb) +{ + uint64_t end = start + size; + + ceph_assert(size != 0); + + auto rs = range_tree.find(range_t{ start, end }, + range_tree.key_comp()); + + if (rs == range_tree.end() || rs->start >= end) { + cb(start, size, false); + return; + } + + do { + + auto next_rs = rs; + ++next_rs; + + if (start < rs->start) { + cb(start, rs->start - start, false); + start = rs->start; + } + auto range_end = std::min(rs->end, end); + _process_range_removal(start, range_end, rs); + cb(start, range_end - start, true); + start = range_end; + + rs = next_rs; + } while (rs != range_tree.end() && rs->start < end && start < end); + if (start < end) { + cb(start, end - start, false); + } +} + +int64_t AvlAllocator::_allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, // unused, for now! + PExtentVector* extents) +{ + uint64_t allocated = 0; + while (allocated < want) { + uint64_t offset, length; + int r = _allocate(std::min(max_alloc_size, want - allocated), + unit, &offset, &length); + if (r < 0) { + // Allocation failed. + break; + } + extents->emplace_back(offset, length); + allocated += length; + } + return allocated ? allocated : -ENOSPC; +} + +int AvlAllocator::_allocate( + uint64_t size, + uint64_t unit, + uint64_t *offset, + uint64_t *length) +{ + uint64_t max_size = 0; + if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) { + max_size = p->end - p->start; + } + + bool force_range_size_alloc = false; + if (max_size < size) { + if (max_size < unit) { + return -ENOSPC; + } + size = p2align(max_size, unit); + ceph_assert(size > 0); + force_range_size_alloc = true; + } + /* + * Find the largest power of 2 block size that evenly divides the + * requested size. This is used to try to allocate blocks with similar + * alignment from the same area (i.e. same cursor bucket) but it does + * not guarantee that other allocations sizes may exist in the same + * region. + */ + const uint64_t align = size & -size; + ceph_assert(align != 0); + uint64_t *cursor = &lbas[cbits(align) - 1]; + + const int free_pct = num_free * 100 / num_total; + uint64_t start = 0; + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). 
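+ * Concretely: best-fit (the size-sorted tree) is used when the caller forces
+ * it, when the largest free extent is smaller than range_size_alloc_threshold,
+ * or when the free-space percentage drops below range_size_alloc_free_pct;
+ * otherwise allocation stays first-fit on the offset-sorted tree, resuming
+ * from a per-alignment cursor.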
+ */ + if (force_range_size_alloc || + max_size < range_size_alloc_threshold || + free_pct < range_size_alloc_free_pct) { + *cursor = 0; + do { + start = _block_picker(range_size_tree, cursor, size, unit); + if (start != -1ULL || !force_range_size_alloc) { + break; + } + // try to collect smaller extents as we could fail to retrieve + // that large block due to misaligned extents + size = p2align(size >> 1, unit); + } while (size >= unit); + } else { + start = _block_picker(range_tree, cursor, size, unit); + } + if (start == -1ULL) { + return -ENOSPC; + } + + _remove_from_tree(start, size); + + *offset = start; + *length = size; + return 0; +} + +void AvlAllocator::_release(const interval_set<uint64_t>& release_set) +{ + for (auto p = release_set.begin(); p != release_set.end(); ++p) { + const auto offset = p.get_start(); + const auto length = p.get_len(); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _add_to_tree(offset, length); + } +} + +void AvlAllocator::_release(const PExtentVector& release_set) { + for (auto& e : release_set) { + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << e.offset + << " length 0x" << e.length + << std::dec << dendl; + _add_to_tree(e.offset, e.length); + } +} + +void AvlAllocator::_shutdown() +{ + range_size_tree.clear(); + range_tree.clear_and_dispose(dispose_rs{}); +} + +AvlAllocator::AvlAllocator(CephContext* cct, + int64_t device_size, + int64_t block_size, + uint64_t max_mem, + const std::string& name) : + Allocator(name), + num_total(device_size), + block_size(block_size), + range_size_alloc_threshold( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")), + range_size_alloc_free_pct( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")), + range_count_cap(max_mem / sizeof(range_seg_t)), + cct(cct) +{} + +AvlAllocator::AvlAllocator(CephContext* cct, + int64_t device_size, + int64_t block_size, + const std::string& name) : + Allocator(name), + num_total(device_size), + block_size(block_size), + range_size_alloc_threshold( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")), + range_size_alloc_free_pct( + cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")), + cct(cct) +{} + +AvlAllocator::~AvlAllocator() +{ + shutdown(); +} + +int64_t AvlAllocator::allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, // unused, for now! 
+ PExtentVector* extents) +{ + ldout(cct, 10) << __func__ << std::hex + << " want 0x" << want + << " unit 0x" << unit + << " max_alloc_size 0x" << max_alloc_size + << " hint 0x" << hint + << std::dec << dendl; + ceph_assert(isp2(unit)); + ceph_assert(want % unit == 0); + + if (max_alloc_size == 0) { + max_alloc_size = want; + } + if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max(); + max_alloc_size >= cap) { + max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size); + } + std::lock_guard l(lock); + return _allocate(want, unit, max_alloc_size, hint, extents); +} + +void AvlAllocator::release(const interval_set<uint64_t>& release_set) { + std::lock_guard l(lock); + _release(release_set); +} + +uint64_t AvlAllocator::get_free() +{ + std::lock_guard l(lock); + return num_free; +} + +double AvlAllocator::get_fragmentation() +{ + std::lock_guard l(lock); + return _get_fragmentation(); +} + +void AvlAllocator::dump() +{ + std::lock_guard l(lock); + _dump(); +} + +void AvlAllocator::_dump() const +{ + ldout(cct, 0) << __func__ << " range_tree: " << dendl; + for (auto& rs : range_tree) { + ldout(cct, 0) << std::hex + << "0x" << rs.start << "~" << rs.end + << std::dec + << dendl; + } + + ldout(cct, 0) << __func__ << " range_size_tree: " << dendl; + for (auto& rs : range_size_tree) { + ldout(cct, 0) << std::hex + << "0x" << rs.start << "~" << rs.end + << std::dec + << dendl; + } +} + +void AvlAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify) +{ + for (auto& rs : range_tree) { + notify(rs.start, rs.end - rs.start); + } +} + +void AvlAllocator::init_add_free(uint64_t offset, uint64_t length) +{ + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _add_to_tree(offset, length); +} + +void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _remove_from_tree(offset, length); +} + +void AvlAllocator::shutdown() +{ + std::lock_guard l(lock); + _shutdown(); +} diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h new file mode 100755 index 00000000..bcc3f8b0 --- /dev/null +++ b/src/os/bluestore/AvlAllocator.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <mutex> +#include <boost/intrusive/avl_set.hpp> + +#include "Allocator.h" +#include "os/bluestore/bluestore_types.h" +#include "include/mempool.h" + +struct range_seg_t { + MEMPOOL_CLASS_HELPERS(); ///< memory monitoring + uint64_t start; ///< starting offset of this segment + uint64_t end; ///< ending offset (non-inclusive) + + range_seg_t(uint64_t start, uint64_t end) + : start{start}, + end{end} + {} + // Tree is sorted by offset, greater offsets at the end of the tree. + struct before_t { + template<typename KeyLeft, typename KeyRight> + bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const { + return lhs.end <= rhs.start; + } + }; + boost::intrusive::avl_set_member_hook<> offset_hook; + + // Tree is sorted by size, larger sizes at the end of the tree. 
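+ // Equal-sized segments are ordered by their start offset, so the comparator
+ // below remains a strict weak ordering even when many segments share a length.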
+ struct shorter_t { + template<typename KeyType> + bool operator()(const range_seg_t& lhs, const KeyType& rhs) const { + auto lhs_size = lhs.end - lhs.start; + auto rhs_size = rhs.end - rhs.start; + if (lhs_size < rhs_size) { + return true; + } else if (lhs_size > rhs_size) { + return false; + } else { + return lhs.start < rhs.start; + } + } + }; + inline uint64_t length() const { + return end - start; + } + boost::intrusive::avl_set_member_hook<> size_hook; +}; + +class AvlAllocator : public Allocator { + struct dispose_rs { + void operator()(range_seg_t* p) + { + delete p; + } + }; + +protected: + /* + * ctor intended for the usage from descendant class(es) which + * provides handling for spilled over entries + * (when entry count >= max_entries) + */ + AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size, + uint64_t max_mem, + const std::string& name); + +public: + AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size, + const std::string& name); + ~AvlAllocator(); + int64_t allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector *extents) override; + void release(const interval_set<uint64_t>& release_set) override; + int64_t get_capacity() const { + return num_total; + } + + uint64_t get_block_size() const { + return block_size; + } + uint64_t get_free() override; + double get_fragmentation() override; + + void dump() override; + void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override; + void init_add_free(uint64_t offset, uint64_t length) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + void shutdown() override; + +private: + template<class Tree> + uint64_t _block_picker(const Tree& t, uint64_t *cursor, uint64_t size, + uint64_t align); + int _allocate( + uint64_t size, + uint64_t unit, + uint64_t *offset, + uint64_t *length); + + using range_tree_t = + boost::intrusive::avl_set< + range_seg_t, + boost::intrusive::compare<range_seg_t::before_t>, + boost::intrusive::member_hook< + range_seg_t, + boost::intrusive::avl_set_member_hook<>, + &range_seg_t::offset_hook>>; + range_tree_t range_tree; ///< main range tree + /* + * The range_size_tree should always contain the + * same number of segments as the range_tree. + * The only difference is that the range_size_tree + * is ordered by segment sizes. + */ + using range_size_tree_t = + boost::intrusive::avl_multiset< + range_seg_t, + boost::intrusive::compare<range_seg_t::shorter_t>, + boost::intrusive::member_hook< + range_seg_t, + boost::intrusive::avl_set_member_hook<>, + &range_seg_t::size_hook>, + boost::intrusive::constant_time_size<true>>; + range_size_tree_t range_size_tree; + + const int64_t num_total; ///< device size + const uint64_t block_size; ///< block size + uint64_t num_free = 0; ///< total bytes in freelist + + /* + * This value defines the number of elements in the ms_lbas array. + * The value of 64 was chosen as it covers all power of 2 buckets + * up to UINT64_MAX. + * This is the equivalent of highest-bit of UINT64_MAX. + */ + static constexpr unsigned MAX_LBAS = 64; + uint64_t lbas[MAX_LBAS] = {0}; + + /* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the allocator cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). 
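+ * Both this threshold and the free-space percentage below are taken from the
+ * bluestore_avl_alloc_bf_threshold and bluestore_avl_alloc_bf_free_pct config
+ * options in the constructors.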
+ */ + uint64_t range_size_alloc_threshold = 0; + /* + * The minimum free space, in percent, which must be available + * in allocator to continue allocations in a first-fit fashion. + * Once the allocator's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ + int range_size_alloc_free_pct = 0; + + /* + * Max amount of range entries allowed. 0 - unlimited + */ + uint64_t range_count_cap = 0; + + void _range_size_tree_rm(range_seg_t& r) { + ceph_assert(num_free >= r.length()); + num_free -= r.length(); + range_size_tree.erase(r); + + } + void _range_size_tree_try_insert(range_seg_t& r) { + if (_try_insert_range(r.start, r.end)) { + range_size_tree.insert(r); + num_free += r.length(); + } else { + range_tree.erase_and_dispose(r, dispose_rs{}); + } + } + bool _try_insert_range(uint64_t start, + uint64_t end, + range_tree_t::iterator* insert_pos = nullptr) { + bool res = !range_count_cap || range_size_tree.size() < range_count_cap; + bool remove_lowest = false; + if (!res) { + if (end - start > _lowest_size_available()) { + remove_lowest = true; + res = true; + } + } + if (!res) { + _spillover_range(start, end); + } else { + // NB: we should do insertion before the following removal + // to avoid potential iterator disposal insertion might depend on. + if (insert_pos) { + auto new_rs = new range_seg_t{ start, end }; + range_tree.insert_before(*insert_pos, *new_rs); + range_size_tree.insert(*new_rs); + num_free += new_rs->length(); + } + if (remove_lowest) { + auto r = range_size_tree.begin(); + _range_size_tree_rm(*r); + _spillover_range(r->start, r->end); + range_tree.erase_and_dispose(*r, dispose_rs{}); + } + } + return res; + } + virtual void _spillover_range(uint64_t start, uint64_t end) { + // this should be overriden when range count cap is present, + // i.e. (range_count_cap > 0) + ceph_assert(false); + } +protected: + // called when extent to be released/marked free + virtual void _add_to_tree(uint64_t start, uint64_t size); + +protected: + CephContext* cct; + std::mutex lock; + + double _get_fragmentation() const { + auto free_blocks = p2align(num_free, block_size) / block_size; + if (free_blocks <= 1) { + return .0; + } + return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1)); + } + void _dump() const; + + uint64_t _lowest_size_available() { + auto rs = range_size_tree.begin(); + return rs != range_size_tree.end() ? 
rs->length() : 0; + } + + int64_t _allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector *extents); + + void _release(const interval_set<uint64_t>& release_set); + void _release(const PExtentVector& release_set); + void _shutdown(); + + void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs); + void _remove_from_tree(uint64_t start, uint64_t size); + void _try_remove_from_tree(uint64_t start, uint64_t size, + std::function<void(uint64_t offset, uint64_t length, bool found)> cb); + + uint64_t _get_free() const { + return num_free; + } +}; diff --git a/src/os/bluestore/BitmapAllocator.cc b/src/os/bluestore/BitmapAllocator.cc new file mode 100755 index 00000000..c24a333a --- /dev/null +++ b/src/os/bluestore/BitmapAllocator.cc @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "BitmapAllocator.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "fbmap_alloc " << this << " " + +BitmapAllocator::BitmapAllocator(CephContext* _cct, + int64_t capacity, + int64_t alloc_unit, + const std::string& name) : + Allocator(name), + cct(_cct) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/" + << alloc_unit << std::dec << dendl; + _init(capacity, alloc_unit, false); +} + +int64_t BitmapAllocator::allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) +{ + uint64_t allocated = 0; + size_t old_size = extents->size(); + ldout(cct, 10) << __func__ << std::hex << " 0x" << want_size + << "/" << alloc_unit << "," << max_alloc_size << "," << hint + << std::dec << dendl; + + + _allocate_l2(want_size, alloc_unit, max_alloc_size, hint, + &allocated, extents); + if (!allocated) { + return -ENOSPC; + } + for (auto i = old_size; i < extents->size(); ++i) { + auto& e = (*extents)[i]; + ldout(cct, 10) << __func__ + << " extent: 0x" << std::hex << e.offset << "~" << e.length + << "/" << alloc_unit << "," << max_alloc_size << "," << hint + << std::dec << dendl; + } + return int64_t(allocated); +} + +void BitmapAllocator::release( + const interval_set<uint64_t>& release_set) +{ + for (auto r : release_set) { + ldout(cct, 10) << __func__ << " 0x" << std::hex << r.first << "~" << r.second + << std::dec << dendl; + } + _free_l2(release_set); + ldout(cct, 10) << __func__ << " done" << dendl; +} + + +void BitmapAllocator::init_add_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + + auto mas = get_min_alloc_size(); + uint64_t offs = round_up_to(offset, mas); + uint64_t l = p2align(offset + length - offs, mas); + + _mark_free(offs, l); + ldout(cct, 10) << __func__ << " done" << dendl; +} +void BitmapAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + auto mas = get_min_alloc_size(); + uint64_t offs = round_up_to(offset, mas); + uint64_t l = p2align(offset + length - offs, mas); + _mark_allocated(offs, l); + ldout(cct, 10) << __func__ << " done" << dendl; +} + +void BitmapAllocator::shutdown() +{ + ldout(cct, 1) << __func__ << dendl; + _shutdown(); +} + +void BitmapAllocator::dump() +{ + // bin -> interval count + std::map<size_t, size_t> bins_overall; + collect_stats(bins_overall); + auto it = bins_overall.begin(); + 
while (it != bins_overall.end()) { + ldout(cct, 0) << __func__ + << " bin " << it->first + << "(< " << byte_u_t((1 << (it->first + 1)) * get_min_alloc_size()) << ")" + << " : " << it->second << " extents" + << dendl; + ++it; + } +} + +void BitmapAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify) +{ + size_t alloc_size = get_min_alloc_size(); + auto multiply_by_alloc_size = [alloc_size, notify](size_t off, size_t len) { + notify(off * alloc_size, len * alloc_size); + }; + std::lock_guard lck(lock); + l1.dump(multiply_by_alloc_size); +} diff --git a/src/os/bluestore/BitmapAllocator.h b/src/os/bluestore/BitmapAllocator.h new file mode 100755 index 00000000..51ebaa42 --- /dev/null +++ b/src/os/bluestore/BitmapAllocator.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H +#define CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H + +#include <mutex> + +#include "Allocator.h" +#include "os/bluestore/bluestore_types.h" +#include "fastbmap_allocator_impl.h" +#include "include/mempool.h" +#include "common/debug.h" + +class BitmapAllocator : public Allocator, + public AllocatorLevel02<AllocatorLevel01Loose> { + CephContext* cct; + +public: + BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit, const std::string& name); + ~BitmapAllocator() override + { + } + + int64_t allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) override; + + void release( + const interval_set<uint64_t>& release_set) override; + + using Allocator::release; + + uint64_t get_free() override + { + return get_available(); + } + + void dump() override; + void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override; + double get_fragmentation() override + { + return _get_fragmentation(); + } + + void init_add_free(uint64_t offset, uint64_t length) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + + void shutdown() override; +}; + +#endif diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc new file mode 100644 index 00000000..13c60b49 --- /dev/null +++ b/src/os/bluestore/BitmapFreelistManager.cc @@ -0,0 +1,600 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "BitmapFreelistManager.h" +#include "kv/KeyValueDB.h" +#include "os/kv.h" + +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "freelist " + +void make_offset_key(uint64_t offset, std::string *key) +{ + key->reserve(10); + _key_encode_u64(offset, key); +} + +struct XorMergeOperator : public KeyValueDB::MergeOperator { + void merge_nonexistent( + const char *rdata, size_t rlen, std::string *new_value) override { + *new_value = std::string(rdata, rlen); + } + void merge( + const char *ldata, size_t llen, + const char *rdata, size_t rlen, + std::string *new_value) override { + ceph_assert(llen == rlen); + *new_value = std::string(ldata, llen); + for (size_t i = 0; i < rlen; ++i) { + (*new_value)[i] ^= rdata[i]; + } + } + // We use each operator name and each prefix to construct the + // overall RocksDB operator name for consistency check at open time. 
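+ // Bitwise XOR makes allocate and release symmetric: both flip the same bits
+ // via _xor(), a missing key behaves like an all-zero value, and flipping the
+ // same range twice cancels out, so state changes can be queued as merges
+ // without first reading the previous value.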
+ const char *name() const override { + return "bitwise_xor"; + } +}; + +void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix) +{ + std::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator); + db->set_merge_operator(prefix, merge_op); +} + +BitmapFreelistManager::BitmapFreelistManager(CephContext* cct, + string meta_prefix, + string bitmap_prefix) + : FreelistManager(cct), + meta_prefix(meta_prefix), + bitmap_prefix(bitmap_prefix), + enumerate_bl_pos(0) +{ +} + +int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity, + KeyValueDB::Transaction txn) +{ + bytes_per_block = granularity; + ceph_assert(isp2(bytes_per_block)); + size = p2align(new_size, bytes_per_block); + blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key; + + _init_misc(); + + blocks = size / bytes_per_block; + if (blocks / blocks_per_key * blocks_per_key != blocks) { + blocks = (blocks / blocks_per_key + 1) * blocks_per_key; + dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size + << " to 0x" << (blocks * bytes_per_block) + << " (0x" << blocks << " blocks)" << std::dec << dendl; + // set past-eof blocks as allocated + _xor(size, blocks * bytes_per_block - size, txn); + } + dout(10) << __func__ + << " size 0x" << std::hex << size + << " bytes_per_block 0x" << bytes_per_block + << " blocks 0x" << blocks + << " blocks_per_key 0x" << blocks_per_key + << std::dec << dendl; + { + bufferlist bl; + encode(bytes_per_block, bl); + txn->set(meta_prefix, "bytes_per_block", bl); + } + { + bufferlist bl; + encode(blocks_per_key, bl); + txn->set(meta_prefix, "blocks_per_key", bl); + } + { + bufferlist bl; + encode(blocks, bl); + txn->set(meta_prefix, "blocks", bl); + } + { + bufferlist bl; + encode(size, bl); + txn->set(meta_prefix, "size", bl); + } + return 0; +} + +int BitmapFreelistManager::expand(uint64_t new_size, KeyValueDB::Transaction txn) +{ + assert(new_size > size); + ceph_assert(isp2(bytes_per_block)); + + uint64_t blocks0 = size / bytes_per_block; + if (blocks0 / blocks_per_key * blocks_per_key != blocks0) { + blocks0 = (blocks0 / blocks_per_key + 1) * blocks_per_key; + dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size + << " to 0x" << (blocks0 * bytes_per_block) + << " (0x" << blocks0 << " blocks)" << std::dec << dendl; + // reset previous past-eof blocks to unallocated + _xor(size, blocks0 * bytes_per_block - size, txn); + } + + size = p2align(new_size, bytes_per_block); + blocks = size / bytes_per_block; + + if (blocks / blocks_per_key * blocks_per_key != blocks) { + blocks = (blocks / blocks_per_key + 1) * blocks_per_key; + dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size + << " to 0x" << (blocks * bytes_per_block) + << " (0x" << blocks << " blocks)" << std::dec << dendl; + // set past-eof blocks as allocated + _xor(size, blocks * bytes_per_block - size, txn); + } + + dout(10) << __func__ + << " size 0x" << std::hex << size + << " bytes_per_block 0x" << bytes_per_block + << " blocks 0x" << blocks + << " blocks_per_key 0x" << blocks_per_key + << std::dec << dendl; + { + bufferlist bl; + encode(blocks, bl); + txn->set(meta_prefix, "blocks", bl); + } + { + bufferlist bl; + encode(size, bl); + txn->set(meta_prefix, "size", bl); + } + return 0; +} + +int BitmapFreelistManager::init(KeyValueDB *kvdb) +{ + dout(1) << __func__ << dendl; + + KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix); + it->lower_bound(string()); + + // load meta + while (it->valid()) { + string k = it->key(); + if (k == 
"bytes_per_block") { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + decode(bytes_per_block, p); + dout(10) << __func__ << " bytes_per_block 0x" << std::hex + << bytes_per_block << std::dec << dendl; + } else if (k == "blocks") { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + decode(blocks, p); + dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec + << dendl; + } else if (k == "size") { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + decode(size, p); + dout(10) << __func__ << " size 0x" << std::hex << size << std::dec + << dendl; + } else if (k == "blocks_per_key") { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + decode(blocks_per_key, p); + dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key + << std::dec << dendl; + } else { + derr << __func__ << " unrecognized meta " << k << dendl; + return -EIO; + } + it->next(); + } + + dout(10) << __func__ << std::hex + << " size 0x" << size + << " bytes_per_block 0x" << bytes_per_block + << " blocks 0x" << blocks + << " blocks_per_key 0x" << blocks_per_key + << std::dec << dendl; + _init_misc(); + return 0; +} + +void BitmapFreelistManager::_init_misc() +{ + bufferptr z(blocks_per_key >> 3); + memset(z.c_str(), 0xff, z.length()); + all_set_bl.clear(); + all_set_bl.append(z); + + block_mask = ~(bytes_per_block - 1); + + bytes_per_key = bytes_per_block * blocks_per_key; + key_mask = ~(bytes_per_key - 1); + dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key + << ", key_mask 0x" << key_mask << std::dec + << dendl; +} + +void BitmapFreelistManager::shutdown() +{ + dout(1) << __func__ << dendl; +} + +void BitmapFreelistManager::enumerate_reset() +{ + std::lock_guard l(lock); + enumerate_offset = 0; + enumerate_bl_pos = 0; + enumerate_bl.clear(); + enumerate_p.reset(); +} + +int get_next_clear_bit(bufferlist& bl, int start) +{ + const char *p = bl.c_str(); + int bits = bl.length() << 3; + while (start < bits) { + // byte = start / 8 (or start >> 3) + // bit = start % 8 (or start & 7) + unsigned char byte_mask = 1 << (start & 7); + if ((p[start >> 3] & byte_mask) == 0) { + return start; + } + ++start; + } + return -1; // not found +} + +int get_next_set_bit(bufferlist& bl, int start) +{ + const char *p = bl.c_str(); + int bits = bl.length() << 3; + while (start < bits) { + int which_byte = start / 8; + int which_bit = start % 8; + unsigned char byte_mask = 1 << which_bit; + if (p[which_byte] & byte_mask) { + return start; + } + ++start; + } + return -1; // not found +} + +bool BitmapFreelistManager::enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) +{ + std::lock_guard l(lock); + + // initial base case is a bit awkward + if (enumerate_offset == 0 && enumerate_bl_pos == 0) { + dout(10) << __func__ << " start" << dendl; + enumerate_p = kvdb->get_iterator(bitmap_prefix); + enumerate_p->lower_bound(string()); + // we assert that the first block is always allocated; it's true, + // and it simplifies our lives a bit. 
+ ceph_assert(enumerate_p->valid()); + string k = enumerate_p->key(); + const char *p = k.c_str(); + _key_decode_u64(p, &enumerate_offset); + enumerate_bl = enumerate_p->value(); + ceph_assert(enumerate_offset == 0); + ceph_assert(get_next_set_bit(enumerate_bl, 0) == 0); + } + + if (enumerate_offset >= size) { + dout(10) << __func__ << " end" << dendl; + return false; + } + + // skip set bits to find offset + while (true) { + enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos); + if (enumerate_bl_pos >= 0) { + *offset = _get_offset(enumerate_offset, enumerate_bl_pos); + dout(30) << __func__ << " found clear bit, key 0x" << std::hex + << enumerate_offset << " bit 0x" << enumerate_bl_pos + << " offset 0x" << *offset + << std::dec << dendl; + break; + } + dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset + << std::dec << dendl; + enumerate_p->next(); + enumerate_bl.clear(); + if (!enumerate_p->valid()) { + enumerate_offset += bytes_per_key; + enumerate_bl_pos = 0; + *offset = _get_offset(enumerate_offset, enumerate_bl_pos); + break; + } + string k = enumerate_p->key(); + const char *p = k.c_str(); + uint64_t next = enumerate_offset + bytes_per_key; + _key_decode_u64(p, &enumerate_offset); + enumerate_bl = enumerate_p->value(); + enumerate_bl_pos = 0; + if (enumerate_offset > next) { + dout(30) << " no key at 0x" << std::hex << next << ", got 0x" + << enumerate_offset << std::dec << dendl; + *offset = next; + break; + } + } + + // skip clear bits to find the end + uint64_t end = 0; + if (enumerate_p->valid()) { + while (true) { + enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos); + if (enumerate_bl_pos >= 0) { + end = _get_offset(enumerate_offset, enumerate_bl_pos); + dout(30) << __func__ << " found set bit, key 0x" << std::hex + << enumerate_offset << " bit 0x" << enumerate_bl_pos + << " offset 0x" << end << std::dec + << dendl; + end = std::min(get_alloc_units() * bytes_per_block, end); + *length = end - *offset; + dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length + << std::dec << dendl; + return true; + } + dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset + << std::dec << dendl; + enumerate_p->next(); + enumerate_bl.clear(); + enumerate_bl_pos = 0; + if (!enumerate_p->valid()) { + break; + } + string k = enumerate_p->key(); + const char *p = k.c_str(); + _key_decode_u64(p, &enumerate_offset); + enumerate_bl = enumerate_p->value(); + } + } + + if (enumerate_offset < size) { + end = get_alloc_units() * bytes_per_block; + *length = end - *offset; + dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length + << std::dec << dendl; + enumerate_offset = size; + enumerate_bl_pos = blocks_per_key; + return true; + } + + dout(10) << __func__ << " end" << dendl; + return false; +} + +void BitmapFreelistManager::dump(KeyValueDB *kvdb) +{ + enumerate_reset(); + uint64_t offset, length; + while (enumerate_next(kvdb, &offset, &length)) { + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + } +} + +void BitmapFreelistManager::_verify_range(KeyValueDB *kvdb, + uint64_t offset, uint64_t length, + int val) +{ + unsigned errors = 0; + uint64_t first_key = offset & key_mask; + uint64_t last_key = (offset + length - 1) & key_mask; + if (first_key == last_key) { + string k; + make_offset_key(first_key, &k); + bufferlist bl; + kvdb->get(bitmap_prefix, k, &bl); + if (bl.length() > 0) { + const char *p = bl.c_str(); + unsigned s = (offset & ~key_mask) / 
bytes_per_block; + unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; + for (unsigned i = s; i <= e; ++i) { + int has = !!(p[i >> 3] & (1ull << (i & 7))); + if (has != val) { + derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" + << i << " has 0x" << has << " expected 0x" << val + << std::dec << dendl; + ++errors; + } + } + } else { + if (val) { + derr << __func__ << " key 0x" << std::hex << first_key + << " not present, expected 0x" << val << std::dec << dendl; + ++errors; + } + } + } else { + // first key + { + string k; + make_offset_key(first_key, &k); + bufferlist bl; + kvdb->get(bitmap_prefix, k, &bl); + if (bl.length()) { + const char *p = bl.c_str(); + unsigned s = (offset & ~key_mask) / bytes_per_block; + unsigned e = blocks_per_key; + for (unsigned i = s; i < e; ++i) { + int has = !!(p[i >> 3] & (1ull << (i & 7))); + if (has != val) { + derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" + << i << " has 0x" << has << " expected 0x" << val << std::dec + << dendl; + ++errors; + } + } + } else { + if (val) { + derr << __func__ << " key 0x" << std::hex << first_key + << " not present, expected 0x" << val << std::dec << dendl; + ++errors; + } + } + first_key += bytes_per_key; + } + // middle keys + if (first_key < last_key) { + while (first_key < last_key) { + string k; + make_offset_key(first_key, &k); + bufferlist bl; + kvdb->get(bitmap_prefix, k, &bl); + if (bl.length() > 0) { + const char *p = bl.c_str(); + for (unsigned i = 0; i < blocks_per_key; ++i) { + int has = !!(p[i >> 3] & (1ull << (i & 7))); + if (has != val) { + derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" + << i << " has 0x" << has << " expected 0x" << val + << std::dec << dendl; + ++errors; + } + } + } else { + if (val) { + derr << __func__ << " key 0x" << std::hex << first_key + << " not present, expected 0x" << val << std::dec << dendl; + ++errors; + } + } + first_key += bytes_per_key; + } + } + ceph_assert(first_key == last_key); + { + string k; + make_offset_key(first_key, &k); + bufferlist bl; + kvdb->get(bitmap_prefix, k, &bl); + if (bl.length() > 0) { + const char *p = bl.c_str(); + unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; + for (unsigned i = 0; i < e; ++i) { + int has = !!(p[i >> 3] & (1ull << (i & 7))); + if (has != val) { + derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" + << i << " has 0x" << has << " expected 0x" << val << std::dec + << dendl; + ++errors; + } + } + } else { + if (val) { + derr << __func__ << " key 0x" << std::hex << first_key + << " not present, expected 0x" << val << std::dec << dendl; + ++errors; + } + } + } + } + if (errors) { + derr << __func__ << " saw " << errors << " errors" << dendl; + ceph_abort_msg("bitmap freelist errors"); + } +} + +void BitmapFreelistManager::allocate( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) +{ + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _xor(offset, length, txn); +} + +void BitmapFreelistManager::release( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) +{ + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _xor(offset, length, txn); +} + +void BitmapFreelistManager::_xor( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) +{ + // must be block aligned + ceph_assert((offset & block_mask) == offset); + ceph_assert((length & block_mask) == length); + + uint64_t first_key = offset & 
key_mask; + uint64_t last_key = (offset + length - 1) & key_mask; + dout(20) << __func__ << " first_key 0x" << std::hex << first_key + << " last_key 0x" << last_key << std::dec << dendl; + + if (first_key == last_key) { + bufferptr p(blocks_per_key >> 3); + p.zero(); + unsigned s = (offset & ~key_mask) / bytes_per_block; + unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; + for (unsigned i = s; i <= e; ++i) { + p[i >> 3] ^= 1ull << (i & 7); + } + string k; + make_offset_key(first_key, &k); + bufferlist bl; + bl.append(p); + dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": "; + bl.hexdump(*_dout, false); + *_dout << dendl; + txn->merge(bitmap_prefix, k, bl); + } else { + // first key + { + bufferptr p(blocks_per_key >> 3); + p.zero(); + unsigned s = (offset & ~key_mask) / bytes_per_block; + unsigned e = blocks_per_key; + for (unsigned i = s; i < e; ++i) { + p[i >> 3] ^= 1ull << (i & 7); + } + string k; + make_offset_key(first_key, &k); + bufferlist bl; + bl.append(p); + dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": "; + bl.hexdump(*_dout, false); + *_dout << dendl; + txn->merge(bitmap_prefix, k, bl); + first_key += bytes_per_key; + } + // middle keys + while (first_key < last_key) { + string k; + make_offset_key(first_key, &k); + dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec + << ": "; + all_set_bl.hexdump(*_dout, false); + *_dout << dendl; + txn->merge(bitmap_prefix, k, all_set_bl); + first_key += bytes_per_key; + } + ceph_assert(first_key == last_key); + { + bufferptr p(blocks_per_key >> 3); + p.zero(); + unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; + for (unsigned i = 0; i <= e; ++i) { + p[i >> 3] ^= 1ull << (i & 7); + } + string k; + make_offset_key(first_key, &k); + bufferlist bl; + bl.append(p); + dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": "; + bl.hexdump(*_dout, false); + *_dout << dendl; + txn->merge(bitmap_prefix, k, bl); + } + } +} diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h new file mode 100644 index 00000000..9f076e77 --- /dev/null +++ b/src/os/bluestore/BitmapFreelistManager.h @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H +#define CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H + +#include "FreelistManager.h" + +#include <string> +#include <mutex> + +#include "common/ceph_mutex.h" +#include "include/buffer.h" +#include "kv/KeyValueDB.h" + +class BitmapFreelistManager : public FreelistManager { + std::string meta_prefix, bitmap_prefix; + std::shared_ptr<KeyValueDB::MergeOperator> merge_op; + ceph::mutex lock = ceph::make_mutex("BitmapFreelistManager::lock"); + + uint64_t size; ///< size of device (bytes) + uint64_t bytes_per_block; ///< bytes per block (bdev_block_size) + uint64_t blocks_per_key; ///< blocks (bits) per key/value pair + uint64_t bytes_per_key; ///< bytes per key/value pair + uint64_t blocks; ///< size of device (blocks, size rounded up) + + uint64_t block_mask; ///< mask to convert byte offset to block offset + uint64_t key_mask; ///< mask to convert offset to key offset + + bufferlist all_set_bl; + + KeyValueDB::Iterator enumerate_p; + uint64_t enumerate_offset; ///< logical offset; position + bufferlist enumerate_bl; ///< current key at enumerate_offset + int enumerate_bl_pos; ///< bit position in enumerate_bl + + uint64_t _get_offset(uint64_t key_off, 
int bit) { + return key_off + bit * bytes_per_block; + } + + void _init_misc(); + + void _verify_range(KeyValueDB *kvdb, + uint64_t offset, uint64_t length, int val); + void _xor( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn); + +public: + BitmapFreelistManager(CephContext* cct, string meta_prefix, + string bitmap_prefix); + + static void setup_merge_operator(KeyValueDB *db, string prefix); + + int create(uint64_t size, uint64_t granularity, + KeyValueDB::Transaction txn) override; + + int expand(uint64_t new_size, + KeyValueDB::Transaction txn) override; + + int init(KeyValueDB *kvdb) override; + void shutdown() override; + + void dump(KeyValueDB *kvdb) override; + + void enumerate_reset() override; + bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) override; + + void allocate( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) override; + void release( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) override; + + inline uint64_t get_size() const override { + return size; + } + inline uint64_t get_alloc_units() const override { + return size / bytes_per_block; + } + inline uint64_t get_alloc_size() const override { + return bytes_per_block; + } + +}; + +#endif diff --git a/src/os/bluestore/BlockDevice.cc b/src/os/bluestore/BlockDevice.cc new file mode 100644 index 00000000..edfc2fb9 --- /dev/null +++ b/src/os/bluestore/BlockDevice.cc @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <libgen.h> +#include <unistd.h> + +#include "BlockDevice.h" + +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) +#include "KernelDevice.h" +#endif + +#if defined(HAVE_SPDK) +#include "NVMEDevice.h" +#endif + +#if defined(HAVE_PMEM) +#include "PMEMDevice.h" +#include "libpmem.h" +#endif + +#include "common/debug.h" +#include "common/EventTrace.h" +#include "common/errno.h" +#include "include/compat.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev " + +void IOContext::aio_wait() +{ + std::unique_lock l(lock); + // see _aio_thread for waker logic + while (num_running.load() > 0) { + dout(10) << __func__ << " " << this + << " waiting for " << num_running.load() << " aios to complete" + << dendl; + cond.wait(l); + } + dout(20) << __func__ << " " << this << " done" << dendl; +} + +uint64_t IOContext::get_num_ios() const +{ + // this is about the simplest model for transaction cost you can + // imagine. there is some fixed overhead cost by saying there is a + // minimum of one "io". and then we have some cost per "io" that is + // a configurable (with different hdd and ssd defaults), and add + // that to the bytes value. + uint64_t ios = 0; +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) + ios += pending_aios.size(); +#endif +#ifdef HAVE_SPDK + ios += total_nseg; +#endif + return ios; +} + +void IOContext::release_running_aios() +{ + ceph_assert(!num_running); +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) + // release aio contexts (including pinned buffers). 
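+ // clearing the list is what actually frees those contexts and drops any
+ // buffer references they hold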
+ running_aios.clear(); +#endif +} + +BlockDevice *BlockDevice::create(CephContext* cct, const string& path, + aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv) +{ + string type = "kernel"; + char buf[PATH_MAX + 1]; + int r = ::readlink(path.c_str(), buf, sizeof(buf) - 1); + if (r >= 0) { + buf[r] = '\0'; + char *bname = ::basename(buf); + if (strncmp(bname, SPDK_PREFIX, sizeof(SPDK_PREFIX)-1) == 0) + type = "ust-nvme"; + } + +#if defined(HAVE_PMEM) + if (type == "kernel") { + int is_pmem = 0; + size_t map_len = 0; + void *addr = pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDONLY, &map_len, &is_pmem); + if (addr != NULL) { + if (is_pmem) + type = "pmem"; + else + dout(1) << path.c_str() << " isn't pmem file" << dendl; + pmem_unmap(addr, map_len); + } else { + dout(1) << "pmem_map_file:" << path.c_str() << " failed." << pmem_errormsg() << dendl; + } + } +#endif + + dout(1) << __func__ << " path " << path << " type " << type << dendl; + +#if defined(HAVE_PMEM) + if (type == "pmem") { + return new PMEMDevice(cct, cb, cbpriv); + } +#endif +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) + if (type == "kernel") { + return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv); + } +#endif +#if defined(HAVE_SPDK) + if (type == "ust-nvme") { + return new NVMEDevice(cct, cb, cbpriv); + } +#endif + + + derr << __func__ << " unknown backend " << type << dendl; + ceph_abort(); + return NULL; +} + +void BlockDevice::queue_reap_ioc(IOContext *ioc) +{ + std::lock_guard l(ioc_reap_lock); + if (ioc_reap_count.load() == 0) + ++ioc_reap_count; + ioc_reap_queue.push_back(ioc); +} + +void BlockDevice::reap_ioc() +{ + if (ioc_reap_count.load()) { + std::lock_guard l(ioc_reap_lock); + for (auto p : ioc_reap_queue) { + dout(20) << __func__ << " reap ioc " << p << dendl; + delete p; + } + ioc_reap_queue.clear(); + --ioc_reap_count; + } +} diff --git a/src/os/bluestore/BlockDevice.h b/src/os/bluestore/BlockDevice.h new file mode 100644 index 00000000..315d46c1 --- /dev/null +++ b/src/os/bluestore/BlockDevice.h @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OS_BLUESTORE_BLOCKDEVICE_H +#define CEPH_OS_BLUESTORE_BLOCKDEVICE_H + +#include <atomic> +#include <condition_variable> +#include <list> +#include <map> +#include <mutex> +#include <set> +#include <string> +#include <vector> + +#include "acconfig.h" +#include "common/ceph_mutex.h" + +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) +#include "ceph_aio.h" +#endif +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/interval_set.h" +#define SPDK_PREFIX "spdk:" + +#if defined(__linux__) +#if !defined(F_SET_FILE_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif +// These values match Linux definition +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 1 // No hints about write life time +#define WRITE_LIFE_SHORT 2 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time +#define WRITE_LIFE_LONG 4 // Data written has a long life time +#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 6 +#else +// On systems don't have WRITE_LIFE_* only use one FD +// And all files are created equal +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 0 // No hints about write life time +#define WRITE_LIFE_SHORT 0 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time +#define WRITE_LIFE_LONG 0 // Data written has a long life time +#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 1 +#endif + +class CephContext; + +/// track in-flight io +struct IOContext { +private: + ceph::mutex lock = ceph::make_mutex("IOContext::lock"); + ceph::condition_variable cond; + int r = 0; + +public: + CephContext* cct; + void *priv; +#ifdef HAVE_SPDK + void *nvme_task_first = nullptr; + void *nvme_task_last = nullptr; + std::atomic_int total_nseg = {0}; +#endif + +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) + std::list<aio_t> pending_aios; ///< not yet submitted + std::list<aio_t> running_aios; ///< submitting or submitted +#endif + std::atomic_int num_pending = {0}; + std::atomic_int num_running = {0}; + bool allow_eio; + + explicit IOContext(CephContext* cct, void *p, bool allow_eio = false) + : cct(cct), priv(p), allow_eio(allow_eio) + {} + + // no copying + IOContext(const IOContext& other) = delete; + IOContext &operator=(const IOContext& other) = delete; + + bool has_pending_aios() { + return num_pending.load(); + } + void release_running_aios(); + void aio_wait(); + uint64_t get_num_ios() const; + + void try_aio_wake() { + assert(num_running >= 1); + + std::lock_guard l(lock); + if (num_running.fetch_sub(1) == 1) { + + // we might have some pending IOs submitted after the check + // as there is no lock protection for aio_submit. + // Hence we might have false conditional trigger. + // aio_wait has to handle that hence do not care here. 
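+      // this was the last running aio on this IOContext: wake anyone blocked in aio_wait()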
+ cond.notify_all(); + } + } + + void set_return_value(int _r) { + r = _r; + } + + int get_return_value() const { + return r; + } +}; + + +class BlockDevice { +public: + CephContext* cct; + typedef void (*aio_callback_t)(void *handle, void *aio); +private: + ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); + std::vector<IOContext*> ioc_reap_queue; + std::atomic_int ioc_reap_count = {0}; + +protected: + uint64_t size; + uint64_t block_size; + bool support_discard = false; + bool rotational = true; + bool lock_exclusive = true; + +public: + aio_callback_t aio_callback; + void *aio_callback_priv; + BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv) + : cct(cct), + size(0), + block_size(0), + aio_callback(cb), + aio_callback_priv(cbpriv) + {} + virtual ~BlockDevice() = default; + + static BlockDevice *create( + CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); + virtual bool supported_bdev_label() { return true; } + virtual bool is_rotational() { return rotational; } + + virtual void aio_submit(IOContext *ioc) = 0; + + void set_no_exclusive_lock() { + lock_exclusive = false; + } + + uint64_t get_size() const { return size; } + uint64_t get_block_size() const { return block_size; } + + /// hook to provide utilization of thinly-provisioned device + virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const { + return false; + } + + virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0; + + virtual int get_devname(std::string *out) { + return -ENOENT; + } + virtual int get_devices(std::set<std::string> *ls) { + std::string s; + if (get_devname(&s) == 0) { + ls->insert(s); + } + return 0; + } + virtual int get_numa_node(int *node) const { + return -EOPNOTSUPP; + } + + virtual int read( + uint64_t off, + uint64_t len, + bufferlist *pbl, + IOContext *ioc, + bool buffered) = 0; + virtual int read_random( + uint64_t off, + uint64_t len, + char *buf, + bool buffered) = 0; + virtual int write( + uint64_t off, + bufferlist& bl, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) = 0; + + virtual int aio_read( + uint64_t off, + uint64_t len, + bufferlist *pbl, + IOContext *ioc) = 0; + virtual int aio_write( + uint64_t off, + bufferlist& bl, + IOContext *ioc, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) = 0; + virtual int flush() = 0; + virtual int discard(uint64_t offset, uint64_t len) { return 0; } + virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; } + virtual void discard_drain() { return; } + + void queue_reap_ioc(IOContext *ioc); + void reap_ioc(); + + // for managing buffered readers/writers + virtual int invalidate_cache(uint64_t off, uint64_t len) = 0; + virtual int open(const std::string& path) = 0; + virtual void close() = 0; + +protected: + bool is_valid_io(uint64_t off, uint64_t len) const { + return (off % block_size == 0 && + len % block_size == 0 && + len > 0 && + off < size && + off + len <= size); + } +}; + +#endif //CEPH_OS_BLUESTORE_BLOCKDEVICE_H diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc new file mode 100644 index 00000000..f7bda939 --- /dev/null +++ b/src/os/bluestore/BlueFS.cc @@ -0,0 +1,3665 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "boost/algorithm/string.hpp" +#include "BlueFS.h" + +#include "common/debug.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include 
"BlockDevice.h" +#include "Allocator.h" +#include "include/ceph_assert.h" +#include "common/admin_socket.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluefs +#undef dout_prefix +#define dout_prefix *_dout << "bluefs " + +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer, + bluefs_file_reader_buffer, bluefs_file_reader); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs); + +static void wal_discard_cb(void *priv, void* priv2) { + BlueFS *bluefs = static_cast<BlueFS*>(priv); + interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); + bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp); +} + +static void db_discard_cb(void *priv, void* priv2) { + BlueFS *bluefs = static_cast<BlueFS*>(priv); + interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); + bluefs->handle_discard(BlueFS::BDEV_DB, *tmp); +} + +static void slow_discard_cb(void *priv, void* priv2) { + BlueFS *bluefs = static_cast<BlueFS*>(priv); + interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); + bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp); +} + +class BlueFS::SocketHook : public AdminSocketHook { + BlueFS* bluefs; +public: + static BlueFS::SocketHook* create(BlueFS* bluefs) + { + BlueFS::SocketHook* hook = nullptr; + AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); + if (admin_socket) { + hook = new BlueFS::SocketHook(bluefs); + int r = admin_socket->register_command("bluestore bluefs available", + "bluestore bluefs available " + "name=alloc_size,type=CephInt,req=false", + hook, + "Report available space for bluefs. " + "If alloc_size set, make simulation."); + if (r != 0) { + ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl; + delete hook; + hook = nullptr; + } else { + r = admin_socket->register_command("bluestore bluefs stats", + "bluestore bluefs stats", + hook, + "Dump internal statistics for bluefs."); + ceph_assert(r == 0); + r = admin_socket->register_command("bluefs debug_inject_read_zeros", + "bluefs debug_inject_read_zeros", + hook, + "Injects 8K zeros into next BlueFS read. 
Debug only."); + ceph_assert(r == 0); + } + } + return hook; + } + + ~SocketHook() { + AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); + admin_socket->unregister_commands(this); + } +private: + SocketHook(BlueFS* bluefs) : + bluefs(bluefs) {} + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + stringstream ss; + bool r = true; + if (command == "bluestore bluefs available") { + int64_t alloc_size = 0; + cmd_getval(bluefs->cct, cmdmap, "alloc_size", alloc_size); + if ((alloc_size & (alloc_size - 1)) != 0) { + ss << "Invalid allocation size:'" << alloc_size << std::endl; + } + if (alloc_size == 0) + alloc_size = bluefs->cct->_conf->bluefs_alloc_size; + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + f->open_object_section("bluefs_available_space"); + for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) { + if (bluefs->bdev[dev]) { + f->open_object_section("dev"); + f->dump_string("device", bluefs->get_device_name(dev)); + ceph_assert(bluefs->alloc[dev]); + f->dump_int("free", bluefs->alloc[dev]->get_free()); + f->close_section(); + } + } + size_t extra_space = 0; + if (bluefs->slow_dev_expander) { + extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size); + } + f->dump_int("available_from_bluestore", extra_space); + f->close_section(); + f->flush(ss); + delete f; + } else if (command == "bluestore bluefs stats") { + bluefs->dump_block_extents(ss); + bluefs->dump_volume_selector(ss); + } else if (command == "bluefs debug_inject_read_zeros") { + bluefs->inject_read_zeros++; + } else { + ss << "Invalid command" << std::endl; + r = false; + } + out.append(ss); + return r; + } +}; + +BlueFS::BlueFS(CephContext* cct) + : cct(cct), + bdev(MAX_BDEV), + ioc(MAX_BDEV), + block_all(MAX_BDEV) +{ + discard_cb[BDEV_WAL] = wal_discard_cb; + discard_cb[BDEV_DB] = db_discard_cb; + discard_cb[BDEV_SLOW] = slow_discard_cb; + asok_hook = SocketHook::create(this); +} + +BlueFS::~BlueFS() +{ + delete asok_hook; + for (auto p : ioc) { + if (p) + p->aio_wait(); + } + for (auto p : bdev) { + if (p) { + p->close(); + delete p; + } + } + for (auto p : ioc) { + delete p; + } +} + +void BlueFS::_init_logger() +{ + PerfCountersBuilder b(cct, "bluefs", + l_bluefs_first, l_bluefs_last); + b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", + "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", + "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", + "Total bytes (main db device)", + "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes", + "Used bytes (main db device)", + "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", + "Total bytes (wal device)", + "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", + "Used bytes (wal device)", + "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", + "Total bytes (slow device)", + "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes", + "Used bytes (slow device)", + "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_num_files, "num_files", "File count", + "f", 
PerfCountersBuilder::PRIO_USEFUL); + b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log", + "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", + "Compactions of the metadata log"); + b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", + "Bytes written to the metadata log", "j", + PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal", + "Files written to WAL"); + b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst", + "Files written to SSTs"); + b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal", + "Bytes written to WAL", "wal", + PerfCountersBuilder::PRIO_CRITICAL); + b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst", + "Bytes written to SSTs", "sst", + PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow", + "Bytes written to WAL/SSTs at slow device", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal", + "Maximum bytes allocated from WAL"); + b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db", + "Maximum bytes allocated from DB"); + b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow", + "Maximum bytes allocated from SLOW"); + + b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", + "random read requests processed"); + b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes", + "Bytes requested in random read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count", + "random reads requests going to disk"); + b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes", + "Bytes read from disk in random read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count", + "random read requests processed using prefetch buffer"); + b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes", + "Bytes read from prefetch buffer in random read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + + b.add_u64_counter(l_bluefs_read_count, "read_count", + "buffered read requests processed"); + b.add_u64_counter(l_bluefs_read_bytes, "read_bytes", + "Bytes requested in buffered read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + + b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count", + "prefetch read requests processed"); + b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes", + "Bytes requested in prefetch read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate", + "How many times bluefs read found page with all 0s"); + b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors", + "How many times bluefs read found transient page with all 0s"); + + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); +} + +void BlueFS::_shutdown_logger() +{ + cct->get_perfcounters_collection()->remove(logger); + delete logger; +} + +void BlueFS::_update_logger_stats() +{ + // we must be holding the lock + logger->set(l_bluefs_num_files, file_map.size()); + logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size); + + 
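+  // per-device total/used gauges are only refreshed for devices that have an allocator attached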
if (alloc[BDEV_WAL]) { + logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size()); + logger->set(l_bluefs_wal_used_bytes, + block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free()); + } + if (alloc[BDEV_DB]) { + logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size()); + logger->set(l_bluefs_db_used_bytes, + block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free()); + } + if (alloc[BDEV_SLOW]) { + logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size()); + logger->set(l_bluefs_slow_used_bytes, + block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free()); + } +} + +int BlueFS::add_block_device(unsigned id, const string& path, bool trim, + bool shared_with_bluestore) +{ + dout(10) << __func__ << " bdev " << id << " path " << path << dendl; + ceph_assert(id < bdev.size()); + ceph_assert(bdev[id] == NULL); + BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, + discard_cb[id], static_cast<void*>(this)); + if (shared_with_bluestore) { + b->set_no_exclusive_lock(); + } + int r = b->open(path); + if (r < 0) { + delete b; + return r; + } + if (trim) { + b->discard(0, b->get_size()); + } + + dout(1) << __func__ << " bdev " << id << " path " << path + << " size " << byte_u_t(b->get_size()) << dendl; + bdev[id] = b; + ioc[id] = new IOContext(cct, NULL); + return 0; +} + +bool BlueFS::bdev_support_label(unsigned id) +{ + ceph_assert(id < bdev.size()); + ceph_assert(bdev[id]); + return bdev[id]->supported_bdev_label(); +} + +uint64_t BlueFS::get_block_device_size(unsigned id) +{ + if (id < bdev.size() && bdev[id]) + return bdev[id]->get_size(); + return 0; +} + +void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length, + bool skip) +{ + dout(1) << __func__ << " bdev " << id + << " 0x" << std::hex << offset << "~" << length << std::dec + << " skip " << skip + << dendl; + + ceph_assert(id < bdev.size()); + ceph_assert(bdev[id]); + ceph_assert(bdev[id]->get_size() >= offset + length); + block_all[id].insert(offset, length); + + if (id < alloc.size() && alloc[id]) { + if (!skip) + log_t.op_alloc_add(id, offset, length); + + alloc[id]->init_add_free(offset, length); + } + + if (logger) + logger->inc(l_bluefs_gift_bytes, length); + dout(10) << __func__ << " done" << dendl; +} + +int BlueFS::reclaim_blocks(unsigned id, uint64_t want, + PExtentVector *extents) +{ + std::unique_lock l(lock); + dout(1) << __func__ << " bdev " << id + << " want 0x" << std::hex << want << std::dec << dendl; + ceph_assert(id < alloc.size()); + ceph_assert(alloc[id]); + + int64_t got = alloc[id]->allocate(want, alloc_size[id], 0, extents); + ceph_assert(got != 0); + if (got < 0) { + derr << __func__ << " failed to allocate space to return to bluestore" + << dendl; + alloc[id]->dump(); + return got; + } + + for (auto& p : *extents) { + block_all[id].erase(p.offset, p.length); + log_t.op_alloc_rm(id, p.offset, p.length); + } + + flush_bdev(); + int r = _flush_and_sync_log(l); + ceph_assert(r == 0); + + logger->inc(l_bluefs_reclaim_bytes, got); + dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want + << " got " << *extents << dendl; + return 0; +} + +void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release) +{ + dout(10) << __func__ << " bdev " << id << dendl; + ceph_assert(alloc[id]); + alloc[id]->release(to_release); +} + +uint64_t BlueFS::get_used() +{ + std::lock_guard l(lock); + uint64_t used = 0; + for (unsigned id = 0; id < MAX_BDEV; ++id) { + if (alloc[id]) { + used += block_all[id].size() - alloc[id]->get_free(); + } + } + return used; 
+} + +uint64_t BlueFS::get_total(unsigned id) +{ + std::lock_guard l(lock); + ceph_assert(id < block_all.size()); + return block_all[id].size(); +} + +uint64_t BlueFS::get_free(unsigned id) +{ + std::lock_guard l(lock); + ceph_assert(id < alloc.size()); + return alloc[id]->get_free(); +} + +void BlueFS::dump_perf_counters(Formatter *f) +{ + f->open_object_section("bluefs_perf_counters"); + logger->dump_formatted(f,0); + f->close_section(); +} + +void BlueFS::dump_block_extents(ostream& out) +{ + for (unsigned i = 0; i < MAX_BDEV; ++i) { + if (!bdev[i]) { + continue; + } + auto owned = get_total(i); + auto free = get_free(i); + + out << i << " : device size 0x" << std::hex << bdev[i]->get_size() + << " : own 0x" << block_all[i] + << " = 0x" << owned + << " : using 0x" << owned - free + << std::dec << "(" << byte_u_t(owned - free) << ")"; + if (i == _get_slow_device_id()) { + ceph_assert(slow_dev_expander); + ceph_assert(alloc[i]); + free = slow_dev_expander->available_freespace(alloc_size[i]); + out << std::hex + << " : bluestore has 0x" << free + << std::dec << "(" << byte_u_t(free) << ") available"; + } + out << "\n"; + } +} + +void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage) +{ + std::lock_guard l(lock); + usage->resize(bdev.size()); + for (unsigned id = 0; id < bdev.size(); ++id) { + if (!bdev[id]) { + (*usage)[id] = make_pair(0, 0); + continue; + } + (*usage)[id].first = alloc[id]->get_free(); + (*usage)[id].second = block_all[id].size(); + uint64_t used = + (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size(); + dout(10) << __func__ << " bdev " << id + << " free " << (*usage)[id].first + << " (" << byte_u_t((*usage)[id].first) << ")" + << " / " << (*usage)[id].second + << " (" << byte_u_t((*usage)[id].second) << ")" + << ", used " << used << "%" + << dendl; + } +} + +int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " bdev " << id << dendl; + if (id >= block_all.size()) + return -EINVAL; + *extents = block_all[id]; + return 0; +} + +int BlueFS::mkfs(uuid_d osd_uuid) +{ + std::unique_lock l(lock); + dout(1) << __func__ + << " osd_uuid " << osd_uuid + << dendl; + + // set volume selector if not provided before/outside + if (vselector == nullptr) { + vselector.reset( + new OriginalVolumeSelector( + get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, + get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, + get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); + } + + _init_alloc(); + _init_logger(); + + super.version = 1; + super.block_size = bdev[BDEV_DB]->get_block_size(); + super.osd_uuid = osd_uuid; + super.uuid.generate_random(); + dout(1) << __func__ << " uuid " << super.uuid << dendl; + + // init log + FileRef log_file = new File; + log_file->fnode.ino = 1; + log_file->vselector_hint = vselector->get_hint_by_device(BDEV_WAL); + int r = _allocate( + vselector->select_prefer_bdev(log_file->vselector_hint), + cct->_conf->bluefs_max_log_runway, + &log_file->fnode); + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + ceph_assert(r == 0); + log_writer = _create_writer(log_file); + + // initial txn + log_t.op_init(); + for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { + interval_set<uint64_t>& p = block_all[bdev]; + if (p.empty()) + continue; + for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { + dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" + << std::hex << q.get_start() << "~" << q.get_len() << std::dec + << dendl; + 
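+      // record every extent this device already owns in the initial log transaction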
log_t.op_alloc_add(bdev, q.get_start(), q.get_len()); + } + } + _flush_and_sync_log(l); + + // write supers + super.log_fnode = log_file->fnode; + _write_super(BDEV_DB); + flush_bdev(); + + // clean up + super = bluefs_super_t(); + _close_writer(log_writer); + log_writer = NULL; + block_all.clear(); + vselector.reset(nullptr); + _stop_alloc(); + _shutdown_logger(); + + dout(10) << __func__ << " success" << dendl; + return 0; +} + +void BlueFS::_init_alloc() +{ + dout(20) << __func__ << dendl; + alloc.resize(MAX_BDEV); + alloc_size.resize(MAX_BDEV, 0); + pending_release.resize(MAX_BDEV); + + if (bdev[BDEV_WAL]) { + alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size; + } + if (bdev[BDEV_SLOW]) { + alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size; + alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size; + } else { + alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size; + } + // new wal and db devices are never shared + if (bdev[BDEV_NEWWAL]) { + alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size; + } + if (bdev[BDEV_NEWDB]) { + alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size; + } + + for (unsigned id = 0; id < bdev.size(); ++id) { + if (!bdev[id]) { + continue; + } + ceph_assert(bdev[id]->get_size()); + std::string name = "bluefs-"; + const char* devnames[] = {"wal","db","slow"}; + if (id <= BDEV_SLOW) + name += devnames[id]; + else + name += to_string(uintptr_t(this)); + ceph_assert(alloc_size[id]); + dout(1) << __func__ << " id " << id + << " alloc_size 0x" << std::hex << alloc_size[id] + << " size 0x" << bdev[id]->get_size() << std::dec << dendl; + alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, + bdev[id]->get_size(), + alloc_size[id], name); + interval_set<uint64_t>& p = block_all[id]; + for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { + alloc[id]->init_add_free(q.get_start(), q.get_len()); + } + } +} + +void BlueFS::_stop_alloc() +{ + dout(20) << __func__ << dendl; + for (auto p : bdev) { + if (p) + p->discard_drain(); + } + + for (auto p : alloc) { + if (p != nullptr) { + p->shutdown(); + delete p; + } + } + alloc.clear(); +} + +int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len, + ceph::buffer::list *pbl, IOContext *ioc, bool buffered) +{ + dout(10) << __func__ << " dev " << int(ndev) + << ": 0x" << std::hex << off << "~" << len << std::dec + << (buffered ? 
" buffered" : "") + << dendl; + int r; + bufferlist bl; + r = bdev[ndev]->read(off, len, &bl, ioc, buffered); + if (r != 0) { + return r; + } + uint64_t block_size = bdev[ndev]->get_block_size(); + if (inject_read_zeros) { + if (len >= block_size * 2) { + derr << __func__ << " injecting error, zeros at " + << int(ndev) << ": 0x" << std::hex << (off + len / 2) + << "~" << (block_size * 2) << std::dec << dendl; + //use beginning, replace 8K in the middle with zeros, use tail + bufferlist temp; + bl.splice(0, len / 2 - block_size, &temp); + temp.append_zero(block_size * 2); + bl.splice(block_size * 2, len / 2 - block_size, &temp); + bl = temp; + inject_read_zeros--; + } + } + //make a check if there is a block with all 0 + uint64_t to_check_len = len; + uint64_t skip = p2nphase(off, block_size); + if (skip >= to_check_len) { + return r; + } + auto it = bl.begin(); + it.seek(skip); + to_check_len -= skip; + bool all_zeros = false; + while (all_zeros == false && to_check_len >= block_size) { + // checking 0s step + unsigned block_left = block_size; + unsigned avail; + const char* data; + all_zeros = true; + while (all_zeros && block_left > 0) { + avail = it.get_ptr_and_advance(block_left, &data); + block_left -= avail; + all_zeros = mem_is_zero(data, avail); + } + // skipping step + while (block_left > 0) { + avail = it.get_ptr_and_advance(block_left, &data); + block_left -= avail; + } + to_check_len -= block_size; + } + if (all_zeros) { + logger->inc(l_bluefs_read_zeros_candidate, 1); + bufferlist bl_reread; + r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered); + if (r != 0) { + return r; + } + // check if both read gave the same + if (!bl.contents_equal(bl_reread)) { + // report problems to log, but continue, maybe it will be good now... + derr << __func__ << " initial read of " << int(ndev) + << ": 0x" << std::hex << off << "~" << len + << std::dec << ": different then re-read " << dendl; + logger->inc(l_bluefs_read_zeros_errors, 1); + } + // use second read will be better if is different + pbl->append(bl_reread); + } else { + pbl->append(bl); + } + return r; +} + +int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered) +{ + dout(10) << __func__ << " dev " << int(ndev) + << ": 0x" << std::hex << off << "~" << len << std::dec + << (buffered ? 
" buffered" : "") + << dendl; + int r; + r = bdev[ndev]->read_random(off, len, buf, buffered); + if (r != 0) { + return r; + } + uint64_t block_size = bdev[ndev]->get_block_size(); + if (inject_read_zeros) { + if (len >= block_size * 2) { + derr << __func__ << " injecting error, zeros at " + << int(ndev) << ": 0x" << std::hex << (off + len / 2) + << "~" << (block_size * 2) << std::dec << dendl; + //zero middle 8K + memset(buf + len / 2 - block_size, 0, block_size * 2); + inject_read_zeros--; + } + } + //make a check if there is a block with all 0 + uint64_t to_check_len = len; + const char* data = buf; + uint64_t skip = p2nphase(off, block_size); + if (skip >= to_check_len) { + return r; + } + to_check_len -= skip; + data += skip; + + bool all_zeros = false; + while (all_zeros == false && to_check_len >= block_size) { + if (mem_is_zero(data, block_size)) { + // at least one block is all zeros + all_zeros = true; + break; + } + data += block_size; + to_check_len -= block_size; + } + if (all_zeros) { + logger->inc(l_bluefs_read_zeros_candidate, 1); + std::unique_ptr<char[]> data_reread(new char[len]); + r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered); + if (r != 0) { + return r; + } + // check if both read gave the same + if (memcmp(buf, &data_reread[0], len) != 0) { + derr << __func__ << " initial read of " << int(ndev) + << ": 0x" << std::hex << off << "~" << len + << std::dec << ": different then re-read " << dendl; + logger->inc(l_bluefs_read_zeros_errors, 1); + // second read is probably better + memcpy(buf, &data_reread[0], len); + } + } + return r; +} + +int BlueFS::mount() +{ + dout(1) << __func__ << dendl; + + int r = _open_super(); + if (r < 0) { + derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; + goto out; + } + + // set volume selector if not provided before/outside + if (vselector == nullptr) { + vselector.reset( + new OriginalVolumeSelector( + get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, + get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, + get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); + } + + block_all.clear(); + block_all.resize(MAX_BDEV); + _init_alloc(); + _init_logger(); + + r = _replay(false, false); + if (r < 0) { + derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; + _stop_alloc(); + goto out; + } + + // init freelist + for (auto& p : file_map) { + dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; + for (auto& q : p.second->fnode.extents) { + alloc[q.bdev]->init_rm_free(q.offset, q.length); + } + } + + // set up the log for future writes + log_writer = _create_writer(_get_file(1)); + ceph_assert(log_writer->file->fnode.ino == 1); + log_writer->pos = log_writer->file->fnode.size; + dout(10) << __func__ << " log write pos set to 0x" + << std::hex << log_writer->pos << std::dec + << dendl; + + return 0; + + out: + super = bluefs_super_t(); + return r; +} + +void BlueFS::umount(bool avoid_compact) +{ + dout(1) << __func__ << dendl; + + sync_metadata(avoid_compact); + + _close_writer(log_writer); + log_writer = NULL; + + vselector.reset(nullptr); + _stop_alloc(); + file_map.clear(); + dir_map.clear(); + super = bluefs_super_t(); + log_t.clear(); + _shutdown_logger(); +} + +int BlueFS::prepare_new_device(int id) +{ + dout(1) << __func__ << dendl; + + if(id == BDEV_NEWDB) { + int new_log_dev_cur = BDEV_WAL; + int new_log_dev_next = BDEV_WAL; + if (!bdev[BDEV_WAL]) { + new_log_dev_cur = BDEV_NEWDB; + new_log_dev_next = BDEV_DB; + } + _rewrite_log_sync(false, + 
BDEV_NEWDB, + new_log_dev_cur, + new_log_dev_next, + RENAME_DB2SLOW); + //} + } else if(id == BDEV_NEWWAL) { + _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL); + } else { + assert(false); + } + return 0; +} + +void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id) +{ + if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB]) + bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm); + if (bdev[BDEV_WAL]) + bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm); +} + +void BlueFS::get_devices(set<string> *ls) +{ + for (unsigned i = 0; i < MAX_BDEV; ++i) { + if (bdev[i]) { + bdev[i]->get_devices(ls); + } + } +} + +int BlueFS::fsck() +{ + std::lock_guard l(lock); + dout(1) << __func__ << dendl; + // hrm, i think we check everything on mount... + return 0; +} + +int BlueFS::_write_super(int dev) +{ + // build superblock + bufferlist bl; + encode(super, bl); + uint32_t crc = bl.crc32c(-1); + encode(crc, bl); + dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; + dout(10) << __func__ << " superblock " << super.version << dendl; + dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; + ceph_assert(bl.length() <= get_super_length()); + bl.append_zero(get_super_length() - bl.length()); + + bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT); + dout(20) << __func__ << " v " << super.version + << " crc 0x" << std::hex << crc + << " offset 0x" << get_super_offset() << std::dec + << dendl; + return 0; +} + +int BlueFS::_open_super() +{ + dout(10) << __func__ << dendl; + + bufferlist bl; + uint32_t expected_crc, crc; + int r; + + // always the second block + r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(), + &bl, ioc[BDEV_DB], false); + if (r < 0) + return r; + + auto p = bl.cbegin(); + decode(super, p); + { + bufferlist t; + t.substr_of(bl, 0, p.get_off()); + crc = t.crc32c(-1); + } + decode(expected_crc, p); + if (crc != expected_crc) { + derr << __func__ << " bad crc on superblock, expected 0x" + << std::hex << expected_crc << " != actual 0x" << crc << std::dec + << dendl; + return -EIO; + } + dout(10) << __func__ << " superblock " << super.version << dendl; + dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; + return 0; +} + +int BlueFS::_replay(bool noop, bool to_stdout) +{ + dout(10) << __func__ << (noop ? 
" NO-OP" : "") << dendl; + ino_last = 1; // by the log + log_seq = 0; + + FileRef log_file; + log_file = _get_file(1); + if (!noop) { + log_file->fnode = super.log_fnode; + log_file->vselector_hint = + vselector->get_hint_by_device(BDEV_WAL); + } else { + // do not use fnode from superblock in 'noop' mode - log_file's one should + // be fine and up-to-date + ceph_assert(log_file->fnode.ino == 1); + ceph_assert(log_file->fnode.extents.size() != 0); + } + dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; + if (unlikely(to_stdout)) { + std::cout << " log_fnode " << super.log_fnode << std::endl; + } + + FileReader *log_reader = new FileReader( + log_file, cct->_conf->bluefs_max_prefetch, + false, // !random + true); // ignore eof + while (true) { + ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0); + uint64_t pos = log_reader->buf.pos; + uint64_t read_pos = pos; + bufferlist bl; + { + int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size, + &bl, NULL); + if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) { + r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl); + } + assert(r == (int)super.block_size); + read_pos += r; + } + uint64_t more = 0; + uint64_t seq; + uuid_d uuid; + { + auto p = bl.cbegin(); + __u8 a, b; + uint32_t len; + decode(a, p); + decode(b, p); + decode(len, p); + decode(uuid, p); + decode(seq, p); + if (len + 6 > bl.length()) { + more = round_up_to(len + 6 - bl.length(), super.block_size); + } + } + if (uuid != super.uuid) { + dout(10) << __func__ << " 0x" << std::hex << pos << std::dec + << ": stop: uuid " << uuid << " != super.uuid " << super.uuid + << dendl; + break; + } + if (seq != log_seq + 1) { + dout(10) << __func__ << " 0x" << std::hex << pos << std::dec + << ": stop: seq " << seq << " != expected " << log_seq + 1 + << dendl; + break; + } + if (more) { + dout(20) << __func__ << " need 0x" << std::hex << more << std::dec + << " more bytes" << dendl; + bufferlist t; + int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL); + if (r < (int)more) { + dout(10) << __func__ << " 0x" << std::hex << pos + << ": stop: len is 0x" << bl.length() + more << std::dec + << ", which is past eof" << dendl; + if (cct->_conf->bluefs_replay_recovery) { + //try to search for more data + r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t); + if (r < (int)more) { + //in normal mode we must read r==more, for recovery it is too strict + break; + } + } + } + ceph_assert(r == (int)more); + bl.claim_append(t); + read_pos += r; + } + bluefs_transaction_t t; + try { + auto p = bl.cbegin(); + decode(t, p); + } + catch (buffer::error& e) { + dout(10) << __func__ << " 0x" << std::hex << pos << std::dec + << ": stop: failed to decode: " << e.what() + << dendl; + delete log_reader; + return -EIO; + } + ceph_assert(seq == t.seq); + dout(10) << __func__ << " 0x" << std::hex << pos << std::dec + << ": " << t << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": " << t << std::endl; + } + + auto p = t.op_bl.cbegin(); + while (!p.end()) { + __u8 op; + decode(op, p); + switch (op) { + + case bluefs_transaction_t::OP_INIT: + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_init" << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_init" << std::endl; + } + + ceph_assert(t.seq == 1); + break; + + case bluefs_transaction_t::OP_JUMP: + { + uint64_t next_seq; + 
uint64_t offset; + decode(next_seq, p); + decode(offset, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_jump seq " << next_seq + << " offset 0x" << std::hex << offset << std::dec << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_jump seq " << next_seq + << " offset 0x" << std::hex << offset << std::dec + << std::endl; + } + + ceph_assert(next_seq >= log_seq); + log_seq = next_seq - 1; // we will increment it below + uint64_t skip = offset - read_pos; + if (skip) { + bufferlist junk; + int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk, + NULL); + if (r != (int)skip) { + dout(10) << __func__ << " 0x" << std::hex << read_pos + << ": stop: failed to skip to " << offset + << std::dec << dendl; + ceph_abort_msg("problem with op_jump"); + } + } + } + break; + + case bluefs_transaction_t::OP_JUMP_SEQ: + { + uint64_t next_seq; + decode(next_seq, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_jump_seq " << next_seq << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_jump_seq " << next_seq << std::endl; + } + + ceph_assert(next_seq >= log_seq); + log_seq = next_seq - 1; // we will increment it below + } + break; + + case bluefs_transaction_t::OP_ALLOC_ADD: + { + __u8 id; + uint64_t offset, length; + decode(id, p); + decode(offset, p); + decode(length, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_alloc_add " << " " << (int)id + << ":0x" << std::hex << offset << "~" << length << std::dec + << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_alloc_add " << " " << (int)id + << ":0x" << std::hex << offset << "~" << length << std::dec + << std::endl; + } + + if (!noop) { + block_all[id].insert(offset, length); + alloc[id]->init_add_free(offset, length); + } + } + break; + + case bluefs_transaction_t::OP_ALLOC_RM: + { + __u8 id; + uint64_t offset, length; + decode(id, p); + decode(offset, p); + decode(length, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_alloc_rm " << " " << (int)id + << ":0x" << std::hex << offset << "~" << length << std::dec + << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_alloc_rm " << " " << (int)id + << ":0x" << std::hex << offset << "~" << length << std::dec + << std::endl; + } + + if (!noop) { + block_all[id].erase(offset, length); + alloc[id]->init_rm_free(offset, length); + } + } + break; + + case bluefs_transaction_t::OP_DIR_LINK: + { + string dirname, filename; + uint64_t ino; + decode(dirname, p); + decode(filename, p); + decode(ino, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_dir_link " << " " << dirname << "/" << filename + << " to " << ino + << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_dir_link " << " " << dirname << "/" << filename + << " to " << ino + << std::endl; + } + + if (!noop) { + FileRef file = _get_file(ino); + ceph_assert(file->fnode.ino); + map<string,DirRef>::iterator q = dir_map.find(dirname); + ceph_assert(q != dir_map.end()); + map<string,FileRef>::iterator r = q->second->file_map.find(filename); + ceph_assert(r == q->second->file_map.end()); + + vselector->sub_usage(file->vselector_hint, file->fnode); + file->vselector_hint = + vselector->get_hint_by_dir(dirname); + vselector->add_usage(file->vselector_hint, file->fnode); + + 
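+	  // install the directory entry and take a reference on the file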
q->second->file_map[filename] = file; + ++file->refs; + } + } + break; + + case bluefs_transaction_t::OP_DIR_UNLINK: + { + string dirname, filename; + decode(dirname, p); + decode(filename, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_dir_unlink " << " " << dirname << "/" << filename + << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_dir_unlink " << " " << dirname << "/" << filename + << std::endl; + } + + if (!noop) { + map<string,DirRef>::iterator q = dir_map.find(dirname); + ceph_assert(q != dir_map.end()); + map<string,FileRef>::iterator r = q->second->file_map.find(filename); + ceph_assert(r != q->second->file_map.end()); + ceph_assert(r->second->refs > 0); + --r->second->refs; + q->second->file_map.erase(r); + } + } + break; + + case bluefs_transaction_t::OP_DIR_CREATE: + { + string dirname; + decode(dirname, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_dir_create " << dirname << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_dir_create " << dirname << std::endl; + } + + if (!noop) { + map<string,DirRef>::iterator q = dir_map.find(dirname); + ceph_assert(q == dir_map.end()); + dir_map[dirname] = new Dir; + } + } + break; + + case bluefs_transaction_t::OP_DIR_REMOVE: + { + string dirname; + decode(dirname, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_dir_remove " << dirname << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_dir_remove " << dirname << std::endl; + } + + if (!noop) { + map<string,DirRef>::iterator q = dir_map.find(dirname); + ceph_assert(q != dir_map.end()); + ceph_assert(q->second->file_map.empty()); + dir_map.erase(q); + } + } + break; + + case bluefs_transaction_t::OP_FILE_UPDATE: + { + bluefs_fnode_t fnode; + decode(fnode, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_file_update " << " " << fnode << " " << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_file_update " << " " << fnode << std::endl; + } + + if (!noop) { + FileRef f = _get_file(fnode.ino); + if (fnode.ino != 1) { + vselector->sub_usage(f->vselector_hint, f->fnode); + } + f->fnode = fnode; + if (fnode.ino != 1) { + vselector->add_usage(f->vselector_hint, f->fnode); + } + + if (fnode.ino > ino_last) { + ino_last = fnode.ino; + } + } + } + break; + + case bluefs_transaction_t::OP_FILE_REMOVE: + { + uint64_t ino; + decode(ino, p); + dout(20) << __func__ << " 0x" << std::hex << pos << std::dec + << ": op_file_remove " << ino << dendl; + if (unlikely(to_stdout)) { + std::cout << " 0x" << std::hex << pos << std::dec + << ": op_file_remove " << ino << std::endl; + } + + if (!noop) { + auto p = file_map.find(ino); + ceph_assert(p != file_map.end()); + vselector->sub_usage(p->second->vselector_hint, p->second->fnode); + file_map.erase(p); + } + } + break; + + default: + derr << __func__ << " 0x" << std::hex << pos << std::dec + << ": stop: unrecognized op " << (int)op << dendl; + delete log_reader; + return -EIO; + } + } + ceph_assert(p.end()); + + // we successfully replayed the transaction; bump the seq and log size + ++log_seq; + log_file->fnode.size = log_reader->buf.pos; + } + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + + dout(10) << __func__ << " log file size was 0x" + << std::hex << log_file->fnode.size << std::dec << dendl; + if 
(unlikely(to_stdout)) { + std::cout << " log file size was 0x" + << std::hex << log_file->fnode.size << std::dec << std::endl; + } + + delete log_reader; + + if (!noop) { + // verify file link counts are all >0 + for (auto& p : file_map) { + if (p.second->refs == 0 && + p.second->fnode.ino > 1) { + derr << __func__ << " file with link count 0: " << p.second->fnode + << dendl; + return -EIO; + } + } + } + + dout(10) << __func__ << " done" << dendl; + return 0; +} + +int BlueFS::log_dump() +{ + // only dump log file's content + int r = _replay(true, true); + if (r < 0) { + derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +int BlueFS::device_migrate_to_existing( + CephContext *cct, + const set<int>& devs_source, + int dev_target) +{ + vector<byte> buf; + bool buffered = cct->_conf->bluefs_buffered_io; + + dout(10) << __func__ << " devs_source " << devs_source + << " dev_target " << dev_target << dendl; + assert(dev_target < (int)MAX_BDEV); + + int flags = 0; + flags |= devs_source.count(BDEV_DB) ? + (REMOVE_DB | RENAME_SLOW2DB) : 0; + flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; + int dev_target_new = dev_target; + + // Slow device without separate DB one is addressed via BDEV_DB + // Hence need renaming. + if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) { + dev_target_new = BDEV_DB; + dout(0) << __func__ << " super to be written to " << dev_target << dendl; + } + + for (auto& p : file_map) { + //do not copy log + if (p.second->fnode.ino == 1) { + continue; + } + dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl; + + auto& fnode_extents = p.second->fnode.extents; + + bool rewrite = false; + for (auto ext_it = fnode_extents.begin(); + ext_it != p.second->fnode.extents.end(); + ++ext_it) { + if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { + rewrite = true; + break; + } + } + if (rewrite) { + dout(10) << __func__ << " migrating" << dendl; + + // read entire file + bufferlist bl; + for (auto old_ext : fnode_extents) { + buf.resize(old_ext.length); + int r = bdev[old_ext.bdev]->read_random( + old_ext.offset, + old_ext.length, + (char*)&buf.at(0), + buffered); + if (r != 0) { + derr << __func__ << " failed to read 0x" << std::hex + << old_ext.offset << "~" << old_ext.length << std::dec + << " from " << (int)dev_target << dendl; + return -EIO; + } + bl.append((char*)&buf[0], old_ext.length); + } + + // write entire file + PExtentVector extents; + auto l = _allocate_without_fallback(dev_target, bl.length(), &extents); + if (l < 0) { + derr << __func__ << " unable to allocate len 0x" << std::hex + << bl.length() << std::dec << " from " << (int)dev_target + << ": " << cpp_strerror(l) << dendl; + return -ENOSPC; + } + + uint64_t off = 0; + for (auto& i : extents) { + bufferlist cur; + uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); + ceph_assert(cur_len > 0); + cur.substr_of(bl, off, cur_len); + int r = bdev[dev_target]->write(i.offset, cur, buffered); + ceph_assert(r == 0); + off += cur_len; + } + + // release old extents + for (auto old_ext : fnode_extents) { + PExtentVector to_release; + to_release.emplace_back(old_ext.offset, old_ext.length); + alloc[old_ext.bdev]->release(to_release); + } + + // update fnode + fnode_extents.clear(); + for (auto& i : extents) { + fnode_extents.emplace_back(dev_target_new, i.offset, i.length); + } + } else { + for (auto ext_it = fnode_extents.begin(); + ext_it != p.second->fnode.extents.end(); + ++ext_it) { + if (dev_target != 
dev_target_new && ext_it->bdev == dev_target) { + dout(20) << __func__ << " " << " ... adjusting extent 0x" + << std::hex << ext_it->offset << std::dec + << " bdev " << dev_target << " -> " << dev_target_new + << dendl; + ext_it->bdev = dev_target_new; + } + } + } + } + // new logging device in the current naming scheme + int new_log_dev_cur = bdev[BDEV_WAL] ? + BDEV_WAL : + bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW; + + // new logging device in new naming scheme + int new_log_dev_next = new_log_dev_cur; + + if (devs_source.count(new_log_dev_cur)) { + // SLOW device is addressed via BDEV_DB too hence either WAL or DB + new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ? + BDEV_DB : + BDEV_WAL; + + dout(0) << __func__ << " log moved from " << new_log_dev_cur + << " to " << new_log_dev_next << dendl; + + new_log_dev_cur = + (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ? + BDEV_SLOW : + new_log_dev_next; + } + + _rewrite_log_sync( + false, + (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB, + new_log_dev_cur, + new_log_dev_next, + flags); + return 0; +} + +int BlueFS::device_migrate_to_new( + CephContext *cct, + const set<int>& devs_source, + int dev_target) +{ + vector<byte> buf; + bool buffered = cct->_conf->bluefs_buffered_io; + + dout(10) << __func__ << " devs_source " << devs_source + << " dev_target " << dev_target << dendl; + assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL); + + int flags = 0; + + flags |= devs_source.count(BDEV_DB) ? + (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : + 0; + flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; + int dev_target_new = dev_target; //FIXME: remove, makes no sense + + for (auto& p : file_map) { + //do not copy log + if (p.second->fnode.ino == 1) { + continue; + } + dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl; + + auto& fnode_extents = p.second->fnode.extents; + + bool rewrite = false; + for (auto ext_it = fnode_extents.begin(); + ext_it != p.second->fnode.extents.end(); + ++ext_it) { + if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { + rewrite = true; + break; + } + } + if (rewrite) { + dout(10) << __func__ << " migrating" << dendl; + + // read entire file + bufferlist bl; + for (auto old_ext : fnode_extents) { + buf.resize(old_ext.length); + int r = bdev[old_ext.bdev]->read_random( + old_ext.offset, + old_ext.length, + (char*)&buf.at(0), + buffered); + if (r != 0) { + derr << __func__ << " failed to read 0x" << std::hex + << old_ext.offset << "~" << old_ext.length << std::dec + << " from " << (int)dev_target << dendl; + return -EIO; + } + bl.append((char*)&buf[0], old_ext.length); + } + + // write entire file + PExtentVector extents; + auto l = _allocate_without_fallback(dev_target, bl.length(), &extents); + if (l < 0) { + derr << __func__ << " unable to allocate len 0x" << std::hex + << bl.length() << std::dec << " from " << (int)dev_target + << ": " << cpp_strerror(l) << dendl; + return -ENOSPC; + } + + uint64_t off = 0; + for (auto& i : extents) { + bufferlist cur; + uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); + ceph_assert(cur_len > 0); + cur.substr_of(bl, off, cur_len); + int r = bdev[dev_target]->write(i.offset, cur, buffered); + ceph_assert(r == 0); + off += cur_len; + } + + // release old extents + for (auto old_ext : fnode_extents) { + PExtentVector to_release; + to_release.emplace_back(old_ext.offset, old_ext.length); + alloc[old_ext.bdev]->release(to_release); + } + + // update fnode + fnode_extents.clear(); + for (auto& i : extents) { + 
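+        // point the fnode at the freshly written copies on the target device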
fnode_extents.emplace_back(dev_target_new, i.offset, i.length); + } + } + } + // new logging device in the current naming scheme + int new_log_dev_cur = + bdev[BDEV_NEWWAL] ? + BDEV_NEWWAL : + bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ? + BDEV_WAL : + bdev[BDEV_NEWDB] ? + BDEV_NEWDB : + bdev[BDEV_DB] && !(flags & REMOVE_DB)? + BDEV_DB : + BDEV_SLOW; + + // new logging device in new naming scheme + int new_log_dev_next = + new_log_dev_cur == BDEV_NEWWAL ? + BDEV_WAL : + new_log_dev_cur == BDEV_NEWDB ? + BDEV_DB : + new_log_dev_cur; + + int super_dev = + dev_target == BDEV_NEWDB ? + BDEV_NEWDB : + bdev[BDEV_DB] ? + BDEV_DB : + BDEV_SLOW; + + _rewrite_log_sync( + false, + super_dev, + new_log_dev_cur, + new_log_dev_next, + flags); + return 0; +} + +BlueFS::FileRef BlueFS::_get_file(uint64_t ino) +{ + auto p = file_map.find(ino); + if (p == file_map.end()) { + FileRef f = new File; + file_map[ino] = f; + dout(30) << __func__ << " ino " << ino << " = " << f + << " (new)" << dendl; + return f; + } else { + dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl; + return p->second; + } +} + +void BlueFS::_drop_link(FileRef file) +{ + dout(20) << __func__ << " had refs " << file->refs + << " on " << file->fnode << dendl; + ceph_assert(file->refs > 0); + --file->refs; + if (file->refs == 0) { + dout(20) << __func__ << " destroying " << file->fnode << dendl; + ceph_assert(file->num_reading.load() == 0); + vselector->sub_usage(file->vselector_hint, file->fnode); + log_t.op_file_remove(file->fnode.ino); + for (auto& r : file->fnode.extents) { + pending_release[r.bdev].insert(r.offset, r.length); + } + file_map.erase(file->fnode.ino); + file->deleted = true; + + if (file->dirty_seq) { + ceph_assert(file->dirty_seq > log_seq_stable); + ceph_assert(dirty_files.count(file->dirty_seq)); + auto it = dirty_files[file->dirty_seq].iterator_to(*file); + dirty_files[file->dirty_seq].erase(it); + file->dirty_seq = 0; + } + } +} + +int64_t BlueFS::_read_random( + FileReader *h, ///< [in] read from here + uint64_t off, ///< [in] offset + size_t len, ///< [in] this many bytes + char *out) ///< [out] optional: or copy it here +{ + auto* buf = &h->buf; + + int64_t ret = 0; + dout(10) << __func__ << " h " << h + << " 0x" << std::hex << off << "~" << len << std::dec + << " from " << h->file->fnode << dendl; + + ++h->file->num_reading; + + if (!h->ignore_eof && + off + len > h->file->fnode.size) { + if (off > h->file->fnode.size) + len = 0; + else + len = h->file->fnode.size - off; + dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" + << std::hex << len << std::dec << dendl; + } + logger->inc(l_bluefs_read_random_count, 1); + logger->inc(l_bluefs_read_random_bytes, len); + + std::shared_lock s_lock(h->lock); + buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); + while (len > 0) { + if (off < buf->bl_off || off >= buf->get_buf_end()) { + s_lock.unlock(); + uint64_t x_off = 0; + auto p = h->file->fnode.seek(off, &x_off); + ceph_assert(p != h->file->fnode.extents.end()); + uint64_t l = std::min(p->length - x_off, static_cast<uint64_t>(len)); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); + dout(20) << __func__ << " read random 0x" + << std::hex << x_off << "~" << l << std::dec + << " of " << *p << dendl; + int r; + if (!cct->_conf->bluefs_check_for_zeros) { + r = bdev[p->bdev]->read_random(p->offset + x_off, l, out, + cct->_conf->bluefs_buffered_io); + } else { + r = read_random(p->bdev, p->offset + x_off, l, out, + cct->_conf->bluefs_buffered_io); + } + 
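+      // reads at this layer are expected to succeed; assert rather than propagate an error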
ceph_assert(r == 0); + off += l; + len -= l; + ret += l; + out += l; + + logger->inc(l_bluefs_read_random_disk_count, 1); + logger->inc(l_bluefs_read_random_disk_bytes, l); + if (len > 0) { + s_lock.lock(); + } + } else { + auto left = buf->get_buf_remaining(off); + int64_t r = std::min(len, left); + logger->inc(l_bluefs_read_random_buffer_count, 1); + logger->inc(l_bluefs_read_random_buffer_bytes, r); + dout(20) << __func__ << " left 0x" << std::hex << left + << " 0x" << off << "~" << len << std::dec + << dendl; + + if (out) { + // NOTE: h->bl is normally a contiguous buffer so c_str() is free. + memcpy(out, buf->bl.c_str() + off - buf->bl_off, r); + out += r; + } + + dout(30) << __func__ << " result chunk (0x" + << std::hex << r << std::dec << " bytes):\n"; + bufferlist t; + t.substr_of(buf->bl, off - buf->bl_off, r); + t.hexdump(*_dout); + *_dout << dendl; + + off += r; + len -= r; + ret += r; + buf->pos += r; + } + } + dout(20) << __func__ << " got " << ret << dendl; + --h->file->num_reading; + return ret; +} + +int64_t BlueFS::_read( + FileReader *h, ///< [in] read from here + FileReaderBuffer *buf, ///< [in] reader state + uint64_t off, ///< [in] offset + size_t len, ///< [in] this many bytes + bufferlist *outbl, ///< [out] optional: reference the result here + char *out) ///< [out] optional: or copy it here +{ + bool prefetch = !outbl && !out; + dout(10) << __func__ << " h " << h + << " 0x" << std::hex << off << "~" << len << std::dec + << " from " << h->file->fnode + << (prefetch ? " prefetch" : "") + << dendl; + + ++h->file->num_reading; + + if (!h->ignore_eof && + off + len > h->file->fnode.size) { + if (off > h->file->fnode.size) + len = 0; + else + len = h->file->fnode.size - off; + dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" + << std::hex << len << std::dec << dendl; + } + logger->inc(l_bluefs_read_count, 1); + logger->inc(l_bluefs_read_bytes, len); + if (prefetch) { + logger->inc(l_bluefs_read_prefetch_count, 1); + logger->inc(l_bluefs_read_prefetch_bytes, len); + } + + if (outbl) + outbl->clear(); + + int64_t ret = 0; + std::shared_lock s_lock(h->lock); + while (len > 0) { + size_t left; + if (off < buf->bl_off || off >= buf->get_buf_end()) { + s_lock.unlock(); + std::unique_lock u_lock(h->lock); + buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); + if (off < buf->bl_off || off >= buf->get_buf_end()) { + // if precondition hasn't changed during locking upgrade. 
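+        // (another reader may have refilled the buffer while we waited for the exclusive lock)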
+ buf->bl.clear(); + buf->bl_off = off & super.block_mask(); + uint64_t x_off = 0; + auto p = h->file->fnode.seek(buf->bl_off, &x_off); + if (p == h->file->fnode.extents.end()) { + dout(5) << __func__ << " reading less then required " + << ret << "<" << ret + len << dendl; + break; + } + + uint64_t want = round_up_to(len + (off & ~super.block_mask()), + super.block_size); + want = std::max(want, buf->max_prefetch); + uint64_t l = std::min(p->length - x_off, want); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); + uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); + if (!h->ignore_eof && + buf->bl_off + l > eof_offset) { + l = eof_offset - buf->bl_off; + } + dout(20) << __func__ << " fetching 0x" + << std::hex << x_off << "~" << l << std::dec + << " of " << *p << dendl; + int r; + if (!cct->_conf->bluefs_check_for_zeros) { + r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev], + cct->_conf->bluefs_buffered_io); + } else { + r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev], + cct->_conf->bluefs_buffered_io); + } + ceph_assert(r == 0); + } + u_lock.unlock(); + s_lock.lock(); + // we should recheck if buffer is valid after lock downgrade + continue; + } + left = buf->get_buf_remaining(off); + dout(20) << __func__ << " left 0x" << std::hex << left + << " len 0x" << len << std::dec << dendl; + + int64_t r = std::min(len, left); + if (outbl) { + bufferlist t; + t.substr_of(buf->bl, off - buf->bl_off, r); + outbl->claim_append(t); + } + if (out) { + // NOTE: h->bl is normally a contiguous buffer so c_str() is free. + memcpy(out, buf->bl.c_str() + off - buf->bl_off, r); + out += r; + } + + dout(30) << __func__ << " result chunk (0x" + << std::hex << r << std::dec << " bytes):\n"; + bufferlist t; + t.substr_of(buf->bl, off - buf->bl_off, r); + t.hexdump(*_dout); + *_dout << dendl; + + off += r; + len -= r; + ret += r; + buf->pos += r; + } + dout(20) << __func__ << " got " << ret << dendl; + ceph_assert(!outbl || (int)outbl->length() == ret); + --h->file->num_reading; + return ret; +} + +void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length) +{ + dout(10) << __func__ << " file " << f->fnode + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + if (offset & ~super.block_mask()) { + offset &= super.block_mask(); + length = round_up_to(length, super.block_size); + } + uint64_t x_off = 0; + auto p = f->fnode.seek(offset, &x_off); + while (length > 0 && p != f->fnode.extents.end()) { + uint64_t x_len = std::min(p->length - x_off, length); + bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len); + dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len + << std:: dec << " of " << *p << dendl; + offset += x_len; + length -= x_len; + } +} + +uint64_t BlueFS::_estimate_log_size() +{ + int avg_dir_size = 40; // fixme + int avg_file_size = 12; + uint64_t size = 4096 * 2; + size += file_map.size() * (1 + sizeof(bluefs_fnode_t)); + for (auto& p : block_all) + size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2); + size += dir_map.size() + (1 + avg_dir_size); + size += file_map.size() * (1 + avg_dir_size + avg_file_size); + return round_up_to(size, super.block_size); +} + +void BlueFS::compact_log() +{ + std::unique_lock<ceph::mutex> l(lock); + if (!cct->_conf->bluefs_replay_recovery_disable_compact) { + if (cct->_conf->bluefs_compact_log_sync) { + _compact_log_sync(); + } else { + _compact_log_async(l); + } + } +} + +bool BlueFS::_should_compact_log() +{ + uint64_t current = 
log_writer->file->fnode.size; + uint64_t expected = _estimate_log_size(); + float ratio = (float)current / (float)expected; + dout(10) << __func__ << " current 0x" << std::hex << current + << " expected " << expected << std::dec + << " ratio " << ratio + << (new_log ? " (async compaction in progress)" : "") + << dendl; + if (new_log || + current < cct->_conf->bluefs_log_compact_min_size || + ratio < cct->_conf->bluefs_log_compact_min_ratio) { + return false; + } + return true; +} + +void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t, + int flags) +{ + t->seq = 1; + t->uuid = super.uuid; + dout(20) << __func__ << " op_init" << dendl; + + t->op_init(); + for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { + interval_set<uint64_t>& p = block_all[bdev]; + for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { + auto bdev_new = bdev; + if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) { + continue; + } + if ((flags & REMOVE_DB) && bdev == BDEV_DB) { + continue; + } + if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { + bdev_new = BDEV_DB; + } + if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { + bdev_new = BDEV_SLOW; + } + if (bdev == BDEV_NEWDB) { + // REMOVE_DB xor RENAME_DB + ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); + ceph_assert(!(flags & RENAME_SLOW2DB)); + bdev_new = BDEV_DB; + } + if (bdev == BDEV_NEWWAL) { + ceph_assert(flags & REMOVE_WAL); + bdev_new = BDEV_WAL; + } + dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x" + << std::hex << q.get_start() << "~" << q.get_len() << std::dec + << dendl; + t->op_alloc_add(bdev_new, q.get_start(), q.get_len()); + } + } + for (auto& p : file_map) { + if (p.first == 1) + continue; + ceph_assert(p.first > 1); + + for(auto& e : p.second->fnode.extents) { + auto bdev = e.bdev; + auto bdev_new = bdev; + ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL)); + if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { + bdev_new = BDEV_DB; + } + if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { + bdev_new = BDEV_SLOW; + } + if (bdev == BDEV_NEWDB) { + // REMOVE_DB xor RENAME_DB + ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); + ceph_assert(!(flags & RENAME_SLOW2DB)); + bdev_new = BDEV_DB; + } + if (bdev == BDEV_NEWWAL) { + ceph_assert(flags & REMOVE_WAL); + bdev_new = BDEV_WAL; + } + e.bdev = bdev_new; + } + dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl; + t->op_file_update(p.second->fnode); + } + for (auto& p : dir_map) { + dout(20) << __func__ << " op_dir_create " << p.first << dendl; + t->op_dir_create(p.first); + for (auto& q : p.second->file_map) { + dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first + << " to " << q.second->fnode.ino << dendl; + t->op_dir_link(p.first, q.first, q.second->fnode.ino); + } + } +} + +void BlueFS::_compact_log_sync() +{ + dout(10) << __func__ << dendl; + auto prefer_bdev = + vselector->select_prefer_bdev(log_writer->file->vselector_hint); + _rewrite_log_sync(true, + BDEV_DB, + prefer_bdev, + prefer_bdev, + 0); + logger->inc(l_bluefs_log_compactions); +} + +void BlueFS::_rewrite_log_sync(bool allocate_with_fallback, + int super_dev, + int log_dev, + int log_dev_new, + int flags) +{ + File *log_file = log_writer->file.get(); + + // clear out log (be careful who calls us!!!) 
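+  // any ops accumulated in log_t are dropped here; the full metadata dump
+  // built below captures the current in-memory state, so those pending ops
+  // become redundant (hence the warning above about who may call this).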
+ log_t.clear(); + + dout(20) << __func__ << " super_dev:" << super_dev + << " log_dev:" << log_dev + << " log_dev_new:" << log_dev_new + << " flags:" << flags + << dendl; + bluefs_transaction_t t; + _compact_log_dump_metadata(&t, flags); + + dout(20) << __func__ << " op_jump_seq " << log_seq << dendl; + t.op_jump_seq(log_seq); + + bufferlist bl; + encode(t, bl); + _pad_bl(bl); + + uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway; + dout(20) << __func__ << " need " << need << dendl; + + bluefs_fnode_t old_fnode; + int r; + log_file->fnode.swap_extents(old_fnode); + if (allocate_with_fallback) { + r = _allocate(log_dev, need, &log_file->fnode); + ceph_assert(r == 0); + } else { + PExtentVector extents; + r = _allocate_without_fallback(log_dev, + need, + &extents); + ceph_assert(r == 0); + for (auto& p : extents) { + log_file->fnode.append_extent( + bluefs_extent_t(log_dev, p.offset, p.length)); + } + } + + _close_writer(log_writer); + + log_file->fnode.size = bl.length(); + vselector->sub_usage(log_file->vselector_hint, old_fnode); + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + + log_writer = _create_writer(log_file); + log_writer->append(bl); + r = _flush(log_writer, true); + ceph_assert(r == 0); +#ifdef HAVE_LIBAIO + if (!cct->_conf->bluefs_sync_write) { + list<aio_t> completed_ios; + _claim_completed_aios(log_writer, &completed_ios); + wait_for_aio(log_writer); + completed_ios.clear(); + } +#endif + flush_bdev(); + + super.log_fnode = log_file->fnode; + // rename device if needed + if (log_dev != log_dev_new) { + dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl; + for (auto& p : super.log_fnode.extents) { + p.bdev = log_dev_new; + } + } + dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl; + + ++super.version; + _write_super(super_dev); + flush_bdev(); + + dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl; + for (auto& r : old_fnode.extents) { + pending_release[r.bdev].insert(r.offset, r.length); + } +} + +/* + * 1. Allocate a new extent to continue the log, and then log an event + * that jumps the log write position to the new extent. At this point, the + * old extent(s) won't be written to, and reflect everything to compact. + * New events will be written to the new region that we'll keep. + * + * 2. While still holding the lock, encode a bufferlist that dumps all of the + * in-memory fnodes and names. This will become the new beginning of the + * log. The last event will jump to the log continuation extent from #1. + * + * 3. Queue a write to a new extent for the new beginnging of the log. + * + * 4. Drop lock and wait + * + * 5. Retake the lock. + * + * 6. Update the log_fnode to splice in the new beginning. + * + * 7. Write the new superblock. + * + * 8. Release the old log space. Clean up. + */ +void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l) +{ + dout(10) << __func__ << dendl; + File *log_file = log_writer->file.get(); + ceph_assert(!new_log); + ceph_assert(!new_log_writer); + + // create a new log [writer] so that we know compaction is in progress + // (see _should_compact_log) + new_log = new File; + new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode + + // 0. wait for any racing flushes to complete. (We do not want to block + // in _flush_sync_log with jump_to set or else a racing thread might flush + // our entries and our jump_to update won't be correct.) 
+ while (log_flushing) { + dout(10) << __func__ << " log is currently flushing, waiting" << dendl; + log_cond.wait(l); + } + + vselector->sub_usage(log_file->vselector_hint, log_file->fnode); + + // 1. allocate new log space and jump to it. + old_log_jump_to = log_file->fnode.get_allocated(); + dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to + << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl; + int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), + cct->_conf->bluefs_max_log_runway, + &log_file->fnode); + ceph_assert(r == 0); + //adjust usage as flush below will need it + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; + + // update the log file change and log a jump to the offset where we want to + // write the new entries + log_t.op_file_update(log_file->fnode); + log_t.op_jump(log_seq, old_log_jump_to); + + flush_bdev(); // FIXME? + + _flush_and_sync_log(l, 0, old_log_jump_to); + + // 2. prepare compacted log + bluefs_transaction_t t; + //avoid record two times in log_t and _compact_log_dump_metadata. + log_t.clear(); + _compact_log_dump_metadata(&t, 0); + + uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL], + std::max(alloc_size[BDEV_DB], + alloc_size[BDEV_SLOW])); + + // conservative estimate for final encoded size + new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2, + max_alloc_size); + t.op_jump(log_seq, new_log_jump_to); + + // allocate + //FIXME: check if we want DB here? + r = _allocate(BlueFS::BDEV_DB, new_log_jump_to, + &new_log->fnode); + ceph_assert(r == 0); + + // we might have some more ops in log_t due to _allocate call + t.claim_ops(log_t); + + bufferlist bl; + encode(t, bl); + _pad_bl(bl); + + dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to + << std::dec << dendl; + + new_log_writer = _create_writer(new_log); + new_log_writer->append(bl); + + // 3. flush + r = _flush(new_log_writer, true); + ceph_assert(r == 0); + + // 4. wait + _flush_bdev_safely(new_log_writer); + + // 5. update our log fnode + // discard first old_log_jump_to extents + + dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec + << " of " << log_file->fnode.extents << dendl; + uint64_t discarded = 0; + mempool::bluefs::vector<bluefs_extent_t> old_extents; + while (discarded < old_log_jump_to) { + ceph_assert(!log_file->fnode.extents.empty()); + bluefs_extent_t& e = log_file->fnode.extents.front(); + bluefs_extent_t temp = e; + if (discarded + e.length <= old_log_jump_to) { + dout(10) << __func__ << " remove old log extent " << e << dendl; + discarded += e.length; + log_file->fnode.pop_front_extent(); + } else { + dout(10) << __func__ << " remove front of old log extent " << e << dendl; + uint64_t drop = old_log_jump_to - discarded; + temp.length = drop; + e.offset += drop; + e.length -= drop; + discarded += drop; + dout(10) << __func__ << " kept " << e << " removed " << temp << dendl; + } + old_extents.push_back(temp); + } + auto from = log_file->fnode.extents.begin(); + auto to = log_file->fnode.extents.end(); + while (from != to) { + new_log->fnode.append_extent(*from); + ++from; + } + + vselector->sub_usage(log_file->vselector_hint, log_file->fnode); + + // clear the extents from old log file, they are added to new log + log_file->fnode.clear_extents(); + // swap the log files. New log file is the log file now. 
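+  // after the swap, log_file's fnode holds the freshly written compacted
+  // header extents followed by the retained tail of the old log, while
+  // new_log is left with the emptied extent list and is torn down below.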
+ new_log->fnode.swap_extents(log_file->fnode); + + log_writer->pos = log_writer->file->fnode.size = + log_writer->pos - old_log_jump_to + new_log_jump_to; + + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + + // 6. write the super block to reflect the changes + dout(10) << __func__ << " writing super" << dendl; + super.log_fnode = log_file->fnode; + ++super.version; + _write_super(BDEV_DB); + + lock.unlock(); + flush_bdev(); + lock.lock(); + + // 7. release old space + dout(10) << __func__ << " release old log extents " << old_extents << dendl; + for (auto& r : old_extents) { + pending_release[r.bdev].insert(r.offset, r.length); + } + + // delete the new log, remove from the dirty files list + _close_writer(new_log_writer); + if (new_log->dirty_seq) { + ceph_assert(dirty_files.count(new_log->dirty_seq)); + auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log); + dirty_files[new_log->dirty_seq].erase(it); + } + new_log_writer = nullptr; + new_log = nullptr; + log_cond.notify_all(); + + dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; + logger->inc(l_bluefs_log_compactions); +} + +void BlueFS::_pad_bl(bufferlist& bl) +{ + uint64_t partial = bl.length() % super.block_size; + if (partial) { + dout(10) << __func__ << " padding with 0x" << std::hex + << super.block_size - partial << " zeros" << std::dec << dendl; + bl.append_zero(super.block_size - partial); + } +} + + +int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l, + uint64_t want_seq, + uint64_t jump_to) +{ + while (log_flushing) { + dout(10) << __func__ << " want_seq " << want_seq + << " log is currently flushing, waiting" << dendl; + ceph_assert(!jump_to); + log_cond.wait(l); + } + if (want_seq && want_seq <= log_seq_stable) { + dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable " + << log_seq_stable << ", done" << dendl; + ceph_assert(!jump_to); + return 0; + } + if (log_t.empty() && dirty_files.empty()) { + dout(10) << __func__ << " want_seq " << want_seq + << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl; + ceph_assert(!jump_to); + return 0; + } + + vector<interval_set<uint64_t>> to_release(pending_release.size()); + to_release.swap(pending_release); + + uint64_t seq = log_t.seq = ++log_seq; + ceph_assert(want_seq == 0 || want_seq <= seq); + log_t.uuid = super.uuid; + + // log dirty files + auto lsi = dirty_files.find(seq); + if (lsi != dirty_files.end()) { + dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl; + for (auto &f : lsi->second) { + dout(20) << __func__ << " op_file_update " << f.fnode << dendl; + log_t.op_file_update(f.fnode); + } + } + + dout(10) << __func__ << " " << log_t << dendl; + ceph_assert(!log_t.empty()); + + // allocate some more space (before we run out)? 
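+  // "runway" is the space already allocated to the log file beyond the
+  // current effective write position; if it falls below
+  // bluefs_min_log_runway we first wait out any async compaction and then
+  // extend the log by bluefs_max_log_runway before appending this entry.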
+ int64_t runway = log_writer->file->fnode.get_allocated() - + log_writer->get_effective_write_pos(); + bool just_expanded_log = false; + if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) { + dout(10) << __func__ << " allocating more log runway (0x" + << std::hex << runway << std::dec << " remaining)" << dendl; + while (new_log_writer) { + dout(10) << __func__ << " waiting for async compaction" << dendl; + log_cond.wait(l); + } + vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode); + int r = _allocate( + vselector->select_prefer_bdev(log_writer->file->vselector_hint), + cct->_conf->bluefs_max_log_runway, + &log_writer->file->fnode); + ceph_assert(r == 0); + vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode); + log_t.op_file_update(log_writer->file->fnode); + just_expanded_log = true; + } + + bufferlist bl; + bl.reserve(super.block_size); + encode(log_t, bl); + // pad to block boundary + size_t realign = super.block_size - (bl.length() % super.block_size); + if (realign && realign != super.block_size) + bl.append_zero(realign); + + logger->inc(l_bluefs_logged_bytes, bl.length()); + + if (just_expanded_log) { + ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss + } + + log_writer->append(bl); + + log_t.clear(); + log_t.seq = 0; // just so debug output is less confusing + log_flushing = true; + + int r = _flush(log_writer, true); + ceph_assert(r == 0); + + if (jump_to) { + dout(10) << __func__ << " jumping log offset from 0x" << std::hex + << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl; + log_writer->pos = jump_to; + vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size); + log_writer->file->fnode.size = jump_to; + vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size); + } + + _flush_bdev_safely(log_writer); + + log_flushing = false; + log_cond.notify_all(); + + // clean dirty files + if (seq > log_seq_stable) { + log_seq_stable = seq; + dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl; + + auto p = dirty_files.begin(); + while (p != dirty_files.end()) { + if (p->first > log_seq_stable) { + dout(20) << __func__ << " done cleaning up dirty files" << dendl; + break; + } + + auto l = p->second.begin(); + while (l != p->second.end()) { + File *file = &*l; + ceph_assert(file->dirty_seq > 0); + ceph_assert(file->dirty_seq <= log_seq_stable); + dout(20) << __func__ << " cleaned file " << file->fnode << dendl; + file->dirty_seq = 0; + p->second.erase(l++); + } + + ceph_assert(p->second.empty()); + dirty_files.erase(p++); + } + } else { + dout(20) << __func__ << " log_seq_stable " << log_seq_stable + << " already >= out seq " << seq + << ", we lost a race against another log flush, done" << dendl; + } + + for (unsigned i = 0; i < to_release.size(); ++i) { + if (!to_release[i].empty()) { + /* OK, now we have the guarantee alloc[i] won't be null. 
*/ + int r = 0; + if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { + r = bdev[i]->queue_discard(to_release[i]); + if (r == 0) + continue; + } else if (cct->_conf->bdev_enable_discard) { + for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { + bdev[i]->discard(p.get_start(), p.get_len()); + } + } + alloc[i]->release(to_release[i]); + } + } + + _update_logger_stats(); + + return 0; +} + +int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) +{ + dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos + << " 0x" << offset << "~" << length << std::dec + << " to " << h->file->fnode << dendl; + ceph_assert(!h->file->deleted); + ceph_assert(h->file->num_readers.load() == 0); + + h->buffer_appender.flush(); + + bool buffered; + if (h->file->fnode.ino == 1) + buffered = false; + else + buffered = cct->_conf->bluefs_buffered_io; + + if (offset + length <= h->pos) + return 0; + if (offset < h->pos) { + length -= h->pos - offset; + offset = h->pos; + dout(10) << " still need 0x" + << std::hex << offset << "~" << length << std::dec + << dendl; + } + ceph_assert(offset <= h->file->fnode.size); + + uint64_t allocated = h->file->fnode.get_allocated(); + vselector->sub_usage(h->file->vselector_hint, h->file->fnode); + // do not bother to dirty the file if we are overwriting + // previously allocated extents. + bool must_dirty = false; + if (allocated < offset + length) { + // we should never run out of log space here; see the min runway check + // in _flush_and_sync_log. + ceph_assert(h->file->fnode.ino != 1); + int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), + offset + length - allocated, + &h->file->fnode); + if (r < 0) { + derr << __func__ << " allocated: 0x" << std::hex << allocated + << " offset: 0x" << offset << " length: 0x" << length << std::dec + << dendl; + vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo + ceph_abort_msg("bluefs enospc"); + return r; + } + if (cct->_conf->bluefs_preextend_wal_files && + h->writer_type == WRITER_WAL) { + // NOTE: this *requires* that rocksdb also has log recycling + // enabled and is therefore doing robust CRCs on the log + // records. otherwise, we will fail to reply the rocksdb log + // properly due to garbage on the device. + h->file->fnode.size = h->file->fnode.get_allocated(); + dout(10) << __func__ << " extending WAL size to 0x" << std::hex + << h->file->fnode.size << std::dec << " to include allocated" + << dendl; + } + must_dirty = true; + } + if (h->file->fnode.size < offset + length) { + h->file->fnode.size = offset + length; + if (h->file->fnode.ino > 1) { + // we do not need to dirty the log file (or it's compacting + // replacement) when the file size changes because replay is + // smart enough to discover it on its own. 
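+      // (ino 1 is the bluefs log and ino 0 its in-flight compaction
+      // replacement; any other file must be marked dirty so the new size
+      // is persisted via op_file_update at the next log flush.)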
+ must_dirty = true; + } + } + if (must_dirty) { + h->file->fnode.mtime = ceph_clock_now(); + ceph_assert(h->file->fnode.ino >= 1); + if (h->file->dirty_seq == 0) { + h->file->dirty_seq = log_seq + 1; + dirty_files[h->file->dirty_seq].push_back(*h->file); + dout(20) << __func__ << " dirty_seq = " << log_seq + 1 + << " (was clean)" << dendl; + } else { + if (h->file->dirty_seq != log_seq + 1) { + // need re-dirty, erase from list first + ceph_assert(dirty_files.count(h->file->dirty_seq)); + auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file); + dirty_files[h->file->dirty_seq].erase(it); + h->file->dirty_seq = log_seq + 1; + dirty_files[h->file->dirty_seq].push_back(*h->file); + dout(20) << __func__ << " dirty_seq = " << log_seq + 1 + << " (was " << h->file->dirty_seq << ")" << dendl; + } else { + dout(20) << __func__ << " dirty_seq = " << log_seq + 1 + << " (unchanged, do nothing) " << dendl; + } + } + } + dout(20) << __func__ << " file now " << h->file->fnode << dendl; + + uint64_t x_off = 0; + auto p = h->file->fnode.seek(offset, &x_off); + ceph_assert(p != h->file->fnode.extents.end()); + dout(20) << __func__ << " in " << *p << " x_off 0x" + << std::hex << x_off << std::dec << dendl; + + unsigned partial = x_off & ~super.block_mask(); + bufferlist bl; + if (partial) { + dout(20) << __func__ << " using partial tail 0x" + << std::hex << partial << std::dec << dendl; + ceph_assert(h->tail_block.length() == partial); + bl.claim_append_piecewise(h->tail_block); + x_off -= partial; + offset -= partial; + length += partial; + dout(20) << __func__ << " waiting for previous aio to complete" << dendl; + for (auto p : h->iocv) { + if (p) { + p->aio_wait(); + } + } + } + if (length == partial + h->buffer.length()) { + bl.claim_append_piecewise(h->buffer); + } else { + bufferlist t; + h->buffer.splice(0, length, &t); + bl.claim_append_piecewise(t); + t.substr_of(h->buffer, length, h->buffer.length() - length); + h->buffer.swap(t); + dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec + << " unflushed" << dendl; + } + ceph_assert(bl.length() == length); + + switch (h->writer_type) { + case WRITER_WAL: + logger->inc(l_bluefs_bytes_written_wal, length); + break; + case WRITER_SST: + logger->inc(l_bluefs_bytes_written_sst, length); + break; + } + + dout(30) << "dump:\n"; + bl.hexdump(*_dout); + *_dout << dendl; + + h->pos = offset + length; + h->tail_block.clear(); + + uint64_t bloff = 0; + uint64_t bytes_written_slow = 0; + while (length > 0) { + uint64_t x_len = std::min(p->length - x_off, length); + bufferlist t; + t.substr_of(bl, bloff, x_len); + unsigned tail = x_len & ~super.block_mask(); + if (tail) { + size_t zlen = super.block_size - tail; + dout(20) << __func__ << " caching tail of 0x" + << std::hex << tail + << " and padding block with 0x" << zlen + << std::dec << dendl; + h->tail_block.substr_of(bl, bl.length() - tail, tail); + if (h->file->fnode.ino > 1) { + // we are using the page_aligned_appender, and can safely use + // the tail of the raw buffer. 
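+        // i.e. zero-pad the final partial block in place by zeroing the
+        // unused tail of the last buffer segment instead of appending a
+        // separate zero buffer; the unpadded tail was saved in
+        // h->tail_block above so the next flush can rewrite this block.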
+ const bufferptr &last = t.back(); + if (last.unused_tail_length() < zlen) { + derr << " wtf, last is " << last << " from " << t << dendl; + ceph_assert(last.unused_tail_length() >= zlen); + } + bufferptr z = last; + z.set_offset(last.offset() + last.length()); + z.set_length(zlen); + z.zero(); + t.append(z, 0, zlen); + } else { + t.append_zero(zlen); + } + } + if (cct->_conf->bluefs_sync_write) { + bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint); + } else { + bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint); + } + h->dirty_devs[p->bdev] = true; + if (p->bdev == BDEV_SLOW) { + bytes_written_slow += t.length(); + } + + bloff += x_len; + length -= x_len; + ++p; + x_off = 0; + } + logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow); + for (unsigned i = 0; i < MAX_BDEV; ++i) { + if (bdev[i]) { + if (h->iocv[i] && h->iocv[i]->has_pending_aios()) { + bdev[i]->aio_submit(h->iocv[i]); + } + } + } + vselector->add_usage(h->file->vselector_hint, h->file->fnode); + dout(20) << __func__ << " h " << h << " pos now 0x" + << std::hex << h->pos << std::dec << dendl; + return 0; +} + +#ifdef HAVE_LIBAIO +// we need to retire old completed aios so they don't stick around in +// memory indefinitely (along with their bufferlist refs). +void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls) +{ + for (auto p : h->iocv) { + if (p) { + ls->splice(ls->end(), p->running_aios); + } + } + dout(10) << __func__ << " got " << ls->size() << " aios" << dendl; +} + +void BlueFS::wait_for_aio(FileWriter *h) +{ + // NOTE: this is safe to call without a lock, as long as our reference is + // stable. + dout(10) << __func__ << " " << h << dendl; + utime_t start = ceph_clock_now(); + for (auto p : h->iocv) { + if (p) { + p->aio_wait(); + } + } + dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl; +} +#endif + +int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l) +{ + bool flushed = false; + int r = _flush(h, force, &flushed); + if (r == 0 && flushed) { + _maybe_compact_log(l); + } + return r; +} + +int BlueFS::_flush(FileWriter *h, bool force, bool *flushed) +{ + h->buffer_appender.flush(); + uint64_t length = h->buffer.length(); + uint64_t offset = h->pos; + if (flushed) { + *flushed = false; + } + if (!force && + length < cct->_conf->bluefs_min_flush_size) { + dout(10) << __func__ << " " << h << " ignoring, length " << length + << " < min_flush_size " << cct->_conf->bluefs_min_flush_size + << dendl; + return 0; + } + if (length == 0) { + dout(10) << __func__ << " " << h << " no dirty data on " + << h->file->fnode << dendl; + return 0; + } + dout(10) << __func__ << " " << h << " 0x" + << std::hex << offset << "~" << length << std::dec + << " to " << h->file->fnode << dendl; + ceph_assert(h->pos <= h->file->fnode.size); + int r = _flush_range(h, offset, length); + if (flushed) { + *flushed = true; + } + return r; +} + +int BlueFS::_truncate(FileWriter *h, uint64_t offset) +{ + dout(10) << __func__ << " 0x" << std::hex << offset << std::dec + << " file " << h->file->fnode << dendl; + if (h->file->deleted) { + dout(10) << __func__ << " deleted, no-op" << dendl; + return 0; + } + + // we never truncate internal log files + ceph_assert(h->file->fnode.ino > 1); + + h->buffer_appender.flush(); + + // truncate off unflushed data? 
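+  // if the truncate point falls inside data still sitting in the writer's
+  // buffer, the buffered bytes past it are dropped; the abort below
+  // indicates this path is not expected to be reachable in practice.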
+ if (h->pos < offset && + h->pos + h->buffer.length() > offset) { + bufferlist t; + dout(20) << __func__ << " tossing out last " << offset - h->pos + << " unflushed bytes" << dendl; + t.substr_of(h->buffer, 0, offset - h->pos); + h->buffer.swap(t); + ceph_abort_msg("actually this shouldn't happen"); + } + if (h->buffer.length()) { + int r = _flush(h, true); + if (r < 0) + return r; + } + if (offset == h->file->fnode.size) { + return 0; // no-op! + } + if (offset > h->file->fnode.size) { + ceph_abort_msg("truncate up not supported"); + } + ceph_assert(h->file->fnode.size >= offset); + vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); + h->file->fnode.size = offset; + vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); + log_t.op_file_update(h->file->fnode); + return 0; +} + +int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l) +{ + dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl; + int r = _flush(h, true); + if (r < 0) + return r; + uint64_t old_dirty_seq = h->file->dirty_seq; + + _flush_bdev_safely(h); + + if (old_dirty_seq) { + uint64_t s = log_seq; + dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq + << ") on " << h->file->fnode << ", flushing log" << dendl; + _flush_and_sync_log(l, old_dirty_seq); + ceph_assert(h->file->dirty_seq == 0 || // cleaned + h->file->dirty_seq > s); // or redirtied by someone else + } + return 0; +} + +void BlueFS::_flush_bdev_safely(FileWriter *h) +{ + std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs; + h->dirty_devs.fill(false); +#ifdef HAVE_LIBAIO + if (!cct->_conf->bluefs_sync_write) { + list<aio_t> completed_ios; + _claim_completed_aios(h, &completed_ios); + lock.unlock(); + wait_for_aio(h); + completed_ios.clear(); + flush_bdev(flush_devs); + lock.lock(); + } else +#endif + { + lock.unlock(); + flush_bdev(flush_devs); + lock.lock(); + } +} + +void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs) +{ + // NOTE: this is safe to call without a lock. + dout(20) << __func__ << dendl; + for (unsigned i = 0; i < MAX_BDEV; i++) { + if (dirty_bdevs[i]) + bdev[i]->flush(); + } +} + +void BlueFS::flush_bdev() +{ + // NOTE: this is safe to call without a lock. 
+ dout(20) << __func__ << dendl; + for (auto p : bdev) { + if (p) + p->flush(); + } +} + +const char* BlueFS::get_device_name(unsigned id) +{ + if (id >= MAX_BDEV) return "BDEV_INV"; + const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"}; + return names[id]; +} + +int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents) +{ + int r = -ENOSPC; + if (slow_dev_expander) { + auto id = _get_slow_device_id(); + auto min_alloc_size = alloc_size[id]; + ceph_assert(id <= alloc.size() && alloc[id]); + auto min_need = round_up_to(need, min_alloc_size); + need = std::max(need, + slow_dev_expander->get_recommended_expansion_delta( + alloc[id]->get_free(), block_all[id].size())); + + need = round_up_to(need, min_alloc_size); + dout(10) << __func__ << " expanding slow device by 0x" + << std::hex << need << std::dec + << dendl; + r = slow_dev_expander->allocate_freespace(min_need, need, extents); + } + return r; +} + +int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len, + PExtentVector* extents) +{ + dout(10) << __func__ << " len 0x" << std::hex << len << std::dec + << " from " << (int)id << dendl; + assert(id < alloc.size()); + if (!alloc[id]) { + return -ENOENT; + } + extents->reserve(4); // 4 should be (more than) enough for most allocations + uint64_t min_alloc_size = alloc_size[id]; + uint64_t left = round_up_to(len, min_alloc_size); + int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents); + if (alloc_len < 0 || alloc_len < (int64_t)left) { + if (alloc_len > 0) { + alloc[id]->release(*extents); + } + if (bdev[id]) + derr << __func__ << " failed to allocate 0x" << std::hex << left + << " on bdev " << (int)id + << ", free 0x" << alloc[id]->get_free() << std::dec << dendl; + else + derr << __func__ << " failed to allocate 0x" << std::hex << left + << " on bdev " << (int)id << ", dne" << std::dec << dendl; + if (alloc[id]) + alloc[id]->dump(); + return -ENOSPC; + } + + return 0; +} + +int BlueFS::_allocate(uint8_t id, uint64_t len, + bluefs_fnode_t* node) +{ + dout(10) << __func__ << " len 0x" << std::hex << len << std::dec + << " from " << (int)id << dendl; + ceph_assert(id < alloc.size()); + int64_t alloc_len = 0; + PExtentVector extents; + uint64_t hint = 0; + if (alloc[id]) { + if (!node->extents.empty() && node->extents.back().bdev == id) { + hint = node->extents.back().end(); + } + extents.reserve(4); // 4 should be (more than) enough for most allocations + alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]), + alloc_size[id], hint, &extents); + } + if (!alloc[id] || + alloc_len < 0 || + alloc_len < (int64_t)round_up_to(len, alloc_size[id])) { + if (alloc_len > 0) { + alloc[id]->release(extents); + } + if (id != BDEV_SLOW) { + if (bdev[id]) { + dout(1) << __func__ << " failed to allocate 0x" << std::hex << len + << " on bdev " << (int)id + << ", free 0x" << alloc[id]->get_free() + << "; fallback to bdev " << (int)id + 1 + << std::dec << dendl; + } + return _allocate(id + 1, len, node); + } + dout(1) << __func__ << " unable to allocate 0x" << std::hex << len + << " on bdev " << (int)id << ", free 0x" + << (alloc[id] ? 
alloc[id]->get_free() : (uint64_t)-1) + << "; fallback to slow device expander " + << std::dec << dendl; + extents.clear(); + if (_expand_slow_device(len, extents) == 0) { + id = _get_slow_device_id(); + for (auto& e : extents) { + _add_block_extent(id, e.offset, e.length); + } + extents.clear(); + auto* last_alloc = alloc[id]; + ceph_assert(last_alloc); + // try again + alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]), + alloc_size[id], hint, &extents); + if (alloc_len < 0 || alloc_len < (int64_t)len) { + if (alloc_len > 0) { + last_alloc->release(extents); + } + derr << __func__ << " failed to allocate 0x" << std::hex << len + << " on bdev " << (int)id + << ", free 0x" << last_alloc->get_free() << std::dec << dendl; + return -ENOSPC; + } + } else { + derr << __func__ << " failed to expand slow device to fit +0x" + << std::hex << len << std::dec + << dendl; + return -ENOSPC; + } + } else { + uint64_t total_allocated = + block_all[id].size() - alloc[id]->get_free(); + if (max_bytes[id] < total_allocated) { + logger->set(max_bytes_pcounters[id], total_allocated); + max_bytes[id] = total_allocated; + } + } + + for (auto& p : extents) { + node->append_extent(bluefs_extent_t(id, p.offset, p.length)); + } + + return 0; +} + +int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len) +{ + dout(10) << __func__ << " file " << f->fnode << " 0x" + << std::hex << off << "~" << len << std::dec << dendl; + if (f->deleted) { + dout(10) << __func__ << " deleted, no-op" << dendl; + return 0; + } + ceph_assert(f->fnode.ino > 1); + uint64_t allocated = f->fnode.get_allocated(); + if (off + len > allocated) { + uint64_t want = off + len - allocated; + vselector->sub_usage(f->vselector_hint, f->fnode); + + int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), + want, + &f->fnode); + vselector->add_usage(f->vselector_hint, f->fnode); + if (r < 0) + return r; + log_t.op_file_update(f->fnode); + } + return 0; +} + +void BlueFS::sync_metadata(bool avoid_compact) +{ + std::unique_lock l(lock); + if (log_t.empty() && dirty_files.empty()) { + dout(10) << __func__ << " - no pending log events" << dendl; + } else { + dout(10) << __func__ << dendl; + utime_t start = ceph_clock_now(); + flush_bdev(); // FIXME? 
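+    // devices are flushed above before the log commit below, presumably so
+    // the data referenced by the pending metadata is durable before the
+    // metadata itself (the FIXME suggests this may be stricter than needed).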
+ _flush_and_sync_log(l); + dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl; + } + + if (!avoid_compact) { + _maybe_compact_log(l); + } +} + +void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l) +{ + if (!cct->_conf->bluefs_replay_recovery_disable_compact && + _should_compact_log()) { + if (cct->_conf->bluefs_compact_log_sync) { + _compact_log_sync(); + } else { + _compact_log_async(l); + } + } +} + +int BlueFS::open_for_write( + const string& dirname, + const string& filename, + FileWriter **h, + bool overwrite) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << "/" << filename << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + DirRef dir; + if (p == dir_map.end()) { + // implicitly create the dir + dout(20) << __func__ << " dir " << dirname + << " does not exist" << dendl; + return -ENOENT; + } else { + dir = p->second; + } + + FileRef file; + bool create = false; + map<string,FileRef>::iterator q = dir->file_map.find(filename); + if (q == dir->file_map.end()) { + if (overwrite) { + dout(20) << __func__ << " dir " << dirname << " (" << dir + << ") file " << filename + << " does not exist" << dendl; + return -ENOENT; + } + file = new File; + file->fnode.ino = ++ino_last; + file_map[ino_last] = file; + dir->file_map[filename] = file; + ++file->refs; + create = true; + } else { + // overwrite existing file? + file = q->second; + if (overwrite) { + dout(20) << __func__ << " dir " << dirname << " (" << dir + << ") file " << filename + << " already exists, overwrite in place" << dendl; + } else { + dout(20) << __func__ << " dir " << dirname << " (" << dir + << ") file " << filename + << " already exists, truncate + overwrite" << dendl; + vselector->sub_usage(file->vselector_hint, file->fnode); + file->fnode.size = 0; + for (auto& p : file->fnode.extents) { + pending_release[p.bdev].insert(p.offset, p.length); + } + + file->fnode.clear_extents(); + } + } + ceph_assert(file->fnode.ino > 1); + + file->fnode.mtime = ceph_clock_now(); + file->vselector_hint = vselector->get_hint_by_dir(dirname); + + dout(20) << __func__ << " mapping " << dirname << "/" << filename + << " vsel_hint " << file->vselector_hint + << dendl; + + log_t.op_file_update(file->fnode); + if (create) + log_t.op_dir_link(dirname, filename, file->fnode.ino); + + *h = _create_writer(file); + + if (boost::algorithm::ends_with(filename, ".log")) { + (*h)->writer_type = BlueFS::WRITER_WAL; + if (logger && !overwrite) { + logger->inc(l_bluefs_files_written_wal); + } + } else if (boost::algorithm::ends_with(filename, ".sst")) { + (*h)->writer_type = BlueFS::WRITER_SST; + if (logger) { + logger->inc(l_bluefs_files_written_sst); + } + } + + dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; + return 0; +} + +BlueFS::FileWriter *BlueFS::_create_writer(FileRef f) +{ + FileWriter *w = new FileWriter(f); + for (unsigned i = 0; i < MAX_BDEV; ++i) { + if (bdev[i]) { + w->iocv[i] = new IOContext(cct, NULL); + } + } + return w; +} + +void BlueFS::_close_writer(FileWriter *h) +{ + dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; + h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer); + for (unsigned i=0; i<MAX_BDEV; ++i) { + if (bdev[i]) { + if (h->iocv[i]) { + h->iocv[i]->aio_wait(); + bdev[i]->queue_reap_ioc(h->iocv[i]); + } + } + } + delete h; +} + +int BlueFS::open_for_read( + const string& dirname, + const string& filename, + FileReader **h, + bool random) +{ + std::lock_guard l(lock); + dout(10) << 
__func__ << " " << dirname << "/" << filename + << (random ? " (random)":" (sequential)") << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " not found" << dendl; + return -ENOENT; + } + DirRef dir = p->second; + + map<string,FileRef>::iterator q = dir->file_map.find(filename); + if (q == dir->file_map.end()) { + dout(20) << __func__ << " dir " << dirname << " (" << dir + << ") file " << filename + << " not found" << dendl; + return -ENOENT; + } + File *file = q->second.get(); + + *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch, + random, false); + dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; + return 0; +} + +int BlueFS::rename( + const string& old_dirname, const string& old_filename, + const string& new_dirname, const string& new_filename) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << old_dirname << "/" << old_filename + << " -> " << new_dirname << "/" << new_filename << dendl; + map<string,DirRef>::iterator p = dir_map.find(old_dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl; + return -ENOENT; + } + DirRef old_dir = p->second; + map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename); + if (q == old_dir->file_map.end()) { + dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir + << ") file " << old_filename + << " not found" << dendl; + return -ENOENT; + } + FileRef file = q->second; + + p = dir_map.find(new_dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl; + return -ENOENT; + } + DirRef new_dir = p->second; + q = new_dir->file_map.find(new_filename); + if (q != new_dir->file_map.end()) { + dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir + << ") file " << new_filename + << " already exists, unlinking" << dendl; + ceph_assert(q->second != file); + log_t.op_dir_unlink(new_dirname, new_filename); + _drop_link(q->second); + } + + dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " " + << " " << file->fnode << dendl; + + new_dir->file_map[new_filename] = file; + old_dir->file_map.erase(old_filename); + + log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino); + log_t.op_dir_unlink(old_dirname, old_filename); + return 0; +} + +int BlueFS::mkdir(const string& dirname) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p != dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " exists" << dendl; + return -EEXIST; + } + dir_map[dirname] = new Dir; + log_t.op_dir_create(dirname); + return 0; +} + +int BlueFS::rmdir(const string& dirname) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl; + return -ENOENT; + } + DirRef dir = p->second; + if (!dir->file_map.empty()) { + dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; + return -ENOTEMPTY; + } + dir_map.erase(dirname); + log_t.op_dir_remove(dirname); + return 0; +} + +bool BlueFS::dir_exists(const string& dirname) +{ + std::lock_guard l(lock); + map<string,DirRef>::iterator p = dir_map.find(dirname); + bool exists = p != dir_map.end(); + dout(10) << __func__ << " " << dirname << " = 
" << (int)exists << dendl; + return exists; +} + +int BlueFS::stat(const string& dirname, const string& filename, + uint64_t *size, utime_t *mtime) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << "/" << filename << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " not found" << dendl; + return -ENOENT; + } + DirRef dir = p->second; + map<string,FileRef>::iterator q = dir->file_map.find(filename); + if (q == dir->file_map.end()) { + dout(20) << __func__ << " dir " << dirname << " (" << dir + << ") file " << filename + << " not found" << dendl; + return -ENOENT; + } + File *file = q->second.get(); + dout(10) << __func__ << " " << dirname << "/" << filename + << " " << file->fnode << dendl; + if (size) + *size = file->fnode.size; + if (mtime) + *mtime = file->fnode.mtime; + return 0; +} + +int BlueFS::lock_file(const string& dirname, const string& filename, + FileLock **plock) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << "/" << filename << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " not found" << dendl; + return -ENOENT; + } + DirRef dir = p->second; + map<string,FileRef>::iterator q = dir->file_map.find(filename); + File *file; + if (q == dir->file_map.end()) { + dout(20) << __func__ << " dir " << dirname << " (" << dir + << ") file " << filename + << " not found, creating" << dendl; + file = new File; + file->fnode.ino = ++ino_last; + file->fnode.mtime = ceph_clock_now(); + file_map[ino_last] = file; + dir->file_map[filename] = file; + ++file->refs; + log_t.op_file_update(file->fnode); + log_t.op_dir_link(dirname, filename, file->fnode.ino); + } else { + file = q->second.get(); + if (file->locked) { + dout(10) << __func__ << " already locked" << dendl; + return -ENOLCK; + } + } + file->locked = true; + *plock = new FileLock(file); + dout(10) << __func__ << " locked " << file->fnode + << " with " << *plock << dendl; + return 0; +} + +int BlueFS::unlock_file(FileLock *fl) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl; + ceph_assert(fl->file->locked); + fl->file->locked = false; + delete fl; + return 0; +} + +int BlueFS::readdir(const string& dirname, vector<string> *ls) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << dendl; + if (dirname.empty()) { + // list dirs + ls->reserve(dir_map.size() + 2); + for (auto& q : dir_map) { + ls->push_back(q.first); + } + } else { + // list files in dir + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " not found" << dendl; + return -ENOENT; + } + DirRef dir = p->second; + ls->reserve(dir->file_map.size() + 2); + for (auto& q : dir->file_map) { + ls->push_back(q.first); + } + } + ls->push_back("."); + ls->push_back(".."); + return 0; +} + +int BlueFS::unlink(const string& dirname, const string& filename) +{ + std::lock_guard l(lock); + dout(10) << __func__ << " " << dirname << "/" << filename << dendl; + map<string,DirRef>::iterator p = dir_map.find(dirname); + if (p == dir_map.end()) { + dout(20) << __func__ << " dir " << dirname << " not found" << dendl; + return -ENOENT; + } + DirRef dir = p->second; + map<string,FileRef>::iterator q = dir->file_map.find(filename); + if (q == dir->file_map.end()) { + dout(20) << __func__ << " file " << dirname << 
"/" << filename + << " not found" << dendl; + return -ENOENT; + } + FileRef file = q->second; + if (file->locked) { + dout(20) << __func__ << " file " << dirname << "/" << filename + << " is locked" << dendl; + return -EBUSY; + } + dir->file_map.erase(filename); + log_t.op_dir_unlink(dirname, filename); + _drop_link(file); + return 0; +} + +bool BlueFS::wal_is_rotational() +{ + if (bdev[BDEV_WAL]) { + return bdev[BDEV_WAL]->is_rotational(); + } else if (bdev[BDEV_DB]) { + return bdev[BDEV_DB]->is_rotational(); + } + return bdev[BDEV_SLOW]->is_rotational(); +} + +/* + Algorithm. + do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there. + Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future, + and try if using it will produce healthy bluefs transaction. + We encode already known bluefs log extents and search disk for these bytes. + When we find it, we decode following bytes as extent. + We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction. + */ +int BlueFS::do_replay_recovery_read(FileReader *log_reader, + size_t replay_pos, + size_t read_offset, + size_t read_len, + bufferlist* bl) { + dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos << + " needs 0x" << read_offset << "~" << read_len << std::dec << dendl; + + bluefs_fnode_t& log_fnode = log_reader->file->fnode; + bufferlist bin_extents; + encode(log_fnode.extents, bin_extents); + dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl; + + // cannot process if too small to effectively search + ceph_assert(bin_extents.length() >= 32); + bufferlist last_32; + last_32.substr_of(bin_extents, bin_extents.length() - 32, 32); + + //read fixed part from replay_pos to end of bluefs_log extents + bufferlist fixed; + uint64_t e_off = 0; + auto e = log_fnode.seek(replay_pos, &e_off); + ceph_assert(e != log_fnode.extents.end()); + int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev], + cct->_conf->bluefs_buffered_io); + ceph_assert(r == 0); + //capture dev of last good extent + uint8_t last_e_dev = e->bdev; + uint64_t last_e_off = e->offset; + ++e; + while (e != log_fnode.extents.end()) { + r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev], + cct->_conf->bluefs_buffered_io); + ceph_assert(r == 0); + last_e_dev = e->bdev; + ++e; + } + ceph_assert(replay_pos + fixed.length() == read_offset); + + dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl; + + struct compare { + bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const { + if (a.bdev < b.bdev) return true; + if (a.offset < b.offset) return true; + return a.length < b.length; + } + }; + std::set<bluefs_extent_t, compare> extents_rejected; + for (int dcnt = 0; dcnt < 3; dcnt++) { + uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV; + if (bdev[dev] == nullptr) continue; + dout(2) << __func__ << " processing " << get_device_name(dev) << dendl; + interval_set<uint64_t> disk_regions; + disk_regions.insert(0, bdev[dev]->get_size()); + for (auto f : file_map) { + auto& e = f.second->fnode.extents; + for (auto& p : e) { + if (p.bdev == dev) { + disk_regions.erase(p.offset, p.length); + } + } + } + size_t disk_regions_count = disk_regions.num_intervals(); + dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl; + + auto reg = disk_regions.lower_bound(last_e_off); + 
//for all except first, start from beginning + last_e_off = 0; + if (reg == disk_regions.end()) { + reg = disk_regions.begin(); + } + const uint64_t chunk_size = 4 * 1024 * 1024; + const uint64_t page_size = 4096; + const uint64_t max_extent_size = 16; + uint64_t overlay_size = last_32.length() + max_extent_size; + for (size_t i = 0; i < disk_regions_count; reg++, i++) { + if (reg == disk_regions.end()) { + reg = disk_regions.begin(); + } + uint64_t pos = reg.get_start(); + uint64_t len = reg.get_len(); + + std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]}; + char* raw_data = raw_data_p.get(); + memset(raw_data, 0, page_size); + + while (len > last_32.length()) { + uint64_t chunk_len = len > chunk_size ? chunk_size : len; + dout(5) << __func__ << " read " + << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl; + r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io); + ceph_assert(r == 0); + + //search for fixed_last_32 + char* chunk_b = raw_data + page_size; + char* chunk_e = chunk_b + chunk_len; + + char* search_b = chunk_b - overlay_size; + char* search_e = chunk_e; + + for (char* sp = search_b; ; sp += last_32.length()) { + sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length()); + if (sp == nullptr) { + break; + } + + char* n = sp + last_32.length(); + dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl; + bufferlist test; + test.append(n, std::min<size_t>(max_extent_size, chunk_e - n)); + bluefs_extent_t ne; + try { + bufferlist::const_iterator p = test.begin(); + decode(ne, p); + } catch (buffer::error& e) { + continue; + } + if (extents_rejected.count(ne) != 0) { + dout(5) << __func__ << " extent " << ne << " already refected" <<dendl; + continue; + } + //insert as rejected already. if we succeed, it wouldn't make difference. 
+ extents_rejected.insert(ne); + + if (ne.bdev >= MAX_BDEV || + bdev[ne.bdev] == nullptr || + ne.length > 16 * 1024 * 1024 || + (ne.length & 4095) != 0 || + ne.offset + ne.length > bdev[ne.bdev]->get_size() || + (ne.offset & 4095) != 0) { + dout(5) << __func__ << " refusing extent " << ne << dendl; + continue; + } + dout(5) << __func__ << " checking extent " << ne << dendl; + + //read candidate extent - whole + bufferlist candidate; + candidate.append(fixed); + r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev], + cct->_conf->bluefs_buffered_io); + ceph_assert(r == 0); + + //check if transaction & crc is ok + bluefs_transaction_t t; + try { + bufferlist::const_iterator p = candidate.begin(); + decode(t, p); + } + catch (buffer::error& e) { + dout(5) << __func__ << " failed match" << dendl; + continue; + } + + //success, it seems a probable candidate + uint64_t l = std::min<uint64_t>(ne.length, read_len); + //trim to required size + bufferlist requested_read; + requested_read.substr_of(candidate, fixed.length(), l); + bl->append(requested_read); + dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl; + log_fnode.append_extent(ne); + log_fnode.recalc_allocated(); + log_reader->buf.pos += l; + return l; + } + //save overlay for next search + memcpy(search_b, chunk_e - overlay_size, overlay_size); + pos += chunk_len; + len -= chunk_len; + } + } + } + return 0; +} + +// =============================================== +// OriginalVolumeSelector + +void* OriginalVolumeSelector::get_hint_by_device(uint8_t dev) const { + return reinterpret_cast<void*>(dev); +} +void* OriginalVolumeSelector::get_hint_by_dir(const string& dirname) const { + uint8_t res = BlueFS::BDEV_DB; + if (dirname.length() > 5) { + // the "db.slow" and "db.wal" directory names are hard-coded at + // match up with bluestore. the slow device is always the second + // one (when a dedicated block.db device is present and used at + // bdev 0). the wal device is always last. 
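+    // so a "*.slow" directory maps to BDEV_SLOW, "*.wal" to BDEV_WAL, and
+    // everything else (including plain "db") stays on BDEV_DB.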
+ if (boost::algorithm::ends_with(dirname, ".slow")) { + res = BlueFS::BDEV_SLOW; + } + else if (boost::algorithm::ends_with(dirname, ".wal")) { + res = BlueFS::BDEV_WAL; + } + } + return reinterpret_cast<void*>(res); +} + +uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint) +{ + return (uint8_t)(reinterpret_cast<uint64_t>(hint)); +} + +void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const +{ + res.emplace_back(base, db_total); + res.emplace_back(base + ".slow", slow_total); +} + +#undef dout_prefix +#define dout_prefix *_dout << "OriginalVolumeSelector: " + +void OriginalVolumeSelector::dump(ostream& sout) { + sout<< "wal_total:" << wal_total + << ", db_total:" << db_total + << ", slow_total:" << slow_total + << std::endl; +} diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h new file mode 100644 index 00000000..2115870f --- /dev/null +++ b/src/os/bluestore/BlueFS.h @@ -0,0 +1,682 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OS_BLUESTORE_BLUEFS_H +#define CEPH_OS_BLUESTORE_BLUEFS_H + +#include <atomic> +#include <mutex> +#include <limits> + +#include "bluefs_types.h" +#include "common/RefCountedObj.h" +#include "BlockDevice.h" + +#include "boost/intrusive/list.hpp" +#include <boost/intrusive_ptr.hpp> + +class PerfCounters; + +class Allocator; + +enum { + l_bluefs_first = 732600, + l_bluefs_gift_bytes, + l_bluefs_reclaim_bytes, + l_bluefs_db_total_bytes, + l_bluefs_db_used_bytes, + l_bluefs_wal_total_bytes, + l_bluefs_wal_used_bytes, + l_bluefs_slow_total_bytes, + l_bluefs_slow_used_bytes, + l_bluefs_num_files, + l_bluefs_log_bytes, + l_bluefs_log_compactions, + l_bluefs_logged_bytes, + l_bluefs_files_written_wal, + l_bluefs_files_written_sst, + l_bluefs_bytes_written_wal, + l_bluefs_bytes_written_sst, + l_bluefs_bytes_written_slow, + l_bluefs_max_bytes_wal, + l_bluefs_max_bytes_db, + l_bluefs_max_bytes_slow, + l_bluefs_read_random_count, + l_bluefs_read_random_bytes, + l_bluefs_read_random_disk_count, + l_bluefs_read_random_disk_bytes, + l_bluefs_read_random_buffer_count, + l_bluefs_read_random_buffer_bytes, + l_bluefs_read_count, + l_bluefs_read_bytes, + l_bluefs_read_prefetch_count, + l_bluefs_read_prefetch_bytes, + l_bluefs_read_zeros_candidate, + l_bluefs_read_zeros_errors, + + l_bluefs_last, +}; + +class BlueFSDeviceExpander { +protected: + ~BlueFSDeviceExpander() {} +public: + virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free, + uint64_t bluefs_total) = 0; + virtual int allocate_freespace( + uint64_t min_size, + uint64_t size, + PExtentVector& extents) = 0; + /** Reports amount of space that can be transferred to BlueFS. + * This gives either current state, when alloc_size is currently used + * BlueFS's size, or simulation when alloc_size is different. 
+ * @params + * alloc_size - allocation unit size to check + */ + virtual size_t available_freespace(uint64_t alloc_size) = 0; +}; + +class BlueFSVolumeSelector { +public: + typedef std::vector<std::pair<std::string, uint64_t>> paths; + + virtual ~BlueFSVolumeSelector() { + } + virtual void* get_hint_by_device(uint8_t dev) const = 0; + virtual void* get_hint_by_dir(const string& dirname) const = 0; + + virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; + virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; + virtual void add_usage(void* file_hint, uint64_t fsize) = 0; + virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; + virtual uint8_t select_prefer_bdev(void* hint) = 0; + virtual void get_paths(const std::string& base, paths& res) const = 0; + virtual void dump(ostream& sout) = 0; +}; +class BlueFS; + +class BlueFS { +public: + CephContext* cct; + static constexpr unsigned MAX_BDEV = 5; + static constexpr unsigned BDEV_WAL = 0; + static constexpr unsigned BDEV_DB = 1; + static constexpr unsigned BDEV_SLOW = 2; + static constexpr unsigned BDEV_NEWWAL = 3; + static constexpr unsigned BDEV_NEWDB = 4; + + enum { + WRITER_UNKNOWN, + WRITER_WAL, + WRITER_SST, + }; + + struct File : public RefCountedObject { + MEMPOOL_CLASS_HELPERS(); + + bluefs_fnode_t fnode; + int refs; + uint64_t dirty_seq; + bool locked; + bool deleted; + boost::intrusive::list_member_hook<> dirty_item; + + std::atomic_int num_readers, num_writers; + std::atomic_int num_reading; + + void* vselector_hint = nullptr; + + File() + : RefCountedObject(NULL, 0), + refs(0), + dirty_seq(0), + locked(false), + deleted(false), + num_readers(0), + num_writers(0), + num_reading(0), + vselector_hint(nullptr) + {} + ~File() override { + ceph_assert(num_readers.load() == 0); + ceph_assert(num_writers.load() == 0); + ceph_assert(num_reading.load() == 0); + ceph_assert(!locked); + } + + friend void intrusive_ptr_add_ref(File *f) { + f->get(); + } + friend void intrusive_ptr_release(File *f) { + f->put(); + } + }; + typedef boost::intrusive_ptr<File> FileRef; + + typedef boost::intrusive::list< + File, + boost::intrusive::member_hook< + File, + boost::intrusive::list_member_hook<>, + &File::dirty_item> > dirty_file_list_t; + + struct Dir : public RefCountedObject { + MEMPOOL_CLASS_HELPERS(); + + mempool::bluefs::map<string,FileRef> file_map; + + Dir() : RefCountedObject(NULL, 0) {} + + friend void intrusive_ptr_add_ref(Dir *d) { + d->get(); + } + friend void intrusive_ptr_release(Dir *d) { + d->put(); + } + }; + typedef boost::intrusive_ptr<Dir> DirRef; + + struct FileWriter { + MEMPOOL_CLASS_HELPERS(); + + FileRef file; + uint64_t pos; ///< start offset for buffer + bufferlist buffer; ///< new data to write (at end of file) + bufferlist tail_block; ///< existing partial block at end of file, if any + bufferlist::page_aligned_appender buffer_appender; //< for const char* only + int writer_type = 0; ///< WRITER_* + int write_hint = WRITE_LIFE_NOT_SET; + + ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); + std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev + std::array<bool, MAX_BDEV> dirty_devs; + + FileWriter(FileRef f) + : file(f), + pos(0), + buffer_appender(buffer.get_page_aligned_appender( + g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { + ++file->num_writers; + iocv.fill(nullptr); + dirty_devs.fill(false); + if (f->fnode.ino == 1) { + write_hint = WRITE_LIFE_MEDIUM; + } + } + // NOTE: caller must call BlueFS::close_writer() + ~FileWriter() { + 
--file->num_writers; + } + + // note: BlueRocksEnv uses this append exclusively, so it's safe + // to use buffer_appender exclusively here (e.g., it's notion of + // offset will remain accurate). + void append(const char *buf, size_t len) { + uint64_t l0 = buffer.length(); + ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); + buffer_appender.append(buf, len); + } + + // note: used internally only, for ino 1 or 0. + void append(ceph::buffer::list& bl) { + uint64_t l0 = buffer.length(); + ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max()); + buffer.claim_append(bl); + } + + uint64_t get_effective_write_pos() { + buffer_appender.flush(); + return pos + buffer.length(); + } + }; + + struct FileReaderBuffer { + MEMPOOL_CLASS_HELPERS(); + + uint64_t bl_off; ///< prefetch buffer logical offset + bufferlist bl; ///< prefetch buffer + uint64_t pos; ///< current logical offset + uint64_t max_prefetch; ///< max allowed prefetch + + explicit FileReaderBuffer(uint64_t mpf) + : bl_off(0), + pos(0), + max_prefetch(mpf) {} + + uint64_t get_buf_end() { + return bl_off + bl.length(); + } + uint64_t get_buf_remaining(uint64_t p) { + if (p >= bl_off && p < bl_off + bl.length()) + return bl_off + bl.length() - p; + return 0; + } + + void skip(size_t n) { + pos += n; + } + void seek(uint64_t offset) { + pos = offset; + } + }; + + struct FileReader { + MEMPOOL_CLASS_HELPERS(); + + FileRef file; + FileReaderBuffer buf; + bool random; + bool ignore_eof; ///< used when reading our log file + + ceph::shared_mutex lock { + ceph::make_shared_mutex(std::string(), false, false, false) + }; + + + FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) + : file(f), + buf(mpf), + random(rand), + ignore_eof(ie) { + ++file->num_readers; + } + ~FileReader() { + --file->num_readers; + } + }; + + struct FileLock { + MEMPOOL_CLASS_HELPERS(); + + FileRef file; + explicit FileLock(FileRef f) : file(f) {} + }; + +private: + ceph::mutex lock = ceph::make_mutex("BlueFS::lock"); + + PerfCounters *logger = nullptr; + + uint64_t max_bytes[MAX_BDEV] = {0}; + uint64_t max_bytes_pcounters[MAX_BDEV] = { + l_bluefs_max_bytes_wal, + l_bluefs_max_bytes_db, + l_bluefs_max_bytes_slow, + }; + + // cache + mempool::bluefs::map<string, DirRef> dir_map; ///< dirname -> Dir + mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File + + // map of dirty files, files of same dirty_seq are grouped into list. 
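+ // Roughly: a File is linked here (via File::dirty_item) under the log
+ // seq that last dirtied its fnode, and is unlinked again once that seq
+ // has become stable (i.e. log_seq_stable has caught up).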
+ map<uint64_t, dirty_file_list_t> dirty_files; + + bluefs_super_t super; ///< latest superblock (as last written) + uint64_t ino_last = 0; ///< last assigned ino (this one is in use) + uint64_t log_seq = 0; ///< last used log seq (by current pending log_t) + uint64_t log_seq_stable = 0; ///< last stable/synced log seq + FileWriter *log_writer = 0; ///< writer for the log + bluefs_transaction_t log_t; ///< pending, unwritten log transaction + bool log_flushing = false; ///< true while flushing the log + ceph::condition_variable log_cond; + + uint64_t new_log_jump_to = 0; + uint64_t old_log_jump_to = 0; + FileRef new_log = nullptr; + FileWriter *new_log_writer = nullptr; + + /* + * There are up to 3 block devices: + * + * BDEV_DB db/ - the primary db device + * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL + * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills + */ + vector<BlockDevice*> bdev; ///< block devices we can use + vector<IOContext*> ioc; ///< IOContexts for bdevs + vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own + vector<Allocator*> alloc; ///< allocators for bdevs + vector<uint64_t> alloc_size; ///< alloc size for each device + vector<interval_set<uint64_t>> pending_release; ///< extents to release + + BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev + + BlueFSDeviceExpander* slow_dev_expander = nullptr; + std::unique_ptr<BlueFSVolumeSelector> vselector; + + class SocketHook; + SocketHook* asok_hook = nullptr; + // used to trigger zeros into read (debug / verify) + std::atomic<uint64_t> inject_read_zeros{0}; + + void _init_logger(); + void _shutdown_logger(); + void _update_logger_stats(); + + void _init_alloc(); + void _stop_alloc(); + + void _pad_bl(bufferlist& bl); ///< pad bufferlist to block size w/ zeros + + FileRef _get_file(uint64_t ino); + void _drop_link(FileRef f); + + unsigned _get_slow_device_id() { + return bdev[BDEV_SLOW] ? 
BDEV_SLOW : BDEV_DB; + } + const char* get_device_name(unsigned id); + int _expand_slow_device(uint64_t min_size, PExtentVector& extents); + int _allocate(uint8_t bdev, uint64_t len, + bluefs_fnode_t* node); + int _allocate_without_fallback(uint8_t id, uint64_t len, + PExtentVector* extents); + + int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); + int _flush(FileWriter *h, bool focce, std::unique_lock<ceph::mutex>& l); + int _flush(FileWriter *h, bool force, bool *flushed = nullptr); + int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l); + +#ifdef HAVE_LIBAIO + void _claim_completed_aios(FileWriter *h, list<aio_t> *ls); + void wait_for_aio(FileWriter *h); // safe to call without a lock +#endif + + int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l, + uint64_t want_seq = 0, + uint64_t jump_to = 0); + uint64_t _estimate_log_size(); + bool _should_compact_log(); + + enum { + REMOVE_DB = 1, + REMOVE_WAL = 2, + RENAME_SLOW2DB = 4, + RENAME_DB2SLOW = 8, + }; + void _compact_log_dump_metadata(bluefs_transaction_t *t, + int flags); + void _compact_log_sync(); + void _compact_log_async(std::unique_lock<ceph::mutex>& l); + + void _rewrite_log_sync(bool allocate_with_fallback, + int super_dev, + int log_dev, + int new_log_dev, + int flags); + + //void _aio_finish(void *priv); + + void _flush_bdev_safely(FileWriter *h); + void flush_bdev(); // this is safe to call without a lock + void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock + + int _preallocate(FileRef f, uint64_t off, uint64_t len); + int _truncate(FileWriter *h, uint64_t off); + + int64_t _read( + FileReader *h, ///< [in] read from here + FileReaderBuffer *buf, ///< [in] reader state + uint64_t offset, ///< [in] offset + size_t len, ///< [in] this many bytes + bufferlist *outbl, ///< [out] optional: reference the result here + char *out); ///< [out] optional: or copy it here + int64_t _read_random( + FileReader *h, ///< [in] read from here + uint64_t offset, ///< [in] offset + size_t len, ///< [in] this many bytes + char *out); ///< [out] optional: or copy it here + + void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length); + + int _open_super(); + int _write_super(int dev); + int _replay(bool noop, bool to_stdout = false); ///< replay journal + + FileWriter *_create_writer(FileRef f); + void _close_writer(FileWriter *h); + + // always put the super in the second 4k block. FIXME should this be + // block size independent? + unsigned get_super_offset() { + return 4096; + } + unsigned get_super_length() { + return 4096; + } + + void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len, + bool skip=false); + +public: + BlueFS(CephContext* cct); + ~BlueFS(); + + // the super is always stored on bdev 0 + int mkfs(uuid_d osd_uuid); + int mount(); + void umount(bool avoid_compact = false); + int prepare_new_device(int id); + + int log_dump(); + + void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id); + void get_devices(set<string> *ls); + uint64_t get_alloc_size(int id) { + return alloc_size[id]; + } + int fsck(); + + int device_migrate_to_new( + CephContext *cct, + const set<int>& devs_source, + int dev_target); + int device_migrate_to_existing( + CephContext *cct, + const set<int>& devs_source, + int dev_target); + + uint64_t get_used(); + uint64_t get_total(unsigned id); + uint64_t get_free(unsigned id); + void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...] 
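+ // Minimal usage sketch of the file API declared below (illustrative
+ // only; assumes an already-mounted BlueFS instance `bfs`, error
+ // handling omitted):
+ //
+ //   BlueFS::FileWriter *w = nullptr;
+ //   int r = bfs.open_for_write("db", "CURRENT", &w, false);
+ //   ceph_assert(r == 0);
+ //   w->append("MANIFEST-000001\n", 16);
+ //   bfs.fsync(w);
+ //   bfs.close_writer(w);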
+ void dump_perf_counters(Formatter *f); + + void dump_block_extents(ostream& out); + + /// get current extents that we own for given block device + int get_block_extents(unsigned id, interval_set<uint64_t> *extents); + + int open_for_write( + const string& dir, + const string& file, + FileWriter **h, + bool overwrite); + + int open_for_read( + const string& dir, + const string& file, + FileReader **h, + bool random = false); + + void close_writer(FileWriter *h) { + std::lock_guard l(lock); + _close_writer(h); + } + + int rename(const string& old_dir, const string& old_file, + const string& new_dir, const string& new_file); + + int readdir(const string& dirname, vector<string> *ls); + + int unlink(const string& dirname, const string& filename); + int mkdir(const string& dirname); + int rmdir(const string& dirname); + bool wal_is_rotational(); + + bool dir_exists(const string& dirname); + int stat(const string& dirname, const string& filename, + uint64_t *size, utime_t *mtime); + + int lock_file(const string& dirname, const string& filename, FileLock **p); + int unlock_file(FileLock *l); + + void compact_log(); + + /// sync any uncommitted state to disk + void sync_metadata(bool avoid_compact); + /// test and compact log, if necessary + void _maybe_compact_log(std::unique_lock<ceph::mutex>& l); + + void set_slow_device_expander(BlueFSDeviceExpander* a) { + slow_dev_expander = a; + } + void set_volume_selector(BlueFSVolumeSelector* s) { + vselector.reset(s); + } + void dump_volume_selector(ostream& sout) { + vselector->dump(sout); + } + void get_vselector_paths(const std::string& base, + BlueFSVolumeSelector::paths& res) const { + return vselector->get_paths(base, res); + } + + int add_block_device(unsigned bdev, const string& path, bool trim, + bool shared_with_bluestore=false); + bool bdev_support_label(unsigned id); + uint64_t get_block_device_size(unsigned bdev); + + /// gift more block space + void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len, + bool skip=false) { + std::unique_lock l(lock); + _add_block_extent(bdev, offset, len, skip); + int r = _flush_and_sync_log(l); + ceph_assert(r == 0); + } + + /// reclaim block space + int reclaim_blocks(unsigned bdev, uint64_t want, + PExtentVector *extents); + + // handler for discard event + void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); + + void flush(FileWriter *h, bool force = false) { + std::unique_lock l(lock); + int r = _flush(h, force, l); + ceph_assert(r == 0); + } + + void append_try_flush(FileWriter *h, const char* buf, size_t len) { + size_t max_size = 1ull << 30; // cap to 1GB + while (len > 0) { + bool need_flush = true; + auto l0 = h->buffer.length(); + if (l0 < max_size) { + size_t l = std::min(len, max_size - l0); + h->append(buf, l); + buf += l; + len -= l; + need_flush = h->buffer.length() >= cct->_conf->bluefs_min_flush_size; + } + if (need_flush) { + flush(h, true); + // make sure we've made any progress with flush hence the + // loop doesn't iterate forever + ceph_assert(h->buffer.length() < max_size); + } + } + } + void flush_range(FileWriter *h, uint64_t offset, uint64_t length) { + std::lock_guard l(lock); + _flush_range(h, offset, length); + } + int fsync(FileWriter *h) { + std::unique_lock l(lock); + int r = _fsync(h, l); + _maybe_compact_log(l); + return r; + } + int64_t read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, + bufferlist *outbl, char *out) { + // no need to hold the global lock here; we only touch h and + // h->file, and read vs write or 
delete is already protected (via + // atomics and asserts). + return _read(h, buf, offset, len, outbl, out); + } + int64_t read_random(FileReader *h, uint64_t offset, size_t len, + char *out) { + // no need to hold the global lock here; we only touch h and + // h->file, and read vs write or delete is already protected (via + // atomics and asserts). + return _read_random(h, offset, len, out); + } + void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) { + std::lock_guard l(lock); + _invalidate_cache(f, offset, len); + } + int preallocate(FileRef f, uint64_t offset, uint64_t len) { + std::lock_guard l(lock); + return _preallocate(f, offset, len); + } + int truncate(FileWriter *h, uint64_t offset) { + std::lock_guard l(lock); + return _truncate(h, offset); + } + int do_replay_recovery_read(FileReader *log, + size_t log_pos, + size_t read_offset, + size_t read_len, + bufferlist* bl); + + /// test purpose methods + const PerfCounters* get_perf_counters() const { + return logger; + } + +private: + // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) + // They are used for checking if read values are all 0, and reread if so. + int read(uint8_t ndev, uint64_t off, uint64_t len, + ceph::buffer::list *pbl, IOContext *ioc, bool buffered); + int read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered); +}; + +class OriginalVolumeSelector : public BlueFSVolumeSelector { + uint64_t wal_total; + uint64_t db_total; + uint64_t slow_total; + +public: + OriginalVolumeSelector( + uint64_t _wal_total, + uint64_t _db_total, + uint64_t _slow_total) + : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} + + void* get_hint_by_device(uint8_t dev) const override; + void* get_hint_by_dir(const string& dirname) const override; + + void add_usage(void* hint, const bluefs_fnode_t& fnode) override { + // do nothing + return; + } + void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { + // do nothing + return; + } + void add_usage(void* hint, uint64_t fsize) override { + // do nothing + return; + } + void sub_usage(void* hint, uint64_t fsize) override { + // do nothing + return; + } + + uint8_t select_prefer_bdev(void* hint) override; + void get_paths(const std::string& base, paths& res) const override; + void dump(ostream& sout) override; +}; + +#endif diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc new file mode 100644 index 00000000..df626395 --- /dev/null +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -0,0 +1,583 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "BlueRocksEnv.h" +#include "BlueFS.h" +#include "include/stringify.h" +#include "kv/RocksDBStore.h" +#include "string.h" + +rocksdb::Status err_to_status(int r) +{ + switch (r) { + case 0: + return rocksdb::Status::OK(); + case -ENOENT: + return rocksdb::Status::NotFound(rocksdb::Status::kNone); + case -EINVAL: + return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone); + case -EIO: + case -EEXIST: + return rocksdb::Status::IOError(rocksdb::Status::kNone); + case -ENOLCK: + return rocksdb::Status::IOError(strerror(r)); + default: + // FIXME :( + ceph_abort_msg("unrecognized error code"); + return rocksdb::Status::NotSupported(rocksdb::Status::kNone); + } +} + +// A file abstraction for reading sequentially through a file +class BlueRocksSequentialFile : public rocksdb::SequentialFile { + BlueFS *fs; + BlueFS::FileReader *h; + public: + BlueRocksSequentialFile(BlueFS *fs, 
BlueFS::FileReader *h) : fs(fs), h(h) {} + ~BlueRocksSequentialFile() override { + delete h; + } + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override { + int64_t r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); + ceph_assert(r >= 0); + *result = rocksdb::Slice(scratch, r); + return rocksdb::Status::OK(); + } + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + rocksdb::Status Skip(uint64_t n) override { + h->buf.skip(n); + return rocksdb::Status::OK(); + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + rocksdb::Status InvalidateCache(size_t offset, size_t length) override { + fs->invalidate_cache(h->file, offset, length); + return rocksdb::Status::OK(); + } +}; + +// A file abstraction for randomly reading the contents of a file. +class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { + BlueFS *fs; + BlueFS::FileReader *h; + public: + BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {} + ~BlueRocksRandomAccessFile() override { + delete h; + } + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result, + char* scratch) const override { + int64_t r = fs->read_random(h, offset, n, scratch); + ceph_assert(r >= 0); + *result = rocksdb::Slice(scratch, r); + return rocksdb::Status::OK(); + } + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to eachother by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. 
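+ // e.g. a file with inode number 0x2a yields the 16-hex-digit id
+ // "000000000000002a" (see the snprintf("%016llx", ...) below).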
+ size_t GetUniqueId(char* id, size_t max_size) const override { + return snprintf(id, max_size, "%016llx", + (unsigned long long)h->file->fnode.ino); + }; + + // Readahead the file starting from offset by n bytes for caching. + rocksdb::Status Prefetch(uint64_t offset, size_t n) override { + fs->read(h, &h->buf, offset, n, nullptr, nullptr); + return rocksdb::Status::OK(); + } + + //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + void Hint(AccessPattern pattern) override { + if (pattern == RANDOM) + h->buf.max_prefetch = 4096; + else if (pattern == SEQUENTIAL) + h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch; + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + rocksdb::Status InvalidateCache(size_t offset, size_t length) override { + fs->invalidate_cache(h->file, offset, length); + return rocksdb::Status::OK(); + } +}; + + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class BlueRocksWritableFile : public rocksdb::WritableFile { + BlueFS *fs; + BlueFS::FileWriter *h; + public: + BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {} + ~BlueRocksWritableFile() override { + fs->close_writer(h); + } + + // Indicates if the class makes use of unbuffered I/O + /*bool UseOSBuffer() const { + return true; + }*/ + + // This is needed when you want to allocate + // AlignedBuffer for use with file I/O classes + // Used for unbuffered file I/O when UseOSBuffer() returns false + /*size_t GetRequiredBufferAlignment() const { + return c_DefaultPageSize; + }*/ + + rocksdb::Status Append(const rocksdb::Slice& data) override { + fs->append_try_flush(h, data.data(), data.size()); + return rocksdb::Status::OK(); + } + + // Positioned write for unbuffered access default forward + // to simple append as most of the tests are buffered by default + rocksdb::Status PositionedAppend( + const rocksdb::Slice& /* data */, + uint64_t /* offset */) override { + return rocksdb::Status::NotSupported(); + } + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + rocksdb::Status Truncate(uint64_t size) override { + // we mirror the posix env, which does nothing here; instead, it + // truncates to the final size on close. whatever! + return rocksdb::Status::OK(); + //int r = fs->truncate(h, size); + // return err_to_status(r); + } + + rocksdb::Status Close() override { + Flush(); + + // mimic posix env, here. shrug. + size_t block_size; + size_t last_allocated_block; + GetPreallocationStatus(&block_size, &last_allocated_block); + if (last_allocated_block > 0) { + int r = fs->truncate(h, h->pos); + if (r < 0) + return err_to_status(r); + } + + return rocksdb::Status::OK(); + } + + rocksdb::Status Flush() override { + fs->flush(h); + return rocksdb::Status::OK(); + } + + rocksdb::Status Sync() override { // sync data + fs->fsync(h); + return rocksdb::Status::OK(); + } + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). 
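+ // (BlueFS::flush() and BlueFS::fsync() both serialize on BlueFS::lock
+ // internally; see BlueFS.h.)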
+ bool IsSyncThreadSafe() const override { + return true; + } + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + bool UseDirectIO() const { + return false; + } + + void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override { + h->write_hint = (const int)hint; + } + + /* + * Get the size of valid data in the file. + */ + uint64_t GetFileSize() override { + return h->file->fnode.size + h->buffer.length();; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + size_t GetUniqueId(char* id, size_t max_size) const override { + return snprintf(id, max_size, "%016llx", + (unsigned long long)h->file->fnode.ino); + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + rocksdb::Status InvalidateCache(size_t offset, size_t length) override { + fs->invalidate_cache(h->file, offset, length); + return rocksdb::Status::OK(); + } + + using rocksdb::WritableFile::RangeSync; + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + rocksdb::Status RangeSync(off_t offset, off_t nbytes) { + // round down to page boundaries + int partial = offset & 4095; + offset -= partial; + nbytes += partial; + nbytes &= ~4095; + if (nbytes) + fs->flush_range(h, offset, nbytes); + return rocksdb::Status::OK(); + } + + protected: + using rocksdb::WritableFile::Allocate; + /* + * Pre-allocate space for a file. + */ + rocksdb::Status Allocate(off_t offset, off_t len) { + int r = fs->preallocate(h->file, offset, len); + return err_to_status(r); + } +}; + + +// Directory object represents collection of files and implements +// filesystem operations that can be executed on directories. +class BlueRocksDirectory : public rocksdb::Directory { + BlueFS *fs; + public: + explicit BlueRocksDirectory(BlueFS *f) : fs(f) {} + + // Fsync directory. Can be called concurrently from multiple threads. + rocksdb::Status Fsync() override { + // it is sufficient to flush the log. + fs->sync_metadata(false); + return rocksdb::Status::OK(); + } +}; + +// Identifies a locked file. 
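+// Created by BlueRocksEnv::LockFile() below (wrapping the BlueFS::FileLock
+// obtained from BlueFS::lock_file()) and released in
+// BlueRocksEnv::UnlockFile().  Illustrative use (`env` is a BlueRocksEnv,
+// "db/LOCK" an example name):
+//
+//   rocksdb::FileLock *l = nullptr;
+//   env->LockFile("db/LOCK", &l);   // -> new BlueRocksFileLock
+//   ...
+//   env->UnlockFile(l);             // unlock_file() + delete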
+class BlueRocksFileLock : public rocksdb::FileLock { + public: + BlueFS *fs; + BlueFS::FileLock *lock; + BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { } + ~BlueRocksFileLock() override { + } +}; + + +// -------------------- +// --- BlueRocksEnv --- +// -------------------- + +BlueRocksEnv::BlueRocksEnv(BlueFS *f) + : EnvWrapper(Env::Default()), // forward most of it to POSIX + fs(f) +{ + +} + +rocksdb::Status BlueRocksEnv::NewSequentialFile( + const std::string& fname, + std::unique_ptr<rocksdb::SequentialFile>* result, + const rocksdb::EnvOptions& options) +{ + if (fname[0] == '/') + return target()->NewSequentialFile(fname, result, options); + std::string dir, file; + split(fname, &dir, &file); + BlueFS::FileReader *h; + int r = fs->open_for_read(dir, file, &h, false); + if (r < 0) + return err_to_status(r); + result->reset(new BlueRocksSequentialFile(fs, h)); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::NewRandomAccessFile( + const std::string& fname, + std::unique_ptr<rocksdb::RandomAccessFile>* result, + const rocksdb::EnvOptions& options) +{ + std::string dir, file; + split(fname, &dir, &file); + BlueFS::FileReader *h; + int r = fs->open_for_read(dir, file, &h, true); + if (r < 0) + return err_to_status(r); + result->reset(new BlueRocksRandomAccessFile(fs, h)); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::NewWritableFile( + const std::string& fname, + std::unique_ptr<rocksdb::WritableFile>* result, + const rocksdb::EnvOptions& options) +{ + std::string dir, file; + split(fname, &dir, &file); + BlueFS::FileWriter *h; + int r = fs->open_for_write(dir, file, &h, false); + if (r < 0) + return err_to_status(r); + result->reset(new BlueRocksWritableFile(fs, h)); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::ReuseWritableFile( + const std::string& new_fname, + const std::string& old_fname, + std::unique_ptr<rocksdb::WritableFile>* result, + const rocksdb::EnvOptions& options) +{ + std::string old_dir, old_file; + split(old_fname, &old_dir, &old_file); + std::string new_dir, new_file; + split(new_fname, &new_dir, &new_file); + + int r = fs->rename(old_dir, old_file, new_dir, new_file); + if (r < 0) + return err_to_status(r); + + BlueFS::FileWriter *h; + r = fs->open_for_write(new_dir, new_file, &h, true); + if (r < 0) + return err_to_status(r); + result->reset(new BlueRocksWritableFile(fs, h)); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::NewDirectory( + const std::string& name, + std::unique_ptr<rocksdb::Directory>* result) +{ + if (!fs->dir_exists(name)) + return rocksdb::Status::NotFound(name, strerror(ENOENT)); + result->reset(new BlueRocksDirectory(fs)); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname) +{ + if (fname[0] == '/') + return target()->FileExists(fname); + std::string dir, file; + split(fname, &dir, &file); + if (fs->stat(dir, file, NULL, NULL) == 0) + return rocksdb::Status::OK(); + return err_to_status(-ENOENT); +} + +rocksdb::Status BlueRocksEnv::GetChildren( + const std::string& dir, + std::vector<std::string>* result) +{ + result->clear(); + int r = fs->readdir(dir, result); + if (r < 0) + return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname) +{ + std::string dir, file; + split(fname, &dir, &file); + int r = fs->unlink(dir, file); + if (r < 0) + return 
err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname) +{ + int r = fs->mkdir(dirname); + if (r < 0) + return err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname) +{ + int r = fs->mkdir(dirname); + if (r < 0 && r != -EEXIST) + return err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname) +{ + int r = fs->rmdir(dirname); + if (r < 0) + return err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::GetFileSize( + const std::string& fname, + uint64_t* file_size) +{ + std::string dir, file; + split(fname, &dir, &file); + int r = fs->stat(dir, file, file_size, NULL); + if (r < 0) + return err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) +{ + std::string dir, file; + split(fname, &dir, &file); + utime_t mtime; + int r = fs->stat(dir, file, NULL, &mtime); + if (r < 0) + return err_to_status(r); + *file_mtime = mtime.sec(); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::RenameFile( + const std::string& src, + const std::string& target) +{ + std::string old_dir, old_file; + split(src, &old_dir, &old_file); + std::string new_dir, new_file; + split(target, &new_dir, &new_file); + + int r = fs->rename(old_dir, old_file, new_dir, new_file); + if (r < 0) + return err_to_status(r); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::LinkFile( + const std::string& src, + const std::string& target) +{ + ceph_abort(); +} + +rocksdb::Status BlueRocksEnv::AreFilesSame( + const std::string& first, + const std::string& second, bool* res) +{ + for (auto& path : {first, second}) { + if (fs->dir_exists(path)) { + continue; + } + std::string dir, file; + split(path, &dir, &file); + int r = fs->stat(dir, file, nullptr, nullptr); + if (!r) { + continue; + } else if (r == -ENOENT) { + return rocksdb::Status::NotFound("AreFilesSame", path); + } else { + return err_to_status(r); + } + } + *res = (first == second); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::LockFile( + const std::string& fname, + rocksdb::FileLock** lock) +{ + std::string dir, file; + split(fname, &dir, &file); + BlueFS::FileLock *l = NULL; + int r = fs->lock_file(dir, file, &l); + if (r < 0) + return err_to_status(r); + *lock = new BlueRocksFileLock(fs, l); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock) +{ + BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock); + int r = fs->unlock_file(l->lock); + if (r < 0) + return err_to_status(r); + delete lock; + lock = nullptr; + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::GetAbsolutePath( + const std::string& db_path, + std::string* output_path) +{ + // this is a lie... 
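+ // bluefs has no real notion of absolute paths; prefixing a '/' (e.g.
+ // "db" -> "/db") is enough to keep rocksdb happy.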
+ *output_path = "/" + db_path; + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::NewLogger( + const std::string& fname, + std::shared_ptr<rocksdb::Logger>* result) +{ + // ignore the filename :) + result->reset(create_rocksdb_ceph_logger()); + return rocksdb::Status::OK(); +} + +rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path) +{ + static int foo = 0; + *path = "temp_" + stringify(++foo); + return rocksdb::Status::OK(); +} diff --git a/src/os/bluestore/BlueRocksEnv.h b/src/os/bluestore/BlueRocksEnv.h new file mode 100644 index 00000000..82cffcd8 --- /dev/null +++ b/src/os/bluestore/BlueRocksEnv.h @@ -0,0 +1,164 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OS_BLUESTORE_BLUEROCKSENV_H +#define CEPH_OS_BLUESTORE_BLUEROCKSENV_H + +#include <memory> +#include <string> + +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/env_mirror.h" + +#include "include/ceph_assert.h" +#include "kv/RocksDBStore.h" + +class BlueFS; + +class BlueRocksEnv : public rocksdb::EnvWrapper { + void split(const std::string &fn, std::string *dir, std::string *file) { + size_t slash = fn.rfind('/'); + *file = fn.substr(slash + 1); + while (slash && fn[slash-1] == '/') + --slash; + *dir = fn.substr(0, slash); + } + +public: + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure, stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + rocksdb::Status NewSequentialFile( + const std::string& fname, + std::unique_ptr<rocksdb::SequentialFile>* result, + const rocksdb::EnvOptions& options) override; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure, stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + rocksdb::Status NewRandomAccessFile( + const std::string& fname, + std::unique_ptr<rocksdb::RandomAccessFile>* result, + const rocksdb::EnvOptions& options) override; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure, stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + rocksdb::Status NewWritableFile( + const std::string& fname, + std::unique_ptr<rocksdb::WritableFile>* result, + const rocksdb::EnvOptions& options) override; + + // Reuse an existing file by renaming it and opening it as writable. + rocksdb::Status ReuseWritableFile( + const std::string& fname, + const std::string& old_fname, + std::unique_ptr<rocksdb::WritableFile>* result, + const rocksdb::EnvOptions& options) override; + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. 
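+ // (BlueFS directories form a flat, single-level namespace, e.g. "db",
+ // "db.wal", "db.slow"; there is no nesting.)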
+ rocksdb::Status NewDirectory( + const std::string& name, + std::unique_ptr<rocksdb::Directory>* result) override; + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + rocksdb::Status FileExists(const std::string& fname) override; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + rocksdb::Status GetChildren(const std::string& dir, + std::vector<std::string>* result) override; + + // Delete the named file. + rocksdb::Status DeleteFile(const std::string& fname) override; + + // Create the specified directory. Returns error if directory exists. + rocksdb::Status CreateDir(const std::string& dirname) override; + + // Create directory if missing. Return Ok if it exists, or successful in + // Creating. + rocksdb::Status CreateDirIfMissing(const std::string& dirname) override; + + // Delete the specified directory. + rocksdb::Status DeleteDir(const std::string& dirname) override; + + // Store the size of fname in *file_size. + rocksdb::Status GetFileSize(const std::string& fname, uint64_t* file_size) override; + + // Store the last modification time of fname in *file_mtime. + rocksdb::Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override; + // Rename file src to target. + rocksdb::Status RenameFile(const std::string& src, + const std::string& target) override; + // Hard Link file src to target. + rocksdb::Status LinkFile(const std::string& src, const std::string& target) override; + + // Tell if two files are identical + rocksdb::Status AreFilesSame(const std::string& first, + const std::string& second, bool* res) override; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + rocksdb::Status LockFile(const std::string& fname, rocksdb::FileLock** lock) override; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + rocksdb::Status UnlockFile(rocksdb::FileLock* lock) override; + + // *path is set to a temporary directory that can be used for testing. It may + // or may not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + rocksdb::Status GetTestDirectory(std::string* path) override; + + // Create and return a log file for storing informational messages. + rocksdb::Status NewLogger( + const std::string& fname, + std::shared_ptr<rocksdb::Logger>* result) override; + + // Get full directory name for this db. 
+ rocksdb::Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override; + + explicit BlueRocksEnv(BlueFS *f); +private: + BlueFS *fs; +}; + +#endif diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc new file mode 100644 index 00000000..d701ef4d --- /dev/null +++ b/src/os/bluestore/BlueStore.cc @@ -0,0 +1,15265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include <boost/container/flat_set.hpp> +#include "boost/algorithm/string.hpp" + +#include "include/cpp-btree/btree_set.h" + +#include "BlueStore.h" +#include "os/kv.h" +#include "include/compat.h" +#include "include/intarith.h" +#include "include/stringify.h" +#include "include/str_map.h" +#include "include/util.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "common/PriorityCache.h" +#include "Allocator.h" +#include "FreelistManager.h" +#include "BlueFS.h" +#include "BlueRocksEnv.h" +#include "auth/Crypto.h" +#include "common/EventTrace.h" +#include "perfglue/heap_profiler.h" +#include "common/blkdev.h" +#include "common/numa.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore + +using bid_t = decltype(BlueStore::Blob::id); + +// bluestore_cache_onode +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode, + bluestore_cache_onode); + +// bluestore_cache_other +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer, + bluestore_Buffer); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent, + bluestore_Extent); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob, + bluestore_Blob); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob, + bluestore_SharedBlob); + +// bluestore_txc +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext, + bluestore_txc); + + +// kv store prefixes +const string PREFIX_SUPER = "S"; // field -> value +const string PREFIX_STAT = "T"; // field -> value(int64 array) +const string PREFIX_COLL = "C"; // collection name -> cnode_t +const string PREFIX_OBJ = "O"; // object name -> onode_t +const string PREFIX_OMAP = "M"; // u64 + keyname -> value +const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll) +const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t +const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist) +const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager) +const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t + +const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs"; + +// write a label in the first block. always use this size. note that +// bluefs makes a matching assumption about the location of its +// superblock (always the second block of the device). +#define BDEV_LABEL_BLOCK_SIZE 4096 + +// reserve: label (4k) + bluefs super (4k), which means we start at 8k. 
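+//
+// i.e. (illustrative) the start of the primary device is laid out as:
+//
+//   [0x0000, 0x1000)  bdev label (BDEV_LABEL_BLOCK_SIZE)
+//   [0x1000, 0x2000)  bluefs superblock (cf. BlueFS::get_super_offset())
+//   [0x2000, ...   )  usable space, i.e. SUPER_RESERVED = 8192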
+#define SUPER_RESERVED 8192 + +#define OBJECT_MAX_SIZE 0xffffffff // 32 bits + + +/* + * extent map blob encoding + * + * we use the low bits of the blobid field to indicate some common scenarios + * and spanning vs local ids. See ExtentMap::{encode,decode}_some(). + */ +#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous +#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0 +#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent +#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id +#define BLOBID_SHIFT_BITS 4 + +/* + * object name key structure + * + * encoded u8: shard + 2^7 (so that it sorts properly) + * encoded u64: poolid + 2^63 (so that it sorts properly) + * encoded u32: hash (bit reversed) + * + * escaped string: namespace + * + * escaped string: key or object name + * 1 char: '<', '=', or '>'. if =, then object key == object name, and + * we are done. otherwise, we are followed by the object name. + * escaped string: object name (unless '=' above) + * + * encoded u64: snap + * encoded u64: generation + * 'o' + */ +#define ONODE_KEY_SUFFIX 'o' + +/* + * extent shard key + * + * object prefix key + * u32 + * 'x' + */ +#define EXTENT_SHARD_KEY_SUFFIX 'x' + +/* + * string encoding in the key + * + * The key string needs to lexicographically sort the same way that + * ghobject_t does. We do this by escaping anything <= to '#' with # + * plus a 2 digit hex string, and anything >= '~' with ~ plus the two + * hex digits. + * + * We use ! as a terminator for strings; this works because it is < # + * and will get escaped if it is present in the string. + * + * NOTE: There is a bug in this implementation: due to implicit + * character type conversion in comparison it may produce unexpected + * ordering. Unfortunately fixing the bug would mean invalidating the + * keys in existing deployments. Instead we do additional sorting + * where it is needed. 
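+ *
+ * For example (illustrative), append_escaped() maps
+ *
+ *   "foo"     -> "foo!"
+ *   "foo bar" -> "foo#20bar!"    (' ' <= '#', so it is escaped)
+ *   "a#b~c"   -> "a#23b~7ec!"
+ *
+ * and decode_escaped() reverses the mapping, stopping at the '!'
+ * terminator.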
+ */ +template<typename S> +static void append_escaped(const string &in, S *out) +{ + char hexbyte[in.length() * 3 + 1]; + char* ptr = &hexbyte[0]; + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i <= '#') { // bug: unexpected result for *i > 0x7f + *ptr++ = '#'; + *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f]; + *ptr++ = "0123456789abcdef"[*i & 0x0f]; + } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f + *ptr++ = '~'; + *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f]; + *ptr++ = "0123456789abcdef"[*i & 0x0f]; + } else { + *ptr++ = *i; + } + } + *ptr++ = '!'; + out->append(hexbyte, ptr - &hexbyte[0]); +} + +inline unsigned h2i(char c) +{ + if ((c >= '0') && (c <= '9')) { + return c - 0x30; + } else if ((c >= 'a') && (c <= 'f')) { + return c - 'a' + 10; + } else if ((c >= 'A') && (c <= 'F')) { + return c - 'A' + 10; + } else { + return 256; // make it always larger than 255 + } +} + +static int decode_escaped(const char *p, string *out) +{ + char buff[256]; + char* ptr = &buff[0]; + char* max = &buff[252]; + const char *orig_p = p; + while (*p && *p != '!') { + if (*p == '#' || *p == '~') { + unsigned hex = 0; + p++; + hex = h2i(*p++) << 4; + if (hex > 255) { + return -EINVAL; + } + hex |= h2i(*p++); + if (hex > 255) { + return -EINVAL; + } + *ptr++ = hex; + } else { + *ptr++ = *p++; + } + if (ptr > max) { + out->append(buff, ptr-buff); + ptr = &buff[0]; + } + } + if (ptr != buff) { + out->append(buff, ptr-buff); + } + return p - orig_p; +} + +// some things we encode in binary (as le32 or le64); print the +// resulting key strings nicely +template<typename S> +static string pretty_binary_string(const S& in) +{ + char buf[10]; + string out; + out.reserve(in.length() * 3); + enum { NONE, HEX, STRING } mode = NONE; + unsigned from = 0, i; + for (i=0; i < in.length(); ++i) { + if ((in[i] < 32 || (unsigned char)in[i] > 126) || + (mode == HEX && in.length() - i >= 4 && + ((in[i] < 32 || (unsigned char)in[i] > 126) || + (in[i+1] < 32 || (unsigned char)in[i+1] > 126) || + (in[i+2] < 32 || (unsigned char)in[i+2] > 126) || + (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) { + if (mode == STRING) { + out.append(in.c_str() + from, i - from); + out.push_back('\''); + } + if (mode != HEX) { + out.append("0x"); + mode = HEX; + } + if (in.length() - i >= 4) { + // print a whole u32 at once + snprintf(buf, sizeof(buf), "%08x", + (uint32_t)(((unsigned char)in[i] << 24) | + ((unsigned char)in[i+1] << 16) | + ((unsigned char)in[i+2] << 8) | + ((unsigned char)in[i+3] << 0))); + i += 3; + } else { + snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]); + } + out.append(buf); + } else { + if (mode != STRING) { + out.push_back('\''); + mode = STRING; + from = i; + } + } + } + if (mode == STRING) { + out.append(in.c_str() + from, i - from); + out.push_back('\''); + } + return out; +} + +template<typename T> +static void _key_encode_shard(shard_id_t shard, T *key) +{ + key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80)); +} + +static const char *_key_decode_shard(const char *key, shard_id_t *pshard) +{ + pshard->id = (uint8_t)*key - (uint8_t)0x80; + return key + 1; +} + +static void get_coll_range(const coll_t& cid, int bits, + ghobject_t *temp_start, ghobject_t *temp_end, + ghobject_t *start, ghobject_t *end) +{ + spg_t pgid; + if (cid.is_pg(&pgid)) { + start->shard_id = pgid.shard; + *temp_start = *start; + + start->hobj.pool = pgid.pool(); + temp_start->hobj.pool = -2ll - pgid.pool(); + + *end = *start; + *temp_end = *temp_start; + + uint32_t reverse_hash = 
hobject_t::_reverse_bits(pgid.ps()); + start->hobj.set_bitwise_key_u32(reverse_hash); + temp_start->hobj.set_bitwise_key_u32(reverse_hash); + + uint64_t end_hash = reverse_hash + (1ull << (32 - bits)); + if (end_hash > 0xffffffffull) + end_hash = 0xffffffffull; + + end->hobj.set_bitwise_key_u32(end_hash); + temp_end->hobj.set_bitwise_key_u32(end_hash); + } else { + start->shard_id = shard_id_t::NO_SHARD; + start->hobj.pool = -1ull; + + *end = *start; + start->hobj.set_bitwise_key_u32(0); + end->hobj.set_bitwise_key_u32(0xffffffff); + + // no separate temp section + *temp_start = *end; + *temp_end = *end; + } + + start->generation = 0; + end->generation = 0; + temp_start->generation = 0; + temp_end->generation = 0; +} + +static void get_shared_blob_key(uint64_t sbid, string *key) +{ + key->clear(); + _key_encode_u64(sbid, key); +} + +static int get_key_shared_blob(const string& key, uint64_t *sbid) +{ + const char *p = key.c_str(); + if (key.length() < sizeof(uint64_t)) + return -1; + _key_decode_u64(p, sbid); + return 0; +} + +template<typename S> +static void _key_encode_prefix(const ghobject_t& oid, S *key) +{ + _key_encode_shard(oid.shard_id, key); + _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); + _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); +} + +static const char *_key_decode_prefix(const char *p, ghobject_t *oid) +{ + p = _key_decode_shard(p, &oid->shard_id); + + uint64_t pool; + p = _key_decode_u64(p, &pool); + oid->hobj.pool = pool - 0x8000000000000000ull; + + unsigned hash; + p = _key_decode_u32(p, &hash); + + oid->hobj.set_bitwise_key_u32(hash); + + return p; +} + +#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4) + +template<typename S> +static int get_key_object(const S& key, ghobject_t *oid) +{ + int r; + const char *p = key.c_str(); + + if (key.length() < ENCODED_KEY_PREFIX_LEN) + return -1; + + p = _key_decode_prefix(p, oid); + + if (key.length() == ENCODED_KEY_PREFIX_LEN) + return -2; + + r = decode_escaped(p, &oid->hobj.nspace); + if (r < 0) + return -2; + p += r + 1; + + string k; + r = decode_escaped(p, &k); + if (r < 0) + return -3; + p += r + 1; + if (*p == '=') { + // no key + ++p; + oid->hobj.oid.name = k; + } else if (*p == '<' || *p == '>') { + // key + name + ++p; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -5; + p += r + 1; + oid->hobj.set_key(k); + } else { + // malformed + return -6; + } + + p = _key_decode_u64(p, &oid->hobj.snap.val); + p = _key_decode_u64(p, &oid->generation); + + if (*p != ONODE_KEY_SUFFIX) { + return -7; + } + p++; + if (*p) { + // if we get something other than a null terminator here, + // something goes wrong. + return -8; + } + + return 0; +} + +template<typename S> +static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key) +{ + key->clear(); + + size_t max_len = ENCODED_KEY_PREFIX_LEN + + (oid.hobj.nspace.length() * 3 + 1) + + (oid.hobj.get_key().length() * 3 + 1) + + 1 + // for '<', '=', or '>' + (oid.hobj.oid.name.length() * 3 + 1) + + 8 + 8 + 1; + key->reserve(max_len); + + _key_encode_prefix(oid, key); + + append_escaped(oid.hobj.nspace, key); + + if (oid.hobj.get_key().length()) { + // is a key... could be < = or >. + append_escaped(oid.hobj.get_key(), key); + // (ASCII chars < = and > sort in that order, yay) + int r = oid.hobj.get_key().compare(oid.hobj.oid.name); + if (r) { + key->append(r > 0 ? 
">" : "<"); + append_escaped(oid.hobj.oid.name, key); + } else { + // same as no key + key->append("="); + } + } else { + // no key + append_escaped(oid.hobj.oid.name, key); + key->append("="); + } + + _key_encode_u64(oid.hobj.snap, key); + _key_encode_u64(oid.generation, key); + + key->push_back(ONODE_KEY_SUFFIX); + + // sanity check + if (true) { + ghobject_t t; + int r = get_key_object(*key, &t); + if (r || t != oid) { + derr << " r " << r << dendl; + derr << "key " << pretty_binary_string(*key) << dendl; + derr << "oid " << oid << dendl; + derr << " t " << t << dendl; + ceph_assert(r == 0 && t == oid); + } + } +} + + +// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing +// char lets us quickly test whether it is a shard key without decoding any +// of the prefix bytes. +template<typename S> +static void get_extent_shard_key(const S& onode_key, uint32_t offset, + string *key) +{ + key->clear(); + key->reserve(onode_key.length() + 4 + 1); + key->append(onode_key.c_str(), onode_key.size()); + _key_encode_u32(offset, key); + key->push_back(EXTENT_SHARD_KEY_SUFFIX); +} + +static void rewrite_extent_shard_key(uint32_t offset, string *key) +{ + ceph_assert(key->size() > sizeof(uint32_t) + 1); + ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX); + _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key); +} + +template<typename S> +static void generate_extent_shard_key_and_apply( + const S& onode_key, + uint32_t offset, + string *key, + std::function<void(const string& final_key)> apply) +{ + if (key->empty()) { // make full key + ceph_assert(!onode_key.empty()); + get_extent_shard_key(onode_key, offset, key); + } else { + rewrite_extent_shard_key(offset, key); + } + apply(*key); +} + +int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset) +{ + ceph_assert(key.size() > sizeof(uint32_t) + 1); + ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX); + int okey_len = key.size() - sizeof(uint32_t) - 1; + *onode_key = key.substr(0, okey_len); + const char *p = key.data() + okey_len; + _key_decode_u32(p, offset); + return 0; +} + +static bool is_extent_shard_key(const string& key) +{ + return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX; +} + +// '-' < '.' < '~' +static void get_omap_header(uint64_t id, string *out) +{ + _key_encode_u64(id, out); + out->push_back('-'); +} + +// hmm, I don't think there's any need to escape the user key since we +// have a clean prefix. 
+static void get_omap_key(uint64_t id, const string& key, string *out) +{ + _key_encode_u64(id, out); + out->push_back('.'); + out->append(key); +} + +static void rewrite_omap_key(uint64_t id, string old, string *out) +{ + _key_encode_u64(id, out); + out->append(old.c_str() + out->length(), old.size() - out->length()); +} + +static void decode_omap_key(const string& key, string *user_key) +{ + *user_key = key.substr(sizeof(uint64_t) + 1); +} + +static void get_omap_tail(uint64_t id, string *out) +{ + _key_encode_u64(id, out); + out->push_back('~'); +} + +static void get_deferred_key(uint64_t seq, string *out) +{ + _key_encode_u64(seq, out); +} + +static void get_pool_stat_key(int64_t pool_id, string *key) +{ + key->clear(); + _key_encode_u64(pool_id, key); +} + +static int get_key_pool_stat(const string& key, uint64_t* pool_id) +{ + const char *p = key.c_str(); + if (key.length() < sizeof(uint64_t)) + return -1; + _key_decode_u64(p, pool_id); + return 0; +} + +template <int LogLevelV> +void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em) +{ + uint64_t pos = 0; + for (auto& s : em.shards) { + dout(LogLevelV) << __func__ << " shard " << *s.shard_info + << (s.loaded ? " (loaded)" : "") + << (s.dirty ? " (dirty)" : "") + << dendl; + } + for (auto& e : em.extent_map) { + dout(LogLevelV) << __func__ << " " << e << dendl; + ceph_assert(e.logical_offset >= pos); + pos = e.logical_offset + e.length; + const bluestore_blob_t& blob = e.blob->get_blob(); + if (blob.has_csum()) { + vector<uint64_t> v; + unsigned n = blob.get_csum_count(); + for (unsigned i = 0; i < n; ++i) + v.push_back(blob.get_csum_item(i)); + dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec + << dendl; + } + std::lock_guard l(e.blob->shared_blob->get_cache()->lock); + for (auto& i : e.blob->shared_blob->bc.buffer_map) { + dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first + << "~" << i.second->length << std::dec + << " " << *i.second << dendl; + } + } +} + +template <int LogLevelV> +void _dump_onode(CephContext *cct, const BlueStore::Onode& o) +{ + if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>()) + return; + dout(LogLevelV) << __func__ << " " << &o << " " << o.oid + << " nid " << o.onode.nid + << " size 0x" << std::hex << o.onode.size + << " (" << std::dec << o.onode.size << ")" + << " expected_object_size " << o.onode.expected_object_size + << " expected_write_size " << o.onode.expected_write_size + << " in " << o.onode.extent_map_shards.size() << " shards" + << ", " << o.extent_map.spanning_blob_map.size() + << " spanning blobs" + << dendl; + for (auto p = o.onode.attrs.begin(); + p != o.onode.attrs.end(); + ++p) { + dout(LogLevelV) << __func__ << " attr " << p->first + << " len " << p->second.length() << dendl; + } + _dump_extent_map<LogLevelV>(cct, o.extent_map); +} + +template <int LogLevelV> +void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t) +{ + dout(LogLevelV) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; +} + +// merge operators + +struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator { + void merge_nonexistent( + const char *rdata, size_t rlen, std::string *new_value) override { + *new_value = std::string(rdata, rlen); + } + void merge( + const char *ldata, size_t llen, + const char *rdata, size_t rlen, + std::string *new_value) override { + ceph_assert(llen == rlen); + ceph_assert((rlen 
% 8) == 0); + new_value->resize(rlen); + const ceph_le64* lv = (const ceph_le64*)ldata; + const ceph_le64* rv = (const ceph_le64*)rdata; + ceph_le64* nv = &(ceph_le64&)new_value->at(0); + for (size_t i = 0; i < rlen >> 3; ++i) { + nv[i] = lv[i] + rv[i]; + } + } + // We use each operator name and each prefix to construct the + // overall RocksDB operator name for consistency check at open time. + const char *name() const override { + return "int64_array"; + } +}; + + +// Buffer + +ostream& operator<<(ostream& out, const BlueStore::Buffer& b) +{ + out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex + << b.offset << "~" << b.length << std::dec + << " " << BlueStore::Buffer::get_state_name(b.state); + if (b.flags) + out << " " << BlueStore::Buffer::get_flag_name(b.flags); + return out << ")"; +} + +namespace { + +/* + * Due to a bug in key string encoding (see a comment for append_escaped) + * the KeyValueDB iterator does not lexicographically sort the same + * way that ghobject_t does: objects with the same hash may have wrong order. + * + * This is the iterator wrapper that fixes the keys order. + */ + +class CollectionListIterator { +public: + CollectionListIterator(const KeyValueDB::Iterator &it) + : m_it(it) { + } + virtual ~CollectionListIterator() { + } + + virtual bool valid() const = 0; + virtual const ghobject_t &oid() const = 0; + virtual void lower_bound(const ghobject_t &oid) = 0; + virtual void upper_bound(const ghobject_t &oid) = 0; + virtual void next() = 0; + + virtual int cmp(const ghobject_t &oid) const = 0; + + bool is_ge(const ghobject_t &oid) const { + return cmp(oid) >= 0; + } + + bool is_lt(const ghobject_t &oid) const { + return cmp(oid) < 0; + } + +protected: + KeyValueDB::Iterator m_it; +}; + +class SimpleCollectionListIterator : public CollectionListIterator { +public: + SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it) + : CollectionListIterator(it), m_cct(cct) { + } + + bool valid() const override { + return m_it->valid(); + } + + const ghobject_t &oid() const override { + ceph_assert(valid()); + + return m_oid; + } + + void lower_bound(const ghobject_t &oid) override { + string key; + get_object_key(m_cct, oid, &key); + + m_it->lower_bound(key); + get_oid(); + } + + void upper_bound(const ghobject_t &oid) override { + string key; + get_object_key(m_cct, oid, &key); + + m_it->upper_bound(key); + get_oid(); + } + + void next() override { + ceph_assert(valid()); + + m_it->next(); + get_oid(); + } + + int cmp(const ghobject_t &oid) const override { + ceph_assert(valid()); + + string key; + get_object_key(m_cct, oid, &key); + + return m_it->key().compare(key); + } + +private: + CephContext *m_cct; + ghobject_t m_oid; + + void get_oid() { + if (!valid()) { + return; + } + + if (is_extent_shard_key(m_it->key())) { + next(); + return; + } + + m_oid = ghobject_t(); + int r = get_key_object(m_it->key(), &m_oid); + ceph_assert(r == 0); + } +}; + +class SortedCollectionListIterator : public CollectionListIterator { +public: + SortedCollectionListIterator(const KeyValueDB::Iterator &it) + : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) { + } + + bool valid() const override { + return m_chunk_iter != m_chunk.end(); + } + + const ghobject_t &oid() const override { + ceph_assert(valid()); + + return m_chunk_iter->first; + } + + void lower_bound(const ghobject_t &oid) override { + std::string key; + _key_encode_prefix(oid, &key); + + m_it->lower_bound(key); + m_chunk_iter = m_chunk.end(); + if (!get_next_chunk()) { + return; 
+ } + + if (this->oid().shard_id != oid.shard_id || + this->oid().hobj.pool != oid.hobj.pool || + this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { + return; + } + + m_chunk_iter = m_chunk.lower_bound(oid); + if (m_chunk_iter == m_chunk.end()) { + get_next_chunk(); + } + } + + void upper_bound(const ghobject_t &oid) override { + lower_bound(oid); + + if (valid() && this->oid() == oid) { + next(); + } + } + + void next() override { + ceph_assert(valid()); + + m_chunk_iter++; + if (m_chunk_iter == m_chunk.end()) { + get_next_chunk(); + } + } + + int cmp(const ghobject_t &oid) const override { + ceph_assert(valid()); + + if (this->oid() < oid) { + return -1; + } + if (this->oid() > oid) { + return 1; + } + return 0; + } + +private: + std::map<ghobject_t, std::string> m_chunk; + std::map<ghobject_t, std::string>::iterator m_chunk_iter; + + bool get_next_chunk() { + while (m_it->valid() && is_extent_shard_key(m_it->key())) { + m_it->next(); + } + + if (!m_it->valid()) { + return false; + } + + ghobject_t oid; + int r = get_key_object(m_it->key(), &oid); + ceph_assert(r == 0); + + m_chunk.clear(); + while (true) { + m_chunk.insert({oid, m_it->key()}); + + do { + m_it->next(); + } while (m_it->valid() && is_extent_shard_key(m_it->key())); + + if (!m_it->valid()) { + break; + } + + ghobject_t next; + r = get_key_object(m_it->key(), &next); + ceph_assert(r == 0); + if (next.shard_id != oid.shard_id || + next.hobj.pool != oid.hobj.pool || + next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { + break; + } + oid = next; + } + + m_chunk_iter = m_chunk.begin(); + return true; + } +}; + +} // anonymous namespace + +// Garbage Collector + +void BlueStore::GarbageCollector::process_protrusive_extents( + const BlueStore::ExtentMap& extent_map, + uint64_t start_offset, + uint64_t end_offset, + uint64_t start_touch_offset, + uint64_t end_touch_offset, + uint64_t min_alloc_size) +{ + ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset); + + uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size); + uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size); + + dout(30) << __func__ << " (hex): [" << std::hex + << lookup_start_offset << ", " << lookup_end_offset + << ")" << std::dec << dendl; + + for (auto it = extent_map.seek_lextent(lookup_start_offset); + it != extent_map.extent_map.end() && + it->logical_offset < lookup_end_offset; + ++it) { + uint64_t alloc_unit_start = it->logical_offset / min_alloc_size; + uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size; + + dout(30) << __func__ << " " << *it + << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end + << dendl; + + Blob* b = it->blob.get(); + + if (it->logical_offset >=start_touch_offset && + it->logical_end() <= end_touch_offset) { + // Process extents within the range affected by + // the current write request. 
+ // Need to take into account if existing extents + // can be merged with them (uncompressed case) + if (!b->get_blob().is_compressed()) { + if (blob_info_counted && used_alloc_unit == alloc_unit_start) { + --blob_info_counted->expected_allocations; // don't need to allocate + // new AU for compressed + // data since another + // collocated uncompressed + // blob already exists + dout(30) << __func__ << " --expected:" + << alloc_unit_start << dendl; + } + used_alloc_unit = alloc_unit_end; + blob_info_counted = nullptr; + } + } else if (b->get_blob().is_compressed()) { + + // additionally we take compressed blobs that were not impacted + // by the write into account too + BlobInfo& bi = + affected_blobs.emplace( + b, BlobInfo(b->get_referenced_bytes())).first->second; + + int adjust = + (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1; + bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust; + dout(30) << __func__ << " expected_allocations=" + << bi.expected_allocations << " end_au:" + << alloc_unit_end << dendl; + + blob_info_counted = &bi; + used_alloc_unit = alloc_unit_end; + + ceph_assert(it->length <= bi.referenced_bytes); + bi.referenced_bytes -= it->length; + dout(30) << __func__ << " affected_blob:" << *b + << " unref 0x" << std::hex << it->length + << " referenced = 0x" << bi.referenced_bytes + << std::dec << dendl; + // NOTE: we can't move specific blob to resulting GC list here + // when reference counter == 0 since subsequent extents might + // decrement its expected_allocation. + // Hence need to enumerate all the extents first. + if (!bi.collect_candidate) { + bi.first_lextent = it; + bi.collect_candidate = true; + } + bi.last_lextent = it; + } else { + if (blob_info_counted && used_alloc_unit == alloc_unit_start) { + // don't need to allocate new AU for compressed data since another + // collocated uncompressed blob already exists + --blob_info_counted->expected_allocations; + dout(30) << __func__ << " --expected_allocations:" + << alloc_unit_start << dendl; + } + used_alloc_unit = alloc_unit_end; + blob_info_counted = nullptr; + } + } + + for (auto b_it = affected_blobs.begin(); + b_it != affected_blobs.end(); + ++b_it) { + Blob* b = b_it->first; + BlobInfo& bi = b_it->second; + if (bi.referenced_bytes == 0) { + uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length(); + int64_t blob_expected_for_release = + round_up_to(len_on_disk, min_alloc_size) / min_alloc_size; + + dout(30) << __func__ << " " << *(b_it->first) + << " expected4release=" << blob_expected_for_release + << " expected_allocations=" << bi.expected_allocations + << dendl; + int64_t benefit = blob_expected_for_release - bi.expected_allocations; + if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) { + if (bi.collect_candidate) { + auto it = bi.first_lextent; + bool bExit = false; + do { + if (it->blob.get() == b) { + extents_to_collect.insert(it->logical_offset, it->length); + } + bExit = it == bi.last_lextent; + ++it; + } while (!bExit); + } + expected_for_release += blob_expected_for_release; + expected_allocations += bi.expected_allocations; + } + } + } +} + +int64_t BlueStore::GarbageCollector::estimate( + uint64_t start_offset, + uint64_t length, + const BlueStore::ExtentMap& extent_map, + const BlueStore::old_extent_map_t& old_extents, + uint64_t min_alloc_size) +{ + + affected_blobs.clear(); + extents_to_collect.clear(); + used_alloc_unit = boost::optional<uint64_t >(); + blob_info_counted = nullptr; + + uint64_t gc_start_offset = start_offset; + 
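+  // gc_start_offset/gc_end_offset begin as the overwritten range and are
+  // widened below to span any compressed blobs referenced by the replaced
+  // extents; the value returned is expected_for_release minus
+  // expected_allocations, i.e. the net number of allocation units GC would free.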
uint64_t gc_end_offset = start_offset + length; + + uint64_t end_offset = start_offset + length; + + for (auto it = old_extents.begin(); it != old_extents.end(); ++it) { + Blob* b = it->e.blob.get(); + if (b->get_blob().is_compressed()) { + + // update gc_start_offset/gc_end_offset if needed + gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start()); + gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end()); + + auto o = it->e.logical_offset; + auto l = it->e.length; + + uint64_t ref_bytes = b->get_referenced_bytes(); + // micro optimization to bypass blobs that have no more references + if (ref_bytes != 0) { + dout(30) << __func__ << " affected_blob:" << *b + << " unref 0x" << std::hex << o << "~" << l + << std::dec << dendl; + affected_blobs.emplace(b, BlobInfo(ref_bytes)); + } + } + } + dout(30) << __func__ << " gc range(hex): [" << std::hex + << gc_start_offset << ", " << gc_end_offset + << ")" << std::dec << dendl; + + // enumerate preceeding extents to check if they reference affected blobs + if (gc_start_offset < start_offset || gc_end_offset > end_offset) { + process_protrusive_extents(extent_map, + gc_start_offset, + gc_end_offset, + start_offset, + end_offset, + min_alloc_size); + } + return expected_for_release - expected_allocations; +} + +// Cache + +BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type, + PerfCounters *logger) +{ + Cache *c = nullptr; + + if (type == "lru") + c = new LRUCache(cct); + else if (type == "2q") + c = new TwoQCache(cct); + else + ceph_abort_msg("unrecognized cache type"); + + c->logger = logger; + return c; +} + +void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max) +{ + std::lock_guard l(lock); + if (cct->_conf->objectstore_blackhole) { + // do not trim if we are throwing away IOs a layer down + return; + } + _trim(onode_max, buffer_max); +} + +void BlueStore::Cache::trim_all() +{ + std::lock_guard l(lock); + // we should not be shutting down after the blackhole is enabled + assert(!cct->_conf->objectstore_blackhole); + _trim(0, 0); +} + +// LRUCache +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") " + +void BlueStore::LRUCache::_touch_onode(OnodeRef& o) +{ + auto p = onode_lru.iterator_to(*o); + onode_lru.erase(p); + onode_lru.push_front(*o); +} + +void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max) +{ + dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max + << " buffers " << buffer_size << " / " << buffer_max + << dendl; + + _audit("trim start"); + + // buffers + while (buffer_size > buffer_max) { + auto i = buffer_lru.rbegin(); + if (i == buffer_lru.rend()) { + // stop if buffer_lru is now empty + break; + } + + Buffer *b = &*i; + ceph_assert(b->is_clean()); + dout(20) << __func__ << " rm " << *b << dendl; + b->space->_rm_buffer(this, b); + } + + // onodes + if (onode_max >= onode_lru.size() || + last_pinned == onode_lru.begin()) { + return; // don't even try + } + uint64_t num = onode_lru.size() - onode_max; + + auto p = last_pinned; + last_pinned = onode_lru.end(); + ceph_assert(p != onode_lru.begin()); + --p; + int skipped = 0; + int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned; + while (num > 0) { + Onode *o = &*p; + int refs = o->nref.load(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs, skipping" << dendl; + if (++skipped >= max_skipped) { + dout(15) << __func__ << " maximum skip pinned reached; stopping with " + << num << " left to trim" << 
dendl; + last_pinned = p; + break; + } + + if (p == onode_lru.begin()) { + break; + } else { + p--; + num--; + continue; + } + } + dout(30) << __func__ << " rm " << o->oid << dendl; + if (p != onode_lru.begin()) { + _onode_lru_erase(p--); + } else { + _onode_lru_erase(p); + num = 1; // fake num to end the loop + // in we might still have some pinned onodes + } + o->get(); // paranoia + o->c->onode_map.remove(o->oid); + o->put(); + --num; + } +} + +#ifdef DEBUG_CACHE +void BlueStore::LRUCache::_audit(const char *when) +{ + dout(10) << __func__ << " " << when << " start" << dendl; + uint64_t s = 0; + for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { + s += i->length; + } + if (s != buffer_size) { + derr << __func__ << " buffer_size " << buffer_size << " actual " << s + << dendl; + for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { + derr << __func__ << " " << *i << dendl; + } + ceph_assert(s == buffer_size); + } + dout(20) << __func__ << " " << when << " buffer_size " << buffer_size + << " ok" << dendl; +} +#endif + +// TwoQCache +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") " + + +void BlueStore::TwoQCache::_touch_onode(OnodeRef& o) +{ + auto p = onode_lru.iterator_to(*o); + _onode_lru_erase(p); + onode_lru.push_front(*o); +} + +void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near) +{ + dout(20) << __func__ << " level " << level << " near " << near + << " on " << *b + << " which has cache_private " << b->cache_private << dendl; + if (near) { + b->cache_private = near->cache_private; + switch (b->cache_private) { + case BUFFER_WARM_IN: + buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b); + break; + case BUFFER_WARM_OUT: + ceph_assert(b->is_empty()); + buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b); + break; + case BUFFER_HOT: + buffer_hot.insert(buffer_hot.iterator_to(*near), *b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + } else if (b->cache_private == BUFFER_NEW) { + b->cache_private = BUFFER_WARM_IN; + if (level > 0) { + buffer_warm_in.push_front(*b); + } else { + // take caller hint to start at the back of the warm queue + buffer_warm_in.push_back(*b); + } + } else { + // we got a hint from discard + switch (b->cache_private) { + case BUFFER_WARM_IN: + // stay in warm_in. move to front, even though 2Q doesn't actually + // do this. + dout(20) << __func__ << " move to front of warm " << *b << dendl; + buffer_warm_in.push_front(*b); + break; + case BUFFER_WARM_OUT: + b->cache_private = BUFFER_HOT; + // move to hot. 
fall-thru + case BUFFER_HOT: + dout(20) << __func__ << " move to front of hot " << *b << dendl; + buffer_hot.push_front(*b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + } + if (!b->is_empty()) { + buffer_bytes += b->length; + buffer_list_bytes[b->cache_private] += b->length; + } +} + +void BlueStore::TwoQCache::_rm_buffer(Buffer *b) +{ + dout(20) << __func__ << " " << *b << dendl; + if (!b->is_empty()) { + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + ceph_assert(buffer_list_bytes[b->cache_private] >= b->length); + buffer_list_bytes[b->cache_private] -= b->length; + } + switch (b->cache_private) { + case BUFFER_WARM_IN: + buffer_warm_in.erase(buffer_warm_in.iterator_to(*b)); + break; + case BUFFER_WARM_OUT: + buffer_warm_out.erase(buffer_warm_out.iterator_to(*b)); + break; + case BUFFER_HOT: + buffer_hot.erase(buffer_hot.iterator_to(*b)); + break; + default: + ceph_abort_msg("bad cache_private"); + } +} + +void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b) +{ + TwoQCache *src = static_cast<TwoQCache*>(srcc); + src->_rm_buffer(b); + + // preserve which list we're on (even if we can't preserve the order!) + switch (b->cache_private) { + case BUFFER_WARM_IN: + ceph_assert(!b->is_empty()); + buffer_warm_in.push_back(*b); + break; + case BUFFER_WARM_OUT: + ceph_assert(b->is_empty()); + buffer_warm_out.push_back(*b); + break; + case BUFFER_HOT: + ceph_assert(!b->is_empty()); + buffer_hot.push_back(*b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + if (!b->is_empty()) { + buffer_bytes += b->length; + buffer_list_bytes[b->cache_private] += b->length; + } +} + +void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta) +{ + dout(20) << __func__ << " delta " << delta << " on " << *b << dendl; + if (!b->is_empty()) { + ceph_assert((int64_t)buffer_bytes + delta >= 0); + buffer_bytes += delta; + ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0); + buffer_list_bytes[b->cache_private] += delta; + } +} + +void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max) +{ + dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max + << " buffers " << buffer_bytes << " / " << buffer_max + << dendl; + + _audit("trim start"); + + // buffers + if (buffer_bytes > buffer_max) { + uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio; + uint64_t khot = buffer_max - kin; + + // pre-calculate kout based on average buffer size too, + // which is typical(the warm_in and hot lists may change later) + uint64_t kout = 0; + uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size(); + if (buffer_num) { + uint64_t buffer_avg_size = buffer_bytes / buffer_num; + ceph_assert(buffer_avg_size); + uint64_t calculated_buffer_num = buffer_max / buffer_avg_size; + kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio; + } + + if (buffer_list_bytes[BUFFER_HOT] < khot) { + // hot is small, give slack to warm_in + kin += khot - buffer_list_bytes[BUFFER_HOT]; + } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) { + // warm_in is small, give slack to hot + khot += kin - buffer_list_bytes[BUFFER_WARM_IN]; + } + + // adjust warm_in list + int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin; + uint64_t evicted = 0; + + while (to_evict_bytes > 0) { + auto p = buffer_warm_in.rbegin(); + if (p == buffer_warm_in.rend()) { + // stop if warm_in list is now empty + break; + } + + Buffer *b = &*p; + ceph_assert(b->is_clean()); + dout(20) << __func__ << " 
buffer_warm_in -> out " << *b << dendl; + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length); + buffer_list_bytes[BUFFER_WARM_IN] -= b->length; + to_evict_bytes -= b->length; + evicted += b->length; + b->state = Buffer::STATE_EMPTY; + b->data.clear(); + buffer_warm_in.erase(buffer_warm_in.iterator_to(*b)); + buffer_warm_out.push_front(*b); + b->cache_private = BUFFER_WARM_OUT; + } + + if (evicted > 0) { + dout(20) << __func__ << " evicted " << byte_u_t(evicted) + << " from warm_in list, done evicting warm_in buffers" + << dendl; + } + + // adjust hot list + to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot; + evicted = 0; + + while (to_evict_bytes > 0) { + auto p = buffer_hot.rbegin(); + if (p == buffer_hot.rend()) { + // stop if hot list is now empty + break; + } + + Buffer *b = &*p; + dout(20) << __func__ << " buffer_hot rm " << *b << dendl; + ceph_assert(b->is_clean()); + // adjust evict size before buffer goes invalid + to_evict_bytes -= b->length; + evicted += b->length; + b->space->_rm_buffer(this, b); + } + + if (evicted > 0) { + dout(20) << __func__ << " evicted " << byte_u_t(evicted) + << " from hot list, done evicting hot buffers" + << dendl; + } + + // adjust warm out list too, if necessary + int64_t num = buffer_warm_out.size() - kout; + while (num-- > 0) { + Buffer *b = &*buffer_warm_out.rbegin(); + ceph_assert(b->is_empty()); + dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl; + b->space->_rm_buffer(this, b); + } + } + + // onodes + if (onode_max >= onode_lru.size() || + last_pinned == onode_lru.begin()) { + return; // don't even try + } + uint64_t num = onode_lru.size() - onode_max; + + auto p = last_pinned; + last_pinned = onode_lru.end(); + ceph_assert(p != onode_lru.begin()); + --p; + int skipped = 0; + int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned; + while (num > 0) { + Onode *o = &*p; + dout(20) << __func__ << " considering " << o << dendl; + int refs = o->nref.load(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs; skipping" << dendl; + if (++skipped >= max_skipped) { + dout(15) << __func__ << " maximum skip pinned reached; stopping with " + << num << " left to trim" << dendl; + last_pinned = p; + break; + } + + if (p == onode_lru.begin()) { + break; + } else { + p--; + num--; + continue; + } + } + dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<<onode_lru.size()<< dendl; + if (p != onode_lru.begin()) { + _onode_lru_erase(p--); + } else { + _onode_lru_erase(p); + num = 1; // fake num to end the loop + // in we might still have some pinned onodes + } + o->get(); // paranoia + o->c->onode_map.remove(o->oid); + o->put(); + --num; + } +} + +#ifdef DEBUG_CACHE +void BlueStore::TwoQCache::_audit(const char *when) +{ + dout(10) << __func__ << " " << when << " start" << dendl; + uint64_t s = 0; + for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) { + s += i->length; + } + + uint64_t hot_bytes = s; + if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) { + derr << __func__ << " hot_list_bytes " + << buffer_list_bytes[BUFFER_HOT] + << " != actual " << hot_bytes + << dendl; + ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]); + } + + for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) { + s += i->length; + } + + uint64_t warm_in_bytes = s - hot_bytes; + if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) { + derr << __func__ << " warm_in_list_bytes " + << 
buffer_list_bytes[BUFFER_WARM_IN] + << " != actual " << warm_in_bytes + << dendl; + ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]); + } + + if (s != buffer_bytes) { + derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s + << dendl; + ceph_assert(s == buffer_bytes); + } + + dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes + << " ok" << dendl; +} +#endif + + +// BufferSpace + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") " + +void BlueStore::BufferSpace::_clear(Cache* cache) +{ + // note: we already hold cache->lock + ldout(cache->cct, 20) << __func__ << dendl; + while (!buffer_map.empty()) { + _rm_buffer(cache, buffer_map.begin()); + } +} + +int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length) +{ + // note: we already hold cache->lock + ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length + << std::dec << dendl; + int cache_private = 0; + cache->_audit("discard start"); + auto i = _data_lower_bound(offset); + uint32_t end = offset + length; + while (i != buffer_map.end()) { + Buffer *b = i->second.get(); + if (b->offset >= end) { + break; + } + if (b->cache_private > cache_private) { + cache_private = b->cache_private; + } + if (b->offset < offset) { + int64_t front = offset - b->offset; + if (b->end() > end) { + // drop middle (split) + uint32_t tail = b->end() - end; + if (b->data.length()) { + bufferlist bl; + bl.substr_of(b->data, b->length - tail, tail); + Buffer *nb = new Buffer(this, b->state, b->seq, end, bl); + nb->maybe_rebuild(); + _add_buffer(cache, nb, 0, b); + } else { + _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail), + 0, b); + } + if (!b->is_writing()) { + cache->_adjust_buffer_size(b, front - (int64_t)b->length); + } + b->truncate(front); + b->maybe_rebuild(); + cache->_audit("discard end 1"); + break; + } else { + // drop tail + if (!b->is_writing()) { + cache->_adjust_buffer_size(b, front - (int64_t)b->length); + } + b->truncate(front); + b->maybe_rebuild(); + ++i; + continue; + } + } + if (b->end() <= end) { + // drop entire buffer + _rm_buffer(cache, i++); + continue; + } + // drop front + uint32_t keep = b->end() - end; + if (b->data.length()) { + bufferlist bl; + bl.substr_of(b->data, b->length - keep, keep); + Buffer *nb = new Buffer(this, b->state, b->seq, end, bl); + nb->maybe_rebuild(); + _add_buffer(cache, nb, 0, b); + } else { + _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b); + } + _rm_buffer(cache, i); + cache->_audit("discard end 2"); + break; + } + return cache_private; +} + +void BlueStore::BufferSpace::read( + Cache* cache, + uint32_t offset, + uint32_t length, + BlueStore::ready_regions_t& res, + interval_set<uint32_t>& res_intervals, + int flags) +{ + res.clear(); + res_intervals.clear(); + uint32_t want_bytes = length; + uint32_t end = offset + length; + + { + std::lock_guard l(cache->lock); + for (auto i = _data_lower_bound(offset); + i != buffer_map.end() && offset < end && i->first < end; + ++i) { + Buffer *b = i->second.get(); + ceph_assert(b->end() > offset); + + bool val = false; + if (flags & BYPASS_CLEAN_CACHE) + val = b->is_writing(); + else + val = b->is_writing() || b->is_clean(); + if (val) { + if (b->offset < offset) { + uint32_t skip = offset - b->offset; + uint32_t l = min(length, b->length - skip); + res[offset].substr_of(b->data, skip, l); + res_intervals.insert(offset, l); + offset += l; + length -= l; + if 
(!b->is_writing()) { + cache->_touch_buffer(b); + } + continue; + } + if (b->offset > offset) { + uint32_t gap = b->offset - offset; + if (length <= gap) { + break; + } + offset += gap; + length -= gap; + } + if (!b->is_writing()) { + cache->_touch_buffer(b); + } + if (b->length > length) { + res[offset].substr_of(b->data, 0, length); + res_intervals.insert(offset, length); + break; + } else { + res[offset].append(b->data); + res_intervals.insert(offset, b->length); + if (b->length == length) + break; + offset += b->length; + length -= b->length; + } + } + } + } + + uint64_t hit_bytes = res_intervals.size(); + ceph_assert(hit_bytes <= want_bytes); + uint64_t miss_bytes = want_bytes - hit_bytes; + cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes); + cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes); +} + +void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq) +{ + auto i = writing.begin(); + while (i != writing.end()) { + if (i->seq > seq) { + break; + } + if (i->seq < seq) { + ++i; + continue; + } + + Buffer *b = &*i; + ceph_assert(b->is_writing()); + + if (b->flags & Buffer::FLAG_NOCACHE) { + writing.erase(i++); + ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl; + buffer_map.erase(b->offset); + } else { + b->state = Buffer::STATE_CLEAN; + writing.erase(i++); + b->maybe_rebuild(); + b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); + cache->_add_buffer(b, 1, nullptr); + ldout(cache->cct, 20) << __func__ << " added " << *b << dendl; + } + } + + cache->_audit("finish_write end"); +} + +void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r) +{ + std::lock_guard lk(cache->lock); + if (buffer_map.empty()) + return; + + auto p = --buffer_map.end(); + while (true) { + if (p->second->end() <= pos) + break; + + if (p->second->offset < pos) { + ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl; + size_t left = pos - p->second->offset; + size_t right = p->second->length - left; + if (p->second->data.length()) { + bufferlist bl; + bl.substr_of(p->second->data, left, right); + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl), + 0, p->second.get()); + } else { + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right), + 0, p->second.get()); + } + cache->_adjust_buffer_size(p->second.get(), -right); + p->second->truncate(left); + break; + } + + ceph_assert(p->second->end() > pos); + ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl; + if (p->second->data.length()) { + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->data), + 0, p->second.get()); + } else { + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->length), + 0, p->second.get()); + } + if (p == buffer_map.begin()) { + _rm_buffer(cache, p); + break; + } else { + _rm_buffer(cache, p--); + } + } + ceph_assert(writing.empty()); +} + +// OnodeSpace + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") " + +BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o) +{ + std::lock_guard l(cache->lock); + auto p = onode_map.find(oid); + if (p != onode_map.end()) { + ldout(cache->cct, 30) << __func__ << " " << oid << " " << o + << " raced, returning existing " << p->second + << dendl; + return p->second; + } + ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << 
dendl; + onode_map[oid] = o; + cache->_add_onode(o, 1); + return o; +} + +BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) +{ + ldout(cache->cct, 30) << __func__ << dendl; + OnodeRef o; + bool hit = false; + + { + std::lock_guard l(cache->lock); + ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl; + } else { + ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second + << dendl; + cache->_touch_onode(p->second); + hit = true; + o = p->second; + } + } + + if (hit) { + cache->logger->inc(l_bluestore_onode_hits); + } else { + cache->logger->inc(l_bluestore_onode_misses); + } + return o; +} + +void BlueStore::OnodeSpace::clear() +{ + std::lock_guard l(cache->lock); + ldout(cache->cct, 10) << __func__ << dendl; + for (auto &p : onode_map) { + cache->_rm_onode(p.second); + } + onode_map.clear(); +} + +bool BlueStore::OnodeSpace::empty() +{ + std::lock_guard l(cache->lock); + return onode_map.empty(); +} + +void BlueStore::OnodeSpace::rename( + OnodeRef& oldo, + const ghobject_t& old_oid, + const ghobject_t& new_oid, + const mempool::bluestore_cache_meta::string& new_okey) +{ + std::lock_guard l(cache->lock); + ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid + << dendl; + ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn; + po = onode_map.find(old_oid); + pn = onode_map.find(new_oid); + ceph_assert(po != pn); + + ceph_assert(po != onode_map.end()); + if (pn != onode_map.end()) { + ldout(cache->cct, 30) << __func__ << " removing target " << pn->second + << dendl; + cache->_rm_onode(pn->second); + onode_map.erase(pn); + } + OnodeRef o = po->second; + + // install a non-existent onode at old location + oldo.reset(new Onode(o->c, old_oid, o->key)); + po->second = oldo; + cache->_add_onode(po->second, 1); + + // add at new position and fix oid, key + onode_map.insert(make_pair(new_oid, o)); + cache->_touch_onode(o); + o->oid = new_oid; + o->key = new_okey; +} + +bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f) +{ + std::lock_guard l(cache->lock); + ldout(cache->cct, 20) << __func__ << dendl; + for (auto& i : onode_map) { + if (f(i.second)) { + return true; + } + } + return false; +} + +template <int LogLevelV = 30> +void BlueStore::OnodeSpace::dump(CephContext *cct) +{ + for (auto& i : onode_map) { + ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl; + } +} + +// SharedBlob + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") " + +ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb) +{ + out << "SharedBlob(" << &sb; + + if (sb.loaded) { + out << " loaded " << *sb.persistent; + } else { + out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec; + } + return out << ")"; +} + +BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll) + : coll(_coll), sbid_unloaded(i) +{ + ceph_assert(sbid_unloaded > 0); + if (get_cache()) { + get_cache()->add_blob(); + } +} + +BlueStore::SharedBlob::~SharedBlob() +{ + if (loaded && persistent) { + delete persistent; + } +} + +void BlueStore::SharedBlob::put() +{ + if (--nref == 0) { + ldout(coll->store->cct, 20) << __func__ << " " << this + << " removing self from set " << get_parent() + << dendl; + again: + auto coll_snap = coll; + if (coll_snap) { + std::lock_guard l(coll_snap->cache->lock); + if (coll_snap != coll) { + goto again; + } + if 
(!coll_snap->shared_blob_set.remove(this, true)) { + // race with lookup + return; + } + bc._clear(coll_snap->cache); + coll_snap->cache->rm_blob(); + } + delete this; + } +} + +void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length) +{ + ceph_assert(persistent); + persistent->ref_map.get(offset, length); +} + +void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length, + PExtentVector *r, + bool *unshare) +{ + ceph_assert(persistent); + persistent->ref_map.put(offset, length, r, + unshare && !*unshare ? unshare : nullptr); +} + +void BlueStore::SharedBlob::finish_write(uint64_t seq) +{ + while (true) { + Cache *cache = coll->cache; + std::lock_guard l(cache->lock); + if (coll->cache != cache) { + ldout(coll->store->cct, 20) << __func__ + << " raced with sb cache update, was " << cache + << ", now " << coll->cache << ", retrying" + << dendl; + continue; + } + bc._finish_write(cache, seq); + break; + } +} + +// SharedBlobSet + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") " + +template <int LogLevelV = 30> +void BlueStore::SharedBlobSet::dump(CephContext *cct) +{ + std::lock_guard l(lock); + for (auto& i : sb_map) { + ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl; + } +} + +// Blob + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.blob(" << this << ") " + +ostream& operator<<(ostream& out, const BlueStore::Blob& b) +{ + out << "Blob(" << &b; + if (b.is_spanning()) { + out << " spanning " << b.id; + } + out << " " << b.get_blob() << " " << b.get_blob_use_tracker(); + if (b.shared_blob) { + out << " " << *b.shared_blob; + } else { + out << " (shared_blob=NULL)"; + } + out << ")"; + return out; +} + +void BlueStore::Blob::discard_unallocated(Collection *coll) +{ + if (get_blob().is_shared()) { + return; + } + if (get_blob().is_compressed()) { + bool discard = false; + bool all_invalid = true; + for (auto e : get_blob().get_extents()) { + if (!e.is_valid()) { + discard = true; + } else { + all_invalid = false; + } + } + ceph_assert(discard == all_invalid); // in case of compressed blob all + // or none pextents are invalid. + if (discard) { + shared_blob->bc.discard(shared_blob->get_cache(), 0, + get_blob().get_logical_length()); + } + } else { + size_t pos = 0; + for (auto e : get_blob().get_extents()) { + if (!e.is_valid()) { + ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos + << "~" << e.length + << std::dec << dendl; + shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length); + } + pos += e.length; + } + if (get_blob().can_prune_tail()) { + dirty_blob().prune_tail(); + used_in_blob.prune_tail(get_blob().get_ondisk_length()); + auto cct = coll->store->cct; //used by dout + dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl; + } + } +} + +void BlueStore::Blob::get_ref( + Collection *coll, + uint32_t offset, + uint32_t length) +{ + // Caller has to initialize Blob's logical length prior to increment + // references. Otherwise one is neither unable to determine required + // amount of counters in case of per-au tracking nor obtain min_release_size + // for single counter mode. 
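+  // used_in_blob is therefore initialized lazily on the first reference,
+  // using the blob's logical length and the release granularity derived
+  // from min_alloc_size below. (Illustrative: with per-au tracking, a
+  // 0x10000-byte blob and a 0x1000-byte min_release_size keep one counter
+  // per allocation unit, so put_ref() can tell which units go unreferenced.)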
+ ceph_assert(get_blob().get_logical_length() != 0); + auto cct = coll->store->cct; + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << " " << *this << dendl; + + if (used_in_blob.is_empty()) { + uint32_t min_release_size = + get_blob().get_release_size(coll->store->min_alloc_size); + uint64_t l = get_blob().get_logical_length(); + dout(20) << __func__ << " init 0x" << std::hex << l << ", " + << min_release_size << std::dec << dendl; + used_in_blob.init(l, min_release_size); + } + used_in_blob.get( + offset, + length); +} + +bool BlueStore::Blob::put_ref( + Collection *coll, + uint32_t offset, + uint32_t length, + PExtentVector *r) +{ + PExtentVector logical; + + auto cct = coll->store->cct; + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << " " << *this << dendl; + + bool empty = used_in_blob.put( + offset, + length, + &logical); + r->clear(); + // nothing to release + if (!empty && logical.empty()) { + return false; + } + + bluestore_blob_t& b = dirty_blob(); + return b.release_extents(empty, logical, r); +} + +bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size, + uint32_t target_blob_size, + uint32_t b_offset, + uint32_t *length0) { + ceph_assert(min_alloc_size); + ceph_assert(target_blob_size); + if (!get_blob().is_mutable()) { + return false; + } + + uint32_t length = *length0; + uint32_t end = b_offset + length; + + // Currently for the sake of simplicity we omit blob reuse if data is + // unaligned with csum chunk. Later we can perform padding if needed. + if (get_blob().has_csum() && + ((b_offset % get_blob().get_csum_chunk_size()) != 0 || + (end % get_blob().get_csum_chunk_size()) != 0)) { + return false; + } + + auto blen = get_blob().get_logical_length(); + uint32_t new_blen = blen; + + // make sure target_blob_size isn't less than current blob len + target_blob_size = std::max(blen, target_blob_size); + + if (b_offset >= blen) { + // new data totally stands out of the existing blob + new_blen = end; + } else { + // new data overlaps with the existing blob + new_blen = std::max(blen, end); + + uint32_t overlap = 0; + if (new_blen > blen) { + overlap = blen - b_offset; + } else { + overlap = length; + } + + if (!get_blob().is_unallocated(b_offset, overlap)) { + // abort if any piece of the overlap has already been allocated + return false; + } + } + + if (new_blen > blen) { + int64_t overflow = int64_t(new_blen) - target_blob_size; + // Unable to decrease the provided length to fit into max_blob_size + if (overflow >= length) { + return false; + } + + // FIXME: in some cases we could reduce unused resolution + if (get_blob().has_unused()) { + return false; + } + + if (overflow > 0) { + new_blen -= overflow; + length -= overflow; + *length0 = length; + } + + if (new_blen > blen) { + dirty_blob().add_tail(new_blen); + used_in_blob.add_tail(new_blen, + get_blob().get_release_size(min_alloc_size)); + } + } + return true; +} + +void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r) +{ + auto cct = coll->store->cct; //used by dout + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " start " << *this << dendl; + ceph_assert(blob.can_split()); + ceph_assert(used_in_blob.can_split()); + bluestore_blob_t &lb = dirty_blob(); + bluestore_blob_t &rb = r->dirty_blob(); + + used_in_blob.split( + blob_offset, + &(r->used_in_blob)); + + lb.split(blob_offset, rb); + shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc); + + dout(10) << 
__func__ << " 0x" << std::hex << blob_offset << std::dec + << " finish " << *this << dendl; + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " and " << *r << dendl; +} + +#ifndef CACHE_BLOB_BL +void BlueStore::Blob::decode( + Collection *coll, + bufferptr::const_iterator& p, + uint64_t struct_v, + uint64_t* sbid, + bool include_ref_map) +{ + denc(blob, p, struct_v); + if (blob.is_shared()) { + denc(*sbid, p); + } + if (include_ref_map) { + if (struct_v > 1) { + used_in_blob.decode(p); + } else { + used_in_blob.clear(); + bluestore_extent_ref_map_t legacy_ref_map; + legacy_ref_map.decode(p); + for (auto r : legacy_ref_map.ref_map) { + get_ref( + coll, + r.first, + r.second.refs * r.second.length); + } + } + } +} +#endif + +// Extent + +ostream& operator<<(ostream& out, const BlueStore::Extent& e) +{ + return out << std::hex << "0x" << e.logical_offset << "~" << e.length + << ": 0x" << e.blob_offset << "~" << e.length << std::dec + << " " << *e.blob; +} + +// OldExtent +BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c, + uint32_t lo, + uint32_t o, + uint32_t l, + BlobRef& b) { + OldExtent* oe = new OldExtent(lo, o, l, b); + b->put_ref(c.get(), o, l, &(oe->r)); + oe->blob_empty = !b->is_referenced(); + return oe; +} + +// ExtentMap + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") " + +BlueStore::ExtentMap::ExtentMap(Onode *o) + : onode(o), + inline_bl( + o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) { +} + +void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc, + CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff, + uint64_t& length, uint64_t& dstoff) { + + auto cct = onode->c->store->cct; + bool inject_21040 = + cct->_conf->bluestore_debug_inject_bug21040; + vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size()); + for (auto& e : oldo->extent_map.extent_map) { + e.blob->last_encoded_id = -1; + } + + int n = 0; + uint64_t end = srcoff + length; + uint32_t dirty_range_begin = 0; + uint32_t dirty_range_end = 0; + bool src_dirty = false; + for (auto ep = oldo->extent_map.seek_lextent(srcoff); + ep != oldo->extent_map.extent_map.end(); + ++ep) { + auto& e = *ep; + if (e.logical_offset >= end) { + break; + } + dout(20) << __func__ << " src " << e << dendl; + BlobRef cb; + bool blob_duped = true; + if (e.blob->last_encoded_id >= 0) { + cb = id_to_blob[e.blob->last_encoded_id]; + blob_duped = false; + } else { + // dup the blob + const bluestore_blob_t& blob = e.blob->get_blob(); + // make sure it is shared + if (!blob.is_shared()) { + c->make_blob_shared(b->_assign_blobid(txc), e.blob); + if (!inject_21040 && !src_dirty) { + src_dirty = true; + dirty_range_begin = e.logical_offset; + } else if (inject_21040 && + dirty_range_begin == 0 && dirty_range_end == 0) { + dirty_range_begin = e.logical_offset; + } + ceph_assert(e.logical_end() > 0); + // -1 to exclude next potential shard + dirty_range_end = e.logical_end() - 1; + } else { + c->load_shared_blob(e.blob->shared_blob); + } + cb = new Blob(); + e.blob->last_encoded_id = n; + id_to_blob[n] = cb; + e.blob->dup(*cb); + // bump the extent refs on the copied blob's extents + for (auto p : blob.get_extents()) { + if (p.is_valid()) { + e.blob->shared_blob->get_ref(p.offset, p.length); + } + } + txc->write_shared_blob(e.blob->shared_blob); + dout(20) << __func__ << " new " << *cb << dendl; + } + + int skip_front, skip_back; + if (e.logical_offset < srcoff) { + skip_front = srcoff - e.logical_offset; + } else 
{ + skip_front = 0; + } + if (e.logical_end() > end) { + skip_back = e.logical_end() - end; + } else { + skip_back = 0; + } + + Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff, + e.blob_offset + skip_front, e.length - skip_front - skip_back, cb); + newo->extent_map.extent_map.insert(*ne); + ne->blob->get_ref(c.get(), ne->blob_offset, ne->length); + // fixme: we may leave parts of new blob unreferenced that could + // be freed (relative to the shared_blob). + txc->statfs_delta.stored() += ne->length; + if (e.blob->get_blob().is_compressed()) { + txc->statfs_delta.compressed_original() += ne->length; + if (blob_duped) { + txc->statfs_delta.compressed() += + cb->get_blob().get_compressed_payload_length(); + } + } + dout(20) << __func__ << " dst " << *ne << dendl; + ++n; + } + if ((!inject_21040 && src_dirty) || + (inject_21040 && dirty_range_end > dirty_range_begin)) { + oldo->extent_map.dirty_range(dirty_range_begin, + dirty_range_end - dirty_range_begin); + txc->write_onode(oldo); + } + txc->write_onode(newo); + + if (dstoff + length > newo->onode.size) { + newo->onode.size = dstoff + length; + } + newo->extent_map.dirty_range(dstoff, length); +} +void BlueStore::ExtentMap::update(KeyValueDB::Transaction t, + bool force) +{ + auto cct = onode->c->store->cct; //used by dout + dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl; + if (onode->onode.extent_map_shards.empty()) { + if (inline_bl.length() == 0) { + unsigned n; + // we need to encode inline_bl to measure encoded length + bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n); + inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl); + ceph_assert(!never_happen); + size_t len = inline_bl.length(); + dout(20) << __func__ << " inline shard " << len << " bytes from " << n + << " extents" << dendl; + if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) { + request_reshard(0, OBJECT_MAX_SIZE); + return; + } + } + // will persist in the onode key. 
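+  // i.e. the unsharded (inline) extent map is carried in the onode value
+  // itself; only once it outgrows bluestore_extent_map_shard_max_size is a
+  // reshard requested and the map split into per-shard PREFIX_OBJ keys.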
+ } else { + // pending shard update + struct dirty_shard_t { + Shard *shard; + bufferlist bl; + dirty_shard_t(Shard *s) : shard(s) {} + }; + vector<dirty_shard_t> encoded_shards; + // allocate slots for all shards in a single call instead of + // doing multiple allocations - one per each dirty shard + encoded_shards.reserve(shards.size()); + + auto p = shards.begin(); + auto prev_p = p; + while (p != shards.end()) { + ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset); + auto n = p; + ++n; + if (p->dirty) { + uint32_t endoff; + if (n == shards.end()) { + endoff = OBJECT_MAX_SIZE; + } else { + endoff = n->shard_info->offset; + } + encoded_shards.emplace_back(dirty_shard_t(&(*p))); + bufferlist& bl = encoded_shards.back().bl; + if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset, + bl, &p->extents)) { + if (force) { + derr << __func__ << " encode_some needs reshard" << dendl; + ceph_assert(!force); + } + } + size_t len = bl.length(); + + dout(20) << __func__ << " shard 0x" << std::hex + << p->shard_info->offset << std::dec << " is " << len + << " bytes (was " << p->shard_info->bytes << ") from " + << p->extents << " extents" << dendl; + + if (!force) { + if (len > cct->_conf->bluestore_extent_map_shard_max_size) { + // we are big; reshard ourselves + request_reshard(p->shard_info->offset, endoff); + } + // avoid resharding the trailing shard, even if it is small + else if (n != shards.end() && + len < g_conf()->bluestore_extent_map_shard_min_size) { + ceph_assert(endoff != OBJECT_MAX_SIZE); + if (p == shards.begin()) { + // we are the first shard, combine with next shard + request_reshard(p->shard_info->offset, endoff + 1); + } else { + // combine either with the previous shard or the next, + // whichever is smaller + if (prev_p->shard_info->bytes > n->shard_info->bytes) { + request_reshard(p->shard_info->offset, endoff + 1); + } else { + request_reshard(prev_p->shard_info->offset, endoff); + } + } + } + } + } + prev_p = p; + p = n; + } + if (needs_reshard()) { + return; + } + + // schedule DB update for dirty shards + string key; + for (auto& it : encoded_shards) { + it.shard->dirty = false; + it.shard->shard_info->bytes = it.bl.length(); + generate_extent_shard_key_and_apply( + onode->key, + it.shard->shard_info->offset, + &key, + [&](const string& final_key) { + t->set(PREFIX_OBJ, final_key, it.bl); + } + ); + } + } +} + +bid_t BlueStore::ExtentMap::allocate_spanning_blob_id() +{ + if (spanning_blob_map.empty()) + return 0; + bid_t bid = spanning_blob_map.rbegin()->first + 1; + // bid is valid and available. 
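+  // ...unless incrementing the largest id wrapped around: a negative bid
+  // means the id space overflowed, and we fall back to probing for a free
+  // slot starting from a random id below.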
+ if (bid >= 0) + return bid; + // Find next unused bid; + bid = rand() % (numeric_limits<bid_t>::max() + 1); + const auto begin_bid = bid; + do { + if (!spanning_blob_map.count(bid)) + return bid; + else { + bid++; + if (bid < 0) bid = 0; + } + } while (bid != begin_bid); + auto cct = onode->c->store->cct; // used by dout + _dump_onode<0>(cct, *onode); + ceph_abort_msg("no available blob id"); +} + +void BlueStore::ExtentMap::reshard( + KeyValueDB *db, + KeyValueDB::Transaction t) +{ + auto cct = onode->c->store->cct; // used by dout + + dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << "," + << needs_reshard_end << ")" << std::dec + << " of " << onode->onode.extent_map_shards.size() + << " shards on " << onode->oid << dendl; + for (auto& p : spanning_blob_map) { + dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second + << dendl; + } + // determine shard index range + unsigned si_begin = 0, si_end = 0; + if (!shards.empty()) { + while (si_begin + 1 < shards.size() && + shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) { + ++si_begin; + } + needs_reshard_begin = shards[si_begin].shard_info->offset; + for (si_end = si_begin; si_end < shards.size(); ++si_end) { + if (shards[si_end].shard_info->offset >= needs_reshard_end) { + needs_reshard_end = shards[si_end].shard_info->offset; + break; + } + } + if (si_end == shards.size()) { + needs_reshard_end = OBJECT_MAX_SIZE; + } + dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")" + << " over 0x[" << std::hex << needs_reshard_begin << "," + << needs_reshard_end << ")" << std::dec << dendl; + } + + fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin)); + + // we may need to fault in a larger interval later must have all + // referring extents for spanning blobs loaded in order to have + // accurate use_tracker values. + uint32_t spanning_scan_begin = needs_reshard_begin; + uint32_t spanning_scan_end = needs_reshard_end; + + // remove old keys + string key; + for (unsigned i = si_begin; i < si_end; ++i) { + generate_extent_shard_key_and_apply( + onode->key, shards[i].shard_info->offset, &key, + [&](const string& final_key) { + t->rmkey(PREFIX_OBJ, final_key); + } + ); + } + + // calculate average extent size + unsigned bytes = 0; + unsigned extents = 0; + if (onode->onode.extent_map_shards.empty()) { + bytes = inline_bl.length(); + extents = extent_map.size(); + } else { + for (unsigned i = si_begin; i < si_end; ++i) { + bytes += shards[i].shard_info->bytes; + extents += shards[i].extents; + } + } + unsigned target = cct->_conf->bluestore_extent_map_shard_target_size; + unsigned slop = target * + cct->_conf->bluestore_extent_map_shard_target_size_slop; + unsigned extent_avg = bytes / std::max(1u, extents); + dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target + << ", slop " << slop << dendl; + + // reshard + unsigned estimate = 0; + unsigned offset = needs_reshard_begin; + vector<bluestore_onode_t::shard_info> new_shard_info; + unsigned max_blob_end = 0; + Extent dummy(needs_reshard_begin); + for (auto e = extent_map.lower_bound(dummy); + e != extent_map.end(); + ++e) { + if (e->logical_offset >= needs_reshard_end) { + break; + } + dout(30) << " extent " << *e << dendl; + + // disfavor shard boundaries that span a blob + bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset; + if (estimate && + estimate + extent_avg > target + (would_span ? 
slop : 0)) { + // new shard + if (offset == needs_reshard_begin) { + new_shard_info.emplace_back(bluestore_onode_t::shard_info()); + new_shard_info.back().offset = offset; + dout(20) << __func__ << " new shard 0x" << std::hex << offset + << std::dec << dendl; + } + offset = e->logical_offset; + new_shard_info.emplace_back(bluestore_onode_t::shard_info()); + new_shard_info.back().offset = offset; + dout(20) << __func__ << " new shard 0x" << std::hex << offset + << std::dec << dendl; + estimate = 0; + } + estimate += extent_avg; + unsigned bs = e->blob_start(); + if (bs < spanning_scan_begin) { + spanning_scan_begin = bs; + } + uint32_t be = e->blob_end(); + if (be > max_blob_end) { + max_blob_end = be; + } + if (be > spanning_scan_end) { + spanning_scan_end = be; + } + } + if (new_shard_info.empty() && (si_begin > 0 || + si_end < shards.size())) { + // we resharded a partial range; we must produce at least one output + // shard + new_shard_info.emplace_back(bluestore_onode_t::shard_info()); + new_shard_info.back().offset = needs_reshard_begin; + dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin + << std::dec << " (singleton degenerate case)" << dendl; + } + + auto& sv = onode->onode.extent_map_shards; + dout(20) << __func__ << " new " << new_shard_info << dendl; + dout(20) << __func__ << " old " << sv << dendl; + if (sv.empty()) { + // no old shards to keep + sv.swap(new_shard_info); + init_shards(true, true); + } else { + // splice in new shards + sv.erase(sv.begin() + si_begin, sv.begin() + si_end); + shards.erase(shards.begin() + si_begin, shards.begin() + si_end); + sv.insert( + sv.begin() + si_begin, + new_shard_info.begin(), + new_shard_info.end()); + shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard()); + si_end = si_begin + new_shard_info.size(); + + ceph_assert(sv.size() == shards.size()); + + // note that we need to update every shard_info of shards here, + // as sv might have been totally re-allocated above + for (unsigned i = 0; i < shards.size(); i++) { + shards[i].shard_info = &sv[i]; + } + + // mark newly added shards as dirty + for (unsigned i = si_begin; i < si_end; ++i) { + shards[i].loaded = true; + shards[i].dirty = true; + } + } + dout(20) << __func__ << " fin " << sv << dendl; + inline_bl.clear(); + + if (sv.empty()) { + // no more shards; unspan all previously spanning blobs + auto p = spanning_blob_map.begin(); + while (p != spanning_blob_map.end()) { + p->second->id = -1; + dout(30) << __func__ << " un-spanning " << *p->second << dendl; + p = spanning_blob_map.erase(p); + } + } else { + // identify new spanning blobs + dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex + << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl; + if (spanning_scan_begin < needs_reshard_begin) { + fault_range(db, spanning_scan_begin, + needs_reshard_begin - spanning_scan_begin); + } + if (spanning_scan_end > needs_reshard_end) { + fault_range(db, needs_reshard_end, + spanning_scan_end - needs_reshard_end); + } + auto sp = sv.begin() + si_begin; + auto esp = sv.end(); + unsigned shard_start = sp->offset; + unsigned shard_end; + ++sp; + if (sp == esp) { + shard_end = OBJECT_MAX_SIZE; + } else { + shard_end = sp->offset; + } + Extent dummy(needs_reshard_begin); + + bool was_too_many_blobs_check = false; + auto too_many_blobs_threshold = + g_conf()->bluestore_debug_too_many_blobs_threshold; + auto& dumped_onodes = onode->c->cache->dumped_onodes; + decltype(onode->c->cache->dumped_onodes)::value_type* oid_slot = 
nullptr; + decltype(onode->c->cache->dumped_onodes)::value_type* oldest_slot = nullptr; + + for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) { + if (e->logical_offset >= needs_reshard_end) { + break; + } + dout(30) << " extent " << *e << dendl; + while (e->logical_offset >= shard_end) { + shard_start = shard_end; + ceph_assert(sp != esp); + ++sp; + if (sp == esp) { + shard_end = OBJECT_MAX_SIZE; + } else { + shard_end = sp->offset; + } + dout(30) << __func__ << " shard 0x" << std::hex << shard_start + << " to 0x" << shard_end << std::dec << dendl; + } + + if (e->blob_escapes_range(shard_start, shard_end - shard_start)) { + if (!e->blob->is_spanning()) { + // We have two options: (1) split the blob into pieces at the + // shard boundaries (and adjust extents accordingly), or (2) + // mark it spanning. We prefer to cut the blob if we can. Note that + // we may have to split it multiple times--potentially at every + // shard boundary. + bool must_span = false; + BlobRef b = e->blob; + if (b->can_split()) { + uint32_t bstart = e->blob_start(); + uint32_t bend = e->blob_end(); + for (const auto& sh : shards) { + if (bstart < sh.shard_info->offset && + bend > sh.shard_info->offset) { + uint32_t blob_offset = sh.shard_info->offset - bstart; + if (b->can_split_at(blob_offset)) { + dout(20) << __func__ << " splitting blob, bstart 0x" + << std::hex << bstart << " blob_offset 0x" + << blob_offset << std::dec << " " << *b << dendl; + b = split_blob(b, blob_offset, sh.shard_info->offset); + // switch b to the new right-hand side, in case it + // *also* has to get split. + bstart += blob_offset; + onode->c->store->logger->inc(l_bluestore_blob_split); + } else { + must_span = true; + break; + } + } + } + } else { + must_span = true; + } + if (must_span) { + auto bid = allocate_spanning_blob_id(); + b->id = bid; + spanning_blob_map[b->id] = b; + dout(20) << __func__ << " adding spanning " << *b << dendl; + if (!was_too_many_blobs_check && + too_many_blobs_threshold && + spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) { + + was_too_many_blobs_check = true; + for (size_t i = 0; i < dumped_onodes.size(); ++i) { + if (dumped_onodes[i].first == onode->oid) { + oid_slot = &dumped_onodes[i]; + break; + } + if (!oldest_slot || (oldest_slot && + dumped_onodes[i].second < oldest_slot->second)) { + oldest_slot = &dumped_onodes[i]; + } + } + } + } + } + } else { + if (e->blob->is_spanning()) { + spanning_blob_map.erase(e->blob->id); + e->blob->id = -1; + dout(30) << __func__ << " un-spanning " << *e->blob << dendl; + } + } + } + bool do_dump = (!oid_slot && was_too_many_blobs_check) || + (oid_slot && + (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60))); + if (do_dump) { + dout(0) << __func__ + << " spanning blob count exceeds threshold, " + << spanning_blob_map.size() << " spanning blobs" + << dendl; + _dump_onode<0>(cct, *onode); + if (oid_slot) { + oid_slot->second = mono_clock::now(); + } else { + ceph_assert(oldest_slot); + oldest_slot->first = onode->oid; + oldest_slot->second = mono_clock::now(); + } + } + } + + clear_needs_reshard(); +} + +bool BlueStore::ExtentMap::encode_some( + uint32_t offset, + uint32_t length, + bufferlist& bl, + unsigned *pn) +{ + auto cct = onode->c->store->cct; //used by dout + Extent dummy(offset); + auto start = extent_map.lower_bound(dummy); + uint32_t end = offset + length; + + __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. 
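+  // Each extent is written as a varint blobid whose low bits carry the
+  // BLOBID_FLAG_{SPANNING,CONTIGUOUS,ZEROOFFSET,SAMELENGTH} flags; only the
+  // fields those flags cannot elide follow (logical-offset delta,
+  // blob_offset, length), plus the blob itself the first time a
+  // non-spanning blob is referenced.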
+ + unsigned n = 0; + size_t bound = 0; + bool must_reshard = false; + for (auto p = start; + p != extent_map.end() && p->logical_offset < end; + ++p, ++n) { + ceph_assert(p->logical_offset >= offset); + p->blob->last_encoded_id = -1; + if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) { + dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << " hit new spanning blob " << *p << dendl; + request_reshard(p->blob_start(), p->blob_end()); + must_reshard = true; + } + if (!must_reshard) { + denc_varint(0, bound); // blobid + denc_varint(0, bound); // logical_offset + denc_varint(0, bound); // len + denc_varint(0, bound); // blob_offset + + p->blob->bound_encode( + bound, + struct_v, + p->blob->shared_blob->get_sbid(), + false); + } + } + if (must_reshard) { + return true; + } + + denc(struct_v, bound); + denc_varint(0, bound); // number of extents + + { + auto app = bl.get_contiguous_appender(bound); + denc(struct_v, app); + denc_varint(n, app); + if (pn) { + *pn = n; + } + + n = 0; + uint64_t pos = 0; + uint64_t prev_len = 0; + for (auto p = start; + p != extent_map.end() && p->logical_offset < end; + ++p, ++n) { + unsigned blobid; + bool include_blob = false; + if (p->blob->is_spanning()) { + blobid = p->blob->id << BLOBID_SHIFT_BITS; + blobid |= BLOBID_FLAG_SPANNING; + } else if (p->blob->last_encoded_id < 0) { + p->blob->last_encoded_id = n + 1; // so it is always non-zero + include_blob = true; + blobid = 0; // the decoder will infer the id from n + } else { + blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS; + } + if (p->logical_offset == pos) { + blobid |= BLOBID_FLAG_CONTIGUOUS; + } + if (p->blob_offset == 0) { + blobid |= BLOBID_FLAG_ZEROOFFSET; + } + if (p->length == prev_len) { + blobid |= BLOBID_FLAG_SAMELENGTH; + } else { + prev_len = p->length; + } + denc_varint(blobid, app); + if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) { + denc_varint_lowz(p->logical_offset - pos, app); + } + if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) { + denc_varint_lowz(p->blob_offset, app); + } + if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) { + denc_varint_lowz(p->length, app); + } + pos = p->logical_end(); + if (include_blob) { + p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false); + } + } + } + /*derr << __func__ << bl << dendl; + derr << __func__ << ":"; + bl.hexdump(*_dout); + *_dout << dendl; + */ + return false; +} + +unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl) +{ + auto cct = onode->c->store->cct; //used by dout + /* + derr << __func__ << ":"; + bl.hexdump(*_dout); + *_dout << dendl; + */ + + ceph_assert(bl.get_num_buffers() <= 1); + auto p = bl.front().begin_deep(); + __u8 struct_v; + denc(struct_v, p); + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level below. 
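+  // Each extent is introduced by a varint "blobid" word: the low bits carry
+  // the SPANNING/CONTIGUOUS/ZEROOFFSET/SAMELENGTH flags, and the bits above
+  // BLOBID_SHIFT_BITS hold either a spanning blob id or a 1-based reference
+  // to the extent at which the blob was first encoded (0 means the blob
+  // body itself follows the extent fields).  For example, a later reference
+  // to a non-spanning blob first seen at extent n=2, contiguous with the
+  // previous extent and with a zero blob_offset, decodes from
+  //   blobid = (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_CONTIGUOUS
+  //                                     | BLOBID_FLAG_ZEROOFFSET.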
+ ceph_assert(struct_v == 1 || struct_v == 2); + + uint32_t num; + denc_varint(num, p); + vector<BlobRef> blobs(num); + uint64_t pos = 0; + uint64_t prev_len = 0; + unsigned n = 0; + + while (!p.end()) { + Extent *le = new Extent(); + uint64_t blobid; + denc_varint(blobid, p); + if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) { + uint64_t gap; + denc_varint_lowz(gap, p); + pos += gap; + } + le->logical_offset = pos; + if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) { + denc_varint_lowz(le->blob_offset, p); + } else { + le->blob_offset = 0; + } + if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) { + denc_varint_lowz(prev_len, p); + } + le->length = prev_len; + + if (blobid & BLOBID_FLAG_SPANNING) { + dout(30) << __func__ << " getting spanning blob " + << (blobid >> BLOBID_SHIFT_BITS) << dendl; + le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS)); + } else { + blobid >>= BLOBID_SHIFT_BITS; + if (blobid) { + le->assign_blob(blobs[blobid - 1]); + ceph_assert(le->blob); + } else { + Blob *b = new Blob(); + uint64_t sbid = 0; + b->decode(onode->c, p, struct_v, &sbid, false); + blobs[n] = b; + onode->c->open_shared_blob(sbid, b); + le->assign_blob(b); + } + // we build ref_map dynamically for non-spanning blobs + le->blob->get_ref( + onode->c, + le->blob_offset, + le->length); + } + pos += prev_len; + ++n; + extent_map.insert(*le); + } + + ceph_assert(n == num); + return num; +} + +void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p) +{ + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. + __u8 struct_v = 2; + + denc(struct_v, p); + denc_varint((uint32_t)0, p); + size_t key_size = 0; + denc_varint((uint32_t)0, key_size); + p += spanning_blob_map.size() * key_size; + for (const auto& i : spanning_blob_map) { + i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true); + } +} + +void BlueStore::ExtentMap::encode_spanning_blobs( + bufferlist::contiguous_appender& p) +{ + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. + __u8 struct_v = 2; + + denc(struct_v, p); + denc_varint(spanning_blob_map.size(), p); + for (auto& i : spanning_blob_map) { + denc_varint(i.second->id, p); + i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true); + } +} + +void BlueStore::ExtentMap::decode_spanning_blobs( + bufferptr::const_iterator& p) +{ + __u8 struct_v; + denc(struct_v, p); + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. 
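+  // Spanning blobs are blobs referenced from more than one extent map
+  // shard; they are keyed by id and stored with the onode itself rather
+  // than inside any single shard, and are decoded here as part of the
+  // onode, before any shard is faulted in.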
+ ceph_assert(struct_v == 1 || struct_v == 2); + + unsigned n; + denc_varint(n, p); + while (n--) { + BlobRef b(new Blob()); + denc_varint(b->id, p); + spanning_blob_map[b->id] = b; + uint64_t sbid = 0; + b->decode(onode->c, p, struct_v, &sbid, true); + onode->c->open_shared_blob(sbid, b); + } +} + +void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty) +{ + shards.resize(onode->onode.extent_map_shards.size()); + unsigned i = 0; + for (auto &s : onode->onode.extent_map_shards) { + shards[i].shard_info = &s; + shards[i].loaded = loaded; + shards[i].dirty = dirty; + ++i; + } +} + +void BlueStore::ExtentMap::fault_range( + KeyValueDB *db, + uint32_t offset, + uint32_t length) +{ + auto cct = onode->c->store->cct; //used by dout + dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + auto start = seek_shard(offset); + auto last = seek_shard(offset + length); + + if (start < 0) + return; + + ceph_assert(last >= start); + string key; + while (start <= last) { + ceph_assert((size_t)start < shards.size()); + auto p = &shards[start]; + if (!p->loaded) { + dout(30) << __func__ << " opening shard 0x" << std::hex + << p->shard_info->offset << std::dec << dendl; + bufferlist v; + generate_extent_shard_key_and_apply( + onode->key, p->shard_info->offset, &key, + [&](const string& final_key) { + int r = db->get(PREFIX_OBJ, final_key, &v); + if (r < 0) { + derr << __func__ << " missing shard 0x" << std::hex + << p->shard_info->offset << std::dec << " for " << onode->oid + << dendl; + ceph_assert(r >= 0); + } + } + ); + p->extents = decode_some(v); + p->loaded = true; + dout(20) << __func__ << " open shard 0x" << std::hex + << p->shard_info->offset + << " for range 0x" << offset << "~" << length << std::dec + << " (" << v.length() << " bytes)" << dendl; + ceph_assert(p->dirty == false); + ceph_assert(v.length() == p->shard_info->bytes); + onode->c->store->logger->inc(l_bluestore_onode_shard_misses); + } else { + onode->c->store->logger->inc(l_bluestore_onode_shard_hits); + } + ++start; + } +} + +void BlueStore::ExtentMap::dirty_range( + uint32_t offset, + uint32_t length) +{ + auto cct = onode->c->store->cct; //used by dout + dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + if (shards.empty()) { + dout(20) << __func__ << " mark inline shard dirty" << dendl; + inline_bl.clear(); + return; + } + auto start = seek_shard(offset); + if (length == 0) { + length = 1; + } + auto last = seek_shard(offset + length - 1); + if (start < 0) + return; + + ceph_assert(last >= start); + while (start <= last) { + ceph_assert((size_t)start < shards.size()); + auto p = &shards[start]; + if (!p->loaded) { + derr << __func__ << "on write 0x" << std::hex << offset + << "~" << length << " shard 0x" << p->shard_info->offset + << std::dec << " is not loaded, can't mark dirty" << dendl; + ceph_abort_msg("can't mark unloaded shard dirty"); + } + if (!p->dirty) { + dout(20) << __func__ << " mark shard 0x" << std::hex + << p->shard_info->offset << std::dec << " dirty" << dendl; + p->dirty = true; + } + ++start; + } +} + +BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find( + uint64_t offset) +{ + Extent dummy(offset); + return extent_map.find(dummy); +} + +BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent( + uint64_t offset) +{ + Extent dummy(offset); + auto fp = extent_map.lower_bound(dummy); + if (fp != extent_map.begin()) { + --fp; + if (fp->logical_end() <= offset) { + ++fp; + } + } + return fp; +} + 
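+// seek_lextent returns an iterator to the first extent whose logical range
+// ends after 'offset': the extent containing 'offset' if there is one,
+// otherwise the next extent to its right (or extent_map.end() if none).
+// For example, with extents at [0x0,0x1000) and [0x3000,0x4000),
+// seek_lextent(0x500) lands on the extent at 0x0, seek_lextent(0x2000) on
+// the extent at 0x3000, and seek_lextent(0x5000) returns end().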
+BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent( + uint64_t offset) const +{ + Extent dummy(offset); + auto fp = extent_map.lower_bound(dummy); + if (fp != extent_map.begin()) { + --fp; + if (fp->logical_end() <= offset) { + ++fp; + } + } + return fp; +} + +bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length) +{ + auto fp = seek_lextent(offset); + if (fp == extent_map.end() || fp->logical_offset >= offset + length) { + return false; + } + return true; +} + +int BlueStore::ExtentMap::compress_extent_map( + uint64_t offset, + uint64_t length) +{ + auto cct = onode->c->store->cct; //used by dout + if (extent_map.empty()) + return 0; + int removed = 0; + auto p = seek_lextent(offset); + if (p != extent_map.begin()) { + --p; // start to the left of offset + } + // the caller should have just written to this region + ceph_assert(p != extent_map.end()); + + // identify the *next* shard + auto pshard = shards.begin(); + while (pshard != shards.end() && + p->logical_offset >= pshard->shard_info->offset) { + ++pshard; + } + uint64_t shard_end; + if (pshard != shards.end()) { + shard_end = pshard->shard_info->offset; + } else { + shard_end = OBJECT_MAX_SIZE; + } + + auto n = p; + for (++n; n != extent_map.end(); p = n++) { + if (n->logical_offset > offset + length) { + break; // stop after end + } + while (n != extent_map.end() && + p->logical_end() == n->logical_offset && + p->blob == n->blob && + p->blob_offset + p->length == n->blob_offset && + n->logical_offset < shard_end) { + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " next shard 0x" << shard_end << std::dec + << " merging " << *p << " and " << *n << dendl; + p->length += n->length; + rm(n++); + ++removed; + } + if (n == extent_map.end()) { + break; + } + if (n->logical_offset >= shard_end) { + ceph_assert(pshard != shards.end()); + ++pshard; + if (pshard != shards.end()) { + shard_end = pshard->shard_info->offset; + } else { + shard_end = OBJECT_MAX_SIZE; + } + } + } + if (removed) { + onode->c->store->logger->inc(l_bluestore_extent_compress, removed); + } + return removed; +} + +void BlueStore::ExtentMap::punch_hole( + CollectionRef &c, + uint64_t offset, + uint64_t length, + old_extent_map_t *old_extents) +{ + auto p = seek_lextent(offset); + uint64_t end = offset + length; + while (p != extent_map.end()) { + if (p->logical_offset >= end) { + break; + } + if (p->logical_offset < offset) { + if (p->logical_end() > end) { + // split and deref middle + uint64_t front = offset - p->logical_offset; + OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front, + length, p->blob); + old_extents->push_back(*oe); + add(end, + p->blob_offset + front + length, + p->length - front - length, + p->blob); + p->length = front; + break; + } else { + // deref tail + ceph_assert(p->logical_end() > offset); // else seek_lextent bug + uint64_t keep = offset - p->logical_offset; + OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep, + p->length - keep, p->blob); + old_extents->push_back(*oe); + p->length = keep; + ++p; + continue; + } + } + if (p->logical_offset + p->length <= end) { + // deref whole lextent + OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset, + p->length, p->blob); + old_extents->push_back(*oe); + rm(p++); + continue; + } + // deref head + uint64_t keep = p->logical_end() - end; + OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset, + p->length - keep, p->blob); + old_extents->push_back(*oe); + + 
add(end, p->blob_offset + p->length - keep, keep, p->blob);
+    rm(p);
+    break;
+  }
+}
+
+BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
+  CollectionRef &c,
+  uint64_t logical_offset,
+  uint64_t blob_offset, uint64_t length, BlobRef b,
+  old_extent_map_t *old_extents)
+{
+  // We need a completely initialized Blob to increment its ref counters.
+  ceph_assert(b->get_blob().get_logical_length() != 0);
+
+  // Do get_ref prior to punch_hole to prevent putting a reused blob into
+  // the old_extents list if we overwrite the blob totally.
+  // This might happen during WAL overwrite.
+  b->get_ref(onode->c, blob_offset, length);
+
+  if (old_extents) {
+    punch_hole(c, logical_offset, length, old_extents);
+  }
+
+  Extent *le = new Extent(logical_offset, blob_offset, length, b);
+  extent_map.insert(*le);
+  if (spans_shard(logical_offset, length)) {
+    request_reshard(logical_offset, logical_offset + length);
+  }
+  return le;
+}
+
+BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
+  BlobRef lb,
+  uint32_t blob_offset,
+  uint32_t pos)
+{
+  auto cct = onode->c->store->cct; //used by dout
+
+  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
+  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
+           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
+           << dendl;
+  BlobRef rb = onode->c->new_blob();
+  lb->split(onode->c, blob_offset, rb.get());
+
+  for (auto ep = seek_lextent(pos);
+       ep != extent_map.end() && ep->logical_offset < end_pos;
+       ++ep) {
+    if (ep->blob != lb) {
+      continue;
+    }
+    if (ep->logical_offset < pos) {
+      // split extent
+      size_t left = pos - ep->logical_offset;
+      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
+      extent_map.insert(*ne);
+      ep->length = left;
+      dout(30) << __func__ << " split " << *ep << dendl;
+      dout(30) << __func__ << " to " << *ne << dendl;
+    } else {
+      // switch blob
+      ceph_assert(ep->blob_offset >= blob_offset);
+
+      ep->blob = rb;
+      ep->blob_offset -= blob_offset;
+      dout(30) << __func__ << " adjusted " << *ep << dendl;
+    }
+  }
+  return rb;
+}
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ")."
<< __func__ << " " + +BlueStore::Onode* BlueStore::Onode::decode( + CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& v) +{ + Onode* on = new Onode(c.get(), oid, key); + on->exists = true; + auto p = v.front().begin_deep(); + on->onode.decode(p); + for (auto& i : on->onode.attrs) { + i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } + + // initialize extent_map + on->extent_map.decode_spanning_blobs(p); + if (on->onode.extent_map_shards.empty()) { + denc(on->extent_map.inline_bl, p); + on->extent_map.decode_some(on->extent_map.inline_bl); + on->extent_map.inline_bl.reassign_to_mempool( + mempool::mempool_bluestore_cache_data); + } + else { + on->extent_map.init_shards(false, false); + } + return on; +} + +void BlueStore::Onode::flush() +{ + if (flushing_count.load()) { + ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl; + std::unique_lock l(flush_lock); + while (flushing_count.load()) { + flush_cond.wait(l); + } + } + ldout(c->store->cct, 20) << __func__ << " done" << dendl; +} + +// ======================================================= +// WriteContext + +/// Checks for writes to the same pextent within a blob +bool BlueStore::WriteContext::has_conflict( + BlobRef b, + uint64_t loffs, + uint64_t loffs_end, + uint64_t min_alloc_size) +{ + ceph_assert((loffs % min_alloc_size) == 0); + ceph_assert((loffs_end % min_alloc_size) == 0); + for (auto w : writes) { + if (b == w.b) { + auto loffs2 = p2align(w.logical_offset, min_alloc_size); + auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size); + if ((loffs <= loffs2 && loffs_end > loffs2) || + (loffs >= loffs2 && loffs < loffs2_end)) { + return true; + } + } + } + return false; +} + +// ======================================================= + +// DeferredBatch +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") " + +void BlueStore::DeferredBatch::prepare_write( + CephContext *cct, + uint64_t seq, uint64_t offset, uint64_t length, + bufferlist::const_iterator& blp) +{ + _discard(cct, offset, length); + auto i = iomap.insert(make_pair(offset, deferred_io())); + ceph_assert(i.second); // this should be a new insertion + i.first->second.seq = seq; + blp.copy(length, i.first->second.bl); + i.first->second.bl.reassign_to_mempool( + mempool::mempool_bluestore_writing_deferred); + dout(20) << __func__ << " seq " << seq + << " 0x" << std::hex << offset << "~" << length + << " crc " << i.first->second.bl.crc32c(-1) + << std::dec << dendl; + seq_bytes[seq] += length; +#ifdef DEBUG_DEFERRED + _audit(cct); +#endif +} + +void BlueStore::DeferredBatch::_discard( + CephContext *cct, uint64_t offset, uint64_t length) +{ + generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + auto p = iomap.lower_bound(offset); + if (p != iomap.begin()) { + --p; + auto end = p->first + p->second.bl.length(); + if (end > offset) { + bufferlist head; + head.substr_of(p->second.bl, 0, offset - p->first); + dout(20) << __func__ << " keep head " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << " -> 0x" << head.length() << std::dec << dendl; + auto i = seq_bytes.find(p->second.seq); + ceph_assert(i != seq_bytes.end()); + if (end > offset + length) { + bufferlist tail; + tail.substr_of(p->second.bl, offset + length - p->first, + end - (offset + length)); + dout(20) << __func__ << " keep tail " << p->second.seq + << " 0x" << std::hex << p->first << "~" << 
p->second.bl.length() + << " -> 0x" << tail.length() << std::dec << dendl; + auto &n = iomap[offset + length]; + n.bl.swap(tail); + n.seq = p->second.seq; + i->second -= length; + } else { + i->second -= end - offset; + } + ceph_assert(i->second >= 0); + p->second.bl.swap(head); + } + ++p; + } + while (p != iomap.end()) { + if (p->first >= offset + length) { + break; + } + auto i = seq_bytes.find(p->second.seq); + ceph_assert(i != seq_bytes.end()); + auto end = p->first + p->second.bl.length(); + if (end > offset + length) { + unsigned drop_front = offset + length - p->first; + unsigned keep_tail = end - (offset + length); + dout(20) << __func__ << " truncate front " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail + << " to 0x" << (offset + length) << "~" << keep_tail + << std::dec << dendl; + auto &s = iomap[offset + length]; + s.seq = p->second.seq; + s.bl.substr_of(p->second.bl, drop_front, keep_tail); + i->second -= drop_front; + } else { + dout(20) << __func__ << " drop " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << std::dec << dendl; + i->second -= p->second.bl.length(); + } + ceph_assert(i->second >= 0); + p = iomap.erase(p); + } +} + +void BlueStore::DeferredBatch::_audit(CephContext *cct) +{ + map<uint64_t,int> sb; + for (auto p : seq_bytes) { + sb[p.first] = 0; // make sure we have the same set of keys + } + uint64_t pos = 0; + for (auto& p : iomap) { + ceph_assert(p.first >= pos); + sb[p.second.seq] += p.second.bl.length(); + pos = p.first + p.second.bl.length(); + } + ceph_assert(sb == seq_bytes); +} + + +// Collection + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") " + +BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid) + : CollectionImpl(cid), + store(store_), + cache(c), + lock("BlueStore::Collection::lock", true, false), + exists(true), + onode_map(c), + commit_queue(nullptr) +{ +} + +bool BlueStore::Collection::flush_commit(Context *c) +{ + return osr->flush_commit(c); +} + +void BlueStore::Collection::flush() +{ + osr->flush(); +} + +void BlueStore::Collection::flush_all_but_last() +{ + osr->flush_all_but_last(); +} + +void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b) +{ + ceph_assert(!b->shared_blob); + const bluestore_blob_t& blob = b->get_blob(); + if (!blob.is_shared()) { + b->shared_blob = new SharedBlob(this); + return; + } + + b->shared_blob = shared_blob_set.lookup(sbid); + if (b->shared_blob) { + ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " had " << *b->shared_blob << dendl; + } else { + b->shared_blob = new SharedBlob(sbid, this); + shared_blob_set.add(this, b->shared_blob.get()); + ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " opened " << *b->shared_blob + << dendl; + } +} + +void BlueStore::Collection::load_shared_blob(SharedBlobRef sb) +{ + if (!sb->is_loaded()) { + + bufferlist v; + string key; + auto sbid = sb->get_sbid(); + get_shared_blob_key(sbid, &key); + int r = store->db->get(PREFIX_SHARED_BLOB, key, &v); + if (r < 0) { + lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " not found at key " + << pretty_binary_string(key) << dendl; + ceph_abort_msg("uh oh, missing shared_blob"); + } + + sb->loaded = true; + sb->persistent = new bluestore_shared_blob_t(sbid); + auto p = 
v.cbegin(); + decode(*(sb->persistent), p); + ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " loaded shared_blob " << *sb << dendl; + } +} + +void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b) +{ + ldout(store->cct, 10) << __func__ << " " << *b << dendl; + ceph_assert(!b->shared_blob->is_loaded()); + + // update blob + bluestore_blob_t& blob = b->dirty_blob(); + blob.set_flag(bluestore_blob_t::FLAG_SHARED); + + // update shared blob + b->shared_blob->loaded = true; + b->shared_blob->persistent = new bluestore_shared_blob_t(sbid); + shared_blob_set.add(this, b->shared_blob.get()); + for (auto p : blob.get_extents()) { + if (p.is_valid()) { + b->shared_blob->get_ref( + p.offset, + p.length); + } + } + ldout(store->cct, 20) << __func__ << " now " << *b << dendl; +} + +uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb) +{ + ldout(store->cct, 10) << __func__ << " " << *sb << dendl; + ceph_assert(sb->is_loaded()); + + uint64_t sbid = sb->get_sbid(); + shared_blob_set.remove(sb); + sb->loaded = false; + delete sb->persistent; + sb->sbid_unloaded = 0; + ldout(store->cct, 20) << __func__ << " now " << *sb << dendl; + return sbid; +} + +BlueStore::OnodeRef BlueStore::Collection::get_onode( + const ghobject_t& oid, + bool create) +{ + ceph_assert(create ? lock.is_wlocked() : lock.is_locked()); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + if (!oid.match(cnode.bits, pgid.ps())) { + lderr(store->cct) << __func__ << " oid " << oid << " not part of " + << pgid << " bits " << cnode.bits << dendl; + ceph_abort(); + } + } + + OnodeRef o = onode_map.lookup(oid); + if (o) + return o; + + string key; + get_object_key(store->cct, oid, &key); + + ldout(store->cct, 20) << __func__ << " oid " << oid << " key " + << pretty_binary_string(key) << dendl; + + bufferlist v; + int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v); + ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl; + Onode *on; + if (v.length() == 0) { + ceph_assert(r == -ENOENT); + if (!store->cct->_conf->bluestore_debug_misc && + !create) + return OnodeRef(); + + // new object, new onode + on = new Onode(this, oid, key); + } else { + // loaded + ceph_assert(r >= 0); + on = Onode::decode(this, oid, key, v); + } + o.reset(on); + return onode_map.add(oid, o); +} + +void BlueStore::Collection::split_cache( + Collection *dest) +{ + ldout(store->cct, 10) << __func__ << " to " << dest << dendl; + + // lock (one or both) cache shards + std::lock(cache->lock, dest->cache->lock); + std::lock_guard l(cache->lock, std::adopt_lock); + std::lock_guard l2(dest->cache->lock, std::adopt_lock); + + int destbits = dest->cnode.bits; + spg_t destpg; + bool is_pg = dest->cid.is_pg(&destpg); + ceph_assert(is_pg); + + auto p = onode_map.onode_map.begin(); + while (p != onode_map.onode_map.end()) { + OnodeRef o = p->second; + if (!p->second->oid.match(destbits, destpg.pgid.ps())) { + // onode does not belong to this child + ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid + << dendl; + ++p; + } else { + ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid + << dendl; + + cache->_rm_onode(p->second); + p = onode_map.onode_map.erase(p); + + o->c = dest; + dest->cache->_add_onode(o, 1); + dest->onode_map.onode_map[o->oid] = o; + dest->onode_map.cache = dest->cache; + + // move over shared blobs and buffers. 
cover shared blobs from + // both extent map and spanning blob map (the full extent map + // may not be faulted in) + vector<SharedBlob*> sbvec; + for (auto& e : o->extent_map.extent_map) { + sbvec.push_back(e.blob->shared_blob.get()); + } + for (auto& b : o->extent_map.spanning_blob_map) { + sbvec.push_back(b.second->shared_blob.get()); + } + for (auto sb : sbvec) { + if (sb->coll == dest) { + ldout(store->cct, 20) << __func__ << " already moved " << *sb + << dendl; + continue; + } + ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl; + if (sb->get_sbid()) { + ldout(store->cct, 20) << __func__ + << " moving registration " << *sb << dendl; + shared_blob_set.remove(sb); + dest->shared_blob_set.add(dest, sb); + } + sb->coll = dest; + if (dest->cache != cache) { + for (auto& i : sb->bc.buffer_map) { + if (!i.second->is_writing()) { + ldout(store->cct, 20) << __func__ << " moving " << *i.second + << dendl; + dest->cache->_move_buffer(cache, i.second.get()); + } + } + } + } + } + } +} + +// ======================================================= + +// MempoolThread + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") " + +void *BlueStore::MempoolThread::entry() +{ + std::unique_lock l(lock); + + uint32_t prev_config_change = store->config_changed.load(); + uint64_t base = store->osd_memory_base; + double fragmentation = store->osd_memory_expected_fragmentation; + uint64_t target = store->osd_memory_target; + uint64_t min = store->osd_memory_cache_min; + uint64_t max = min; + + // When setting the maximum amount of memory to use for cache, first + // assume some base amount of memory for the OSD and then fudge in + // some overhead for fragmentation that scales with cache usage. + uint64_t ltarget = (1.0 - fragmentation) * target; + if (ltarget > base + min) { + max = ltarget - base; + } + + binned_kv_cache = store->db->get_priority_cache(); + if (store->cache_autotune && binned_kv_cache != nullptr) { + pcm = std::make_shared<PriorityCache::Manager>( + store->cct, min, max, target, true); + pcm->insert("kv", binned_kv_cache, true); + pcm->insert("meta", meta_cache, true); + pcm->insert("data", data_cache, true); + } + + utime_t next_balance = ceph_clock_now(); + utime_t next_resize = ceph_clock_now(); + + bool interval_stats_trim = false; + while (!stop) { + // Update pcm cache settings if related configuration was changed + uint32_t cur_config_change = store->config_changed.load(); + if (cur_config_change != prev_config_change) { + _update_cache_settings(); + prev_config_change = cur_config_change; + } + + // Before we trim, check and see if it's time to rebalance/resize. + double autotune_interval = store->cache_autotune_interval; + double resize_interval = store->osd_memory_cache_resize_interval; + + if (autotune_interval > 0 && next_balance < ceph_clock_now()) { + _adjust_cache_settings(); + + // Log events at 5 instead of 20 when balance happens. 
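+      // (interval_stats_trim makes the next _trim_shards() call log its
+      // cache usage summary at debug level 5 instead of the usual 20.)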
+ interval_stats_trim = true; + + if (pcm != nullptr) { + pcm->balance(); + } + + next_balance = ceph_clock_now(); + next_balance += autotune_interval; + } + if (resize_interval > 0 && next_resize < ceph_clock_now()) { + if (ceph_using_tcmalloc() && pcm != nullptr) { + pcm->tune_memory(); + } + next_resize = ceph_clock_now(); + next_resize += resize_interval; + } + + // Now Trim + _trim_shards(interval_stats_trim); + interval_stats_trim = false; + + store->_update_cache_logger(); + auto wait = ceph::make_timespan( + store->cct->_conf->bluestore_cache_trim_interval); + cond.wait_for(l, wait); + } + stop = false; + return NULL; +} + +void BlueStore::MempoolThread::_adjust_cache_settings() +{ + if (binned_kv_cache != nullptr) { + binned_kv_cache->set_cache_ratio(store->cache_kv_ratio); + } + meta_cache->set_cache_ratio(store->cache_meta_ratio); + data_cache->set_cache_ratio(store->cache_data_ratio); +} + +void BlueStore::MempoolThread::_trim_shards(bool interval_stats) +{ + auto cct = store->cct; + size_t num_shards = store->cache_shards.size(); + + int64_t kv_used = store->db->get_cache_usage(); + int64_t meta_used = meta_cache->_get_used_bytes(); + int64_t data_used = data_cache->_get_used_bytes(); + + uint64_t cache_size = store->cache_size; + int64_t kv_alloc = + static_cast<int64_t>(store->cache_kv_ratio * cache_size); + int64_t meta_alloc = + static_cast<int64_t>(store->cache_meta_ratio * cache_size); + int64_t data_alloc = + static_cast<int64_t>(store->cache_data_ratio * cache_size); + + if (pcm != nullptr && binned_kv_cache != nullptr) { + cache_size = pcm->get_tuned_mem(); + kv_alloc = binned_kv_cache->get_committed_size(); + meta_alloc = meta_cache->get_committed_size(); + data_alloc = data_cache->get_committed_size(); + } + + if (interval_stats) { + ldout(cct, 5) << __func__ << " cache_size: " << cache_size + << " kv_alloc: " << kv_alloc + << " kv_used: " << kv_used + << " meta_alloc: " << meta_alloc + << " meta_used: " << meta_used + << " data_alloc: " << data_alloc + << " data_used: " << data_used << dendl; + } else { + ldout(cct, 20) << __func__ << " cache_size: " << cache_size + << " kv_alloc: " << kv_alloc + << " kv_used: " << kv_used + << " meta_alloc: " << meta_alloc + << " meta_used: " << meta_used + << " data_alloc: " << data_alloc + << " data_used: " << data_used << dendl; + } + + uint64_t max_shard_onodes = static_cast<uint64_t>( + (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode()); + uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards); + + auto debug_max_onodes = g_conf()->bluestore_debug_max_cached_onodes; + if (debug_max_onodes) { + max_shard_onodes = debug_max_onodes; + } + ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes + << " max_shard_buffer: " << max_shard_buffer << dendl; + + for (auto i : store->cache_shards) { + i->trim(max_shard_onodes, max_shard_buffer); + } +} + +void BlueStore::MempoolThread::_update_cache_settings() +{ + // Nothing to do if pcm is not used. 
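+  // (pcm is only created in MempoolThread::entry() when cache autotuning
+  // is enabled and the kv backend exposes a priority cache.)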
+ if (pcm == nullptr) { + return; + } + + auto cct = store->cct; + uint64_t target = store->osd_memory_target; + uint64_t base = store->osd_memory_base; + uint64_t min = store->osd_memory_cache_min; + uint64_t max = min; + double fragmentation = store->osd_memory_expected_fragmentation; + + uint64_t ltarget = (1.0 - fragmentation) * target; + if (ltarget > base + min) { + max = ltarget - base; + } + + // set pcm cache levels + pcm->set_target_memory(target); + pcm->set_min_memory(min); + pcm->set_max_memory(max); + + ldout(cct, 5) << __func__ << " updated pcm target: " << target + << " pcm min: " << min + << " pcm max: " << max + << dendl; +} + +// ======================================================= + +// OmapIteratorImpl + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") " + +BlueStore::OmapIteratorImpl::OmapIteratorImpl( + CollectionRef c, OnodeRef o, KeyValueDB::Iterator it) + : c(c), o(o), it(it) +{ + RWLock::RLocker l(c->lock); + if (o->onode.has_omap()) { + get_omap_key(o->onode.nid, string(), &head); + get_omap_tail(o->onode.nid, &tail); + it->lower_bound(head); + } +} + +string BlueStore::OmapIteratorImpl::_stringify() const +{ + stringstream s; + s << " omap_iterator(cid = " << c->cid + <<", oid = " << o->oid << ")"; + return s.str(); +} + +int BlueStore::OmapIteratorImpl::seek_to_first() +{ + RWLock::RLocker l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + it->lower_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + c->store->log_latency( + __func__, + l_bluestore_omap_seek_to_first_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + return 0; +} + +int BlueStore::OmapIteratorImpl::upper_bound(const string& after) +{ + RWLock::RLocker l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + string key; + get_omap_key(o->onode.nid, after, &key); + ldout(c->store->cct,20) << __func__ << " after " << after << " key " + << pretty_binary_string(key) << dendl; + it->upper_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + c->store->log_latency_fn( + __func__, + l_bluestore_omap_upper_bound_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age, + [&] (const ceph::timespan& lat) { + return ", after = " + after + + _stringify(); + } + ); + return 0; +} + +int BlueStore::OmapIteratorImpl::lower_bound(const string& to) +{ + RWLock::RLocker l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + string key; + get_omap_key(o->onode.nid, to, &key); + ldout(c->store->cct,20) << __func__ << " to " << to << " key " + << pretty_binary_string(key) << dendl; + it->lower_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + c->store->log_latency_fn( + __func__, + l_bluestore_omap_lower_bound_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age, + [&] (const ceph::timespan& lat) { + return ", to = " + to + + _stringify(); + } + ); + return 0; +} + +bool BlueStore::OmapIteratorImpl::valid() +{ + RWLock::RLocker l(c->lock); + bool r = o->onode.has_omap() && it && it->valid() && + it->raw_key().second < tail; + if (it && it->valid()) { + ldout(c->store->cct,20) << __func__ << " is at " + << pretty_binary_string(it->raw_key().second) + << dendl; + } + return r; +} + +int BlueStore::OmapIteratorImpl::next() +{ + int r = -1; + RWLock::RLocker l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + it->next(); + r = 0; + } + 
c->store->log_latency( + __func__, + l_bluestore_omap_next_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + return r; +} + +string BlueStore::OmapIteratorImpl::key() +{ + RWLock::RLocker l(c->lock); + ceph_assert(it->valid()); + string db_key = it->raw_key().second; + string user_key; + decode_omap_key(db_key, &user_key); + + return user_key; +} + +bufferlist BlueStore::OmapIteratorImpl::value() +{ + RWLock::RLocker l(c->lock); + ceph_assert(it->valid()); + return it->value(); +} + + +// ===================================== + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore(" << path << ") " + + +static void aio_cb(void *priv, void *priv2) +{ + BlueStore *store = static_cast<BlueStore*>(priv); + BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2); + c->aio_finish(store); +} + +static void discard_cb(void *priv, void *priv2) +{ + BlueStore *store = static_cast<BlueStore*>(priv); + interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); + store->handle_discard(*tmp); +} + +void BlueStore::handle_discard(interval_set<uint64_t>& to_release) +{ + dout(10) << __func__ << dendl; + ceph_assert(alloc); + alloc->release(to_release); +} + +BlueStore::BlueStore(CephContext *cct, const string& path) + : ObjectStore(cct, path), + throttle_bytes(cct, "bluestore_throttle_bytes", + cct->_conf->bluestore_throttle_bytes), + throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", + cct->_conf->bluestore_throttle_bytes + + cct->_conf->bluestore_throttle_deferred_bytes), + deferred_finisher(cct, "defered_finisher", "dfin"), + finisher(cct, "commit_finisher", "cfin"), + kv_sync_thread(this), + kv_finalize_thread(this), + mempool_thread(this) +{ + _init_logger(); + cct->_conf.add_observer(this); + set_cache_shards(1); +} + +BlueStore::BlueStore(CephContext *cct, + const string& path, + uint64_t _min_alloc_size) + : ObjectStore(cct, path), + throttle_bytes(cct, "bluestore_throttle_bytes", + cct->_conf->bluestore_throttle_bytes), + throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", + cct->_conf->bluestore_throttle_bytes + + cct->_conf->bluestore_throttle_deferred_bytes), + deferred_finisher(cct, "defered_finisher", "dfin"), + finisher(cct, "commit_finisher", "cfin"), + kv_sync_thread(this), + kv_finalize_thread(this), + min_alloc_size(_min_alloc_size), + min_alloc_size_order(ctz(_min_alloc_size)), + mempool_thread(this) +{ + _init_logger(); + cct->_conf.add_observer(this); + set_cache_shards(1); +} + +BlueStore::~BlueStore() +{ + cct->_conf.remove_observer(this); + _shutdown_logger(); + ceph_assert(!mounted); + ceph_assert(db == NULL); + ceph_assert(bluefs == NULL); + ceph_assert(fsid_fd < 0); + ceph_assert(path_fd < 0); + for (auto i : cache_shards) { + delete i; + } + cache_shards.clear(); +} + +const char **BlueStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "bluestore_csum_type", + "bluestore_compression_mode", + "bluestore_compression_algorithm", + "bluestore_compression_min_blob_size", + "bluestore_compression_min_blob_size_ssd", + "bluestore_compression_min_blob_size_hdd", + "bluestore_compression_max_blob_size", + "bluestore_compression_max_blob_size_ssd", + "bluestore_compression_max_blob_size_hdd", + "bluestore_compression_required_ratio", + "bluestore_max_alloc_size", + "bluestore_prefer_deferred_size", + "bluestore_prefer_deferred_size_hdd", + "bluestore_prefer_deferred_size_ssd", + "bluestore_deferred_batch_ops", + "bluestore_deferred_batch_ops_hdd", + 
"bluestore_deferred_batch_ops_ssd", + "bluestore_throttle_bytes", + "bluestore_throttle_deferred_bytes", + "bluestore_throttle_cost_per_io_hdd", + "bluestore_throttle_cost_per_io_ssd", + "bluestore_throttle_cost_per_io", + "bluestore_max_blob_size", + "bluestore_max_blob_size_ssd", + "bluestore_max_blob_size_hdd", + "osd_memory_target", + "osd_memory_target_cgroup_limit_ratio", + "osd_memory_base", + "osd_memory_cache_min", + "osd_memory_expected_fragmentation", + "bluestore_cache_autotune", + "bluestore_cache_autotune_interval", + "bluestore_warn_on_legacy_statfs", + NULL + }; + return KEYS; +} + +void BlueStore::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + if (changed.count("bluestore_warn_on_legacy_statfs")) { + _check_legacy_statfs_alert(); + } + + if (changed.count("bluestore_csum_type")) { + _set_csum(); + } + if (changed.count("bluestore_compression_mode") || + changed.count("bluestore_compression_algorithm") || + changed.count("bluestore_compression_min_blob_size") || + changed.count("bluestore_compression_max_blob_size")) { + if (bdev) { + _set_compression(); + } + } + if (changed.count("bluestore_max_blob_size") || + changed.count("bluestore_max_blob_size_ssd") || + changed.count("bluestore_max_blob_size_hdd")) { + if (bdev) { + // only after startup + _set_blob_size(); + } + } + if (changed.count("bluestore_prefer_deferred_size") || + changed.count("bluestore_prefer_deferred_size_hdd") || + changed.count("bluestore_prefer_deferred_size_ssd") || + changed.count("bluestore_max_alloc_size") || + changed.count("bluestore_deferred_batch_ops") || + changed.count("bluestore_deferred_batch_ops_hdd") || + changed.count("bluestore_deferred_batch_ops_ssd")) { + if (bdev) { + // only after startup + _set_alloc_sizes(); + } + } + if (changed.count("bluestore_throttle_cost_per_io") || + changed.count("bluestore_throttle_cost_per_io_hdd") || + changed.count("bluestore_throttle_cost_per_io_ssd")) { + if (bdev) { + _set_throttle_params(); + } + } + if (changed.count("bluestore_throttle_bytes")) { + throttle_bytes.reset_max(conf->bluestore_throttle_bytes); + throttle_deferred_bytes.reset_max( + conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes); + } + if (changed.count("bluestore_throttle_deferred_bytes")) { + throttle_deferred_bytes.reset_max( + conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes); + } + if (changed.count("osd_memory_target") || + changed.count("osd_memory_base") || + changed.count("osd_memory_cache_min") || + changed.count("osd_memory_expected_fragmentation")) { + _update_osd_memory_options(); + } +} + +void BlueStore::_set_compression() +{ + auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode); + if (m) { + _clear_compression_alert(); + comp_mode = *m; + } else { + derr << __func__ << " unrecognized value '" + << cct->_conf->bluestore_compression_mode + << "' for bluestore_compression_mode, reverting to 'none'" + << dendl; + comp_mode = Compressor::COMP_NONE; + string s("unknown mode: "); + s += cct->_conf->bluestore_compression_mode; + _set_compression_alert(true, s.c_str()); + } + + compressor = nullptr; + + if (cct->_conf->bluestore_compression_min_blob_size) { + comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size; + } else { + ceph_assert(bdev); + if (bdev->is_rotational()) { + comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd; + } else { + comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd; + } + } 
+ + if (cct->_conf->bluestore_compression_max_blob_size) { + comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size; + } else { + ceph_assert(bdev); + if (bdev->is_rotational()) { + comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd; + } else { + comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd; + } + } + + auto& alg_name = cct->_conf->bluestore_compression_algorithm; + if (!alg_name.empty()) { + compressor = Compressor::create(cct, alg_name); + if (!compressor) { + derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor" + << dendl; + _set_compression_alert(false, alg_name.c_str()); + } + } + + dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode) + << " alg " << (compressor ? compressor->get_type_name() : "(none)") + << " min_blob " << comp_min_blob_size + << " max_blob " << comp_max_blob_size + << dendl; +} + +void BlueStore::_set_csum() +{ + csum_type = Checksummer::CSUM_NONE; + int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type); + if (t > Checksummer::CSUM_NONE) + csum_type = t; + + dout(10) << __func__ << " csum_type " + << Checksummer::get_csum_type_string(csum_type) + << dendl; +} + +void BlueStore::_set_throttle_params() +{ + if (cct->_conf->bluestore_throttle_cost_per_io) { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io; + } else { + ceph_assert(bdev); + if (bdev->is_rotational()) { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd; + } else { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd; + } + } + + dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io + << dendl; +} +void BlueStore::_set_blob_size() +{ + if (cct->_conf->bluestore_max_blob_size) { + max_blob_size = cct->_conf->bluestore_max_blob_size; + } else { + ceph_assert(bdev); + if (bdev->is_rotational()) { + max_blob_size = cct->_conf->bluestore_max_blob_size_hdd; + } else { + max_blob_size = cct->_conf->bluestore_max_blob_size_ssd; + } + } + dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size + << std::dec << dendl; +} + +void BlueStore::_update_osd_memory_options() +{ + osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target"); + osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base"); + osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation"); + osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min"); + config_changed++; + dout(10) << __func__ + << " osd_memory_target " << osd_memory_target + << " osd_memory_base " << osd_memory_base + << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation + << " osd_memory_cache_min " << osd_memory_cache_min + << dendl; +} + +int BlueStore::_set_cache_sizes() +{ + ceph_assert(bdev); + cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune"); + cache_autotune_interval = + cct->_conf.get_val<double>("bluestore_cache_autotune_interval"); + osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target"); + osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base"); + osd_memory_expected_fragmentation = + cct->_conf.get_val<double>("osd_memory_expected_fragmentation"); + osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min"); + osd_memory_cache_resize_interval = + cct->_conf.get_val<double>("osd_memory_cache_resize_interval"); + + if 
(cct->_conf->bluestore_cache_size) { + cache_size = cct->_conf->bluestore_cache_size; + } else { + // choose global cache size based on backend type + if (bdev->is_rotational()) { + cache_size = cct->_conf->bluestore_cache_size_hdd; + } else { + cache_size = cct->_conf->bluestore_cache_size_ssd; + } + } + + cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio; + if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) { + derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio + << ") must be in range [0,1.0]" << dendl; + return -EINVAL; + } + + cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio; + if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) { + derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio + << ") must be in range [0,1.0]" << dendl; + return -EINVAL; + } + + if (cache_meta_ratio + cache_kv_ratio > 1.0) { + derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio + << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio + << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0" + << dendl; + return -EINVAL; + } + + cache_data_ratio = + (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio; + if (cache_data_ratio < 0) { + // deal with floating point imprecision + cache_data_ratio = 0; + } + + dout(1) << __func__ << " cache_size " << cache_size + << " meta " << cache_meta_ratio + << " kv " << cache_kv_ratio + << " data " << cache_data_ratio + << dendl; + return 0; +} + +int BlueStore::write_meta(const std::string& key, const std::string& value) +{ + bluestore_bdev_label_t label; + string p = path + "/block"; + int r = _read_bdev_label(cct, p, &label); + if (r < 0) { + return ObjectStore::write_meta(key, value); + } + label.meta[key] = value; + r = _write_bdev_label(cct, p, label); + ceph_assert(r == 0); + return ObjectStore::write_meta(key, value); +} + +int BlueStore::read_meta(const std::string& key, std::string *value) +{ + bluestore_bdev_label_t label; + string p = path + "/block"; + int r = _read_bdev_label(cct, p, &label); + if (r < 0) { + return ObjectStore::read_meta(key, value); + } + auto i = label.meta.find(key); + if (i == label.meta.end()) { + return ObjectStore::read_meta(key, value); + } + *value = i->second; + return 0; +} + +void BlueStore::_init_logger() +{ + PerfCountersBuilder b(cct, "bluestore", + l_bluestore_first, l_bluestore_last); + b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat", + "Average kv_thread flush latency", + "fl_l", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat", + "Average kv_thread commit latency"); + b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat", + "Average kv_sync thread latency", + "ks_l", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat", + "Average kv_finalize thread latency", + "kf_l", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat", + "Average prepare state latency"); + b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat", + "Average aio_wait state latency", + "io_l", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat", + "Average io_done state latency"); + b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat", + "Average kv_queued state latency"); + b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat", + "Average kv_commiting state latency"); + 
b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat", + "Average kv_done state latency"); + b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat", + "Average deferred_queued state latency"); + b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat", + "Average aio_wait state latency"); + b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat", + "Average cleanup state latency"); + b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat", + "Average finishing state latency"); + b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat", + "Average done state latency"); + b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat", + "Average submit throttle latency", + "th_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_time_avg(l_bluestore_submit_lat, "submit_lat", + "Average submit latency", + "s_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_time_avg(l_bluestore_commit_lat, "commit_lat", + "Average commit latency", + "c_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_time_avg(l_bluestore_read_lat, "read_lat", + "Average read latency", + "r_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat", + "Average read onode metadata latency"); + b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat", + "Average read latency"); + b.add_time_avg(l_bluestore_compress_lat, "compress_lat", + "Average compress latency"); + b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat", + "Average decompress latency"); + b.add_time_avg(l_bluestore_csum_lat, "csum_lat", + "Average checksum latency"); + b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count", + "Sum for beneficial compress ops"); + b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count", + "Sum for compress ops rejected due to low net gain of space"); + b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes", + "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops", + "Sum for deferred write op"); + b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes", + "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops", + "Sum for write penalty read ops"); + b.add_u64(l_bluestore_allocated, "bluestore_allocated", + "Sum for allocated bytes"); + b.add_u64(l_bluestore_stored, "bluestore_stored", + "Sum for stored bytes"); + b.add_u64(l_bluestore_compressed, "bluestore_compressed", + "Sum for stored compressed bytes", + "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated", + "Sum for bytes allocated for compressed data", + "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original", + "Sum for original bytes that were compressed", + "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_onodes, "bluestore_onodes", + "Number of onodes in cache"); + b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits", + "Sum for onode-lookups hit in the cache"); + b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses", + "Sum for onode-lookups missed in the cache"); + b.add_u64_counter(l_bluestore_onode_shard_hits, 
"bluestore_onode_shard_hits", + "Sum for onode-shard lookups hit in the cache"); + b.add_u64_counter(l_bluestore_onode_shard_misses, + "bluestore_onode_shard_misses", + "Sum for onode-shard lookups missed in the cache"); + b.add_u64(l_bluestore_extents, "bluestore_extents", + "Number of extents in cache"); + b.add_u64(l_bluestore_blobs, "bluestore_blobs", + "Number of blobs in cache"); + b.add_u64(l_bluestore_buffers, "bluestore_buffers", + "Number of buffers in cache"); + b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes", + "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes", + "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes", + "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES)); + + b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big", + "Large aligned writes into fresh blobs"); + b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes", + "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs", + "Large aligned writes into fresh blobs (blobs)"); + b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small", + "Small writes into existing or sparse small blobs"); + b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes", + "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_write_small_unused, + "bluestore_write_small_unused", + "Small writes into unused portion of existing blob"); + b.add_u64_counter(l_bluestore_write_small_deferred, + "bluestore_write_small_deferred", + "Small overwrites using deferred"); + b.add_u64_counter(l_bluestore_write_small_pre_read, + "bluestore_write_small_pre_read", + "Small writes that required we read some data (possibly " + "cached) to fill out the block"); + b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new", + "Small write into new (sparse) blob"); + + b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed"); + b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard", + "Onode extent map reshard events"); + b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split", + "Sum for blob splitting due to resharding"); + b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress", + "Sum for extents that have been removed due to compression"); + b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged", + "Sum for extents that have been merged due to garbage " + "collection"); + b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio", + "Read EIO errors propagated to high level callers"); + b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries", + "Read operations that required at least one retry due to failed checksum validation"); + b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros", + "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000"); + b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat", + "Average omap iterator seek_to_first call latency"); + b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat", + "Average omap iterator upper_bound call latency"); + 
b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat", + "Average omap iterator lower_bound call latency"); + b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat", + "Average omap iterator next call latency"); + b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat", + "Average omap get_keys call latency"); + b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat", + "Average omap get_values call latency"); + b.add_time_avg(l_bluestore_clist_lat, "clist_lat", + "Average collection listing latency"); + b.add_time_avg(l_bluestore_remove_lat, "remove_lat", + "Average removal latency"); + + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); +} + +int BlueStore::_reload_logger() +{ + struct store_statfs_t store_statfs; + int r = statfs(&store_statfs); + if (r >= 0) { + logger->set(l_bluestore_allocated, store_statfs.allocated); + logger->set(l_bluestore_stored, store_statfs.data_stored); + logger->set(l_bluestore_compressed, store_statfs.data_compressed); + logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated); + logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original); + } + return r; +} + +void BlueStore::_shutdown_logger() +{ + cct->get_perfcounters_collection()->remove(logger); + delete logger; +} + +int BlueStore::get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid) +{ + bluestore_bdev_label_t label; + int r = _read_bdev_label(cct, path, &label); + if (r < 0) + return r; + *fsid = label.osd_uuid; + return 0; +} + +int BlueStore::_open_path() +{ + // sanity check(s) + auto osd_max_object_size = + cct->_conf.get_val<Option::size_t>("osd_max_object_size"); + if (osd_max_object_size >= (size_t)OBJECT_MAX_SIZE) { + derr << __func__ << " osd_max_object_size >= 0x" << std::hex << OBJECT_MAX_SIZE + << "; BlueStore has hard limit of 0x" << OBJECT_MAX_SIZE << "." 
<< std::dec << dendl; + return -EINVAL; + } + ceph_assert(path_fd < 0); + path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC)); + if (path_fd < 0) { + int r = -errno; + derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r) + << dendl; + return r; + } + return 0; +} + +void BlueStore::_close_path() +{ + VOID_TEMP_FAILURE_RETRY(::close(path_fd)); + path_fd = -1; +} + +int BlueStore::_write_bdev_label(CephContext *cct, + string path, bluestore_bdev_label_t label) +{ + dout(10) << __func__ << " path " << path << " label " << label << dendl; + bufferlist bl; + encode(label, bl); + uint32_t crc = bl.crc32c(-1); + encode(crc, bl); + ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE); + bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length()); + z.zero(); + bl.append(std::move(z)); + + int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC)); + if (fd < 0) { + fd = -errno; + derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd) + << dendl; + return fd; + } + int r = bl.write_fd(fd); + if (r < 0) { + derr << __func__ << " failed to write to " << path + << ": " << cpp_strerror(r) << dendl; + goto out; + } + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " failed to fsync " << path + << ": " << cpp_strerror(r) << dendl; + } +out: + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; +} + +int BlueStore::_read_bdev_label(CephContext* cct, string path, + bluestore_bdev_label_t *label) +{ + dout(10) << __func__ << dendl; + int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC)); + if (fd < 0) { + fd = -errno; + derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd) + << dendl; + return fd; + } + bufferlist bl; + int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + if (r < 0) { + derr << __func__ << " failed to read from " << path + << ": " << cpp_strerror(r) << dendl; + return r; + } + + uint32_t crc, expected_crc; + auto p = bl.cbegin(); + try { + decode(*label, p); + bufferlist t; + t.substr_of(bl, 0, p.get_off()); + crc = t.crc32c(-1); + decode(expected_crc, p); + } + catch (buffer::error& e) { + dout(2) << __func__ << " unable to decode label at offset " << p.get_off() + << ": " << e.what() + << dendl; + return -ENOENT; + } + if (crc != expected_crc) { + derr << __func__ << " bad crc on label, expected " << expected_crc + << " != actual " << crc << dendl; + return -EIO; + } + dout(10) << __func__ << " got " << *label << dendl; + return 0; +} + +int BlueStore::_check_or_set_bdev_label( + string path, uint64_t size, string desc, bool create) +{ + bluestore_bdev_label_t label; + if (create) { + label.osd_uuid = fsid; + label.size = size; + label.btime = ceph_clock_now(); + label.description = desc; + int r = _write_bdev_label(cct, path, label); + if (r < 0) + return r; + } else { + int r = _read_bdev_label(cct, path, &label); + if (r < 0) + return r; + if (cct->_conf->bluestore_debug_permit_any_bdev_label) { + dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid + << " and fsid " << fsid << " check bypassed" << dendl; + } + else if (label.osd_uuid != fsid) { + derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid + << " does not match our fsid " << fsid << dendl; + return -EIO; + } + } + return 0; +} + +void BlueStore::_set_alloc_sizes(void) +{ + max_alloc_size = cct->_conf->bluestore_max_alloc_size; + + if (cct->_conf->bluestore_prefer_deferred_size) { + prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size; + } else { + 
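+    // No explicit bluestore_prefer_deferred_size was configured, so fall back
+    // to the media-specific default; this needs the block device already open
+    // so its rotational flag can be queried.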
ceph_assert(bdev); + if (bdev->is_rotational()) { + prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd; + } else { + prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd; + } + } + + if (cct->_conf->bluestore_deferred_batch_ops) { + deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops; + } else { + ceph_assert(bdev); + if (bdev->is_rotational()) { + deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd; + } else { + deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd; + } + } + + dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size + << std::dec << " order " << (int)min_alloc_size_order + << " max_alloc_size 0x" << std::hex << max_alloc_size + << " prefer_deferred_size 0x" << prefer_deferred_size + << std::dec + << " deferred_batch_ops " << deferred_batch_ops + << dendl; +} + +int BlueStore::_open_bdev(bool create) +{ + ceph_assert(bdev == NULL); + string p = path + "/block"; + bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this)); + int r = bdev->open(p); + if (r < 0) + goto fail; + + if (create && cct->_conf->bdev_enable_discard) { + bdev->discard(0, bdev->get_size()); + } + + if (bdev->supported_bdev_label()) { + r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create); + if (r < 0) + goto fail_close; + } + + // initialize global block parameters + block_size = bdev->get_block_size(); + block_mask = ~(block_size - 1); + block_size_order = ctz(block_size); + ceph_assert(block_size == 1u << block_size_order); + // and set cache_size based on device type + r = _set_cache_sizes(); + if (r < 0) { + goto fail_close; + } + return 0; + + fail_close: + bdev->close(); + fail: + delete bdev; + bdev = NULL; + return r; +} + +void BlueStore::_validate_bdev() +{ + ceph_assert(bdev); + ceph_assert(min_alloc_size); // _get_odisk_reserved depends on that + uint64_t dev_size = bdev->get_size(); + if (dev_size < + _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) { + dout(1) << __func__ << " main device size " << byte_u_t(dev_size) + << " is too small, disable bluestore_bluefs_min for now" + << dendl; + ceph_assert(dev_size >= _get_ondisk_reserved()); + + int r = cct->_conf.set_val("bluestore_bluefs_min", "0"); + ceph_assert(r == 0); + } +} + +void BlueStore::_close_bdev() +{ + ceph_assert(bdev); + bdev->close(); + delete bdev; + bdev = NULL; +} + +int BlueStore::_open_fm(KeyValueDB::Transaction t) +{ + ceph_assert(fm == NULL); + fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC); + ceph_assert(fm); + if (t) { + // create mode. initialize freespace + dout(20) << __func__ << " initializing freespace" << dendl; + { + bufferlist bl; + bl.append(freelist_type); + t->set(PREFIX_SUPER, "freelist_type", bl); + } + // being able to allocate in units less than bdev block size + // seems to be a bad idea. + ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size); + fm->create(bdev->get_size(), (int64_t)min_alloc_size, t); + + // allocate superblock reserved space. note that we do not mark + // bluefs space as allocated in the freelist; we instead rely on + // bluefs_extents. 
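+    // In other words, only the reserved head of the device is marked as used
+    // here; space gifted to BlueFS stays "free" from the FreelistManager's
+    // point of view and is subtracted from the allocator instead (see
+    // _open_alloc(), which calls init_rm_free() for every bluefs extent).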
+ auto reserved = _get_ondisk_reserved(); + fm->allocate(0, reserved, t); + + if (cct->_conf->bluestore_bluefs) { + ceph_assert(bluefs_extents.num_intervals() == 1); + interval_set<uint64_t>::iterator p = bluefs_extents.begin(); + reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size); + dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec + << " for bluefs" << dendl; + } + + if (cct->_conf->bluestore_debug_prefill > 0) { + uint64_t end = bdev->get_size() - reserved; + dout(1) << __func__ << " pre-fragmenting freespace, using " + << cct->_conf->bluestore_debug_prefill << " with max free extent " + << cct->_conf->bluestore_debug_prefragment_max << dendl; + uint64_t start = p2roundup(reserved, min_alloc_size); + uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size; + float r = cct->_conf->bluestore_debug_prefill; + r /= 1.0 - r; + bool stop = false; + + while (!stop && start < end) { + uint64_t l = (rand() % max_b + 1) * min_alloc_size; + if (start + l > end) { + l = end - start; + l = p2align(l, min_alloc_size); + } + ceph_assert(start + l <= end); + + uint64_t u = 1 + (uint64_t)(r * (double)l); + u = p2roundup(u, min_alloc_size); + if (start + l + u > end) { + u = end - (start + l); + // trim to align so we don't overflow again + u = p2align(u, min_alloc_size); + stop = true; + } + ceph_assert(start + l + u <= end); + + dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l + << " use 0x" << u << std::dec << dendl; + + if (u == 0) { + // break if u has been trimmed to nothing + break; + } + + fm->allocate(start + l, u, t); + start += l + u; + } + } + } + + int r = fm->init(db); + if (r < 0) { + derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl; + delete fm; + fm = NULL; + return r; + } + // if space size tracked by free list manager is that higher than actual + // dev size one can hit out-of-space allocation which will result + // in data loss and/or assertions + // Probably user altered the device size somehow. + // The only fix for now is to redeploy OSD. 
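+  // Illustrative example with hypothetical numbers: if the freelist was
+  // created while the device reported 1 TiB but the block device now reports
+  // only 900 GiB, allocations could land past the end of the device; the
+  // check below only raises a health alert, it does not shrink the freelist.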
+ if (fm->get_size() >= bdev->get_size() + min_alloc_size) { + ostringstream ss; + ss << "slow device size mismatch detected, " + << " fm size(" << fm->get_size() + << ") > slow device size(" << bdev->get_size() + << "), Please stop using this OSD as it might cause data loss."; + _set_disk_size_mismatch_alert(ss.str()); + } + return 0; +} + +void BlueStore::_close_fm() +{ + dout(10) << __func__ << dendl; + ceph_assert(fm); + fm->shutdown(); + delete fm; + fm = NULL; +} + +int BlueStore::_open_alloc() +{ + ceph_assert(alloc == NULL); + ceph_assert(bdev->get_size()); + + if (bluefs) { + bluefs_extents.clear(); + auto r = bluefs->get_block_extents(bluefs_shared_bdev, &bluefs_extents); + if (r < 0) { + lderr(cct) << __func__ << " failed to retrieve bluefs_extents: " + << cpp_strerror(r) << dendl; + + return r; + } + dout(10) << __func__ << " bluefs extents 0x" + << std::hex << bluefs_extents << std::dec + << dendl; + } + + alloc = Allocator::create(cct, cct->_conf->bluestore_allocator, + bdev->get_size(), + min_alloc_size, "block"); + if (!alloc) { + lderr(cct) << __func__ << " Allocator::unknown alloc type " + << cct->_conf->bluestore_allocator + << dendl; + return -EINVAL; + } + + uint64_t num = 0, bytes = 0; + + dout(1) << __func__ << " opening allocation metadata" << dendl; + // initialize from freelist + fm->enumerate_reset(); + uint64_t offset, length; + while (fm->enumerate_next(db, &offset, &length)) { + alloc->init_add_free(offset, length); + ++num; + bytes += length; + } + fm->enumerate_reset(); + + // also mark bluefs space as allocated + for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) { + alloc->init_rm_free(e.get_start(), e.get_len()); + } + + dout(1) << __func__ << " loaded " << byte_u_t(bytes) + << " in " << num << " extents" + << " available " << byte_u_t(alloc->get_free()) + << dendl; + + return 0; +} + +void BlueStore::_close_alloc() +{ + ceph_assert(bdev); + bdev->discard_drain(); + + ceph_assert(alloc); + alloc->shutdown(); + delete alloc; + alloc = NULL; + bluefs_extents.clear(); +} + +int BlueStore::_open_fsid(bool create) +{ + ceph_assert(fsid_fd < 0); + int flags = O_RDWR|O_CLOEXEC; + if (create) + flags |= O_CREAT; + fsid_fd = ::openat(path_fd, "fsid", flags, 0644); + if (fsid_fd < 0) { + int err = -errno; + derr << __func__ << " " << cpp_strerror(err) << dendl; + return err; + } + return 0; +} + +int BlueStore::_read_fsid(uuid_d *uuid) +{ + char fsid_str[40]; + memset(fsid_str, 0, sizeof(fsid_str)); + int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) { + derr << __func__ << " failed: " << cpp_strerror(ret) << dendl; + return ret; + } + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) { + derr << __func__ << " unparsable uuid " << fsid_str << dendl; + return -EINVAL; + } + return 0; +} + +int BlueStore::_write_fsid() +{ + int r = ::ftruncate(fsid_fd, 0); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl; + return r; + } + string str = stringify(fsid) + "\n"; + r = safe_write(fsid_fd, str.c_str(), str.length()); + if (r < 0) { + derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl; + return r; + } + r = ::fsync(fsid_fd); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +void BlueStore::_close_fsid() +{ + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +} + +int BlueStore::_lock_fsid() +{ + struct flock l; + 
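+  // The zeroed flock set up below, with F_WRLCK and l_start == l_len == 0,
+  // requests an advisory exclusive lock over the whole fsid file; F_SETLK is
+  // non-blocking, so if another ceph-osd already holds the lock this fails
+  // immediately instead of waiting.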
memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + derr << __func__ << " failed to lock " << path << "/fsid" + << " (is another ceph-osd still running?)" + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool BlueStore::is_rotational() +{ + if (bdev) { + return bdev->is_rotational(); + } + + bool rotational = true; + int r = _open_path(); + if (r < 0) + goto out; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + rotational = bdev->is_rotational(); + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + out: + return rotational; +} + +bool BlueStore::is_journal_rotational() +{ + if (!bluefs) { + dout(5) << __func__ << " bluefs disabled, default to store media type" + << dendl; + return is_rotational(); + } + dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl; + return bluefs->wal_is_rotational(); +} + +bool BlueStore::test_mount_in_use() +{ + // most error conditions mean the mount is not in use (e.g., because + // it doesn't exist). only if we fail to lock do we conclude it is + // in use. + bool ret = false; + int r = _open_path(); + if (r < 0) + return false; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _lock_fsid(); + if (r < 0) + ret = true; // if we can't lock, it is in use + _close_fsid(); + out_path: + _close_path(); + return ret; +} + +int BlueStore::_minimal_open_bluefs(bool create) +{ + int r; + bluefs = new BlueFS(cct); + + string bfn; + struct stat st; + + bfn = path + "/block.db"; + if (::stat(bfn.c_str(), &st) == 0) { + r = bluefs->add_block_device( + BlueFS::BDEV_DB, bfn, + create && cct->_conf->bdev_enable_discard); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) { + r = _check_or_set_bdev_label( + bfn, + bluefs->get_block_device_size(BlueFS::BDEV_DB), + "bluefs db", create); + if (r < 0) { + derr << __func__ + << " check block device(" << bfn << ") label returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + if (create) { + bluefs->add_block_extent( + BlueFS::BDEV_DB, + SUPER_RESERVED, + bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED); + } + bluefs_shared_bdev = BlueFS::BDEV_SLOW; + bluefs_single_shared_device = false; + } else { + r = -errno; + if (::lstat(bfn.c_str(), &st) == -1) { + r = 0; + bluefs_shared_bdev = BlueFS::BDEV_DB; + } else { + derr << __func__ << " " << bfn << " symlink exists but target unusable: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + // shared device + bfn = path + "/block"; + // never trim here + r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false, + true /* shared with bluestore */); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + if (create) { + // note: we always leave the first SUPER_RESERVED (8k) of the device unused + uint64_t initial = + bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio + + cct->_conf->bluestore_bluefs_gift_ratio); + initial = std::max(initial, cct->_conf->bluestore_bluefs_min); + uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size; + if (alloc_size % min_alloc_size) { + derr 
<< __func__ << " bluefs_shared_alloc_size 0x" << std::hex + << alloc_size << " is not a multiple of " + << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl; + r = -EINVAL; + goto free_bluefs; + } + // align to bluefs's alloc_size + initial = p2roundup(initial, alloc_size); + // put bluefs in the middle of the device in case it is an HDD + uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size); + //avoiding superblock overwrite + start = std::max(alloc_size, start); + ceph_assert(start >=_get_ondisk_reserved()); + + bluefs->add_block_extent(bluefs_shared_bdev, start, initial); + bluefs_extents.insert(start, initial); + ++out_of_sync_fm; + } + + bfn = path + "/block.wal"; + if (::stat(bfn.c_str(), &st) == 0) { + r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, + create && cct->_conf->bdev_enable_discard); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) { + r = _check_or_set_bdev_label( + bfn, + bluefs->get_block_device_size(BlueFS::BDEV_WAL), + "bluefs wal", create); + if (r < 0) { + derr << __func__ << " check block device(" << bfn + << ") label returned: " << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + if (create) { + bluefs->add_block_extent( + BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, + bluefs->get_block_device_size(BlueFS::BDEV_WAL) - + BDEV_LABEL_BLOCK_SIZE); + } + bluefs_single_shared_device = false; + } else { + r = 0; + if (::lstat(bfn.c_str(), &st) != -1) { + r = -errno; + derr << __func__ << " " << bfn << " symlink exists but target unusable: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + return 0; + +free_bluefs: + ceph_assert(bluefs); + delete bluefs; + bluefs = NULL; + return r; +} + +int BlueStore::_open_bluefs(bool create) +{ + int r = _minimal_open_bluefs(create); + if (r < 0) { + return r; + } + RocksDBBlueFSVolumeSelector* vselector = nullptr; + if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) { + + string options = cct->_conf->bluestore_rocksdb_options; + + rocksdb::Options rocks_opts; + int r = RocksDBStore::ParseOptionsFromStringStatic( + cct, + options, + rocks_opts, + nullptr); + if (r < 0) { + return r; + } + + double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor; + vselector = + new RocksDBBlueFSVolumeSelector( + bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, + bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, + bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100, + 1024 * 1024 * 1024, //FIXME: set expected l0 size here + rocks_opts.max_bytes_for_level_base, + rocks_opts.max_bytes_for_level_multiplier, + reserved_factor, + cct->_conf->bluestore_volume_selection_reserved, + cct->_conf->bluestore_volume_selection_policy != "rocksdb_original"); + } + if (create) { + bluefs->mkfs(fsid); + } + bluefs->set_volume_selector(vselector); + r = bluefs->mount(); + if (r < 0) { + derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl; + } + return r; +} + +void BlueStore::_close_bluefs(bool cold_close) +{ + bluefs->umount(cold_close); + _minimal_close_bluefs(); +} + +void BlueStore::_minimal_close_bluefs() +{ + delete bluefs; + bluefs = NULL; +} + +int BlueStore::_is_bluefs(bool create, bool* ret) +{ + if (create) { + *ret = cct->_conf->bluestore_bluefs; + } else { + string s; + int r = read_meta("bluefs", &s); + if (r < 0) { + derr << __func__ << " unable to read 'bluefs' meta" << dendl; + return -EIO; + } + 
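+    // mkfs() records this flag via write_meta("bluefs", ...) as a literal
+    // "0" or "1"; any other value is treated as corruption below.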
if (s == "1") { + *ret = true; + } else if (s == "0") { + *ret = false; + } else { + derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" + << dendl; + return -EIO; + } + } + return 0; +} + +/* +* opens both DB and dependant super_meta, FreelistManager and allocator +* in the proper order +*/ +int BlueStore::_open_db_and_around(bool read_only) +{ + int r; + bool do_bluefs = false; + _is_bluefs(false, &do_bluefs); // ignore err code + if (do_bluefs) { + // open in read-only first to read FM list and init allocator + // as they might be needed for some BlueFS procedures + r = _open_db(false, false, true); + if (r < 0) + return r; + + r = _open_super_meta(); + if (r < 0) { + goto out_db; + } + + r = _open_fm(nullptr); + if (r < 0) + goto out_db; + + r = _open_alloc(); + if (r < 0) + goto out_fm; + + // now open in R/W mode + if (!read_only) { + _close_db(true); + + r = _open_db(false, false, false); + if (r < 0) { + _close_alloc(); + _close_fm(); + return r; + } + } + } else { + r = _open_db(false, false); + if (r < 0) { + return r; + } + r = _open_super_meta(); + if (r < 0) { + goto out_db; + } + + r = _open_fm(nullptr); + if (r < 0) + goto out_db; + + r = _open_alloc(); + if (r < 0) + goto out_fm; + } + return 0; + + out_fm: + _close_fm(); + out_db: + _close_db(read_only); + return r; +} + +void BlueStore::_close_db_and_around(bool read_only) +{ + if (bluefs) { + if (!read_only && out_of_sync_fm.fetch_and(0)) { + _sync_bluefs_and_fm(); + } + _close_db(read_only); + while(!read_only && out_of_sync_fm.fetch_and(0)) { + // if seen some allocations during close - repeat open_db, sync fm, close + dout(0) << __func__ << " syncing FreelistManager" << dendl; + int r = _open_db(false, false, false); + if (r < 0) { + derr << __func__ + << " unable to open db, FreelistManager is probably out of sync" + << dendl; + break; + } + _sync_bluefs_and_fm(); + _close_db(false); + } + if (!_kv_only) { + _close_alloc(); + _close_fm(); + } + } else { + _close_alloc(); + _close_fm(); + _close_db(read_only); + } +} + +// updates legacy bluefs related recs in DB to a state valid for +// downgrades from nautilus. +void BlueStore::_sync_bluefs_and_fm() +{ + if (cct->_conf->bluestore_bluefs_db_compatibility) { + bufferlist bl; + encode(bluefs_extents, bl); + dout(20) << __func__ << " bluefs_extents at KV is now 0x" + << std::hex << bluefs_extents << std::dec + << dendl; + KeyValueDB::Transaction synct = db->get_transaction(); + synct->set(PREFIX_SUPER, "bluefs_extents", bl); + synct->set(PREFIX_SUPER, "bluefs_extents_back", bl); + + // Nice thing is that we don't need to update FreelistManager here. + // It always has corresponding bits set to 'Free' for both Nautilus+ and + // pre-Nautilis releases. + // So once we get an extent to bluefs_extents this means it's + // been free in allocator and hence it's free in FM too. 
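+    // Both keys carry the same encoded interval_set, keeping the legacy
+    // superblock records consistent for a potential downgrade.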
+ + db->submit_transaction_sync(synct); + } +} + +int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only) +{ + int r; + ceph_assert(!db); + ceph_assert(!(create && read_only)); + string fn = path + "/db"; + string options; + stringstream err; + std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator); + + string kv_backend; + std::vector<KeyValueDB::ColumnFamily> cfs; + + if (create) { + kv_backend = cct->_conf->bluestore_kvbackend; + } else { + r = read_meta("kv_backend", &kv_backend); + if (r < 0) { + derr << __func__ << " unable to read 'kv_backend' meta" << dendl; + return -EIO; + } + } + dout(10) << __func__ << " kv_backend = " << kv_backend << dendl; + + bool do_bluefs; + r = _is_bluefs(create, &do_bluefs); + if (r < 0) { + return r; + } + dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl; + + map<string,string> kv_options; + // force separate wal dir for all new deployments. + kv_options["separate_wal_dir"] = 1; + rocksdb::Env *env = NULL; + if (do_bluefs) { + dout(10) << __func__ << " initializing bluefs" << dendl; + if (kv_backend != "rocksdb") { + derr << " backend must be rocksdb to use bluefs" << dendl; + return -EINVAL; + } + + r = _open_bluefs(create); + if (r < 0) { + return r; + } + + if (cct->_conf->bluestore_bluefs_env_mirror) { + rocksdb::Env* a = new BlueRocksEnv(bluefs); + rocksdb::Env* b = rocksdb::Env::Default(); + if (create) { + string cmd = "rm -rf " + path + "/db " + + path + "/db.slow " + + path + "/db.wal"; + int r = system(cmd.c_str()); + (void)r; + } + env = new rocksdb::EnvMirror(b, a, false, true); + } else { + env = new BlueRocksEnv(bluefs); + + // simplify the dir names, too, as "seen" by rocksdb + fn = "db"; + } + bluefs->set_slow_device_expander(this); + BlueFSVolumeSelector::paths paths; + bluefs->get_vselector_paths(fn, paths); + + if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) { + // we have both block.db and block; tell rocksdb! + // note: the second (last) size value doesn't really matter + ostringstream db_paths; + bool first = true; + for (auto& p : paths) { + if (!first) { + db_paths << " "; + } + first = false; + db_paths << p.first << "," << p.second; + + } + kv_options["db_paths"] = db_paths.str(); + dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl; + } + + if (create) { + for (auto& p : paths) { + env->CreateDir(p.first); + } + // Selectors don't provide wal path so far hence create explicitly + env->CreateDir(fn + ".wal"); + } else { + std::vector<std::string> res; + // check for dir presence + auto r = env->GetChildren(fn+".wal", &res); + if (r.IsNotFound()) { + kv_options.erase("separate_wal_dir"); + } + } + } else { + string walfn = path + "/db.wal"; + + if (create) { + int r = ::mkdir(fn.c_str(), 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r) + << dendl; + return r; + } + + // wal_dir, too! 
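+      // When rocksdb runs directly on a filesystem (no BlueFS), <path>/db and
+      // <path>/db.wal are plain directories created here at mkfs time.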
+ r = ::mkdir(walfn.c_str(), 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << walfn + << ": " << cpp_strerror(r) + << dendl; + return r; + } + } else { + struct stat st; + r = ::stat(walfn.c_str(), &st); + if (r < 0 && errno == ENOENT) { + kv_options.erase("separate_wal_dir"); + } + } + } + + + db = KeyValueDB::create(cct, + kv_backend, + fn, + kv_options, + static_cast<void*>(env)); + if (!db) { + derr << __func__ << " error creating db" << dendl; + if (bluefs) { + _close_bluefs(read_only); + } + // delete env manually here since we can't depend on db to do this + // under this case + delete env; + env = NULL; + return -EIO; + } + + FreelistManager::setup_merge_operators(db); + db->set_merge_operator(PREFIX_STAT, merge_op); + db->set_cache_size(cache_kv_ratio * cache_size); + + if (kv_backend == "rocksdb") { + options = cct->_conf->bluestore_rocksdb_options; + + map<string,string> cf_map; + cct->_conf.with_val<string>("bluestore_rocksdb_cfs", + get_str_map, + &cf_map, + " \t"); + for (auto& i : cf_map) { + dout(10) << "column family " << i.first << ": " << i.second << dendl; + cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second)); + } + } + + db->init(options); + if (to_repair_db) + return 0; + if (create) { + if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) { + r = db->create_and_open(err, cfs); + } else { + r = db->create_and_open(err); + } + } else { + // we pass in cf list here, but it is only used if the db already has + // column families created. + r = read_only ? + db->open_read_only(err, cfs) : + db->open(err, cfs); + } + if (r) { + derr << __func__ << " erroring opening db: " << err.str() << dendl; + _close_db(read_only); + return -EIO; + } + dout(1) << __func__ << " opened " << kv_backend + << " path " << fn << " options " << options << dendl; + return 0; +} + +void BlueStore::_close_db(bool cold_close) +{ + ceph_assert(db); + delete db; + db = NULL; + if (bluefs) { + _close_bluefs(cold_close); + } +} + +void BlueStore::_dump_alloc_on_failure() +{ + auto dump_interval = + cct->_conf->bluestore_bluefs_alloc_failure_dump_interval; + if (dump_interval > 0 && + next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) { + alloc->dump(); + next_dump_on_bluefs_alloc_failure = ceph_clock_now(); + next_dump_on_bluefs_alloc_failure += dump_interval; + } +} + + +int BlueStore::allocate_bluefs_freespace( + uint64_t min_size, + uint64_t size, + PExtentVector* extents_out) +{ + ceph_assert(min_size <= size); + if (size) { + // round up to alloc size + uint64_t alloc_size = bluefs->get_alloc_size(bluefs_shared_bdev); + min_size = p2roundup(min_size, alloc_size); + size = p2roundup(size, alloc_size); + + PExtentVector extents_local; + PExtentVector* extents = extents_out ? extents_out : &extents_local; + + + uint64_t gift; + uint64_t allocated = 0; + int64_t alloc_len; + do { + // hard cap to fit into 32 bits + gift = std::min<uint64_t>(size, 1ull << 31); + dout(10) << __func__ << " gifting " << gift + << " (" << byte_u_t(gift) << ")" << dendl; + + alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents); + if (alloc_len > 0) { + allocated += alloc_len; + size -= alloc_len; + } + + if (alloc_len < 0 || + (alloc_len < (int64_t)gift && (min_size > allocated))) { + derr << __func__ + << " failed to allocate on 0x" << std::hex << gift + << " min_size 0x" << min_size + << " > allocated total 0x" << allocated + << " bluefs_shared_alloc_size 0x" << alloc_size + << " allocated 0x" << (alloc_len < 0 ? 
0 : alloc_len) + << " available 0x " << alloc->get_free() + << std::dec << dendl; + + _dump_alloc_on_failure(); + alloc->release(*extents); + extents->clear(); + return -ENOSPC; + } + } while (size && alloc_len > 0); + for (auto& e : *extents) { + dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl; + bluefs_extents.insert(e.offset, e.length); + ++out_of_sync_fm; + // apply to bluefs if not requested from outside + if (!extents_out) { + bluefs->add_block_extent(bluefs_shared_bdev, e.offset, e.length); + } + } + } + return 0; +} + +size_t BlueStore::available_freespace(uint64_t alloc_size) { + size_t total = 0; + auto iterated_allocation = [&](size_t off, size_t len) { + //only count in size that is alloc_size aligned + size_t dist_to_alignment; + size_t offset_in_block = off & (alloc_size - 1); + if (offset_in_block == 0) + dist_to_alignment = 0; + else + dist_to_alignment = alloc_size - offset_in_block; + if (dist_to_alignment >= len) + return; + len -= dist_to_alignment; + total += p2align(len, alloc_size); + }; + alloc->dump(iterated_allocation); + return total; +} + +int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total) +{ + float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total; + + uint64_t my_free = alloc->get_free(); + uint64_t total = bdev->get_size(); + float my_free_ratio = (float)my_free / (float)total; + + uint64_t total_free = bluefs_free + my_free; + + float bluefs_ratio = (float)bluefs_free / (float)total_free; + + dout(10) << __func__ + << " bluefs " << byte_u_t(bluefs_free) + << " free (" << bluefs_free_ratio + << ") bluestore " << byte_u_t(my_free) + << " free (" << my_free_ratio + << "), bluefs_ratio " << bluefs_ratio + << dendl; + + uint64_t gift = 0; + uint64_t reclaim = 0; + if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) { + gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free; + dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio + << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio + << ", should gift " << byte_u_t(gift) << dendl; + } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) { + reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free; + if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min) + reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min; + dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio + << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio + << ", should reclaim " << byte_u_t(reclaim) << dendl; + } + + // don't take over too much of the freespace + uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free; + if (bluefs_total < cct->_conf->bluestore_bluefs_min && + cct->_conf->bluestore_bluefs_min < free_cap) { + uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total; + dout(10) << __func__ << " bluefs_total " << bluefs_total + << " < min " << cct->_conf->bluestore_bluefs_min + << ", should gift " << byte_u_t(g) << dendl; + if (g > gift) + gift = g; + reclaim = 0; + } + uint64_t min_free = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free"); + if (bluefs_free < min_free && + min_free < free_cap) { + uint64_t g = min_free - bluefs_free; + dout(10) << __func__ << " bluefs_free " << bluefs_free + << " < min " << min_free + << ", should gift " << byte_u_t(g) << dendl; + if (g > gift) + gift = g; + reclaim = 0; + } + ceph_assert((int64_t)gift >= 0); + ceph_assert((int64_t)reclaim >= 0); + return gift > 0 ? 
(int64_t)gift : -(int64_t)reclaim; +} + +int BlueStore::_balance_bluefs_freespace() +{ + int ret = 0; + ceph_assert(bluefs); + + vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ... + bluefs->get_usage(&bluefs_usage); + ceph_assert(bluefs_usage.size() > bluefs_shared_bdev); + + bool clear_alert = true; + if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) { + auto& p = bluefs_usage[bluefs_shared_bdev]; + if (p.first != p.second) { + auto& db = bluefs_usage[BlueFS::BDEV_DB]; + ostringstream ss; + ss << "spilled over " << byte_u_t(p.second - p.first) + << " metadata from 'db' device (" << byte_u_t(db.second - db.first) + << " used of " << byte_u_t(db.second) << ") to slow device"; + _set_spillover_alert(ss.str()); + clear_alert = false; + } + } + if (clear_alert) { + _clear_spillover_alert(); + } + + // fixme: look at primary bdev only for now + int64_t delta = _get_bluefs_size_delta( + bluefs_usage[bluefs_shared_bdev].first, + bluefs_usage[bluefs_shared_bdev].second); + + // reclaim from bluefs? + if (delta < 0) { + // round up to alloc size + uint64_t alloc_size = bluefs->get_alloc_size(bluefs_shared_bdev); + auto reclaim = p2roundup(uint64_t(-delta), alloc_size); + + // hard cap to fit into 32 bits + reclaim = std::min<uint64_t>(reclaim, 1ull << 31); + dout(10) << __func__ << " reclaiming " << reclaim + << " (" << byte_u_t(reclaim) << ")" << dendl; + + while (reclaim > 0) { + // NOTE: this will block and do IO. + PExtentVector extents; + int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim, + &extents); + if (r < 0) { + derr << __func__ << " failed to reclaim space from bluefs" + << dendl; + break; + } + for (auto e : extents) { + ++out_of_sync_fm; + bluefs_extents.erase(e.offset, e.length); + bluefs_extents_reclaiming.insert(e.offset, e.length); + reclaim -= e.length; + } + } + + ret = 1; + } + + return ret; +} + +int BlueStore::_open_collections() +{ + dout(10) << __func__ << dendl; + collections_had_errors = false; + ceph_assert(coll_map.empty()); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (cid.parse(it->key())) { + CollectionRef c( + new Collection( + this, + cache_shards[cid.hash_to_shard(cache_shards.size())], + cid)); + bufferlist bl = it->value(); + auto p = bl.cbegin(); + try { + decode(c->cnode, p); + } catch (buffer::error& e) { + derr << __func__ << " failed to decode cnode, key:" + << pretty_binary_string(it->key()) << dendl; + return -EIO; + } + dout(20) << __func__ << " opened " << cid << " " << c + << " " << c->cnode << dendl; + _osr_attach(c.get()); + coll_map[cid] = c; + + } else { + derr << __func__ << " unrecognized collection " << it->key() << dendl; + collections_had_errors = true; + } + } + return 0; +} + +void BlueStore::_fsck_collections(int64_t* errors) +{ + if (collections_had_errors) { + dout(10) << __func__ << dendl; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (!cid.parse(it->key())) { + derr << __func__ << " unrecognized collection " << it->key() << dendl; + if (errors) { + (*errors)++; + } + } + } + } +} + +void BlueStore::_open_statfs() +{ + osd_pools.clear(); + vstatfs.reset(); + + bufferlist bl; + int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl); + if (r >= 0) { + per_pool_stat_collection = false; + if (size_t(bl.length()) >= sizeof(vstatfs.values)) { + auto it = bl.cbegin(); + vstatfs.decode(it); + dout(10) << __func__ << " 
store_statfs is found" << dendl; + } else { + dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl; + } + _check_legacy_statfs_alert(); + } else { + per_pool_stat_collection = true; + dout(10) << __func__ << " per-pool statfs is enabled" << dendl; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + + uint64_t pool_id; + int r = get_key_pool_stat(it->key(), &pool_id); + ceph_assert(r == 0); + + bufferlist bl; + bl = it->value(); + auto p = bl.cbegin(); + auto& st = osd_pools[pool_id]; + try { + st.decode(p); + vstatfs += st; + + dout(30) << __func__ << " pool " << pool_id + << " statfs " << st << dendl; + } catch (buffer::error& e) { + derr << __func__ << " failed to decode pool stats, key:" + << pretty_binary_string(it->key()) << dendl; + } + } + } + dout(30) << __func__ << " statfs " << vstatfs << dendl; + +} + +int BlueStore::_setup_block_symlink_or_file( + string name, + string epath, + uint64_t size, + bool create) +{ + dout(20) << __func__ << " name " << name << " path " << epath + << " size " << size << " create=" << (int)create << dendl; + int r = 0; + int flags = O_RDWR|O_CLOEXEC; + if (create) + flags |= O_CREAT; + if (epath.length()) { + r = ::symlinkat(epath.c_str(), path_fd, name.c_str()); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to create " << name << " symlink to " + << epath << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) { + int fd = ::openat(path_fd, epath.c_str(), flags, 0644); + if (fd < 0) { + r = -errno; + derr << __func__ << " failed to open " << epath << " file: " + << cpp_strerror(r) << dendl; + return r; + } + // write the Transport ID of the NVMe device + // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0" + // where "0000:02:00.0" is the selector of a PCI device, see + // the first column of "lspci -mm -n -D" + string trid{"trtype:PCIe "}; + trid += "traddr:"; + trid += epath.substr(strlen(SPDK_PREFIX)); + r = ::write(fd, trid.c_str(), trid.size()); + ceph_assert(r == static_cast<int>(trid.size())); + dout(1) << __func__ << " created " << name << " symlink to " + << epath << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + } + if (size) { + int fd = ::openat(path_fd, name.c_str(), flags, 0644); + if (fd >= 0) { + // block file is present + struct stat st; + int r = ::fstat(fd, &st); + if (r == 0 && + S_ISREG(st.st_mode) && // if it is a regular file + st.st_size == 0) { // and is 0 bytes + r = ::ftruncate(fd, size); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to resize " << name << " file to " + << size << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + + if (cct->_conf->bluestore_block_preallocate_file) { + r = ::ceph_posix_fallocate(fd, 0, size); + if (r > 0) { + derr << __func__ << " failed to prefallocate " << name << " file to " + << size << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return -r; + } + } + dout(1) << __func__ << " resized " << name << " file to " + << byte_u_t(size) << dendl; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } else { + int r = -errno; + if (r != -ENOENT) { + derr << __func__ << " failed to open " << name << " file: " + << cpp_strerror(r) << dendl; + return r; + } + } + } + return 0; +} + +int BlueStore::mkfs() +{ + dout(1) << __func__ << " path " << path << dendl; + int r; + uuid_d old_fsid; + + if (cct->_conf->osd_max_object_size > 
OBJECT_MAX_SIZE) { + derr << __func__ << " osd_max_object_size " + << cct->_conf->osd_max_object_size << " > bluestore max " + << OBJECT_MAX_SIZE << dendl; + return -EINVAL; + } + + { + string done; + r = read_meta("mkfs_done", &done); + if (r == 0) { + dout(1) << __func__ << " already created" << dendl; + if (cct->_conf->bluestore_fsck_on_mkfs) { + r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep); + if (r < 0) { + derr << __func__ << " fsck found fatal error: " << cpp_strerror(r) + << dendl; + return r; + } + if (r > 0) { + derr << __func__ << " fsck found " << r << " errors" << dendl; + r = -EIO; + } + } + return r; // idempotent + } + } + + { + string type; + r = read_meta("type", &type); + if (r == 0) { + if (type != "bluestore") { + derr << __func__ << " expected bluestore, but type is " << type << dendl; + return -EIO; + } + } else { + r = write_meta("type", "bluestore"); + if (r < 0) + return r; + } + } + + freelist_type = "bitmap"; + + r = _open_path(); + if (r < 0) + return r; + + r = _open_fsid(true); + if (r < 0) + goto out_path_fd; + + r = _lock_fsid(); + if (r < 0) + goto out_close_fsid; + + r = _read_fsid(&old_fsid); + if (r < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __func__ << " generated fsid " << fsid << dendl; + } else { + dout(1) << __func__ << " using provided fsid " << fsid << dendl; + } + // we'll write it later. + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __func__ << " on-disk fsid " << old_fsid + << " != provided " << fsid << dendl; + r = -EINVAL; + goto out_close_fsid; + } + fsid = old_fsid; + } + + r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path, + cct->_conf->bluestore_block_size, + cct->_conf->bluestore_block_create); + if (r < 0) + goto out_close_fsid; + if (cct->_conf->bluestore_bluefs) { + r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path, + cct->_conf->bluestore_block_wal_size, + cct->_conf->bluestore_block_wal_create); + if (r < 0) + goto out_close_fsid; + r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path, + cct->_conf->bluestore_block_db_size, + cct->_conf->bluestore_block_db_create); + if (r < 0) + goto out_close_fsid; + } + + r = _open_bdev(true); + if (r < 0) + goto out_close_fsid; + + // choose min_alloc_size + if (cct->_conf->bluestore_min_alloc_size) { + min_alloc_size = cct->_conf->bluestore_min_alloc_size; + } else { + ceph_assert(bdev); + if (bdev->is_rotational()) { + min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd; + } else { + min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd; + } + } + _validate_bdev(); + + // make sure min_alloc_size is power of 2 aligned. + if (!isp2(min_alloc_size)) { + derr << __func__ << " min_alloc_size 0x" + << std::hex << min_alloc_size << std::dec + << " is not power of 2 aligned!" 
+ << dendl; + r = -EINVAL; + goto out_close_bdev; + } + + r = _open_db(true); + if (r < 0) + goto out_close_bdev; + + { + KeyValueDB::Transaction t = db->get_transaction(); + r = _open_fm(t); + if (r < 0) + goto out_close_db; + { + bufferlist bl; + encode((uint64_t)0, bl); + t->set(PREFIX_SUPER, "nid_max", bl); + t->set(PREFIX_SUPER, "blobid_max", bl); + } + + { + bufferlist bl; + encode((uint64_t)min_alloc_size, bl); + t->set(PREFIX_SUPER, "min_alloc_size", bl); + } + + ondisk_format = latest_ondisk_format; + _prepare_ondisk_format_super(t); + db->submit_transaction_sync(t); + } + + r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend); + if (r < 0) + goto out_close_fm; + + r = write_meta("bluefs", stringify(bluefs ? 1 : 0)); + if (r < 0) + goto out_close_fm; + + if (fsid != old_fsid) { + r = _write_fsid(); + if (r < 0) { + derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl; + goto out_close_fm; + } + } + + if (out_of_sync_fm.fetch_and(0)) { + _sync_bluefs_and_fm(); + } + + out_close_fm: + _close_fm(); + out_close_db: + _close_db(false); + out_close_bdev: + _close_bdev(); + out_close_fsid: + _close_fsid(); + out_path_fd: + _close_path(); + + if (r == 0 && + cct->_conf->bluestore_fsck_on_mkfs) { + int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep); + if (rc < 0) + return rc; + if (rc > 0) { + derr << __func__ << " fsck found " << rc << " errors" << dendl; + r = -EIO; + } + } + + if (r == 0) { + // indicate success by writing the 'mkfs_done' file + r = write_meta("mkfs_done", "yes"); + } + + if (r < 0) { + derr << __func__ << " failed, " << cpp_strerror(r) << dendl; + } else { + dout(0) << __func__ << " success" << dendl; + } + return r; +} + +int BlueStore::_mount_for_bluefs() +{ + int r = _open_path(); + ceph_assert(r == 0); + r = _open_fsid(false); + ceph_assert(r == 0); + r = _read_fsid(&fsid); + ceph_assert(r == 0); + r = _lock_fsid(); + ceph_assert(r == 0); + r = _open_bluefs(false); + ceph_assert(r == 0); + return r; +} + +void BlueStore::_umount_for_bluefs() +{ + _close_bluefs(false); + _close_fsid(); + _close_path(); +} + +int BlueStore::add_new_bluefs_device(int id, const string& dev_path) +{ + dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; + int r; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + + r = _mount_for_bluefs(); + + int reserved = 0; + if (id == BlueFS::BDEV_NEWWAL) { + string p = path + "/block.wal"; + r = _setup_block_symlink_or_file("block.wal", dev_path, + cct->_conf->bluestore_block_wal_size, + true); + ceph_assert(r == 0); + + r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p, + cct->_conf->bdev_enable_discard); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { + r = _check_or_set_bdev_label( + p, + bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL), + "bluefs wal", + true); + ceph_assert(r == 0); + } + + reserved = BDEV_LABEL_BLOCK_SIZE; + } else if (id == BlueFS::BDEV_NEWDB) { + string p = path + "/block.db"; + r = _setup_block_symlink_or_file("block.db", dev_path, + cct->_conf->bluestore_block_db_size, + true); + ceph_assert(r == 0); + + r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p, + cct->_conf->bdev_enable_discard); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { + r = _check_or_set_bdev_label( + p, + 
bluefs->get_block_device_size(BlueFS::BDEV_NEWDB), + "bluefs db", + true); + ceph_assert(r == 0); + } + reserved = SUPER_RESERVED; + } + + bluefs->umount(); + bluefs->mount(); + + bluefs->add_block_extent( + id, + reserved, + bluefs->get_block_device_size(id) - reserved, true); + + r = bluefs->prepare_new_device(id); + ceph_assert(r == 0); + + if (r < 0) { + derr << __func__ << " failed, " << cpp_strerror(r) << dendl; + } else { + dout(0) << __func__ << " success" << dendl; + } + + _umount_for_bluefs(); + return r; +} + +int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source, + int id) +{ + dout(10) << __func__ << " id:" << id << dendl; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + + int r = _mount_for_bluefs(); + + // require bluestore_bluefs_min_free to be free at target device! + uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free"); + for(auto src_id : devs_source) { + used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id); + } + uint64_t target_free = bluefs->get_free(id); + if (id == BlueFS::BDEV_SLOW && target_free < used_space) { + // will need to remount full BlueStore instance to allocate more space + _umount_for_bluefs(); + + r = mount(); + ceph_assert(r == 0); + dout(1) << __func__ + << " Allocating more space at slow device for BlueFS: +" + << used_space - target_free << " bytes" << dendl; + r = allocate_bluefs_freespace( + used_space - target_free, + used_space - target_free, + nullptr); + + umount(); + if (r != 0) { + derr << __func__ + << " can't migrate, unable to allocate extra space: " + << used_space - target_free << " at target:" << id + << dendl; + return -ENOSPC; + } + + r = _mount_for_bluefs(); + ceph_assert(r == 0); + } else if (target_free < used_space) { + derr << __func__ + << " can't migrate, free space at target: " << target_free + << " is less than required space: " << used_space + << dendl; + return -ENOSPC; + } + r = bluefs->device_migrate_to_existing(cct, devs_source, id); + if (r < 0) { + derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl; + goto shutdown; + } + + if (devs_source.count(BlueFS::BDEV_DB)) { + r = unlink(string(path + "/block.db").c_str()); + ceph_assert(r == 0); + } + if (devs_source.count(BlueFS::BDEV_WAL)) { + r = unlink(string(path + "/block.wal").c_str()); + ceph_assert(r == 0); + } + +shutdown: + _umount_for_bluefs(); + return r; +} + +int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source, + int id, + const string& dev_path) +{ + dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; + int r; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + + r = _mount_for_bluefs(); + + int reserved = 0; + string link_db; + string link_wal; + if (devs_source.count(BlueFS::BDEV_DB) && + bluefs_shared_bdev != BlueFS::BDEV_DB) { + link_db = path + "/block.db"; + } + if (devs_source.count(BlueFS::BDEV_WAL)) { + link_wal = path + "/block.wal"; + } + + size_t target_size; + string target_name; + if (id == BlueFS::BDEV_NEWWAL) { + target_name = "block.wal"; + target_size = cct->_conf->bluestore_block_wal_size; + + r = 
bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path, + cct->_conf->bdev_enable_discard); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { + r = _check_or_set_bdev_label( + dev_path, + bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL), + "bluefs wal", + true); + ceph_assert(r == 0); + } + reserved = BDEV_LABEL_BLOCK_SIZE; + } else if (id == BlueFS::BDEV_NEWDB) { + target_name = "block.db"; + target_size = cct->_conf->bluestore_block_db_size; + + r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path, + cct->_conf->bdev_enable_discard); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { + r = _check_or_set_bdev_label( + dev_path, + bluefs->get_block_device_size(BlueFS::BDEV_NEWDB), + "bluefs db", + true); + ceph_assert(r == 0); + } + reserved = SUPER_RESERVED; + } + + bluefs->umount(); + bluefs->mount(); + + bluefs->add_block_extent( + id, reserved, bluefs->get_block_device_size(id) - reserved); + + r = bluefs->device_migrate_to_new(cct, devs_source, id); + + if (r < 0) { + derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl; + goto shutdown; + } + + if (!link_db.empty()) { + r = unlink(link_db.c_str()); + ceph_assert(r == 0); + } + if (!link_wal.empty()) { + r = unlink(link_wal.c_str()); + ceph_assert(r == 0); + } + r = _setup_block_symlink_or_file( + target_name, + dev_path, + target_size, + true); + ceph_assert(r == 0); + dout(0) << __func__ << " success" << dendl; + +shutdown: + _umount_for_bluefs(); + return r; +} + +string BlueStore::get_device_path(unsigned id) +{ + string res; + if (id < BlueFS::MAX_BDEV) { + switch (id) { + case BlueFS::BDEV_WAL: + res = path + "/block.wal"; + break; + case BlueFS::BDEV_DB: + if (id == bluefs_shared_bdev) { + res = path + "/block"; + } else { + res = path + "/block.db"; + } + break; + case BlueFS::BDEV_SLOW: + res = path + "/block"; + break; + } + } + return res; +} + +int BlueStore::expand_devices(ostream& out) +{ + int r = cold_open(); + ceph_assert(r == 0); + bluefs->dump_block_extents(out); + out << "Expanding DB/WAL..." << std::endl; + for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) { + if (devid == bluefs_shared_bdev ) { + continue; + } + uint64_t size = bluefs->get_block_device_size(devid); + if (size == 0) { + // no bdev + continue; + } + + interval_set<uint64_t> before; + bluefs->get_block_extents(devid, &before); + ceph_assert(!before.empty()); + uint64_t end = before.range_end(); + if (end < size) { + out << devid + <<" : expanding " << " from 0x" << std::hex + << end << " to 0x" << size << std::dec << std::endl; + bluefs->add_block_extent(devid, end, size-end); + string p = get_device_path(devid); + const char* path = p.c_str(); + if (path == nullptr) { + derr << devid + <<": can't find device path " << dendl; + continue; + } + bluestore_bdev_label_t label; + int r = _read_bdev_label(cct, path, &label); + if (r < 0) { + derr << "unable to read label for " << path << ": " + << cpp_strerror(r) << dendl; + continue; + } + label.size = size; + r = _write_bdev_label(cct, path, label); + if (r < 0) { + derr << "unable to write label for " << path << ": " + << cpp_strerror(r) << dendl; + continue; + } + out << devid + <<" : size label updated to " << size + << std::endl; + } + } + uint64_t size0 = fm->get_size(); + uint64_t size = bdev->get_size(); + cold_close(); + if (size0 < size) { + out << "Expanding Main..." 
<< std::endl; + int r = _mount(false); + ceph_assert(r == 0); + + out << bluefs_shared_bdev + <<" : expanding " << " from 0x" << std::hex + << size0 << " to 0x" << size << std::dec << std::endl; + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + r = fm->expand(size, txn); + ceph_assert(r == 0); + db->submit_transaction_sync(txn); + + // always reference to slow device here + string p = get_device_path(BlueFS::BDEV_SLOW); + ceph_assert(!p.empty()); + const char* path = p.c_str(); + bluestore_bdev_label_t label; + r = _read_bdev_label(cct, path, &label); + if (r < 0) { + derr << "unable to read label for " << path << ": " + << cpp_strerror(r) << dendl; + } else { + label.size = size; + r = _write_bdev_label(cct, path, label); + if (r < 0) { + derr << "unable to write label for " << path << ": " + << cpp_strerror(r) << dendl; + } else { + out << bluefs_shared_bdev + <<" : size label updated to " << size + << std::endl; + } + } + umount(); + } + return r; +} + +int BlueStore::dump_bluefs_sizes(ostream& out) +{ + int r = cold_open(); + ceph_assert(r == 0); + bluefs->dump_block_extents(out); + cold_close(); + return r; +} + +void BlueStore::set_cache_shards(unsigned num) +{ + dout(10) << __func__ << " " << num << dendl; + size_t old = cache_shards.size(); + ceph_assert(num >= old); + cache_shards.resize(num); + for (unsigned i = old; i < num; ++i) { + cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type, + logger); + } +} + +int BlueStore::_mount(bool kv_only, bool open_db) +{ + dout(1) << __func__ << " path " << path << dendl; + + _kv_only = kv_only; + + { + string type; + int r = read_meta("type", &type); + if (r < 0) { + derr << __func__ << " failed to load os-type: " << cpp_strerror(r) + << dendl; + return r; + } + + if (type != "bluestore") { + derr << __func__ << " expected bluestore, but type is " << type << dendl; + return -EIO; + } + } + + if (cct->_conf->bluestore_fsck_on_mount) { + int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep); + if (rc < 0) + return rc; + if (rc > 0) { + derr << __func__ << " fsck found " << rc << " errors" << dendl; + return -EIO; + } + } + + if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) { + derr << __func__ << " osd_max_object_size " + << cct->_conf->osd_max_object_size << " > bluestore max " + << OBJECT_MAX_SIZE << dendl; + return -EINVAL; + } + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + + if (open_db) { + r = _open_db_and_around(false); + } else { + // we can bypass db open exclusively in case of kv_only mode + ceph_assert(kv_only); + r = _open_db(false, true); + if (r < 0) + goto out_bdev; + } + + if (kv_only) + return 0; + + r = _upgrade_super(); + if (r < 0) { + goto out_db; + } + + r = _open_collections(); + if (r < 0) + goto out_db; + + r = _reload_logger(); + if (r < 0) + goto out_coll; + + _kv_start(); + + r = _deferred_replay(); + if (r < 0) + goto out_stop; + + mempool_thread.init(); + + if (!per_pool_stat_collection && + cct->_conf->bluestore_fsck_quick_fix_on_mount == true) { + dout(1) << __func__ << " quick-fix on mount" << dendl; + _fsck_on_open(FSCK_SHALLOW, true); + + //reread statfs + //FIXME minor: replace with actual open/close? 
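+    // Re-read the statfs records so the in-memory counters reflect whatever
+    // the shallow repair above may have rewritten.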
+ _open_statfs(); + + _check_legacy_statfs_alert(); + } + + mounted = true; + return 0; + + out_stop: + _kv_stop(); + out_coll: + _flush_cache(); + out_db: + _close_db_and_around(false); + out_bdev: + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + return r; +} + +int BlueStore::umount() +{ + ceph_assert(_kv_only || mounted); + dout(1) << __func__ << dendl; + + _osr_drain_all(); + + mounted = false; + if (!_kv_only) { + mempool_thread.shutdown(); + dout(20) << __func__ << " stopping kv thread" << dendl; + _kv_stop(); + _flush_cache(); + dout(20) << __func__ << " closing" << dendl; + + } + _close_db_and_around(false); + _close_bdev(); + _close_fsid(); + _close_path(); + + if (cct->_conf->bluestore_fsck_on_umount) { + int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep); + if (rc < 0) + return rc; + if (rc > 0) { + derr << __func__ << " fsck found " << rc << " errors" << dendl; + return -EIO; + } + } + return 0; +} + +int BlueStore::cold_open() +{ + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + r = _open_db_and_around(true); + if (r < 0) { + goto out_bdev; + } + return 0; + out_bdev: + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + return r; +} +int BlueStore::cold_close() +{ + _close_db_and_around(true); + _close_bdev(); + _close_fsid(); + _close_path(); + return 0; +} + +static void apply(uint64_t off, + uint64_t len, + uint64_t granularity, + BlueStore::mempool_dynamic_bitset &bitset, + std::function<void(uint64_t, + BlueStore::mempool_dynamic_bitset &)> f) { + auto end = round_up_to(off + len, granularity); + while (off < end) { + uint64_t pos = off / granularity; + f(pos, bitset); + off += granularity; + } +} + +int _fsck_sum_extents( + const PExtentVector& extents, + bool compressed, + store_statfs_t& expected_statfs) +{ + for (auto e : extents) { + if (!e.is_valid()) + continue; + expected_statfs.allocated += e.length; + if (compressed) { + expected_statfs.data_compressed_allocated += e.length; + } + } + return 0; +} + +int BlueStore::_fsck_check_extents( + const coll_t& cid, + const ghobject_t& oid, + const PExtentVector& extents, + bool compressed, + mempool_dynamic_bitset &used_blocks, + uint64_t granularity, + BlueStoreRepairer* repairer, + store_statfs_t& expected_statfs, + FSCKDepth depth) +{ + dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl; + int errors = 0; + for (auto e : extents) { + if (!e.is_valid()) + continue; + expected_statfs.allocated += e.length; + if (compressed) { + expected_statfs.data_compressed_allocated += e.length; + } + if (depth != FSCK_SHALLOW) { + bool already = false; + apply( + e.offset, e.length, granularity, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + if (bs.test(pos)) { + if (repairer) { + repairer->note_misreference( + pos * min_alloc_size, min_alloc_size, !already); + } + if (!already) { + derr << "fsck error: " << oid << " extent " << e + << " or a subset is already allocated (misreferenced)" << dendl; + ++errors; + already = true; + } + } + else + bs.set(pos); + }); + if (repairer) { + repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid); + } + + if (e.end() > bdev->get_size()) { + derr << "fsck error: " << oid << " extent " << e + << " past end of block device" << 
dendl; + ++errors; + } + } + } + return errors; +} + +void BlueStore::_fsck_check_pool_statfs( + BlueStore::per_pool_statfs& expected_pool_statfs, + int64_t& errors, + int64_t& warnings, + BlueStoreRepairer* repairer) +{ + auto it = db->get_iterator(PREFIX_STAT); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + string key = it->key(); + if (key == BLUESTORE_GLOBAL_STATFS_KEY) { + if (repairer) { + ++errors; + repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY); + derr << "fsck error: " << "legacy statfs record found, removing" + << dendl; + } + continue; + } + uint64_t pool_id; + if (get_key_pool_stat(key, &pool_id) < 0) { + derr << "fsck error: bad key " << key + << "in statfs namespece" << dendl; + if (repairer) { + repairer->remove_key(db, PREFIX_STAT, key); + } + ++errors; + continue; + } + + volatile_statfs vstatfs; + bufferlist bl = it->value(); + auto blp = bl.cbegin(); + try { + vstatfs.decode(blp); + } catch (buffer::error& e) { + derr << "fsck error: failed to decode Pool StatFS record" + << pretty_binary_string(key) << dendl; + if (repairer) { + dout(20) << __func__ << " undecodable Pool StatFS record, key:'" + << pretty_binary_string(key) + << "', removing" << dendl; + repairer->remove_key(db, PREFIX_STAT, key); + } + ++errors; + vstatfs.reset(); + } + auto stat_it = expected_pool_statfs.find(pool_id); + if (stat_it == expected_pool_statfs.end()) { + if (vstatfs.is_empty()) { + // we don't consider that as an error since empty pool statfs + // are left in DB for now + dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x" + << std::hex << pool_id << std::dec << dendl; + if (repairer) { + // but we need to increment error count in case of repair + // to have proper counters at the end + // (as repairer increments recovery counter anyway). 
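+          // (keeping the error and repaired counters balanced matters here,
+          // since _fsck ultimately returns errors - repaired, i.e. the number
+          // of issues left unfixed)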
+ ++errors; + } + } else { + derr << "fsck error: found stray Pool StatFS record for pool id 0x" + << std::hex << pool_id << std::dec << dendl; + ++errors; + } + if (repairer) { + repairer->remove_key(db, PREFIX_SHARED_BLOB, key); + } + continue; + } + store_statfs_t statfs; + vstatfs.publish(&statfs); + if (!(stat_it->second == statfs)) { + derr << "fsck error: actual " << statfs + << " != expected " << stat_it->second + << " for pool " + << std::hex << pool_id << std::dec << dendl; + if (repairer) { + repairer->fix_statfs(db, key, stat_it->second); + } + ++errors; + } + expected_pool_statfs.erase(stat_it); + } + } // if (it) + for (auto& s : expected_pool_statfs) { + if (s.second.is_zero()) { + // we might lack empty statfs recs in DB + continue; + } + derr << "fsck error: missing Pool StatFS record for pool " + << std::hex << s.first << std::dec << dendl; + if (repairer) { + string key; + get_pool_stat_key(s.first, &key); + repairer->fix_statfs(db, key, s.second); + } + ++errors; + } + if (!per_pool_stat_collection && + cct->_conf->bluestore_fsck_error_on_no_per_pool_stats && + repairer) { + // by virtue of running this method, we correct the top-level + // error of having global stats + repairer->inc_repaired(); + } +} + +BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow( + BlueStore::FSCKDepth depth, + int64_t pool_id, + BlueStore::CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& value, + mempool::bluestore_fsck::list<string>& expecting_shards, + map<BlobRef, bluestore_blob_t::unused_t>* referenced, + const BlueStore::FSCK_ObjectCtx& ctx) +{ + auto& errors = ctx.errors; + auto& num_objects = ctx.num_objects; + auto& num_extents = ctx.num_extents; + auto& num_blobs = ctx.num_blobs; + auto& num_sharded_objects = ctx.num_sharded_objects; + auto& num_spanning_blobs = ctx.num_spanning_blobs; + auto used_blocks = ctx.used_blocks; + auto sb_info_lock = ctx.sb_info_lock; + auto& sb_info = ctx.sb_info; + auto repairer = ctx.repairer; + + store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ? 
+ &ctx.expected_pool_statfs[pool_id] : + &ctx.expected_store_statfs; + + dout(10) << __func__ << " " << oid << dendl; + OnodeRef o; + o.reset(Onode::decode(c, oid, key, value)); + ++num_objects; + + num_spanning_blobs += o->extent_map.spanning_blob_map.size(); + + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + _dump_onode<30>(cct, *o); + // shards + if (!o->extent_map.shards.empty()) { + ++num_sharded_objects; + if (depth != FSCK_SHALLOW) { + for (auto& s : o->extent_map.shards) { + dout(20) << __func__ << " shard " << *s.shard_info << dendl; + expecting_shards.push_back(string()); + get_extent_shard_key(o->key, s.shard_info->offset, + &expecting_shards.back()); + if (s.shard_info->offset >= o->onode.size) { + derr << "fsck error: " << oid << " shard 0x" << std::hex + << s.shard_info->offset << " past EOF at 0x" << o->onode.size + << std::dec << dendl; + ++errors; + } + } + } + } + + // lextents + uint64_t pos = 0; + mempool::bluestore_fsck::map<BlobRef, + bluestore_blob_use_tracker_t> ref_map; + for (auto& l : o->extent_map.extent_map) { + dout(20) << __func__ << " " << l << dendl; + if (l.logical_offset < pos) { + derr << "fsck error: " << oid << " lextent at 0x" + << std::hex << l.logical_offset + << " overlaps with the previous, which ends at 0x" << pos + << std::dec << dendl; + ++errors; + } + if (depth != FSCK_SHALLOW && + o->extent_map.spans_shard(l.logical_offset, l.length)) { + derr << "fsck error: " << oid << " lextent at 0x" + << std::hex << l.logical_offset << "~" << l.length + << " spans a shard boundary" + << std::dec << dendl; + ++errors; + } + pos = l.logical_offset + l.length; + res_statfs->data_stored += l.length; + ceph_assert(l.blob); + const bluestore_blob_t& blob = l.blob->get_blob(); + + auto& ref = ref_map[l.blob]; + if (ref.is_empty()) { + uint32_t min_release_size = blob.get_release_size(min_alloc_size); + uint32_t l = blob.get_logical_length(); + ref.init(l, min_release_size); + } + ref.get( + l.blob_offset, + l.length); + ++num_extents; + if (depth != FSCK_SHALLOW && + blob.has_unused()) { + ceph_assert(referenced); + auto p = referenced->find(l.blob); + bluestore_blob_t::unused_t* pu; + if (p == referenced->end()) { + pu = &(*referenced)[l.blob]; + } + else { + pu = &p->second; + } + uint64_t blob_len = blob.get_logical_length(); + ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0); + ceph_assert(l.blob_offset + l.length <= blob_len); + uint64_t chunk_size = blob_len / (sizeof(*pu) * 8); + uint64_t start = l.blob_offset / chunk_size; + uint64_t end = + round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size; + for (auto i = start; i < end; ++i) { + (*pu) |= (1u << i); + } + } + } //for (auto& l : o->extent_map.extent_map) + + for (auto& i : ref_map) { + ++num_blobs; + const bluestore_blob_t& blob = i.first->get_blob(); + bool equal = + depth == FSCK_SHALLOW ? 
true : + i.first->get_blob_use_tracker().equal(i.second); + if (!equal) { + derr << "fsck error: " << oid << " blob " << *i.first + << " doesn't match expected ref_map " << i.second << dendl; + ++errors; + } + if (blob.is_compressed()) { + res_statfs->data_compressed += blob.get_compressed_payload_length(); + res_statfs->data_compressed_original += + i.first->get_referenced_bytes(); + } + if (blob.is_shared()) { + if (i.first->shared_blob->get_sbid() > blobid_max) { + derr << "fsck error: " << oid << " blob " << blob + << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max " + << blobid_max << dendl; + ++errors; + } + else if (i.first->shared_blob->get_sbid() == 0) { + derr << "fsck error: " << oid << " blob " << blob + << " marked as shared but has uninitialized sbid" + << dendl; + ++errors; + } + // the below lock is optional and provided in multithreading mode only + if (sb_info_lock) { + sb_info_lock->lock(); + } + sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()]; + ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid); + ceph_assert(sbi.pool_id == INT64_MIN || + sbi.pool_id == oid.hobj.get_logical_pool()); + sbi.cid = c->cid; + sbi.pool_id = oid.hobj.get_logical_pool(); + sbi.sb = i.first->shared_blob; + sbi.oids.push_back(oid); + sbi.compressed = blob.is_compressed(); + for (auto e : blob.get_extents()) { + if (e.is_valid()) { + sbi.ref_map.get(e.offset, e.length); + } + } + if (sb_info_lock) { + sb_info_lock->unlock(); + } + } else if (depth != FSCK_SHALLOW) { + ceph_assert(used_blocks); + errors += _fsck_check_extents(c->cid, oid, blob.get_extents(), + blob.is_compressed(), + *used_blocks, + fm->get_alloc_size(), + repairer, + *res_statfs, + depth); + } else { + errors += _fsck_sum_extents( + blob.get_extents(), + blob.is_compressed(), + *res_statfs); + } + } // for (auto& i : ref_map) + + { + auto &sbm = o->extent_map.spanning_blob_map; + size_t broken = 0; + BlobRef first_broken; + for (auto it = sbm.begin(); it != sbm.end();) { + auto it1 = it++; + if (ref_map.count(it1->second) == 0) { + if (!broken) { + first_broken = it1->second; + ++errors; + } + broken++; + if (repairer) { + sbm.erase(it1); + } + } + } + if (broken) { + derr << "fsck error: " << oid << " - " << broken + << " zombie spanning blob(s) found, the first one: " + << *first_broken << dendl; + if(repairer) { + auto txn = repairer->fix_spanning_blobs(db); + _record_onode(o, txn); + } + } + } + + return o; +} + +#include "common/WorkQueue.h" + +class ShallowFSCKThreadPool : public ThreadPool +{ +public: + ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) : + ThreadPool(cct_, nm, tn, n) { + } + void worker(ThreadPool::WorkThread* wt) override { + int next_wq = 0; + while (!_stop) { + next_wq %= work_queues.size(); + WorkQueue_ *wq = work_queues[next_wq++]; + + void* item = wq->_void_dequeue(); + if (item) { + processing++; + TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval); + wq->_void_process(item, tp_handle); + processing--; + } + } + } + template <size_t BatchLen> + struct FSCKWorkQueue : public ThreadPool::WorkQueue_ + { + struct Entry { + int64_t pool_id; + BlueStore::CollectionRef c; + ghobject_t oid; + string key; + bufferlist value; + }; + struct Batch { + std::atomic<size_t> running = { 0 }; + size_t entry_count = 0; + std::array<Entry, BatchLen> entries; + + int64_t errors = 0; + int64_t warnings = 0; + uint64_t num_objects = 0; + uint64_t num_extents = 0; + uint64_t num_blobs = 0; + uint64_t num_sharded_objects = 0; + uint64_t 
num_spanning_blobs = 0; + store_statfs_t expected_store_statfs; + BlueStore::per_pool_statfs expected_pool_statfs; + }; + + size_t batchCount; + BlueStore* store = nullptr; + + mempool::bluestore_fsck::list<string>* expecting_shards = nullptr; + ceph::mutex* sb_info_lock = nullptr; + BlueStore::sb_info_map_t* sb_info = nullptr; + BlueStoreRepairer* repairer = nullptr; + + Batch* batches = nullptr; + size_t last_batch_pos = 0; + bool batch_acquired = false; + + FSCKWorkQueue(std::string n, + size_t _batchCount, + BlueStore* _store, + mempool::bluestore_fsck::list<string>& _expecting_shards, + ceph::mutex* _sb_info_lock, + BlueStore::sb_info_map_t& _sb_info, + BlueStoreRepairer* _repairer) : + WorkQueue_(n, time_t(), time_t()), + batchCount(_batchCount), + store(_store), + expecting_shards(&_expecting_shards), + sb_info_lock(_sb_info_lock), + sb_info(&_sb_info), + repairer(_repairer) + { + batches = new Batch[batchCount]; + } + ~FSCKWorkQueue() { + delete[] batches; + } + + /// Remove all work items from the queue. + void _clear() override { + //do nothing + } + /// Check whether there is anything to do. + bool _empty() override { + ceph_assert(false); + } + + /// Get the next work item to process. + void* _void_dequeue() override { + size_t pos = rand() % batchCount; + size_t pos0 = pos; + do { + auto& batch = batches[pos]; + if (batch.running.fetch_add(1) == 0) { + if (batch.entry_count) { + return &batch; + } + } + batch.running--; + pos++; + pos %= batchCount; + } while (pos != pos0); + return nullptr; + } + /** @brief Process the work item. + * This function will be called several times in parallel + * and must therefore be thread-safe. */ + void _void_process(void* item, TPHandle& handle) override { + Batch* batch = (Batch*)item; + + BlueStore::FSCK_ObjectCtx ctx( + batch->errors, + batch->warnings, + batch->num_objects, + batch->num_extents, + batch->num_blobs, + batch->num_sharded_objects, + batch->num_spanning_blobs, + nullptr, // used_blocks + nullptr, // used_omap_head; + nullptr, // used_per_pool_omap_head; + nullptr, // used_pgmeta_omap_head; + sb_info_lock, + *sb_info, + batch->expected_store_statfs, + batch->expected_pool_statfs, + repairer); + + for (size_t i = 0; i < batch->entry_count; i++) { + auto& entry = batch->entries[i]; + + store->fsck_check_objects_shallow( + BlueStore::FSCK_SHALLOW, + entry.pool_id, + entry.c, + entry.oid, + entry.key, + entry.value, + *expecting_shards, + nullptr, // referenced + ctx); + } + //std::cout << "processed " << batch << std::endl; + batch->entry_count = 0; + batch->running--; + } + /** @brief Synchronously finish processing a work item. + * This function is called after _void_process with the global thread pool lock held, + * so at most one copy will execute simultaneously for a given thread pool. + * It can be used for non-thread-safe finalization. 
*/ + void _void_process_finish(void*) override { + ceph_assert(false); + } + + bool queue( + int64_t pool_id, + BlueStore::CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& value) { + bool res = false; + size_t pos0 = last_batch_pos; + if (!batch_acquired) { + do { + auto& batch = batches[last_batch_pos]; + if (batch.running.fetch_add(1) == 0) { + if (batch.entry_count < BatchLen) { + batch_acquired = true; + break; + } + } + batch.running.fetch_sub(1); + last_batch_pos++; + last_batch_pos %= batchCount; + } while (last_batch_pos != pos0); + } + if (batch_acquired) { + auto& batch = batches[last_batch_pos]; + ceph_assert(batch.running); + ceph_assert(batch.entry_count < BatchLen); + + auto& entry = batch.entries[batch.entry_count]; + entry.pool_id = pool_id; + entry.c = c; + entry.oid = oid; + entry.key = key; + entry.value = value; + + ++batch.entry_count; + if (batch.entry_count == BatchLen) { + batch_acquired = false; + batch.running.fetch_sub(1); + last_batch_pos++; + last_batch_pos %= batchCount; + } + res = true; + } + return res; + } + + void finalize(ThreadPool& tp, + BlueStore::FSCK_ObjectCtx& ctx) { + if (batch_acquired) { + auto& batch = batches[last_batch_pos]; + ceph_assert(batch.running); + batch.running.fetch_sub(1); + } + tp.stop(); + + for (size_t i = 0; i < batchCount; i++) { + auto& batch = batches[i]; + + //process leftovers if any + if (batch.entry_count) { + TPHandle tp_handle(store->cct, + nullptr, + timeout_interval, + suicide_interval); + ceph_assert(batch.running == 0); + + batch.running++; // just to be on-par with the regular call + _void_process(&batch, tp_handle); + } + ceph_assert(batch.entry_count == 0); + + ctx.errors += batch.errors; + ctx.warnings += batch.warnings; + ctx.num_objects += batch.num_objects; + ctx.num_extents += batch.num_extents; + ctx.num_blobs += batch.num_blobs; + ctx.num_sharded_objects += batch.num_sharded_objects; + ctx.num_spanning_blobs += batch.num_spanning_blobs; + ctx.expected_store_statfs.add(batch.expected_store_statfs); + + for (auto it = batch.expected_pool_statfs.begin(); + it != batch.expected_pool_statfs.end(); + it++) { + ctx.expected_pool_statfs[it->first].add(it->second); + } + } + } + }; +}; + +void BlueStore::_fsck_check_objects(FSCKDepth depth, + BlueStore::FSCK_ObjectCtx& ctx) +{ + //no need for the below lock when in non-shallow mode as + // there is no multithreading in this case + if (depth != FSCK_SHALLOW) { + ctx.sb_info_lock = nullptr; + } + + auto& errors = ctx.errors; + auto used_omap_head = ctx.used_omap_head; + auto used_pgmeta_omap_head = ctx.used_pgmeta_omap_head; + auto sb_info_lock = ctx.sb_info_lock; + auto& sb_info = ctx.sb_info; + auto repairer = ctx.repairer; + + uint64_t_btree_t used_nids; + + size_t processed_myself = 0; + + auto it = db->get_iterator(PREFIX_OBJ); + mempool::bluestore_fsck::list<string> expecting_shards; + if (it) { + const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads; + typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ; + std::unique_ptr<WQ> wq( + new WQ( + "FSCKWorkQueue", + (thread_count ? 
: 1) * 32, + this, + expecting_shards, + sb_info_lock, + sb_info, + repairer)); + + ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count); + + thread_pool.add_work_queue(wq.get()); + if (depth == FSCK_SHALLOW && thread_count > 0) { + //not the best place but let's check anyway + ceph_assert(sb_info_lock); + thread_pool.start(); + } + + //fill global if not overriden below + CollectionRef c; + int64_t pool_id = -1; + spg_t pgid; + for (it->lower_bound(string()); it->valid(); it->next()) { + dout(30) << __func__ << " key " + << pretty_binary_string(it->key()) << dendl; + if (is_extent_shard_key(it->key())) { + if (depth == FSCK_SHALLOW) { + continue; + } + while (!expecting_shards.empty() && + expecting_shards.front() < it->key()) { + derr << "fsck error: missing shard key " + << pretty_binary_string(expecting_shards.front()) + << dendl; + ++errors; + expecting_shards.pop_front(); + } + if (!expecting_shards.empty() && + expecting_shards.front() == it->key()) { + // all good + expecting_shards.pop_front(); + continue; + } + + uint32_t offset; + string okey; + get_key_extent_shard(it->key(), &okey, &offset); + derr << "fsck error: stray shard 0x" << std::hex << offset + << std::dec << dendl; + if (expecting_shards.empty()) { + derr << "fsck error: " << pretty_binary_string(it->key()) + << " is unexpected" << dendl; + ++errors; + continue; + } + while (expecting_shards.front() > it->key()) { + derr << "fsck error: saw " << pretty_binary_string(it->key()) + << dendl; + derr << "fsck error: exp " + << pretty_binary_string(expecting_shards.front()) << dendl; + ++errors; + expecting_shards.pop_front(); + if (expecting_shards.empty()) { + break; + } + } + continue; + } + + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0) { + derr << "fsck error: bad object key " + << pretty_binary_string(it->key()) << dendl; + ++errors; + continue; + } + if (!c || + oid.shard_id != pgid.shard || + oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || + !c->contains(oid)) { + c = nullptr; + for (auto& p : coll_map) { + if (p.second->contains(oid)) { + c = p.second; + break; + } + } + if (!c) { + derr << "fsck error: stray object " << oid + << " not owned by any collection" << dendl; + ++errors; + continue; + } + pool_id = c->cid.is_pg(&pgid) ? 
pgid.pool() : META_POOL_ID; + dout(20) << __func__ << " collection " << c->cid << " " << c->cnode + << dendl; + } + + if (depth != FSCK_SHALLOW && + !expecting_shards.empty()) { + for (auto& k : expecting_shards) { + derr << "fsck error: missing shard key " + << pretty_binary_string(k) << dendl; + } + ++errors; + expecting_shards.clear(); + } + + bool queued = false; + if (depth == FSCK_SHALLOW && thread_count > 0) { + queued = wq->queue( + pool_id, + c, + oid, + it->key(), + it->value()); + } + OnodeRef o; + map<BlobRef, bluestore_blob_t::unused_t> referenced; + + if (!queued) { + ++processed_myself; + + o = fsck_check_objects_shallow( + depth, + pool_id, + c, + oid, + it->key(), + it->value(), + expecting_shards, + &referenced, + ctx); + } + + if (depth != FSCK_SHALLOW) { + ceph_assert(o != nullptr); + if (o->onode.nid) { + if (o->onode.nid > nid_max) { + derr << "fsck error: " << oid << " nid " << o->onode.nid + << " > nid_max " << nid_max << dendl; + ++errors; + } + if (used_nids.count(o->onode.nid)) { + derr << "fsck error: " << oid << " nid " << o->onode.nid + << " already in use" << dendl; + ++errors; + continue; // go for next object + } + used_nids.insert(o->onode.nid); + } + for (auto& i : referenced) { + dout(20) << __func__ << " referenced 0x" << std::hex << i.second + << std::dec << " for " << *i.first << dendl; + const bluestore_blob_t& blob = i.first->get_blob(); + if (i.second & blob.unused) { + derr << "fsck error: " << oid << " blob claims unused 0x" + << std::hex << blob.unused + << " but extents reference 0x" << i.second << std::dec + << " on blob " << *i.first << dendl; + ++errors; + } + if (blob.has_csum()) { + uint64_t blob_len = blob.get_logical_length(); + uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8); + unsigned csum_count = blob.get_csum_count(); + unsigned csum_chunk_size = blob.get_csum_chunk_size(); + for (unsigned p = 0; p < csum_count; ++p) { + unsigned pos = p * csum_chunk_size; + unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit] + unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size; + unsigned mask = 1u << firstbit; + for (unsigned b = firstbit + 1; b <= lastbit; ++b) { + mask |= 1u << b; + } + if ((blob.unused & mask) == mask) { + // this csum chunk region is marked unused + if (blob.get_csum_item(p) != 0) { + derr << "fsck error: " << oid + << " blob claims csum chunk 0x" << std::hex << pos + << "~" << csum_chunk_size + << " is unused (mask 0x" << mask << " of unused 0x" + << blob.unused << ") but csum is non-zero 0x" + << blob.get_csum_item(p) << std::dec << " on blob " + << *i.first << dendl; + ++errors; + } + } + } + } + } + // omap + if (o->onode.has_omap()) { + ceph_assert(used_omap_head); + ceph_assert(used_pgmeta_omap_head); + auto m = + o->onode.is_pgmeta_omap() ? 
used_pgmeta_omap_head : used_omap_head; + if (m->count(o->onode.nid)) { + derr << "fsck error: " << oid << " omap_head " << o->onode.nid + << " already in use" << dendl; + ++errors; + } else { + m->insert(o->onode.nid); + } + } + if (depth == FSCK_DEEP) { + bufferlist bl; + uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap; + uint64_t offset = 0; + do { + uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block); + int r = _do_read(c.get(), o, offset, l, bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + if (r < 0) { + ++errors; + derr << "fsck error: " << oid << std::hex + << " error during read: " + << " " << offset << "~" << l + << " " << cpp_strerror(r) << std::dec + << dendl; + break; + } + offset += l; + } while (offset < o->onode.size); + } // deep + } //if (depth != FSCK_SHALLOW) + } // for (it->lower_bound(string()); it->valid(); it->next()) + if (depth == FSCK_SHALLOW && thread_count > 0) { + wq->finalize(thread_pool, ctx); + if (processed_myself) { + // may be needs more threads? + dout(0) << __func__ << " partial offload" + << ", done myself " << processed_myself + << " of " << ctx.num_objects + << "objects, threads " << thread_count + << dendl; + } + } + } // if (it) +} +/** +An overview for currently implemented repair logics +performed in fsck in two stages: detection(+preparation) and commit. +Detection stage (in processing order): + (Issue -> Repair action to schedule) + - Detect undecodable keys for Shared Blobs -> Remove + - Detect undecodable records for Shared Blobs -> Remove + (might trigger missed Shared Blob detection below) + - Detect stray records for Shared Blobs -> Remove + - Detect misreferenced pextents -> Fix + Prepare Bloom-like filter to track cid/oid -> pextent + Prepare list of extents that are improperly referenced + Enumerate Onode records that might use 'misreferenced' pextents + (Bloom-like filter applied to reduce computation) + Per each questinable Onode enumerate all blobs and identify broken ones + (i.e. blobs having 'misreferences') + Rewrite each broken blob data by allocating another extents and + copying data there + If blob is shared - unshare it and mark corresponding Shared Blob + for removal + Release previously allocated space + Update Extent Map + - Detect missed Shared Blobs -> Recreate + - Detect undecodable deferred transaction -> Remove + - Detect Freelist Manager's 'false free' entries -> Mark as used + - Detect Freelist Manager's leaked entries -> Mark as free + - Detect statfs inconsistency - Update + Commit stage (separate DB commit per each step): + - Apply leaked FM entries fix + - Apply 'false free' FM entries fix + - Apply 'Remove' actions + - Apply fix for misreference pextents + - Apply Shared Blob recreate + (can be merged with the step above if misreferences were dectected) + - Apply StatFS update +*/ +int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair) +{ + dout(1) << __func__ + << (repair ? " repair" : " check") + << (depth == FSCK_DEEP ? " (deep)" : + depth == FSCK_SHALLOW ? 
" (shallow)" : " (regular)") + << dendl; + + // in deep mode we need R/W write access to be able to replay deferred ops + bool read_only = !(repair || depth == FSCK_DEEP); + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + + r = _open_db_and_around(read_only); + if (r < 0) + goto out_bdev; + + if (!read_only) { + r = _upgrade_super(); + if (r < 0) { + goto out_db; + } + } + + r = _open_collections(); + if (r < 0) + goto out_db; + + mempool_thread.init(); + + // we need finisher and kv_{sync,finalize}_thread *just* for replay + // enable in repair or deep mode modes only + if (!read_only) { + _kv_start(); + r = _deferred_replay(); + _kv_stop(); + } + if (r < 0) + goto out_scan; + + r = _fsck_on_open(depth, repair); + +out_scan: + mempool_thread.shutdown(); + _flush_cache(); +out_db: + _close_db_and_around(false); +out_bdev: + _close_bdev(); +out_fsid: + _close_fsid(); +out_path: + _close_path(); + + return r; +} + +int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) +{ + dout(1) << __func__ + << " <<<START>>>" + << (repair ? " repair" : " check") + << (depth == FSCK_DEEP ? " (deep)" : + depth == FSCK_SHALLOW ? " (shallow)" : " (regular)") + << " start" << dendl; + int64_t errors = 0; + int64_t warnings = 0; + unsigned repaired = 0; + + uint64_t_btree_t used_omap_head; + uint64_t_btree_t used_per_pool_omap_head; + uint64_t_btree_t used_pgmeta_omap_head; + uint64_t_btree_t used_sbids; + + mempool_dynamic_bitset used_blocks; + KeyValueDB::Iterator it; + store_statfs_t expected_store_statfs, actual_statfs; + per_pool_statfs expected_pool_statfs; + + sb_info_map_t sb_info; + + uint64_t num_objects = 0; + uint64_t num_extents = 0; + uint64_t num_blobs = 0; + uint64_t num_spanning_blobs = 0; + uint64_t num_shared_blobs = 0; + uint64_t num_sharded_objects = 0; + BlueStoreRepairer repairer; + + utime_t start = ceph_clock_now(); + + _fsck_collections(&errors); + used_blocks.resize(fm->get_alloc_units()); + apply( + 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + bs.set(pos); + } + ); + if (repair) { + repairer.get_space_usage_tracker().init( + bdev->get_size(), + min_alloc_size); + } + + if (bluefs) { + if( cct->_conf->bluestore_bluefs_db_compatibility) { + interval_set<uint64_t> bluefs_extents_db; + bufferlist bl; + db->get(PREFIX_SUPER, "bluefs_extents", &bl); + auto p = bl.cbegin(); + auto prev_errors = errors; + try { + decode(bluefs_extents_db, p); + bluefs_extents_db.union_of(bluefs_extents); + bluefs_extents_db.subtract(bluefs_extents); + if (!bluefs_extents_db.empty()) { + derr << "fsck error: bluefs_extents inconsistency, " + << "downgrade to previous releases might be broken." 
+ << dendl; + ++errors; + } + } + catch (buffer::error& e) { + derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl; + ++errors; + } + if (errors != prev_errors && repair) { + repairer.fix_bluefs_extents(out_of_sync_fm); + } + } + + for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) { + apply( + e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + bs.set(pos); + }); + } + int r = bluefs->fsck(); + if (r < 0) { + return r; + } + if (r > 0) + errors += r; + } + + if (!per_pool_stat_collection) { + const char *w; + if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) { + w = "error"; + ++errors; + } else { + w = "warning"; + ++warnings; + } + derr << "fsck " << w << ": store not yet converted to per-pool stats" + << dendl; + } + // get expected statfs; reset unaffected fields to be able to compare + // structs + statfs(&actual_statfs); + actual_statfs.total = 0; + actual_statfs.internally_reserved = 0; + actual_statfs.available = 0; + actual_statfs.internal_metadata = 0; + actual_statfs.omap_allocated = 0; + + if (g_conf()->bluestore_debug_fsck_abort) { + dout(1) << __func__ << " debug abort" << dendl; + goto out_scan; + } + // walk PREFIX_OBJ + { + dout(1) << __func__ << " walking object keyspace" << dendl; + ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock"); + BlueStore::FSCK_ObjectCtx ctx( + errors, + warnings, + num_objects, + num_extents, + num_blobs, + num_sharded_objects, + num_spanning_blobs, + &used_blocks, + &used_omap_head, + nullptr, + &used_pgmeta_omap_head, + &sb_info_lock, + sb_info, + expected_store_statfs, + expected_pool_statfs, + repair ? &repairer : nullptr); + _fsck_check_objects(depth, + ctx); + } + + dout(1) << __func__ << " checking shared_blobs" << dendl; + it = db->get_iterator(PREFIX_SHARED_BLOB); + if (it) { + // FIXME minor: perhaps simplify for shallow mode? 
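+    // This pass cross-checks every record in the shared blob namespace
+    // against the sb_info map accumulated while walking the objects:
+    // undecodable or stray records are scheduled for removal, ref_map
+    // mismatches are flagged (and rewritten later, in repair mode only),
+    // and the shared extents are fed through _fsck_check_extents so they
+    // are accounted for in used_blocks and the expected statfs.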
+ // fill global if not overriden below + auto expected_statfs = &expected_store_statfs; + + for (it->lower_bound(string()); it->valid(); it->next()) { + string key = it->key(); + uint64_t sbid; + if (get_key_shared_blob(key, &sbid)) { + derr << "fsck error: bad key '" << key + << "' in shared blob namespace" << dendl; + if (repair) { + repairer.remove_key(db, PREFIX_SHARED_BLOB, key); + } + ++errors; + continue; + } + auto p = sb_info.find(sbid); + if (p == sb_info.end()) { + derr << "fsck error: found stray shared blob data for sbid 0x" + << std::hex << sbid << std::dec << dendl; + if (repair) { + repairer.remove_key(db, PREFIX_SHARED_BLOB, key); + } + ++errors; + } else { + ++num_shared_blobs; + sb_info_t& sbi = p->second; + bluestore_shared_blob_t shared_blob(sbid); + bufferlist bl = it->value(); + auto blp = bl.cbegin(); + try { + decode(shared_blob, blp); + } catch (buffer::error& e) { + ++errors; + // Force update and don't report as missing + sbi.updated = sbi.passed = true; + + derr << "fsck error: failed to decode Shared Blob" + << pretty_binary_string(it->key()) << dendl; + if (repair) { + dout(20) << __func__ << " undecodable Shared Blob, key:'" + << pretty_binary_string(it->key()) + << "', removing" << dendl; + repairer.remove_key(db, PREFIX_DEFERRED, it->key()); + } + continue; + } + dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl; + if (shared_blob.ref_map != sbi.ref_map) { + derr << "fsck error: shared blob 0x" << std::hex << sbid + << std::dec << " ref_map " << shared_blob.ref_map + << " != expected " << sbi.ref_map << dendl; + sbi.updated = true; // will update later in repair mode only! + ++errors; + } + PExtentVector extents; + for (auto &r : shared_blob.ref_map.ref_map) { + extents.emplace_back(bluestore_pextent_t(r.first, r.second.length)); + } + if (per_pool_stat_collection || repair) { + expected_statfs = &expected_pool_statfs[sbi.pool_id]; + } + errors += _fsck_check_extents(sbi.cid, + p->second.oids.front(), + extents, + p->second.compressed, + used_blocks, + fm->get_alloc_size(), + repair ? &repairer : nullptr, + *expected_statfs, + depth); + sbi.passed = true; + } + } + } // if (it) + + if (repair && repairer.preprocess_misreference(db)) { + + dout(1) << __func__ << " sorting out misreferenced extents" << dendl; + auto& space_tracker = repairer.get_space_usage_tracker(); + auto& misref_extents = repairer.get_misreferences(); + interval_set<uint64_t> to_release; + it = db->get_iterator(PREFIX_OBJ); + if (it) { + // fill global if not overriden below + auto expected_statfs = &expected_store_statfs; + + CollectionRef c; + spg_t pgid; + KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn(); + bool bypass_rest = false; + for (it->lower_bound(string()); it->valid() && !bypass_rest; + it->next()) { + dout(30) << __func__ << " key " + << pretty_binary_string(it->key()) << dendl; + if (is_extent_shard_key(it->key())) { + continue; + } + + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0 || !space_tracker.is_used(oid)) { + continue; + } + + if (!c || + oid.shard_id != pgid.shard || + oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || + !c->contains(oid)) { + c = nullptr; + for (auto& p : coll_map) { + if (p.second->contains(oid)) { + c = p.second; + break; + } + } + if (!c) { + continue; + } + if (per_pool_stat_collection || repair) { + auto pool_id = c->cid.is_pg(&pgid) ? 
pgid.pool() : META_POOL_ID; + expected_statfs = &expected_pool_statfs[pool_id]; + } + } + if (!space_tracker.is_used(c->cid)) { + continue; + } + + dout(20) << __func__ << " check misreference for col:" << c->cid + << " obj:" << oid << dendl; + + OnodeRef o; + o.reset(Onode::decode(c, oid, it->key(), it->value())); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + mempool::bluestore_fsck::set<BlobRef> blobs; + + for (auto& e : o->extent_map.extent_map) { + blobs.insert(e.blob); + } + bool need_onode_update = false; + bool first_dump = true; + for(auto b : blobs) { + bool broken_blob = false; + auto& pextents = b->dirty_blob().dirty_extents(); + for (auto& e : pextents) { + if (!e.is_valid()) { + continue; + } + // for the sake of simplicity and proper shared blob handling + // always rewrite the whole blob even when it's partially + // misreferenced. + if (misref_extents.intersects(e.offset, e.length)) { + if (first_dump) { + first_dump = false; + _dump_onode<10>(cct, *o); + } + broken_blob = true; + break; + } + } + if (!broken_blob) + continue; + bool compressed = b->get_blob().is_compressed(); + need_onode_update = true; + dout(10) << __func__ + << " fix misreferences in oid:" << oid + << " " << *b << dendl; + uint64_t b_off = 0; + PExtentVector pext_to_release; + pext_to_release.reserve(pextents.size()); + // rewriting all valid pextents + for (auto e = pextents.begin(); e != pextents.end(); + b_off += e->length, e++) { + if (!e->is_valid()) { + continue; + } + PExtentVector exts; + int64_t alloc_len = alloc->allocate(e->length, min_alloc_size, + 0, 0, &exts); + if (alloc_len < 0 || alloc_len < (int64_t)e->length) { + derr << __func__ + << " failed to allocate 0x" << std::hex << e->length + << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len) + << " min_alloc_size 0x" << min_alloc_size + << " available 0x " << alloc->get_free() + << std::dec << dendl; + if (alloc_len > 0) { + alloc->release(exts); + } + bypass_rest = true; + break; + } + expected_statfs->allocated += e->length; + if (compressed) { + expected_statfs->data_compressed_allocated += e->length; + } + + bufferlist bl; + IOContext ioc(cct, NULL, true); // allow EIO + r = bdev->read(e->offset, e->length, &bl, &ioc, false); + if (r < 0) { + derr << __func__ << " failed to read from 0x" << std::hex << e->offset + <<"~" << e->length << std::dec << dendl; + ceph_abort_msg("read failed, wtf"); + } + pext_to_release.push_back(*e); + e = pextents.erase(e); + e = pextents.insert(e, exts.begin(), exts.end()); + b->get_blob().map_bl( + b_off, bl, + [&](uint64_t offset, bufferlist& t) { + int r = bdev->write(offset, t, false); + ceph_assert(r == 0); + }); + e += exts.size() - 1; + for (auto& p : exts) { + fm->allocate(p.offset, p.length, txn); + } + } // for (auto e = pextents.begin(); e != pextents.end(); e++) { + + if (b->get_blob().is_shared()) { + b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED); + + auto sb_it = sb_info.find(b->shared_blob->get_sbid()); + ceph_assert(sb_it != sb_info.end()); + sb_info_t& sbi = sb_it->second; + + for (auto& r : sbi.ref_map.ref_map) { + expected_statfs->allocated -= r.second.length; + if (sbi.compressed) { + // NB: it's crucial to use compressed flag from sb_info_t + // as we originally used that value while accumulating + // expected_statfs + expected_statfs->data_compressed_allocated -= r.second.length; + } + } + sbi.updated = sbi.passed = true; + sbi.ref_map.clear(); + + // relying on blob's pextents to decide what to release. 
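+          // The whole blob has been rewritten onto fresh extents above and
+          // the shared-blob bookkeeping (expected statfs, sbi.ref_map) has
+          // already been adjusted, so every old pextent can simply be queued
+          // for release here.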
+ for (auto& p : pext_to_release) { + to_release.union_insert(p.offset, p.length); + } + } else { + for (auto& p : pext_to_release) { + expected_statfs->allocated -= p.length; + if (compressed) { + expected_statfs->data_compressed_allocated -= p.length; + } + to_release.union_insert(p.offset, p.length); + } + } + if (bypass_rest) { + break; + } + } // for(auto b : blobs) + if (need_onode_update) { + o->extent_map.dirty_range(0, OBJECT_MAX_SIZE); + _record_onode(o, txn); + } + } // for (it->lower_bound(string()); it->valid(); it->next()) + + for (auto it = to_release.begin(); it != to_release.end(); ++it) { + dout(10) << __func__ << " release 0x" << std::hex << it.get_start() + << "~" << it.get_len() << std::dec << dendl; + fm->release(it.get_start(), it.get_len(), txn); + } + alloc->release(to_release); + to_release.clear(); + } // if (it) { + } //if (repair && repairer.preprocess_misreference()) { + + if (depth != FSCK_SHALLOW) { + for (auto &p : sb_info) { + sb_info_t& sbi = p.second; + if (!sbi.passed) { + derr << "fsck error: missing " << *sbi.sb << dendl; + ++errors; + } + if (repair && (!sbi.passed || sbi.updated)) { + auto sbid = p.first; + if (sbi.ref_map.empty()) { + ceph_assert(sbi.passed); + dout(20) << __func__ << " " << *sbi.sb + << " is empty, removing" << dendl; + repairer.fix_shared_blob(db, sbid, nullptr); + } else { + bufferlist bl; + bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map)); + encode(persistent, bl); + dout(20) << __func__ << " " << *sbi.sb + << " is " << bl.length() << " bytes, updating" << dendl; + + repairer.fix_shared_blob(db, sbid, &bl); + } + } + } + } + sb_info.clear(); + + // check global stats only if fscking (not repairing) w/o per-pool stats + if (!per_pool_stat_collection && + !repair && + !(actual_statfs == expected_store_statfs)) { + derr << "fsck error: actual " << actual_statfs + << " != expected " << expected_store_statfs << dendl; + if (repair) { + repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY, + expected_store_statfs); + } + ++errors; + } + + dout(1) << __func__ << " checking pool_statfs" << dendl; + _fsck_check_pool_statfs(expected_pool_statfs, + errors, warnings, repair ? 
&repairer : nullptr); + + if (depth != FSCK_SHALLOW) { + dout(1) << __func__ << " checking for stray omap data" << dendl; + it = db->get_iterator(PREFIX_OMAP); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + uint64_t omap_head; + _key_decode_u64(it->key().c_str(), &omap_head); + if (used_omap_head.count(omap_head) == 0) { + derr << "fsck error: found stray omap data on omap_head " + << omap_head << dendl; + ++errors; + } + } + } + it = db->get_iterator(PREFIX_PGMETA_OMAP); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + uint64_t omap_head; + _key_decode_u64(it->key().c_str(), &omap_head); + if (used_pgmeta_omap_head.count(omap_head) == 0) { + derr << "fsck error: found stray (pgmeta) omap data on omap_head " + << omap_head << dendl; + ++errors; + } + } + } + dout(1) << __func__ << " checking deferred events" << dendl; + it = db->get_iterator(PREFIX_DEFERRED); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + bluestore_deferred_transaction_t wt; + try { + decode(wt, p); + } catch (buffer::error& e) { + derr << "fsck error: failed to decode deferred txn " + << pretty_binary_string(it->key()) << dendl; + if (repair) { + dout(20) << __func__ << " undecodable deferred TXN record, key: '" + << pretty_binary_string(it->key()) + << "', removing" << dendl; + repairer.remove_key(db, PREFIX_DEFERRED, it->key()); + } + continue; + } + dout(20) << __func__ << " deferred " << wt.seq + << " ops " << wt.ops.size() + << " released 0x" << std::hex << wt.released << std::dec << dendl; + for (auto e = wt.released.begin(); e != wt.released.end(); ++e) { + apply( + e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + bs.set(pos); + } + ); + } + } + } + + dout(1) << __func__ << " checking freelist vs allocated" << dendl; + { + // remove bluefs_extents from used set since the freelist doesn't + // know they are allocated. + for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) { + apply( + e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + bs.reset(pos); + } + ); + } + fm->enumerate_reset(); + uint64_t offset, length; + while (fm->enumerate_next(db, &offset, &length)) { + bool intersects = false; + apply( + offset, length, fm->get_alloc_size(), used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + if (bs.test(pos)) { + if (offset == SUPER_RESERVED && + length == min_alloc_size - SUPER_RESERVED) { + // this is due to the change just after luminous to min_alloc_size + // granularity allocations, and our baked in assumption at the top + // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used + // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless, + // since we will never allocate this region below min_alloc_size. 
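+              // e.g. (values for illustration only): with SUPER_RESERVED at
+              // 8192 bytes and a 64 KiB min_alloc_size, such stores report
+              // 0x2000~0xe000 as free while fsck pre-marks the whole first
+              // allocation unit as used; the mismatch is benign.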
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED" + << " and min_alloc_size, 0x" << std::hex << offset << "~" + << length << std::dec << dendl; + } else { + intersects = true; + if (repair) { + repairer.fix_false_free(db, fm, + pos * min_alloc_size, + min_alloc_size); + } + } + } else { + bs.set(pos); + } + } + ); + if (intersects) { + derr << "fsck error: free extent 0x" << std::hex << offset + << "~" << length << std::dec + << " intersects allocated blocks" << dendl; + ++errors; + } + } + fm->enumerate_reset(); + size_t count = used_blocks.count(); + if (used_blocks.size() != count) { + ceph_assert(used_blocks.size() > count); + used_blocks.flip(); + size_t start = used_blocks.find_first(); + while (start != decltype(used_blocks)::npos) { + size_t cur = start; + while (true) { + size_t next = used_blocks.find_next(cur); + if (next != cur + 1) { + ++errors; + derr << "fsck error: leaked extent 0x" << std::hex + << ((uint64_t)start * fm->get_alloc_size()) << "~" + << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec + << dendl; + if (repair) { + repairer.fix_leaked(db, + fm, + start * min_alloc_size, + (cur + 1 - start) * min_alloc_size); + } + start = next; + break; + } + cur = next; + } + } + used_blocks.flip(); + } + } + } + if (repair) { + dout(5) << __func__ << " applying repair results" << dendl; + repaired = repairer.apply(db); + dout(5) << __func__ << " repair applied" << dendl; + } + +out_scan: + dout(2) << __func__ << " " << num_objects << " objects, " + << num_sharded_objects << " of them sharded. " + << dendl; + dout(2) << __func__ << " " << num_extents << " extents to " + << num_blobs << " blobs, " + << num_spanning_blobs << " spanning, " + << num_shared_blobs << " shared." + << dendl; + + utime_t duration = ceph_clock_now() - start; + dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired + << " repaired, " << (errors - (int)repaired) << " remaining in " + << duration << " seconds" << dendl; + return errors - (int)repaired; +} + +/// methods to inject various errors fsck can repair +void BlueStore::inject_broken_shared_blob_key(const string& key, + const bufferlist& bl) +{ + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + txn->set(PREFIX_SHARED_BLOB, key, bl); + db->submit_transaction_sync(txn); +}; + +void BlueStore::inject_leaked(uint64_t len) +{ + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + + PExtentVector exts; + int64_t alloc_len = alloc->allocate(len, min_alloc_size, + min_alloc_size * 256, 0, &exts); + ceph_assert(alloc_len >= (int64_t)len); + for (auto& p : exts) { + fm->allocate(p.offset, p.length, txn); + } + db->submit_transaction_sync(txn); +} + +void BlueStore::inject_false_free(coll_t cid, ghobject_t oid) +{ + KeyValueDB::Transaction txn; + OnodeRef o; + CollectionRef c = _get_collection(cid); + ceph_assert(c); + { + RWLock::WLocker l(c->lock); // just to avoid internal asserts + o = c->get_onode(oid, false); + ceph_assert(o); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + } + + bool injected = false; + txn = db->get_transaction(); + auto& em = o->extent_map.extent_map; + std::vector<const PExtentVector*> v; + if (em.size()) { + v.push_back(&em.begin()->blob->get_blob().get_extents()); + } + if (em.size() > 1) { + auto it = em.end(); + --it; + v.push_back(&(it->blob->get_blob().get_extents())); + } + for (auto pext : v) { + if (pext->size()) { + auto p = pext->begin(); + while (p != pext->end()) { + if (p->is_valid()) { + dout(20) << __func__ << " release 0x" << 
std::hex << p->offset + << "~" << p->length << std::dec << dendl; + fm->release(p->offset, p->length, txn); + injected = true; + break; + } + ++p; + } + } + } + ceph_assert(injected); + db->submit_transaction_sync(txn); +} + +void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs) +{ + BlueStoreRepairer repairer; + repairer.fix_statfs(db, key, new_statfs); + repairer.apply(db); +} + +void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs) +{ + KeyValueDB::Transaction t = db->get_transaction(); + volatile_statfs v; + v = new_statfs; + bufferlist bl; + v.encode(bl); + t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl); + db->submit_transaction_sync(t); +} + +void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1, + coll_t cid2, ghobject_t oid2, + uint64_t offset) +{ + OnodeRef o1; + CollectionRef c1 = _get_collection(cid1); + ceph_assert(c1); + { + RWLock::WLocker l(c1->lock); // just to avoid internal asserts + o1 = c1->get_onode(oid1, false); + ceph_assert(o1); + o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE); + } + OnodeRef o2; + CollectionRef c2 = _get_collection(cid2); + ceph_assert(c2); + { + RWLock::WLocker l(c2->lock); // just to avoid internal asserts + o2 = c2->get_onode(oid2, false); + ceph_assert(o2); + o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE); + } + Extent& e1 = *(o1->extent_map.seek_lextent(offset)); + Extent& e2 = *(o2->extent_map.seek_lextent(offset)); + + // require onode/extent layout to be the same (and simple) + // to make things easier + ceph_assert(o1->onode.extent_map_shards.empty()); + ceph_assert(o2->onode.extent_map_shards.empty()); + ceph_assert(o1->extent_map.spanning_blob_map.size() == 0); + ceph_assert(o2->extent_map.spanning_blob_map.size() == 0); + ceph_assert(e1.logical_offset == e2.logical_offset); + ceph_assert(e1.length == e2.length); + ceph_assert(e1.blob_offset == e2.blob_offset); + + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + + // along with misreference error this will create space leaks errors + e2.blob->dirty_blob() = e1.blob->get_blob(); + o2->extent_map.dirty_range(offset, e2.length); + o2->extent_map.update(txn, false); + + _record_onode(o2, txn); + db->submit_transaction_sync(txn); +} + +void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, + int16_t blob_id) +{ + OnodeRef o; + CollectionRef c = _get_collection(cid); + ceph_assert(c); + { + RWLock::WLocker l(c->lock); // just to avoid internal asserts + o = c->get_onode(oid, false); + ceph_assert(o); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + } + + BlobRef b = c->new_blob(); + b->id = blob_id; + o->extent_map.spanning_blob_map[blob_id] = b; + + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + + _record_onode(o, txn); + db->submit_transaction_sync(txn); +} + +void BlueStore::collect_metadata(map<string,string> *pm) +{ + dout(10) << __func__ << dendl; + bdev->collect_metadata("bluestore_bdev_", pm); + if (bluefs) { + (*pm)["bluefs"] = "1"; + (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device); + bluefs->collect_metadata(pm, bluefs_shared_bdev); + } else { + (*pm)["bluefs"] = "0"; + } + + // report numa mapping for underlying devices + int node = -1; + set<int> nodes; + set<string> failed; + int r = get_numa_node(&node, &nodes, &failed); + if (r >= 0) { + if (!failed.empty()) { + (*pm)["objectstore_numa_unknown_devices"] = stringify(failed); + } + if (!nodes.empty()) { + dout(1) << __func__ << " devices span numa nodes " << nodes 
<< dendl; + (*pm)["objectstore_numa_nodes"] = stringify(nodes); + } + if (node >= 0) { + (*pm)["objectstore_numa_node"] = stringify(node); + } + } +} + +int BlueStore::get_numa_node( + int *final_node, + set<int> *out_nodes, + set<string> *out_failed) +{ + int node = -1; + set<string> devices; + get_devices(&devices); + set<int> nodes; + set<string> failed; + for (auto& devname : devices) { + int n; + BlkDev bdev(devname); + int r = bdev.get_numa_node(&n); + if (r < 0) { + dout(10) << __func__ << " bdev " << devname << " can't detect numa_node" + << dendl; + failed.insert(devname); + continue; + } + dout(10) << __func__ << " bdev " << devname << " on numa_node " << n + << dendl; + nodes.insert(n); + if (node < 0) { + node = n; + } + } + if (node >= 0 && nodes.size() == 1 && failed.empty()) { + *final_node = node; + } + if (out_nodes) { + *out_nodes = nodes; + } + if (out_failed) { + *out_failed = failed; + } + return 0; +} + +int BlueStore::get_devices(set<string> *ls) +{ + if (bdev) { + bdev->get_devices(ls); + if (bluefs) { + bluefs->get_devices(ls); + } + return 0; + } + + // grumble, we haven't started up yet. + int r = _open_path(); + if (r < 0) + goto out; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + r = _minimal_open_bluefs(false); + if (r < 0) + goto out_bdev; + bdev->get_devices(ls); + if (bluefs) { + bluefs->get_devices(ls); + } + r = 0; + _minimal_close_bluefs(); + out_bdev: + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + out: + return r; +} + +void BlueStore::_get_statfs_overall(struct store_statfs_t *buf) +{ + buf->reset(); + + buf->omap_allocated = db->estimate_prefix_size(PREFIX_OMAP); + + uint64_t bfree = alloc->get_free(); + + if (bluefs) { + int64_t bluefs_total = bluefs->get_total(bluefs_shared_bdev); + int64_t bluefs_free = bluefs->get_free(bluefs_shared_bdev); + // part of our shared device is "free" according to BlueFS, but we + // can't touch bluestore_bluefs_min of it. + int64_t shared_available = std::min( + bluefs_free, + int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min)); + buf->internally_reserved = bluefs_total - shared_available; + if (shared_available > 0) { + bfree += shared_available; + } + // include dedicated db, too, if that isn't the shared device. + if (bluefs_shared_bdev != BlueFS::BDEV_DB) { + buf->total += bluefs->get_total(BlueFS::BDEV_DB); + } + // call any non-omap bluefs space "internal metadata" + buf->internal_metadata = + std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min) + - buf->omap_allocated; + } + + uint64_t thin_total, thin_avail; + if (bdev->get_thin_utilization(&thin_total, &thin_avail)) { + buf->total += thin_total; + + // we are limited by both the size of the virtual device and the + // underlying physical device. 
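+    // For thin-provisioned devices, report the full virtual size as total
+    // but cap the free space at what the backing store can still supply,
+    // and derive 'allocated' here from the thin utilization.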
+ bfree = std::min(bfree, thin_avail); + + buf->allocated = thin_total - thin_avail; + } else { + buf->total += bdev->get_size(); + } + buf->available = bfree; +} + +int BlueStore::statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts) +{ + if (alerts) { + alerts->clear(); + _log_alerts(*alerts); + } + _get_statfs_overall(buf); + { + std::lock_guard l(vstatfs_lock); + buf->allocated = vstatfs.allocated(); + buf->data_stored = vstatfs.stored(); + buf->data_compressed = vstatfs.compressed(); + buf->data_compressed_original = vstatfs.compressed_original(); + buf->data_compressed_allocated = vstatfs.compressed_allocated(); + } + + dout(20) << __func__ << " " << *buf << dendl; + return 0; +} + +int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) +{ + dout(20) << __func__ << " pool " << pool_id<< dendl; + + if (!per_pool_stat_collection) { + dout(20) << __func__ << " not supported in legacy mode " << dendl; + return -ENOTSUP; + } + buf->reset(); + + { + std::lock_guard l(vstatfs_lock); + osd_pools[pool_id].publish(buf); + } + dout(10) << __func__ << *buf << dendl; + return 0; +} + +void BlueStore::_check_legacy_statfs_alert() +{ + string s; + if (!per_pool_stat_collection && + cct->_conf->bluestore_warn_on_legacy_statfs) { + s = "legacy statfs reporting detected, " + "suggest to run store repair to get consistent statistic reports"; + } + std::lock_guard l(qlock); + legacy_statfs_alert = s; +} + +// --------------- +// cache + +BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid) +{ + RWLock::RLocker l(coll_lock); + ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return CollectionRef(); + return cp->second; +} + +void BlueStore::_queue_reap_collection(CollectionRef& c) +{ + dout(10) << __func__ << " " << c << " " << c->cid << dendl; + // _reap_collections and this in the same thread, + // so no need a lock. + removed_collections.push_back(c); +} + +void BlueStore::_reap_collections() +{ + + list<CollectionRef> removed_colls; + { + // _queue_reap_collection and this in the same thread. + // So no need a lock. 
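+    // Grab the pending list with a quick swap; the producer
+    // (_queue_reap_collection) and this consumer are expected to run on the
+    // same thread, hence no lock. Collections that still have flushing
+    // onodes are put back on the list below and retried on a later pass.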
+ if (!removed_collections.empty()) + removed_colls.swap(removed_collections); + else + return; + } + + list<CollectionRef>::iterator p = removed_colls.begin(); + while (p != removed_colls.end()) { + CollectionRef c = *p; + dout(10) << __func__ << " " << c << " " << c->cid << dendl; + if (c->onode_map.map_any([&](OnodeRef o) { + ceph_assert(!o->exists); + if (o->flushing_count.load()) { + dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid + << " flush_txns " << o->flushing_count << dendl; + return true; + } + return false; + })) { + ++p; + continue; + } + c->onode_map.clear(); + p = removed_colls.erase(p); + dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl; + } + if (removed_colls.empty()) { + dout(10) << __func__ << " all reaped" << dendl; + } else { + removed_collections.splice(removed_collections.begin(), removed_colls); + } +} + +void BlueStore::_update_cache_logger() +{ + uint64_t num_onodes = 0; + uint64_t num_extents = 0; + uint64_t num_blobs = 0; + uint64_t num_buffers = 0; + uint64_t num_buffer_bytes = 0; + for (auto c : cache_shards) { + c->add_stats(&num_onodes, &num_extents, &num_blobs, + &num_buffers, &num_buffer_bytes); + } + logger->set(l_bluestore_onodes, num_onodes); + logger->set(l_bluestore_extents, num_extents); + logger->set(l_bluestore_blobs, num_blobs); + logger->set(l_bluestore_buffers, num_buffers); + logger->set(l_bluestore_buffer_bytes, num_buffer_bytes); +} + +// --------------- +// read operations + +ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid) +{ + return _get_collection(cid); +} + +ObjectStore::CollectionHandle BlueStore::create_new_collection( + const coll_t& cid) +{ + RWLock::WLocker l(coll_lock); + Collection *c = new Collection( + this, + cache_shards[cid.hash_to_shard(cache_shards.size())], + cid); + new_coll_map[cid] = c; + _osr_attach(c); + return c; +} + +void BlueStore::set_collection_commit_queue( + const coll_t& cid, + ContextQueue *commit_queue) +{ + if (commit_queue) { + RWLock::RLocker l(coll_lock); + if (coll_map.count(cid)) { + coll_map[cid]->commit_queue = commit_queue; + } else if (new_coll_map.count(cid)) { + new_coll_map[cid]->commit_queue = commit_queue; + } + } +} + + +bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return false; + + bool r = true; + + { + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + r = false; + } + + return r; +} + +int BlueStore::stat( + CollectionHandle &c_, + const ghobject_t& oid, + struct stat *st, + bool allow_eio) +{ + Collection *c = static_cast<Collection *>(c_.get()); + if (!c->exists) + return -ENOENT; + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + + { + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return -ENOENT; + st->st_size = o->onode.size; + st->st_blksize = 4096; + st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; + st->st_nlink = 1; + } + + int r = 0; + if (_debug_mdata_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } + return r; +} +int BlueStore::set_collection_opts( + CollectionHandle& ch, + const pool_opts_t& opts) +{ + Collection *c = static_cast<Collection *>(ch.get()); + dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl; + if (!c->exists) + return 
-ENOENT; + RWLock::WLocker l(c->lock); + c->pool_opts = opts; + return 0; +} + +int BlueStore::read( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags) +{ + auto start = mono_clock::now(); + Collection *c = static_cast<Collection *>(c_.get()); + const coll_t &cid = c->get_cid(); + dout(15) << __func__ << " " << cid << " " << oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + if (!c->exists) + return -ENOENT; + + bl.clear(); + int r; + { + RWLock::RLocker l(c->lock); + auto start1 = mono_clock::now(); + OnodeRef o = c->get_onode(oid, false); + log_latency("get_onode@read", + l_bluestore_read_onode_meta_lat, + mono_clock::now() - start1, + cct->_conf->bluestore_log_op_age); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (offset == length && offset == 0) + length = o->onode.size; + + r = _do_read(c, o, offset, length, bl, op_flags); + if (r == -EIO) { + logger->inc(l_bluestore_read_eio); + } + } + + out: + if (r >= 0 && _debug_data_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */ + cct->_conf->bluestore_debug_random_read_err && + (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * + 100.0)) == 0) { + dout(0) << __func__ << ": inject random EIO" << dendl; + r = -EIO; + } + dout(10) << __func__ << " " << cid << " " << oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + log_latency(__func__, + l_bluestore_read_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + return r; +} + +// -------------------------------------------------------- +// intermediate data structures used while reading +struct region_t { + uint64_t logical_offset; + uint64_t blob_xoffset; //region offset within the blob + uint64_t length; + + // used later in read process + uint64_t front = 0; + + region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0) + : logical_offset(offset), + blob_xoffset(b_offs), + length(len), + front(front){} + region_t(const region_t& from) + : logical_offset(from.logical_offset), + blob_xoffset(from.blob_xoffset), + length(from.length), + front(from.front){} + + friend ostream& operator<<(ostream& out, const region_t& r) { + return out << "0x" << std::hex << r.logical_offset << ":" + << r.blob_xoffset << "~" << r.length << std::dec; + } +}; + +// merged blob read request +struct read_req_t { + uint64_t r_off = 0; + uint64_t r_len = 0; + bufferlist bl; + std::list<region_t> regs; // original read regions + + read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {} + + friend ostream& operator<<(ostream& out, const read_req_t& r) { + out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : ["; + for (const auto& reg : r.regs) + out << reg; + return out << "]}" << std::dec; + } +}; + +typedef list<read_req_t> regions2read_t; +typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t; + +int BlueStore::_do_read( + Collection *c, + OnodeRef o, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags, + uint64_t retry_count) +{ + FUNCTRACE(cct); + int r = 0; + int read_cache_policy = 0; // do not bypass clean or dirty cache + + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " size 0x" << o->onode.size << " (" << std::dec + << o->onode.size << ")" << dendl; + bl.clear(); + + if (offset >= o->onode.size) { + return r; + } + + 
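+ // Read path overview (descriptive note): fault in the extent map for the
+ // range, satisfy what we can from the buffer cache, merge the remaining
+ // misses into chunk-aligned per-blob reads (submitted via aio when there is
+ // more than one region), verify checksums (retrying a bounded number of
+ // times on spurious zero-page reads), decompress compressed blobs, and
+ // finally assemble the result, zero-filling any holes.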
// generally, don't buffer anything, unless the client explicitly requests + // it. + bool buffered = false; + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { + dout(20) << __func__ << " will do buffered read" << dendl; + buffered = true; + } else if (cct->_conf->bluestore_default_buffered_read && + (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { + dout(20) << __func__ << " defaulting to buffered read" << dendl; + buffered = true; + } + + if (offset + length > o->onode.size) { + length = o->onode.size - offset; + } + + auto start = mono_clock::now(); + o->extent_map.fault_range(db, offset, length); + log_latency(__func__, + l_bluestore_read_onode_meta_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + _dump_onode<30>(cct, *o); + + ready_regions_t ready_regions; + + // for deep-scrub, we only read dirty cache and bypass clean cache in + // order to read underlying block device in case there are silent disk errors. + if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) { + dout(20) << __func__ << " will bypass cache and do direct read" << dendl; + read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE; + } + + // build blob-wise list to of stuff read (that isn't cached) + blobs2read_t blobs2read; + unsigned left = length; + uint64_t pos = offset; + unsigned num_regions = 0; + auto lp = o->extent_map.seek_lextent(offset); + while (left > 0 && lp != o->extent_map.extent_map.end()) { + if (pos < lp->logical_offset) { + unsigned hole = lp->logical_offset - pos; + if (hole >= left) { + break; + } + dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole + << std::dec << dendl; + pos += hole; + left -= hole; + } + BlobRef& bptr = lp->blob; + unsigned l_off = pos - lp->logical_offset; + unsigned b_off = l_off + lp->blob_offset; + unsigned b_len = std::min(left, lp->length - l_off); + + ready_regions_t cache_res; + interval_set<uint32_t> cache_interval; + bptr->shared_blob->bc.read( + bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval, + read_cache_policy); + dout(20) << __func__ << " blob " << *bptr << std::hex + << " need 0x" << b_off << "~" << b_len + << " cache has 0x" << cache_interval + << std::dec << dendl; + + auto pc = cache_res.begin(); + uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size); + while (b_len > 0) { + unsigned l; + if (pc != cache_res.end() && + pc->first == b_off) { + l = pc->second.length(); + ready_regions[pos].claim(pc->second); + dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x" + << b_off << "~" << l << std::dec << dendl; + ++pc; + } else { + l = b_len; + if (pc != cache_res.end()) { + ceph_assert(pc->first > b_off); + l = pc->first - b_off; + } + dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x" + << b_off << "~" << l << std::dec << dendl; + // merge regions + { + uint64_t r_off = b_off; + uint64_t r_len = l; + uint64_t front = r_off % chunk_size; + if (front) { + r_off -= front; + r_len += front; + } + unsigned tail = r_len % chunk_size; + if (tail) { + r_len += chunk_size - tail; + } + bool merged = false; + regions2read_t& r2r = blobs2read[bptr]; + if (r2r.size()) { + read_req_t& pre = r2r.back(); + if (r_off <= (pre.r_off + pre.r_len)) { + front += (r_off - pre.r_off); + pre.r_len += (r_off + r_len - pre.r_off - pre.r_len); + pre.regs.emplace_back(region_t(pos, b_off, l, front)); + merged = true; + } + } + if (!merged) { + read_req_t req(r_off, r_len); + req.regs.emplace_back(region_t(pos, b_off, l, front)); + 
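+ // Illustration (example values only): with chunk_size = 0x1000, a cache
+ // miss at b_off = 0x1100 of length 0x200 is widened by the alignment above
+ // to r_off = 0x1000, r_len = 0x1000 with front = 0x100; only the
+ // chunk-aligned span is read, and 'front' is later used to trim the
+ // requested bytes back out of the returned buffer.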
r2r.emplace_back(std::move(req)); + } + } + ++num_regions; + } + pos += l; + b_off += l; + left -= l; + b_len -= l; + } + ++lp; + } + + // read raw blob data. use aio if we have >1 blobs to read. + start = mono_clock::now(); // for the sake of simplicity + // measure the whole block below. + // The error isn't that much... + vector<bufferlist> compressed_blob_bls; + IOContext ioc(cct, NULL, true); // allow EIO + for (auto& p : blobs2read) { + const BlobRef& bptr = p.first; + regions2read_t& r2r = p.second; + dout(20) << __func__ << " blob " << *bptr << std::hex + << " need " << r2r << std::dec << dendl; + if (bptr->get_blob().is_compressed()) { + // read the whole thing + if (compressed_blob_bls.empty()) { + // ensure we avoid any reallocation on subsequent blobs + compressed_blob_bls.reserve(blobs2read.size()); + } + compressed_blob_bls.push_back(bufferlist()); + bufferlist& bl = compressed_blob_bls.back(); + r = bptr->get_blob().map( + 0, bptr->get_blob().get_ondisk_length(), + [&](uint64_t offset, uint64_t length) { + int r; + // use aio if there are more regions to read than those in this blob + if (num_regions > r2r.size()) { + r = bdev->aio_read(offset, length, &bl, &ioc); + } else { + r = bdev->read(offset, length, &bl, &ioc, false); + } + if (r < 0) + return r; + return 0; + }); + if (r < 0) { + derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl; + if (r == -EIO) { + // propagate EIO to caller + return r; + } + ceph_assert(r == 0); + } + } else { + // read the pieces + for (auto& req : r2r) { + dout(20) << __func__ << " region 0x" << std::hex + << req.regs.front().logical_offset + << ": 0x" << req.regs.front().blob_xoffset + << " reading 0x" << req.r_off + << "~" << req.r_len << std::dec + << dendl; + + // read it + r = bptr->get_blob().map( + req.r_off, req.r_len, + [&](uint64_t offset, uint64_t length) { + int r; + // use aio if there is more than one region to read + if (num_regions > 1) { + r = bdev->aio_read(offset, length, &req.bl, &ioc); + } else { + r = bdev->read(offset, length, &req.bl, &ioc, false); + } + if (r < 0) + return r; + return 0; + }); + if (r < 0) { + derr << __func__ << " bdev-read failed: " << cpp_strerror(r) + << dendl; + if (r == -EIO) { + // propagate EIO to caller + return r; + } + ceph_assert(r == 0); + } + ceph_assert(req.bl.length() == req.r_len); + } + } + } + + int64_t num_ios = length; + if (ioc.has_pending_aios()) { + num_ios = -ioc.get_num_ios(); + bdev->aio_submit(&ioc); + dout(20) << __func__ << " waiting for aio" << dendl; + ioc.aio_wait(); + r = ioc.get_return_value(); + if (r < 0) { + ceph_assert(r == -EIO); // no other errors allowed + return -EIO; + } + } + log_latency_fn(__func__, + l_bluestore_read_wait_aio_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age, + [&](auto lat) { return ", num_ios = " + stringify(num_ios); } + ); + + // enumerate and decompress desired blobs + auto p = compressed_blob_bls.begin(); + blobs2read_t::iterator b2r_it = blobs2read.begin(); + while (b2r_it != blobs2read.end()) { + const BlobRef& bptr = b2r_it->first; + regions2read_t& r2r = b2r_it->second; + dout(20) << __func__ << " blob " << *bptr << std::hex + << " need 0x" << r2r << std::dec << dendl; + if (bptr->get_blob().is_compressed()) { + ceph_assert(p != compressed_blob_bls.end()); + bufferlist& compressed_bl = *p++; + if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl, + r2r.front().regs.front().logical_offset) < 0) { + // Handles spurious read errors caused by a kernel bug. 
+ // We sometimes get all-zero pages as a result of the read under + // high memory pressure. Retrying the failing read succeeds in most + // cases. + // See also: http://tracker.ceph.com/issues/22464 + if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { + return -EIO; + } + return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1); + } + bufferlist raw_bl; + r = _decompress(compressed_bl, &raw_bl); + if (r < 0) + return r; + if (buffered) { + bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0, + raw_bl); + } + for (auto& req : r2r) { + for (auto& r : req.regs) { + ready_regions[r.logical_offset].substr_of( + raw_bl, r.blob_xoffset, r.length); + } + } + } else { + for (auto& req : r2r) { + if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl, + req.regs.front().logical_offset) < 0) { + // Handles spurious read errors caused by a kernel bug. + // We sometimes get all-zero pages as a result of the read under + // high memory pressure. Retrying the failing read succeeds in most + // cases. + // See also: http://tracker.ceph.com/issues/22464 + if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { + return -EIO; + } + return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1); + } + if (buffered) { + bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), + req.r_off, req.bl); + } + + // prune and keep result + for (const auto& r : req.regs) { + ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length); + } + } + } + ++b2r_it; + } + + // generate a resulting buffer + auto pr = ready_regions.begin(); + auto pr_end = ready_regions.end(); + pos = 0; + while (pos < length) { + if (pr != pr_end && pr->first == pos + offset) { + dout(30) << __func__ << " assemble 0x" << std::hex << pos + << ": data from 0x" << pr->first << "~" << pr->second.length() + << std::dec << dendl; + pos += pr->second.length(); + bl.claim_append(pr->second); + ++pr; + } else { + uint64_t l = length - pos; + if (pr != pr_end) { + ceph_assert(pr->first > pos + offset); + l = pr->first - (pos + offset); + } + dout(30) << __func__ << " assemble 0x" << std::hex << pos + << ": zeros for 0x" << (pos + offset) << "~" << l + << std::dec << dendl; + bl.append_zero(l); + pos += l; + } + } + ceph_assert(bl.length() == length); + ceph_assert(pos == length); + ceph_assert(pr == pr_end); + r = bl.length(); + if (retry_count) { + logger->inc(l_bluestore_reads_with_retries); + dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length + << " failed " << std::dec << retry_count << " times before succeeding" << dendl; + } + return r; +} + +int BlueStore::_verify_csum(OnodeRef& o, + const bluestore_blob_t* blob, uint64_t blob_xoffset, + const bufferlist& bl, + uint64_t logical_offset) const +{ + int bad; + uint64_t bad_csum; + auto start = mono_clock::now(); + int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum); + if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 && + (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) { + derr << __func__ << " injecting bluestore checksum verifcation error" << dendl; + bad = blob_xoffset; + r = -1; + bad_csum = 0xDEADBEEF; + } + if (r < 0) { + if (r == -1) { + PExtentVector pex; + blob->map( + bad, + blob->get_csum_chunk_size(), + [&](uint64_t offset, uint64_t length) { + pex.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + derr << __func__ << " bad " + << Checksummer::get_csum_type_string(blob->csum_type) + << "/0x" << std::hex << 
blob->get_csum_chunk_size() + << " checksum at blob offset 0x" << bad + << ", got 0x" << bad_csum << ", expected 0x" + << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec + << ", device location " << pex + << ", logical extent 0x" << std::hex + << (logical_offset + bad - blob_xoffset) << "~" + << blob->get_csum_chunk_size() << std::dec + << ", object " << o->oid + << dendl; + } else { + derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl; + } + } + log_latency(__func__, + l_bluestore_csum_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + if (cct->_conf->bluestore_ignore_data_csum) { + return 0; + } + return r; +} + +int BlueStore::_decompress(bufferlist& source, bufferlist* result) +{ + int r = 0; + auto start = mono_clock::now(); + auto i = source.cbegin(); + bluestore_compression_header_t chdr; + decode(chdr, i); + int alg = int(chdr.type); + CompressorRef cp = compressor; + if (!cp || (int)cp->get_type() != alg) { + cp = Compressor::create(cct, alg); + } + + if (!cp.get()) { + // if compressor isn't available - error, because cannot return + // decompressed data? + + const char* alg_name = Compressor::get_comp_alg_name(alg); + derr << __func__ << " can't load decompressor " << alg_name << dendl; + _set_compression_alert(false, alg_name); + r = -EIO; + } else { + r = cp->decompress(i, chdr.length, *result); + if (r < 0) { + derr << __func__ << " decompression failed with exit code " << r << dendl; + r = -EIO; + } + } + log_latency(__func__, + l_bluestore_decompress_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + return r; +} + +// this stores fiemap into interval_set, other variations +// use it internally +int BlueStore::_fiemap( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + interval_set<uint64_t>& destset) +{ + Collection *c = static_cast<Collection *>(c_.get()); + if (!c->exists) + return -ENOENT; + { + RWLock::RLocker l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + return -ENOENT; + } + _dump_onode<30>(cct, *o); + + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " size 0x" << o->onode.size << std::dec << dendl; + + boost::intrusive::set<Extent>::iterator ep, eend; + if (offset >= o->onode.size) + goto out; + + if (offset + length > o->onode.size) { + length = o->onode.size - offset; + } + + o->extent_map.fault_range(db, offset, length); + eend = o->extent_map.extent_map.end(); + ep = o->extent_map.seek_lextent(offset); + while (length > 0) { + dout(20) << __func__ << " offset " << offset << dendl; + if (ep != eend && ep->logical_offset + ep->length <= offset) { + ++ep; + continue; + } + + uint64_t x_len = length; + if (ep != eend && ep->logical_offset <= offset) { + uint64_t x_off = offset - ep->logical_offset; + x_len = std::min(x_len, ep->length - x_off); + dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~" + << x_len << std::dec << " blob " << ep->blob << dendl; + destset.insert(offset, x_len); + length -= x_len; + offset += x_len; + if (x_off + x_len == ep->length) + ++ep; + continue; + } + if (ep != eend && + ep->logical_offset > offset && + ep->logical_offset - offset < x_len) { + x_len = ep->logical_offset - offset; + } + offset += x_len; + length -= x_len; + } + } + + out: + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " size = 0x(" << destset << ")" << std::dec << dendl; + return 0; +} + +int BlueStore::fiemap( + CollectionHandle &c_, + const 
ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl) +{ + interval_set<uint64_t> m; + int r = _fiemap(c_, oid, offset, length, m); + if (r >= 0) { + encode(m, bl); + } + return r; +} + +int BlueStore::fiemap( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + map<uint64_t, uint64_t>& destmap) +{ + interval_set<uint64_t> m; + int r = _fiemap(c_, oid, offset, length, m); + if (r >= 0) { + m.move_into(destmap); + } + return r; +} + +int BlueStore::getattr( + CollectionHandle &c_, + const ghobject_t& oid, + const char *name, + bufferptr& value) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl; + if (!c->exists) + return -ENOENT; + + int r; + { + RWLock::RLocker l(c->lock); + mempool::bluestore_cache_meta::string k(name); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (!o->onode.attrs.count(k)) { + r = -ENODATA; + goto out; + } + value = o->onode.attrs[k]; + r = 0; + } + out: + if (r == 0 && _debug_mdata_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } + dout(10) << __func__ << " " << c->cid << " " << oid << " " << name + << " = " << r << dendl; + return r; +} + +int BlueStore::getattrs( + CollectionHandle &c_, + const ghobject_t& oid, + map<string,bufferptr>& aset) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return -ENOENT; + + int r; + { + RWLock::RLocker l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + for (auto& i : o->onode.attrs) { + aset.emplace(i.first.c_str(), i.second); + } + r = 0; + } + + out: + if (r == 0 && _debug_mdata_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } + dout(10) << __func__ << " " << c->cid << " " << oid + << " = " << r << dendl; + return r; +} + +int BlueStore::list_collections(vector<coll_t>& ls) +{ + RWLock::RLocker l(coll_lock); + ls.reserve(coll_map.size()); + for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) + ls.push_back(p->first); + return 0; +} + +bool BlueStore::collection_exists(const coll_t& c) +{ + RWLock::RLocker l(coll_lock); + return coll_map.count(c); +} + +int BlueStore::collection_empty(CollectionHandle& ch, bool *empty) +{ + dout(15) << __func__ << " " << ch->cid << dendl; + vector<ghobject_t> ls; + ghobject_t next; + int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1, + &ls, &next); + if (r < 0) { + derr << __func__ << " collection_list returned: " << cpp_strerror(r) + << dendl; + return r; + } + *empty = ls.empty(); + dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl; + return 0; +} + +int BlueStore::collection_bits(CollectionHandle& ch) +{ + dout(15) << __func__ << " " << ch->cid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl; + return c->cnode.bits; +} + +int BlueStore::collection_list( + CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext) +{ + Collection *c = static_cast<Collection *>(c_.get()); + c->flush(); + dout(15) << __func__ << " " << c->cid + << " start " << start << " end " 
<< end << " max " << max << dendl; + int r; + { + RWLock::RLocker l(c->lock); + r = _collection_list(c, start, end, max, false, ls, pnext); + } + + dout(10) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max + << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; + return r; +} + +int BlueStore::collection_list_legacy( + CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext) +{ + Collection *c = static_cast<Collection *>(c_.get()); + c->flush(); + dout(15) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max << dendl; + int r; + { + RWLock::RLocker l(c->lock); + r = _collection_list(c, start, end, max, true, ls, pnext); + } + + dout(10) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max + << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; + return r; +} + +int BlueStore::_collection_list( + Collection *c, const ghobject_t& start, const ghobject_t& end, int max, + bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext) +{ + + if (!c->exists) + return -ENOENT; + + auto start_time = mono_clock::now(); + int r = 0; + ghobject_t static_next; + std::unique_ptr<CollectionListIterator> it; + ghobject_t coll_range_temp_start, coll_range_temp_end; + ghobject_t coll_range_start, coll_range_end; + bool set_next = false; + ghobject_t pend; + bool temp; + + if (!pnext) + pnext = &static_next; + + if (start.is_max() || start.hobj.is_max()) { + goto out; + } + get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start, + &coll_range_temp_end, &coll_range_start, &coll_range_end); + dout(20) << __func__ + << " range " << coll_range_temp_start + << " to " << coll_range_temp_end + << " and " << coll_range_start + << " to " << coll_range_end + << " start " << start << dendl; + if (legacy) { + it = std::make_unique<SimpleCollectionListIterator>( + cct, db->get_iterator(PREFIX_OBJ)); + } else { + it = std::make_unique<SortedCollectionListIterator>( + db->get_iterator(PREFIX_OBJ)); + } + if (start == ghobject_t() || + start.hobj == hobject_t() || + start == c->cid.get_min_hobj()) { + it->upper_bound(coll_range_temp_start); + temp = true; + } else { + if (start.hobj.is_temp()) { + temp = true; + ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end); + } else { + temp = false; + ceph_assert(start >= coll_range_start && start < coll_range_end); + } + dout(20) << __func__ << " temp=" << (int)temp << dendl; + it->lower_bound(start); + } + if (end.hobj.is_max()) { + pend = temp ? coll_range_temp_end : coll_range_end; + } else { + if (end.hobj.is_temp()) { + if (temp) + pend = end; + else + goto out; + } else { + pend = temp ? 
coll_range_temp_end : end; + } + } + dout(20) << __func__ << " pend " << pend << dendl; + while (true) { + if (!it->valid() || it->is_ge(pend)) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; + if (temp) { + if (end.hobj.is_temp()) { + if (it->valid() && it->is_lt(coll_range_temp_end)) { + *pnext = it->oid(); + set_next = true; + } + break; + } + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(coll_range_start); + if (end.hobj.is_max()) + pend = coll_range_end; + else + pend = end; + dout(30) << __func__ << " pend " << pend << dendl; + continue; + } + if (it->valid() && it->is_lt(coll_range_end)) { + *pnext = it->oid(); + set_next = true; + } + break; + } + dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl; + if (ls->size() >= (unsigned)max) { + dout(20) << __func__ << " reached max " << max << dendl; + *pnext = it->oid(); + set_next = true; + break; + } + ls->push_back(it->oid()); + it->next(); + } +out: + if (!set_next) { + *pnext = ghobject_t::get_max(); + } + log_latency_fn( + __func__, + l_bluestore_clist_lat, + mono_clock::now() - start_time, + cct->_conf->bluestore_log_collection_list_age, + [&] (const ceph::timespan& lat) { + ostringstream ostr; + ostr << ", lat = " << timespan_str(lat) + << " cid =" << c->cid + << " start " << start << " end " << end + << " max " << max; + return ostr.str(); + } + ); + return r; +} + +int BlueStore::omap_get( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + o->flush(); + { + const string& prefix = + o->onode.is_pgmeta_omap() ? 
PREFIX_PGMETA_OMAP : PREFIX_OMAP; + KeyValueDB::Iterator it = db->get_iterator(prefix); + string head, tail; + get_omap_header(o->onode.nid, &head); + get_omap_tail(o->onode.nid, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + *header = it->value(); + } else if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + string user_key; + decode_omap_key(it->key(), &user_key); + dout(20) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + (*out)[user_key] = it->value(); + } + it->next(); + } + } + out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::omap_get_header( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + o->flush(); + { + string head; + get_omap_header(o->onode.nid, &head); + if (db->get(o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP, + head, header) >= 0) { + dout(30) << __func__ << " got header" << dendl; + } else { + dout(30) << __func__ << " no header" << dendl; + } + } + out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::omap_get_keys( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + auto start1 = mono_clock::now(); + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + o->flush(); + { + const string& prefix = + o->onode.is_pgmeta_omap() ? 
PREFIX_PGMETA_OMAP : PREFIX_OMAP; + KeyValueDB::Iterator it = db->get_iterator(prefix); + string head, tail; + get_omap_key(o->onode.nid, string(), &head); + get_omap_tail(o->onode.nid, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } + string user_key; + decode_omap_key(it->key(), &user_key); + dout(20) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + keys->insert(user_key); + it->next(); + } + } + out: + c->store->log_latency( + __func__, + l_bluestore_omap_get_keys_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::omap_get_values( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + RWLock::RLocker l(c->lock); + auto start1 = mono_clock::now(); + int r = 0; + string final_key; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + { + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + o->flush(); + _key_encode_u64(o->onode.nid, &final_key); + final_key.push_back('.'); + for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { + final_key.resize(9); // keep prefix + final_key += *p; + bufferlist val; + if (db->get(prefix, final_key, &val) >= 0) { + dout(30) << __func__ << " got " << pretty_binary_string(final_key) + << " -> " << *p << dendl; + out->insert(make_pair(*p, val)); + } + } + } + out: + c->store->log_latency( + __func__, + l_bluestore_omap_get_values_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::omap_check_keys( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + string final_key; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + { + const string& prefix = + o->onode.is_pgmeta_omap() ? 
PREFIX_PGMETA_OMAP : PREFIX_OMAP; + o->flush(); + _key_encode_u64(o->onode.nid, &final_key); + final_key.push_back('.'); + for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { + final_key.resize(9); // keep prefix + final_key += *p; + bufferlist val; + if (db->get(prefix, final_key, &val) >= 0) { + dout(30) << __func__ << " have " << pretty_binary_string(final_key) + << " -> " << *p << dendl; + out->insert(*p); + } else { + dout(30) << __func__ << " miss " << pretty_binary_string(final_key) + << " -> " << *p << dendl; + } + } + } + out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( + CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + if (!c->exists) { + return ObjectMap::ObjectMapIterator(); + } + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return ObjectMap::ObjectMapIterator(); + } + o->flush(); + dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl; + KeyValueDB::Iterator it = db->get_iterator( + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP); + return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); +} + +// ----------------- +// write helpers + +uint64_t BlueStore::_get_ondisk_reserved() const { + return round_up_to( + std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size); +} + +void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t) +{ + dout(10) << __func__ << " ondisk_format " << ondisk_format + << " min_compat_ondisk_format " << min_compat_ondisk_format + << dendl; + ceph_assert(ondisk_format == latest_ondisk_format); + { + bufferlist bl; + encode(ondisk_format, bl); + t->set(PREFIX_SUPER, "ondisk_format", bl); + } + { + bufferlist bl; + encode(min_compat_ondisk_format, bl); + t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl); + } +} + +int BlueStore::_open_super_meta() +{ + // nid + { + nid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "nid_max", &bl); + auto p = bl.cbegin(); + try { + uint64_t v; + decode(v, p); + nid_max = v; + } catch (buffer::error& e) { + derr << __func__ << " unable to read nid_max" << dendl; + return -EIO; + } + dout(10) << __func__ << " old nid_max " << nid_max << dendl; + nid_last = nid_max.load(); + } + + // blobid + { + blobid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "blobid_max", &bl); + auto p = bl.cbegin(); + try { + uint64_t v; + decode(v, p); + blobid_max = v; + } catch (buffer::error& e) { + derr << __func__ << " unable to read blobid_max" << dendl; + return -EIO; + } + dout(10) << __func__ << " old blobid_max " << blobid_max << dendl; + blobid_last = blobid_max.load(); + } + + // freelist + { + bufferlist bl; + db->get(PREFIX_SUPER, "freelist_type", &bl); + if (bl.length()) { + freelist_type = std::string(bl.c_str(), bl.length()); + dout(10) << __func__ << " freelist_type " << freelist_type << dendl; + } else { + ceph_abort_msg("Not Support extent freelist manager"); + } + } + + // ondisk format + int32_t compat_ondisk_format = 0; + { + bufferlist bl; + int r = db->get(PREFIX_SUPER, "ondisk_format", &bl); + if (r < 0) { + // base case: kraken bluestore is v1 and readable by v1 + dout(20) << __func__ << " missing ondisk_format; 
assuming kraken" + << dendl; + ondisk_format = 1; + compat_ondisk_format = 1; + } else { + auto p = bl.cbegin(); + try { + decode(ondisk_format, p); + } catch (buffer::error& e) { + derr << __func__ << " unable to read ondisk_format" << dendl; + return -EIO; + } + bl.clear(); + { + r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl); + ceph_assert(!r); + auto p = bl.cbegin(); + try { + decode(compat_ondisk_format, p); + } catch (buffer::error& e) { + derr << __func__ << " unable to read compat_ondisk_format" << dendl; + return -EIO; + } + } + } + dout(10) << __func__ << " ondisk_format " << ondisk_format + << " compat_ondisk_format " << compat_ondisk_format + << dendl; + } + + if (latest_ondisk_format < compat_ondisk_format) { + derr << __func__ << " compat_ondisk_format is " + << compat_ondisk_format << " but we only understand version " + << latest_ondisk_format << dendl; + return -EPERM; + } + + { + bufferlist bl; + db->get(PREFIX_SUPER, "min_alloc_size", &bl); + auto p = bl.cbegin(); + try { + uint64_t val; + decode(val, p); + min_alloc_size = val; + min_alloc_size_order = ctz(val); + ceph_assert(min_alloc_size == 1u << min_alloc_size_order); + } catch (buffer::error& e) { + derr << __func__ << " unable to read min_alloc_size" << dendl; + return -EIO; + } + dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size + << std::dec << dendl; + } + _open_statfs(); + _set_alloc_sizes(); + _set_throttle_params(); + + _set_csum(); + _set_compression(); + _set_blob_size(); + + _validate_bdev(); + return 0; +} + +int BlueStore::_upgrade_super() +{ + dout(1) << __func__ << " from " << ondisk_format << ", latest " + << latest_ondisk_format << dendl; + if (ondisk_format < latest_ondisk_format) { + ceph_assert(ondisk_format > 0); + ceph_assert(ondisk_format < latest_ondisk_format); + + if (ondisk_format == 1) { + // changes: + // - super: added ondisk_format + // - super: added min_readable_ondisk_format + // - super: added min_compat_ondisk_format + // - super: added min_alloc_size + // - super: removed min_min_alloc_size + KeyValueDB::Transaction t = db->get_transaction(); + { + bufferlist bl; + db->get(PREFIX_SUPER, "min_min_alloc_size", &bl); + auto p = bl.cbegin(); + try { + uint64_t val; + decode(val, p); + min_alloc_size = val; + } catch (buffer::error& e) { + derr << __func__ << " failed to read min_min_alloc_size" << dendl; + return -EIO; + } + t->set(PREFIX_SUPER, "min_alloc_size", bl); + t->rmkey(PREFIX_SUPER, "min_min_alloc_size"); + } + ondisk_format = 2; + _prepare_ondisk_format_super(t); + int r = db->submit_transaction_sync(t); + ceph_assert(r == 0); + } + } + // done + dout(1) << __func__ << " done" << dendl; + return 0; +} + +void BlueStore::_assign_nid(TransContext *txc, OnodeRef o) +{ + if (o->onode.nid) { + ceph_assert(o->exists); + return; + } + uint64_t nid = ++nid_last; + dout(20) << __func__ << " " << nid << dendl; + o->onode.nid = nid; + txc->last_nid = nid; + o->exists = true; +} + +uint64_t BlueStore::_assign_blobid(TransContext *txc) +{ + uint64_t bid = ++blobid_last; + dout(20) << __func__ << " " << bid << dendl; + txc->last_blobid = bid; + return bid; +} + +void BlueStore::get_db_statistics(Formatter *f) +{ + db->get_statistics(f); +} + +BlueStore::TransContext *BlueStore::_txc_create( + Collection *c, OpSequencer *osr, + list<Context*> *on_commits) +{ + TransContext *txc = new TransContext(cct, c, osr, on_commits); + txc->t = db->get_transaction(); + osr->queue_new(txc); + dout(20) << __func__ << " osr " << osr << " = " << txc + << " seq " << 
txc->seq << dendl; + return txc; +} + +void BlueStore::_txc_calc_cost(TransContext *txc) +{ + // one "io" for the kv commit + auto ios = 1 + txc->ioc.get_num_ios(); + auto cost = throttle_cost_per_io.load(); + txc->cost = ios * cost + txc->bytes; + dout(10) << __func__ << " " << txc << " cost " << txc->cost << " (" + << ios << " ios * " << cost << " + " << txc->bytes + << " bytes)" << dendl; +} + +void BlueStore::_txc_update_store_statfs(TransContext *txc) +{ + if (txc->statfs_delta.is_empty()) + return; + + logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated()); + logger->inc(l_bluestore_stored, txc->statfs_delta.stored()); + logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed()); + logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated()); + logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original()); + + bufferlist bl; + txc->statfs_delta.encode(bl); + if (per_pool_stat_collection) { + string key; + get_pool_stat_key(txc->osd_pool_id, &key); + txc->t->merge(PREFIX_STAT, key, bl); + + std::lock_guard l(vstatfs_lock); + auto& stats = osd_pools[txc->osd_pool_id]; + stats += txc->statfs_delta; + + vstatfs += txc->statfs_delta; //non-persistent in this mode + + } else { + txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl); + + std::lock_guard l(vstatfs_lock); + vstatfs += txc->statfs_delta; + } + txc->statfs_delta.reset(); +} + +void BlueStore::_txc_state_proc(TransContext *txc) +{ + while (true) { + dout(10) << __func__ << " txc " << txc + << " " << txc->get_state_name() << dendl; + switch (txc->state) { + case TransContext::STATE_PREPARE: + txc->log_state_latency(logger, l_bluestore_state_prepare_lat); + if (txc->ioc.has_pending_aios()) { + txc->state = TransContext::STATE_AIO_WAIT; + txc->had_ios = true; + _txc_aio_submit(txc); + return; + } + // ** fall-thru ** + + case TransContext::STATE_AIO_WAIT: + { + utime_t lat = txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat); + if (lat >= cct->_conf->bluestore_log_op_age) { + dout(0) << __func__ << " slow aio_wait, txc = " << txc + << ", latency = " << lat + << dendl; + } + } + + _txc_finish_io(txc); // may trigger blocked txc's too + return; + + case TransContext::STATE_IO_DONE: + ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io + if (txc->had_ios) { + ++txc->osr->txc_with_unstable_io; + } + txc->log_state_latency(logger, l_bluestore_state_io_done_lat); + txc->state = TransContext::STATE_KV_QUEUED; + if (cct->_conf->bluestore_sync_submit_transaction) { + if (txc->last_nid >= nid_max || + txc->last_blobid >= blobid_max) { + dout(20) << __func__ + << " last_{nid,blobid} exceeds max, submit via kv thread" + << dendl; + } else if (txc->osr->kv_committing_serially) { + dout(20) << __func__ << " prior txc submitted via kv thread, us too" + << dendl; + // note: this is starvation-prone. once we have a txc in a busy + // sequencer that is committing serially it is possible to keep + // submitting new transactions fast enough that we get stuck doing + // so. the alternative is to block here... fixme? 
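+ // note (descriptive): when none of these guards applies, the final else
+ // below submits txc->t to the kv db immediately (without sync) and marks
+ // the txc STATE_KV_SUBMITTED, so the kv sync thread only has to make it
+ // durable.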
+ } else if (txc->osr->txc_with_unstable_io) { + dout(20) << __func__ << " prior txc(s) with unstable ios " + << txc->osr->txc_with_unstable_io.load() << dendl; + } else if (cct->_conf->bluestore_debug_randomize_serial_transaction && + rand() % cct->_conf->bluestore_debug_randomize_serial_transaction + == 0) { + dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread" + << dendl; + } else { + int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t); + ceph_assert(r == 0); + txc->state = TransContext::STATE_KV_SUBMITTED; + _txc_applied_kv(txc); + } + } + { + std::lock_guard l(kv_lock); + kv_queue.push_back(txc); + kv_cond.notify_one(); + if (txc->state != TransContext::STATE_KV_SUBMITTED) { + kv_queue_unsubmitted.push_back(txc); + ++txc->osr->kv_committing_serially; + } + if (txc->had_ios) + kv_ios++; + kv_throttle_costs += txc->cost; + } + return; + case TransContext::STATE_KV_SUBMITTED: + _txc_committed_kv(txc); + // ** fall-thru ** + + case TransContext::STATE_KV_DONE: + txc->log_state_latency(logger, l_bluestore_state_kv_done_lat); + if (txc->deferred_txn) { + txc->state = TransContext::STATE_DEFERRED_QUEUED; + _deferred_queue(txc); + return; + } + txc->state = TransContext::STATE_FINISHING; + break; + + case TransContext::STATE_DEFERRED_CLEANUP: + txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat); + txc->state = TransContext::STATE_FINISHING; + // ** fall-thru ** + + case TransContext::STATE_FINISHING: + txc->log_state_latency(logger, l_bluestore_state_finishing_lat); + _txc_finish(txc); + return; + + default: + derr << __func__ << " unexpected txc " << txc + << " state " << txc->get_state_name() << dendl; + ceph_abort_msg("unexpected txc state"); + return; + } + } +} + +void BlueStore::_txc_finish_io(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << dendl; + + /* + * we need to preserve the order of kv transactions, + * even though aio will complete in any order. 
+ */ + + OpSequencer *osr = txc->osr.get(); + std::lock_guard l(osr->qlock); + txc->state = TransContext::STATE_IO_DONE; + txc->ioc.release_running_aios(); + OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc); + while (p != osr->q.begin()) { + --p; + if (p->state < TransContext::STATE_IO_DONE) { + dout(20) << __func__ << " " << txc << " blocked by " << &*p << " " + << p->get_state_name() << dendl; + return; + } + if (p->state > TransContext::STATE_IO_DONE) { + ++p; + break; + } + } + do { + _txc_state_proc(&*p++); + } while (p != osr->q.end() && + p->state == TransContext::STATE_IO_DONE); + + if (osr->kv_submitted_waiters) { + osr->qcond.notify_all(); + } +} + +void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " txc " << txc + << " onodes " << txc->onodes + << " shared_blobs " << txc->shared_blobs + << dendl; + + // finalize onodes + for (auto o : txc->onodes) { + _record_onode(o, t); + o->flushing_count++; + } + + // objects we modified but didn't affect the onode + auto p = txc->modified_objects.begin(); + while (p != txc->modified_objects.end()) { + if (txc->onodes.count(*p) == 0) { + (*p)->flushing_count++; + ++p; + } else { + // remove dups with onodes list to avoid problems in _txc_finish + p = txc->modified_objects.erase(p); + } + } + + // finalize shared_blobs + for (auto sb : txc->shared_blobs) { + string key; + auto sbid = sb->get_sbid(); + get_shared_blob_key(sbid, &key); + if (sb->persistent->empty()) { + dout(20) << __func__ << " shared_blob 0x" + << std::hex << sbid << std::dec + << " is empty" << dendl; + t->rmkey(PREFIX_SHARED_BLOB, key); + } else { + bufferlist bl; + encode(*(sb->persistent), bl); + dout(20) << __func__ << " shared_blob 0x" + << std::hex << sbid << std::dec + << " is " << bl.length() << " " << *sb << dendl; + t->set(PREFIX_SHARED_BLOB, key, bl); + } + } +} + +void BlueStore::BSPerfTracker::update_from_perfcounters( + PerfCounters &logger) +{ + os_commit_latency_ns.consume_next( + logger.get_tavg_ns( + l_bluestore_commit_lat)); + os_apply_latency_ns.consume_next( + logger.get_tavg_ns( + l_bluestore_commit_lat)); +} + +void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " txc " << txc << std::hex + << " allocated 0x" << txc->allocated + << " released 0x" << txc->released + << std::dec << dendl; + + // We have to handle the case where we allocate *and* deallocate the + // same region in this transaction. The freelist doesn't like that. + // (Actually, the only thing that cares is the BitmapFreelistManager + // debug check. But that's important.) 
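+ // Worked example (illustrative values): if this txc allocated 0x10000~0x8000
+ // and released 0x14000~0x8000, the overlap 0x14000~0x4000 is subtracted from
+ // both sides below, so the freelist only sees an allocation of 0x10000~0x4000
+ // and a release of 0x18000~0x4000.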
+ interval_set<uint64_t> tmp_allocated, tmp_released; + interval_set<uint64_t> *pallocated = &txc->allocated; + interval_set<uint64_t> *preleased = &txc->released; + if (!txc->allocated.empty() && !txc->released.empty()) { + interval_set<uint64_t> overlap; + overlap.intersection_of(txc->allocated, txc->released); + if (!overlap.empty()) { + tmp_allocated = txc->allocated; + tmp_allocated.subtract(overlap); + tmp_released = txc->released; + tmp_released.subtract(overlap); + dout(20) << __func__ << " overlap 0x" << std::hex << overlap + << ", new allocated 0x" << tmp_allocated + << " released 0x" << tmp_released << std::dec + << dendl; + pallocated = &tmp_allocated; + preleased = &tmp_released; + } + } + + // update freelist with non-overlap sets + for (interval_set<uint64_t>::iterator p = pallocated->begin(); + p != pallocated->end(); + ++p) { + fm->allocate(p.get_start(), p.get_len(), t); + } + for (interval_set<uint64_t>::iterator p = preleased->begin(); + p != preleased->end(); + ++p) { + dout(20) << __func__ << " release 0x" << std::hex << p.get_start() + << "~" << p.get_len() << std::dec << dendl; + fm->release(p.get_start(), p.get_len(), t); + } + + _txc_update_store_statfs(txc); +} + +void BlueStore::_txc_applied_kv(TransContext *txc) +{ + for (auto ls : { &txc->onodes, &txc->modified_objects }) { + for (auto& o : *ls) { + dout(20) << __func__ << " onode " << o << " had " << o->flushing_count + << dendl; + if (--o->flushing_count == 0) { + std::lock_guard l(o->flush_lock); + o->flush_cond.notify_all(); + } + } + } +} + +void BlueStore::_txc_committed_kv(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << dendl; + { + std::lock_guard l(txc->osr->qlock); + txc->state = TransContext::STATE_KV_DONE; + if (txc->ch->commit_queue) { + txc->ch->commit_queue->queue(txc->oncommits); + } else { + finisher.queue(txc->oncommits); + } + } + txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat); + log_latency_fn( + __func__, + l_bluestore_commit_lat, + ceph::make_timespan(ceph_clock_now() - txc->start), + cct->_conf->bluestore_log_op_age, + [&](auto lat) { + return ", txc = " + stringify(txc); + } + ); +} + +void BlueStore::_txc_finish(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; + ceph_assert(txc->state == TransContext::STATE_FINISHING); + + for (auto& sb : txc->shared_blobs_written) { + sb->finish_write(txc->seq); + } + txc->shared_blobs_written.clear(); + + while (!txc->removed_collections.empty()) { + _queue_reap_collection(txc->removed_collections.front()); + txc->removed_collections.pop_front(); + } + + OpSequencerRef osr = txc->osr; + bool empty = false; + bool submit_deferred = false; + OpSequencer::q_list_t releasing_txc; + { + std::lock_guard l(osr->qlock); + txc->state = TransContext::STATE_DONE; + bool notify = false; + while (!osr->q.empty()) { + TransContext *txc = &osr->q.front(); + dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() + << dendl; + if (txc->state != TransContext::STATE_DONE) { + if (txc->state == TransContext::STATE_PREPARE && + deferred_aggressive) { + // for _osr_drain_preceding() + notify = true; + } + if (txc->state == TransContext::STATE_DEFERRED_QUEUED && + osr->q.size() > g_conf()->bluestore_max_deferred_txc) { + submit_deferred = true; + } + break; + } + + osr->q.pop_front(); + releasing_txc.push_back(*txc); + notify = true; + } + if (notify) { + osr->qcond.notify_all(); + } + if (osr->q.empty()) { + dout(20) << __func__ << " osr " << osr << " q now empty" << 
dendl; + empty = true; + } + } + while (!releasing_txc.empty()) { + // release to allocator only after all preceding txc's have also + // finished any deferred writes that potentially land in these + // blocks + auto txc = &releasing_txc.front(); + _txc_release_alloc(txc); + releasing_txc.pop_front(); + txc->log_state_latency(logger, l_bluestore_state_done_lat); + delete txc; + } + + if (submit_deferred) { + // we're pinning memory; flush! we could be more fine-grained here but + // i'm not sure it's worth the bother. + deferred_try_submit(); + } + + if (empty && osr->zombie) { + std::lock_guard l(zombie_osr_lock); + if (zombie_osr_set.erase(osr->cid)) { + dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; + } else { + dout(10) << __func__ << " empty zombie osr " << osr << " already reaped" + << dendl; + } + } + } + +void BlueStore::_txc_release_alloc(TransContext *txc) +{ + // it's expected we're called with lazy_release_lock already taken! + if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) { + int r = 0; + if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { + r = bdev->queue_discard(txc->released); + if (r == 0) { + dout(10) << __func__ << "(queued) " << txc << " " << std::hex + << txc->released << std::dec << dendl; + goto out; + } + } else if (cct->_conf->bdev_enable_discard) { + for (auto p = txc->released.begin(); p != txc->released.end(); ++p) { + bdev->discard(p.get_start(), p.get_len()); + } + } + dout(10) << __func__ << "(sync) " << txc << " " << std::hex + << txc->released << std::dec << dendl; + alloc->release(txc->released); + } + +out: + txc->allocated.clear(); + txc->released.clear(); +} + +void BlueStore::_osr_attach(Collection *c) +{ + // note: caller has RWLock on coll_map + auto q = coll_map.find(c->cid); + if (q != coll_map.end()) { + c->osr = q->second->osr; + ldout(cct, 10) << __func__ << " " << c->cid + << " reusing osr " << c->osr << " from existing coll " + << q->second << dendl; + } else { + std::lock_guard l(zombie_osr_lock); + auto p = zombie_osr_set.find(c->cid); + if (p == zombie_osr_set.end()) { + c->osr = new OpSequencer(this, c->cid); + ldout(cct, 10) << __func__ << " " << c->cid + << " fresh osr " << c->osr << dendl; + } else { + c->osr = p->second; + zombie_osr_set.erase(p); + ldout(cct, 10) << __func__ << " " << c->cid + << " resurrecting zombie osr " << c->osr << dendl; + c->osr->zombie = false; + } + } +} + +void BlueStore::_osr_register_zombie(OpSequencer *osr) +{ + std::lock_guard l(zombie_osr_lock); + dout(10) << __func__ << " " << osr << " " << osr->cid << dendl; + osr->zombie = true; + auto i = zombie_osr_set.emplace(osr->cid, osr); + // this is either a new insertion or the same osr is already there + ceph_assert(i.second || i.first->second == osr); +} + +void BlueStore::_osr_drain_preceding(TransContext *txc) +{ + OpSequencer *osr = txc->osr.get(); + dout(10) << __func__ << " " << txc << " osr " << osr << dendl; + ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? 
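+ // descriptive note: raising deferred_aggressive keeps the kv sync thread
+ // from idling while deferred batches are outstanding, so the drain below
+ // does not stall behind queued deferred io.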
+ { + // submit anything pending + deferred_lock.lock(); + if (osr->deferred_pending && !osr->deferred_running) { + _deferred_submit_unlock(osr); + } else { + deferred_lock.unlock(); + } + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } + osr->drain_preceding(txc); + --deferred_aggressive; + dout(10) << __func__ << " " << osr << " done" << dendl; +} + +void BlueStore::_osr_drain(OpSequencer *osr) +{ + dout(10) << __func__ << " " << osr << dendl; + ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? + { + // submit anything pending + deferred_lock.lock(); + if (osr->deferred_pending && !osr->deferred_running) { + _deferred_submit_unlock(osr); + } else { + deferred_lock.unlock(); + } + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } + osr->drain(); + --deferred_aggressive; + dout(10) << __func__ << " " << osr << " done" << dendl; +} + +void BlueStore::_osr_drain_all() +{ + dout(10) << __func__ << dendl; + + set<OpSequencerRef> s; + vector<OpSequencerRef> zombies; + { + RWLock::RLocker l(coll_lock); + for (auto& i : coll_map) { + s.insert(i.second->osr); + } + } + { + std::lock_guard l(zombie_osr_lock); + for (auto& i : zombie_osr_set) { + s.insert(i.second); + zombies.push_back(i.second); + } + } + dout(20) << __func__ << " osr_set " << s << dendl; + + ++deferred_aggressive; + { + // submit anything pending + deferred_try_submit(); + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } + { + std::lock_guard l(kv_finalize_lock); + kv_finalize_cond.notify_one(); + } + for (auto osr : s) { + dout(20) << __func__ << " drain " << osr << dendl; + osr->drain(); + } + --deferred_aggressive; + + { + std::lock_guard l(zombie_osr_lock); + for (auto& osr : zombies) { + if (zombie_osr_set.erase(osr->cid)) { + dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; + ceph_assert(osr->q.empty()); + } else if (osr->zombie) { + dout(10) << __func__ << " empty zombie osr " << osr + << " already reaped" << dendl; + ceph_assert(osr->q.empty()); + } else { + dout(10) << __func__ << " empty zombie osr " << osr + << " resurrected" << dendl; + } + } + } + + dout(10) << __func__ << " done" << dendl; +} + + +void BlueStore::_kv_start() +{ + dout(10) << __func__ << dendl; + + deferred_finisher.start(); + finisher.start(); + kv_sync_thread.create("bstore_kv_sync"); + kv_finalize_thread.create("bstore_kv_final"); +} + +void BlueStore::_kv_stop() +{ + dout(10) << __func__ << dendl; + { + std::unique_lock l(kv_lock); + while (!kv_sync_started) { + kv_cond.wait(l); + } + kv_stop = true; + kv_cond.notify_all(); + } + { + std::unique_lock l(kv_finalize_lock); + while (!kv_finalize_started) { + kv_finalize_cond.wait(l); + } + kv_finalize_stop = true; + kv_finalize_cond.notify_all(); + } + kv_sync_thread.join(); + kv_finalize_thread.join(); + ceph_assert(removed_collections.empty()); + { + std::lock_guard l(kv_lock); + kv_stop = false; + } + { + std::lock_guard l(kv_finalize_lock); + kv_finalize_stop = false; + } + dout(10) << __func__ << " stopping finishers" << dendl; + deferred_finisher.wait_for_empty(); + deferred_finisher.stop(); + finisher.wait_for_empty(); + finisher.stop(); + dout(10) << __func__ << " stopped" << dendl; +} + +void BlueStore::_kv_sync_thread() +{ + dout(10) << __func__ << " start" << dendl; + deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable + 
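+ // Each iteration of the loop below (descriptive summary): swap the pending
+ // queues out under kv_lock, flush the block device when needed so completed
+ // deferred writes become stable, extend the persisted nid/blobid
+ // preallocation in the earliest transaction to be submitted, submit any
+ // not-yet-submitted txcs, commit one final synchronous transaction (which
+ // also removes the keys of now-stable deferred writes), and hand the
+ // finished batch to the kv finalize thread.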
std::unique_lock l(kv_lock); + ceph_assert(!kv_sync_started); + kv_sync_started = true; + kv_cond.notify_all(); + + auto t0 = mono_clock::now(); + timespan twait = ceph::make_timespan(0); + size_t kv_submitted = 0; + + while (true) { + auto period = cct->_conf->bluestore_kv_sync_util_logging_s; + auto observation_period = + ceph::make_timespan(period); + auto elapsed = mono_clock::now() - t0; + if (period && elapsed >= observation_period) { + dout(5) << __func__ << " utilization: idle " + << twait << " of " << elapsed + << ", submitted: " << kv_submitted + <<dendl; + t0 = mono_clock::now(); + twait = ceph::make_timespan(0); + kv_submitted = 0; + } + ceph_assert(kv_committing.empty()); + if (kv_queue.empty() && + ((deferred_done_queue.empty() && deferred_stable_queue.empty()) || + !deferred_aggressive)) { + if (kv_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + auto t = mono_clock::now(); + kv_cond.wait(l); + twait += mono_clock::now() - t; + + dout(20) << __func__ << " wake" << dendl; + } else { + deque<TransContext*> kv_submitting; + deque<DeferredBatch*> deferred_done, deferred_stable; + uint64_t aios = 0, costs = 0; + + dout(20) << __func__ << " committing " << kv_queue.size() + << " submitting " << kv_queue_unsubmitted.size() + << " deferred done " << deferred_done_queue.size() + << " stable " << deferred_stable_queue.size() + << dendl; + kv_committing.swap(kv_queue); + kv_submitting.swap(kv_queue_unsubmitted); + deferred_done.swap(deferred_done_queue); + deferred_stable.swap(deferred_stable_queue); + aios = kv_ios; + costs = kv_throttle_costs; + kv_ios = 0; + kv_throttle_costs = 0; + l.unlock(); + + dout(30) << __func__ << " committing " << kv_committing << dendl; + dout(30) << __func__ << " submitting " << kv_submitting << dendl; + dout(30) << __func__ << " deferred_done " << deferred_done << dendl; + dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl; + + auto start = mono_clock::now(); + + bool force_flush = false; + // if bluefs is sharing the same device as data (only), then we + // can rely on the bluefs commit to flush the device and make + // deferred aios stable. that means that if we do have done deferred + // txcs AND we are not on a single device, we need to force a flush. + if (bluefs_single_shared_device && bluefs) { + if (aios) { + force_flush = true; + } else if (kv_committing.empty() && deferred_stable.empty()) { + force_flush = true; // there's nothing else to commit! + } else if (deferred_aggressive) { + force_flush = true; + } + } else { + if (aios || !deferred_done.empty()) { + force_flush = true; + } else { + dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl; + } + } + + if (force_flush) { + dout(20) << __func__ << " num_aios=" << aios + << " force_flush=" << (int)force_flush + << ", flushing, deferred done->stable" << dendl; + // flush/barrier on block device + bdev->flush(); + + // if we flush then deferred done are now deferred stable + deferred_stable.insert(deferred_stable.end(), deferred_done.begin(), + deferred_done.end()); + deferred_done.clear(); + } + auto after_flush = mono_clock::now(); + + // we will use one final transaction to force a sync + KeyValueDB::Transaction synct = db->get_transaction(); + + // increase {nid,blobid}_max? note that this covers both the + // case where we are approaching the max and the case we passed + // it. in either case, we increase the max in the earlier txn + // we submit. 
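+ // Illustration (example value): with bluestore_nid_prealloc = 1024, a new
+ // nid_max of nid_last + 1024 is persisted once nid_last comes within 512 of
+ // the current max, so txcs whose last_nid exceeds the persisted max (which
+ // forces submission through the kv thread, see _txc_state_proc) stay rare.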
+ uint64_t new_nid_max = 0, new_blobid_max = 0; + if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) { + KeyValueDB::Transaction t = + kv_submitting.empty() ? synct : kv_submitting.front()->t; + new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc; + bufferlist bl; + encode(new_nid_max, bl); + t->set(PREFIX_SUPER, "nid_max", bl); + dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl; + } + if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) { + KeyValueDB::Transaction t = + kv_submitting.empty() ? synct : kv_submitting.front()->t; + new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc; + bufferlist bl; + encode(new_blobid_max, bl); + t->set(PREFIX_SUPER, "blobid_max", bl); + dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl; + } + + for (auto txc : kv_committing) { + if (txc->state == TransContext::STATE_KV_QUEUED) { + txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat); + int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t); + ceph_assert(r == 0); + ++kv_submitted; + txc->state = TransContext::STATE_KV_SUBMITTED; + _txc_applied_kv(txc); + --txc->osr->kv_committing_serially; + if (txc->osr->kv_submitted_waiters) { + std::lock_guard l(txc->osr->qlock); + txc->osr->qcond.notify_all(); + } + + } else { + ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED); + txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat); + } + if (txc->had_ios) { + --txc->osr->txc_with_unstable_io; + } + } + + // release throttle *before* we commit. this allows new ops + // to be prepared and enter pipeline while we are waiting on + // the kv commit sync/flush. then hopefully on the next + // iteration there will already be ops awake. otherwise, we + // end up going to sleep, and then wake up when the very first + // transaction is ready for commit. + throttle_bytes.put(costs); + + if (bluefs && + after_flush - bluefs_last_balance > + ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) { + bluefs_last_balance = after_flush; + int r = _balance_bluefs_freespace(); + ceph_assert(r >= 0); + } + + // cleanup sync deferred keys + for (auto b : deferred_stable) { + for (auto& txc : b->txcs) { + bluestore_deferred_transaction_t& wt = *txc.deferred_txn; + ceph_assert(wt.released.empty()); // only kraken did this + string key; + get_deferred_key(wt.seq, &key); + synct->rm_single_key(PREFIX_DEFERRED, key); + } + } + + // submit synct synchronously (block and wait for it to commit) + int r = cct->_conf->bluestore_debug_omit_kv_commit ? 
0 : db->submit_transaction_sync(synct); + ceph_assert(r == 0); + + { + std::unique_lock m(kv_finalize_lock); + if (kv_committing_to_finalize.empty()) { + kv_committing_to_finalize.swap(kv_committing); + } else { + kv_committing_to_finalize.insert( + kv_committing_to_finalize.end(), + kv_committing.begin(), + kv_committing.end()); + kv_committing.clear(); + } + if (deferred_stable_to_finalize.empty()) { + deferred_stable_to_finalize.swap(deferred_stable); + } else { + deferred_stable_to_finalize.insert( + deferred_stable_to_finalize.end(), + deferred_stable.begin(), + deferred_stable.end()); + deferred_stable.clear(); + } + kv_finalize_cond.notify_one(); + } + + if (new_nid_max) { + nid_max = new_nid_max; + dout(10) << __func__ << " nid_max now " << nid_max << dendl; + } + if (new_blobid_max) { + blobid_max = new_blobid_max; + dout(10) << __func__ << " blobid_max now " << blobid_max << dendl; + } + + { + auto finish = mono_clock::now(); + ceph::timespan dur_flush = after_flush - start; + ceph::timespan dur_kv = finish - after_flush; + ceph::timespan dur = finish - start; + dout(20) << __func__ << " committed " << kv_committing.size() + << " cleaned " << deferred_stable.size() + << " in " << dur + << " (" << dur_flush << " flush + " << dur_kv << " kv commit)" + << dendl; + log_latency("kv_flush", + l_bluestore_kv_flush_lat, + dur_flush, + cct->_conf->bluestore_log_op_age); + log_latency("kv_commit", + l_bluestore_kv_commit_lat, + dur_kv, + cct->_conf->bluestore_log_op_age); + log_latency("kv_sync", + l_bluestore_kv_sync_lat, + dur, + cct->_conf->bluestore_log_op_age); + } + + if (bluefs) { + if (!bluefs_extents_reclaiming.empty()) { + dout(0) << __func__ << " releasing old bluefs 0x" << std::hex + << bluefs_extents_reclaiming << std::dec << dendl; + int r = 0; + if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { + r = bdev->queue_discard(bluefs_extents_reclaiming); + if (r == 0) { + goto clear; + } + } else if (cct->_conf->bdev_enable_discard) { + for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) { + bdev->discard(p.get_start(), p.get_len()); + } + } + + alloc->release(bluefs_extents_reclaiming); +clear: + bluefs_extents_reclaiming.clear(); + } + } + + l.lock(); + // previously deferred "done" are now "stable" by virtue of this + // commit cycle. 
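+      // (when bluefs shares the single device, the synchronous kv commit
+      // above flushed that device, which is what makes those aios durable;
+      // in the multi-device case deferred_done was already emptied by the
+      // explicit bdev->flush() earlier in this iteration.)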
+ deferred_stable_queue.swap(deferred_done); + } + } + dout(10) << __func__ << " finish" << dendl; + kv_sync_started = false; +} + +void BlueStore::_kv_finalize_thread() +{ + deque<TransContext*> kv_committed; + deque<DeferredBatch*> deferred_stable; + dout(10) << __func__ << " start" << dendl; + std::unique_lock l(kv_finalize_lock); + ceph_assert(!kv_finalize_started); + kv_finalize_started = true; + kv_finalize_cond.notify_all(); + while (true) { + ceph_assert(kv_committed.empty()); + ceph_assert(deferred_stable.empty()); + if (kv_committing_to_finalize.empty() && + deferred_stable_to_finalize.empty()) { + if (kv_finalize_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + kv_finalize_cond.wait(l); + dout(20) << __func__ << " wake" << dendl; + } else { + kv_committed.swap(kv_committing_to_finalize); + deferred_stable.swap(deferred_stable_to_finalize); + l.unlock(); + dout(20) << __func__ << " kv_committed " << kv_committed << dendl; + dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl; + + auto start = mono_clock::now(); + + while (!kv_committed.empty()) { + TransContext *txc = kv_committed.front(); + ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED); + _txc_state_proc(txc); + kv_committed.pop_front(); + } + + for (auto b : deferred_stable) { + auto p = b->txcs.begin(); + while (p != b->txcs.end()) { + TransContext *txc = &*p; + p = b->txcs.erase(p); // unlink here because + _txc_state_proc(txc); // this may destroy txc + } + delete b; + } + deferred_stable.clear(); + + if (!deferred_aggressive) { + if (deferred_queue_size >= deferred_batch_ops.load() || + throttle_deferred_bytes.past_midpoint()) { + deferred_try_submit(); + } + } + + // this is as good a place as any ... + _reap_collections(); + + logger->set(l_bluestore_fragmentation, + (uint64_t)(alloc->get_fragmentation() * 1000)); + + log_latency("kv_final", + l_bluestore_kv_final_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + + l.lock(); + } + } + dout(10) << __func__ << " finish" << dendl; + kv_finalize_started = false; +} + +bluestore_deferred_op_t *BlueStore::_get_deferred_op( + TransContext *txc, OnodeRef o) +{ + if (!txc->deferred_txn) { + txc->deferred_txn = new bluestore_deferred_transaction_t; + } + txc->deferred_txn->ops.push_back(bluestore_deferred_op_t()); + return &txc->deferred_txn->ops.back(); +} + +void BlueStore::_deferred_queue(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl; + deferred_lock.lock(); + if (!txc->osr->deferred_pending && + !txc->osr->deferred_running) { + deferred_queue.push_back(*txc->osr); + } + if (!txc->osr->deferred_pending) { + txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get()); + } + ++deferred_queue_size; + txc->osr->deferred_pending->txcs.push_back(*txc); + bluestore_deferred_transaction_t& wt = *txc->deferred_txn; + for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) { + const auto& op = *opi; + ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE); + bufferlist::const_iterator p = op.data.begin(); + for (auto e : op.extents) { + txc->osr->deferred_pending->prepare_write( + cct, wt.seq, e.offset, e.length, p); + } + } + if (deferred_aggressive && + !txc->osr->deferred_running) { + _deferred_submit_unlock(txc->osr.get()); + } else { + deferred_lock.unlock(); + } +} + +void BlueStore::deferred_try_submit() +{ + dout(20) << __func__ << " " << deferred_queue.size() << " osrs, " + << deferred_queue_size << " txcs" << dendl; + std::lock_guard 
l(deferred_lock); + vector<OpSequencerRef> osrs; + osrs.reserve(deferred_queue.size()); + for (auto& osr : deferred_queue) { + osrs.push_back(&osr); + } + for (auto& osr : osrs) { + if (osr->deferred_pending) { + if (!osr->deferred_running) { + _deferred_submit_unlock(osr.get()); + deferred_lock.lock(); + } else { + dout(20) << __func__ << " osr " << osr << " already has running" + << dendl; + } + } else { + dout(20) << __func__ << " osr " << osr << " has no pending" << dendl; + } + } +} + +void BlueStore::_deferred_submit_unlock(OpSequencer *osr) +{ + dout(10) << __func__ << " osr " << osr + << " " << osr->deferred_pending->iomap.size() << " ios pending " + << dendl; + ceph_assert(osr->deferred_pending); + ceph_assert(!osr->deferred_running); + + auto b = osr->deferred_pending; + deferred_queue_size -= b->seq_bytes.size(); + ceph_assert(deferred_queue_size >= 0); + + osr->deferred_running = osr->deferred_pending; + osr->deferred_pending = nullptr; + + deferred_lock.unlock(); + + for (auto& txc : b->txcs) { + txc.log_state_latency(logger, l_bluestore_state_deferred_queued_lat); + } + uint64_t start = 0, pos = 0; + bufferlist bl; + auto i = b->iomap.begin(); + while (true) { + if (i == b->iomap.end() || i->first != pos) { + if (bl.length()) { + dout(20) << __func__ << " write 0x" << std::hex + << start << "~" << bl.length() + << " crc " << bl.crc32c(-1) << std::dec << dendl; + if (!g_conf()->bluestore_debug_omit_block_device_write) { + logger->inc(l_bluestore_deferred_write_ops); + logger->inc(l_bluestore_deferred_write_bytes, bl.length()); + int r = bdev->aio_write(start, bl, &b->ioc, false); + ceph_assert(r == 0); + } + } + if (i == b->iomap.end()) { + break; + } + start = 0; + pos = i->first; + bl.clear(); + } + dout(20) << __func__ << " seq " << i->second.seq << " 0x" + << std::hex << pos << "~" << i->second.bl.length() << std::dec + << dendl; + if (!bl.length()) { + start = pos; + } + pos += i->second.bl.length(); + bl.claim_append(i->second.bl); + ++i; + } + + bdev->aio_submit(&b->ioc); +} + +struct C_DeferredTrySubmit : public Context { + BlueStore *store; + C_DeferredTrySubmit(BlueStore *s) : store(s) {} + void finish(int r) { + store->deferred_try_submit(); + } +}; + +void BlueStore::_deferred_aio_finish(OpSequencer *osr) +{ + dout(10) << __func__ << " osr " << osr << dendl; + ceph_assert(osr->deferred_running); + DeferredBatch *b = osr->deferred_running; + + { + std::lock_guard l(deferred_lock); + ceph_assert(osr->deferred_running == b); + osr->deferred_running = nullptr; + if (!osr->deferred_pending) { + dout(20) << __func__ << " dequeueing" << dendl; + auto q = deferred_queue.iterator_to(*osr); + deferred_queue.erase(q); + } else if (deferred_aggressive) { + dout(20) << __func__ << " queuing async deferred_try_submit" << dendl; + deferred_finisher.queue(new C_DeferredTrySubmit(this)); + } else { + dout(20) << __func__ << " leaving queued, more pending" << dendl; + } + } + + { + uint64_t costs = 0; + { + std::lock_guard l2(osr->qlock); + for (auto& i : b->txcs) { + TransContext *txc = &i; + txc->log_state_latency(logger, l_bluestore_state_deferred_aio_wait_lat); + txc->state = TransContext::STATE_DEFERRED_CLEANUP; + costs += txc->cost; + } + } + throttle_deferred_bytes.put(costs); + std::lock_guard l(kv_lock); + deferred_done_queue.emplace_back(b); + } + + // in the normal case, do not bother waking up the kv thread; it will + // catch us on the next commit anyway. 
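+  // deferred_aggressive is only non-zero while someone is actively draining
+  // or throttled (e.g. _osr_drain_all(), or queue_transactions() failing to
+  // get throttle_deferred_bytes), so in that case wake the kv thread now
+  // rather than waiting for the next commit.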
+ if (deferred_aggressive) { + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } +} + +int BlueStore::_deferred_replay() +{ + dout(10) << __func__ << " start" << dendl; + int count = 0; + int r = 0; + CollectionRef ch = _get_collection(coll_t::meta()); + bool fake_ch = false; + if (!ch) { + // hmm, replaying initial mkfs? + ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get()); + fake_ch = true; + } + OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get()); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED); + for (it->lower_bound(string()); it->valid(); it->next(), ++count) { + dout(20) << __func__ << " replay " << pretty_binary_string(it->key()) + << dendl; + bluestore_deferred_transaction_t *deferred_txn = + new bluestore_deferred_transaction_t; + bufferlist bl = it->value(); + auto p = bl.cbegin(); + try { + decode(*deferred_txn, p); + } catch (buffer::error& e) { + derr << __func__ << " failed to decode deferred txn " + << pretty_binary_string(it->key()) << dendl; + delete deferred_txn; + r = -EIO; + goto out; + } + TransContext *txc = _txc_create(ch.get(), osr, nullptr); + txc->deferred_txn = deferred_txn; + txc->state = TransContext::STATE_KV_DONE; + _txc_state_proc(txc); + } + out: + dout(20) << __func__ << " draining osr" << dendl; + _osr_register_zombie(osr); + _osr_drain_all(); + if (fake_ch) { + new_coll_map.clear(); + } + dout(10) << __func__ << " completed " << count << " events" << dendl; + return r; +} + +// --------------------------- +// transactions + +int BlueStore::queue_transactions( + CollectionHandle& ch, + vector<Transaction>& tls, + TrackedOpRef op, + ThreadPool::TPHandle *handle) +{ + FUNCTRACE(cct); + list<Context *> on_applied, on_commit, on_applied_sync; + ObjectStore::Transaction::collect_contexts( + tls, &on_applied, &on_commit, &on_applied_sync); + + auto start = mono_clock::now(); + + Collection *c = static_cast<Collection*>(ch.get()); + OpSequencer *osr = c->osr.get(); + dout(10) << __func__ << " ch " << c << " " << c->cid << dendl; + + // prepare + TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr, + &on_commit); + + for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) { + txc->bytes += (*p).get_num_bytes(); + _txc_add_transaction(txc, &(*p)); + } + _txc_calc_cost(txc); + + _txc_write_nodes(txc, txc->t); + + // journal deferred items + if (txc->deferred_txn) { + txc->deferred_txn->seq = ++deferred_seq; + bufferlist bl; + encode(*txc->deferred_txn, bl); + string key; + get_deferred_key(txc->deferred_txn->seq, &key); + txc->t->set(PREFIX_DEFERRED, key, bl); + } + + _txc_finalize_kv(txc, txc->t); + if (handle) + handle->suspend_tp_timeout(); + + auto tstart = mono_clock::now(); + throttle_bytes.get(txc->cost); + if (txc->deferred_txn) { + // ensure we do not block here because of deferred writes + if (!throttle_deferred_bytes.get_or_fail(txc->cost)) { + dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive" + << dendl; + ++deferred_aggressive; + deferred_try_submit(); + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } + throttle_deferred_bytes.get(txc->cost); + --deferred_aggressive; + } + } + auto tend = mono_clock::now(); + + if (handle) + handle->reset_tp_timeout(); + + logger->inc(l_bluestore_txc); + + // execute (start) + _txc_state_proc(txc); + + // we're immediately readable (unlike FileStore) + for (auto c : on_applied_sync) { + c->complete(0); + } + if (!on_applied.empty()) { + if 
(c->commit_queue) { + c->commit_queue->queue(on_applied); + } else { + finisher.queue(on_applied); + } + } + + log_latency("submit_transact", + l_bluestore_submit_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + log_latency("throttle_transact", + l_bluestore_throttle_lat, + tend - tstart, + cct->_conf->bluestore_log_op_age); + return 0; +} + +void BlueStore::_txc_aio_submit(TransContext *txc) +{ + dout(10) << __func__ << " txc " << txc << dendl; + bdev->aio_submit(&txc->ioc); +} + +void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) +{ + Transaction::iterator i = t->begin(); + + _dump_transaction<30>(cct, t); + + vector<CollectionRef> cvec(i.colls.size()); + unsigned j = 0; + for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end(); + ++p, ++j) { + cvec[j] = _get_collection(*p); + } + + vector<OnodeRef> ovec(i.objects.size()); + + for (int pos = 0; i.have_op(); ++pos) { + Transaction::Op *op = i.decode_op(); + int r = 0; + + // no coll or obj + if (op->op == Transaction::OP_NOP) + continue; + + + // collection operations + CollectionRef &c = cvec[op->cid]; + + // initialize osd_pool_id and do a smoke test that all collections belong + // to the same pool + spg_t pgid; + if (!!c ? c->cid.is_pg(&pgid) : false) { + ceph_assert(txc->osd_pool_id == META_POOL_ID || + txc->osd_pool_id == pgid.pool()); + txc->osd_pool_id = pgid.pool(); + } + + switch (op->op) { + case Transaction::OP_RMCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + r = _remove_collection(txc, cid, &c); + if (!r) + continue; + } + break; + + case Transaction::OP_MKCOLL: + { + ceph_assert(!c); + const coll_t &cid = i.get_cid(op->cid); + r = _create_collection(txc, cid, op->split_bits, &c); + if (!r) + continue; + } + break; + + case Transaction::OP_SPLIT_COLLECTION: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_SPLIT_COLLECTION2: + { + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem); + if (!r) + continue; + } + break; + + case Transaction::OP_MERGE_COLLECTION: + { + uint32_t bits = op->split_bits; + r = _merge_collection(txc, &c, cvec[op->dest_cid], bits); + if (!r) + continue; + } + break; + + case Transaction::OP_COLL_HINT: + { + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + dout(10) << __func__ << " collection hint objects is a no-op, " + << " pg_num " << pg_num << " num_objects " << num_objs + << dendl; + } else { + // Ignore the hint + dout(10) << __func__ << " unknown collection hint " << type << dendl; + } + continue; + } + break; + + case Transaction::OP_COLL_SETATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RMATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RENAME: + ceph_abort_msg("not implemented"); + break; + } + if (r < 0) { + derr << __func__ << " error " << cpp_strerror(r) + << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + _dump_transaction<0>(cct, t); + ceph_abort_msg("unexpected error"); + } + + // these operations implicity create the object + bool create = false; + if (op->op == Transaction::OP_TOUCH || + op->op == Transaction::OP_WRITE || + op->op == Transaction::OP_ZERO) { + create = true; + } + + // object operations + RWLock::WLocker l(c->lock); + OnodeRef &o = 
ovec[op->oid]; + if (!o) { + ghobject_t oid = i.get_oid(op->oid); + o = c->get_onode(oid, create); + } + if (!create && (!o || !o->exists)) { + dout(10) << __func__ << " op " << op->op << " got ENOENT on " + << i.get_oid(op->oid) << dendl; + r = -ENOENT; + goto endop; + } + + switch (op->op) { + case Transaction::OP_TOUCH: + r = _touch(txc, c, o); + break; + + case Transaction::OP_WRITE: + { + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(txc, c, o, off, len, bl, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(txc, c, o, off, len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + uint64_t off = op->off; + r = _truncate(txc, c, o, off); + } + break; + + case Transaction::OP_REMOVE: + { + r = _remove(txc, c, o); + } + break; + + case Transaction::OP_SETATTR: + { + string name = i.decode_string(); + bufferptr bp; + i.decode_bp(bp); + r = _setattr(txc, c, o, name, bp); + } + break; + + case Transaction::OP_SETATTRS: + { + map<string, bufferptr> aset; + i.decode_attrset(aset); + r = _setattrs(txc, c, o, aset); + } + break; + + case Transaction::OP_RMATTR: + { + string name = i.decode_string(); + r = _rmattr(txc, c, o, name); + } + break; + + case Transaction::OP_RMATTRS: + { + r = _rmattrs(txc, c, o); + } + break; + + case Transaction::OP_CLONE: + { + OnodeRef& no = ovec[op->dest_oid]; + if (!no) { + const ghobject_t& noid = i.get_oid(op->dest_oid); + no = c->get_onode(noid, true); + } + r = _clone(txc, c, o, no); + } + break; + + case Transaction::OP_CLONERANGE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_CLONERANGE2: + { + OnodeRef& no = ovec[op->dest_oid]; + if (!no) { + const ghobject_t& noid = i.get_oid(op->dest_oid); + no = c->get_onode(noid, true); + } + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + r = _clone_range(txc, c, o, no, srcoff, len, dstoff); + } + break; + + case Transaction::OP_COLL_ADD: + ceph_abort_msg("not implemented"); + break; + + case Transaction::OP_COLL_REMOVE: + ceph_abort_msg("not implemented"); + break; + + case Transaction::OP_COLL_MOVE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_COLL_MOVE_RENAME: + case Transaction::OP_TRY_RENAME: + { + ceph_assert(op->cid == op->dest_cid); + const ghobject_t& noid = i.get_oid(op->dest_oid); + OnodeRef& no = ovec[op->dest_oid]; + if (!no) { + no = c->get_onode(noid, false); + } + r = _rename(txc, c, o, no, noid); + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + r = _omap_clear(txc, c, o); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + bufferlist aset_bl; + i.decode_attrset_bl(&aset_bl); + r = _omap_setkeys(txc, c, o, aset_bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + bufferlist keys_bl; + i.decode_keyset_bl(&keys_bl); + r = _omap_rmkeys(txc, c, o, keys_bl); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkey_range(txc, c, o, first, last); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(txc, c, o, bl); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + r = _set_alloc_hint(txc, c, o, + op->expected_object_size, + op->expected_write_size, + op->alloc_hint_flags); + } + break; + + 
default: + derr << __func__ << " bad op " << op->op << dendl; + ceph_abort(); + } + + endop: + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD || + op->op == Transaction::OP_SETATTR || + op->op == Transaction::OP_SETATTRS || + op->op == Transaction::OP_RMATTR || + op->op == Transaction::OP_OMAP_SETKEYS || + op->op == Transaction::OP_OMAP_RMKEYS || + op->op == Transaction::OP_OMAP_RMKEYRANGE || + op->op == Transaction::OP_OMAP_SETHEADER)) + // -ENOENT is usually okay + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. + msg = "ENOSPC from bluestore, misconfigured cluster"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + derr << __func__ << " error " << cpp_strerror(r) + << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" + << dendl; + derr << msg << dendl; + _dump_transaction<0>(cct, t); + ceph_abort_msg("unexpected error"); + } + } + } +} + + + +// ----------------- +// write operations + +int BlueStore::_touch(TransContext *txc, + CollectionRef& c, + OnodeRef &o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + _assign_nid(txc, o); + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +void BlueStore::_pad_zeros( + bufferlist *bl, uint64_t *offset, + uint64_t chunk_size) +{ + auto length = bl->length(); + dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length + << " chunk_size 0x" << chunk_size << std::dec << dendl; + dout(40) << "before:\n"; + bl->hexdump(*_dout); + *_dout << dendl; + // front + size_t front_pad = *offset % chunk_size; + size_t back_pad = 0; + size_t pad_count = 0; + if (front_pad) { + size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length); + bufferptr z = buffer::create_small_page_aligned(chunk_size); + z.zero(0, front_pad, false); + pad_count += front_pad; + bl->copy(0, front_copy, z.c_str() + front_pad); + if (front_copy + front_pad < chunk_size) { + back_pad = chunk_size - (length + front_pad); + z.zero(front_pad + length, back_pad, false); + pad_count += back_pad; + } + bufferlist old, t; + old.swap(*bl); + t.substr_of(old, front_copy, length - front_copy); + bl->append(z); + bl->claim_append(t); + *offset -= front_pad; + length += pad_count; + } + + // back + uint64_t end = *offset + length; + unsigned back_copy = end % chunk_size; + if (back_copy) { + ceph_assert(back_pad == 0); + back_pad = chunk_size - back_copy; + ceph_assert(back_copy <= length); + bufferptr tail(chunk_size); + bl->copy(length - back_copy, back_copy, tail.c_str()); + tail.zero(back_copy, back_pad, false); + bufferlist old; + old.swap(*bl); + bl->substr_of(old, 0, length - back_copy); + bl->append(tail); + length += back_pad; + pad_count += back_pad; + } + dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x" + << back_pad << " on front/back, now 0x" << *offset << "~" + << length << std::dec << dendl; + dout(40) << "after:\n"; + 
bl->hexdump(*_dout); + *_dout << dendl; + if (pad_count) + logger->inc(l_bluestore_write_pad_bytes, pad_count); + ceph_assert(bl->length() == length); +} + +void BlueStore::_do_write_small( + TransContext *txc, + CollectionRef &c, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist::iterator& blp, + WriteContext *wctx) +{ + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + ceph_assert(length < min_alloc_size); + uint64_t end_offs = offset + length; + + logger->inc(l_bluestore_write_small); + logger->inc(l_bluestore_write_small_bytes, length); + + bufferlist bl; + blp.copy(length, bl); + + auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); + auto min_off = offset >= max_bsize ? offset - max_bsize : 0; + uint32_t alloc_len = min_alloc_size; + auto offset0 = p2align<uint64_t>(offset, alloc_len); + + bool any_change; + + // search suitable extent in both forward and reverse direction in + // [offset - target_max_blob_size, offset + target_max_blob_size] range + // then check if blob can be reused via can_reuse_blob func or apply + // direct/deferred write (the latter for extents including or higher + // than 'offset' only). + o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off); + + // Look for an existing mutable blob we can use. + auto begin = o->extent_map.extent_map.begin(); + auto end = o->extent_map.extent_map.end(); + auto ep = o->extent_map.seek_lextent(offset); + if (ep != begin) { + --ep; + if (ep->blob_end() <= offset) { + ++ep; + } + } + auto prev_ep = ep; + if (prev_ep != begin) { + --prev_ep; + } else { + prev_ep = end; // to avoid this extent check as it's a duplicate + } + + boost::container::flat_set<const bluestore_blob_t*> inspected_blobs; + // We don't want to have more blobs than min alloc units fit + // into 2 max blobs + size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1; + bool above_blob_threshold = false; + + inspected_blobs.reserve(blob_threshold); + + uint64_t max_off = 0; + auto start_ep = ep; + auto end_ep = ep; // exclusively + do { + any_change = false; + + if (ep != end && ep->logical_offset < offset + max_bsize) { + BlobRef b = ep->blob; + if (!above_blob_threshold) { + inspected_blobs.insert(&b->get_blob()); + above_blob_threshold = inspected_blobs.size() >= blob_threshold; + } + max_off = ep->logical_end(); + auto bstart = ep->blob_start(); + + dout(20) << __func__ << " considering " << *b + << " bstart 0x" << std::hex << bstart << std::dec << dendl; + if (bstart >= end_offs) { + dout(20) << __func__ << " ignoring distant " << *b << dendl; + } else if (!b->get_blob().is_mutable()) { + dout(20) << __func__ << " ignoring immutable " << *b << dendl; + } else if (ep->logical_offset % min_alloc_size != + ep->blob_offset % min_alloc_size) { + dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl; + } else { + uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); + // can we pad our head/tail out with zeros? 
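+        // e.g. chunk_size 0x1000, offset 0x1a00, end_offs 0x2300:
+        //   head_pad = p2phase(0x1a00, 0x1000) = 0xa00  (how far offset sits
+        //     past the previous chunk boundary)
+        //   tail_pad = p2nphase(0x2300, 0x1000) = 0xd00  (how far end_offs is
+        //     from the next chunk boundary)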
+ uint64_t head_pad, tail_pad; + head_pad = p2phase(offset, chunk_size); + tail_pad = p2nphase(end_offs, chunk_size); + if (head_pad || tail_pad) { + o->extent_map.fault_range(db, offset - head_pad, + end_offs - offset + head_pad + tail_pad); + } + if (head_pad && + o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) { + head_pad = 0; + } + if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) { + tail_pad = 0; + } + + uint64_t b_off = offset - head_pad - bstart; + uint64_t b_len = length + head_pad + tail_pad; + + // direct write into unused blocks of an existing mutable blob? + if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && + b->get_blob().get_ondisk_length() >= b_off + b_len && + b->get_blob().is_unused(b_off, b_len) && + b->get_blob().is_allocated(b_off, b_len)) { + _apply_padding(head_pad, tail_pad, bl); + + dout(20) << __func__ << " write to unused 0x" << std::hex + << b_off << "~" << b_len + << " pad 0x" << head_pad << " + 0x" << tail_pad + << std::dec << " of mutable " << *b << dendl; + _buffer_cache_write(txc, b, b_off, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + if (!g_conf()->bluestore_debug_omit_block_device_write) { + if (b_len <= prefer_deferred_size) { + dout(20) << __func__ << " deferring small 0x" << std::hex + << b_len << std::dec << " unused write via deferred" << dendl; + bluestore_deferred_op_t *op = _get_deferred_op(txc, o); + op->op = bluestore_deferred_op_t::OP_WRITE; + b->get_blob().map( + b_off, b_len, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + op->data = bl; + } else { + b->get_blob().map_bl( + b_off, bl, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, + &txc->ioc, wctx->buffered); + }); + } + } + b->dirty_blob().calc_csum(b_off, bl); + dout(20) << __func__ << " lex old " << *ep << dendl; + Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, + b, + &wctx->old_extents); + b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + logger->inc(l_bluestore_write_small_unused); + return; + } + // read some data to fill out the chunk? + uint64_t head_read = p2phase(b_off, chunk_size); + uint64_t tail_read = p2nphase(b_off + b_len, chunk_size); + if ((head_read || tail_read) && + (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) && + head_read + tail_read < min_alloc_size) { + b_off -= head_read; + b_len += head_read + tail_read; + + } else { + head_read = tail_read = 0; + } + + // chunk-aligned deferred overwrite? 
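+        // i.e. after widening b_off/b_len by head_read/tail_read above, the
+        // target range is chunk-aligned and already backed by allocated disk
+        // space, so it can be rewritten in place via the deferred-write path
+        // instead of allocating new space.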
+ if (b->get_blob().get_ondisk_length() >= b_off + b_len && + b_off % chunk_size == 0 && + b_len % chunk_size == 0 && + b->get_blob().is_allocated(b_off, b_len)) { + + _apply_padding(head_pad, tail_pad, bl); + + dout(20) << __func__ << " reading head 0x" << std::hex << head_read + << " and tail 0x" << tail_read << std::dec << dendl; + if (head_read) { + bufferlist head_bl; + int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read, + head_bl, 0); + ceph_assert(r >= 0 && r <= (int)head_read); + size_t zlen = head_read - r; + if (zlen) { + head_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + head_bl.claim_append(bl); + bl.swap(head_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + if (tail_read) { + bufferlist tail_bl; + int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read, + tail_bl, 0); + ceph_assert(r >= 0 && r <= (int)tail_read); + size_t zlen = tail_read - r; + if (zlen) { + tail_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + bl.claim_append(tail_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + logger->inc(l_bluestore_write_small_pre_read); + + _buffer_cache_write(txc, b, b_off, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + if (b->get_blob().csum_type) { + b->dirty_blob().calc_csum(b_off, bl); + } + + if (!g_conf()->bluestore_debug_omit_block_device_write) { + bluestore_deferred_op_t *op = _get_deferred_op(txc, o); + op->op = bluestore_deferred_op_t::OP_WRITE; + int r = b->get_blob().map( + b_off, b_len, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + ceph_assert(r == 0); + op->data.claim(bl); + dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" + << b_len << std::dec << " of mutable " << *b + << " at " << op->extents << dendl; + } + + Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, + b, &wctx->old_extents); + b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + logger->inc(l_bluestore_write_small_deferred); + return; + } + // try to reuse blob if we can + if (b->can_reuse_blob(min_alloc_size, + max_bsize, + offset0 - bstart, + &alloc_len)) { + ceph_assert(alloc_len == min_alloc_size); // expecting data always + // fit into reused blob + // Need to check for pending writes desiring to + // reuse the same pextent. The rationale is that during GC two chunks + // from garbage blobs(compressed?) can share logical space within the same + // AU. That's in turn might be caused by unaligned len in clone_range2. + // Hence the second write will fail in an attempt to reuse blob at + // do_alloc_write(). 
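+        // (has_conflict() below is that check: it scans the writes already
+        // queued in this WriteContext for one against the same blob whose
+        // min_alloc_size-aligned range overlaps this one.)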
+ if (!wctx->has_conflict(b, + offset0, + offset0 + alloc_len, + min_alloc_size)) { + + // we can't reuse pad_head/pad_tail since they might be truncated + // due to existent extents + uint64_t b_off = offset - bstart; + uint64_t b_off0 = b_off; + _pad_zeros(&bl, &b_off0, chunk_size); + + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + false, false); + logger->inc(l_bluestore_write_small_unused); + return; + } + } + } + ++ep; + end_ep = ep; + any_change = true; + } // if (ep != end && ep->logical_offset < offset + max_bsize) + + // check extent for reuse in reverse order + if (prev_ep != end && prev_ep->logical_offset >= min_off) { + BlobRef b = prev_ep->blob; + if (!above_blob_threshold) { + inspected_blobs.insert(&b->get_blob()); + above_blob_threshold = inspected_blobs.size() >= blob_threshold; + } + start_ep = prev_ep; + auto bstart = prev_ep->blob_start(); + dout(20) << __func__ << " considering " << *b + << " bstart 0x" << std::hex << bstart << std::dec << dendl; + if (b->can_reuse_blob(min_alloc_size, + max_bsize, + offset0 - bstart, + &alloc_len)) { + ceph_assert(alloc_len == min_alloc_size); // expecting data always + // fit into reused blob + // Need to check for pending writes desiring to + // reuse the same pextent. The rationale is that during GC two chunks + // from garbage blobs(compressed?) can share logical space within the same + // AU. That's in turn might be caused by unaligned len in clone_range2. + // Hence the second write will fail in an attempt to reuse blob at + // do_alloc_write(). + if (!wctx->has_conflict(b, + offset0, + offset0 + alloc_len, + min_alloc_size)) { + + uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); + uint64_t b_off = offset - bstart; + uint64_t b_off0 = b_off; + _pad_zeros(&bl, &b_off0, chunk_size); + + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + false, false); + logger->inc(l_bluestore_write_small_unused); + return; + } + } + if (prev_ep != begin) { + --prev_ep; + any_change = true; + } else { + prev_ep = end; // to avoid useless first extent re-check + } + } // if (prev_ep != end && prev_ep->logical_offset >= min_off) + } while (any_change); + + if (above_blob_threshold) { + dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size() + << " " << std::hex << min_off << "~" << max_off << std::dec + << dendl; + ceph_assert(start_ep != end_ep); + for (auto ep = start_ep; ep != end_ep; ++ep) { + dout(20) << __func__ << " inserting for GC " + << std::hex << ep->logical_offset << "~" << ep->length + << std::dec << dendl; + + wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length); + } + // insert newly written extent to GC + wctx->extents_to_gc.union_insert(offset, length); + dout(20) << __func__ << " inserting (last) for GC " + << std::hex << offset << "~" << length + << std::dec << dendl; + } + // new blob. 
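+  // (reached only when none of the existing blobs could absorb the write:
+  // the data is placed at its natural position within one min_alloc_size
+  // unit (b_off) and padded out to block_size just below.)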
+  BlobRef b = c->new_blob();
+  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
+  uint64_t b_off0 = b_off;
+  _pad_zeros(&bl, &b_off0, block_size);
+  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+              min_alloc_size != block_size, // use the 'unused' bitmap only when
+                                            // the alloc granularity differs from
+                                            // the disk block size
+              true);
+
+  return;
+}
+
+void BlueStore::_do_write_big(
+    TransContext *txc,
+    CollectionRef &c,
+    OnodeRef o,
+    uint64_t offset, uint64_t length,
+    bufferlist::iterator& blp,
+    WriteContext *wctx)
+{
+  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << " target_blob_size 0x" << wctx->target_blob_size << std::dec
+           << " compress " << (int)wctx->compress
+           << dendl;
+  logger->inc(l_bluestore_write_big);
+  logger->inc(l_bluestore_write_big_bytes, length);
+  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+  while (length > 0) {
+    bool new_blob = false;
+    uint32_t l = std::min(max_bsize, length);
+    BlobRef b;
+    uint32_t b_off = 0;
+
+    // attempt to reuse an existing blob
+    if (!wctx->compress) {
+      // look for an existing mutable blob we can reuse
+      auto begin = o->extent_map.extent_map.begin();
+      auto end = o->extent_map.extent_map.end();
+      auto ep = o->extent_map.seek_lextent(offset);
+      auto prev_ep = ep;
+      if (prev_ep != begin) {
+        --prev_ep;
+      } else {
+        prev_ep = end; // to avoid this extent check as it's a duplicate
+      }
+      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+      // search suitable extent in both forward and reverse direction in
+      // [offset - target_max_blob_size, offset + target_max_blob_size] range
+      // then check if blob can be reused via can_reuse_blob func.
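+      // rough shape of the search: ep walks forward from the write position,
+      // prev_ep walks backward, and the do/while below stops as soon as one
+      // of them yields a reusable blob (b != nullptr) or both run out of
+      // candidates in range.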
+ bool any_change; + do { + any_change = false; + if (ep != end && ep->logical_offset < offset + max_bsize) { + if (offset >= ep->blob_start() && + ep->blob->can_reuse_blob(min_alloc_size, max_bsize, + offset - ep->blob_start(), + &l)) { + b = ep->blob; + b_off = offset - ep->blob_start(); + prev_ep = end; // to avoid check below + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; + } else { + ++ep; + any_change = true; + } + } + + if (prev_ep != end && prev_ep->logical_offset >= min_off) { + if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize, + offset - prev_ep->blob_start(), + &l)) { + b = prev_ep->blob; + b_off = offset - prev_ep->blob_start(); + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; + } else if (prev_ep != begin) { + --prev_ep; + any_change = true; + } else { + prev_ep = end; // to avoid useless first extent re-check + } + } + } while (b == nullptr && any_change); + } + if (b == nullptr) { + b = c->new_blob(); + b_off = 0; + new_blob = true; + } + + bufferlist t; + blp.copy(l, t); + wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob); + offset += l; + length -= l; + logger->inc(l_bluestore_write_big_blobs); + } +} + +int BlueStore::_do_alloc_write( + TransContext *txc, + CollectionRef coll, + OnodeRef o, + WriteContext *wctx) +{ + dout(20) << __func__ << " txc " << txc + << " " << wctx->writes.size() << " blobs" + << dendl; + if (wctx->writes.empty()) { + return 0; + } + + CompressorRef c; + double crr = 0; + if (wctx->compress) { + c = select_option( + "compression_algorithm", + compressor, + [&]() { + string val; + if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) { + CompressorRef cp = compressor; + if (!cp || cp->get_type_name() != val) { + cp = Compressor::create(cct, val); + if (!cp) { + if (_set_compression_alert(false, val.c_str())) { + derr << __func__ << " unable to initialize " << val.c_str() + << " compressor" << dendl; + } + } + } + return boost::optional<CompressorRef>(cp); + } + return boost::optional<CompressorRef>(); + } + ); + + crr = select_option( + "compression_required_ratio", + cct->_conf->bluestore_compression_required_ratio, + [&]() { + double val; + if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) { + return boost::optional<double>(val); + } + return boost::optional<double>(); + } + ); + } + + // checksum + int64_t csum = csum_type.load(); + csum = select_option( + "csum_type", + csum, + [&]() { + int64_t val; + if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) { + return boost::optional<int64_t>(val); + } + return boost::optional<int64_t>(); + } + ); + + // compress (as needed) and calc needed space + uint64_t need = 0; + auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); + for (auto& wi : wctx->writes) { + if (c && wi.blob_length > min_alloc_size) { + auto start = mono_clock::now(); + + // compress + ceph_assert(wi.b_off == 0); + ceph_assert(wi.blob_length == wi.bl.length()); + + // FIXME: memory alignment here is bad + bufferlist t; + int r = c->compress(wi.bl, t); + uint64_t want_len_raw = wi.blob_length * crr; + uint64_t want_len = p2roundup(want_len_raw, min_alloc_size); + bool rejected = false; + uint64_t compressed_len = t.length(); + // do an approximate (fast) estimation for resulting blob size + // that doesn't take header overhead into account + uint64_t result_len = p2roundup(compressed_len, min_alloc_size); + if (r == 0 
&& result_len <= want_len && result_len < wi.blob_length) { + bluestore_compression_header_t chdr; + chdr.type = c->get_type(); + chdr.length = t.length(); + encode(chdr, wi.compressed_bl); + wi.compressed_bl.claim_append(t); + + compressed_len = wi.compressed_bl.length(); + result_len = p2roundup(compressed_len, min_alloc_size); + if (result_len <= want_len && result_len < wi.blob_length) { + // Cool. We compressed at least as much as we were hoping to. + // pad out to min_alloc_size + wi.compressed_bl.append_zero(result_len - compressed_len); + wi.compressed_len = compressed_len; + wi.compressed = true; + logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len); + dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length + << " -> 0x" << compressed_len << " => 0x" << result_len + << " with " << c->get_type() + << std::dec << dendl; + txc->statfs_delta.compressed() += compressed_len; + txc->statfs_delta.compressed_original() += wi.blob_length; + txc->statfs_delta.compressed_allocated() += result_len; + logger->inc(l_bluestore_compress_success_count); + need += result_len; + } else { + rejected = true; + } + } else if (r != 0) { + dout(5) << __func__ << std::hex << " 0x" << wi.blob_length + << " bytes compressed using " << c->get_type_name() + << std::dec + << " failed with errcode = " << r + << ", leaving uncompressed" + << dendl; + logger->inc(l_bluestore_compress_rejected_count); + need += wi.blob_length; + } else { + rejected = true; + } + + if (rejected) { + dout(20) << __func__ << std::hex << " 0x" << wi.blob_length + << " compressed to 0x" << compressed_len << " -> 0x" << result_len + << " with " << c->get_type() + << ", which is more than required 0x" << want_len_raw + << " -> 0x" << want_len + << ", leaving uncompressed" + << std::dec << dendl; + logger->inc(l_bluestore_compress_rejected_count); + need += wi.blob_length; + } + log_latency("compress@_do_alloc_write", + l_bluestore_compress_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age ); + } else { + need += wi.blob_length; + } + } + PExtentVector prealloc; + prealloc.reserve(2 * wctx->writes.size());; + int64_t prealloc_left = 0; + prealloc_left = alloc->allocate( + need, min_alloc_size, need, + 0, &prealloc); + if (prealloc_left < 0 || prealloc_left < (int64_t)need) { + derr << __func__ << " failed to allocate 0x" << std::hex << need + << " allocated 0x " << (prealloc_left < 0 ? 
0 : prealloc_left) + << " min_alloc_size 0x" << min_alloc_size + << " available 0x " << alloc->get_free() + << std::dec << dendl; + if (prealloc.size()) { + alloc->release(prealloc); + } + return -ENOSPC; + } + + dout(20) << __func__ << " prealloc " << prealloc << dendl; + auto prealloc_pos = prealloc.begin(); + + for (auto& wi : wctx->writes) { + BlobRef b = wi.b; + bluestore_blob_t& dblob = b->dirty_blob(); + uint64_t b_off = wi.b_off; + bufferlist *l = &wi.bl; + uint64_t final_length = wi.blob_length; + uint64_t csum_length = wi.blob_length; + if (wi.compressed) { + final_length = wi.compressed_bl.length(); + csum_length = final_length; + unsigned csum_order = ctz(csum_length); + l = &wi.compressed_bl; + dblob.set_compressed(wi.blob_length, wi.compressed_len); + if (csum != Checksummer::CSUM_NONE) { + dout(20) << __func__ << " initialize csum setting for compressed blob " << *b + << " csum_type " << Checksummer::get_csum_type_string(csum) + << " csum_order " << csum_order + << " csum_length 0x" << std::hex << csum_length + << " blob_length 0x" << wi.blob_length + << " compressed_length 0x" << wi.compressed_len << std::dec + << dendl; + dblob.init_csum(csum, csum_order, csum_length); + } + } else if (wi.new_blob) { + unsigned csum_order; + // initialize newly created blob only + ceph_assert(dblob.is_mutable()); + if (l->length() != wi.blob_length) { + // hrm, maybe we could do better here, but let's not bother. + dout(20) << __func__ << " forcing csum_order to block_size_order " + << block_size_order << dendl; + csum_order = block_size_order; + } else { + csum_order = std::min(wctx->csum_order, ctz(l->length())); + } + // try to align blob with max_blob_size to improve + // its reuse ratio, e.g. in case of reverse write + uint32_t suggested_boff = + (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize; + if ((suggested_boff % (1 << csum_order)) == 0 && + suggested_boff + final_length <= max_bsize && + suggested_boff > b_off) { + dout(20) << __func__ << " forcing blob_offset to 0x" + << std::hex << suggested_boff << std::dec << dendl; + ceph_assert(suggested_boff >= b_off); + csum_length += suggested_boff - b_off; + b_off = suggested_boff; + } + if (csum != Checksummer::CSUM_NONE) { + dout(20) << __func__ << " initialize csum setting for new blob " << *b + << " csum_type " << Checksummer::get_csum_type_string(csum) + << " csum_order " << csum_order + << " csum_length 0x" << std::hex << csum_length << std::dec + << dendl; + dblob.init_csum(csum, csum_order, csum_length); + } + } + + PExtentVector extents; + int64_t left = final_length; + while (left > 0) { + ceph_assert(prealloc_left > 0); + if (prealloc_pos->length <= left) { + prealloc_left -= prealloc_pos->length; + left -= prealloc_pos->length; + txc->statfs_delta.allocated() += prealloc_pos->length; + extents.push_back(*prealloc_pos); + ++prealloc_pos; + } else { + extents.emplace_back(prealloc_pos->offset, left); + prealloc_pos->offset += left; + prealloc_pos->length -= left; + prealloc_left -= left; + txc->statfs_delta.allocated() += left; + left = 0; + break; + } + } + for (auto& p : extents) { + txc->allocated.insert(p.offset, p.length); + } + dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents); + + dout(20) << __func__ << " blob " << *b << dendl; + if (dblob.has_csum()) { + dblob.calc_csum(b_off, *l); + } + + if (wi.mark_unused) { + ceph_assert(!dblob.is_compressed()); + auto b_end = b_off + wi.bl.length(); + if (b_off) { + dblob.add_unused(0, b_off); + } + uint64_t llen = dblob.get_logical_length(); + if 
(b_end < llen) { + dblob.add_unused(b_end, llen - b_end); + } + } + + Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset, + b_off + (wi.b_off0 - wi.b_off), + wi.length0, + wi.b, + nullptr); + wi.b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + _buffer_cache_write(txc, wi.b, b_off, wi.bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + // queue io + if (!g_conf()->bluestore_debug_omit_block_device_write) { + if (l->length() <= prefer_deferred_size.load()) { + dout(20) << __func__ << " deferring small 0x" << std::hex + << l->length() << std::dec << " write via deferred" << dendl; + bluestore_deferred_op_t *op = _get_deferred_op(txc, o); + op->op = bluestore_deferred_op_t::OP_WRITE; + int r = b->get_blob().map( + b_off, l->length(), + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + ceph_assert(r == 0); + op->data = *l; + logger->inc(l_bluestore_write_small_deferred); + } else { + b->get_blob().map_bl( + b_off, *l, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, &txc->ioc, false); + }); + logger->inc(l_bluestore_write_small_new); + } + } + } + ceph_assert(prealloc_pos == prealloc.end()); + ceph_assert(prealloc_left == 0); + return 0; +} + +void BlueStore::_wctx_finish( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + WriteContext *wctx, + set<SharedBlob*> *maybe_unshared_blobs) +{ + auto oep = wctx->old_extents.begin(); + while (oep != wctx->old_extents.end()) { + auto &lo = *oep; + oep = wctx->old_extents.erase(oep); + dout(20) << __func__ << " lex_old " << lo.e << dendl; + BlobRef b = lo.e.blob; + const bluestore_blob_t& blob = b->get_blob(); + if (blob.is_compressed()) { + if (lo.blob_empty) { + txc->statfs_delta.compressed() -= blob.get_compressed_payload_length(); + } + txc->statfs_delta.compressed_original() -= lo.e.length; + } + auto& r = lo.r; + txc->statfs_delta.stored() -= lo.e.length; + if (!r.empty()) { + dout(20) << __func__ << " blob release " << r << dendl; + if (blob.is_shared()) { + PExtentVector final; + c->load_shared_blob(b->shared_blob); + bool unshare = false; + bool* unshare_ptr = + !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare; + for (auto e : r) { + b->shared_blob->put_ref( + e.offset, e.length, &final, + unshare_ptr); + } + if (unshare) { + ceph_assert(maybe_unshared_blobs); + maybe_unshared_blobs->insert(b->shared_blob.get()); + } + dout(20) << __func__ << " shared_blob release " << final + << " from " << *b->shared_blob << dendl; + txc->write_shared_blob(b->shared_blob); + r.clear(); + r.swap(final); + } + } + // we can't invalidate our logical extents as we drop them because + // other lextents (either in our onode or others) may still + // reference them. but we can throw out anything that is no + // longer allocated. Note that this will leave behind edge bits + // that are no longer referenced but not deallocated (until they + // age out of the cache naturally). 
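+    // (extents recorded in txc->released below are returned to the allocator
+    // only later in the txc state machine, after the kv commit (and any
+    // deferred cleanup) has completed, so a crash in between cannot hand out
+    // blocks that committed data still references.)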
+ b->discard_unallocated(c.get()); + for (auto e : r) { + dout(20) << __func__ << " release " << e << dendl; + txc->released.insert(e.offset, e.length); + txc->statfs_delta.allocated() -= e.length; + if (blob.is_compressed()) { + txc->statfs_delta.compressed_allocated() -= e.length; + } + } + delete &lo; + if (b->is_spanning() && !b->is_referenced()) { + dout(20) << __func__ << " spanning_blob_map removing empty " << *b + << dendl; + o->extent_map.spanning_blob_map.erase(b->id); + } + } +} + +void BlueStore::_do_write_data( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + uint64_t offset, + uint64_t length, + bufferlist& bl, + WriteContext *wctx) +{ + uint64_t end = offset + length; + bufferlist::iterator p = bl.begin(); + + if (offset / min_alloc_size == (end - 1) / min_alloc_size && + (length != min_alloc_size)) { + // we fall within the same block + _do_write_small(txc, c, o, offset, length, p, wctx); + } else { + uint64_t head_offset, head_length; + uint64_t middle_offset, middle_length; + uint64_t tail_offset, tail_length; + + head_offset = offset; + head_length = p2nphase(offset, min_alloc_size); + + tail_offset = p2align(end, min_alloc_size); + tail_length = p2phase(end, min_alloc_size); + + middle_offset = head_offset + head_length; + middle_length = length - head_length - tail_length; + + if (head_length) { + _do_write_small(txc, c, o, head_offset, head_length, p, wctx); + } + + if (middle_length) { + _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx); + } + + if (tail_length) { + _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx); + } + } +} + +void BlueStore::_choose_write_options( + CollectionRef& c, + OnodeRef o, + uint32_t fadvise_flags, + WriteContext *wctx) +{ + if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { + dout(20) << __func__ << " will do buffered write" << dendl; + wctx->buffered = true; + } else if (cct->_conf->bluestore_default_buffered_write && + (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { + dout(20) << __func__ << " defaulting to buffered write" << dendl; + wctx->buffered = true; + } + + // apply basic csum block size + wctx->csum_order = block_size_order; + + // compression parameters + unsigned alloc_hints = o->onode.alloc_hint_flags; + auto cm = select_option( + "compression_mode", + comp_mode.load(), + [&]() { + string val; + if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) { + return boost::optional<Compressor::CompressionMode>( + Compressor::get_comp_mode_type(val)); + } + return boost::optional<Compressor::CompressionMode>(); + } + ); + + wctx->compress = (cm != Compressor::COMP_NONE) && + ((cm == Compressor::COMP_FORCE) || + (cm == Compressor::COMP_AGGRESSIVE && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) || + (cm == Compressor::COMP_PASSIVE && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE))); + + if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 && + (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) { + + dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl; + + if (o->onode.expected_write_size) { + wctx->csum_order = std::max(min_alloc_size_order, + (uint8_t)ctz(o->onode.expected_write_size)); + } else { + wctx->csum_order = min_alloc_size_order; + } + + if (wctx->compress) { + wctx->target_blob_size = select_option( + 
"compression_max_blob_size", + comp_max_blob_size.load(), + [&]() { + int64_t val; + if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) { + return boost::optional<uint64_t>((uint64_t)val); + } + return boost::optional<uint64_t>(); + } + ); + } + } else { + if (wctx->compress) { + wctx->target_blob_size = select_option( + "compression_min_blob_size", + comp_min_blob_size.load(), + [&]() { + int64_t val; + if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) { + return boost::optional<uint64_t>((uint64_t)val); + } + return boost::optional<uint64_t>(); + } + ); + } + } + + uint64_t max_bsize = max_blob_size.load(); + if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) { + wctx->target_blob_size = max_bsize; + } + + // set the min blob size floor at 2x the min_alloc_size, or else we + // won't be able to allocate a smaller extent for the compressed + // data. + if (wctx->compress && + wctx->target_blob_size < min_alloc_size * 2) { + wctx->target_blob_size = min_alloc_size * 2; + } + + dout(20) << __func__ << " prefer csum_order " << wctx->csum_order + << " target_blob_size 0x" << std::hex << wctx->target_blob_size + << " compress=" << (int)wctx->compress + << " buffered=" << (int)wctx->buffered + << std::dec << dendl; +} + +int BlueStore::_do_gc( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + const WriteContext& wctx, + uint64_t *dirty_start, + uint64_t *dirty_end) +{ + + bool dirty_range_updated = false; + WriteContext wctx_gc; + wctx_gc.fork(wctx); // make a clone for garbage collection + + auto & extents_to_collect = wctx.extents_to_gc; + for (auto it = extents_to_collect.begin(); + it != extents_to_collect.end(); + ++it) { + bufferlist bl; + auto offset = (*it).first; + auto length = (*it).second; + dout(20) << __func__ << " processing " << std::hex + << offset << "~" << length << std::dec + << dendl; + int r = _do_read(c.get(), o, offset, length, bl, 0); + ceph_assert(r == (int)length); + + _do_write_data(txc, c, o, offset, length, bl, &wctx_gc); + logger->inc(l_bluestore_gc_merged, length); + + if (*dirty_start > offset) { + *dirty_start = offset; + dirty_range_updated = true; + } + + if (*dirty_end < offset + length) { + *dirty_end = offset + length; + dirty_range_updated = true; + } + } + if (dirty_range_updated) { + o->extent_map.fault_range(db, *dirty_start, *dirty_end); + } + + dout(30) << __func__ << " alloc write" << dendl; + int r = _do_alloc_write(txc, c, o, &wctx_gc); + if (r < 0) { + derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) + << dendl; + return r; + } + + _wctx_finish(txc, c, o, &wctx_gc); + return 0; +} + +int BlueStore::_do_write( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + uint64_t offset, + uint64_t length, + bufferlist& bl, + uint32_t fadvise_flags) +{ + int r = 0; + + dout(20) << __func__ + << " " << o->oid + << " 0x" << std::hex << offset << "~" << length + << " - have 0x" << o->onode.size + << " (" << std::dec << o->onode.size << ")" + << " bytes" + << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec + << dendl; + _dump_onode<30>(cct, *o); + + if (length == 0) { + return 0; + } + + uint64_t end = offset + length; + + GarbageCollector gc(c->store->cct); + int64_t benefit = 0; + auto dirty_start = offset; + auto dirty_end = end; + + WriteContext wctx; + _choose_write_options(c, o, fadvise_flags, &wctx); + o->extent_map.fault_range(db, offset, length); + _do_write_data(txc, c, o, offset, length, bl, &wctx); + r = _do_alloc_write(txc, c, o, &wctx); + if (r < 
0) { + derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) + << dendl; + goto out; + } + + if (wctx.extents_to_gc.empty() || + wctx.extents_to_gc.range_start() > offset || + wctx.extents_to_gc.range_end() < offset + length) { + benefit = gc.estimate(offset, + length, + o->extent_map, + wctx.old_extents, + min_alloc_size); + } + + // NB: _wctx_finish() will empty old_extents + // so we must do gc estimation before that + _wctx_finish(txc, c, o, &wctx); + if (end > o->onode.size) { + dout(20) << __func__ << " extending size to 0x" << std::hex << end + << std::dec << dendl; + o->onode.size = end; + } + + if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) { + wctx.extents_to_gc.union_of(gc.get_extents_to_collect()); + dout(20) << __func__ + << " perform garbage collection for compressed extents, " + << "expected benefit = " << benefit << " AUs" << dendl; + } + if (!wctx.extents_to_gc.empty()) { + dout(20) << __func__ << " perform garbage collection" << dendl; + + r = _do_gc(txc, c, o, + wctx, + &dirty_start, &dirty_end); + if (r < 0) { + derr << __func__ << " _do_gc failed with " << cpp_strerror(r) + << dendl; + goto out; + } + dout(20)<<__func__<<" gc range is " << std::hex << dirty_start + << "~" << dirty_end - dirty_start << std::dec << dendl; + } + o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start); + o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start); + + r = 0; + + out: + return r; +} + +int BlueStore::_write(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length, + bufferlist& bl, + uint32_t fadvise_flags) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + int r = 0; + if (offset + length >= OBJECT_MAX_SIZE) { + r = -E2BIG; + } else { + _assign_nid(txc, o); + r = _do_write(txc, c, o, offset, length, bl, fadvise_flags); + txc->write_onode(o); + } + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + int r = 0; + if (offset + length >= OBJECT_MAX_SIZE) { + r = -E2BIG; + } else { + _assign_nid(txc, o); + r = _do_zero(txc, c, o, offset, length); + } + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_do_zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + int r = 0; + + _dump_onode<30>(cct, *o); + + WriteContext wctx; + o->extent_map.fault_range(db, offset, length); + o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); + o->extent_map.dirty_range(offset, length); + _wctx_finish(txc, c, o, &wctx); + + if (length > 0 && offset + length > o->onode.size) { + o->onode.size = offset + length; + dout(20) << __func__ << " extending size to " << offset + length + << dendl; + } + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + return r; +} + +void 
BlueStore::_do_truncate( + TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset, + set<SharedBlob*> *maybe_unshared_blobs) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << std::dec << dendl; + + _dump_onode<30>(cct, *o); + + if (offset == o->onode.size) + return; + + if (offset < o->onode.size) { + WriteContext wctx; + uint64_t length = o->onode.size - offset; + o->extent_map.fault_range(db, offset, length); + o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); + o->extent_map.dirty_range(offset, length); + _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs); + + // if we have shards past EOF, ask for a reshard + if (!o->onode.extent_map_shards.empty() && + o->onode.extent_map_shards.back().offset >= offset) { + dout(10) << __func__ << " request reshard past EOF" << dendl; + if (offset) { + o->extent_map.request_reshard(offset - 1, offset + length); + } else { + o->extent_map.request_reshard(0, length); + } + } + } + + o->onode.size = offset; + + txc->write_onode(o); +} + +int BlueStore::_truncate(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << std::dec + << dendl; + int r = 0; + if (offset >= OBJECT_MAX_SIZE) { + r = -E2BIG; + } else { + _do_truncate(txc, c, o, offset); + } + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_do_remove( + TransContext *txc, + CollectionRef& c, + OnodeRef o) +{ + set<SharedBlob*> maybe_unshared_blobs; + bool is_gen = !o->oid.is_no_gen(); + _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr); + if (o->onode.has_omap()) { + o->flush(); + _do_omap_clear(txc, + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP, + o->onode.nid); + } + o->exists = false; + string key; + for (auto &s : o->extent_map.shards) { + dout(20) << __func__ << " removing shard 0x" << std::hex + << s.shard_info->offset << std::dec << dendl; + generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key, + [&](const string& final_key) { + txc->t->rmkey(PREFIX_OBJ, final_key); + } + ); + } + txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size()); + txc->note_removed_object(o); + o->extent_map.clear(); + o->onode = bluestore_onode_t(); + _debug_obj_on_delete(o->oid); + + if (!is_gen || maybe_unshared_blobs.empty()) { + return 0; + } + + // see if we can unshare blobs still referenced by the head + dout(10) << __func__ << " gen and maybe_unshared_blobs " + << maybe_unshared_blobs << dendl; + ghobject_t nogen = o->oid; + nogen.generation = ghobject_t::NO_GEN; + OnodeRef h = c->onode_map.lookup(nogen); + + if (!h || !h->exists) { + return 0; + } + + dout(20) << __func__ << " checking for unshareable blobs on " << h + << " " << h->oid << dendl; + map<SharedBlob*,bluestore_extent_ref_map_t> expect; + for (auto& e : h->extent_map.extent_map) { + const bluestore_blob_t& b = e.blob->get_blob(); + SharedBlob *sb = e.blob->shared_blob.get(); + if (b.is_shared() && + sb->loaded && + maybe_unshared_blobs.count(sb)) { + if (b.is_compressed()) { + expect[sb].get(0, b.get_ondisk_length()); + } else { + b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) { + expect[sb].get(off, len); + return 0; + }); + } + } + } + + vector<SharedBlob*> unshared_blobs; + unshared_blobs.reserve(maybe_unshared_blobs.size()); + for (auto& p : expect) { + dout(20) << " ? 
" << *p.first << " vs " << p.second << dendl; + if (p.first->persistent->ref_map == p.second) { + SharedBlob *sb = p.first; + dout(20) << __func__ << " unsharing " << *sb << dendl; + unshared_blobs.push_back(sb); + txc->unshare_blob(sb); + uint64_t sbid = c->make_blob_unshared(sb); + string key; + get_shared_blob_key(sbid, &key); + txc->t->rmkey(PREFIX_SHARED_BLOB, key); + } + } + + if (unshared_blobs.empty()) { + return 0; + } + + for (auto& e : h->extent_map.extent_map) { + const bluestore_blob_t& b = e.blob->get_blob(); + SharedBlob *sb = e.blob->shared_blob.get(); + if (b.is_shared() && + std::find(unshared_blobs.begin(), unshared_blobs.end(), + sb) != unshared_blobs.end()) { + dout(20) << __func__ << " unsharing " << e << dendl; + bluestore_blob_t& blob = e.blob->dirty_blob(); + blob.clear_flag(bluestore_blob_t::FLAG_SHARED); + h->extent_map.dirty_range(e.logical_offset, 1); + } + } + txc->write_onode(h); + + return 0; +} + +int BlueStore::_remove(TransContext *txc, + CollectionRef& c, + OnodeRef &o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " onode " << o.get() + << " txc "<< txc << dendl; + + auto start_time = mono_clock::now(); + int r = _do_remove(txc, c, o); + log_latency_fn( + __func__, + l_bluestore_remove_lat, + mono_clock::now() - start_time, + cct->_conf->bluestore_log_op_age, + [&](const ceph::timespan& lat) { + ostringstream ostr; + ostr << ", lat = " << timespan_str(lat) + << " cid =" << c->cid + << " oid =" << o->oid; + return ostr.str(); + } + ); + + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_setattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name, + bufferptr& val) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " (" << val.length() << " bytes)" + << dendl; + int r = 0; + if (val.is_partial()) { + auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), + val.length()); + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } else { + auto& b = o->onode.attrs[name.c_str()] = val; + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " (" << val.length() << " bytes)" + << " = " << r << dendl; + return r; +} + +int BlueStore::_setattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const map<string,bufferptr>& aset) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << aset.size() << " keys" + << dendl; + int r = 0; + for (map<string,bufferptr>::const_iterator p = aset.begin(); + p != aset.end(); ++p) { + if (p->second.is_partial()) { + auto& b = o->onode.attrs[p->first.c_str()] = + bufferptr(p->second.c_str(), p->second.length()); + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } else { + auto& b = o->onode.attrs[p->first.c_str()] = p->second; + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } + } + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << aset.size() << " keys" + << " = " << r << dendl; + return r; +} + + +int BlueStore::_rmattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << name << dendl; + int r = 0; + auto it = o->onode.attrs.find(name.c_str()); + if (it == o->onode.attrs.end()) + goto out; + + o->onode.attrs.erase(it); + txc->write_onode(o); + + out: + dout(10) << __func__ << " 
" << c->cid << " " << o->oid + << " " << name << " = " << r << dendl; + return r; +} + +int BlueStore::_rmattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + + if (o->onode.attrs.empty()) + goto out; + + o->onode.attrs.clear(); + txc->write_onode(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix, + uint64_t id) +{ + string prefix, tail; + get_omap_header(id, &prefix); + get_omap_tail(id, &tail); + txc->t->rm_range_keys(omap_prefix, prefix, tail); + txc->t->rmkey(omap_prefix, tail); + dout(20) << __func__ << " remove range start: " + << pretty_binary_string(prefix) << " end: " + << pretty_binary_string(tail) << dendl; +} + +int BlueStore::_omap_clear(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + if (o->onode.has_omap()) { + o->flush(); + _do_omap_clear(txc, + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP, + o->onode.nid); + o->onode.clear_omap_flag(); + txc->write_onode(o); + } + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_setkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist &bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r; + auto p = bl.cbegin(); + __u32 num; + if (!o->onode.has_omap()) { + o->onode.set_omap_flag(); + if (o->oid.is_pgmeta()) { + o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; + } + txc->write_onode(o); + + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string key_tail; + bufferlist tail; + get_omap_tail(o->onode.nid, &key_tail); + txc->t->set(prefix, key_tail, tail); + } else { + txc->note_modified_object(o); + } + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string final_key; + _key_encode_u64(o->onode.nid, &final_key); + final_key.push_back('.'); + decode(num, p); + while (num--) { + string key; + bufferlist value; + decode(key, p); + decode(value, p); + final_key.resize(9); // keep prefix + final_key += key; + dout(20) << __func__ << " " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->set(prefix, final_key, value); + } + r = 0; + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_setheader(TransContext *txc, + CollectionRef& c, + OnodeRef &o, + bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r; + string key; + if (!o->onode.has_omap()) { + o->onode.set_omap_flag(); + if (o->oid.is_pgmeta()) { + o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; + } + txc->write_onode(o); + + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string key_tail; + bufferlist tail; + get_omap_tail(o->onode.nid, &key_tail); + txc->t->set(prefix, key_tail, tail); + } else { + txc->note_modified_object(o); + } + const string& prefix = + o->onode.is_pgmeta_omap() ? 
PREFIX_PGMETA_OMAP : PREFIX_OMAP; + get_omap_header(o->onode.nid, &key); + txc->t->set(prefix, key, bl); + r = 0; + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_rmkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + auto p = bl.cbegin(); + __u32 num; + string final_key; + + if (!o->onode.has_omap()) { + goto out; + } + { + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + _key_encode_u64(o->onode.nid, &final_key); + final_key.push_back('.'); + decode(num, p); + while (num--) { + string key; + decode(key, p); + final_key.resize(9); // keep prefix + final_key += key; + dout(20) << __func__ << " rm " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->rmkey(prefix, final_key); + } + } + txc->note_modified_object(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_rmkey_range(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& first, const string& last) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + string key_first, key_last; + int r = 0; + if (!o->onode.has_omap()) { + goto out; + } + { + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + o->flush(); + get_omap_key(o->onode.nid, first, &key_first); + get_omap_key(o->onode.nid, last, &key_last); + txc->t->rm_range_keys(prefix, key_first, key_last); + dout(20) << __func__ << " remove range start: " + << pretty_binary_string(key_first) << " end: " + << pretty_binary_string(key_last) << dendl; + } + txc->note_modified_object(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_set_alloc_hint( + TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " flags " << ceph_osd_alloc_hint_flag_string(flags) + << dendl; + int r = 0; + o->onode.expected_object_size = expected_object_size; + o->onode.expected_write_size = expected_write_size; + o->onode.alloc_hint_flags = flags; + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " flags " << ceph_osd_alloc_hint_flag_string(flags) + << " = " << r << dendl; + return r; +} + +int BlueStore::_clone(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << dendl; + int r = 0; + if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) { + derr << __func__ << " mismatched hash on " << oldo->oid + << " and " << newo->oid << dendl; + return -EINVAL; + } + + _assign_nid(txc, newo); + + // clone data + oldo->flush(); + _do_truncate(txc, c, newo, 0); + if (cct->_conf->bluestore_clone_cow) { + _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0); + } else { + bufferlist bl; + r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + } + + // clone attrs + newo->onode.attrs = 
oldo->onode.attrs; + + // clone omap + if (newo->onode.has_omap()) { + dout(20) << __func__ << " clearing old omap data" << dendl; + newo->flush(); + _do_omap_clear(txc, + newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP + : PREFIX_OMAP, + newo->onode.nid); + newo->onode.clear_omap_flag(); + } + if (oldo->onode.has_omap()) { + dout(20) << __func__ << " copying omap data" << dendl; + newo->onode.set_omap_flag(); + if (newo->oid.is_pgmeta()) { + newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; + } + const string& prefix = + newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + KeyValueDB::Iterator it = db->get_iterator(prefix); + string head, tail; + get_omap_header(oldo->onode.nid, &head); + get_omap_tail(oldo->onode.nid, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + dout(30) << __func__ << " got header/data " + << pretty_binary_string(it->key()) << dendl; + string key; + rewrite_omap_key(newo->onode.nid, it->key(), &key); + txc->t->set(prefix, key, it->value()); + } + it->next(); + } + string new_tail; + bufferlist new_tail_value; + get_omap_tail(newo->onode.nid, &new_tail); + txc->t->set(prefix, new_tail, new_tail_value); + } + + txc->write_onode(newo); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_do_clone_range( + TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, + uint64_t length, + uint64_t dstoff) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid + << " 0x" << std::hex << srcoff << "~" << length << " -> " + << " 0x" << dstoff << "~" << length << std::dec << dendl; + oldo->extent_map.fault_range(db, srcoff, length); + newo->extent_map.fault_range(db, dstoff, length); + _dump_onode<30>(cct, *oldo); + _dump_onode<30>(cct, *newo); + + oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff); + _dump_onode<30>(cct, *oldo); + _dump_onode<30>(cct, *newo); + return 0; +} + +int BlueStore::_clone_range(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, uint64_t length, uint64_t dstoff) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " from 0x" << std::hex << srcoff << "~" << length + << " to offset 0x" << dstoff << std::dec << dendl; + int r = 0; + + if (srcoff + length >= OBJECT_MAX_SIZE || + dstoff + length >= OBJECT_MAX_SIZE) { + r = -E2BIG; + goto out; + } + if (srcoff + length > oldo->onode.size) { + r = -EINVAL; + goto out; + } + + _assign_nid(txc, newo); + + if (length > 0) { + if (cct->_conf->bluestore_clone_cow) { + _do_zero(txc, c, newo, dstoff, length); + _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff); + } else { + bufferlist bl; + r = _do_read(c.get(), oldo, srcoff, length, bl, 0); + if (r < 0) + goto out; + r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0); + if (r < 0) + goto out; + } + } + + txc->write_onode(newo); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " from 0x" << std::hex << srcoff << "~" << length + << " to offset 0x" << dstoff << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_rename(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + const ghobject_t& new_oid) +{ + dout(15) << __func__ << " " << c->cid << " " << 
oldo->oid << " -> " + << new_oid << dendl; + int r; + ghobject_t old_oid = oldo->oid; + mempool::bluestore_cache_meta::string new_okey; + + if (newo) { + if (newo->exists) { + r = -EEXIST; + goto out; + } + ceph_assert(txc->onodes.count(newo) == 0); + } + + txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size()); + + // rewrite shards + { + oldo->extent_map.fault_range(db, 0, oldo->onode.size); + get_object_key(cct, new_oid, &new_okey); + string key; + for (auto &s : oldo->extent_map.shards) { + generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key, + [&](const string& final_key) { + txc->t->rmkey(PREFIX_OBJ, final_key); + } + ); + s.dirty = true; + } + } + + newo = oldo; + txc->write_onode(newo); + + // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty + // Onode in the old slot + c->onode_map.rename(oldo, old_oid, new_oid, new_okey); + r = 0; + + // hold a ref to new Onode in old name position, to ensure we don't drop + // it from the cache before this txc commits (or else someone may come along + // and read newo's metadata via the old name). + txc->note_modified_object(oldo); + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; +} + +// collections + +int BlueStore::_create_collection( + TransContext *txc, + const coll_t &cid, + unsigned bits, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << " bits " << bits << dendl; + int r; + bufferlist bl; + + { + RWLock::WLocker l(coll_lock); + if (*c) { + r = -EEXIST; + goto out; + } + auto p = new_coll_map.find(cid); + ceph_assert(p != new_coll_map.end()); + *c = p->second; + (*c)->cnode.bits = bits; + coll_map[cid] = *c; + new_coll_map.erase(p); + } + encode((*c)->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(cid), bl); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl; + return r; +} + +int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << dendl; + int r; + + (*c)->flush_all_but_last(); + { + RWLock::WLocker l(coll_lock); + if (!*c) { + r = -ENOENT; + goto out; + } + size_t nonexistent_count = 0; + ceph_assert((*c)->exists); + if ((*c)->onode_map.map_any([&](OnodeRef o) { + if (o->exists) { + dout(1) << __func__ << " " << o->oid << " " << o + << " exists in onode_map" << dendl; + return true; + } + ++nonexistent_count; + return false; + })) { + r = -ENOTEMPTY; + goto out; + } + + vector<ghobject_t> ls; + ghobject_t next; + // Enumerate onodes in db, up to nonexistent_count + 1 + // then check if all of them are marked as non-existent. + // Bypass the check if (next != ghobject_t::get_max()) + r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(), + nonexistent_count + 1, false, &ls, &next); + if (r >= 0) { + // If true mean collecton has more objects than nonexistent_count, + // so bypass check. + bool exists = (!next.is_max()); + for (auto it = ls.begin(); !exists && it < ls.end(); ++it) { + dout(10) << __func__ << " oid " << *it << dendl; + auto onode = (*c)->onode_map.lookup(*it); + exists = !onode || onode->exists; + if (exists) { + dout(1) << __func__ << " " << *it + << " exists in db, " + << (!onode ? 
"not present in ram" : "present in ram") + << dendl; + } + } + if (!exists) { + _do_remove_collection(txc, c); + r = 0; + } else { + dout(10) << __func__ << " " << cid + << " is non-empty" << dendl; + r = -ENOTEMPTY; + } + } + } + + out: + dout(10) << __func__ << " " << cid << " = " << r << dendl; + return r; +} + +void BlueStore::_do_remove_collection(TransContext *txc, + CollectionRef *c) +{ + coll_map.erase((*c)->cid); + txc->removed_collections.push_back(*c); + (*c)->exists = false; + _osr_register_zombie((*c)->osr.get()); + txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid)); + c->reset(); +} + +int BlueStore::_split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem) +{ + dout(15) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << dendl; + RWLock::WLocker l(c->lock); + RWLock::WLocker l2(d->lock); + int r; + + // flush all previous deferred writes on this sequencer. this is a bit + // heavyweight, but we need to make sure all deferred writes complete + // before we split as the new collection's sequencer may need to order + // this after those writes, and we don't bother with the complexity of + // moving those TransContexts over to the new osr. + _osr_drain_preceding(txc); + + // move any cached items (onodes and referenced shared blobs) that will + // belong to the child collection post-split. leave everything else behind. + // this may include things that don't strictly belong to the now-smaller + // parent split, but the OSD will always send us a split for every new + // child. + + spg_t pgid, dest_pgid; + bool is_pg = c->cid.is_pg(&pgid); + ceph_assert(is_pg); + is_pg = d->cid.is_pg(&dest_pgid); + ceph_assert(is_pg); + + // the destination should initially be empty. + ceph_assert(d->onode_map.empty()); + ceph_assert(d->shared_blob_set.empty()); + ceph_assert(d->cnode.bits == bits); + + c->split_cache(d.get()); + + // adjust bits. note that this will be redundant for all but the first + // split call for this parent (first child). + c->cnode.bits = bits; + ceph_assert(d->cnode.bits == bits); + r = 0; + + bufferlist bl; + encode(c->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(c->cid), bl); + + dout(10) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +int BlueStore::_merge_collection( + TransContext *txc, + CollectionRef *c, + CollectionRef& d, + unsigned bits) +{ + dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid + << " bits " << bits << dendl; + RWLock::WLocker l((*c)->lock); + RWLock::WLocker l2(d->lock); + int r; + + coll_t cid = (*c)->cid; + + // flush all previous deferred writes on the source collection to ensure + // that all deferred writes complete before we merge as the target collection's + // sequencer may need to order new ops after those writes. + + _osr_drain((*c)->osr.get()); + + // move any cached items (onodes and referenced shared blobs) that will + // belong to the child collection post-split. leave everything else behind. + // this may include things that don't strictly belong to the now-smaller + // parent split, but the OSD will always send us a split for every new + // child. + + spg_t pgid, dest_pgid; + bool is_pg = cid.is_pg(&pgid); + ceph_assert(is_pg); + is_pg = d->cid.is_pg(&dest_pgid); + ceph_assert(is_pg); + + // adjust bits. note that this will be redundant for all but the first + // merge call for the parent/target. 
+ d->cnode.bits = bits; + + // behavior depends on target (d) bits, so this after that is updated. + (*c)->split_cache(d.get()); + + // remove source collection + { + RWLock::WLocker l3(coll_lock); + _do_remove_collection(txc, c); + } + + r = 0; + + bufferlist bl; + encode(d->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(d->cid), bl); + + dout(10) << __func__ << " " << cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +void BlueStore::log_latency( + const char* name, + int idx, + const ceph::timespan& l, + double lat_threshold, + const char* info) const +{ + logger->tinc(idx, l); + if (lat_threshold > 0.0 && + l >= make_timespan(lat_threshold)) { + dout(0) << __func__ << " slow operation observed for " << name + << ", latency = " << l + << info + << dendl; + } +} + +void BlueStore::log_latency_fn( + const char* name, + int idx, + const ceph::timespan& l, + double lat_threshold, + std::function<string (const ceph::timespan& lat)> fn) const +{ + logger->tinc(idx, l); + if (lat_threshold > 0.0 && + l >= make_timespan(lat_threshold)) { + dout(0) << __func__ << " slow operation observed for " << name + << ", latency = " << l + << fn(l) + << dendl; + } +} + + +// DB key value Histogram +#define KEY_SLAB 32 +#define VALUE_SLAB 64 + +const string prefix_onode = "o"; +const string prefix_onode_shard = "x"; +const string prefix_other = "Z"; + +int BlueStore::DBHistogram::get_key_slab(size_t sz) +{ + return (sz/KEY_SLAB); +} + +string BlueStore::DBHistogram::get_key_slab_to_range(int slab) +{ + int lower_bound = slab * KEY_SLAB; + int upper_bound = (slab + 1) * KEY_SLAB; + string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")"; + return ret; +} + +int BlueStore::DBHistogram::get_value_slab(size_t sz) +{ + return (sz/VALUE_SLAB); +} + +string BlueStore::DBHistogram::get_value_slab_to_range(int slab) +{ + int lower_bound = slab * VALUE_SLAB; + int upper_bound = (slab + 1) * VALUE_SLAB; + string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")"; + return ret; +} + +void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist, + const string &prefix, size_t key_size, size_t value_size) +{ + uint32_t key_slab = get_key_slab(key_size); + uint32_t value_slab = get_value_slab(value_size); + key_hist[prefix][key_slab].count++; + key_hist[prefix][key_slab].max_len = + std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len); + key_hist[prefix][key_slab].val_map[value_slab].count++; + key_hist[prefix][key_slab].val_map[value_slab].max_len = + std::max<size_t>(value_size, + key_hist[prefix][key_slab].val_map[value_slab].max_len); +} + +void BlueStore::DBHistogram::dump(Formatter *f) +{ + f->open_object_section("rocksdb_value_distribution"); + for (auto i : value_hist) { + f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second); + } + f->close_section(); + + f->open_object_section("rocksdb_key_value_histogram"); + for (auto i : key_hist) { + f->dump_string("prefix", i.first); + f->open_object_section("key_hist"); + for ( auto k : i.second) { + f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count); + f->dump_unsigned("max_len", k.second.max_len); + f->open_object_section("value_hist"); + for ( auto j : k.second.val_map) { + f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count); + f->dump_unsigned("max_len", j.second.max_len); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +//Itrerates through the 
db and collects the stats +void BlueStore::generate_db_histogram(Formatter *f) +{ + //globals + uint64_t num_onodes = 0; + uint64_t num_shards = 0; + uint64_t num_super = 0; + uint64_t num_coll = 0; + uint64_t num_omap = 0; + uint64_t num_pgmeta_omap = 0; + uint64_t num_deferred = 0; + uint64_t num_alloc = 0; + uint64_t num_stat = 0; + uint64_t num_others = 0; + uint64_t num_shared_shards = 0; + size_t max_key_size =0, max_value_size = 0; + uint64_t total_key_size = 0, total_value_size = 0; + size_t key_size = 0, value_size = 0; + DBHistogram hist; + + auto start = coarse_mono_clock::now(); + + KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator(); + iter->seek_to_first(); + while (iter->valid()) { + dout(30) << __func__ << " Key: " << iter->key() << dendl; + key_size = iter->key_size(); + value_size = iter->value_size(); + hist.value_hist[hist.get_value_slab(value_size)]++; + max_key_size = std::max(max_key_size, key_size); + max_value_size = std::max(max_value_size, value_size); + total_key_size += key_size; + total_value_size += value_size; + + pair<string,string> key(iter->raw_key()); + + if (key.first == PREFIX_SUPER) { + hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size); + num_super++; + } else if (key.first == PREFIX_STAT) { + hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size); + num_stat++; + } else if (key.first == PREFIX_COLL) { + hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size); + num_coll++; + } else if (key.first == PREFIX_OBJ) { + if (key.second.back() == ONODE_KEY_SUFFIX) { + hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size); + num_onodes++; + } else { + hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size); + num_shards++; + } + } else if (key.first == PREFIX_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size); + num_omap++; + } else if (key.first == PREFIX_PGMETA_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size); + num_pgmeta_omap++; + } else if (key.first == PREFIX_DEFERRED) { + hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size); + num_deferred++; + } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size); + num_alloc++; + } else if (key.first == PREFIX_SHARED_BLOB) { + hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size); + num_shared_shards++; + } else { + hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size); + num_others++; + } + iter->next(); + } + + ceph::timespan duration = coarse_mono_clock::now() - start; + f->open_object_section("rocksdb_key_value_stats"); + f->dump_unsigned("num_onodes", num_onodes); + f->dump_unsigned("num_shards", num_shards); + f->dump_unsigned("num_super", num_super); + f->dump_unsigned("num_coll", num_coll); + f->dump_unsigned("num_omap", num_omap); + f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap); + f->dump_unsigned("num_deferred", num_deferred); + f->dump_unsigned("num_alloc", num_alloc); + f->dump_unsigned("num_stat", num_stat); + f->dump_unsigned("num_shared_shards", num_shared_shards); + f->dump_unsigned("num_others", num_others); + f->dump_unsigned("max_key_size", max_key_size); + f->dump_unsigned("max_value_size", max_value_size); + f->dump_unsigned("total_key_size", total_key_size); + f->dump_unsigned("total_value_size", total_value_size); + 
f->close_section(); + + hist.dump(f); + + dout(20) << __func__ << " finished in " << duration << " seconds" << dendl; + +} + +void BlueStore::_flush_cache() +{ + dout(10) << __func__ << dendl; + for (auto i : cache_shards) { + i->trim_all(); + ceph_assert(i->empty()); + } + for (auto& p : coll_map) { + if (!p.second->onode_map.empty()) { + derr << __func__ << " stray onodes on " << p.first << dendl; + p.second->onode_map.dump<0>(cct); + } + if (!p.second->shared_blob_set.empty()) { + derr << __func__ << " stray shared blobs on " << p.first << dendl; + p.second->shared_blob_set.dump<0>(cct); + } + ceph_assert(p.second->onode_map.empty()); + ceph_assert(p.second->shared_blob_set.empty()); + } + coll_map.clear(); +} + +// For external caller. +// We use a best-effort policy instead, e.g., +// we don't care if there are still some pinned onodes/data in the cache +// after this command is completed. +int BlueStore::flush_cache(ostream *os) +{ + dout(10) << __func__ << dendl; + for (auto i : cache_shards) { + i->trim_all(); + } + + return 0; +} + +void BlueStore::_apply_padding(uint64_t head_pad, + uint64_t tail_pad, + bufferlist& padded) +{ + if (head_pad) { + padded.prepend_zero(head_pad); + } + if (tail_pad) { + padded.append_zero(tail_pad); + } + if (head_pad || tail_pad) { + dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad + << " tail 0x" << tail_pad << std::dec << dendl; + logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad); + } +} + +void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn) +{ + // finalize extent_map shards + o->extent_map.update(txn, false); + if (o->extent_map.needs_reshard()) { + o->extent_map.reshard(db, txn); + o->extent_map.update(txn, true); + if (o->extent_map.needs_reshard()) { + dout(20) << __func__ << " warning: still wants reshard, check options?" 
+ << dendl; + o->extent_map.clear_needs_reshard(); + } + logger->inc(l_bluestore_onode_reshard); + } + + // bound encode + size_t bound = 0; + denc(o->onode, bound); + o->extent_map.bound_encode_spanning_blobs(bound); + if (o->onode.extent_map_shards.empty()) { + denc(o->extent_map.inline_bl, bound); + } + + // encode + bufferlist bl; + unsigned onode_part, blob_part, extent_part; + { + auto p = bl.get_contiguous_appender(bound, true); + denc(o->onode, p); + onode_part = p.get_logical_offset(); + o->extent_map.encode_spanning_blobs(p); + blob_part = p.get_logical_offset() - onode_part; + if (o->onode.extent_map_shards.empty()) { + denc(o->extent_map.inline_bl, p); + } + extent_part = p.get_logical_offset() - onode_part - blob_part; + } + + dout(20) << __func__ << " onode " << o->oid << " is " << bl.length() + << " (" << onode_part << " bytes onode + " + << blob_part << " bytes spanning blobs + " + << extent_part << " bytes inline extents)" + << dendl; + + + txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl); +} + +void BlueStore::_log_alerts(osd_alert_list_t& alerts) +{ + std::lock_guard l(qlock); + + if (!disk_size_mismatch_alert.empty()) { + alerts.emplace( + "BLUESTORE_DISK_SIZE_MISMATCH", + disk_size_mismatch_alert); + } + if (!legacy_statfs_alert.empty()) { + alerts.emplace( + "BLUESTORE_LEGACY_STATFS", + legacy_statfs_alert); + } + if (!spillover_alert.empty() && + cct->_conf->bluestore_warn_on_bluefs_spillover) { + alerts.emplace( + "BLUEFS_SPILLOVER", + spillover_alert); + } + string s0(failed_cmode); + + if (!failed_compressors.empty()) { + if (!s0.empty()) { + s0 += ", "; + } + s0 += "unable to load:"; + bool first = true; + for (auto& s : failed_compressors) { + if (first) { + first = false; + } else { + s0 += ", "; + } + s0 += s; + } + alerts.emplace( + "BLUESTORE_NO_COMPRESSION", + s0); + } +} + +// =========================================== +// BlueStoreRepairer + +size_t BlueStoreRepairer::StoreSpaceTracker::filter_out( + const interval_set<uint64_t>& extents) +{ + ceph_assert(granularity); // initialized + // can't call for the second time + ceph_assert(!was_filtered_out); + ceph_assert(collections_bfs.size() == objects_bfs.size()); + + uint64_t prev_pos = 0; + uint64_t npos = collections_bfs.size(); + + bloom_vector collections_reduced; + bloom_vector objects_reduced; + + for (auto e : extents) { + if (e.second == 0) { + continue; + } + uint64_t pos = max(e.first / granularity, prev_pos); + uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity; + while (pos != npos && pos < end_pos) { + ceph_assert( collections_bfs[pos].element_count() == + objects_bfs[pos].element_count()); + if (collections_bfs[pos].element_count()) { + collections_reduced.push_back(std::move(collections_bfs[pos])); + objects_reduced.push_back(std::move(objects_bfs[pos])); + } + ++pos; + } + prev_pos = end_pos; + } + collections_reduced.swap(collections_bfs); + objects_reduced.swap(objects_bfs); + was_filtered_out = true; + return collections_bfs.size(); +} + +bool BlueStoreRepairer::remove_key(KeyValueDB *db, + const string& prefix, + const string& key) +{ + if (!remove_key_txn) { + remove_key_txn = db->get_transaction(); + } + ++to_repair_cnt; + remove_key_txn->rmkey(prefix, key); + + return true; +} + +bool BlueStoreRepairer::fix_shared_blob( + KeyValueDB *db, + uint64_t sbid, + const bufferlist* bl) +{ + KeyValueDB::Transaction txn; + if (fix_misreferences_txn) { // reuse this txn + txn = fix_misreferences_txn; + } else { + if (!fix_shared_blob_txn) { + fix_shared_blob_txn = 
db->get_transaction(); + } + txn = fix_shared_blob_txn; + } + string key; + get_shared_blob_key(sbid, &key); + + ++to_repair_cnt; + if (bl) { + txn->set(PREFIX_SHARED_BLOB, key, *bl); + } else { + txn->rmkey(PREFIX_SHARED_BLOB, key); + } + return true; +} + +bool BlueStoreRepairer::fix_statfs(KeyValueDB *db, + const string& key, + const store_statfs_t& new_statfs) +{ + if (!fix_statfs_txn) { + fix_statfs_txn = db->get_transaction(); + } + BlueStore::volatile_statfs vstatfs; + vstatfs = new_statfs; + bufferlist bl; + vstatfs.encode(bl); + ++to_repair_cnt; + fix_statfs_txn->set(PREFIX_STAT, key, bl); + return true; +} + +bool BlueStoreRepairer::fix_leaked(KeyValueDB *db, + FreelistManager* fm, + uint64_t offset, uint64_t len) +{ + if (!fix_fm_leaked_txn) { + fix_fm_leaked_txn = db->get_transaction(); + } + ++to_repair_cnt; + fm->release(offset, len, fix_fm_leaked_txn); + return true; +} +bool BlueStoreRepairer::fix_false_free(KeyValueDB *db, + FreelistManager* fm, + uint64_t offset, uint64_t len) +{ + if (!fix_fm_false_free_txn) { + fix_fm_false_free_txn = db->get_transaction(); + } + ++to_repair_cnt; + fm->allocate(offset, len, fix_fm_false_free_txn); + return true; +} + +bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag) +{ + // this is just a stub to count num of repairs properly, + // actual repair happens in BlueStore::_close_db_and_around() + // while doing _sync_bluefs_and_fm + ++out_of_sync_flag; + ++to_repair_cnt; + return true; +} +KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db) +{ + if (!fix_onode_txn) { + fix_onode_txn = db->get_transaction(); + } + ++to_repair_cnt; + return fix_onode_txn; +} + +bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db) +{ + if (misreferenced_extents.size()) { + size_t n = space_usage_tracker.filter_out(misreferenced_extents); + ceph_assert(n > 0); + if (!fix_misreferences_txn) { + fix_misreferences_txn = db->get_transaction(); + } + return true; + } + return false; +} + +unsigned BlueStoreRepairer::apply(KeyValueDB* db) +{ + if (fix_fm_leaked_txn) { + db->submit_transaction_sync(fix_fm_leaked_txn); + fix_fm_leaked_txn = nullptr; + } + if (fix_fm_false_free_txn) { + db->submit_transaction_sync(fix_fm_false_free_txn); + fix_fm_false_free_txn = nullptr; + } + if (remove_key_txn) { + db->submit_transaction_sync(remove_key_txn); + remove_key_txn = nullptr; + } + if (fix_misreferences_txn) { + db->submit_transaction_sync(fix_misreferences_txn); + fix_misreferences_txn = nullptr; + } + if (fix_onode_txn) { + db->submit_transaction_sync(fix_onode_txn); + fix_onode_txn = nullptr; + } + if (fix_shared_blob_txn) { + db->submit_transaction_sync(fix_shared_blob_txn); + fix_shared_blob_txn = nullptr; + } + + if (fix_statfs_txn) { + db->submit_transaction_sync(fix_statfs_txn); + fix_statfs_txn = nullptr; + } + unsigned repaired = to_repair_cnt; + to_repair_cnt = 0; + return repaired; +} + +// ======================================================= +// RocksDBBlueFSVolumeSelector + +uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) { + ceph_assert(h != nullptr); + uint64_t hint = reinterpret_cast<uint64_t>(h); + uint8_t res; + switch (hint) { + case LEVEL_SLOW: + res = BlueFS::BDEV_SLOW; + if (db_avail4slow > 0) { + // considering statically available db space vs. 
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data + // - observed maximum spillovers + uint64_t max_db_use = 0; // max db usage we potentially observed + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST); + // this could go to db hence using it in the estimation + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST); + + auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; + uint64_t avail = min( + db_avail4slow, + max_db_use < db_total ? db_total - max_db_use : 0); + + // considering current DB dev usage for SLOW data + if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) { + res = BlueFS::BDEV_DB; + } + } + break; + case LEVEL_WAL: + res = BlueFS::BDEV_WAL; + break; + case LEVEL_DB: + default: + res = BlueFS::BDEV_DB; + break; + } + return res; +} + +void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const +{ + res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]); + res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]); +} + +void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const { + uint8_t res = LEVEL_DB; + if (dirname.length() > 5) { + // the "db.slow" and "db.wal" directory names are hard-coded at + // match up with bluestore. the slow device is always the second + // one (when a dedicated block.db device is present and used at + // bdev 0). the wal device is always last. + if (boost::algorithm::ends_with(dirname, ".slow")) { + res = LEVEL_SLOW; + } + else if (boost::algorithm::ends_with(dirname, ".wal")) { + res = LEVEL_WAL; + } + } + return reinterpret_cast<void*>(res); +} + +void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { + auto max_x = per_level_per_dev_usage.get_max_x(); + auto max_y = per_level_per_dev_usage.get_max_y(); + sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST] + << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST] + << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST] + << ", db_avail:" << db_avail4slow << std::endl + << "Usage matrix:" << std::endl; + constexpr std::array<const char*, 7> names{ { + "DEV/LEV", + "WAL", + "DB", + "SLOW", + "*", + "*", + "REAL" + } }; + const size_t width = 12; + for (size_t i = 0; i < names.size(); ++i) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << names[i]; + } + sout << std::endl; + for (size_t l = 0; l < max_y; l++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + switch (l + LEVEL_FIRST) { + case LEVEL_WAL: + sout << "WAL"; break; + case LEVEL_DB: + sout << "DB"; break; + case LEVEL_SLOW: + sout << "SLOW"; break; + case LEVEL_MAX: + sout << "TOTALS"; break; + } + for (size_t d = 0; d < max_x - 1; d++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l))); + } + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_usage.at(max_x - 1, l))) + << std::endl; + } + ceph_assert(max_x == per_level_per_dev_max.get_max_x()); + ceph_assert(max_y == per_level_per_dev_max.get_max_y()); + sout << "MAXIMUMS:" << std::endl; + for (size_t l = 0; l < max_y; l++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + switch (l + LEVEL_FIRST) { + case LEVEL_WAL: + sout << "WAL"; break; + case LEVEL_DB: + sout << "DB"; 
break; + case LEVEL_SLOW: + sout << "SLOW"; break; + case LEVEL_MAX: + sout << "TOTALS"; break; + } + for (size_t d = 0; d < max_x - 1; d++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l))); + } + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l))); + if (l < max_y - 1) { + sout << std::endl; + } + } +} + +// ======================================================= diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h new file mode 100644 index 00000000..159e9296 --- /dev/null +++ b/src/os/bluestore/BlueStore.h @@ -0,0 +1,3602 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_BLUESTORE_H +#define CEPH_OSD_BLUESTORE_H + +#include "acconfig.h" + +#include <unistd.h> + +#include <atomic> +#include <mutex> +#include <condition_variable> + +#include <boost/intrusive/list.hpp> +#include <boost/intrusive/unordered_set.hpp> +#include <boost/intrusive/set.hpp> +#include <boost/functional/hash.hpp> +#include <boost/dynamic_bitset.hpp> + +#include "include/cpp-btree/btree_set.h" + +#include "include/ceph_assert.h" +#include "include/unordered_map.h" +#include "include/mempool.h" +#include "common/bloom_filter.hpp" +#include "common/Finisher.h" +#include "common/Throttle.h" +#include "common/perf_counters.h" +#include "common/PriorityCache.h" +#include "compressor/Compressor.h" +#include "os/ObjectStore.h" + +#include "bluestore_types.h" +#include "BlockDevice.h" +#include "BlueFS.h" +#include "common/EventTrace.h" + +class Allocator; +class FreelistManager; +class BlueStoreRepairer; + +//#define DEBUG_CACHE +//#define DEBUG_DEFERRED + + + +// constants for Buffer::optimize() +#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N + + +enum { + l_bluestore_first = 732430, + l_bluestore_kv_flush_lat, + l_bluestore_kv_commit_lat, + l_bluestore_kv_sync_lat, + l_bluestore_kv_final_lat, + l_bluestore_state_prepare_lat, + l_bluestore_state_aio_wait_lat, + l_bluestore_state_io_done_lat, + l_bluestore_state_kv_queued_lat, + l_bluestore_state_kv_committing_lat, + l_bluestore_state_kv_done_lat, + l_bluestore_state_deferred_queued_lat, + l_bluestore_state_deferred_aio_wait_lat, + l_bluestore_state_deferred_cleanup_lat, + l_bluestore_state_finishing_lat, + l_bluestore_state_done_lat, + l_bluestore_throttle_lat, + l_bluestore_submit_lat, + l_bluestore_commit_lat, + l_bluestore_read_lat, + l_bluestore_read_onode_meta_lat, + l_bluestore_read_wait_aio_lat, + l_bluestore_compress_lat, + l_bluestore_decompress_lat, + l_bluestore_csum_lat, + l_bluestore_compress_success_count, + l_bluestore_compress_rejected_count, + l_bluestore_write_pad_bytes, + l_bluestore_deferred_write_ops, + l_bluestore_deferred_write_bytes, + l_bluestore_write_penalty_read_ops, + l_bluestore_allocated, + l_bluestore_stored, + l_bluestore_compressed, + l_bluestore_compressed_allocated, + l_bluestore_compressed_original, + l_bluestore_onodes, + l_bluestore_onode_hits, + l_bluestore_onode_misses, + l_bluestore_onode_shard_hits, + l_bluestore_onode_shard_misses, + 
l_bluestore_extents, + l_bluestore_blobs, + l_bluestore_buffers, + l_bluestore_buffer_bytes, + l_bluestore_buffer_hit_bytes, + l_bluestore_buffer_miss_bytes, + l_bluestore_write_big, + l_bluestore_write_big_bytes, + l_bluestore_write_big_blobs, + l_bluestore_write_small, + l_bluestore_write_small_bytes, + l_bluestore_write_small_unused, + l_bluestore_write_small_deferred, + l_bluestore_write_small_pre_read, + l_bluestore_write_small_new, + l_bluestore_txc, + l_bluestore_onode_reshard, + l_bluestore_blob_split, + l_bluestore_extent_compress, + l_bluestore_gc_merged, + l_bluestore_read_eio, + l_bluestore_reads_with_retries, + l_bluestore_fragmentation, + l_bluestore_omap_seek_to_first_lat, + l_bluestore_omap_upper_bound_lat, + l_bluestore_omap_lower_bound_lat, + l_bluestore_omap_next_lat, + l_bluestore_omap_get_keys_lat, + l_bluestore_omap_get_values_lat, + l_bluestore_clist_lat, + l_bluestore_remove_lat, + l_bluestore_last +}; + +#define META_POOL_ID ((uint64_t)-1ull) + +class BlueStore : public ObjectStore, + public BlueFSDeviceExpander, + public md_config_obs_t { + // ----------------------------------------------------- + // types +public: + // config observer + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) override; + + //handler for discard event + void handle_discard(interval_set<uint64_t>& to_release); + + void _set_csum(); + void _set_compression(); + void _set_throttle_params(); + int _set_cache_sizes(); + + class TransContext; + + typedef map<uint64_t, bufferlist> ready_regions_t; + + + struct BufferSpace; + struct Collection; + typedef boost::intrusive_ptr<Collection> CollectionRef; + + struct AioContext { + virtual void aio_finish(BlueStore *store) = 0; + virtual ~AioContext() {} + }; + + /// cached buffer + struct Buffer { + MEMPOOL_CLASS_HELPERS(); + + enum { + STATE_EMPTY, ///< empty buffer -- used for cache history + STATE_CLEAN, ///< clean data that is up to date + STATE_WRITING, ///< data that is being written (io not yet complete) + }; + static const char *get_state_name(int s) { + switch (s) { + case STATE_EMPTY: return "empty"; + case STATE_CLEAN: return "clean"; + case STATE_WRITING: return "writing"; + default: return "???"; + } + } + enum { + FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN) + // NOTE: fix operator<< when you define a second flag + }; + static const char *get_flag_name(int s) { + switch (s) { + case FLAG_NOCACHE: return "nocache"; + default: return "???"; + } + } + + BufferSpace *space; + uint16_t state; ///< STATE_* + uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl + uint32_t flags; ///< FLAG_* + uint64_t seq; + uint32_t offset, length; + bufferlist data; + + boost::intrusive::list_member_hook<> lru_item; + boost::intrusive::list_member_hook<> state_item; + + Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l, + unsigned f = 0) + : space(space), state(s), flags(f), seq(q), offset(o), length(l) {} + Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b, + unsigned f = 0) + : space(space), state(s), flags(f), seq(q), offset(o), + length(b.length()), data(b) {} + + bool is_empty() const { + return state == STATE_EMPTY; + } + bool is_clean() const { + return state == STATE_CLEAN; + } + bool is_writing() const { + return state == STATE_WRITING; + } + + uint32_t end() const { + return offset + length; + } + + void truncate(uint32_t newlen) { + ceph_assert(newlen < 
length); + if (data.length()) { + bufferlist t; + t.substr_of(data, 0, newlen); + data.claim(t); + } + length = newlen; + } + void maybe_rebuild() { + if (data.length() && + (data.get_num_buffers() > 1 || + data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) { + data.rebuild(); + } + } + + void dump(Formatter *f) const { + f->dump_string("state", get_state_name(state)); + f->dump_unsigned("seq", seq); + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); + f->dump_unsigned("data_length", data.length()); + } + }; + + struct Cache; + + /// map logical extent range (object) onto buffers + struct BufferSpace { + enum { + BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache + }; + + typedef boost::intrusive::list< + Buffer, + boost::intrusive::member_hook< + Buffer, + boost::intrusive::list_member_hook<>, + &Buffer::state_item> > state_list_t; + + mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>> + buffer_map; + + // we use a bare intrusive list here instead of std::map because + // it uses less memory and we expect this to be very small (very + // few IOs in flight to the same Blob at the same time). + state_list_t writing; ///< writing buffers, sorted by seq, ascending + + ~BufferSpace() { + ceph_assert(buffer_map.empty()); + ceph_assert(writing.empty()); + } + + void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) { + cache->_audit("_add_buffer start"); + buffer_map[b->offset].reset(b); + if (b->is_writing()) { + b->data.reassign_to_mempool(mempool::mempool_bluestore_writing); + if (writing.empty() || writing.rbegin()->seq <= b->seq) { + writing.push_back(*b); + } else { + auto it = writing.begin(); + while (it->seq < b->seq) { + ++it; + } + + ceph_assert(it->seq >= b->seq); + // note that this will insert b before it + // hence the order is maintained + writing.insert(it, *b); + } + } else { + b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); + cache->_add_buffer(b, level, near); + } + cache->_audit("_add_buffer end"); + } + void _rm_buffer(Cache* cache, Buffer *b) { + _rm_buffer(cache, buffer_map.find(b->offset)); + } + void _rm_buffer(Cache* cache, + map<uint32_t, std::unique_ptr<Buffer>>::iterator p) { + ceph_assert(p != buffer_map.end()); + cache->_audit("_rm_buffer start"); + if (p->second->is_writing()) { + writing.erase(writing.iterator_to(*p->second)); + } else { + cache->_rm_buffer(p->second.get()); + } + buffer_map.erase(p); + cache->_audit("_rm_buffer end"); + } + + map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound( + uint32_t offset) { + auto i = buffer_map.lower_bound(offset); + if (i != buffer_map.begin()) { + --i; + if (i->first + i->second->length <= offset) + ++i; + } + return i; + } + + // must be called under protection of the Cache lock + void _clear(Cache* cache); + + // return value is the highest cache_private of a trimmed buffer, or 0. + int discard(Cache* cache, uint32_t offset, uint32_t length) { + std::lock_guard l(cache->lock); + return _discard(cache, offset, length); + } + int _discard(Cache* cache, uint32_t offset, uint32_t length); + + void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl, + unsigned flags) { + std::lock_guard l(cache->lock); + Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl, + flags); + b->cache_private = _discard(cache, offset, bl.length()); + _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 
0 : 1, nullptr); + } + void _finish_write(Cache* cache, uint64_t seq); + void did_read(Cache* cache, uint32_t offset, bufferlist& bl) { + std::lock_guard l(cache->lock); + Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl); + b->cache_private = _discard(cache, offset, bl.length()); + _add_buffer(cache, b, 1, nullptr); + } + + void read(Cache* cache, uint32_t offset, uint32_t length, + BlueStore::ready_regions_t& res, + interval_set<uint32_t>& res_intervals, + int flags = 0); + + void truncate(Cache* cache, uint32_t offset) { + discard(cache, offset, (uint32_t)-1 - offset); + } + + void split(Cache* cache, size_t pos, BufferSpace &r); + + void dump(Cache* cache, Formatter *f) const { + std::lock_guard l(cache->lock); + f->open_array_section("buffers"); + for (auto& i : buffer_map) { + f->open_object_section("buffer"); + ceph_assert(i.first == i.second->offset); + i.second->dump(f); + f->close_section(); + } + f->close_section(); + } + }; + + struct SharedBlobSet; + + /// in-memory shared blob state (incl cached buffers) + struct SharedBlob { + MEMPOOL_CLASS_HELPERS(); + + std::atomic_int nref = {0}; ///< reference count + bool loaded = false; + + CollectionRef coll; + union { + uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded + bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any + }; + BufferSpace bc; ///< buffer cache + + SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) { + if (get_cache()) { + get_cache()->add_blob(); + } + } + SharedBlob(uint64_t i, Collection *_coll); + ~SharedBlob(); + + uint64_t get_sbid() const { + return loaded ? persistent->sbid : sbid_unloaded; + } + + friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); } + friend void intrusive_ptr_release(SharedBlob *b) { b->put(); } + + friend ostream& operator<<(ostream& out, const SharedBlob& sb); + + void get() { + ++nref; + } + void put(); + + /// get logical references + void get_ref(uint64_t offset, uint32_t length); + + /// put logical references, and get back any released extents + void put_ref(uint64_t offset, uint32_t length, + PExtentVector *r, bool *unshare); + + void finish_write(uint64_t seq); + + friend bool operator==(const SharedBlob &l, const SharedBlob &r) { + return l.get_sbid() == r.get_sbid(); + } + inline Cache* get_cache() { + return coll ? coll->cache : nullptr; + } + inline SharedBlobSet* get_parent() { + return coll ? 
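// _data_lower_bound() above is the usual interval-lookup idiom for a map keyed
// by start offset: take lower_bound(offset), then step back one entry in case
// the preceding buffer extends across 'offset'. Self-contained illustration
// with a plain std::map (hypothetical names):
//
//   #include <cstdint>
//   #include <map>
//
//   using buffers_t = std::map<uint32_t, uint32_t>;   // offset -> length
//
//   buffers_t::iterator first_possible_overlap(buffers_t& m, uint32_t offset) {
//     auto i = m.lower_bound(offset);
//     if (i != m.begin()) {
//       --i;
//       if (i->first + i->second <= offset)
//         ++i;                        // previous entry ends at/before 'offset'
//     }
//     return i;
//   }
//
//   // e.g. {0x0000:0x1000, 0x2000:0x1000}: offset 0x2800 -> the 0x2000 entry
//   // (it covers 0x2000-0x2fff); offset 0x1800 -> also the 0x2000 entry,
//   // because the 0x0000 buffer ends at 0x1000 and cannot overlap.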
&(coll->shared_blob_set) : nullptr; + } + inline bool is_loaded() const { + return loaded; + } + + }; + typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef; + + /// a lookup table of SharedBlobs + struct SharedBlobSet { + /// protect lookup, insertion, removal + ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock"); + + // we use a bare pointer because we don't want to affect the ref + // count + mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map; + + SharedBlobRef lookup(uint64_t sbid) { + std::lock_guard l(lock); + auto p = sb_map.find(sbid); + if (p == sb_map.end() || + p->second->nref == 0) { + return nullptr; + } + return p->second; + } + + void add(Collection* coll, SharedBlob *sb) { + std::lock_guard l(lock); + sb_map[sb->get_sbid()] = sb; + sb->coll = coll; + } + + bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) { + std::lock_guard l(lock); + ceph_assert(sb->get_parent() == this); + if (verify_nref_is_zero && sb->nref != 0) { + return false; + } + // only remove if it still points to us + auto p = sb_map.find(sb->get_sbid()); + if (p != sb_map.end() && + p->second == sb) { + sb_map.erase(p); + } + return true; + } + + bool empty() { + std::lock_guard l(lock); + return sb_map.empty(); + } + + template <int LogLevelV> + void dump(CephContext *cct); + }; + +//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/ + + /// in-memory blob metadata and associated cached buffers (if any) + struct Blob { + MEMPOOL_CLASS_HELPERS(); + + std::atomic_int nref = {0}; ///< reference count + int16_t id = -1; ///< id, for spanning blobs only, >= 0 + int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only + SharedBlobRef shared_blob; ///< shared blob state (if any) + + private: + mutable bluestore_blob_t blob; ///< decoded blob metadata +#ifdef CACHE_BLOB_BL + mutable bufferlist blob_bl; ///< cached encoded blob, blob is dirty if empty +#endif + /// refs from this shard. ephemeral if id<0, persisted if spanning. + bluestore_blob_use_tracker_t used_in_blob; + + public: + + friend void intrusive_ptr_add_ref(Blob *b) { b->get(); } + friend void intrusive_ptr_release(Blob *b) { b->put(); } + + friend ostream& operator<<(ostream& out, const Blob &b); + + const bluestore_blob_use_tracker_t& get_blob_use_tracker() const { + return used_in_blob; + } + bool is_referenced() const { + return used_in_blob.is_not_empty(); + } + uint32_t get_referenced_bytes() const { + return used_in_blob.get_referenced_bytes(); + } + + bool is_spanning() const { + return id >= 0; + } + + bool can_split() const { + std::lock_guard l(shared_blob->get_cache()->lock); + // splitting a BufferSpace writing list is too hard; don't try. 
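// The SharedBlobSet above exists so that two Blobs referencing the same
// on-disk shared blob (e.g. after a clone) resolve to a single in-memory
// SharedBlob and therefore share one BufferSpace. A hedged sketch of how a
// caller is expected to attach one; attach_shared is a made-up name and the
// real logic lives in Collection::open_shared_blob():
//
//   void attach_shared(BlueStore::SharedBlobSet& sbs,
//                      BlueStore::Collection* c,
//                      uint64_t sbid,
//                      BlueStore::BlobRef b) {
//     BlueStore::SharedBlobRef sb = sbs.lookup(sbid);  // reuse if cached
//     if (!sb) {
//       sb = new BlueStore::SharedBlob(sbid, c);       // persistent part is
//       sbs.add(c, sb.get());                          // loaded lazily
//     }
//     b->shared_blob = sb;          // clones now share buffers and refcounts
//   }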
+ return shared_blob->bc.writing.empty() && + used_in_blob.can_split() && + get_blob().can_split(); + } + + bool can_split_at(uint32_t blob_offset) const { + return used_in_blob.can_split_at(blob_offset) && + get_blob().can_split_at(blob_offset); + } + + bool can_reuse_blob(uint32_t min_alloc_size, + uint32_t target_blob_size, + uint32_t b_offset, + uint32_t *length0); + + void dup(Blob& o) { + o.shared_blob = shared_blob; + o.blob = blob; +#ifdef CACHE_BLOB_BL + o.blob_bl = blob_bl; +#endif + } + + inline const bluestore_blob_t& get_blob() const { + return blob; + } + inline bluestore_blob_t& dirty_blob() { +#ifdef CACHE_BLOB_BL + blob_bl.clear(); +#endif + return blob; + } + + /// discard buffers for unallocated regions + void discard_unallocated(Collection *coll); + + /// get logical references + void get_ref(Collection *coll, uint32_t offset, uint32_t length); + /// put logical references, and get back any released extents + bool put_ref(Collection *coll, uint32_t offset, uint32_t length, + PExtentVector *r); + + /// split the blob + void split(Collection *coll, uint32_t blob_offset, Blob *o); + + void get() { + ++nref; + } + void put() { + if (--nref == 0) + delete this; + } + + +#ifdef CACHE_BLOB_BL + void _encode() const { + if (blob_bl.length() == 0 ) { + encode(blob, blob_bl); + } else { + ceph_assert(blob_bl.length()); + } + } + void bound_encode( + size_t& p, + bool include_ref_map) const { + _encode(); + p += blob_bl.length(); + if (include_ref_map) { + used_in_blob.bound_encode(p); + } + } + void encode( + bufferlist::contiguous_appender& p, + bool include_ref_map) const { + _encode(); + p.append(blob_bl); + if (include_ref_map) { + used_in_blob.encode(p); + } + } + void decode( + Collection */*coll*/, + bufferptr::const_iterator& p, + bool include_ref_map) { + const char *start = p.get_pos(); + denc(blob, p); + const char *end = p.get_pos(); + blob_bl.clear(); + blob_bl.append(start, end - start); + if (include_ref_map) { + used_in_blob.decode(p); + } + } +#else + void bound_encode( + size_t& p, + uint64_t struct_v, + uint64_t sbid, + bool include_ref_map) const { + denc(blob, p, struct_v); + if (blob.is_shared()) { + denc(sbid, p); + } + if (include_ref_map) { + used_in_blob.bound_encode(p); + } + } + void encode( + bufferlist::contiguous_appender& p, + uint64_t struct_v, + uint64_t sbid, + bool include_ref_map) const { + denc(blob, p, struct_v); + if (blob.is_shared()) { + denc(sbid, p); + } + if (include_ref_map) { + used_in_blob.encode(p); + } + } + void decode( + Collection *coll, + bufferptr::const_iterator& p, + uint64_t struct_v, + uint64_t* sbid, + bool include_ref_map); +#endif + }; + typedef boost::intrusive_ptr<Blob> BlobRef; + typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t; + + /// a logical extent, pointing to (some portion of) a blob + typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings + struct Extent : public ExtentBase { + MEMPOOL_CLASS_HELPERS(); + + uint32_t logical_offset = 0; ///< logical offset + uint32_t blob_offset = 0; ///< blob offset + uint32_t length = 0; ///< length + BlobRef blob; ///< the blob with our data + + /// ctor for lookup only + explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { } + /// ctor for delayed initialization (see decode_some()) + explicit Extent() : ExtentBase() { + } + /// ctor for general usage + Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) + : ExtentBase(), + logical_offset(lo), blob_offset(o), 
length(l) { + assign_blob(b); + } + ~Extent() { + if (blob) { + blob->shared_blob->get_cache()->rm_extent(); + } + } + + void assign_blob(const BlobRef& b) { + ceph_assert(!blob); + blob = b; + blob->shared_blob->get_cache()->add_extent(); + } + + // comparators for intrusive_set + friend bool operator<(const Extent &a, const Extent &b) { + return a.logical_offset < b.logical_offset; + } + friend bool operator>(const Extent &a, const Extent &b) { + return a.logical_offset > b.logical_offset; + } + friend bool operator==(const Extent &a, const Extent &b) { + return a.logical_offset == b.logical_offset; + } + + uint32_t blob_start() const { + return logical_offset - blob_offset; + } + + uint32_t blob_end() const { + return blob_start() + blob->get_blob().get_logical_length(); + } + + uint32_t logical_end() const { + return logical_offset + length; + } + + // return true if any piece of the blob is out of + // the given range [o, o + l]. + bool blob_escapes_range(uint32_t o, uint32_t l) const { + return blob_start() < o || blob_end() > o + l; + } + }; + typedef boost::intrusive::set<Extent> extent_map_t; + + + friend ostream& operator<<(ostream& out, const Extent& e); + + struct OldExtent { + boost::intrusive::list_member_hook<> old_extent_item; + Extent e; + PExtentVector r; + bool blob_empty; // flag to track the last removed extent that makes blob + // empty - required to update compression stat properly + OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) + : e(lo, o, l, b), blob_empty(false) { + } + static OldExtent* create(CollectionRef c, + uint32_t lo, + uint32_t o, + uint32_t l, + BlobRef& b); + }; + typedef boost::intrusive::list< + OldExtent, + boost::intrusive::member_hook< + OldExtent, + boost::intrusive::list_member_hook<>, + &OldExtent::old_extent_item> > old_extent_map_t; + + struct Onode; + + /// a sharded extent map, mapping offsets to lextents to blobs + struct ExtentMap { + Onode *onode; + extent_map_t extent_map; ///< map of Extents to Blobs + blob_map_t spanning_blob_map; ///< blobs that span shards + typedef boost::intrusive_ptr<Onode> OnodeRef; + + struct Shard { + bluestore_onode_t::shard_info *shard_info = nullptr; + unsigned extents = 0; ///< count extents in this shard + bool loaded = false; ///< true if shard is loaded + bool dirty = false; ///< true if shard is dirty and needs reencoding + }; + mempool::bluestore_cache_meta::vector<Shard> shards; ///< shards + + bufferlist inline_bl; ///< cached encoded map, if unsharded; empty=>dirty + + uint32_t needs_reshard_begin = 0; + uint32_t needs_reshard_end = 0; + + void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&, + uint64_t&, uint64_t&, uint64_t&); + + bool needs_reshard() const { + return needs_reshard_end > needs_reshard_begin; + } + void clear_needs_reshard() { + needs_reshard_begin = needs_reshard_end = 0; + } + void request_reshard(uint32_t begin, uint32_t end) { + if (begin < needs_reshard_begin) { + needs_reshard_begin = begin; + } + if (end > needs_reshard_end) { + needs_reshard_end = end; + } + } + + struct DeleteDisposer { + void operator()(Extent *e) { delete e; } + }; + + ExtentMap(Onode *o); + ~ExtentMap() { + extent_map.clear_and_dispose(DeleteDisposer()); + } + + void clear() { + extent_map.clear_and_dispose(DeleteDisposer()); + shards.clear(); + inline_bl.clear(); + clear_needs_reshard(); + } + + bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl, + unsigned *pn); + unsigned decode_some(bufferlist& bl); + + void bound_encode_spanning_blobs(size_t& p); + 
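// Worked example of the Extent offset arithmetic above. Take an extent
// e{logical_offset=0x3000, blob_offset=0x1000, length=0x800} whose blob has a
// logical length of 0x4000 (illustrative numbers):
//
//   e.blob_start()   == 0x3000 - 0x1000  == 0x2000
//   e.blob_end()     == 0x2000 + 0x4000  == 0x6000
//   e.logical_end()  == 0x3000 + 0x800   == 0x3800
//   e.blob_escapes_range(0x2000, 0x2000) == true
//     // the blob spans 0x2000..0x6000 but the range only covers
//     // 0x2000..0x4000, so part of the blob lies outside it
//
// i.e. blob_start() is the object offset at which byte 0 of the blob would
// sit, which is what lets the map reason about whole-blob boundaries from a
// single lextent.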
void encode_spanning_blobs(bufferlist::contiguous_appender& p); + void decode_spanning_blobs(bufferptr::const_iterator& p); + + BlobRef get_spanning_blob(int id) { + auto p = spanning_blob_map.find(id); + ceph_assert(p != spanning_blob_map.end()); + return p->second; + } + + void update(KeyValueDB::Transaction t, bool force); + decltype(BlueStore::Blob::id) allocate_spanning_blob_id(); + void reshard( + KeyValueDB *db, + KeyValueDB::Transaction t); + + /// initialize Shards from the onode + void init_shards(bool loaded, bool dirty); + + /// return index of shard containing offset + /// or -1 if not found + int seek_shard(uint32_t offset) { + size_t end = shards.size(); + size_t mid, left = 0; + size_t right = end; // one passed the right end + + while (left < right) { + mid = left + (right - left) / 2; + if (offset >= shards[mid].shard_info->offset) { + size_t next = mid + 1; + if (next >= end || offset < shards[next].shard_info->offset) + return mid; + //continue to search forwards + left = next; + } else { + //continue to search backwards + right = mid; + } + } + + return -1; // not found + } + + /// check if a range spans a shard + bool spans_shard(uint32_t offset, uint32_t length) { + if (shards.empty()) { + return false; + } + int s = seek_shard(offset); + ceph_assert(s >= 0); + if (s == (int)shards.size() - 1) { + return false; // last shard + } + if (offset + length <= shards[s+1].shard_info->offset) { + return false; + } + return true; + } + + /// ensure that a range of the map is loaded + void fault_range(KeyValueDB *db, + uint32_t offset, uint32_t length); + + /// ensure a range of the map is marked dirty + void dirty_range(uint32_t offset, uint32_t length); + + /// for seek_lextent test + extent_map_t::iterator find(uint64_t offset); + + /// seek to the first lextent including or after offset + extent_map_t::iterator seek_lextent(uint64_t offset); + extent_map_t::const_iterator seek_lextent(uint64_t offset) const; + + /// add a new Extent + void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) { + extent_map.insert(*new Extent(lo, o, l, b)); + } + + /// remove (and delete) an Extent + void rm(extent_map_t::iterator p) { + extent_map.erase_and_dispose(p, DeleteDisposer()); + } + + bool has_any_lextents(uint64_t offset, uint64_t length); + + /// consolidate adjacent lextents in extent_map + int compress_extent_map(uint64_t offset, uint64_t length); + + /// punch a logical hole. add lextents to deref to target list. + void punch_hole(CollectionRef &c, + uint64_t offset, uint64_t length, + old_extent_map_t *old_extents); + + /// put new lextent into lextent_map overwriting existing ones if + /// any and update references accordingly + Extent *set_lextent(CollectionRef &c, + uint64_t logical_offset, + uint64_t offset, uint64_t length, + BlobRef b, + old_extent_map_t *old_extents); + + /// split a blob (and referring extents) + BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos); + }; + + /// Compressed Blob Garbage collector + /* + The primary idea of the collector is to estimate a difference between + allocation units(AU) currently present for compressed blobs and new AUs + required to store that data uncompressed. + Estimation is performed for protrusive extents within a logical range + determined by a concatenation of old_extents collection and specific(current) + write request. + The root cause for old_extents use is the need to handle blob ref counts + properly. 
Old extents still hold blob refs and hence we need to traverse + the collection to determine if blob to be released. + Protrusive extents are extents that fit into the blob set in action + (ones that are below the logical range from above) but not removed totally + due to the current write. + E.g. for + extent1 <loffs = 100, boffs = 100, len = 100> -> + blob1<compressed, len_on_disk=4096, logical_len=8192> + extent2 <loffs = 200, boffs = 200, len = 100> -> + blob2<raw, len_on_disk=4096, llen=4096> + extent3 <loffs = 300, boffs = 300, len = 100> -> + blob1<compressed, len_on_disk=4096, llen=8192> + extent4 <loffs = 4096, boffs = 0, len = 100> -> + blob3<raw, len_on_disk=4096, llen=4096> + write(300~100) + protrusive extents are within the following ranges <0~300, 400~8192-400> + In this case existing AUs that might be removed due to GC (i.e. blob1) + use 2x4K bytes. + And new AUs expected after GC = 0 since extent1 to be merged into blob2. + Hence we should do a collect. + */ + class GarbageCollector + { + public: + /// return amount of allocation units that might be saved due to GC + int64_t estimate( + uint64_t offset, + uint64_t length, + const ExtentMap& extent_map, + const old_extent_map_t& old_extents, + uint64_t min_alloc_size); + + /// return a collection of extents to perform GC on + const interval_set<uint64_t>& get_extents_to_collect() const { + return extents_to_collect; + } + GarbageCollector(CephContext* _cct) : cct(_cct) {} + + private: + struct BlobInfo { + uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob + int64_t expected_allocations = 0; ///< new alloc units required + ///< in case of gc fulfilled + bool collect_candidate = false; ///< indicate if blob has any extents + ///< eligible for GC. + extent_map_t::const_iterator first_lextent; ///< points to the first + ///< lextent referring to + ///< the blob if any. + ///< collect_candidate flag + ///< determines the validity + extent_map_t::const_iterator last_lextent; ///< points to the last + ///< lextent referring to + ///< the blob if any. + + BlobInfo(uint64_t ref_bytes) : + referenced_bytes(ref_bytes) { + } + }; + CephContext* cct; + map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map + ///< copies that are affected by the + ///< specific write + + ///< protrusive extents that should be collected if GC takes place + interval_set<uint64_t> extents_to_collect; + + boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation + ///< unit when traversing + ///< protrusive extents. + ///< Other extents mapped to + ///< this AU to be ignored + ///< (except the case where + ///< uncompressed extent follows + ///< compressed one - see below). + BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit + ///< caused expected_allocations + ///< counter increment at this blob. 
+ ///< if uncompressed extent follows + ///< a decrement for the + ///< expected_allocations counter + ///< is needed + int64_t expected_allocations = 0; ///< new alloc units required in case + ///< of gc fulfilled + int64_t expected_for_release = 0; ///< alloc units currently used by + ///< compressed blobs that might + ///< gone after GC + + protected: + void process_protrusive_extents(const BlueStore::ExtentMap& extent_map, + uint64_t start_offset, + uint64_t end_offset, + uint64_t start_touch_offset, + uint64_t end_touch_offset, + uint64_t min_alloc_size); + }; + + struct OnodeSpace; + + /// an in-memory object + struct Onode { + MEMPOOL_CLASS_HELPERS(); + + std::atomic_int nref; ///< reference count + Collection *c; + + ghobject_t oid; + + /// key under PREFIX_OBJ where we are stored + mempool::bluestore_cache_meta::string key; + + boost::intrusive::list_member_hook<> lru_item; + + bluestore_onode_t onode; ///< metadata stored as value in kv store + bool exists; ///< true if object logically exists + + ExtentMap extent_map; + + // track txc's that have not been committed to kv store (and whose + // effects cannot be read via the kvdb read methods) + std::atomic<int> flushing_count = {0}; + /// protect flush_txns + ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock"); + ceph::condition_variable flush_cond; ///< wait here for uncommitted txns + + Onode(Collection *c, const ghobject_t& o, + const mempool::bluestore_cache_meta::string& k) + : nref(0), + c(c), + oid(o), + key(k), + exists(false), + extent_map(this) { + } + Onode(Collection* c, const ghobject_t& o, + const string& k) + : nref(0), + c(c), + oid(o), + key(k), + exists(false), + extent_map(this) { + } + Onode(Collection* c, const ghobject_t& o, + const char* k) + : nref(0), + c(c), + oid(o), + key(k), + exists(false), + extent_map(this) { + } + + static Onode* decode( + CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& v); + + void flush(); + void get() { + ++nref; + } + void put() { + if (--nref == 0) + delete this; + } + }; + typedef boost::intrusive_ptr<Onode> OnodeRef; + + + /// a cache (shard) of onodes and buffers + struct Cache { + CephContext* cct; + PerfCounters *logger; + + /// protect lru and other structures + ceph::recursive_mutex lock = { + ceph::make_recursive_mutex("BlueStore::Cache::lock") }; + + std::atomic<uint64_t> num_extents = {0}; + std::atomic<uint64_t> num_blobs = {0}; + + std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes; + + static Cache *create(CephContext* cct, string type, PerfCounters *logger); + + Cache(CephContext* cct) : cct(cct), logger(nullptr) {} + virtual ~Cache() {} + + virtual void _add_onode(OnodeRef& o, int level) = 0; + virtual void _rm_onode(OnodeRef& o) = 0; + virtual void _touch_onode(OnodeRef& o) = 0; + + virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0; + virtual void _rm_buffer(Buffer *b) = 0; + virtual void _move_buffer(Cache *src, Buffer *b) = 0; + virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0; + virtual void _touch_buffer(Buffer *b) = 0; + + virtual uint64_t _get_num_onodes() = 0; + virtual uint64_t _get_buffer_bytes() = 0; + + void add_extent() { + ++num_extents; + } + void rm_extent() { + --num_extents; + } + + void add_blob() { + ++num_blobs; + } + void rm_blob() { + --num_blobs; + } + + void trim(uint64_t onode_max, uint64_t buffer_max); + + void trim_all(); + + virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0; + + virtual void add_stats(uint64_t 
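// Hedged usage sketch for the GarbageCollector documented above: the write
// path asks it whether rewriting the affected compressed blobs uncompressed
// would free more allocation units than it costs, and if so rewrites the
// returned ranges as part of the same transaction. maybe_gc() is a made-up
// helper name, not BlueStore API:
//
//   bool maybe_gc(CephContext* cct,
//                 const BlueStore::ExtentMap& em,
//                 const BlueStore::old_extent_map_t& old_extents,
//                 uint64_t offset, uint64_t length, uint64_t min_alloc_size,
//                 interval_set<uint64_t>* to_rewrite) {
//     BlueStore::GarbageCollector gc(cct);
//     int64_t saved_aus = gc.estimate(offset, length, em, old_extents,
//                                     min_alloc_size);
//     if (saved_aus <= 0)
//       return false;                       // collection would not help
//     *to_rewrite = gc.get_extents_to_collect();
//     return true;
//   }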
*onodes, uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) = 0; + + bool empty() { + std::lock_guard l(lock); + return _get_num_onodes() == 0 && _get_buffer_bytes() == 0; + } + +#ifdef DEBUG_CACHE + virtual void _audit(const char *s) = 0; +#else + void _audit(const char *s) { /* no-op */ } +#endif + }; + + /// simple LRU cache for onodes and buffers + struct LRUCache : public Cache { + private: + typedef boost::intrusive::list< + Onode, + boost::intrusive::member_hook< + Onode, + boost::intrusive::list_member_hook<>, + &Onode::lru_item> > onode_lru_list_t; + typedef boost::intrusive::list< + Buffer, + boost::intrusive::member_hook< + Buffer, + boost::intrusive::list_member_hook<>, + &Buffer::lru_item> > buffer_lru_list_t; + + onode_lru_list_t onode_lru; + onode_lru_list_t::iterator last_pinned; + + buffer_lru_list_t buffer_lru; + uint64_t buffer_size = 0; + + void _onode_lru_erase(onode_lru_list_t::iterator it) { + if (it == last_pinned) { + last_pinned = onode_lru.end(); + } + onode_lru.erase(it); + } + + public: + LRUCache(CephContext* cct) : Cache(cct), last_pinned(onode_lru.end()){} + uint64_t _get_num_onodes() override { + return onode_lru.size(); + } + void _add_onode(OnodeRef& o, int level) override { + if (level > 0) + onode_lru.push_front(*o); + else + onode_lru.push_back(*o); + } + void _rm_onode(OnodeRef& o) override { + auto q = onode_lru.iterator_to(*o); + _onode_lru_erase(q); + } + void _touch_onode(OnodeRef& o) override; + + uint64_t _get_buffer_bytes() override { + return buffer_size; + } + void _add_buffer(Buffer *b, int level, Buffer *near) override { + if (near) { + auto q = buffer_lru.iterator_to(*near); + buffer_lru.insert(q, *b); + } else if (level > 0) { + buffer_lru.push_front(*b); + } else { + buffer_lru.push_back(*b); + } + buffer_size += b->length; + } + void _rm_buffer(Buffer *b) override { + ceph_assert(buffer_size >= b->length); + buffer_size -= b->length; + auto q = buffer_lru.iterator_to(*b); + buffer_lru.erase(q); + } + void _move_buffer(Cache *src, Buffer *b) override { + src->_rm_buffer(b); + _add_buffer(b, 0, nullptr); + } + void _adjust_buffer_size(Buffer *b, int64_t delta) override { + ceph_assert((int64_t)buffer_size + delta >= 0); + buffer_size += delta; + } + void _touch_buffer(Buffer *b) override { + auto p = buffer_lru.iterator_to(*b); + buffer_lru.erase(p); + buffer_lru.push_front(*b); + _audit("_touch_buffer end"); + } + + void _trim(uint64_t onode_max, uint64_t buffer_max) override; + + void add_stats(uint64_t *onodes, uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) override { + std::lock_guard l(lock); + *onodes += onode_lru.size(); + *extents += num_extents; + *blobs += num_blobs; + *buffers += buffer_lru.size(); + *bytes += buffer_size; + } + +#ifdef DEBUG_CACHE + void _audit(const char *s) override; +#endif + }; + + // 2Q cache for buffers, LRU for onodes + struct TwoQCache : public Cache { + private: + // stick with LRU for onodes for now (fixme?) 
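// Both cache flavours rely on boost::intrusive lists: the hook lives inside
// Buffer/Onode (the lru_item members above), so touch and evict are O(1) and
// never allocate. Self-contained illustration of the same pattern with a
// hypothetical Node type:
//
//   #include <boost/intrusive/list.hpp>
//
//   struct Node {
//     boost::intrusive::list_member_hook<> lru_item;   // embedded hook
//     int value = 0;
//   };
//   using lru_t = boost::intrusive::list<
//     Node,
//     boost::intrusive::member_hook<
//       Node, boost::intrusive::list_member_hook<>, &Node::lru_item>>;
//
//   // "touching" an element is just unlink + push_front, exactly as in
//   // LRUCache::_touch_buffer() above:
//   void touch(lru_t& lru, Node& n) {
//     lru.erase(lru.iterator_to(n));
//     lru.push_front(n);
//   }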
+ typedef boost::intrusive::list< + Onode, + boost::intrusive::member_hook< + Onode, + boost::intrusive::list_member_hook<>, + &Onode::lru_item> > onode_lru_list_t; + typedef boost::intrusive::list< + Buffer, + boost::intrusive::member_hook< + Buffer, + boost::intrusive::list_member_hook<>, + &Buffer::lru_item> > buffer_list_t; + + onode_lru_list_t onode_lru; + onode_lru_list_t::iterator last_pinned; + + buffer_list_t buffer_hot; ///< "Am" hot buffers + buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers + buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted + uint64_t buffer_bytes = 0; ///< bytes + + enum { + BUFFER_NEW = 0, + BUFFER_WARM_IN, ///< in buffer_warm_in + BUFFER_WARM_OUT, ///< in buffer_warm_out + BUFFER_HOT, ///< in buffer_hot + BUFFER_TYPE_MAX + }; + + uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type + + void _onode_lru_erase(onode_lru_list_t::iterator it) { + if (it == last_pinned) { + last_pinned = onode_lru.end(); + } + onode_lru.erase(it); + } + public: + TwoQCache(CephContext* cct) : Cache(cct), last_pinned(onode_lru.end()){} + uint64_t _get_num_onodes() override { + return onode_lru.size(); + } + void _add_onode(OnodeRef& o, int level) override { + if (level > 0) + onode_lru.push_front(*o); + else + onode_lru.push_back(*o); + } + void _rm_onode(OnodeRef& o) override { + auto q = onode_lru.iterator_to(*o); + _onode_lru_erase(q); + } + void _touch_onode(OnodeRef& o) override; + + uint64_t _get_buffer_bytes() override { + return buffer_bytes; + } + void _add_buffer(Buffer *b, int level, Buffer *near) override; + void _rm_buffer(Buffer *b) override; + void _move_buffer(Cache *src, Buffer *b) override; + void _adjust_buffer_size(Buffer *b, int64_t delta) override; + void _touch_buffer(Buffer *b) override { + switch (b->cache_private) { + case BUFFER_WARM_IN: + // do nothing (somewhat counter-intuitively!) 
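// (Why the no-op below for BUFFER_WARM_IN: in the classic 2Q scheme a first
// access only admits a buffer to A1in -- buffer_warm_in here -- and a repeat
// hit while it is still in A1in is treated as a correlated access, so it is
// deliberately not promoted. Promotion to Am/buffer_hot happens when a read
// hits the A1out ghost list, buffer_warm_out, which keeps evicted buffers'
// metadata but not their bytes. Rough admission logic, written as if it were
// a TwoQCache member -- a sketch, not the real _add_buffer()/_touch_buffer():
//
//   void on_access(Buffer* b) {
//     if (b->cache_private == BUFFER_HOT) {
//       buffer_hot.erase(buffer_hot.iterator_to(*b));
//       buffer_hot.push_front(*b);                      // keep it hot
//     } else if (b->cache_private == BUFFER_WARM_OUT) {
//       buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
//       b->cache_private = BUFFER_HOT;                  // proven re-use
//       buffer_hot.push_front(*b);
//     } else if (b->cache_private == BUFFER_NEW) {
//       b->cache_private = BUFFER_WARM_IN;              // first access: A1in
//       buffer_warm_in.push_front(*b);
//     } // else: already in warm_in -- deliberately left where it is
//   } )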
+ break; + case BUFFER_WARM_OUT: + // move from warm_out to hot LRU + ceph_abort_msg("this happens via discard hint"); + break; + case BUFFER_HOT: + // move to front of hot LRU + buffer_hot.erase(buffer_hot.iterator_to(*b)); + buffer_hot.push_front(*b); + break; + } + _audit("_touch_buffer end"); + } + + void _trim(uint64_t onode_max, uint64_t buffer_max) override; + + void add_stats(uint64_t *onodes, uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) override { + std::lock_guard l(lock); + *onodes += onode_lru.size(); + *extents += num_extents; + *blobs += num_blobs; + *buffers += buffer_hot.size() + buffer_warm_in.size(); + *bytes += buffer_bytes; + } + +#ifdef DEBUG_CACHE + void _audit(const char *s) override; +#endif + }; + + struct OnodeSpace { + private: + Cache *cache; + + /// forward lookups + mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map; + + friend class Collection; // for split_cache() + + public: + OnodeSpace(Cache *c) : cache(c) {} + ~OnodeSpace() { + clear(); + } + + OnodeRef add(const ghobject_t& oid, OnodeRef o); + OnodeRef lookup(const ghobject_t& o); + void remove(const ghobject_t& oid) { + onode_map.erase(oid); + } + void rename(OnodeRef& o, const ghobject_t& old_oid, + const ghobject_t& new_oid, + const mempool::bluestore_cache_meta::string& new_okey); + void clear(); + bool empty(); + + template <int LogLevelV> + void dump(CephContext *cct); + + /// return true if f true for any item + bool map_any(std::function<bool(OnodeRef)> f); + }; + + class OpSequencer; + typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef; + + struct Collection : public CollectionImpl { + BlueStore *store; + OpSequencerRef osr; + Cache *cache; ///< our cache shard + bluestore_cnode_t cnode; + RWLock lock; + + bool exists; + + SharedBlobSet shared_blob_set; ///< open SharedBlobs + + // cache onodes on a per-collection basis to avoid lock + // contention. + OnodeSpace onode_map; + + //pool options + pool_opts_t pool_opts; + ContextQueue *commit_queue; + + OnodeRef get_onode(const ghobject_t& oid, bool create); + + // the terminology is confusing here, sorry! + // + // blob_t shared_blob_t + // !shared unused -> open + // shared !loaded -> open + shared + // shared loaded -> open + shared + loaded + // + // i.e., + // open = SharedBlob is instantiated + // shared = blob_t shared flag is set; SharedBlob is hashed. 
+ // loaded = SharedBlob::shared_blob_t is loaded from kv store + void open_shared_blob(uint64_t sbid, BlobRef b); + void load_shared_blob(SharedBlobRef sb); + void make_blob_shared(uint64_t sbid, BlobRef b); + uint64_t make_blob_unshared(SharedBlob *sb); + + BlobRef new_blob() { + BlobRef b = new Blob(); + b->shared_blob = new SharedBlob(this); + return b; + } + + bool contains(const ghobject_t& oid) { + if (cid.is_meta()) + return oid.hobj.pool == -1; + spg_t spgid; + if (cid.is_pg(&spgid)) + return + spgid.pgid.contains(cnode.bits, oid) && + oid.shard_id == spgid.shard; + return false; + } + + void split_cache(Collection *dest); + + bool flush_commit(Context *c) override; + void flush() override; + void flush_all_but_last(); + + Collection(BlueStore *ns, Cache *ca, coll_t c); + }; + + class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { + CollectionRef c; + OnodeRef o; + KeyValueDB::Iterator it; + string head, tail; + + string _stringify() const; + + public: + OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it); + int seek_to_first() override; + int upper_bound(const string &after) override; + int lower_bound(const string &to) override; + bool valid() override; + int next() override; + string key() override; + bufferlist value() override; + int status() override { + return 0; + } + }; + + struct volatile_statfs{ + enum { + STATFS_ALLOCATED = 0, + STATFS_STORED, + STATFS_COMPRESSED_ORIGINAL, + STATFS_COMPRESSED, + STATFS_COMPRESSED_ALLOCATED, + STATFS_LAST + }; + int64_t values[STATFS_LAST]; + volatile_statfs() { + memset(this, 0, sizeof(volatile_statfs)); + } + void reset() { + *this = volatile_statfs(); + } + void publish(store_statfs_t* buf) const { + buf->allocated = allocated(); + buf->data_stored = stored(); + buf->data_compressed = compressed(); + buf->data_compressed_original = compressed_original(); + buf->data_compressed_allocated = compressed_allocated(); + } + + volatile_statfs& operator+=(const volatile_statfs& other) { + for (size_t i = 0; i < STATFS_LAST; ++i) { + values[i] += other.values[i]; + } + return *this; + } + int64_t& allocated() { + return values[STATFS_ALLOCATED]; + } + int64_t& stored() { + return values[STATFS_STORED]; + } + int64_t& compressed_original() { + return values[STATFS_COMPRESSED_ORIGINAL]; + } + int64_t& compressed() { + return values[STATFS_COMPRESSED]; + } + int64_t& compressed_allocated() { + return values[STATFS_COMPRESSED_ALLOCATED]; + } + int64_t allocated() const { + return values[STATFS_ALLOCATED]; + } + int64_t stored() const { + return values[STATFS_STORED]; + } + int64_t compressed_original() const { + return values[STATFS_COMPRESSED_ORIGINAL]; + } + int64_t compressed() const { + return values[STATFS_COMPRESSED]; + } + int64_t compressed_allocated() const { + return values[STATFS_COMPRESSED_ALLOCATED]; + } + volatile_statfs& operator=(const store_statfs_t& st) { + values[STATFS_ALLOCATED] = st.allocated; + values[STATFS_STORED] = st.data_stored; + values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original; + values[STATFS_COMPRESSED] = st.data_compressed; + values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated; + return *this; + } + bool is_empty() { + return values[STATFS_ALLOCATED] == 0 && + values[STATFS_STORED] == 0 && + values[STATFS_COMPRESSED] == 0 && + values[STATFS_COMPRESSED_ORIGINAL] == 0 && + values[STATFS_COMPRESSED_ALLOCATED] == 0; + } + void decode(bufferlist::const_iterator& it) { + using ceph::decode; + for (size_t i = 0; i < STATFS_LAST; i++) { + decode(values[i], 
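// volatile_statfs is the in-memory accumulator behind statfs(): every
// TransContext carries one as statfs_delta and folds it into the store-wide
// (and per-pool) totals at commit time. Illustrative arithmetic only:
//
//   volatile_statfs delta, totals;
//   delta.allocated()            += 0x10000; // 64 KiB of raw space allocated
//   delta.stored()               += 0x12000; // 72 KiB of logical object data
//   delta.compressed_original()  += 0x8000;  // 32 KiB went through the
//   delta.compressed()           += 0x3000;  //   compressor, down to 12 KiB,
//   delta.compressed_allocated() += 0x4000;  //   occupying one 16 KiB AU
//
//   totals += delta;                         // operator+= above
//   store_statfs_t out;
//   totals.publish(&out);                    // what statfs() hands back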
it); + } + } + + void encode(bufferlist& bl) { + using ceph::encode; + for (size_t i = 0; i < STATFS_LAST; i++) { + encode(values[i], bl); + } + } + }; + + struct TransContext final : public AioContext { + MEMPOOL_CLASS_HELPERS(); + + typedef enum { + STATE_PREPARE, + STATE_AIO_WAIT, + STATE_IO_DONE, + STATE_KV_QUEUED, // queued for kv_sync_thread submission + STATE_KV_SUBMITTED, // submitted to kv; not yet synced + STATE_KV_DONE, + STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running) + STATE_DEFERRED_CLEANUP, // remove deferred kv record + STATE_DEFERRED_DONE, + STATE_FINISHING, + STATE_DONE, + } state_t; + + state_t state = STATE_PREPARE; + + const char *get_state_name() { + switch (state) { + case STATE_PREPARE: return "prepare"; + case STATE_AIO_WAIT: return "aio_wait"; + case STATE_IO_DONE: return "io_done"; + case STATE_KV_QUEUED: return "kv_queued"; + case STATE_KV_SUBMITTED: return "kv_submitted"; + case STATE_KV_DONE: return "kv_done"; + case STATE_DEFERRED_QUEUED: return "deferred_queued"; + case STATE_DEFERRED_CLEANUP: return "deferred_cleanup"; + case STATE_DEFERRED_DONE: return "deferred_done"; + case STATE_FINISHING: return "finishing"; + case STATE_DONE: return "done"; + } + return "???"; + } + +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + const char *get_state_latency_name(int state) { + switch (state) { + case l_bluestore_state_prepare_lat: return "prepare"; + case l_bluestore_state_aio_wait_lat: return "aio_wait"; + case l_bluestore_state_io_done_lat: return "io_done"; + case l_bluestore_state_kv_queued_lat: return "kv_queued"; + case l_bluestore_state_kv_committing_lat: return "kv_committing"; + case l_bluestore_state_kv_done_lat: return "kv_done"; + case l_bluestore_state_deferred_queued_lat: return "deferred_queued"; + case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup"; + case l_bluestore_state_finishing_lat: return "finishing"; + case l_bluestore_state_done_lat: return "done"; + } + return "???"; + } +#endif + + utime_t log_state_latency(PerfCounters *logger, int state) { + utime_t lat, now = ceph_clock_now(); + lat = now - last_stamp; + logger->tinc(state, lat); +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) { + double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000; + OID_ELAPSED("", usecs, get_state_latency_name(state)); + } +#endif + last_stamp = now; + return lat; + } + + CollectionRef ch; + OpSequencerRef osr; // this should be ch->osr + boost::intrusive::list_member_hook<> sequencer_item; + + uint64_t bytes = 0, cost = 0; + + set<OnodeRef> onodes; ///< these need to be updated/written + set<OnodeRef> modified_objects; ///< objects we modified (and need a ref) + set<SharedBlobRef> shared_blobs; ///< these need to be updated/written + set<SharedBlobRef> shared_blobs_written; ///< update these on io completion + + KeyValueDB::Transaction t; ///< then we will commit this + list<Context*> oncommits; ///< more commit completions + list<CollectionRef> removed_collections; ///< colls we removed + + boost::intrusive::list_member_hook<> deferred_queue_item; + bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any + + interval_set<uint64_t> allocated, released; + volatile_statfs statfs_delta; ///< overall store statistics delta + uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on + + IOContext ioc; + bool had_ios = false; ///< true if we submitted IOs before our kv txn + + uint64_t seq = 0; + utime_t 
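// Rough lifecycle of a TransContext, following the state enum above; txcs
// with no data aio skip straight past AIO_WAIT, and only writes routed
// through the deferred (small-write/WAL) path visit the DEFERRED_* states:
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//     [deferred writes only] -> DEFERRED_QUEUED -> DEFERRED_CLEANUP
//   -> FINISHING -> DONE
//
// _txc_state_proc() (declared further down) advances a txc through these
// states; log_state_latency() above is what feeds the per-hop
// l_bluestore_state_*_lat perf counters.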
start; + utime_t last_stamp; + + uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated + uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated + + explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o, + list<Context*> *on_commits) + : ch(c), + osr(o), + ioc(cct, this), + start(ceph_clock_now()) { + last_stamp = start; + if (on_commits) { + oncommits.swap(*on_commits); + } + } + ~TransContext() { + delete deferred_txn; + } + + void write_onode(OnodeRef &o) { + onodes.insert(o); + } + void write_shared_blob(SharedBlobRef &sb) { + shared_blobs.insert(sb); + } + void unshare_blob(SharedBlob *sb) { + shared_blobs.erase(sb); + } + + /// note we logically modified object (when onode itself is unmodified) + void note_modified_object(OnodeRef &o) { + // onode itself isn't written, though + modified_objects.insert(o); + } + void note_removed_object(OnodeRef& o) { + onodes.erase(o); + modified_objects.insert(o); + } + + void aio_finish(BlueStore *store) override { + store->txc_aio_finish(this); + } + }; + + typedef boost::intrusive::list< + TransContext, + boost::intrusive::member_hook< + TransContext, + boost::intrusive::list_member_hook<>, + &TransContext::deferred_queue_item> > deferred_queue_t; + + struct DeferredBatch final : public AioContext { + OpSequencer *osr; + struct deferred_io { + bufferlist bl; ///< data + uint64_t seq; ///< deferred transaction seq + }; + map<uint64_t,deferred_io> iomap; ///< map of ios in this batch + deferred_queue_t txcs; ///< txcs in this batch + IOContext ioc; ///< our aios + /// bytes of pending io for each deferred seq (may be 0) + map<uint64_t,int> seq_bytes; + + void _discard(CephContext *cct, uint64_t offset, uint64_t length); + void _audit(CephContext *cct); + + DeferredBatch(CephContext *cct, OpSequencer *osr) + : osr(osr), ioc(cct, this) {} + + /// prepare a write + void prepare_write(CephContext *cct, + uint64_t seq, uint64_t offset, uint64_t length, + bufferlist::const_iterator& p); + + void aio_finish(BlueStore *store) override { + store->_deferred_aio_finish(osr); + } + }; + + class OpSequencer : public RefCountedObject { + public: + ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock"); + ceph::condition_variable qcond; + typedef boost::intrusive::list< + TransContext, + boost::intrusive::member_hook< + TransContext, + boost::intrusive::list_member_hook<>, + &TransContext::sequencer_item> > q_list_t; + q_list_t q; ///< transactions + + boost::intrusive::list_member_hook<> deferred_osr_queue_item; + + DeferredBatch *deferred_running = nullptr; + DeferredBatch *deferred_pending = nullptr; + + BlueStore *store; + coll_t cid; + + uint64_t last_seq = 0; + + std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io + + std::atomic_int kv_committing_serially = {0}; + + std::atomic_int kv_submitted_waiters = {0}; + + std::atomic_bool zombie = {false}; ///< in zombie_osr set (collection going away) + + OpSequencer(BlueStore *store, const coll_t& c) + : RefCountedObject(store->cct, 0), + store(store), cid(c) { + } + ~OpSequencer() { + ceph_assert(q.empty()); + } + + void queue_new(TransContext *txc) { + std::lock_guard l(qlock); + txc->seq = ++last_seq; + q.push_back(*txc); + } + + void drain() { + std::unique_lock l(qlock); + while (!q.empty()) + qcond.wait(l); + } + + void drain_preceding(TransContext *txc) { + std::unique_lock l(qlock); + while (!q.empty() && &q.front() != txc) + qcond.wait(l); + } + + bool _is_all_kv_submitted() { + // caller must hold qlock & q.empty() 
must not empty + ceph_assert(!q.empty()); + TransContext *txc = &q.back(); + if (txc->state >= TransContext::STATE_KV_SUBMITTED) { + return true; + } + return false; + } + + void flush() { + std::unique_lock l(qlock); + while (true) { + // set flag before the check because the condition + // may become true outside qlock, and we need to make + // sure those threads see waiters and signal qcond. + ++kv_submitted_waiters; + if (q.empty() || _is_all_kv_submitted()) { + --kv_submitted_waiters; + return; + } + qcond.wait(l); + --kv_submitted_waiters; + } + } + + void flush_all_but_last() { + std::unique_lock l(qlock); + assert (q.size() >= 1); + while (true) { + // set flag before the check because the condition + // may become true outside qlock, and we need to make + // sure those threads see waiters and signal qcond. + ++kv_submitted_waiters; + if (q.size() <= 1) { + --kv_submitted_waiters; + return; + } else { + auto it = q.rbegin(); + it++; + if (it->state >= TransContext::STATE_KV_SUBMITTED) { + --kv_submitted_waiters; + return; + } + } + qcond.wait(l); + --kv_submitted_waiters; + } + } + + bool flush_commit(Context *c) { + std::lock_guard l(qlock); + if (q.empty()) { + return true; + } + TransContext *txc = &q.back(); + if (txc->state >= TransContext::STATE_KV_DONE) { + return true; + } + txc->oncommits.push_back(c); + return false; + } + }; + + typedef boost::intrusive::list< + OpSequencer, + boost::intrusive::member_hook< + OpSequencer, + boost::intrusive::list_member_hook<>, + &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t; + + struct KVSyncThread : public Thread { + BlueStore *store; + explicit KVSyncThread(BlueStore *s) : store(s) {} + void *entry() override { + store->_kv_sync_thread(); + return NULL; + } + }; + struct KVFinalizeThread : public Thread { + BlueStore *store; + explicit KVFinalizeThread(BlueStore *s) : store(s) {} + void *entry() { + store->_kv_finalize_thread(); + return NULL; + } + }; + + struct DBHistogram { + struct value_dist { + uint64_t count; + uint32_t max_len; + }; + + struct key_dist { + uint64_t count; + uint32_t max_len; + map<int, struct value_dist> val_map; ///< slab id to count, max length of value and key + }; + + map<string, map<int, struct key_dist> > key_hist; + map<int, uint64_t> value_hist; + int get_key_slab(size_t sz); + string get_key_slab_to_range(int slab); + int get_value_slab(size_t sz); + string get_value_slab_to_range(int slab); + void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist, + const string &prefix, size_t key_size, size_t value_size); + void dump(Formatter *f); + }; + + // -------------------------------------------------------- + // members +private: + BlueFS *bluefs = nullptr; + unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing + bool bluefs_single_shared_device = true; + mono_time bluefs_last_balance; + utime_t next_dump_on_bluefs_alloc_failure; + + KeyValueDB *db = nullptr; + BlockDevice *bdev = nullptr; + std::string freelist_type; + FreelistManager *fm = nullptr; + Allocator *alloc = nullptr; + uuid_d fsid; + int path_fd = -1; ///< open handle to $path + int fsid_fd = -1; ///< open handle (locked) to $path/fsid + bool mounted = false; + + RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map + mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map; + bool collections_had_errors = false; + map<coll_t,CollectionRef> new_coll_map; + + vector<Cache*> cache_shards; + + /// protect zombie_osr_set + ceph::mutex zombie_osr_lock = 
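// The deferred_pending/deferred_running pair in OpSequencer above is a small
// double-buffering scheme: new deferred writes accumulate in deferred_pending
// while at most one DeferredBatch per sequencer (deferred_running) has aio in
// flight; when that aio finishes, the pending batch is swapped in and
// submitted. Sketch only -- submit_next() and on_batch_done() are made-up
// names for roughly what _deferred_submit_unlock() and _deferred_aio_finish()
// do further down:
//
//   void submit_next(BlueStore::OpSequencer* osr) {  // deferred_lock held
//     if (osr->deferred_running || !osr->deferred_pending)
//       return;                                      // busy, or nothing to do
//     std::swap(osr->deferred_running, osr->deferred_pending);
//     // ...submit osr->deferred_running->ioc to the block device...
//   }
//
//   void on_batch_done(BlueStore::OpSequencer* osr) {
//     osr->deferred_running = nullptr;               // batch fully written
//     submit_next(osr);                              // kick any pending batch
//   }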
ceph::make_mutex("BlueStore::zombie_osr_lock"); + std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections + + std::atomic<uint64_t> nid_last = {0}; + std::atomic<uint64_t> nid_max = {0}; + std::atomic<uint64_t> blobid_last = {0}; + std::atomic<uint64_t> blobid_max = {0}; + + Throttle throttle_bytes; ///< submit to commit + Throttle throttle_deferred_bytes; ///< submit to deferred complete + + interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs + interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming + + ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock"); + std::atomic<uint64_t> deferred_seq = {0}; + deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending + int deferred_queue_size = 0; ///< num txc's queued across all osrs + atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread + Finisher deferred_finisher, finisher; + + KVSyncThread kv_sync_thread; + ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock"); + ceph::condition_variable kv_cond; + bool _kv_only = false; + bool kv_sync_started = false; + bool kv_stop = false; + bool kv_finalize_started = false; + bool kv_finalize_stop = false; + deque<TransContext*> kv_queue; ///< ready, already submitted + deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread + deque<TransContext*> kv_committing; ///< currently syncing + deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done + + KVFinalizeThread kv_finalize_thread; + ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock"); + ceph::condition_variable kv_finalize_cond; + deque<TransContext*> kv_committing_to_finalize; ///< pending finalization + deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization + + PerfCounters *logger = nullptr; + + list<CollectionRef> removed_collections; + + RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"}; + set<ghobject_t> debug_data_error_objects; + set<ghobject_t> debug_mdata_error_objects; + + std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C}; + + uint64_t block_size = 0; ///< block size of block device (power of 2) + uint64_t block_mask = 0; ///< mask to get just the block offset + size_t block_size_order = 0; ///< bits to shift to get block size + + uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2) + ///< bits for min_alloc_size + uint8_t min_alloc_size_order = 0; + static_assert(std::numeric_limits<uint8_t>::max() > + std::numeric_limits<decltype(min_alloc_size)>::digits, + "not enough bits for min_alloc_size"); + + ///< maximum allocation unit (power of 2) + std::atomic<uint64_t> max_alloc_size = {0}; + + ///< number threshold for forced deferred writes + std::atomic<int> deferred_batch_ops = {0}; + + ///< size threshold for forced deferred writes + std::atomic<uint64_t> prefer_deferred_size = {0}; + + ///< approx cost per io, in bytes + std::atomic<uint64_t> throttle_cost_per_io = {0}; + + std::atomic<Compressor::CompressionMode> comp_mode = + {Compressor::COMP_NONE}; ///< compression mode + CompressorRef compressor; + std::atomic<uint64_t> comp_min_blob_size = {0}; + std::atomic<uint64_t> comp_max_blob_size = {0}; + + std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size + + uint64_t kv_ios = 0; + uint64_t kv_throttle_costs = 0; + + // cache trim control + uint64_t cache_size = 0; ///< total cache size + double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata + 
double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb) + double cache_data_ratio = 0; ///< cache ratio dedicated to object data + bool cache_autotune = false; ///< cache autotune setting + double cache_autotune_interval = 0; ///< time to wait between cache rebalancing + uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache + uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache + double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation + uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache + double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing + std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change. + + typedef map<uint64_t, volatile_statfs> osd_pools_map; + + ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock"); + volatile_statfs vstatfs; + osd_pools_map osd_pools; // protected by vstatfs_lock as well + + bool per_pool_stat_collection = true; + + struct MempoolThread : public Thread { + public: + BlueStore *store; + + ceph::condition_variable cond; + ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock"); + bool stop = false; + uint64_t autotune_cache_size = 0; + std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr; + std::shared_ptr<PriorityCache::Manager> pcm = nullptr; + + struct MempoolCache : public PriorityCache::PriCache { + BlueStore *store; + int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0}; + int64_t committed_bytes = 0; + double cache_ratio = 0; + + MempoolCache(BlueStore *s) : store(s) {}; + + virtual uint64_t _get_used_bytes() const = 0; + + virtual int64_t request_cache_bytes( + PriorityCache::Priority pri, uint64_t total_cache) const { + int64_t assigned = get_cache_bytes(pri); + + switch (pri) { + // All cache items are currently shoved into the PRI1 priority + case PriorityCache::Priority::PRI1: + { + int64_t request = _get_used_bytes(); + return(request > assigned) ? 
request - assigned : 0; + } + default: + break; + } + return -EOPNOTSUPP; + } + + virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const { + return cache_bytes[pri]; + } + virtual int64_t get_cache_bytes() const { + int64_t total = 0; + + for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) { + PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i); + total += get_cache_bytes(pri); + } + return total; + } + virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { + cache_bytes[pri] = bytes; + } + virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { + cache_bytes[pri] += bytes; + } + virtual int64_t commit_cache_size(uint64_t total_cache) { + committed_bytes = PriorityCache::get_chunk( + get_cache_bytes(), total_cache); + return committed_bytes; + } + virtual int64_t get_committed_size() const { + return committed_bytes; + } + virtual double get_cache_ratio() const { + return cache_ratio; + } + virtual void set_cache_ratio(double ratio) { + cache_ratio = ratio; + } + virtual string get_cache_name() const = 0; + }; + + struct MetaCache : public MempoolCache { + MetaCache(BlueStore *s) : MempoolCache(s) {}; + + virtual uint64_t _get_used_bytes() const { + return mempool::bluestore_Buffer::allocated_bytes() + + mempool::bluestore_Blob::allocated_bytes() + + mempool::bluestore_Extent::allocated_bytes() + + mempool::bluestore_cache_meta::allocated_bytes() + + mempool::bluestore_cache_other::allocated_bytes() + + mempool::bluestore_cache_onode::allocated_bytes() + + mempool::bluestore_SharedBlob::allocated_bytes() + + mempool::bluestore_inline_bl::allocated_bytes(); + } + + virtual string get_cache_name() const { + return "BlueStore Meta Cache"; + } + + uint64_t _get_num_onodes() const { + uint64_t onode_num = + mempool::bluestore_cache_onode::allocated_items(); + return (2 > onode_num) ? 
2 : onode_num; + } + + double get_bytes_per_onode() const { + return (double)_get_used_bytes() / (double)_get_num_onodes(); + } + }; + std::shared_ptr<MetaCache> meta_cache; + + struct DataCache : public MempoolCache { + DataCache(BlueStore *s) : MempoolCache(s) {}; + + virtual uint64_t _get_used_bytes() const { + uint64_t bytes = 0; + for (auto i : store->cache_shards) { + bytes += i->_get_buffer_bytes(); + } + return bytes; + } + virtual string get_cache_name() const { + return "BlueStore Data Cache"; + } + }; + std::shared_ptr<DataCache> data_cache; + + public: + explicit MempoolThread(BlueStore *s) + : store(s), + meta_cache(new MetaCache(s)), + data_cache(new DataCache(s)) {} + + void *entry() override; + void init() { + ceph_assert(stop == false); + create("bstore_mempool"); + } + void shutdown() { + lock.lock(); + stop = true; + cond.notify_all(); + lock.unlock(); + join(); + } + + private: + void _adjust_cache_settings(); + void _trim_shards(bool interval_stats); + void _tune_cache_size(bool interval_stats); + void _balance_cache( + const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches); + void _balance_cache_pri( + int64_t *mem_avail, + const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches, + PriorityCache::Priority pri); + void _update_cache_settings(); + } mempool_thread; + + // -------------------------------------------------------- + // private methods + + void _init_logger(); + void _shutdown_logger(); + int _reload_logger(); + + int _open_path(); + void _close_path(); + int _open_fsid(bool create); + int _lock_fsid(); + int _read_fsid(uuid_d *f); + int _write_fsid(); + void _close_fsid(); + void _set_alloc_sizes(); + void _set_blob_size(); + void _set_finisher_num(); + void _update_osd_memory_options(); + + int _open_bdev(bool create); + // Verifies if disk space is enough for reserved + min bluefs + // and alters the latter if needed. + // Depends on min_alloc_size hence should be called after + // its initialization (and outside of _open_bdev) + void _validate_bdev(); + void _close_bdev(); + + int _minimal_open_bluefs(bool create); + void _minimal_close_bluefs(); + int _open_bluefs(bool create); + void _close_bluefs(bool cold_close); + + // Limited (u)mount intended for BlueFS operations only + int _mount_for_bluefs(); + void _umount_for_bluefs(); + + + int _is_bluefs(bool create, bool* ret); + /* + * opens both DB and dependant super_meta, FreelistManager and allocator + * in the proper order + */ + int _open_db_and_around(bool read_only); + void _close_db_and_around(bool read_only); + + // updates legacy bluefs related recs in DB to a state valid for + // downgrades from nautilus. + void _sync_bluefs_and_fm(); + + /* + * @warning to_repair_db means that we open this db to repair it, will not + * hold the rocksdb's file lock. 
+ */ + int _open_db(bool create, + bool to_repair_db=false, + bool read_only = false); + void _close_db(bool read_only); + int _open_fm(KeyValueDB::Transaction t); + void _close_fm(); + int _open_alloc(); + void _close_alloc(); + int _open_collections(); + void _fsck_collections(int64_t* errors); + void _close_collections(); + + int _setup_block_symlink_or_file(string name, string path, uint64_t size, + bool create); + +public: + static int _write_bdev_label(CephContext* cct, + string path, bluestore_bdev_label_t label); + static int _read_bdev_label(CephContext* cct, string path, + bluestore_bdev_label_t *label); +private: + int _check_or_set_bdev_label(string path, uint64_t size, string desc, + bool create); + + int _open_super_meta(); + + void _open_statfs(); + void _get_statfs_overall(struct store_statfs_t *buf); + + void _dump_alloc_on_failure(); + + int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total); + int _balance_bluefs_freespace(); + + CollectionRef _get_collection(const coll_t& cid); + void _queue_reap_collection(CollectionRef& c); + void _reap_collections(); + void _update_cache_logger(); + + void _assign_nid(TransContext *txc, OnodeRef o); + uint64_t _assign_blobid(TransContext *txc); + + template <int LogLevelV> + friend void _dump_onode(CephContext *cct, const Onode& o); + template <int LogLevelV> + friend void _dump_extent_map(CephContext *cct, const ExtentMap& em); + template <int LogLevelV> + friend void _dump_transaction(CephContext *cct, Transaction *t); + + TransContext *_txc_create(Collection *c, OpSequencer *osr, + list<Context*> *on_commits); + void _txc_update_store_statfs(TransContext *txc); + void _txc_add_transaction(TransContext *txc, Transaction *t); + void _txc_calc_cost(TransContext *txc); + void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t); + void _txc_state_proc(TransContext *txc); + void _txc_aio_submit(TransContext *txc); +public: + void txc_aio_finish(void *p) { + _txc_state_proc(static_cast<TransContext*>(p)); + } +private: + void _txc_finish_io(TransContext *txc); + void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t); + void _txc_applied_kv(TransContext *txc); + void _txc_committed_kv(TransContext *txc); + void _txc_finish(TransContext *txc); + void _txc_release_alloc(TransContext *txc); + + void _osr_attach(Collection *c); + void _osr_register_zombie(OpSequencer *osr); + void _osr_drain(OpSequencer *osr); + void _osr_drain_preceding(TransContext *txc); + void _osr_drain_all(); + + void _kv_start(); + void _kv_stop(); + void _kv_sync_thread(); + void _kv_finalize_thread(); + + bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o); + void _deferred_queue(TransContext *txc); +public: + void deferred_try_submit(); +private: + void _deferred_submit_unlock(OpSequencer *osr); + void _deferred_aio_finish(OpSequencer *osr); + int _deferred_replay(); + +public: + using mempool_dynamic_bitset = + boost::dynamic_bitset<uint64_t, + mempool::bluestore_fsck::pool_allocator<uint64_t>>; + using per_pool_statfs = + mempool::bluestore_fsck::map<uint64_t, store_statfs_t>; + + enum FSCKDepth { + FSCK_REGULAR, + FSCK_DEEP, + FSCK_SHALLOW + }; + +private: + int _fsck_check_extents( + const coll_t& cid, + const ghobject_t& oid, + const PExtentVector& extents, + bool compressed, + mempool_dynamic_bitset &used_blocks, + uint64_t granularity, + BlueStoreRepairer* repairer, + store_statfs_t& expected_statfs, + FSCKDepth depth); + + void _fsck_check_pool_statfs( + per_pool_statfs& expected_pool_statfs, 
+ int64_t& errors, + int64_t &warnings, + BlueStoreRepairer* repairer); + + int _fsck(FSCKDepth depth, bool repair); + int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair); + + void _buffer_cache_write( + TransContext *txc, + BlobRef b, + uint64_t offset, + bufferlist& bl, + unsigned flags) { + b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl, + flags); + txc->shared_blobs_written.insert(b->shared_blob); + } + + int _collection_list( + Collection *c, const ghobject_t& start, const ghobject_t& end, + int max, bool legacy, vector<ghobject_t> *ls, ghobject_t *next); + + template <typename T, typename F> + T select_option(const std::string& opt_name, T val1, F f) { + //NB: opt_name reserved for future use + boost::optional<T> val2 = f(); + if (val2) { + return *val2; + } + return val1; + } + + void _apply_padding(uint64_t head_pad, + uint64_t tail_pad, + bufferlist& padded); + + void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn); + + // -- ondisk version --- +public: + const int32_t latest_ondisk_format = 2; ///< our version + const int32_t min_readable_ondisk_format = 1; ///< what we can read + const int32_t min_compat_ondisk_format = 2; ///< who can read us + +private: + int32_t ondisk_format = 0; ///< value detected on mount + + int _upgrade_super(); ///< upgrade (called during open_super) + uint64_t _get_ondisk_reserved() const; + void _prepare_ondisk_format_super(KeyValueDB::Transaction& t); + + // --- public interface --- +public: + BlueStore(CephContext *cct, const string& path); + BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only + ~BlueStore() override; + + string get_type() override { + return "bluestore"; + } + + bool needs_journal() override { return false; }; + bool wants_journal() override { return false; }; + bool allows_journal() override { return false; }; + + int get_devices(set<string> *ls) override; + + bool is_rotational() override; + bool is_journal_rotational() override; + + string get_default_device_class() override { + string device_class; + map<string, string> metadata; + collect_metadata(&metadata); + auto it = metadata.find("bluestore_bdev_type"); + if (it != metadata.end()) { + device_class = it->second; + } + return device_class; + } + + int get_numa_node( + int *numa_node, + set<int> *nodes, + set<string> *failed) override; + + static int get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid); + + bool test_mount_in_use() override; + +private: + int _mount(bool kv_only, bool open_db=true); +public: + int mount() override { + return _mount(false); + } + int umount() override; + + int start_kv_only(KeyValueDB **pdb, bool open_db=true) { + int r = _mount(true, open_db); + if (r < 0) + return r; + *pdb = db; + return 0; + } + + int write_meta(const std::string& key, const std::string& value) override; + int read_meta(const std::string& key, std::string *value) override; + + int cold_open(); + int cold_close(); + + int fsck(bool deep) override { + return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false); + } + int repair(bool deep) override { + return _fsck(deep ? 
FSCK_DEEP : FSCK_REGULAR, true); + } + int quick_fix() override { + return _fsck(FSCK_SHALLOW, true); + } + + void set_cache_shards(unsigned num) override; + void dump_cache_stats(Formatter *f) override { + int onode_count = 0, buffers_bytes = 0; + for (auto i: cache_shards) { + onode_count += i->_get_num_onodes(); + buffers_bytes += i->_get_buffer_bytes(); + } + f->dump_int("bluestore_onode", onode_count); + f->dump_int("bluestore_buffers", buffers_bytes); + } + void dump_cache_stats(ostream& ss) override { + int onode_count = 0, buffers_bytes = 0; + for (auto i: cache_shards) { + onode_count += i->_get_num_onodes(); + buffers_bytes += i->_get_buffer_bytes(); + } + ss << "bluestore_onode: " << onode_count; + ss << "bluestore_buffers: " << buffers_bytes; + } + + int validate_hobject_key(const hobject_t &obj) const override { + return 0; + } + unsigned get_max_attr_name_length() override { + return 256; // arbitrary; there is no real limit internally + } + + int mkfs() override; + int mkjournal() override { + return 0; + } + + void get_db_statistics(Formatter *f) override; + void generate_db_histogram(Formatter *f) override; + void _flush_cache(); + int flush_cache(ostream *os = NULL) override; + void dump_perf_counters(Formatter *f) override { + f->open_object_section("perf_counters"); + logger->dump_formatted(f, false); + f->close_section(); + } + + int add_new_bluefs_device(int id, const string& path); + int migrate_to_existing_bluefs_device(const set<int>& devs_source, + int id); + int migrate_to_new_bluefs_device(const set<int>& devs_source, + int id, + const string& path); + int expand_devices(ostream& out); + string get_device_path(unsigned id); + + int dump_bluefs_sizes(ostream& out); + +public: + int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) override; + int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override; + + void collect_metadata(map<string,string> *pm) override; + + bool exists(CollectionHandle &c, const ghobject_t& oid) override; + int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) override; + int stat( + CollectionHandle &c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) override; + int read( + CollectionHandle &c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) override; + int _do_read( + Collection *c, + OnodeRef o, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0, + uint64_t retry_count = 0); + +private: + int _fiemap(CollectionHandle &c_, const ghobject_t& oid, + uint64_t offset, size_t len, interval_set<uint64_t>& destset); +public: + int fiemap(CollectionHandle &c, const ghobject_t& oid, + uint64_t offset, size_t len, bufferlist& bl) override; + int fiemap(CollectionHandle &c, const ghobject_t& oid, + uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override; + + + int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name, + bufferptr& value) override; + + int getattrs(CollectionHandle &c, const ghobject_t& oid, + map<string,bufferptr>& aset) override; + + int list_collections(vector<coll_t>& ls) override; + + CollectionHandle open_collection(const coll_t &c) override; + CollectionHandle create_new_collection(const coll_t& cid) override; + void set_collection_commit_queue(const coll_t& cid, + ContextQueue *commit_queue) override; + + bool collection_exists(const coll_t& c) override; + int collection_empty(CollectionHandle& c, bool *empty) override; + int 
collection_bits(CollectionHandle& c) override; + + int collection_list(CollectionHandle &c, + const ghobject_t& start, + const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) override; + + int collection_list_legacy(CollectionHandle &c, + const ghobject_t& start, + const ghobject_t& end, + int max, + vector<ghobject_t> *ls, + ghobject_t *next) override; + + int omap_get( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) override; + + /// Get omap header + int omap_get_header( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ) override; + + /// Get keys defined on oid + int omap_get_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) override; + + /// Get key values + int omap_get_values( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) override; + + /// Filters keys into out which are defined on oid + int omap_check_keys( + CollectionHandle &c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) override; + + ObjectMap::ObjectMapIterator get_omap_iterator( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) override; + + void set_fsid(uuid_d u) override { + fsid = u; + } + uuid_d get_fsid() override { + return fsid; + } + + uint64_t estimate_objects_overhead(uint64_t num_objects) override { + return num_objects * 300; //assuming per-object overhead is 300 bytes + } + + struct BSPerfTracker { + PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns; + PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns; + + objectstore_perf_stat_t get_cur_stats() const { + objectstore_perf_stat_t ret; + ret.os_commit_latency_ns = os_commit_latency_ns.current_avg(); + ret.os_apply_latency_ns = os_apply_latency_ns.current_avg(); + return ret; + } + + void update_from_perfcounters(PerfCounters &logger); + } perf_tracker; + + objectstore_perf_stat_t get_cur_stats() override { + perf_tracker.update_from_perfcounters(*logger); + return perf_tracker.get_cur_stats(); + } + const PerfCounters* get_perf_counters() const override { + return logger; + } + const PerfCounters* get_bluefs_perf_counters() const { + return bluefs->get_perf_counters(); + } + + int queue_transactions( + CollectionHandle& ch, + vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) override; + + // error injection + void inject_data_error(const ghobject_t& o) override { + RWLock::WLocker l(debug_read_error_lock); + debug_data_error_objects.insert(o); + } + void inject_mdata_error(const ghobject_t& o) override { + RWLock::WLocker l(debug_read_error_lock); + debug_mdata_error_objects.insert(o); + } + + /// methods to inject various errors fsck can repair + void inject_broken_shared_blob_key(const string& key, + const bufferlist& bl); 
+ void inject_leaked(uint64_t len); + void inject_false_free(coll_t cid, ghobject_t oid); + void inject_statfs(const string& key, const store_statfs_t& new_statfs); + void inject_global_statfs(const store_statfs_t& new_statfs); + void inject_misreference(coll_t cid1, ghobject_t oid1, + coll_t cid2, ghobject_t oid2, + uint64_t offset); + void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id); + // resets global per_pool_omap in DB + + void compact() override { + ceph_assert(db); + db->compact(); + } + bool has_builtin_csum() const override { + return true; + } + + /* + Allocate space for BlueFS from slow device. + Either automatically applies allocated extents to underlying + BlueFS (extents == nullptr) or just return them (non-null extents) provided + */ + int allocate_bluefs_freespace( + uint64_t min_size, + uint64_t size, + PExtentVector* extents); + + inline void log_latency(const char* name, + int idx, + const ceph::timespan& lat, + double lat_threshold, + const char* info = "") const; + + inline void log_latency_fn(const char* name, + int idx, + const ceph::timespan& lat, + double lat_threshold, + std::function<string (const ceph::timespan& lat)> fn) const; + +private: + bool _debug_data_eio(const ghobject_t& o) { + if (!cct->_conf->bluestore_debug_inject_read_err) { + return false; + } + RWLock::RLocker l(debug_read_error_lock); + return debug_data_error_objects.count(o); + } + bool _debug_mdata_eio(const ghobject_t& o) { + if (!cct->_conf->bluestore_debug_inject_read_err) { + return false; + } + RWLock::RLocker l(debug_read_error_lock); + return debug_mdata_error_objects.count(o); + } + void _debug_obj_on_delete(const ghobject_t& o) { + if (cct->_conf->bluestore_debug_inject_read_err) { + RWLock::WLocker l(debug_read_error_lock); + debug_data_error_objects.erase(o); + debug_mdata_error_objects.erase(o); + } + } +private: + ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock"); + string failed_cmode; + set<string> failed_compressors; + string spillover_alert; + string legacy_statfs_alert; + string disk_size_mismatch_alert; + + void _log_alerts(osd_alert_list_t& alerts); + bool _set_compression_alert(bool cmode, const char* s) { + std::lock_guard l(qlock); + if (cmode) { + bool ret = failed_cmode.empty(); + failed_cmode = s; + return ret; + } + return failed_compressors.emplace(s).second; + } + void _clear_compression_alert() { + std::lock_guard l(qlock); + failed_compressors.clear(); + failed_cmode.clear(); + } + + void _set_spillover_alert(const string& s) { + std::lock_guard l(qlock); + spillover_alert = s; + } + void _clear_spillover_alert() { + std::lock_guard l(qlock); + spillover_alert.clear(); + } + + void _check_legacy_statfs_alert(); + void _set_disk_size_mismatch_alert(const string& s) { + std::lock_guard l(qlock); + disk_size_mismatch_alert = s; + } + +private: + + // -------------------------------------------------------- + // read processing internal methods + int _verify_csum( + OnodeRef& o, + const bluestore_blob_t* blob, + uint64_t blob_xoffset, + const bufferlist& bl, + uint64_t logical_offset) const; + int _decompress(bufferlist& source, bufferlist* result); + + + // -------------------------------------------------------- + // write ops + + struct WriteContext { + bool buffered = false; ///< buffered write + bool compress = false; ///< compressed write + uint64_t target_blob_size = 0; ///< target (max) blob size + unsigned csum_order = 0; ///< target checksum chunk order + + old_extent_map_t old_extents; ///< must deref these 
blobs + interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection + + struct write_item { + uint64_t logical_offset; ///< write logical offset + BlobRef b; + uint64_t blob_length; + uint64_t b_off; + bufferlist bl; + uint64_t b_off0; ///< original offset in a blob prior to padding + uint64_t length0; ///< original data length prior to padding + + bool mark_unused; + bool new_blob; ///< whether new blob was created + + bool compressed = false; + bufferlist compressed_bl; + size_t compressed_len = 0; + + write_item( + uint64_t logical_offs, + BlobRef b, + uint64_t blob_len, + uint64_t o, + bufferlist& bl, + uint64_t o0, + uint64_t l0, + bool _mark_unused, + bool _new_blob) + : + logical_offset(logical_offs), + b(b), + blob_length(blob_len), + b_off(o), + bl(bl), + b_off0(o0), + length0(l0), + mark_unused(_mark_unused), + new_blob(_new_blob) {} + }; + vector<write_item> writes; ///< blobs we're writing + + /// partial clone of the context + void fork(const WriteContext& other) { + buffered = other.buffered; + compress = other.compress; + target_blob_size = other.target_blob_size; + csum_order = other.csum_order; + } + void write( + uint64_t loffs, + BlobRef b, + uint64_t blob_len, + uint64_t o, + bufferlist& bl, + uint64_t o0, + uint64_t len0, + bool _mark_unused, + bool _new_blob) { + writes.emplace_back(loffs, + b, + blob_len, + o, + bl, + o0, + len0, + _mark_unused, + _new_blob); + } + /// Checks for writes to the same pextent within a blob + bool has_conflict( + BlobRef b, + uint64_t loffs, + uint64_t loffs_end, + uint64_t min_alloc_size); + }; + + void _do_write_small( + TransContext *txc, + CollectionRef &c, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist::iterator& blp, + WriteContext *wctx); + void _do_write_big( + TransContext *txc, + CollectionRef &c, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist::iterator& blp, + WriteContext *wctx); + int _do_alloc_write( + TransContext *txc, + CollectionRef c, + OnodeRef o, + WriteContext *wctx); + void _wctx_finish( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + WriteContext *wctx, + set<SharedBlob*> *maybe_unshared_blobs=0); + + int _write(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t len, + bufferlist& bl, + uint32_t fadvise_flags); + void _pad_zeros(bufferlist *bl, uint64_t *offset, + uint64_t chunk_size); + + void _choose_write_options(CollectionRef& c, + OnodeRef o, + uint32_t fadvise_flags, + WriteContext *wctx); + + int _do_gc(TransContext *txc, + CollectionRef& c, + OnodeRef o, + const WriteContext& wctx, + uint64_t *dirty_start, + uint64_t *dirty_end); + + int _do_write(TransContext *txc, + CollectionRef &c, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist& bl, + uint32_t fadvise_flags); + void _do_write_data(TransContext *txc, + CollectionRef& c, + OnodeRef o, + uint64_t offset, + uint64_t length, + bufferlist& bl, + WriteContext *wctx); + + int _touch(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + int _do_zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t len); + int _zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t len); + void _do_truncate(TransContext *txc, + CollectionRef& c, + OnodeRef o, + uint64_t offset, + set<SharedBlob*> *maybe_unshared_blobs=0); + int _truncate(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset); + int _remove(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + int _do_remove(TransContext *txc, + 
CollectionRef& c, + OnodeRef o); + int _setattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name, + bufferptr& val); + int _setattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const map<string,bufferptr>& aset); + int _rmattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name); + int _rmattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + void _do_omap_clear(TransContext *txc, const string& prefix, uint64_t id); + int _omap_clear(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + int _omap_setkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& bl); + int _omap_setheader(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& header); + int _omap_rmkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& bl); + int _omap_rmkey_range(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& first, const string& last); + int _set_alloc_hint( + TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + int _do_clone_range(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, uint64_t length, uint64_t dstoff); + int _clone(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo); + int _clone_range(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, uint64_t length, uint64_t dstoff); + int _rename(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + const ghobject_t& new_oid); + int _create_collection(TransContext *txc, const coll_t &cid, + unsigned bits, CollectionRef *c); + int _remove_collection(TransContext *txc, const coll_t &cid, + CollectionRef *c); + void _do_remove_collection(TransContext *txc, CollectionRef *c); + int _split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem); + int _merge_collection(TransContext *txc, + CollectionRef *c, + CollectionRef& d, + unsigned bits); + +private: + std::atomic<uint64_t> out_of_sync_fm = {0}; + // -------------------------------------------------------- + // BlueFSDeviceExpander implementation + uint64_t get_recommended_expansion_delta(uint64_t bluefs_free, + uint64_t bluefs_total) override { + auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total); + return delta > 0 ? 
delta : 0; + } + int allocate_freespace( + uint64_t min_size, + uint64_t size, + PExtentVector& extents) override { + return allocate_bluefs_freespace(min_size, size, &extents); + }; + size_t available_freespace(uint64_t alloc_size) override; + +public: + struct sb_info_t { + coll_t cid; + int64_t pool_id = INT64_MIN; + list<ghobject_t> oids; + BlueStore::SharedBlobRef sb; + bluestore_extent_ref_map_t ref_map; + bool compressed = false; + bool passed = false; + bool updated = false; + }; + typedef btree::btree_set< + uint64_t, std::less<uint64_t>, + mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t; + + typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t; + struct FSCK_ObjectCtx { + int64_t& errors; + int64_t& warnings; + uint64_t& num_objects; + uint64_t& num_extents; + uint64_t& num_blobs; + uint64_t& num_sharded_objects; + uint64_t& num_spanning_blobs; + + mempool_dynamic_bitset* used_blocks; + uint64_t_btree_t* used_omap_head; + uint64_t_btree_t* used_per_pool_omap_head; + uint64_t_btree_t* used_pgmeta_omap_head; + + ceph::mutex* sb_info_lock; + sb_info_map_t& sb_info; + + store_statfs_t& expected_store_statfs; + per_pool_statfs& expected_pool_statfs; + BlueStoreRepairer* repairer; + + FSCK_ObjectCtx(int64_t& e, + int64_t& w, + uint64_t& _num_objects, + uint64_t& _num_extents, + uint64_t& _num_blobs, + uint64_t& _num_sharded_objects, + uint64_t& _num_spanning_blobs, + mempool_dynamic_bitset* _ub, + uint64_t_btree_t* _used_omap_head, + uint64_t_btree_t* _used_per_pool_omap_head, + uint64_t_btree_t* _used_pgmeta_omap_head, + ceph::mutex* _sb_info_lock, + sb_info_map_t& _sb_info, + store_statfs_t& _store_statfs, + per_pool_statfs& _pool_statfs, + BlueStoreRepairer* _repairer) : + errors(e), + warnings(w), + num_objects(_num_objects), + num_extents(_num_extents), + num_blobs(_num_blobs), + num_sharded_objects(_num_sharded_objects), + num_spanning_blobs(_num_spanning_blobs), + used_blocks(_ub), + used_omap_head(_used_omap_head), + used_per_pool_omap_head(_used_per_pool_omap_head), + used_pgmeta_omap_head(_used_pgmeta_omap_head), + sb_info_lock(_sb_info_lock), + sb_info(_sb_info), + expected_store_statfs(_store_statfs), + expected_pool_statfs(_pool_statfs), + repairer(_repairer) { + } + }; + + OnodeRef fsck_check_objects_shallow( + FSCKDepth depth, + int64_t pool_id, + CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& value, + mempool::bluestore_fsck::list<string>& expecting_shards, + map<BlobRef, bluestore_blob_t::unused_t>* referenced, + const BlueStore::FSCK_ObjectCtx& ctx); + +private: + void _fsck_check_objects(FSCKDepth depth, + FSCK_ObjectCtx& ctx); +}; + +inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) { + return out + << " allocated:" + << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED] + << " stored:" + << s.values[BlueStore::volatile_statfs::STATFS_STORED] + << " compressed:" + << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED] + << " compressed_orig:" + << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL] + << " compressed_alloc:" + << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED]; +} + +static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) { + o->get(); +} +static inline void intrusive_ptr_release(BlueStore::Onode *o) { + o->put(); +} + +static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) { + o->get(); +} +static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) { + o->put(); +} + +class 
BlueStoreRepairer
+{
+public:
+  // to simplify future potential migration to mempools
+  using fsck_interval = interval_set<uint64_t>;
+
+  // Structure to track which pextents are used by a specific cid/oid.
+  // As with any Bloom filter, only positive and false-positive matches
+  // are possible.
+  // Maintains two lists of bloom filters, one for cids and one for oids,
+  // where each list entry is a filter for a specific disk pextent range.
+  // The extent length covered by each filter is determined on init.
+  // This allows 'uninteresting' pextents to be filtered out to speed up
+  // subsequent 'is_used' access.
+  struct StoreSpaceTracker {
+    const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
+    const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
+    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
+    static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
+
+    typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
+    bloom_vector collections_bfs;
+    bloom_vector objects_bfs;
+
+    bool was_filtered_out = false;
+    uint64_t granularity = 0; // extent length for a single filter
+
+    StoreSpaceTracker() {
+    }
+    StoreSpaceTracker(const StoreSpaceTracker& from) :
+      collections_bfs(from.collections_bfs),
+      objects_bfs(from.objects_bfs),
+      granularity(from.granularity) {
+    }
+
+    void init(uint64_t total,
+              uint64_t min_alloc_size,
+              uint64_t mem_cap = DEF_MEM_CAP) {
+      ceph_assert(!granularity); // not initialized yet
+      ceph_assert(min_alloc_size && isp2(min_alloc_size));
+      ceph_assert(mem_cap);
+
+      total = round_up_to(total, min_alloc_size);
+      granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
+
+      if (!granularity) {
+        granularity = min_alloc_size;
+      } else {
+        granularity = round_up_to(granularity, min_alloc_size);
+      }
+
+      uint64_t entries = round_up_to(total, granularity) / granularity;
+      collections_bfs.resize(entries,
+        bloom_filter(BLOOM_FILTER_SALT_COUNT,
+                     BLOOM_FILTER_TABLE_SIZE,
+                     0,
+                     BLOOM_FILTER_EXPECTED_COUNT));
+      objects_bfs.resize(entries,
+        bloom_filter(BLOOM_FILTER_SALT_COUNT,
+                     BLOOM_FILTER_TABLE_SIZE,
+                     0,
+                     BLOOM_FILTER_EXPECTED_COUNT));
+    }
+    inline uint32_t get_hash(const coll_t& cid) const {
+      return cid.hash_to_shard(1);
+    }
+    inline void set_used(uint64_t offset, uint64_t len,
+                         const coll_t& cid, const ghobject_t& oid) {
+      ceph_assert(granularity); // initialized
+
+      // can't call this func after filter_out has been applied
+      ceph_assert(!was_filtered_out);
+      if (!len) {
+        return;
+      }
+      auto pos = offset / granularity;
+      auto end_pos = (offset + len - 1) / granularity;
+      while (pos <= end_pos) {
+        collections_bfs[pos].insert(get_hash(cid));
+        objects_bfs[pos].insert(oid.hobj.get_hash());
+        ++pos;
+      }
+    }
+    // filter out entries unrelated to the specified (broken) extents. 
+ // 'is_used' calls are permitted after that only + size_t filter_out(const fsck_interval& extents); + + // determines if collection's present after filtering-out + inline bool is_used(const coll_t& cid) const { + ceph_assert(was_filtered_out); + for(auto& bf : collections_bfs) { + if (bf.contains(get_hash(cid))) { + return true; + } + } + return false; + } + // determines if object's present after filtering-out + inline bool is_used(const ghobject_t& oid) const { + ceph_assert(was_filtered_out); + for(auto& bf : objects_bfs) { + if (bf.contains(oid.hobj.get_hash())) { + return true; + } + } + return false; + } + // determines if collection's present before filtering-out + inline bool is_used(const coll_t& cid, uint64_t offs) const { + ceph_assert(granularity); // initialized + ceph_assert(!was_filtered_out); + auto &bf = collections_bfs[offs / granularity]; + if (bf.contains(get_hash(cid))) { + return true; + } + return false; + } + // determines if object's present before filtering-out + inline bool is_used(const ghobject_t& oid, uint64_t offs) const { + ceph_assert(granularity); // initialized + ceph_assert(!was_filtered_out); + auto &bf = objects_bfs[offs / granularity]; + if (bf.contains(oid.hobj.get_hash())) { + return true; + } + return false; + } + }; +public: + + bool remove_key(KeyValueDB *db, const string& prefix, const string& key); + bool fix_shared_blob(KeyValueDB *db, + uint64_t sbid, + const bufferlist* bl); + bool fix_statfs(KeyValueDB *db, const string& key, + const store_statfs_t& new_statfs); + + bool fix_leaked(KeyValueDB *db, + FreelistManager* fm, + uint64_t offset, uint64_t len); + bool fix_false_free(KeyValueDB *db, + FreelistManager* fm, + uint64_t offset, uint64_t len); + bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag); + KeyValueDB::Transaction fix_spanning_blobs(KeyValueDB* db); + + void init(uint64_t total_space, uint64_t lres_tracking_unit_size); + + bool preprocess_misreference(KeyValueDB *db); + + unsigned apply(KeyValueDB* db); + + void note_misreference(uint64_t offs, uint64_t len, bool inc_error) { + misreferenced_extents.union_insert(offs, len); + if (inc_error) { + ++to_repair_cnt; + } + } + void inc_repaired() { + ++to_repair_cnt; + } + + StoreSpaceTracker& get_space_usage_tracker() { + return space_usage_tracker; + } + const fsck_interval& get_misreferences() const { + return misreferenced_extents; + } + KeyValueDB::Transaction get_fix_misreferences_txn() { + return fix_misreferences_txn; + } + +private: + unsigned to_repair_cnt = 0; + KeyValueDB::Transaction fix_fm_leaked_txn; + KeyValueDB::Transaction fix_fm_false_free_txn; + KeyValueDB::Transaction remove_key_txn; + KeyValueDB::Transaction fix_statfs_txn; + KeyValueDB::Transaction fix_shared_blob_txn; + + KeyValueDB::Transaction fix_misreferences_txn; + KeyValueDB::Transaction fix_onode_txn; + + StoreSpaceTracker space_usage_tracker; + + // non-shared extents with multiple references + fsck_interval misreferenced_extents; + +}; + +class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector +{ + template <class T, size_t MaxX, size_t MaxY> + class matrix_2d { + T values[MaxX][MaxY]; + public: + matrix_2d() { + clear(); + } + T& at(size_t x, size_t y) { + ceph_assert(x < MaxX); + ceph_assert(y < MaxY); + + return values[x][y]; + } + size_t get_max_x() const { + return MaxX; + } + size_t get_max_y() const { + return MaxY; + } + void clear() { + memset(values, 0, sizeof(values)); + } + }; + + enum { + // use 0/nullptr as unset indication + LEVEL_FIRST = 1, + LEVEL_WAL = 
LEVEL_FIRST, + LEVEL_DB, + LEVEL_SLOW, + LEVEL_MAX + }; + // add +1 row for corresponding per-device totals + // add +1 column for per-level actual (taken from file size) total + typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t; + + per_level_per_dev_usage_t per_level_per_dev_usage; + + // Note: maximum per-device totals below might be smaller than corresponding + // perf counters by up to a single alloc unit (1M) due to superblock extent. + // The later is not accounted here. + per_level_per_dev_usage_t per_level_per_dev_max; + + uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST]; + uint64_t db_avail4slow = 0; + enum { + OLD_POLICY, + USE_SOME_EXTRA + }; + +public: + RocksDBBlueFSVolumeSelector( + uint64_t _wal_total, + uint64_t _db_total, + uint64_t _slow_total, + uint64_t _level0_size, + uint64_t _level_base, + uint64_t _level_multiplier, + double reserved_factor, + uint64_t reserved, + bool new_pol) + { + l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total; + l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total; + l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total; + + if (!new_pol) { + return; + } + + // Calculating how much extra space is available at DB volume. + // Depending on the presence of explicit reserved size specification it might be either + // * DB volume size - reserved + // or + // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor + if (!reserved) { + uint64_t prev_levels = _level0_size; + uint64_t cur_level = _level_base; + uint64_t cur_threshold = 0; + do { + uint64_t next_level = cur_level * _level_multiplier; + uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor; + if (_db_total <= next_threshold) { + db_avail4slow = cur_threshold ? 
_db_total - cur_threshold : 0; + break; + } else { + prev_levels += cur_level; + cur_level = next_level; + cur_threshold = next_threshold; + } + } while (true); + } else { + db_avail4slow = _db_total - reserved; + } + } + + void* get_hint_by_device(uint8_t dev) const override { + ceph_assert(dev == BlueFS::BDEV_WAL); // others aren't used atm + return reinterpret_cast<void*>(LEVEL_WAL); + } + void* get_hint_by_dir(const string& dirname) const override; + + void add_usage(void* hint, const bluefs_fnode_t& fnode) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + for (auto& p : fnode.extents) { + auto& cur = per_level_per_dev_usage.at(p.bdev, pos); + auto& max = per_level_per_dev_max.at(p.bdev, pos); + cur += p.length; + if (cur > max) { + max = cur; + } + { + //update per-device totals + auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); + auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); + cur += p.length; + if (cur > max) { + max = cur; + } + } + } + { + //update per-level actual totals + auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); + auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); + cur += fnode.size; + if (cur > max) { + max = cur; + } + } + } + void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + for (auto& p : fnode.extents) { + auto& cur = per_level_per_dev_usage.at(p.bdev, pos); + ceph_assert(cur >= p.length); + cur -= p.length; + + //update per-device totals + auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); + ceph_assert(cur2 >= p.length); + cur2 -= p.length; + } + //update per-level actual totals + auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); + ceph_assert(cur >= fnode.size); + cur -= fnode.size; + } + void add_usage(void* hint, uint64_t fsize) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + //update per-level actual totals + auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); + auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); + cur += fsize; + if (cur > max) { + max = cur; + } + } + void sub_usage(void* hint, uint64_t fsize) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + //update per-level actual totals + auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); + ceph_assert(cur >= fsize); + per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize; + } + + uint8_t select_prefer_bdev(void* h) override; + void get_paths( + const std::string& base, + BlueFSVolumeSelector::paths& res) const override; + + void dump(ostream& sout) override; +}; + +#endif diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc new file mode 100644 index 00000000..8aeb4526 --- /dev/null +++ b/src/os/bluestore/FreelistManager.cc @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "FreelistManager.h" +#include "BitmapFreelistManager.h" + +FreelistManager *FreelistManager::create( + CephContext* cct, + string type, + string prefix) +{ + // a bit of a hack... we hard-code the prefixes here. we need to + // put the freelistmanagers in different prefixes because the merge + // op is per prefix, has to done pre-db-open, and we don't know the + // freelist type until after we open the db. 
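+  // In practice the expected bring-up order on the caller side is roughly
+  // (illustrative sketch only, not lifted from a specific caller):
+  //
+  //   FreelistManager::setup_merge_operators(db); // registers the "b" merge op
+  //   db->open(...);                              // or db->create_and_open(...)
+  //   fm = FreelistManager::create(cct, "bitmap", "B");
+  //   fm->init(db);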
+ ceph_assert(prefix == "B"); + if (type == "bitmap") + return new BitmapFreelistManager(cct, "B", "b"); + return NULL; +} + +void FreelistManager::setup_merge_operators(KeyValueDB *db) +{ + BitmapFreelistManager::setup_merge_operator(db, "b"); +} diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h new file mode 100644 index 00000000..56e05d14 --- /dev/null +++ b/src/os/bluestore/FreelistManager.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_BLUESTORE_FREELISTMANAGER_H +#define CEPH_OS_BLUESTORE_FREELISTMANAGER_H + +#include <string> +#include <map> +#include <mutex> +#include <ostream> +#include "kv/KeyValueDB.h" + +class FreelistManager { +public: + CephContext* cct; + FreelistManager(CephContext* cct) : cct(cct) {} + virtual ~FreelistManager() {} + + static FreelistManager *create( + CephContext* cct, + string type, + string prefix); + + static void setup_merge_operators(KeyValueDB *db); + + virtual int create(uint64_t size, uint64_t granularity, + KeyValueDB::Transaction txn) = 0; + + virtual int expand(uint64_t new_size, + KeyValueDB::Transaction txn) = 0; + + virtual int init(KeyValueDB *kvdb) = 0; + virtual void shutdown() = 0; + + virtual void dump(KeyValueDB *kvdb) = 0; + + virtual void enumerate_reset() = 0; + virtual bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) = 0; + + virtual void allocate( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) = 0; + virtual void release( + uint64_t offset, uint64_t length, + KeyValueDB::Transaction txn) = 0; + + virtual uint64_t get_size() const = 0; + virtual uint64_t get_alloc_units() const = 0; + virtual uint64_t get_alloc_size() const = 0; + +}; + + +#endif diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc new file mode 100644 index 00000000..6caf5c6d --- /dev/null +++ b/src/os/bluestore/HybridAllocator.cc @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "HybridAllocator.h" + +#include <limits> + +#include "common/config_proxy.h" +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "HybridAllocator " + + +int64_t HybridAllocator::allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector* extents) +{ + ldout(cct, 10) << __func__ << std::hex + << " want 0x" << want + << " unit 0x" << unit + << " max_alloc_size 0x" << max_alloc_size + << " hint 0x" << hint + << std::dec << dendl; + ceph_assert(isp2(unit)); + ceph_assert(want % unit == 0); + + if (max_alloc_size == 0) { + max_alloc_size = want; + } + if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max(); + max_alloc_size >= cap) { + max_alloc_size = p2align(uint64_t(cap), (uint64_t)get_block_size()); + } + + std::lock_guard l(lock); + + int64_t res; + PExtentVector local_extents; + + // preserve original 'extents' vector state + auto orig_size = extents->size(); + auto orig_pos = extents->end(); + if (orig_size) { + --orig_pos; + } + + // try bitmap first to avoid unneeded contiguous extents split if + // desired amount is less than shortes range in AVL + if (bmap_alloc && bmap_alloc->get_free() && + want < _lowest_size_available()) { + res = bmap_alloc->allocate(want, unit, max_alloc_size, hint, extents); + if (res < 0) { + // got a 
failure, release already allocated and + // start over allocation from avl + if (orig_size) { + local_extents.insert( + local_extents.end(), ++orig_pos, extents->end()); + extents->resize(orig_size); + } else { + extents->swap(local_extents); + } + bmap_alloc->release(local_extents); + res = 0; + } + if ((uint64_t)res < want) { + auto res2 = _allocate(want - res, unit, max_alloc_size, hint, extents); + if (res2 < 0) { + res = res2; // caller to do the release + } else { + res += res2; + } + } + } else { + res = _allocate(want, unit, max_alloc_size, hint, extents); + if (res < 0) { + // got a failure, release already allocated and + // start over allocation from bitmap + if (orig_size) { + local_extents.insert( + local_extents.end(), ++orig_pos, extents->end()); + extents->resize(orig_size); + } else { + extents->swap(local_extents); + } + _release(local_extents); + res = 0; + } + if ((uint64_t)res < want ) { + auto res2 = bmap_alloc ? + bmap_alloc->allocate(want - res, unit, max_alloc_size, hint, extents) : + 0; + if (res2 < 0 ) { + res = res2; // caller to do the release + } else { + res += res2; + } + } + } + return res ? res : -ENOSPC; +} + +void HybridAllocator::release(const interval_set<uint64_t>& release_set) { + std::lock_guard l(lock); + // this will attempt to put free ranges into AvlAllocator first and + // fallback to bitmap one via _try_insert_range call + _release(release_set); +} + +uint64_t HybridAllocator::get_free() +{ + std::lock_guard l(lock); + return (bmap_alloc ? bmap_alloc->get_free() : 0) + _get_free(); +} + +double HybridAllocator::get_fragmentation() +{ + std::lock_guard l(lock); + auto f = AvlAllocator::_get_fragmentation(); + auto bmap_free = bmap_alloc ? bmap_alloc->get_free() : 0; + if (bmap_free) { + auto _free = _get_free() + bmap_free; + auto bf = bmap_alloc->get_fragmentation(); + + f = f * _get_free() / _free + bf * bmap_free / _free; + } + return f; +} + +void HybridAllocator::dump() +{ + std::lock_guard l(lock); + AvlAllocator::_dump(); + if (bmap_alloc) { + bmap_alloc->dump(); + } + ldout(cct, 0) << __func__ + << " avl_free: " << _get_free() + << " bmap_free: " << (bmap_alloc ? 
bmap_alloc->get_free() : 0) + << dendl; +} + +void HybridAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify) +{ + AvlAllocator::dump(notify); + if (bmap_alloc) { + bmap_alloc->dump(notify); + } +} + +void HybridAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _try_remove_from_tree(offset, length, + [&](uint64_t o, uint64_t l, bool found) { + if (!found) { + if (bmap_alloc) { + bmap_alloc->init_rm_free(o, l); + } else { + lderr(cct) << "init_rm_free lambda" << std::hex + << "Uexpected extent: " + << " 0x" << o << "~" << l + << std::dec << dendl; + ceph_assert(false); + } + } + }); +} + +void HybridAllocator::shutdown() +{ + std::lock_guard l(lock); + _shutdown(); + if (bmap_alloc) { + bmap_alloc->shutdown(); + delete bmap_alloc; + bmap_alloc = nullptr; + } +} + +void HybridAllocator::_spillover_range(uint64_t start, uint64_t end) +{ + auto size = end - start; + dout(20) << __func__ + << std::hex << " " + << start << "~" << size + << std::dec + << dendl; + ceph_assert(size); + if (!bmap_alloc) { + dout(1) << __func__ + << std::hex + << " constructing fallback allocator" + << dendl; + bmap_alloc = new BitmapAllocator(cct, + get_capacity(), + get_block_size(), + get_name() + ".fallback"); + } + bmap_alloc->init_add_free(start, size); +} + +void HybridAllocator::_add_to_tree(uint64_t start, uint64_t size) +{ + if (bmap_alloc) { + uint64_t head = bmap_alloc->claim_free_to_left(start); + uint64_t tail = bmap_alloc->claim_free_to_right(start + size); + ceph_assert(head <= start); + start -= head; + size += head + tail; + } + AvlAllocator::_add_to_tree(start, size); +} diff --git a/src/os/bluestore/HybridAllocator.h b/src/os/bluestore/HybridAllocator.h new file mode 100644 index 00000000..e8246cf4 --- /dev/null +++ b/src/os/bluestore/HybridAllocator.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <mutex> + +#include "AvlAllocator.h" +#include "BitmapAllocator.h" + +class HybridAllocator : public AvlAllocator { + BitmapAllocator* bmap_alloc = nullptr; +public: + HybridAllocator(CephContext* cct, int64_t device_size, int64_t _block_size, + uint64_t max_mem, + const std::string& name) : + AvlAllocator(cct, device_size, _block_size, max_mem, name) { + } + int64_t allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector *extents) override; + void release(const interval_set<uint64_t>& release_set) override; + uint64_t get_free() override; + double get_fragmentation() override; + + void dump() override; + void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + void shutdown() override; + +protected: + // intended primarily for UT + BitmapAllocator* get_bmap() { + return bmap_alloc; + } + const BitmapAllocator* get_bmap() const { + return bmap_alloc; + } +private: + + void _spillover_range(uint64_t start, uint64_t end) override; + + // called when extent to be released/marked free + void _add_to_tree(uint64_t start, uint64_t size) override; +}; diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc new file mode 100644 index 00000000..2a20f209 --- /dev/null +++ b/src/os/bluestore/KernelDevice.cc @@ -0,0 +1,1185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/file.h> + +#include "KernelDevice.h" +#include "include/types.h" +#include "include/compat.h" +#include "include/stringify.h" +#include "common/blkdev.h" +#include "common/errno.h" +#if defined(__FreeBSD__) +#include "bsm/audit_errno.h" +#endif +#include "common/debug.h" +#include "common/align.h" +#include "common/numa.h" + +#include "global/global_context.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev(" << this << " " << path << ") " + +KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv) + : BlockDevice(cct, cb, cbpriv), + aio(false), dio(false), + aio_queue(cct->_conf->bdev_aio_max_queue_depth), + discard_callback(d_cb), + discard_callback_priv(d_cbpriv), + aio_stop(false), + discard_started(false), + discard_stop(false), + aio_thread(this), + discard_thread(this), + injecting_crash(0) +{ + fd_directs.resize(WRITE_LIFE_MAX, -1); + fd_buffereds.resize(WRITE_LIFE_MAX, -1); +} + +int KernelDevice::_lock() +{ + dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl; + // When the block changes, systemd-udevd will open the block, + // read some information and close it. Then a failure occurs here. + // So we need to try again here. 
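+  // The loop below first tries an OFD lock (F_OFD_SETLK) and falls back to
+  // flock(LOCK_EX | LOCK_NB) if the kernel rejects OFD locks with EINVAL.
+  // EAGAIN (someone else, e.g. the transient udev open described above,
+  // holds the lock) triggers a retry: up to bdev_flock_retry attempts with
+  // bdev_flock_retry_interval seconds of sleep in between, or indefinitely
+  // when bdev_flock_retry is 0; any other error is returned as-is.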
+ int fd = fd_directs[WRITE_LIFE_NOT_SET]; + uint64_t nr_tries = 0; + for (;;) { + struct flock fl = { F_WRLCK, + SEEK_SET }; + int r = ::fcntl(fd, F_OFD_SETLK, &fl); + if (r < 0) { + if (errno == EINVAL) { + r = ::flock(fd, LOCK_EX | LOCK_NB); + } + } + if (r == 0) { + return 0; + } + if (errno != EAGAIN) { + return -errno; + } + dout(1) << __func__ << " flock busy on " << path << dendl; + if (const uint64_t max_retry = + cct->_conf.get_val<uint64_t>("bdev_flock_retry"); + max_retry > 0 && nr_tries++ == max_retry) { + return -EAGAIN; + } + double retry_interval = + cct->_conf.get_val<double>("bdev_flock_retry_interval"); + std::this_thread::sleep_for(ceph::make_timespan(retry_interval)); + } +} + +int KernelDevice::open(const string& p) +{ + path = p; + int r = 0, i = 0; + dout(1) << __func__ << " path " << path << dendl; + + for (i = 0; i < WRITE_LIFE_MAX; i++) { + int fd = ::open(path.c_str(), O_RDWR | O_DIRECT); + if (fd < 0) { + r = -errno; + break; + } + fd_directs[i] = fd; + + fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); + if (fd < 0) { + r = -errno; + break; + } + fd_buffereds[i] = fd; + } + + if (i != WRITE_LIFE_MAX) { + derr << __func__ << " open got: " << cpp_strerror(r) << dendl; + goto out_fail; + } + +#if defined(F_SET_FILE_RW_HINT) + for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) { + if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) { + r = -errno; + break; + } + if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) { + r = -errno; + break; + } + } + if (i != WRITE_LIFE_MAX) { + enable_wrt = false; + dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl; + } +#endif + + dio = true; + aio = cct->_conf->bdev_aio; + if (!aio) { + ceph_abort_msg("non-aio not supported"); + } + + // disable readahead as it will wreak havoc on our mix of + // directio/aio and buffered io. + r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM); + if (r) { + r = -r; + derr << __func__ << " open got: " << cpp_strerror(r) << dendl; + goto out_fail; + } + + if (lock_exclusive) { + r = _lock(); + if (r < 0) { + derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) + << dendl; + goto out_fail; + } + } + + struct stat st; + r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st); + if (r < 0) { + r = -errno; + derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; + goto out_fail; + } + + // Operate as though the block size is 4 KB. The backing file + // blksize doesn't strictly matter except that some file systems may + // require a read/modify/write if we write something smaller than + // it. 
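+  // Hence block_size is taken from the bdev_block_size option rather than
+  // st.st_blksize; a mismatch is only logged below, and the usable device
+  // size is later rounded down to a block_size multiple, e.g. with a 0x1000
+  // block size a raw size of 0x12345678 becomes 0x12345000.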
+ block_size = cct->_conf->bdev_block_size; + if (block_size != (unsigned)st.st_blksize) { + dout(1) << __func__ << " backing device/file reports st_blksize " + << st.st_blksize << ", using bdev_block_size " + << block_size << " anyway" << dendl; + } + + + { + BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]); + BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]); + + if (S_ISBLK(st.st_mode)) { + int64_t s; + r = blkdev_direct.get_size(&s); + if (r < 0) { + goto out_fail; + } + size = s; + } else { + size = st.st_size; + } + + char partition[PATH_MAX], devname[PATH_MAX]; + if ((r = blkdev_buffered.partition(partition, PATH_MAX)) || + (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) { + derr << "unable to get device name for " << path << ": " + << cpp_strerror(r) << dendl; + rotational = true; + } else { + dout(20) << __func__ << " devname " << devname << dendl; + rotational = blkdev_buffered.is_rotational(); + support_discard = blkdev_buffered.support_discard(); + this->devname = devname; + _detect_vdo(); + } + } + + r = _aio_start(); + if (r < 0) { + goto out_fail; + } + _discard_start(); + + // round size down to an even block + size &= ~(block_size - 1); + + dout(1) << __func__ + << " size " << size + << " (0x" << std::hex << size << std::dec << ", " + << byte_u_t(size) << ")" + << " block_size " << block_size + << " (" << byte_u_t(block_size) << ")" + << " " << (rotational ? "rotational" : "non-rotational") + << " discard " << (support_discard ? "supported" : "not supported") + << dendl; + return 0; + +out_fail: + for (i = 0; i < WRITE_LIFE_MAX; i++) { + if (fd_directs[i] >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i])); + fd_directs[i] = -1; + } else { + break; + } + if (fd_buffereds[i] >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i])); + fd_buffereds[i] = -1; + } else { + break; + } + } + return r; +} + +int KernelDevice::get_devices(std::set<std::string> *ls) +{ + if (devname.empty()) { + return 0; + } + get_raw_devices(devname, ls); + return 0; +} + +void KernelDevice::close() +{ + dout(1) << __func__ << dendl; + _aio_stop(); + _discard_stop(); + + if (vdo_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(vdo_fd)); + vdo_fd = -1; + } + + for (int i = 0; i < WRITE_LIFE_MAX; i++) { + assert(fd_directs[i] >= 0); + VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i])); + fd_directs[i] = -1; + + assert(fd_buffereds[i] >= 0); + VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i])); + fd_buffereds[i] = -1; + } + path.clear(); +} + +int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const +{ + (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard); + (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); + (*pm)[prefix + "size"] = stringify(get_size()); + (*pm)[prefix + "block_size"] = stringify(get_block_size()); + (*pm)[prefix + "driver"] = "KernelDevice"; + if (rotational) { + (*pm)[prefix + "type"] = "hdd"; + } else { + (*pm)[prefix + "type"] = "ssd"; + } + if (vdo_fd >= 0) { + (*pm)[prefix + "vdo"] = "true"; + uint64_t total, avail; + get_vdo_utilization(vdo_fd, &total, &avail); + (*pm)[prefix + "vdo_physical_size"] = stringify(total); + } + + struct stat st; + int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st); + if (r < 0) + return -errno; + if (S_ISBLK(st.st_mode)) { + (*pm)[prefix + "access_mode"] = "blk"; + + char buffer[1024] = {0}; + BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]}; + if (r = blkdev.partition(buffer, sizeof(buffer)); r) { + (*pm)[prefix + "partition_path"] = "unknown"; + } else { + 
(*pm)[prefix + "partition_path"] = buffer; + } + buffer[0] = '\0'; + if (r = blkdev.partition(buffer, sizeof(buffer)); r) { + (*pm)[prefix + "dev_node"] = "unknown"; + } else { + (*pm)[prefix + "dev_node"] = buffer; + } + if (!r) { + return 0; + } + buffer[0] = '\0'; + blkdev.model(buffer, sizeof(buffer)); + (*pm)[prefix + "model"] = buffer; + + buffer[0] = '\0'; + blkdev.dev(buffer, sizeof(buffer)); + (*pm)[prefix + "dev"] = buffer; + + // nvme exposes a serial number + buffer[0] = '\0'; + blkdev.serial(buffer, sizeof(buffer)); + (*pm)[prefix + "serial"] = buffer; + + if (blkdev.is_nvme()) + (*pm)[prefix + "type"] = "nvme"; + + // numa + int node; + r = blkdev.get_numa_node(&node); + if (r >= 0) { + (*pm)[prefix + "numa_node"] = stringify(node); + } + } else { + (*pm)[prefix + "access_mode"] = "file"; + (*pm)[prefix + "path"] = path; + } + return 0; +} + +void KernelDevice::_detect_vdo() +{ + vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name); + if (vdo_fd >= 0) { + dout(1) << __func__ << " VDO volume " << vdo_name + << " maps to " << devname << dendl; + } else { + dout(20) << __func__ << " no VDO volume maps to " << devname << dendl; + } + return; +} + +bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const +{ + if (vdo_fd < 0) { + return false; + } + return get_vdo_utilization(vdo_fd, total, avail); +} + +int KernelDevice::choose_fd(bool buffered, int write_hint) const +{ + assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX); + if (!enable_wrt) + write_hint = WRITE_LIFE_NOT_SET; + return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint]; +} + +int KernelDevice::flush() +{ + // protect flush with a mutex. note that we are not really protecting + // data here. instead, we're ensuring that if any flush() caller + // sees that io_since_flush is true, they block any racing callers + // until the flush is observed. that allows racing threads to be + // calling flush while still ensuring that *any* of them that got an + // aio completion notification will not return before that aio is + // stable on disk: whichever thread sees the flag first will block + // followers until the aio is stable. + std::lock_guard l(flush_mutex); + + bool expect = true; + if (!io_since_flush.compare_exchange_strong(expect, false)) { + dout(10) << __func__ << " no-op (no ios since last flush), flag is " + << (int)io_since_flush.load() << dendl; + return 0; + } + + dout(10) << __func__ << " start" << dendl; + if (cct->_conf->bdev_inject_crash) { + ++injecting_crash; + // sleep for a moment to give other threads a chance to submit or + // wait on io that races with a flush. + derr << __func__ << " injecting crash. first we sleep..." 
<< dendl; + sleep(cct->_conf->bdev_inject_crash_flush_delay); + derr << __func__ << " and now we die" << dendl; + cct->_log->flush(); + _exit(1); + } + utime_t start = ceph_clock_now(); + int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]); + utime_t end = ceph_clock_now(); + utime_t dur = end - start; + if (r < 0) { + r = -errno; + derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl; + ceph_abort(); + } + dout(5) << __func__ << " in " << dur << dendl;; + return r; +} + +int KernelDevice::_aio_start() +{ + if (aio) { + dout(10) << __func__ << dendl; + int r = aio_queue.init(); + if (r < 0) { + if (r == -EAGAIN) { + derr << __func__ << " io_setup(2) failed with EAGAIN; " + << "try increasing /proc/sys/fs/aio-max-nr" << dendl; + } else { + derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl; + } + return r; + } + aio_thread.create("bstore_aio"); + } + return 0; +} + +void KernelDevice::_aio_stop() +{ + if (aio) { + dout(10) << __func__ << dendl; + aio_stop = true; + aio_thread.join(); + aio_stop = false; + aio_queue.shutdown(); + } +} + +int KernelDevice::_discard_start() +{ + discard_thread.create("bstore_discard"); + return 0; +} + +void KernelDevice::_discard_stop() +{ + dout(10) << __func__ << dendl; + { + std::unique_lock l(discard_lock); + while (!discard_started) { + discard_cond.wait(l); + } + discard_stop = true; + discard_cond.notify_all(); + } + discard_thread.join(); + { + std::lock_guard l(discard_lock); + discard_stop = false; + } + dout(10) << __func__ << " stopped" << dendl; +} + +void KernelDevice::discard_drain() +{ + dout(10) << __func__ << dendl; + std::unique_lock l(discard_lock); + while (!discard_queued.empty() || discard_running) { + discard_cond.wait(l); + } +} + +static bool is_expected_ioerr(const int r) +{ + // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135 + return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC || + r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO || + r == -ENODATA || r == -EILSEQ || r == -ENOMEM || +#if defined(__linux__) + r == -EREMCHG || r == -EBADE +#elif defined(__FreeBSD__) + r == - BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE +#endif + ); +} + +void KernelDevice::_aio_thread() +{ + dout(10) << __func__ << " start" << dendl; + int inject_crash_count = 0; + while (!aio_stop) { + dout(40) << __func__ << " polling" << dendl; + int max = cct->_conf->bdev_aio_reap_max; + aio_t *aio[max]; + int r = aio_queue.get_next_completed(cct->_conf->bdev_aio_poll_ms, + aio, max); + if (r < 0) { + derr << __func__ << " got " << cpp_strerror(r) << dendl; + ceph_abort_msg("got unexpected error from io_getevents"); + } + if (r > 0) { + dout(30) << __func__ << " got " << r << " completed aios" << dendl; + for (int i = 0; i < r; ++i) { + IOContext *ioc = static_cast<IOContext*>(aio[i]->priv); + _aio_log_finish(ioc, aio[i]->offset, aio[i]->length); + if (aio[i]->queue_item.is_linked()) { + std::lock_guard l(debug_queue_lock); + debug_aio_unlink(*aio[i]); + } + + // set flag indicating new ios have completed. we do this *before* + // any completion or notifications so that any user flush() that + // follows the observed io completion will include this io. Note + // that an earlier, racing flush() could observe and clear this + // flag, but that also ensures that the IO will be stable before the + // later flush() occurs. 
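+        // flush() consumes this flag with compare_exchange_strong() and
+        // becomes a no-op when nothing has completed since the previous
+        // flush, so setting it here is what forces the next flush() to
+        // actually issue fdatasync.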
+ io_since_flush.store(true); + + long r = aio[i]->get_return_value(); + if (r < 0) { + derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")" + << dendl; + if (ioc->allow_eio && is_expected_ioerr(r)) { + derr << __func__ << " translating the error to EIO for upper layer" + << dendl; + ioc->set_return_value(-EIO); + } else { + if (is_expected_ioerr(r)) { + note_io_error_event( + devname.c_str(), + path.c_str(), + r, +#if defined(HAVE_POSIXAIO) + aio[i]->aio.aiocb.aio_lio_opcode, +#else + aio[i]->iocb.aio_lio_opcode, +#endif + aio[i]->offset, + aio[i]->length); + ceph_abort_msg( + "Unexpected IO error. " + "This may suggest a hardware issue. " + "Please check your kernel log!"); + } + ceph_abort_msg( + "Unexpected IO error. " + "This may suggest HW issue. Please check your dmesg!"); + } + } else if (aio[i]->length != (uint64_t)r) { + derr << "aio to 0x" << std::hex << aio[i]->offset + << "~" << aio[i]->length << std::dec + << " but returned: " << r << dendl; + ceph_abort_msg("unexpected aio return value: does not match length"); + } + + dout(10) << __func__ << " finished aio " << aio[i] << " r " << r + << " ioc " << ioc + << " with " << (ioc->num_running.load() - 1) + << " aios left" << dendl; + + // NOTE: once num_running and we either call the callback or + // call aio_wake we cannot touch ioc or aio[] as the caller + // may free it. + if (ioc->priv) { + if (--ioc->num_running == 0) { + aio_callback(aio_callback_priv, ioc->priv); + } + } else { + ioc->try_aio_wake(); + } + } + } + if (cct->_conf->bdev_debug_aio) { + utime_t now = ceph_clock_now(); + std::lock_guard l(debug_queue_lock); + if (debug_oldest) { + if (debug_stall_since == utime_t()) { + debug_stall_since = now; + } else { + if (cct->_conf->bdev_debug_aio_suicide_timeout) { + utime_t cutoff = now; + cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout; + if (debug_stall_since < cutoff) { + derr << __func__ << " stalled aio " << debug_oldest + << " since " << debug_stall_since << ", timeout is " + << cct->_conf->bdev_debug_aio_suicide_timeout + << "s, suicide" << dendl; + ceph_abort_msg("stalled aio... buggy kernel or bad device?"); + } + } + } + } + } + reap_ioc(); + if (cct->_conf->bdev_inject_crash) { + ++inject_crash_count; + if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 > + cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) { + derr << __func__ << " bdev_inject_crash trigger from aio thread" + << dendl; + cct->_log->flush(); + _exit(1); + } + } + } + reap_ioc(); + dout(10) << __func__ << " end" << dendl; +} + +void KernelDevice::_discard_thread() +{ + std::unique_lock l(discard_lock); + ceph_assert(!discard_started); + discard_started = true; + discard_cond.notify_all(); + while (true) { + ceph_assert(discard_finishing.empty()); + if (discard_queued.empty()) { + if (discard_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + discard_cond.notify_all(); // for the thread trying to drain... 
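+      // Note: the same condition variable is shared with queue_discard(),
+      // discard_drain() and _discard_stop(); a notify may wake any of them,
+      // and each waiter re-checks its own predicate under discard_lock.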
+ discard_cond.wait(l); + dout(20) << __func__ << " wake" << dendl; + } else { + discard_finishing.swap(discard_queued); + discard_running = true; + l.unlock(); + dout(20) << __func__ << " finishing" << dendl; + for (auto p = discard_finishing.begin();p != discard_finishing.end(); ++p) { + discard(p.get_start(), p.get_len()); + } + + discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing)); + discard_finishing.clear(); + l.lock(); + discard_running = false; + } + } + dout(10) << __func__ << " finish" << dendl; + discard_started = false; +} + +int KernelDevice::queue_discard(interval_set<uint64_t> &to_release) +{ + if (!support_discard) + return -1; + + if (to_release.empty()) + return 0; + + std::lock_guard l(discard_lock); + discard_queued.insert(to_release); + discard_cond.notify_all(); + return 0; +} + +void KernelDevice::_aio_log_start( + IOContext *ioc, + uint64_t offset, + uint64_t length) +{ + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + if (cct->_conf->bdev_debug_inflight_ios) { + std::lock_guard l(debug_lock); + if (debug_inflight.intersects(offset, length)) { + derr << __func__ << " inflight overlap of 0x" + << std::hex + << offset << "~" << length << std::dec + << " with " << debug_inflight << dendl; + ceph_abort(); + } + debug_inflight.insert(offset, length); + } +} + +void KernelDevice::debug_aio_link(aio_t& aio) +{ + if (debug_queue.empty()) { + debug_oldest = &aio; + } + debug_queue.push_back(aio); +} + +void KernelDevice::debug_aio_unlink(aio_t& aio) +{ + if (aio.queue_item.is_linked()) { + debug_queue.erase(debug_queue.iterator_to(aio)); + if (debug_oldest == &aio) { + auto age = cct->_conf->bdev_debug_aio_log_age; + if (age && debug_stall_since != utime_t()) { + utime_t cutoff = ceph_clock_now(); + cutoff -= age; + if (debug_stall_since < cutoff) { + derr << __func__ << " stalled aio " << debug_oldest + << " since " << debug_stall_since << ", timeout is " + << age + << "s" << dendl; + } + } + + if (debug_queue.empty()) { + debug_oldest = nullptr; + } else { + debug_oldest = &debug_queue.front(); + } + debug_stall_since = utime_t(); + } + } +} + +void KernelDevice::_aio_log_finish( + IOContext *ioc, + uint64_t offset, + uint64_t length) +{ + dout(20) << __func__ << " " << aio << " 0x" + << std::hex << offset << "~" << length << std::dec << dendl; + if (cct->_conf->bdev_debug_inflight_ios) { + std::lock_guard l(debug_lock); + debug_inflight.erase(offset, length); + } +} + +void KernelDevice::aio_submit(IOContext *ioc) +{ + dout(20) << __func__ << " ioc " << ioc + << " pending " << ioc->num_pending.load() + << " running " << ioc->num_running.load() + << dendl; + + if (ioc->num_pending.load() == 0) { + return; + } + + // move these aside, and get our end iterator position now, as the + // aios might complete as soon as they are submitted and queue more + // wal aio's. 
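+  // splice() inserts the pending aios *before* position e, and e is captured
+  // from running_aios.begin() prior to the splice, so the newly moved batch
+  // occupies exactly [running_aios.begin(), e) -- the range handed to
+  // submit_batch() below.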
+ list<aio_t>::iterator e = ioc->running_aios.begin(); + ioc->running_aios.splice(e, ioc->pending_aios); + + int pending = ioc->num_pending.load(); + ioc->num_running += pending; + ioc->num_pending -= pending; + ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this + ceph_assert(ioc->pending_aios.size() == 0); + + if (cct->_conf->bdev_debug_aio) { + list<aio_t>::iterator p = ioc->running_aios.begin(); + while (p != e) { + dout(30) << __func__ << " " << *p << dendl; + std::lock_guard l(debug_queue_lock); + debug_aio_link(*p++); + } + } + + void *priv = static_cast<void*>(ioc); + int r, retries = 0; + r = aio_queue.submit_batch(ioc->running_aios.begin(), e, + pending, priv, &retries); + + if (retries) + derr << __func__ << " retries " << retries << dendl; + if (r < 0) { + derr << " aio submit got " << cpp_strerror(r) << dendl; + ceph_assert(r == 0); + } +} + +int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint) +{ + uint64_t len = bl.length(); + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len + << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl; + if (cct->_conf->bdev_inject_crash && + rand() % cct->_conf->bdev_inject_crash == 0) { + derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex + << off << "~" << len << std::dec << dendl; + ++injecting_crash; + return 0; + } + vector<iovec> iov; + bl.prepare_iov(&iov); + + auto left = len; + auto o = off; + size_t idx = 0; + do { + auto r = ::pwritev(choose_fd(buffered, write_hint), + &iov[idx], iov.size() - idx, o); + + if (r < 0) { + r = -errno; + derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl; + return r; + } + o += r; + left -= r; + if (left) { + // skip fully processed IOVs + while (idx < iov.size() && (size_t)r >= iov[idx].iov_len) { + r -= iov[idx++].iov_len; + } + // update partially processed one if any + if (r) { + ceph_assert(idx < iov.size()); + ceph_assert((size_t)r < iov[idx].iov_len); + iov[idx].iov_base = static_cast<char*>(iov[idx].iov_base) + r; + iov[idx].iov_len -= r; + r = 0; + } + ceph_assert(r == 0); + } + } while (left); + +#ifdef HAVE_SYNC_FILE_RANGE + if (buffered) { + // initiate IO and wait till it completes + auto r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE); + if (r < 0) { + r = -errno; + derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl; + return r; + } + } +#endif + + io_since_flush.store(true); + + return 0; +} + +int KernelDevice::write( + uint64_t off, + bufferlist &bl, + bool buffered, + int write_hint) +{ + uint64_t len = bl.length(); + dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec + << (buffered ? 
" (buffered)" : " (direct)") + << dendl; + ceph_assert(is_valid_io(off, len)); + if (cct->_conf->objectstore_blackhole) { + lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" + << dendl; + return 0; + } + + if ((!buffered || bl.get_num_buffers() >= IOV_MAX) && + bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) { + dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl; + } + dout(40) << "data: "; + bl.hexdump(*_dout); + *_dout << dendl; + + return _sync_write(off, bl, buffered, write_hint); +} + +int KernelDevice::aio_write( + uint64_t off, + bufferlist &bl, + IOContext *ioc, + bool buffered, + int write_hint) +{ + uint64_t len = bl.length(); + dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec + << (buffered ? " (buffered)" : " (direct)") + << dendl; + ceph_assert(is_valid_io(off, len)); + if (cct->_conf->objectstore_blackhole) { + lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" + << dendl; + return 0; + } + + if ((!buffered || bl.get_num_buffers() >= IOV_MAX) && + bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) { + dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl; + } + dout(40) << "data: "; + bl.hexdump(*_dout); + *_dout << dendl; + + _aio_log_start(ioc, off, len); + +#ifdef HAVE_LIBAIO + if (aio && dio && !buffered) { + if (cct->_conf->bdev_inject_crash && + rand() % cct->_conf->bdev_inject_crash == 0) { + derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex + << off << "~" << len << std::dec + << dendl; + // generate a real io so that aio_wait behaves properly, but make it + // a read instead of write, and toss the result. + ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + aio.pread(off, len); + ++injecting_crash; + } else { + if (bl.length() <= RW_IO_MAX) { + // fast path (non-huge write) + ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + bl.prepare_iov(&aio.iov); + aio.bl.claim_append(bl); + aio.pwritev(off, len); + dout(30) << aio << dendl; + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len + << std::dec << " aio " << &aio << dendl; + } else { + // write in RW_IO_MAX-sized chunks + uint64_t prev_len = 0; + while (prev_len < bl.length()) { + bufferlist tmp; + if (prev_len + RW_IO_MAX < bl.length()) { + tmp.substr_of(bl, prev_len, RW_IO_MAX); + } else { + tmp.substr_of(bl, prev_len, bl.length() - prev_len); + } + auto len = tmp.length(); + ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + tmp.prepare_iov(&aio.iov); + aio.bl.claim_append(tmp); + aio.pwritev(off + prev_len, len); + dout(30) << aio << dendl; + dout(5) << __func__ << " 0x" << std::hex << off + prev_len + << "~" << len + << std::dec << " aio " << &aio << " (piece)" << dendl; + prev_len += len; + } + } + } + } else +#endif + { + int r = _sync_write(off, bl, buffered, write_hint); + _aio_log_finish(ioc, off, len); + if (r < 0) + return r; + } + return 0; +} + +int KernelDevice::discard(uint64_t offset, uint64_t len) +{ + int r = 0; + if (cct->_conf->objectstore_blackhole) { + lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" + << dendl; + return 0; + } + if (support_discard) { + dout(10) << __func__ + << " 0x" << std::hex << offset << "~" << len << std::dec + << dendl; + + r = 
BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len); + } + return r; +} + +int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) +{ + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec + << (buffered ? " (buffered)" : " (direct)") + << dendl; + ceph_assert(is_valid_io(off, len)); + + _aio_log_start(ioc, off, len); + + auto start1 = mono_clock::now(); + + auto p = buffer::ptr_node::create(buffer::create_small_page_aligned(len)); + int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET], + p->c_str(), len, off); + auto age = cct->_conf->bdev_debug_aio_log_age; + if (mono_clock::now() - start1 >= make_timespan(age)) { + derr << __func__ << " stalled read " + << " 0x" << std::hex << off << "~" << len << std::dec + << (buffered ? " (buffered)" : " (direct)") + << " since " << start1 << ", timeout is " + << age + << "s" << dendl; + } + + if (r < 0) { + if (ioc->allow_eio && is_expected_ioerr(r)) { + r = -EIO; + } else { + r = -errno; + } + goto out; + } + ceph_assert((uint64_t)r == len); + pbl->push_back(std::move(p)); + + dout(40) << "data: "; + pbl->hexdump(*_dout); + *_dout << dendl; + + out: + _aio_log_finish(ioc, off, len); + return r < 0 ? r : 0; +} + +int KernelDevice::aio_read( + uint64_t off, + uint64_t len, + bufferlist *pbl, + IOContext *ioc) +{ + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec + << dendl; + + int r = 0; +#ifdef HAVE_LIBAIO + if (aio && dio) { + ceph_assert(is_valid_io(off, len)); + _aio_log_start(ioc, off, len); + ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET])); + ++ioc->num_pending; + aio_t& aio = ioc->pending_aios.back(); + aio.pread(off, len); + dout(30) << aio << dendl; + pbl->append(aio.bl); + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len + << std::dec << " aio " << &aio << dendl; + } else +#endif + { + r = read(off, len, pbl, ioc, false); + } + + return r; +} + +int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf) +{ + uint64_t aligned_off = align_down(off, block_size); + uint64_t aligned_len = align_up(off+len, block_size) - aligned_off; + bufferptr p = buffer::create_small_page_aligned(aligned_len); + int r = 0; + + auto start1 = mono_clock::now(); + r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off); + auto age = cct->_conf->bdev_debug_aio_log_age; + if (mono_clock::now() - start1 >= make_timespan(age)) { + derr << __func__ << " stalled read " + << " 0x" << std::hex << off << "~" << len << std::dec + << " since " << start1 << ", timeout is " + << age + << "s" << dendl; + } + + if (r < 0) { + r = -errno; + derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec + << " error: " << cpp_strerror(r) << dendl; + goto out; + } + ceph_assert((uint64_t)r == aligned_len); + memcpy(buf, p.c_str() + (off - aligned_off), len); + + dout(40) << __func__ << " data: "; + bufferlist bl; + bl.append(buf, len); + bl.hexdump(*_dout); + *_dout << dendl; + + out: + return r < 0 ? 
r : 0;
+}
+
+int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
+                              bool buffered)
+{
+  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+          << " buffered " << buffered
+          << dendl;
+  ceph_assert(len > 0);
+  ceph_assert(off < size);
+  ceph_assert(off + len <= size);
+  int r = 0;
+  auto age = cct->_conf->bdev_debug_aio_log_age;
+
+  // if it's direct io and unaligned, we have to use an internal buffer
+  if (!buffered && ((off % block_size != 0)
+                    || (len % block_size != 0)
+                    || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0)))
+    return direct_read_unaligned(off, len, buf);
+
+  auto start1 = mono_clock::now();
+  if (buffered) {
+    //buffered read
+    auto off0 = off;
+    char *t = buf;
+    uint64_t left = len;
+    while (left > 0) {
+      r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off);
+      if (r < 0) {
+        r = -errno;
+        derr << __func__ << " 0x" << std::hex << off << "~" << left
+             << std::dec << " error: " << cpp_strerror(r) << dendl;
+        goto out;
+      }
+      off += r;
+      t += r;
+      left -= r;
+    }
+    if (mono_clock::now() - start1 >= make_timespan(age)) {
+      derr << __func__ << " stalled read "
+           << " 0x" << std::hex << off0 << "~" << len << std::dec
+           << " (buffered) since " << start1 << ", timeout is "
+           << age
+           << "s" << dendl;
+    }
+  } else {
+    //direct and aligned read
+    r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off);
+    if (mono_clock::now() - start1 >= make_timespan(age)) {
+      derr << __func__ << " stalled read "
+           << " 0x" << std::hex << off << "~" << len << std::dec
+           << " (direct) since " << start1 << ", timeout is "
+           << age
+           << "s" << dendl;
+    }
+    if (r < 0) {
+      r = -errno;
+      derr << __func__ << " direct_aligned_read" << " 0x" << std::hex
+           << off << "~" << len << std::dec << " error: " << cpp_strerror(r)
+           << dendl;
+      goto out;
+    }
+    ceph_assert((uint64_t)r == len);
+  }
+
+  dout(40) << __func__ << " data: ";
+  bufferlist bl;
+  bl.append(buf, len);
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+ out:
+  return r < 0 ? r : 0;
+}
+
+int KernelDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+          << dendl;
+  ceph_assert(off % block_size == 0);
+  ceph_assert(len % block_size == 0);
+  int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED);
+  if (r) {
+    r = -r;
+    derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+         << " error: " << cpp_strerror(r) << dendl;
+  }
+  return r;
+}
diff --git a/src/os/bluestore/KernelDevice.h b/src/os/bluestore/KernelDevice.h
new file mode 100644
index 00000000..19b52abd
--- /dev/null
+++ b/src/os/bluestore/KernelDevice.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#ifndef CEPH_OS_BLUESTORE_KERNELDEVICE_H +#define CEPH_OS_BLUESTORE_KERNELDEVICE_H + +#include <atomic> + +#include "include/types.h" +#include "include/interval_set.h" +#include "common/Thread.h" +#include "include/utime.h" + +#include "ceph_aio.h" +#include "BlockDevice.h" + +#define RW_IO_MAX (INT_MAX & CEPH_PAGE_MASK) + + +class KernelDevice : public BlockDevice { + std::vector<int> fd_directs, fd_buffereds; + bool enable_wrt = true; + std::string path; + bool aio, dio; + + int vdo_fd = -1; ///< fd for vdo sysfs directory + string vdo_name; + + std::string devname; ///< kernel dev name (/sys/block/$devname), if any + + ceph::mutex debug_lock = ceph::make_mutex("KernelDevice::debug_lock"); + interval_set<uint64_t> debug_inflight; + + std::atomic<bool> io_since_flush = {false}; + ceph::mutex flush_mutex = ceph::make_mutex("KernelDevice::flush_mutex"); + + aio_queue_t aio_queue; + aio_callback_t discard_callback; + void *discard_callback_priv; + bool aio_stop; + bool discard_started; + bool discard_stop; + + ceph::mutex discard_lock = ceph::make_mutex("KernelDevice::discard_lock"); + ceph::condition_variable discard_cond; + bool discard_running = false; + interval_set<uint64_t> discard_queued; + interval_set<uint64_t> discard_finishing; + + struct AioCompletionThread : public Thread { + KernelDevice *bdev; + explicit AioCompletionThread(KernelDevice *b) : bdev(b) {} + void *entry() override { + bdev->_aio_thread(); + return NULL; + } + } aio_thread; + + struct DiscardThread : public Thread { + KernelDevice *bdev; + explicit DiscardThread(KernelDevice *b) : bdev(b) {} + void *entry() override { + bdev->_discard_thread(); + return NULL; + } + } discard_thread; + + std::atomic_int injecting_crash; + + void _aio_thread(); + void _discard_thread(); + int queue_discard(interval_set<uint64_t> &to_release) override; + + int _aio_start(); + void _aio_stop(); + + int _discard_start(); + void _discard_stop(); + + void _aio_log_start(IOContext *ioc, uint64_t offset, uint64_t length); + void _aio_log_finish(IOContext *ioc, uint64_t offset, uint64_t length); + + int _sync_write(uint64_t off, bufferlist& bl, bool buffered, int write_hint); + + int _lock(); + + int direct_read_unaligned(uint64_t off, uint64_t len, char *buf); + + // stalled aio debugging + aio_list_t debug_queue; + ceph::mutex debug_queue_lock = ceph::make_mutex("KernelDevice::debug_queue_lock"); + aio_t *debug_oldest = nullptr; + utime_t debug_stall_since; + void debug_aio_link(aio_t& aio); + void debug_aio_unlink(aio_t& aio); + + void _detect_vdo(); + int choose_fd(bool buffered, int write_hint) const; + +public: + KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); + + void aio_submit(IOContext *ioc) override; + void discard_drain() override; + + int collect_metadata(const std::string& prefix, map<std::string,std::string> *pm) const override; + int get_devname(std::string *s) override { + if (devname.empty()) { + return -ENOENT; + } + *s = devname; + return 0; + } + int get_devices(std::set<std::string> *ls) override; + + bool get_thin_utilization(uint64_t *total, uint64_t *avail) const override; + + int read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) override; + int aio_read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc) override; + int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override; + + int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override; + int 
aio_write(uint64_t off, bufferlist& bl, + IOContext *ioc, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) override; + int flush() override; + int discard(uint64_t offset, uint64_t len) override; + + // for managing buffered readers/writers + int invalidate_cache(uint64_t off, uint64_t len) override; + int open(const std::string& path) override; + void close() override; +}; + +#endif diff --git a/src/os/bluestore/NVMEDevice.cc b/src/os/bluestore/NVMEDevice.cc new file mode 100644 index 00000000..acd9eb03 --- /dev/null +++ b/src/os/bluestore/NVMEDevice.cc @@ -0,0 +1,952 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <chrono> +#include <fstream> +#include <functional> +#include <map> +#include <thread> + +#include <spdk/nvme.h> + +#include "include/stringify.h" +#include "include/types.h" +#include "include/compat.h" +#include "common/align.h" +#include "common/errno.h" +#include "common/debug.h" +#include "common/perf_counters.h" + +#include "NVMEDevice.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev(" << sn << ") " + +thread_local SharedDriverQueueData *queue_t; + +static constexpr uint16_t data_buffer_default_num = 1024; + +static constexpr uint32_t data_buffer_size = 8192; + +static constexpr uint16_t inline_segment_num = 32; + +enum { + l_bluestore_nvmedevice_first = 632430, + l_bluestore_nvmedevice_write_lat, + l_bluestore_nvmedevice_read_lat, + l_bluestore_nvmedevice_flush_lat, + l_bluestore_nvmedevice_write_queue_lat, + l_bluestore_nvmedevice_read_queue_lat, + l_bluestore_nvmedevice_flush_queue_lat, + l_bluestore_nvmedevice_queue_ops, + l_bluestore_nvmedevice_polling_lat, + l_bluestore_nvmedevice_buffer_alloc_failed, + l_bluestore_nvmedevice_last +}; + +static void io_complete(void *t, const struct spdk_nvme_cpl *completion); + +struct IORequest { + uint16_t cur_seg_idx = 0; + uint16_t nseg; + uint32_t cur_seg_left = 0; + void *inline_segs[inline_segment_num]; + void **extra_segs = nullptr; +}; + +struct Task; + +class SharedDriverData { + unsigned id; + spdk_nvme_transport_id trid; + spdk_nvme_ctrlr *ctrlr; + spdk_nvme_ns *ns; + uint32_t block_size = 0; + uint64_t size = 0; + + public: + std::vector<NVMEDevice*> registered_devices; + friend class SharedDriverQueueData; + SharedDriverData(unsigned id_, const spdk_nvme_transport_id& trid_, + spdk_nvme_ctrlr *c, spdk_nvme_ns *ns_) + : id(id_), + trid(trid_), + ctrlr(c), + ns(ns_) { + block_size = spdk_nvme_ns_get_extended_sector_size(ns); + size = spdk_nvme_ns_get_size(ns); + } + + bool is_equal(const spdk_nvme_transport_id& trid2) const { + return spdk_nvme_transport_id_compare(&trid, &trid2) == 0; + } + ~SharedDriverData() { + } + + void register_device(NVMEDevice *device) { + registered_devices.push_back(device); + } + + void remove_device(NVMEDevice *device) { + std::vector<NVMEDevice*> new_devices; + for (auto &&it : registered_devices) { + if 
(it != device) + new_devices.push_back(it); + } + registered_devices.swap(new_devices); + } + + uint32_t get_block_size() { + return block_size; + } + uint64_t get_size() { + return size; + } +}; + +class SharedDriverQueueData { + NVMEDevice *bdev; + SharedDriverData *driver; + spdk_nvme_ctrlr *ctrlr; + spdk_nvme_ns *ns; + std::string sn; + uint32_t block_size; + uint32_t max_queue_depth; + struct spdk_nvme_qpair *qpair; + bool reap_io = false; + int alloc_buf_from_pool(Task *t, bool write); + + public: + uint32_t current_queue_depth = 0; + std::atomic_ulong completed_op_seq, queue_op_seq; + std::vector<void*> data_buf_mempool; + PerfCounters *logger = nullptr; + void _aio_handle(Task *t, IOContext *ioc); + + SharedDriverQueueData(NVMEDevice *bdev, SharedDriverData *driver) + : bdev(bdev), + driver(driver) { + ctrlr = driver->ctrlr; + ns = driver->ns; + block_size = driver->block_size; + + struct spdk_nvme_io_qpair_opts opts = {}; + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + opts.qprio = SPDK_NVME_QPRIO_URGENT; + // usable queue depth should minus 1 to aovid overflow. + max_queue_depth = opts.io_queue_size - 1; + qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); + ceph_assert(qpair != NULL); + + // allocate spdk dma memory + for (uint16_t i = 0; i < data_buffer_default_num; i++) { + void *b = spdk_dma_zmalloc(data_buffer_size, CEPH_PAGE_SIZE, NULL); + if (!b) { + derr << __func__ << " failed to create memory pool for nvme data buffer" << dendl; + ceph_assert(b); + } + data_buf_mempool.push_back(b); + } + + PerfCountersBuilder b(g_ceph_context, string("NVMEDevice-AIOThread-"+stringify(this)), + l_bluestore_nvmedevice_first, l_bluestore_nvmedevice_last); + b.add_time_avg(l_bluestore_nvmedevice_write_lat, "write_lat", "Average write completing latency"); + b.add_time_avg(l_bluestore_nvmedevice_read_lat, "read_lat", "Average read completing latency"); + b.add_time_avg(l_bluestore_nvmedevice_flush_lat, "flush_lat", "Average flush completing latency"); + b.add_u64(l_bluestore_nvmedevice_queue_ops, "queue_ops", "Operations in nvme queue"); + b.add_time_avg(l_bluestore_nvmedevice_polling_lat, "polling_lat", "Average polling latency"); + b.add_time_avg(l_bluestore_nvmedevice_write_queue_lat, "write_queue_lat", "Average queue write request latency"); + b.add_time_avg(l_bluestore_nvmedevice_read_queue_lat, "read_queue_lat", "Average queue read request latency"); + b.add_time_avg(l_bluestore_nvmedevice_flush_queue_lat, "flush_queue_lat", "Average queue flush request latency"); + b.add_u64_counter(l_bluestore_nvmedevice_buffer_alloc_failed, "buffer_alloc_failed", "Alloc data buffer failed count"); + logger = b.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(logger); + bdev->queue_number++; + if (bdev->queue_number.load() == 1) + reap_io = true; + } + + ~SharedDriverQueueData() { + g_ceph_context->get_perfcounters_collection()->remove(logger); + if (qpair) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + bdev->queue_number--; + } + + // free all spdk dma memory; + if (!data_buf_mempool.empty()) { + for (uint16_t i = 0; i < data_buffer_default_num; i++) { + void *b = data_buf_mempool[i]; + ceph_assert(b); + spdk_dma_free(b); + } + data_buf_mempool.clear(); + } + + delete logger; + } +}; + +struct Task { + NVMEDevice *device; + IOContext *ctx = nullptr; + IOCommand command; + uint64_t offset; + uint64_t len; + bufferlist bl; + std::function<void()> fill_cb; + Task *next = nullptr; + int64_t return_code; + 
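+  // return_code doubles as a "synchronous caller is waiting" marker: tasks
+  // created with a non-zero value (the blocking read()/read_random() paths)
+  // are finished by io_complete() clearing it and waking the waiter, while
+  // tasks created with 0 follow the asynchronous IOContext callback/wake
+  // path and are deleted inside io_complete().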
ceph::coarse_real_clock::time_point start; + IORequest io_request; + ceph::mutex lock = ceph::make_mutex("Task::lock"); + ceph::condition_variable cond; + SharedDriverQueueData *queue = nullptr; + Task(NVMEDevice *dev, IOCommand c, uint64_t off, uint64_t l, int64_t rc = 0) + : device(dev), command(c), offset(off), len(l), + return_code(rc), + start(ceph::coarse_real_clock::now()) {} + ~Task() { + ceph_assert(!io_request.nseg); + } + void release_segs(SharedDriverQueueData *queue_data) { + if (io_request.extra_segs) { + for (uint16_t i = 0; i < io_request.nseg; i++) + queue_data->data_buf_mempool.push_back(io_request.extra_segs[i]); + delete io_request.extra_segs; + } else if (io_request.nseg) { + for (uint16_t i = 0; i < io_request.nseg; i++) + queue_data->data_buf_mempool.push_back(io_request.inline_segs[i]); + } + ctx->total_nseg -= io_request.nseg; + io_request.nseg = 0; + } + + void copy_to_buf(char *buf, uint64_t off, uint64_t len) { + uint64_t copied = 0; + uint64_t left = len; + void **segs = io_request.extra_segs ? io_request.extra_segs : io_request.inline_segs; + uint16_t i = 0; + while (left > 0) { + char *src = static_cast<char*>(segs[i++]); + uint64_t need_copy = std::min(left, data_buffer_size-off); + memcpy(buf+copied, src+off, need_copy); + off = 0; + left -= need_copy; + copied += need_copy; + } + } +}; + +static void data_buf_reset_sgl(void *cb_arg, uint32_t sgl_offset) +{ + Task *t = static_cast<Task*>(cb_arg); + uint32_t i = sgl_offset / data_buffer_size; + uint32_t offset = i * data_buffer_size; + ceph_assert(i <= t->io_request.nseg); + + for (; i < t->io_request.nseg; i++) { + offset += data_buffer_size; + if (offset > sgl_offset) { + if (offset > t->len) + offset = t->len; + break; + } + } + + t->io_request.cur_seg_idx = i; + t->io_request.cur_seg_left = offset - sgl_offset; + return ; +} + +static int data_buf_next_sge(void *cb_arg, void **address, uint32_t *length) +{ + uint32_t size; + void *addr; + Task *t = static_cast<Task*>(cb_arg); + if (t->io_request.cur_seg_idx >= t->io_request.nseg) { + *length = 0; + *address = 0; + return 0; + } + + addr = t->io_request.extra_segs ? 
t->io_request.extra_segs[t->io_request.cur_seg_idx] : t->io_request.inline_segs[t->io_request.cur_seg_idx]; + + size = data_buffer_size; + if (t->io_request.cur_seg_idx == t->io_request.nseg - 1) { + uint64_t tail = t->len % data_buffer_size; + if (tail) { + size = (uint32_t) tail; + } + } + + if (t->io_request.cur_seg_left) { + *address = (void *)((uint64_t)addr + size - t->io_request.cur_seg_left); + *length = t->io_request.cur_seg_left; + t->io_request.cur_seg_left = 0; + } else { + *address = addr; + *length = size; + } + + t->io_request.cur_seg_idx++; + return 0; +} + +int SharedDriverQueueData::alloc_buf_from_pool(Task *t, bool write) +{ + uint64_t count = t->len / data_buffer_size; + if (t->len % data_buffer_size) + ++count; + void **segs; + if (count > data_buf_mempool.size()) + return -ENOMEM; + if (count <= inline_segment_num) { + segs = t->io_request.inline_segs; + } else { + t->io_request.extra_segs = new void*[count]; + segs = t->io_request.extra_segs; + } + for (uint16_t i = 0; i < count; i++) { + segs[i] = data_buf_mempool.back(); + data_buf_mempool.pop_back(); + } + t->io_request.nseg = count; + t->ctx->total_nseg += count; + if (write) { + auto blp = t->bl.begin(); + uint32_t len = 0; + uint16_t i = 0; + for (; i < count - 1; ++i) { + blp.copy(data_buffer_size, static_cast<char*>(segs[i])); + len += data_buffer_size; + } + blp.copy(t->bl.length() - len, static_cast<char*>(segs[i])); + } + + return 0; +} + +void SharedDriverQueueData::_aio_handle(Task *t, IOContext *ioc) +{ + dout(20) << __func__ << " start" << dendl; + + int r = 0; + uint64_t lba_off, lba_count; + uint32_t max_io_completion = (uint32_t)g_conf().get_val<uint64_t>("bluestore_spdk_max_io_completion"); + uint64_t io_sleep_in_us = g_conf().get_val<uint64_t>("bluestore_spdk_io_sleep"); + + ceph::coarse_real_clock::time_point cur, start + = ceph::coarse_real_clock::now(); + while (ioc->num_running) { + again: + dout(40) << __func__ << " polling" << dendl; + if (current_queue_depth) { + r = spdk_nvme_qpair_process_completions(qpair, max_io_completion); + if (r < 0) { + ceph_abort(); + } else if (r == 0) { + usleep(io_sleep_in_us); + } + } + + for (; t; t = t->next) { + if (current_queue_depth == max_queue_depth) { + // no slots + goto again; + } + + t->queue = this; + lba_off = t->offset / block_size; + lba_count = t->len / block_size; + switch (t->command) { + case IOCommand::WRITE_COMMAND: + { + dout(20) << __func__ << " write command issued " << lba_off << "~" << lba_count << dendl; + r = alloc_buf_from_pool(t, true); + if (r < 0) { + logger->inc(l_bluestore_nvmedevice_buffer_alloc_failed); + goto again; + } + + r = spdk_nvme_ns_cmd_writev( + ns, qpair, lba_off, lba_count, io_complete, t, 0, + data_buf_reset_sgl, data_buf_next_sge); + if (r < 0) { + derr << __func__ << " failed to do write command" << dendl; + t->ctx->nvme_task_first = t->ctx->nvme_task_last = nullptr; + t->release_segs(this); + delete t; + ceph_abort(); + } + cur = ceph::coarse_real_clock::now(); + auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - t->start); + logger->tinc(l_bluestore_nvmedevice_write_queue_lat, dur); + break; + } + case IOCommand::READ_COMMAND: + { + dout(20) << __func__ << " read command issued " << lba_off << "~" << lba_count << dendl; + r = alloc_buf_from_pool(t, false); + if (r < 0) { + logger->inc(l_bluestore_nvmedevice_buffer_alloc_failed); + goto again; + } + + r = spdk_nvme_ns_cmd_readv( + ns, qpair, lba_off, lba_count, io_complete, t, 0, + data_buf_reset_sgl, data_buf_next_sge); + if (r < 0) { + 
derr << __func__ << " failed to read" << dendl; + t->release_segs(this); + delete t; + ceph_abort(); + } else { + cur = ceph::coarse_real_clock::now(); + auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - t->start); + logger->tinc(l_bluestore_nvmedevice_read_queue_lat, dur); + } + break; + } + case IOCommand::FLUSH_COMMAND: + { + dout(20) << __func__ << " flush command issueed " << dendl; + r = spdk_nvme_ns_cmd_flush(ns, qpair, io_complete, t); + if (r < 0) { + derr << __func__ << " failed to flush" << dendl; + t->release_segs(this); + delete t; + ceph_abort(); + } else { + cur = ceph::coarse_real_clock::now(); + auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - t->start); + logger->tinc(l_bluestore_nvmedevice_flush_queue_lat, dur); + } + break; + } + } + current_queue_depth++; + } + cur = ceph::coarse_real_clock::now(); + auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(cur - start); + logger->tinc(l_bluestore_nvmedevice_polling_lat, dur); + start = ceph::coarse_real_clock::now(); + } + + if (reap_io) + bdev->reap_ioc(); + dout(20) << __func__ << " end" << dendl; +} + +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev " + +class NVMEManager { + public: + struct ProbeContext { + spdk_nvme_transport_id trid; + NVMEManager *manager; + SharedDriverData *driver; + bool done; + }; + + private: + ceph::mutex lock = ceph::make_mutex("NVMEManager::lock"); + bool stopping = false; + std::vector<SharedDriverData*> shared_driver_datas; + std::thread dpdk_thread; + ceph::mutex probe_queue_lock = ceph::make_mutex("NVMEManager::probe_queue_lock"); + ceph::condition_variable probe_queue_cond; + std::list<ProbeContext*> probe_queue; + + public: + NVMEManager() {} + ~NVMEManager() { + if (!dpdk_thread.joinable()) + return; + { + std::lock_guard guard(probe_queue_lock); + stopping = true; + probe_queue_cond.notify_all(); + } + dpdk_thread.join(); + } + + int try_get(const spdk_nvme_transport_id& trid, SharedDriverData **driver); + void register_ctrlr(const spdk_nvme_transport_id& trid, spdk_nvme_ctrlr *c, SharedDriverData **driver) { + ceph_assert(ceph_mutex_is_locked(lock)); + spdk_nvme_ns *ns; + int num_ns = spdk_nvme_ctrlr_get_num_ns(c); + ceph_assert(num_ns >= 1); + if (num_ns > 1) { + dout(0) << __func__ << " namespace count larger than 1, currently only use the first namespace" << dendl; + } + ns = spdk_nvme_ctrlr_get_ns(c, 1); + if (!ns) { + derr << __func__ << " failed to get namespace at 1" << dendl; + ceph_abort(); + } + dout(1) << __func__ << " successfully attach nvme device at" << trid.traddr << dendl; + + // only support one device per osd now! 
+ ceph_assert(shared_driver_datas.empty()); + // index 0 is occurred by master thread + shared_driver_datas.push_back(new SharedDriverData(shared_driver_datas.size()+1, trid, c, ns)); + *driver = shared_driver_datas.back(); + } +}; + +static NVMEManager manager; + +static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts) +{ + NVMEManager::ProbeContext *ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx); + + if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { + dout(0) << __func__ << " only probe local nvme device" << dendl; + return false; + } + + dout(0) << __func__ << " found device at: " + << "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", " + << "traddr=" << trid->traddr << dendl; + if (spdk_nvme_transport_id_compare(&ctx->trid, trid)) { + dout(0) << __func__ << " device traddr (" << ctx->trid.traddr << ") not match " << trid->traddr << dendl; + return false; + } + + return true; +} + +static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + auto ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx); + ctx->manager->register_ctrlr(ctx->trid, ctrlr, &ctx->driver); +} + +int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData **driver) +{ + std::lock_guard l(lock); + for (auto &&it : shared_driver_datas) { + if (it->is_equal(trid)) { + *driver = it; + return 0; + } + } + + auto coremask_arg = g_conf().get_val<std::string>("bluestore_spdk_coremask"); + int m_core_arg = -1; + try { + auto core_value = stoull(coremask_arg, nullptr, 16); + m_core_arg = ffsll(core_value); + } catch (const std::logic_error& e) { + derr << __func__ << " invalid bluestore_spdk_coremask: " + << coremask_arg << dendl; + return -EINVAL; + } + // at least one core is needed for using spdk + if (m_core_arg == 0) { + derr << __func__ << " invalid bluestore_spdk_coremask, " + << "at least one core is needed" << dendl; + return -ENOENT; + } + m_core_arg -= 1; + + uint32_t mem_size_arg = (uint32_t)g_conf().get_val<Option::size_t>("bluestore_spdk_mem"); + + if (!dpdk_thread.joinable()) { + dpdk_thread = std::thread( + [this, coremask_arg, m_core_arg, mem_size_arg]() { + static struct spdk_env_opts opts; + int r; + + spdk_env_opts_init(&opts); + opts.name = "nvme-device-manager"; + opts.core_mask = coremask_arg.c_str(); + opts.master_core = m_core_arg; + opts.mem_size = mem_size_arg; + spdk_env_init(&opts); + spdk_unaffinitize_thread(); + + spdk_nvme_retry_count = g_ceph_context->_conf->bdev_nvme_retry_count; + if (spdk_nvme_retry_count < 0) + spdk_nvme_retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT; + + std::unique_lock l(probe_queue_lock); + while (!stopping) { + if (!probe_queue.empty()) { + ProbeContext* ctxt = probe_queue.front(); + probe_queue.pop_front(); + r = spdk_nvme_probe(NULL, ctxt, probe_cb, attach_cb, NULL); + if (r < 0) { + ceph_assert(!ctxt->driver); + derr << __func__ << " device probe nvme failed" << dendl; + } + ctxt->done = true; + probe_queue_cond.notify_all(); + } else { + probe_queue_cond.wait(l); + } + } + for (auto p : probe_queue) + p->done = true; + probe_queue_cond.notify_all(); + } + ); + } + + ProbeContext ctx{trid, this, nullptr, false}; + { + std::unique_lock l(probe_queue_lock); + probe_queue.push_back(&ctx); + while (!ctx.done) + probe_queue_cond.wait(l); + } + if (!ctx.driver) + return -1; + *driver = ctx.driver; + + return 0; +} + +void io_complete(void *t, const struct spdk_nvme_cpl *completion) 
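+// Completion callback registered with spdk_nvme_ns_cmd_{readv,writev,flush}().
+// It runs on the thread that polls spdk_nvme_qpair_process_completions(),
+// i.e. from within SharedDriverQueueData::_aio_handle(), which is why
+// release_segs() can return buffers to that queue's data_buf_mempool
+// without additional locking.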
+{ + Task *task = static_cast<Task*>(t); + IOContext *ctx = task->ctx; + SharedDriverQueueData *queue = task->queue; + + ceph_assert(queue != NULL); + ceph_assert(ctx != NULL); + --queue->current_queue_depth; + auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>( + ceph::coarse_real_clock::now() - task->start); + if (task->command == IOCommand::WRITE_COMMAND) { + queue->logger->tinc(l_bluestore_nvmedevice_write_lat, dur); + ceph_assert(!spdk_nvme_cpl_is_error(completion)); + dout(20) << __func__ << " write/zero op successfully, left " + << queue->queue_op_seq - queue->completed_op_seq << dendl; + // check waiting count before doing callback (which may + // destroy this ioc). + if (ctx->priv) { + if (!--ctx->num_running) { + task->device->aio_callback(task->device->aio_callback_priv, ctx->priv); + } + } else { + ctx->try_aio_wake(); + } + task->release_segs(queue); + delete task; + } else if (task->command == IOCommand::READ_COMMAND) { + queue->logger->tinc(l_bluestore_nvmedevice_read_lat, dur); + ceph_assert(!spdk_nvme_cpl_is_error(completion)); + dout(20) << __func__ << " read op successfully" << dendl; + task->fill_cb(); + task->release_segs(queue); + // read submitted by AIO + if (!task->return_code) { + if (ctx->priv) { + if (!--ctx->num_running) { + task->device->aio_callback(task->device->aio_callback_priv, ctx->priv); + } + } else { + ctx->try_aio_wake(); + } + delete task; + } else { + task->return_code = 0; + ctx->try_aio_wake(); + } + } else { + ceph_assert(task->command == IOCommand::FLUSH_COMMAND); + ceph_assert(!spdk_nvme_cpl_is_error(completion)); + queue->logger->tinc(l_bluestore_nvmedevice_flush_lat, dur); + dout(20) << __func__ << " flush op successfully" << dendl; + task->return_code = 0; + } +} + +// ---------------- +#undef dout_prefix +#define dout_prefix *_dout << "bdev(" << name << ") " + +NVMEDevice::NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv) + : BlockDevice(cct, cb, cbpriv), + driver(nullptr) +{ +} + +int NVMEDevice::open(const string& p) +{ + dout(1) << __func__ << " path " << p << dendl; + + std::ifstream ifs(p); + if (!ifs) { + derr << __func__ << " unable to open " << p << dendl; + return -1; + } + string val; + std::getline(ifs, val); + spdk_nvme_transport_id trid; + if (int r = spdk_nvme_transport_id_parse(&trid, val.c_str()); r) { + derr << __func__ << " unable to read " << p << ": " << cpp_strerror(r) + << dendl; + return r; + } + if (int r = manager.try_get(trid, &driver); r < 0) { + derr << __func__ << " failed to get nvme device with transport address " << trid.traddr << dendl; + return r; + } + + driver->register_device(this); + block_size = driver->get_block_size(); + size = driver->get_size(); + name = trid.traddr; + + //nvme is non-rotational device. 
+ rotational = false; + + // round size down to an even block + size &= ~(block_size - 1); + + dout(1) << __func__ << " size " << size << " (" << byte_u_t(size) << ")" + << " block_size " << block_size << " (" << byte_u_t(block_size) + << ")" << dendl; + + + return 0; +} + +void NVMEDevice::close() +{ + dout(1) << __func__ << dendl; + + delete queue_t; + queue_t = nullptr; + name.clear(); + driver->remove_device(this); + + dout(1) << __func__ << " end" << dendl; +} + +int NVMEDevice::collect_metadata(const string& prefix, map<string,string> *pm) const +{ + (*pm)[prefix + "rotational"] = "0"; + (*pm)[prefix + "size"] = stringify(get_size()); + (*pm)[prefix + "block_size"] = stringify(get_block_size()); + (*pm)[prefix + "driver"] = "NVMEDevice"; + (*pm)[prefix + "type"] = "nvme"; + (*pm)[prefix + "access_mode"] = "spdk"; + (*pm)[prefix + "nvme_serial_number"] = name; + + return 0; +} + +int NVMEDevice::flush() +{ + return 0; +} + +void NVMEDevice::aio_submit(IOContext *ioc) +{ + dout(20) << __func__ << " ioc " << ioc << " pending " + << ioc->num_pending.load() << " running " + << ioc->num_running.load() << dendl; + int pending = ioc->num_pending.load(); + Task *t = static_cast<Task*>(ioc->nvme_task_first); + if (pending && t) { + ioc->num_running += pending; + ioc->num_pending -= pending; + ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this + // Only need to push the first entry + ioc->nvme_task_first = ioc->nvme_task_last = nullptr; + if (!queue_t) + queue_t = new SharedDriverQueueData(this, driver); + queue_t->_aio_handle(t, ioc); + } +} + +static void write_split( + NVMEDevice *dev, + uint64_t off, + bufferlist &bl, + IOContext *ioc) +{ + uint64_t remain_len = bl.length(), begin = 0, write_size; + Task *t, *first, *last; + // This value may need to be got from configuration later. + uint64_t split_size = 131072; // 128KB. 
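+  // With data_buffer_size = 8192, a 128KB split maps to at most 16 segments
+  // from the DMA pool, comfortably below inline_segment_num (32), so split
+  // writes normally avoid the extra_segs heap allocation in
+  // alloc_buf_from_pool().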
+ + while (remain_len > 0) { + write_size = std::min(remain_len, split_size); + t = new Task(dev, IOCommand::WRITE_COMMAND, off + begin, write_size); + // TODO: if upper layer alloc memory with known physical address, + // we can reduce this copy + bl.splice(0, write_size, &t->bl); + remain_len -= write_size; + t->ctx = ioc; + first = static_cast<Task*>(ioc->nvme_task_first); + last = static_cast<Task*>(ioc->nvme_task_last); + if (last) + last->next = t; + if (!first) + ioc->nvme_task_first = t; + ioc->nvme_task_last = t; + ++ioc->num_pending; + begin += write_size; + } +} + +int NVMEDevice::aio_write( + uint64_t off, + bufferlist &bl, + IOContext *ioc, + bool buffered, + int write_hint) +{ + uint64_t len = bl.length(); + dout(20) << __func__ << " " << off << "~" << len << " ioc " << ioc + << " buffered " << buffered << dendl; + ceph_assert(is_valid_io(off, len)); + + write_split(this, off, bl, ioc); + dout(5) << __func__ << " " << off << "~" << len << dendl; + + return 0; +} + +int NVMEDevice::write(uint64_t off, bufferlist &bl, bool buffered, int write_hint) +{ + uint64_t len = bl.length(); + dout(20) << __func__ << " " << off << "~" << len << " buffered " + << buffered << dendl; + ceph_assert(off % block_size == 0); + ceph_assert(len % block_size == 0); + ceph_assert(len > 0); + ceph_assert(off < size); + ceph_assert(off + len <= size); + + IOContext ioc(cct, NULL); + write_split(this, off, bl, &ioc); + dout(5) << __func__ << " " << off << "~" << len << dendl; + aio_submit(&ioc); + ioc.aio_wait(); + return 0; +} + +int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) +{ + dout(5) << __func__ << " " << off << "~" << len << " ioc " << ioc << dendl; + ceph_assert(is_valid_io(off, len)); + + Task *t = new Task(this, IOCommand::READ_COMMAND, off, len, 1); + bufferptr p = buffer::create_small_page_aligned(len); + int r = 0; + t->ctx = ioc; + char *buf = p.c_str(); + t->fill_cb = [buf, t]() { + t->copy_to_buf(buf, 0, t->len); + }; + + ++ioc->num_pending; + ioc->nvme_task_first = t; + aio_submit(ioc); + ioc->aio_wait(); + + pbl->push_back(std::move(p)); + r = t->return_code; + delete t; + return r; +} + +int NVMEDevice::aio_read( + uint64_t off, + uint64_t len, + bufferlist *pbl, + IOContext *ioc) +{ + dout(20) << __func__ << " " << off << "~" << len << " ioc " << ioc << dendl; + ceph_assert(is_valid_io(off, len)); + + Task *t = new Task(this, IOCommand::READ_COMMAND, off, len); + + bufferptr p = buffer::create_small_page_aligned(len); + pbl->append(p); + t->ctx = ioc; + char* buf = p.c_str(); + t->fill_cb = [buf, t]() { + t->copy_to_buf(buf, 0, t->len); + }; + + Task *first = static_cast<Task*>(ioc->nvme_task_first); + Task *last = static_cast<Task*>(ioc->nvme_task_last); + if (last) + last->next = t; + if (!first) + ioc->nvme_task_first = t; + ioc->nvme_task_last = t; + ++ioc->num_pending; + + return 0; +} + +int NVMEDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered) +{ + ceph_assert(len > 0); + ceph_assert(off < size); + ceph_assert(off + len <= size); + + uint64_t aligned_off = align_down(off, block_size); + uint64_t aligned_len = align_up(off+len, block_size) - aligned_off; + dout(5) << __func__ << " " << off << "~" << len + << " aligned " << aligned_off << "~" << aligned_len << dendl; + IOContext ioc(g_ceph_context, nullptr); + Task *t = new Task(this, IOCommand::READ_COMMAND, aligned_off, aligned_len, 1); + int r = 0; + t->ctx = &ioc; + t->fill_cb = [buf, t, off, len]() { + t->copy_to_buf(buf, off-t->offset, len); + 
}; + + ++ioc.num_pending; + ioc.nvme_task_first = t; + aio_submit(&ioc); + ioc.aio_wait(); + + r = t->return_code; + delete t; + return r; +} + +int NVMEDevice::invalidate_cache(uint64_t off, uint64_t len) +{ + dout(5) << __func__ << " " << off << "~" << len << dendl; + return 0; +} diff --git a/src/os/bluestore/NVMEDevice.h b/src/os/bluestore/NVMEDevice.h new file mode 100644 index 00000000..f44aeb59 --- /dev/null +++ b/src/os/bluestore/NVMEDevice.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OS_BLUESTORE_NVMEDEVICE +#define CEPH_OS_BLUESTORE_NVMEDEVICE + +#include <queue> +#include <map> +#include <limits> + +// since _Static_assert introduced in c11 +#define _Static_assert static_assert + + +#include "include/interval_set.h" +#include "common/ceph_time.h" +#include "BlockDevice.h" + +enum class IOCommand { + READ_COMMAND, + WRITE_COMMAND, + FLUSH_COMMAND +}; + +class SharedDriverData; +class SharedDriverQueueData; + +class NVMEDevice : public BlockDevice { + /** + * points to pinned, physically contiguous memory region; + * contains 4KB IDENTIFY structure for controller which is + * target for CONTROLLER IDENTIFY command during initialization + */ + SharedDriverData *driver; + string name; + + public: + std::atomic_int queue_number = {0}; + SharedDriverData *get_driver() { return driver; } + + NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv); + + bool supported_bdev_label() override { return false; } + + void aio_submit(IOContext *ioc) override; + + int read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) override; + int aio_read( + uint64_t off, + uint64_t len, + bufferlist *pbl, + IOContext *ioc) override; + int aio_write(uint64_t off, bufferlist& bl, + IOContext *ioc, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) override; + int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override; + int flush() override; + int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override; + + // for managing buffered readers/writers + int invalidate_cache(uint64_t off, uint64_t len) override; + int open(const string& path) override; + void close() override; + int collect_metadata(const string& prefix, map<string,string> *pm) const override; +}; + +#endif diff --git a/src/os/bluestore/PMEMDevice.cc b/src/os/bluestore/PMEMDevice.cc new file mode 100644 index 00000000..1f9d9599 --- /dev/null +++ b/src/os/bluestore/PMEMDevice.cc @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel <jianpeng.ma@intel.com> + * + * Author: Jianpeng Ma <jianpeng.ma@intel.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "PMEMDevice.h" +#include "libpmem.h" +#include "include/types.h" +#include "include/compat.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/debug.h" +#include "common/blkdev.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev-PMEM(" << path << ") " + +PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv) + : BlockDevice(cct, cb, cbpriv), + fd(-1), addr(0), + injecting_crash(0) +{ +} + +int PMEMDevice::_lock() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fd, F_SETLK, &l); + if (r < 0) + return -errno; + return 0; +} + +int PMEMDevice::open(const string& p) +{ + path = p; + int r = 0; + dout(1) << __func__ << " path " << path << dendl; + + fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); + if (fd < 0) { + r = -errno; + derr << __func__ << " open got: " << cpp_strerror(r) << dendl; + return r; + } + + r = _lock(); + if (r < 0) { + derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) + << dendl; + goto out_fail; + } + + struct stat st; + r = ::fstat(fd, &st); + if (r < 0) { + r = -errno; + derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; + goto out_fail; + } + + size_t map_len; + addr = (char *)pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL); + if (addr == NULL) { + derr << __func__ << " pmem_map_file failed: " << pmem_errormsg() << dendl; + goto out_fail; + } + size = map_len; + + // Operate as though the block size is 4 KB. The backing file + // blksize doesn't strictly matter except that some file systems may + // require a read/modify/write if we write something smaller than + // it. 
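+  // (bdev_block_size defaults to 4096, which is where the 4 KB figure in the
+  // comment above comes from.)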
block_size = g_conf()->bdev_block_size;
+  if (block_size != (unsigned)st.st_blksize) {
+    dout(1) << __func__ << " backing device/file reports st_blksize "
+            << st.st_blksize << ", using bdev_block_size "
+            << block_size << " anyway" << dendl;
+  }
+
+  dout(1) << __func__
+          << " size " << size
+          << " (" << byte_u_t(size) << ")"
+          << " block_size " << block_size
+          << " (" << byte_u_t(block_size) << ")"
+          << dendl;
+  return 0;
+
+ out_fail:
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+  return r;
+}
+
+void PMEMDevice::close()
+{
+  dout(1) << __func__ << dendl;
+
+  ceph_assert(addr != NULL);
+  pmem_unmap(addr, size);
+  ceph_assert(fd >= 0);
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+
+  path.clear();
+}
+
+int PMEMDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
+{
+  (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
+  (*pm)[prefix + "size"] = stringify(get_size());
+  (*pm)[prefix + "block_size"] = stringify(get_block_size());
+  (*pm)[prefix + "driver"] = "PMEMDevice";
+  (*pm)[prefix + "type"] = "ssd";
+
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  if (r < 0)
+    return -errno;
+  if (S_ISBLK(st.st_mode)) {
+    (*pm)[prefix + "access_mode"] = "blk";
+    char buffer[1024] = {0};
+    BlkDev blkdev(fd);
+
+    blkdev.model(buffer, sizeof(buffer));
+    (*pm)[prefix + "model"] = buffer;
+
+    buffer[0] = '\0';
+    blkdev.dev(buffer, sizeof(buffer));
+    (*pm)[prefix + "dev"] = buffer;
+
+    // nvme exposes a serial number
+    buffer[0] = '\0';
+    blkdev.serial(buffer, sizeof(buffer));
+    (*pm)[prefix + "serial"] = buffer;
+
+    if (blkdev.is_nvme())
+      (*pm)[prefix + "type"] = "nvme";
+  } else {
+    (*pm)[prefix + "access_mode"] = "file";
+    (*pm)[prefix + "path"] = path;
+  }
+  return 0;
+}
+
+int PMEMDevice::flush()
+{
+  // All writes go through pmem_memcpy_persist and are durable immediately,
+  // so there is nothing to flush here.
+  return 0;
+}
+
+
+void PMEMDevice::aio_submit(IOContext *ioc)
+{
+  if (ioc->priv) {
+    ceph_assert(ioc->num_running == 0);
+    aio_callback(aio_callback_priv, ioc->priv);
+  } else {
+    ioc->try_aio_wake();
+  }
+  return;
+}
+
+int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered, int write_hint)
+{
+  uint64_t len = bl.length();
+  dout(20) << __func__ << " " << off << "~" << len << dendl;
+  ceph_assert(is_valid_io(off, len));
+
+  dout(40) << "data: ";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  if (g_conf()->bdev_inject_crash &&
+      rand() % g_conf()->bdev_inject_crash == 0) {
+    derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
+         << dendl;
+    ++injecting_crash;
+    return 0;
+  }
+
+  bufferlist::iterator p = bl.begin();
+  uint64_t off1 = off;
+  while (len) {
+    const char *data;
+    uint32_t l = p.get_ptr_and_advance(len, &data);
+    pmem_memcpy_persist(addr + off1, data, l);
+    len -= l;
+    off1 += l;
+  }
+  return 0;
+}
+
+int PMEMDevice::aio_write(
+  uint64_t off,
+  bufferlist &bl,
+  IOContext *ioc,
+  bool buffered,
+  int write_hint)
+{
+  return write(off, bl, buffered);
+}
+
+
+int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+                     IOContext *ioc,
+                     bool buffered)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  ceph_assert(is_valid_io(off, len));
+
+  bufferptr p = buffer::create_small_page_aligned(len);
+  memcpy(p.c_str(), addr + off, len);
+
+  pbl->clear();
+  pbl->push_back(std::move(p));
+
+  dout(40) << "data: ";
+  pbl->hexdump(*_dout);
+  *_dout << dendl;
+
+  return 0;
+}
+
+int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+                         IOContext *ioc)
+{
+  return read(off, len, pbl, ioc, false);
+}
+
+int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  ceph_assert(is_valid_io(off, len));
+
+  memcpy(buf, addr + off, len);
+  return 0;
+}
+
+
+int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  return 0;
+}
+
+
diff --git a/src/os/bluestore/PMEMDevice.h b/src/os/bluestore/PMEMDevice.h
new file mode 100644
index 00000000..3077375a
--- /dev/null
+++ b/src/os/bluestore/PMEMDevice.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
+ *
+ * Author: Jianpeng Ma <jianpeng.ma@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#ifndef CEPH_OS_BLUESTORE_PMEMDEVICE_H +#define CEPH_OS_BLUESTORE_PMEMDEVICE_H + +#include <atomic> + +#include "os/fs/FS.h" +#include "include/interval_set.h" +#include "ceph_aio.h" +#include "BlockDevice.h" + +class PMEMDevice : public BlockDevice { + int fd; + char *addr; //the address of mmap + std::string path; + + ceph::mutex debug_lock = ceph::make_mutex("PMEMDevice::debug_lock"); + interval_set<uint64_t> debug_inflight; + + std::atomic_int injecting_crash; + int _lock(); + +public: + PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv); + + + void aio_submit(IOContext *ioc) override; + + int collect_metadata(const std::string& prefix, map<std::string,std::string> *pm) const override; + + int read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) override; + int aio_read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc) override; + + int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override; + int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override; + int aio_write(uint64_t off, bufferlist& bl, + IOContext *ioc, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) override; + int flush() override; + + // for managing buffered readers/writers + int invalidate_cache(uint64_t off, uint64_t len) override; + int open(const std::string &path) override; + void close() override; + +private: + bool is_valid_io(uint64_t off, uint64_t len) const { + return (len > 0 && + off < size && + off + len <= size); + } +}; + +#endif diff --git a/src/os/bluestore/StupidAllocator.cc b/src/os/bluestore/StupidAllocator.cc new file mode 100644 index 00000000..f75f7446 --- /dev/null +++ b/src/os/bluestore/StupidAllocator.cc @@ -0,0 +1,364 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "StupidAllocator.h" +#include "bluestore_types.h" +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "stupidalloc 0x" << this << " " + +StupidAllocator::StupidAllocator(CephContext* cct, + const std::string& name, + int64_t _block_size) + : Allocator(name), cct(cct), num_free(0), + free(10), + last_alloc(0), + block_size(_block_size) +{ +} + +StupidAllocator::~StupidAllocator() +{ +} + +unsigned StupidAllocator::_choose_bin(uint64_t orig_len) +{ + uint64_t len = orig_len / cct->_conf->bdev_block_size; + int bin = std::min((int)cbits(len), (int)free.size() - 1); + ldout(cct, 30) << __func__ << " len 0x" << std::hex << orig_len + << std::dec << " -> " << bin << dendl; + return bin; +} + +void StupidAllocator::_insert_free(uint64_t off, uint64_t len) +{ + unsigned bin = _choose_bin(len); + ldout(cct, 30) << __func__ << " 0x" << std::hex << off << "~" << len + << std::dec << " in bin " << bin << dendl; + while (true) { + free[bin].insert(off, len, &off, &len); + unsigned newbin = _choose_bin(len); + if (newbin == bin) + break; + ldout(cct, 30) << __func__ << " promoting 0x" << std::hex << off << "~" << len + << std::dec << " to bin " << newbin << dendl; + free[bin].erase(off, len); + bin = newbin; + } +} + +/// return the effective length of the extent if we align to alloc_unit +uint64_t StupidAllocator::_aligned_len( + StupidAllocator::interval_set_t::iterator p, + uint64_t alloc_unit) +{ + uint64_t skew = p.get_start() % alloc_unit; + if (skew) + skew = alloc_unit - skew; + if (skew > p.get_len()) + return 0; + else + return p.get_len() 
- skew; +} + +int64_t StupidAllocator::allocate_int( + uint64_t want_size, uint64_t alloc_unit, int64_t hint, + uint64_t *offset, uint32_t *length) +{ + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << " want_size 0x" << std::hex << want_size + << " alloc_unit 0x" << alloc_unit + << " hint 0x" << hint << std::dec + << dendl; + uint64_t want = std::max(alloc_unit, want_size); + int bin = _choose_bin(want); + int orig_bin = bin; + + auto p = free[0].begin(); + + if (!hint) + hint = last_alloc; + + // search up (from hint) + if (hint) { + for (bin = orig_bin; bin < (int)free.size(); ++bin) { + p = free[bin].lower_bound(hint); + while (p != free[bin].end()) { + if (_aligned_len(p, alloc_unit) >= want_size) { + goto found; + } + ++p; + } + } + } + + // search up (from origin, and skip searched extents by hint) + for (bin = orig_bin; bin < (int)free.size(); ++bin) { + p = free[bin].begin(); + auto end = hint ? free[bin].lower_bound(hint) : free[bin].end(); + while (p != end) { + if (_aligned_len(p, alloc_unit) >= want_size) { + goto found; + } + ++p; + } + } + + // search down (hint) + if (hint) { + for (bin = orig_bin; bin >= 0; --bin) { + p = free[bin].lower_bound(hint); + while (p != free[bin].end()) { + if (_aligned_len(p, alloc_unit) >= alloc_unit) { + goto found; + } + ++p; + } + } + } + + // search down (from origin, and skip searched extents by hint) + for (bin = orig_bin; bin >= 0; --bin) { + p = free[bin].begin(); + auto end = hint ? free[bin].lower_bound(hint) : free[bin].end(); + while (p != end) { + if (_aligned_len(p, alloc_unit) >= alloc_unit) { + goto found; + } + ++p; + } + } + + return -ENOSPC; + + found: + uint64_t skew = p.get_start() % alloc_unit; + if (skew) + skew = alloc_unit - skew; + *offset = p.get_start() + skew; + *length = std::min(std::max(alloc_unit, want_size), p2align((p.get_len() - skew), alloc_unit)); + if (cct->_conf->bluestore_debug_small_allocations) { + uint64_t max = + alloc_unit * (rand() % cct->_conf->bluestore_debug_small_allocations); + if (max && *length > max) { + ldout(cct, 10) << __func__ << " shortening allocation of 0x" << std::hex + << *length << " -> 0x" + << max << " due to debug_small_allocations" << std::dec + << dendl; + *length = max; + } + } + ldout(cct, 30) << __func__ << " got 0x" << std::hex << *offset << "~" << *length + << " from bin " << std::dec << bin << dendl; + + free[bin].erase(*offset, *length); + uint64_t off, len; + if (*offset && free[bin].contains(*offset - skew - 1, &off, &len)) { + int newbin = _choose_bin(len); + if (newbin != bin) { + ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len + << std::dec << " to bin " << newbin << dendl; + free[bin].erase(off, len); + _insert_free(off, len); + } + } + if (free[bin].contains(*offset + *length, &off, &len)) { + int newbin = _choose_bin(len); + if (newbin != bin) { + ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len + << std::dec << " to bin " << newbin << dendl; + free[bin].erase(off, len); + _insert_free(off, len); + } + } + + num_free -= *length; + ceph_assert(num_free >= 0); + last_alloc = *offset + *length; + return 0; +} + +int64_t StupidAllocator::allocate( + uint64_t want_size, + uint64_t alloc_unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector *extents) +{ + uint64_t allocated_size = 0; + uint64_t offset = 0; + uint32_t length = 0; + int res = 0; + + if (max_alloc_size == 0) { + max_alloc_size = want_size; + } + // cap with 32-bit val + max_alloc_size = std::min(max_alloc_size, 0x10000000 - 
alloc_unit); + + while (allocated_size < want_size) { + res = allocate_int(std::min(max_alloc_size, (want_size - allocated_size)), + alloc_unit, hint, &offset, &length); + if (res != 0) { + /* + * Allocation failed. + */ + break; + } + bool can_append = true; + if (!extents->empty()) { + bluestore_pextent_t &last_extent = extents->back(); + if (last_extent.end() == offset) { + uint64_t l64 = last_extent.length; + l64 += length; + if (l64 < 0x100000000 && l64 <= max_alloc_size) { + can_append = false; + last_extent.length += length; + } + } + } + if (can_append) { + extents->emplace_back(bluestore_pextent_t(offset, length)); + } + + allocated_size += length; + hint = offset + length; + } + + if (allocated_size == 0) { + return -ENOSPC; + } + return allocated_size; +} + +void StupidAllocator::release( + const interval_set<uint64_t>& release_set) +{ + std::lock_guard l(lock); + for (interval_set<uint64_t>::const_iterator p = release_set.begin(); + p != release_set.end(); + ++p) { + const auto offset = p.get_start(); + const auto length = p.get_len(); + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _insert_free(offset, length); + num_free += length; + } +} + +uint64_t StupidAllocator::get_free() +{ + std::lock_guard l(lock); + return num_free; +} + +double StupidAllocator::get_fragmentation() +{ + ceph_assert(block_size); + double res; + uint64_t max_intervals = 0; + uint64_t intervals = 0; + { + std::lock_guard l(lock); + max_intervals = p2roundup<uint64_t>(num_free, block_size) / block_size; + for (unsigned bin = 0; bin < free.size(); ++bin) { + intervals += free[bin].num_intervals(); + } + } + ldout(cct, 30) << __func__ << " " << intervals << "/" << max_intervals + << dendl; + ceph_assert(intervals <= max_intervals); + if (!intervals || max_intervals <= 1) { + return 0.0; + } + intervals--; + max_intervals--; + res = (double)intervals / max_intervals; + return res; +} + +void StupidAllocator::dump() +{ + std::lock_guard l(lock); + for (unsigned bin = 0; bin < free.size(); ++bin) { + ldout(cct, 0) << __func__ << " free bin " << bin << ": " + << free[bin].num_intervals() << " extents" << dendl; + for (auto p = free[bin].begin(); + p != free[bin].end(); + ++p) { + ldout(cct, 0) << __func__ << " 0x" << std::hex << p.get_start() << "~" + << p.get_len() << std::dec << dendl; + } + } +} + +void StupidAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify) +{ + std::lock_guard l(lock); + for (unsigned bin = 0; bin < free.size(); ++bin) { + for (auto p = free[bin].begin(); p != free[bin].end(); ++p) { + notify(p.get_start(), p.get_len()); + } + } +} + +void StupidAllocator::init_add_free(uint64_t offset, uint64_t length) +{ + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _insert_free(offset, length); + num_free += length; +} + +void StupidAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + std::lock_guard l(lock); + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + interval_set_t rm; + rm.insert(offset, length); + for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) { + interval_set_t overlap; + overlap.intersection_of(rm, free[i]); + if (!overlap.empty()) { + ldout(cct, 20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap + << std::dec << dendl; + auto it = overlap.begin(); + auto it_end = overlap.end(); + while (it != it_end) { + auto o = it.get_start(); + auto 
l = it.get_len(); + + free[i].erase(o, l, + [&](uint64_t off, uint64_t len) { + unsigned newbin = _choose_bin(len); + if (newbin != i) { + ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len + << std::dec << " to bin " << newbin << dendl; + _insert_free(off, len); + return true; + } + return false; + }); + ++it; + } + + rm.subtract(overlap); + } + } + ceph_assert(rm.empty()); + num_free -= length; + ceph_assert(num_free >= 0); +} + + +void StupidAllocator::shutdown() +{ + ldout(cct, 1) << __func__ << dendl; +} + diff --git a/src/os/bluestore/StupidAllocator.h b/src/os/bluestore/StupidAllocator.h new file mode 100644 index 00000000..d9c4a447 --- /dev/null +++ b/src/os/bluestore/StupidAllocator.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H +#define CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H + +#include <mutex> + +#include "Allocator.h" +#include "include/btree_map.h" +#include "include/interval_set.h" +#include "os/bluestore/bluestore_types.h" +#include "include/mempool.h" +#include "common/ceph_mutex.h" + +class StupidAllocator : public Allocator { + CephContext* cct; + ceph::mutex lock = ceph::make_mutex("StupidAllocator::lock"); + + int64_t num_free; ///< total bytes in freelist + int64_t block_size; + + typedef mempool::bluestore_alloc::pool_allocator< + pair<const uint64_t,uint64_t>> allocator_t; + typedef btree::btree_map<uint64_t,uint64_t,std::less<uint64_t>,allocator_t> interval_set_map_t; + typedef interval_set<uint64_t,interval_set_map_t> interval_set_t; + std::vector<interval_set_t> free; ///< leading-edge copy + + uint64_t last_alloc; + + unsigned _choose_bin(uint64_t len); + void _insert_free(uint64_t offset, uint64_t len); + + uint64_t _aligned_len( + interval_set_t::iterator p, + uint64_t alloc_unit); + +public: + StupidAllocator(CephContext* cct, const std::string& name, int64_t block_size); + ~StupidAllocator() override; + + int64_t allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) override; + + int64_t allocate_int( + uint64_t want_size, uint64_t alloc_unit, int64_t hint, + uint64_t *offset, uint32_t *length); + + void release( + const interval_set<uint64_t>& release_set) override; + + uint64_t get_free() override; + double get_fragmentation() override; + + void dump() override; + void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override; + + void init_add_free(uint64_t offset, uint64_t length) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + + void shutdown() override; +}; + +#endif diff --git a/src/os/bluestore/aio.cc b/src/os/bluestore/aio.cc new file mode 100644 index 00000000..eb0c13fe --- /dev/null +++ b/src/os/bluestore/aio.cc @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include "ceph_aio.h" + +std::ostream& operator<<(std::ostream& os, const aio_t& aio) +{ + unsigned i = 0; + os << "aio: "; + for (auto& iov : aio.iov) { + os << "\n [" << i++ << "] 0x" + << std::hex << iov.iov_base << "~" << iov.iov_len << std::dec; + } + return os; +} + +int aio_queue_t::submit_batch(aio_iter begin, aio_iter end, + uint16_t aios_size, void *priv, + int *retries) +{ + // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds + int attempts = 16; + int delay = 125; + int r; + + aio_iter cur = begin; + struct aio_t *piocb[aios_size]; + 
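// collect raw pointers to every aio_t in [begin, end) before submitting them below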
int left = 0; + while (cur != end) { + cur->priv = priv; + *(piocb+left) = &(*cur); + ++left; + ++cur; + } + ceph_assert(aios_size >= left); + int done = 0; + while (left > 0) { +#if defined(HAVE_LIBAIO) + r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done)); +#elif defined(HAVE_POSIXAIO) + if (piocb[done]->n_aiocb == 1) { + // TODO: consider batching multiple reads together with lio_listio + piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT; + piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx; + piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done]; + r = aio_read(&piocb[done]->aio.aiocb); + } else { + struct sigevent sev; + sev.sigev_notify = SIGEV_KEVENT; + sev.sigev_notify_kqueue = ctx; + sev.sigev_value.sival_ptr = piocb[done]; + r = lio_listio(LIO_NOWAIT, &piocb[done]->aio.aiocbp, piocb[done]->n_aiocb, &sev); + } +#endif + if (r < 0) { + if (r == -EAGAIN && attempts-- > 0) { + usleep(delay); + delay *= 2; + (*retries)++; + continue; + } + return r; + } + ceph_assert(r > 0); + done += r; + left -= r; + attempts = 16; + delay = 125; + } + return done; +} + +int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max) +{ +#if defined(HAVE_LIBAIO) + io_event events[max]; +#elif defined(HAVE_POSIXAIO) + struct kevent events[max]; +#endif + struct timespec t = { + timeout_ms / 1000, + (timeout_ms % 1000) * 1000 * 1000 + }; + + int r = 0; + do { +#if defined(HAVE_LIBAIO) + r = io_getevents(ctx, 1, max, events, &t); +#elif defined(HAVE_POSIXAIO) + r = kevent(ctx, NULL, 0, events, max, &t); + if (r < 0) + r = -errno; +#endif + } while (r == -EINTR); + + for (int i=0; i<r; ++i) { +#if defined(HAVE_LIBAIO) + paio[i] = (aio_t *)events[i].obj; + paio[i]->rval = events[i].res; +#else + paio[i] = (aio_t*)events[i].udata; + if (paio[i]->n_aiocb == 1) { + paio[i]->rval = aio_return(&paio[i]->aio.aiocb); + } else { + // Emulate the return value of pwritev. I can't find any documentation + // for what the value of io_event.res is supposed to be. I'm going to + // assume that it's just like pwritev/preadv/pwrite/pread. 
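+ // sum the per-aiocb results, reporting the first error (if any) as the overall rval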
+ paio[i]->rval = 0; + for (int j = 0; j < paio[i]->n_aiocb; j++) { + int res = aio_return(&paio[i]->aio.aiocbp[j]); + if (res < 0) { + paio[i]->rval = res; + break; + } else { + paio[i]->rval += res; + } + } + free(paio[i]->aio.aiocbp); + } +#endif + } + return r; +} diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc new file mode 100644 index 00000000..c565f43b --- /dev/null +++ b/src/os/bluestore/bluefs_types.cc @@ -0,0 +1,213 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include "bluefs_types.h" +#include "common/Formatter.h" +#include "include/uuid.h" +#include "include/stringify.h" + +// bluefs_extent_t +void bluefs_extent_t::dump(Formatter *f) const +{ + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); + f->dump_unsigned("bdev", bdev); +} + +void bluefs_extent_t::generate_test_instances(list<bluefs_extent_t*>& ls) +{ + ls.push_back(new bluefs_extent_t); + ls.push_back(new bluefs_extent_t); + ls.back()->offset = 1; + ls.back()->length = 2; + ls.back()->bdev = 1; +} + +ostream& operator<<(ostream& out, const bluefs_extent_t& e) +{ + return out << (int)e.bdev << ":0x" << std::hex << e.offset << "~" << e.length + << std::dec; +} + +// bluefs_super_t + +void bluefs_super_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(uuid, bl); + encode(osd_uuid, bl); + encode(version, bl); + encode(block_size, bl); + encode(log_fnode, bl); + ENCODE_FINISH(bl); +} + +void bluefs_super_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(uuid, p); + decode(osd_uuid, p); + decode(version, p); + decode(block_size, p); + decode(log_fnode, p); + DECODE_FINISH(p); +} + +void bluefs_super_t::dump(Formatter *f) const +{ + f->dump_stream("uuid") << uuid; + f->dump_stream("osd_uuid") << osd_uuid; + f->dump_unsigned("version", version); + f->dump_unsigned("block_size", block_size); + f->dump_object("log_fnode", log_fnode); +} + +void bluefs_super_t::generate_test_instances(list<bluefs_super_t*>& ls) +{ + ls.push_back(new bluefs_super_t); + ls.push_back(new bluefs_super_t); + ls.back()->version = 1; + ls.back()->block_size = 4096; +} + +ostream& operator<<(ostream& out, const bluefs_super_t& s) +{ + return out << "super(uuid " << s.uuid + << " osd " << s.osd_uuid + << " v " << s.version + << " block_size 0x" << std::hex << s.block_size + << " log_fnode 0x" << s.log_fnode + << std::dec << ")"; +} + +// bluefs_fnode_t + +mempool::bluefs::vector<bluefs_extent_t>::iterator bluefs_fnode_t::seek( + uint64_t offset, uint64_t *x_off) +{ + auto p = extents.begin(); + + if (extents_index.size() > 4) { + auto it = std::upper_bound(extents_index.begin(), extents_index.end(), + offset); + assert(it != extents_index.begin()); + --it; + assert(offset >= *it); + p += it - extents_index.begin(); + offset -= *it; + } + + while (p != extents.end()) { + if ((int64_t) offset >= p->length) { + offset -= p->length; + ++p; + } else { + break; + } + } + *x_off = offset; + return p; +} + +void bluefs_fnode_t::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("size", size); + f->dump_stream("mtime") << mtime; + f->open_array_section("extents"); + for (auto& p : extents) + f->dump_object("extent", p); + f->close_section(); +} + +void bluefs_fnode_t::generate_test_instances(list<bluefs_fnode_t*>& ls) +{ + ls.push_back(new bluefs_fnode_t); + ls.push_back(new bluefs_fnode_t); + ls.back()->ino = 123; + ls.back()->size = 1048576; + 
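// 1048576 bytes = 1 MiB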
ls.back()->mtime = utime_t(123,45); + ls.back()->extents.push_back(bluefs_extent_t(0, 1048576, 4096)); + ls.back()->__unused__ = 1; +} + +ostream& operator<<(ostream& out, const bluefs_fnode_t& file) +{ + return out << "file(ino " << file.ino + << " size 0x" << std::hex << file.size << std::dec + << " mtime " << file.mtime + << " allocated " << std::hex << file.allocated << std::dec + << " extents " << file.extents + << ")"; +} + + +// bluefs_transaction_t + +void bluefs_transaction_t::encode(bufferlist& bl) const +{ + uint32_t crc = op_bl.crc32c(-1); + ENCODE_START(1, 1, bl); + encode(uuid, bl); + encode(seq, bl); + // not using bufferlist encode method, as it merely copies the bufferptr and not + // contents, meaning we're left with fragmented target bl + __u32 len = op_bl.length(); + encode(len, bl); + for (auto& it : op_bl.buffers()) { + bl.append(it.c_str(), it.length()); + } + encode(crc, bl); + ENCODE_FINISH(bl); +} + +void bluefs_transaction_t::decode(bufferlist::const_iterator& p) +{ + uint32_t crc; + DECODE_START(1, p); + decode(uuid, p); + decode(seq, p); + decode(op_bl, p); + decode(crc, p); + DECODE_FINISH(p); + uint32_t actual = op_bl.crc32c(-1); + if (actual != crc) + throw buffer::malformed_input("bad crc " + stringify(actual) + + " expected " + stringify(crc)); +} + +void bluefs_transaction_t::dump(Formatter *f) const +{ + f->dump_stream("uuid") << uuid; + f->dump_unsigned("seq", seq); + f->dump_unsigned("op_bl_length", op_bl.length()); + f->dump_unsigned("crc", op_bl.crc32c(-1)); +} + +void bluefs_transaction_t::generate_test_instances( + list<bluefs_transaction_t*>& ls) +{ + ls.push_back(new bluefs_transaction_t); + ls.push_back(new bluefs_transaction_t); + ls.back()->op_init(); + ls.back()->op_alloc_add(0, 0, 123123211); + ls.back()->op_alloc_rm(1, 0, 123); + ls.back()->op_dir_create("dir"); + ls.back()->op_dir_create("dir2"); + bluefs_fnode_t fnode; + fnode.ino = 2; + ls.back()->op_file_update(fnode); + ls.back()->op_dir_link("dir", "file1", 2); + ls.back()->op_dir_unlink("dir", "file1"); + ls.back()->op_file_remove(2); + ls.back()->op_dir_remove("dir2"); +} + +ostream& operator<<(ostream& out, const bluefs_transaction_t& t) +{ + return out << "txn(seq " << t.seq + << " len 0x" << std::hex << t.op_bl.length() + << " crc 0x" << t.op_bl.crc32c(-1) + << std::dec << ")"; +} diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h new file mode 100644 index 00000000..9ac27fab --- /dev/null +++ b/src/os/bluestore/bluefs_types.h @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OS_BLUESTORE_BLUEFS_TYPES_H +#define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H + +#include "bluestore_types.h" +#include "include/utime.h" +#include "include/encoding.h" +#include "include/denc.h" + +class bluefs_extent_t { +public: + uint64_t offset = 0; + uint32_t length = 0; + uint8_t bdev; + + bluefs_extent_t(uint8_t b = 0, uint64_t o = 0, uint32_t l = 0) + : offset(o), length(l), bdev(b) {} + + uint64_t end() const { return offset + length; } + DENC(bluefs_extent_t, v, p) { + DENC_START(1, 1, p); + denc_lba(v.offset, p); + denc_varint_lowz(v.length, p); + denc(v.bdev, p); + DENC_FINISH(p); + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluefs_extent_t*>&); +}; +WRITE_CLASS_DENC(bluefs_extent_t) + +ostream& operator<<(ostream& out, const bluefs_extent_t& e); + +struct bluefs_fnode_t { + uint64_t ino; + uint64_t size; + utime_t mtime; + uint8_t __unused__; // was 
prefer_bdev + mempool::bluefs::vector<bluefs_extent_t> extents; + + // precalculated logical offsets for extents vector entries + // allows fast lookup for extent index by the offset value via upper_bound() + mempool::bluefs::vector<uint64_t> extents_index; + + uint64_t allocated; + + bluefs_fnode_t() : ino(0), size(0), __unused__(0), allocated(0) {} + + uint64_t get_allocated() const { + return allocated; + } + + void recalc_allocated() { + allocated = 0; + extents_index.reserve(extents.size()); + for (auto& p : extents) { + extents_index.emplace_back(allocated); + allocated += p.length; + } + } + + DENC_HELPERS + void bound_encode(size_t& p) const { + _denc_friend(*this, p); + } + void encode(bufferlist::contiguous_appender& p) const { + DENC_DUMP_PRE(bluefs_fnode_t); + _denc_friend(*this, p); + DENC_DUMP_POST(bluefs_fnode_t); + } + void decode(buffer::ptr::const_iterator& p) { + _denc_friend(*this, p); + recalc_allocated(); + } + template<typename T, typename P> + friend std::enable_if_t<std::is_same_v<bluefs_fnode_t, std::remove_const_t<T>>> + _denc_friend(T& v, P& p) { + DENC_START(1, 1, p); + denc_varint(v.ino, p); + denc_varint(v.size, p); + denc(v.mtime, p); + denc(v.__unused__, p); + denc(v.extents, p); + DENC_FINISH(p); + } + + void append_extent(const bluefs_extent_t& ext) { + if (!extents.empty() && + extents.back().end() == ext.offset && + extents.back().bdev == ext.bdev && + (uint64_t)extents.back().length + (uint64_t)ext.length < 0xffffffff) { + extents.back().length += ext.length; + } else { + extents_index.emplace_back(allocated); + extents.push_back(ext); + } + allocated += ext.length; + } + + void pop_front_extent() { + auto it = extents.begin(); + allocated -= it->length; + extents_index.erase(extents_index.begin()); + for (auto& i: extents_index) { + i -= it->length; + } + extents.erase(it); + } + + void swap_extents(bluefs_fnode_t& other) { + other.extents.swap(extents); + other.extents_index.swap(extents_index); + std::swap(allocated, other.allocated); + } + void clear_extents() { + extents_index.clear(); + extents.clear(); + allocated = 0; + } + + mempool::bluefs::vector<bluefs_extent_t>::iterator seek( + uint64_t off, uint64_t *x_off); + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluefs_fnode_t*>& ls); + +}; +WRITE_CLASS_DENC(bluefs_fnode_t) + +ostream& operator<<(ostream& out, const bluefs_fnode_t& file); + + +struct bluefs_super_t { + uuid_d uuid; ///< unique to this bluefs instance + uuid_d osd_uuid; ///< matches the osd that owns us + uint64_t version; + uint32_t block_size; + + bluefs_fnode_t log_fnode; + + bluefs_super_t() + : version(0), + block_size(4096) { } + + uint64_t block_mask() const { + return ~((uint64_t)block_size - 1); + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<bluefs_super_t*>& ls); +}; +WRITE_CLASS_ENCODER(bluefs_super_t) + +ostream& operator<<(ostream&, const bluefs_super_t& s); + + +struct bluefs_transaction_t { + typedef enum { + OP_NONE = 0, + OP_INIT, ///< initial (empty) file system marker + OP_ALLOC_ADD, ///< add extent to available block storage (extent) + OP_ALLOC_RM, ///< remove extent from available block storage (extent) + OP_DIR_LINK, ///< (re)set a dir entry (dirname, filename, ino) + OP_DIR_UNLINK, ///< remove a dir entry (dirname, filename) + OP_DIR_CREATE, ///< create a dir (dirname) + OP_DIR_REMOVE, ///< remove a dir (dirname) + OP_FILE_UPDATE, ///< set/update file metadata 
(file) + OP_FILE_REMOVE, ///< remove file (ino) + OP_JUMP, ///< jump the seq # and offset + OP_JUMP_SEQ, ///< jump the seq # + } op_t; + + uuid_d uuid; ///< fs uuid + uint64_t seq; ///< sequence number + bufferlist op_bl; ///< encoded transaction ops + + bluefs_transaction_t() : seq(0) {} + + void clear() { + *this = bluefs_transaction_t(); + } + bool empty() const { + return op_bl.length() == 0; + } + + void op_init() { + using ceph::encode; + encode((__u8)OP_INIT, op_bl); + } + void op_alloc_add(uint8_t id, uint64_t offset, uint64_t length) { + using ceph::encode; + encode((__u8)OP_ALLOC_ADD, op_bl); + encode(id, op_bl); + encode(offset, op_bl); + encode(length, op_bl); + } + void op_alloc_rm(uint8_t id, uint64_t offset, uint64_t length) { + using ceph::encode; + encode((__u8)OP_ALLOC_RM, op_bl); + encode(id, op_bl); + encode(offset, op_bl); + encode(length, op_bl); + } + void op_dir_create(const string& dir) { + using ceph::encode; + encode((__u8)OP_DIR_CREATE, op_bl); + encode(dir, op_bl); + } + void op_dir_remove(const string& dir) { + using ceph::encode; + encode((__u8)OP_DIR_REMOVE, op_bl); + encode(dir, op_bl); + } + void op_dir_link(const string& dir, const string& file, uint64_t ino) { + using ceph::encode; + encode((__u8)OP_DIR_LINK, op_bl); + encode(dir, op_bl); + encode(file, op_bl); + encode(ino, op_bl); + } + void op_dir_unlink(const string& dir, const string& file) { + using ceph::encode; + encode((__u8)OP_DIR_UNLINK, op_bl); + encode(dir, op_bl); + encode(file, op_bl); + } + void op_file_update(const bluefs_fnode_t& file) { + using ceph::encode; + encode((__u8)OP_FILE_UPDATE, op_bl); + encode(file, op_bl); + } + void op_file_remove(uint64_t ino) { + using ceph::encode; + encode((__u8)OP_FILE_REMOVE, op_bl); + encode(ino, op_bl); + } + void op_jump(uint64_t next_seq, uint64_t offset) { + using ceph::encode; + encode((__u8)OP_JUMP, op_bl); + encode(next_seq, op_bl); + encode(offset, op_bl); + } + void op_jump_seq(uint64_t next_seq) { + using ceph::encode; + encode((__u8)OP_JUMP_SEQ, op_bl); + encode(next_seq, op_bl); + } + void claim_ops(bluefs_transaction_t& from) { + op_bl.claim_append(from.op_bl); + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<bluefs_transaction_t*>& ls); +}; +WRITE_CLASS_ENCODER(bluefs_transaction_t) + +ostream& operator<<(ostream& out, const bluefs_transaction_t& t); + +#endif diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc new file mode 100644 index 00000000..fc33289b --- /dev/null +++ b/src/os/bluestore/bluestore_tool.cc @@ -0,0 +1,864 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> + +#include <stdio.h> +#include <string.h> +#include <iostream> +#include <time.h> +#include <fcntl.h> +#include <unistd.h> +#include "global/global_init.h" +#include "common/ceph_argparse.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/safe_io.h" + +#include "os/bluestore/BlueFS.h" +#include "os/bluestore/BlueStore.h" +#include "common/admin_socket.h" + +namespace po = boost::program_options; + +void usage(po::options_description &desc) +{ + cout << desc << std::endl; +} + +void validate_path(CephContext *cct, const string& path, bool bluefs) +{ + BlueStore bluestore(cct, path); + string type; + int r = bluestore.read_meta("type", 
&type); + if (r < 0) { + cerr << "failed to load os-type: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if (type != "bluestore") { + cerr << "expected bluestore, but type is " << type << std::endl; + exit(EXIT_FAILURE); + } + if (!bluefs) { + return; + } + + string kv_backend; + r = bluestore.read_meta("kv_backend", &kv_backend); + if (r < 0) { + cerr << "failed to load kv_backend: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if (kv_backend != "rocksdb") { + cerr << "expect kv_backend to be rocksdb, but is " << kv_backend + << std::endl; + exit(EXIT_FAILURE); + } + string bluefs_enabled; + r = bluestore.read_meta("bluefs", &bluefs_enabled); + if (r < 0) { + cerr << "failed to load do_bluefs: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if (bluefs_enabled != "1") { + cerr << "bluefs not enabled for rocksdb" << std::endl; + exit(EXIT_FAILURE); + } +} + +const char* find_device_path( + int id, + CephContext *cct, + const vector<string>& devs) +{ + for (auto& i : devs) { + bluestore_bdev_label_t label; + int r = BlueStore::_read_bdev_label(cct, i, &label); + if (r < 0) { + cerr << "unable to read label for " << i << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if ((id == BlueFS::BDEV_SLOW && label.description == "main") || + (id == BlueFS::BDEV_DB && label.description == "bluefs db") || + (id == BlueFS::BDEV_WAL && label.description == "bluefs wal")) { + return i.c_str(); + } + } + return nullptr; +} + +void parse_devices( + CephContext *cct, + const vector<string>& devs, + map<string, int>* got, + bool* has_db, + bool* has_wal) +{ + string main; + bool was_db = false; + if (has_wal) { + *has_wal = false; + } + if (has_db) { + *has_db = false; + } + for (auto& d : devs) { + bluestore_bdev_label_t label; + int r = BlueStore::_read_bdev_label(cct, d, &label); + if (r < 0) { + cerr << "unable to read label for " << d << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + int id = -1; + if (label.description == "main") + main = d; + else if (label.description == "bluefs db") { + id = BlueFS::BDEV_DB; + was_db = true; + if (has_db) { + *has_db = true; + } + } + else if (label.description == "bluefs wal") { + id = BlueFS::BDEV_WAL; + if (has_wal) { + *has_wal = true; + } + } + if (id >= 0) { + got->emplace(d, id); + } + } + if (main.length()) { + int id = was_db ? 
BlueFS::BDEV_SLOW : BlueFS::BDEV_DB; + got->emplace(main, id); + } +} + +void add_devices( + BlueFS *fs, + CephContext *cct, + const vector<string>& devs) +{ + map<string, int> got; + parse_devices(cct, devs, &got, nullptr, nullptr); + for(auto e : got) { + char target_path[PATH_MAX] = ""; + if(!e.first.empty()) { + if (realpath(e.first.c_str(), target_path) == nullptr) { + cerr << "failed to retrieve absolute path for " << e.first + << ": " << cpp_strerror(errno) + << std::endl; + } + } + + cout << " slot " << e.second << " " << e.first; + if (target_path[0]) { + cout << " -> " << target_path; + } + cout << std::endl; + int r = fs->add_block_device(e.second, e.first, false); + if (r < 0) { + cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + } +} + +BlueFS *open_bluefs( + CephContext *cct, + const string& path, + const vector<string>& devs) +{ + validate_path(cct, path, true); + BlueFS *fs = new BlueFS(cct); + + add_devices(fs, cct, devs); + + int r = fs->mount(); + if (r < 0) { + cerr << "unable to mount bluefs: " << cpp_strerror(r) + << std::endl; + exit(EXIT_FAILURE); + } + return fs; +} + +void log_dump( + CephContext *cct, + const string& path, + const vector<string>& devs) +{ + BlueFS* fs = open_bluefs(cct, path, devs); + int r = fs->log_dump(); + if (r < 0) { + cerr << "log_dump failed" << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + + delete fs; +} + +void inferring_bluefs_devices(vector<string>& devs, std::string& path) +{ + cout << "inferring bluefs devices from bluestore path" << std::endl; + for (auto fn : {"block", "block.wal", "block.db"}) { + string p = path + "/" + fn; + struct stat st; + if (::stat(p.c_str(), &st) == 0) { + devs.push_back(p); + } + } +} + +int main(int argc, char **argv) +{ + string out_dir; + vector<string> devs; + vector<string> devs_source; + string dev_target; + string path; + string action; + string log_file; + string key, value; + vector<string> allocs_name; + int log_level = 30; + bool fsck_deep = false; + po::options_description po_options("Options"); + po_options.add_options() + ("help,h", "produce help message") + ("path", po::value<string>(&path), "bluestore path") + ("out-dir", po::value<string>(&out_dir), "output directory") + ("log-file,l", po::value<string>(&log_file), "log file") + ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)") + ("dev", po::value<vector<string>>(&devs), "device(s)") + ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-dev-migrate source device(s)") + ("dev-target", po::value<string>(&dev_target), "target/resulting device") + ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)") + ("key,k", po::value<string>(&key), "label metadata key name") + ("value,v", po::value<string>(&value), "label metadata value") + ("allocator", po::value<vector<string>>(&allocs_name), "allocator to inspect: 'block'/'bluefs-wal'/'bluefs-db'/'bluefs-slow'") + ; + po::options_description po_positional("Positional options"); + po_positional.add_options() + ("command", po::value<string>(&action), + "fsck, " + "repair, " + "quick-fix, " + "bluefs-export, " + "bluefs-bdev-sizes, " + "bluefs-bdev-expand, " + "bluefs-bdev-new-db, " + "bluefs-bdev-new-wal, " + "bluefs-bdev-migrate, " + "show-label, " + "set-label-key, " + "rm-label-key, " + "prime-osd-dir, " + "bluefs-log-dump, " + "free-dump, " + "free-score") + ; + po::options_description po_all("All options"); + po_all.add(po_options).add(po_positional); 
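+ // the lone positional argument selects the command to run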
+ po::positional_options_description pd; + pd.add("command", 1); + + vector<string> ceph_option_strings; + po::variables_map vm; + try { + po::parsed_options parsed = + po::command_line_parser(argc, argv).options(po_all).allow_unregistered().positional(pd).run(); + po::store( parsed, vm); + po::notify(vm); + ceph_option_strings = po::collect_unrecognized(parsed.options, + po::include_positional); + } catch(po::error &e) { + std::cerr << e.what() << std::endl; + exit(EXIT_FAILURE); + } + // normalize path (remove ending '/' if any) + if (path.size() > 1 && *(path.end() - 1) == '/') { + path.resize(path.size() - 1); + } + if (vm.count("help")) { + usage(po_all); + exit(EXIT_SUCCESS); + } + if (action.empty()) { + cerr << "must specify an action; --help for help" << std::endl; + exit(EXIT_FAILURE); + } + + if (action == "fsck" || action == "repair" || action == "quick-fix") { + if (path.empty()) { + cerr << "must specify bluestore path" << std::endl; + exit(EXIT_FAILURE); + } + } + if (action == "prime-osd-dir") { + if (devs.size() != 1) { + cerr << "must specify the main bluestore device" << std::endl; + exit(EXIT_FAILURE); + } + if (path.empty()) { + cerr << "must specify osd dir to prime" << std::endl; + exit(EXIT_FAILURE); + } + } + if (action == "set-label-key" || + action == "rm-label-key") { + if (devs.size() != 1) { + cerr << "must specify the main bluestore device" << std::endl; + exit(EXIT_FAILURE); + } + if (key.size() == 0) { + cerr << "must specify a key name with -k" << std::endl; + exit(EXIT_FAILURE); + } + if (action == "set-label-key" && value.size() == 0) { + cerr << "must specify a value with -v" << std::endl; + exit(EXIT_FAILURE); + } + } + if (action == "show-label") { + if (devs.empty() && path.empty()) { + cerr << "must specify bluestore path *or* raw device(s)" << std::endl; + exit(EXIT_FAILURE); + } + if (devs.empty()) + inferring_bluefs_devices(devs, path); + } + if (action == "bluefs-export" || action == "bluefs-log-dump") { + if (path.empty()) { + cerr << "must specify bluestore path" << std::endl; + exit(EXIT_FAILURE); + } + if ((action == "bluefs-export") && out_dir.empty()) { + cerr << "must specify out-dir to export bluefs" << std::endl; + exit(EXIT_FAILURE); + } + inferring_bluefs_devices(devs, path); + } + if (action == "bluefs-bdev-sizes" || action == "bluefs-bdev-expand") { + if (path.empty()) { + cerr << "must specify bluestore path" << std::endl; + exit(EXIT_FAILURE); + } + inferring_bluefs_devices(devs, path); + } + if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") { + if (path.empty()) { + cerr << "must specify bluestore path" << std::endl; + exit(EXIT_FAILURE); + } + if (dev_target.empty()) { + cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl; + } + inferring_bluefs_devices(devs, path); + } + if (action == "bluefs-bdev-migrate") { + if (path.empty()) { + cerr << "must specify bluestore path" << std::endl; + exit(EXIT_FAILURE); + } + inferring_bluefs_devices(devs, path); + if (devs_source.size() == 0) { + cerr << "must specify source devices with --devs-source" << std::endl; + exit(EXIT_FAILURE); + } + if (dev_target.empty()) { + cerr << "must specify target device with --dev-target" << std::endl; + exit(EXIT_FAILURE); + } + } + if (action == "free-score" || action == "free-dump") { + if (path.empty()) { + cerr << "must specify bluestore path" << std::endl; + exit(EXIT_FAILURE); + } + for (auto name : allocs_name) { + if (!name.empty() && + name != "block" && + name != "bluefs-db" && + name != 
"bluefs-wal" && + name != "bluefs-slow") { + cerr << "unknown allocator '" << name << "'" << std::endl; + exit(EXIT_FAILURE); + } + } + if (allocs_name.empty()) + allocs_name = vector<string>{"block", "bluefs-db", "bluefs-wal", "bluefs-slow"}; + } + vector<const char*> args; + if (log_file.size()) { + args.push_back("--log-file"); + args.push_back(log_file.c_str()); + static char ll[10]; + snprintf(ll, sizeof(ll), "%d", log_level); + args.push_back("--debug-bluestore"); + args.push_back(ll); + args.push_back("--debug-bluefs"); + args.push_back(ll); + } + args.push_back("--no-log-to-stderr"); + args.push_back("--err-to-stderr"); + + for (auto& i : ceph_option_strings) { + args.push_back(i.c_str()); + } + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + + common_init_finish(cct.get()); + + if (action == "fsck" || + action == "repair" || + action == "quick-fix") { + validate_path(cct.get(), path, false); + BlueStore bluestore(cct.get(), path); + int r; + if (action == "fsck") { + r = bluestore.fsck(fsck_deep); + } else if (action == "repair") { + r = bluestore.repair(fsck_deep); + } else { + r = bluestore.quick_fix(); + } + if (r < 0) { + cerr << "error from fsck: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } else if (r > 0) { + cerr << action << " found " << r << " error(s)" << std::endl; + exit(EXIT_FAILURE); + } else { + cout << action << " success" << std::endl; + } + } + else if (action == "prime-osd-dir") { + bluestore_bdev_label_t label; + int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label); + if (r < 0) { + cerr << "failed to read label for " << devs.front() << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + + // kludge some things into the map that we want to populate into + // target dir + label.meta["path_block"] = devs.front(); + label.meta["type"] = "bluestore"; + label.meta["fsid"] = stringify(label.osd_uuid); + + for (auto kk : { + "whoami", + "osd_key", + "ceph_fsid", + "fsid", + "type", + "ready" }) { + string k = kk; + auto i = label.meta.find(k); + if (i == label.meta.end()) { + continue; + } + string p = path + "/" + k; + string v = i->second; + if (k == "osd_key") { + p = path + "/keyring"; + v = "[osd."; + v += label.meta["whoami"]; + v += "]\nkey = " + i->second; + } + v += "\n"; + int fd = ::open(p.c_str(), O_CREAT|O_TRUNC|O_WRONLY|O_CLOEXEC, 0600); + if (fd < 0) { + cerr << "error writing " << p << ": " << cpp_strerror(errno) + << std::endl; + exit(EXIT_FAILURE); + } + int r = safe_write(fd, v.c_str(), v.size()); + if (r < 0) { + cerr << "error writing to " << p << ": " << cpp_strerror(errno) + << std::endl; + exit(EXIT_FAILURE); + } + ::close(fd); + } + } + else if (action == "show-label") { + JSONFormatter jf(true); + jf.open_object_section("devices"); + for (auto& i : devs) { + bluestore_bdev_label_t label; + int r = BlueStore::_read_bdev_label(cct.get(), i, &label); + if (r < 0) { + cerr << "unable to read label for " << i << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + jf.open_object_section(i.c_str()); + label.dump(&jf); + jf.close_section(); + } + jf.close_section(); + jf.flush(cout); + } + else if (action == "set-label-key") { + bluestore_bdev_label_t label; + int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label); + if (r < 0) { + cerr << "unable to read label for " << devs.front() << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if (key == "size") { + label.size = 
strtoull(value.c_str(), nullptr, 10); + } else if (key =="osd_uuid") { + label.osd_uuid.parse(value.c_str()); + } else if (key =="btime") { + uint64_t epoch; + uint64_t nsec; + int r = utime_t::parse_date(value.c_str(), &epoch, &nsec); + if (r == 0) { + label.btime = utime_t(epoch, nsec); + } + } else if (key =="description") { + label.description = value; + } else { + label.meta[key] = value; + } + r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label); + if (r < 0) { + cerr << "unable to write label for " << devs.front() << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + } + else if (action == "rm-label-key") { + bluestore_bdev_label_t label; + int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label); + if (r < 0) { + cerr << "unable to read label for " << devs.front() << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if (!label.meta.count(key)) { + cerr << "key '" << key << "' not present" << std::endl; + exit(EXIT_FAILURE); + } + label.meta.erase(key); + r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label); + if (r < 0) { + cerr << "unable to write label for " << devs.front() << ": " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + } + else if (action == "bluefs-bdev-sizes") { + BlueStore bluestore(cct.get(), path); + bluestore.dump_bluefs_sizes(cout); + } + else if (action == "bluefs-bdev-expand") { + BlueStore bluestore(cct.get(), path); + auto r = bluestore.expand_devices(cout); + if (r <0) { + cerr << "failed to expand bluestore devices: " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + } + else if (action == "bluefs-export") { + BlueFS *fs = open_bluefs(cct.get(), path, devs); + + vector<string> dirs; + int r = fs->readdir("", &dirs); + if (r < 0) { + cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + + if (::access(out_dir.c_str(), F_OK)) { + r = ::mkdir(out_dir.c_str(), 0755); + if (r < 0) { + r = -errno; + cerr << "mkdir " << out_dir << " failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + } + + for (auto& dir : dirs) { + if (dir[0] == '.') + continue; + cout << dir << "/" << std::endl; + vector<string> ls; + r = fs->readdir(dir, &ls); + if (r < 0) { + cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + string full = out_dir + "/" + dir; + if (::access(full.c_str(), F_OK)) { + r = ::mkdir(full.c_str(), 0755); + if (r < 0) { + r = -errno; + cerr << "mkdir " << full << " failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + } + for (auto& file : ls) { + if (file[0] == '.') + continue; + cout << dir << "/" << file << std::endl; + uint64_t size; + utime_t mtime; + r = fs->stat(dir, file, &size, &mtime); + if (r < 0) { + cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + string path = out_dir + "/" + dir + "/" + file; + int fd = ::open(path.c_str(), O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0644); + if (fd < 0) { + r = -errno; + cerr << "open " << path << " failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + if (size > 0) { + BlueFS::FileReader *h; + r = fs->open_for_read(dir, file, &h, false); + if (r < 0) { + cerr << "open_for_read " << dir << "/" << file << " failed: " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + int pos = 0; + int left = size; + while (left) { + bufferlist bl; + r = fs->read(h, &h->buf, pos, left, &bl, NULL); + if (r <= 0) { + cerr << "read " << dir << 
"/" << file << " from " << pos + << " failed: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + int rc = bl.write_fd(fd); + if (rc < 0) { + cerr << "write to " << path << " failed: " + << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + pos += r; + left -= r; + } + delete h; + } + ::close(fd); + } + } + fs->umount(); + delete fs; + } else if (action == "bluefs-log-dump") { + log_dump(cct.get(), path, devs); + } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") { + map<string, int> cur_devs_map; + bool need_db = action == "bluefs-bdev-new-db"; + + bool has_wal = false; + bool has_db = false; + char target_path[PATH_MAX] = ""; + + parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal); + + if (has_db && has_wal) { + cerr << "can't allocate new device, both WAL and DB exist" + << std::endl; + exit(EXIT_FAILURE); + } else if (need_db && has_db) { + cerr << "can't allocate new DB device, already exists" + << std::endl; + exit(EXIT_FAILURE); + } else if (!need_db && has_wal) { + cerr << "can't allocate new WAL device, already exists" + << std::endl; + exit(EXIT_FAILURE); + } else if(!dev_target.empty() && + realpath(dev_target.c_str(), target_path) == nullptr) { + cerr << "failed to retrieve absolute path for " << dev_target + << ": " << cpp_strerror(errno) + << std::endl; + exit(EXIT_FAILURE); + } + + // Create either DB or WAL volume + int r = EXIT_FAILURE; + if (need_db && cct->_conf->bluestore_block_db_size == 0) { + cerr << "DB size isn't specified, " + "please set Ceph bluestore-block-db-size config parameter " + << std::endl; + } else if (!need_db && cct->_conf->bluestore_block_wal_size == 0) { + cerr << "WAL size isn't specified, " + "please set Ceph bluestore-block-wal-size config parameter " + << std::endl; + } else { + BlueStore bluestore(cct.get(), path); + r = bluestore.add_new_bluefs_device( + need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL, + target_path); + if (r == 0) { + cout << (need_db ? "DB" : "WAL") << " device added " << target_path + << std::endl; + } else { + cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:" + << cpp_strerror(r) + << std::endl; + } + return r; + } + } else if (action == "bluefs-bdev-migrate") { + map<string, int> cur_devs_map; + set<int> src_dev_ids; + map<string, int> src_devs; + + parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr); + for (auto& s : devs_source) { + auto i = cur_devs_map.find(s); + if (i != cur_devs_map.end()) { + if (s == dev_target) { + cerr << "Device " << dev_target + << " is present in both source and target lists, omitted." + << std::endl; + } else { + src_devs.emplace(*i); + src_dev_ids.emplace(i->second); + } + } else { + cerr << "can't migrate " << s << ", not a valid bluefs volume " + << std::endl; + exit(EXIT_FAILURE); + } + } + + auto i = cur_devs_map.find(dev_target); + + if (i != cur_devs_map.end()) { + // Migrate to an existing BlueFS volume + + auto dev_target_id = i->second; + if (dev_target_id == BlueFS::BDEV_WAL) { + // currently we're unable to migrate to WAL device since there is no space + // reserved for superblock + cerr << "Migrate to WAL device isn't supported." 
<< std::endl; + exit(EXIT_FAILURE); + } + + BlueStore bluestore(cct.get(), path); + int r = bluestore.migrate_to_existing_bluefs_device( + src_dev_ids, + dev_target_id); + if (r == 0) { + for(auto src : src_devs) { + if (src.second != BlueFS::BDEV_SLOW) { + cout << " device removed:" << src.second << " " << src.first + << std::endl; + } + } + } else { + bool need_db = dev_target_id == BlueFS::BDEV_DB; + cerr << "failed to migrate to existing BlueFS device: " + << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL) + << " " << dev_target + << cpp_strerror(r) + << std::endl; + } + return r; + } else { + // Migrate to a new BlueFS volume + // via creating either DB or WAL volume + char target_path[PATH_MAX] = ""; + int dev_target_id; + if (src_dev_ids.count(BlueFS::BDEV_DB)) { + // if we have DB device in the source list - we create DB device + // (and may be remove WAL). + dev_target_id = BlueFS::BDEV_NEWDB; + } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) { + dev_target_id = BlueFS::BDEV_NEWWAL; + } else { + cerr << "Unable to migrate Slow volume to new location, " + "please allocate new DB or WAL with " + "--bluefs-bdev-new-db(wal) command" + << std::endl; + exit(EXIT_FAILURE); + } + if(!dev_target.empty() && + realpath(dev_target.c_str(), target_path) == nullptr) { + cerr << "failed to retrieve absolute path for " << dev_target + << ": " << cpp_strerror(errno) + << std::endl; + exit(EXIT_FAILURE); + } + + BlueStore bluestore(cct.get(), path); + + bool need_db = dev_target_id == BlueFS::BDEV_NEWDB; + int r = bluestore.migrate_to_new_bluefs_device( + src_dev_ids, + dev_target_id, + target_path); + if (r == 0) { + for(auto src : src_devs) { + if (src.second != BlueFS::BDEV_SLOW) { + cout << " device removed:" << src.second << " " << src.first + << std::endl; + } + } + cout << " device added: " + << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB) + << " " << target_path + << std::endl; + } else { + cerr << "failed to migrate to new BlueFS device: " + << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB) + << " " << target_path + << cpp_strerror(r) + << std::endl; + } + return r; + } + } else if (action == "free-dump" || action == "free-score") { + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + ceph_assert(admin_socket); + std::string action_name = action == "free-dump" ? 
"dump" : "score"; + validate_path(cct.get(), path, false); + BlueStore bluestore(cct.get(), path); + int r = bluestore.cold_open(); + if (r < 0) { + cerr << "error from cold_open: " << cpp_strerror(r) << std::endl; + exit(EXIT_FAILURE); + } + + for (auto alloc_name : allocs_name) { + ceph::bufferlist out; + bool b = admin_socket->execute_command( + "{\"prefix\": \"bluestore allocator " + action_name + " " + alloc_name + "\"}", out); + if (!b) { + cerr << "failure querying '" << alloc_name << "'" << std::endl; + exit(EXIT_FAILURE); + } + cout << alloc_name << ":" << std::endl; + cout << std::string(out.c_str(),out.length()) << std::endl; + } + + bluestore.cold_close(); + } else { + cerr << "unrecognized action " << action << std::endl; + return 1; + } + + return 0; +} diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc new file mode 100644 index 00000000..134eed5b --- /dev/null +++ b/src/os/bluestore/bluestore_types.cc @@ -0,0 +1,1138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "bluestore_types.h" +#include "common/Formatter.h" +#include "common/Checksummer.h" +#include "include/stringify.h" + +// bluestore_bdev_label_t + +void bluestore_bdev_label_t::encode(bufferlist& bl) const +{ + // be slightly friendly to someone who looks at the device + bl.append("bluestore block device\n"); + bl.append(stringify(osd_uuid)); + bl.append("\n"); + ENCODE_START(2, 1, bl); + encode(osd_uuid, bl); + encode(size, bl); + encode(btime, bl); + encode(description, bl); + encode(meta, bl); + ENCODE_FINISH(bl); +} + +void bluestore_bdev_label_t::decode(bufferlist::const_iterator& p) +{ + p.advance(60u); // see above + DECODE_START(2, p); + decode(osd_uuid, p); + decode(size, p); + decode(btime, p); + decode(description, p); + if (struct_v >= 2) { + decode(meta, p); + } + DECODE_FINISH(p); +} + +void bluestore_bdev_label_t::dump(Formatter *f) const +{ + f->dump_stream("osd_uuid") << osd_uuid; + f->dump_unsigned("size", size); + f->dump_stream("btime") << btime; + f->dump_string("description", description); + for (auto& i : meta) { + f->dump_string(i.first.c_str(), i.second); + } +} + +void bluestore_bdev_label_t::generate_test_instances( + list<bluestore_bdev_label_t*>& o) +{ + o.push_back(new bluestore_bdev_label_t); + o.push_back(new bluestore_bdev_label_t); + o.back()->size = 123; + o.back()->btime = utime_t(4, 5); + o.back()->description = "fakey"; + o.back()->meta["foo"] = "bar"; +} + +ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l) +{ + return out << "bdev(osd_uuid " << l.osd_uuid + << ", size 0x" << std::hex << l.size << std::dec + << ", btime " << l.btime + << ", desc " << l.description + << ", " << l.meta.size() << " meta" + << ")"; +} + +// cnode_t + +void bluestore_cnode_t::dump(Formatter *f) const +{ + f->dump_unsigned("bits", bits); +} + +void bluestore_cnode_t::generate_test_instances(list<bluestore_cnode_t*>& o) +{ + o.push_back(new bluestore_cnode_t()); + o.push_back(new bluestore_cnode_t(0)); + o.push_back(new bluestore_cnode_t(123)); +} + +ostream& operator<<(ostream& out, const bluestore_cnode_t& l) +{ + return out << "cnode(bits " << l.bits << ")"; +} + +// 
bluestore_extent_ref_map_t + +void bluestore_extent_ref_map_t::_check() const +{ + uint64_t pos = 0; + unsigned refs = 0; + for (const auto &p : ref_map) { + if (p.first < pos) + ceph_abort_msg("overlap"); + if (p.first == pos && p.second.refs == refs) + ceph_abort_msg("unmerged"); + pos = p.first + p.second.length; + refs = p.second.refs; + } +} + +void bluestore_extent_ref_map_t::_maybe_merge_left( + map<uint64_t,record_t>::iterator& p) +{ + if (p == ref_map.begin()) + return; + auto q = p; + --q; + if (q->second.refs == p->second.refs && + q->first + q->second.length == p->first) { + q->second.length += p->second.length; + ref_map.erase(p); + p = q; + } +} + +void bluestore_extent_ref_map_t::get(uint64_t offset, uint32_t length) +{ + auto p = ref_map.lower_bound(offset); + if (p != ref_map.begin()) { + --p; + if (p->first + p->second.length <= offset) { + ++p; + } + } + while (length > 0) { + if (p == ref_map.end()) { + // nothing after offset; add the whole thing. + p = ref_map.insert( + map<uint64_t,record_t>::value_type(offset, record_t(length, 1))).first; + break; + } + if (p->first > offset) { + // gap + uint64_t newlen = std::min<uint64_t>(p->first - offset, length); + p = ref_map.insert( + map<uint64_t,record_t>::value_type(offset, + record_t(newlen, 1))).first; + offset += newlen; + length -= newlen; + _maybe_merge_left(p); + ++p; + continue; + } + if (p->first < offset) { + // split off the portion before offset + ceph_assert(p->first + p->second.length > offset); + uint64_t left = p->first + p->second.length - offset; + p->second.length = offset - p->first; + p = ref_map.insert(map<uint64_t,record_t>::value_type( + offset, record_t(left, p->second.refs))).first; + // continue below + } + ceph_assert(p->first == offset); + if (length < p->second.length) { + ref_map.insert(make_pair(offset + length, + record_t(p->second.length - length, + p->second.refs))); + p->second.length = length; + ++p->second.refs; + break; + } + ++p->second.refs; + offset += p->second.length; + length -= p->second.length; + _maybe_merge_left(p); + ++p; + } + if (p != ref_map.end()) + _maybe_merge_left(p); + //_check(); +} + +void bluestore_extent_ref_map_t::put( + uint64_t offset, uint32_t length, + PExtentVector *release, + bool *maybe_unshared) +{ + //NB: existing entries in 'release' container must be preserved! 
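+ // drop one reference across [offset, offset+length); extents whose last reference is released are appended to *release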
+ bool unshared = true; + auto p = ref_map.lower_bound(offset); + if (p == ref_map.end() || p->first > offset) { + if (p == ref_map.begin()) { + ceph_abort_msg("put on missing extent (nothing before)"); + } + --p; + if (p->first + p->second.length <= offset) { + ceph_abort_msg("put on missing extent (gap)"); + } + } + if (p->first < offset) { + uint64_t left = p->first + p->second.length - offset; + p->second.length = offset - p->first; + if (p->second.refs != 1) { + unshared = false; + } + p = ref_map.insert(map<uint64_t,record_t>::value_type( + offset, record_t(left, p->second.refs))).first; + } + while (length > 0) { + ceph_assert(p->first == offset); + if (length < p->second.length) { + if (p->second.refs != 1) { + unshared = false; + } + ref_map.insert(make_pair(offset + length, + record_t(p->second.length - length, + p->second.refs))); + if (p->second.refs > 1) { + p->second.length = length; + --p->second.refs; + if (p->second.refs != 1) { + unshared = false; + } + _maybe_merge_left(p); + } else { + if (release) + release->push_back(bluestore_pextent_t(p->first, length)); + ref_map.erase(p); + } + goto out; + } + offset += p->second.length; + length -= p->second.length; + if (p->second.refs > 1) { + --p->second.refs; + if (p->second.refs != 1) { + unshared = false; + } + _maybe_merge_left(p); + ++p; + } else { + if (release) + release->push_back(bluestore_pextent_t(p->first, p->second.length)); + ref_map.erase(p++); + } + } + if (p != ref_map.end()) + _maybe_merge_left(p); + //_check(); +out: + if (maybe_unshared) { + if (unshared) { + // we haven't seen a ref != 1 yet; check the whole map. + for (auto& p : ref_map) { + if (p.second.refs != 1) { + unshared = false; + break; + } + } + } + *maybe_unshared = unshared; + } +} + +bool bluestore_extent_ref_map_t::contains(uint64_t offset, uint32_t length) const +{ + auto p = ref_map.lower_bound(offset); + if (p == ref_map.end() || p->first > offset) { + if (p == ref_map.begin()) { + return false; // nothing before + } + --p; + if (p->first + p->second.length <= offset) { + return false; // gap + } + } + while (length > 0) { + if (p == ref_map.end()) + return false; + if (p->first > offset) + return false; + if (p->first + p->second.length >= offset + length) + return true; + uint64_t overlap = p->first + p->second.length - offset; + offset += overlap; + length -= overlap; + ++p; + } + return true; +} + +bool bluestore_extent_ref_map_t::intersects( + uint64_t offset, + uint32_t length) const +{ + auto p = ref_map.lower_bound(offset); + if (p != ref_map.begin()) { + --p; + if (p->first + p->second.length <= offset) { + ++p; + } + } + if (p == ref_map.end()) + return false; + if (p->first >= offset + length) + return false; + return true; // intersects p! 
+} + +void bluestore_extent_ref_map_t::dump(Formatter *f) const +{ + f->open_array_section("ref_map"); + for (auto& p : ref_map) { + f->open_object_section("ref"); + f->dump_unsigned("offset", p.first); + f->dump_unsigned("length", p.second.length); + f->dump_unsigned("refs", p.second.refs); + f->close_section(); + } + f->close_section(); +} + +void bluestore_extent_ref_map_t::generate_test_instances( + list<bluestore_extent_ref_map_t*>& o) +{ + o.push_back(new bluestore_extent_ref_map_t); + o.push_back(new bluestore_extent_ref_map_t); + o.back()->get(10, 10); + o.back()->get(18, 22); + o.back()->get(20, 20); + o.back()->get(10, 25); + o.back()->get(15, 20); +} + +ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& m) +{ + out << "ref_map("; + for (auto p = m.ref_map.begin(); p != m.ref_map.end(); ++p) { + if (p != m.ref_map.begin()) + out << ","; + out << std::hex << "0x" << p->first << "~" << p->second.length << std::dec + << "=" << p->second.refs; + } + out << ")"; + return out; +} + +// bluestore_blob_use_tracker_t + +void bluestore_blob_use_tracker_t::allocate() +{ + ceph_assert(num_au != 0); + bytes_per_au = new uint32_t[num_au]; + mempool::get_pool( + mempool::pool_index_t(mempool::mempool_bluestore_cache_other)). + adjust_count(1, sizeof(uint32_t) * num_au); + + for (uint32_t i = 0; i < num_au; ++i) { + bytes_per_au[i] = 0; + } +} + +void bluestore_blob_use_tracker_t::init( + uint32_t full_length, uint32_t _au_size) { + ceph_assert(!au_size || is_empty()); + ceph_assert(_au_size > 0); + ceph_assert(full_length > 0); + clear(); + uint32_t _num_au = round_up_to(full_length, _au_size) / _au_size; + au_size = _au_size; + if ( _num_au > 1 ) { + num_au = _num_au; + allocate(); + } +} + +void bluestore_blob_use_tracker_t::get( + uint32_t offset, uint32_t length) +{ + ceph_assert(au_size); + if (!num_au) { + total_bytes += length; + } else { + auto end = offset + length; + + while (offset < end) { + auto phase = offset % au_size; + bytes_per_au[offset / au_size] += + std::min(au_size - phase, end - offset); + offset += (phase ? au_size - phase : au_size); + } + } +} + +bool bluestore_blob_use_tracker_t::put( + uint32_t offset, uint32_t length, + PExtentVector *release_units) +{ + ceph_assert(au_size); + if (release_units) { + release_units->clear(); + } + bool maybe_empty = true; + if (!num_au) { + ceph_assert(total_bytes >= length); + total_bytes -= length; + } else { + auto end = offset + length; + uint64_t next_offs = 0; + while (offset < end) { + auto phase = offset % au_size; + size_t pos = offset / au_size; + auto diff = std::min(au_size - phase, end - offset); + ceph_assert(diff <= bytes_per_au[pos]); + bytes_per_au[pos] -= diff; + offset += (phase ? au_size - phase : au_size); + if (bytes_per_au[pos] == 0) { + if (release_units) { + if (release_units->empty() || next_offs != pos * au_size) { + release_units->emplace_back(pos * au_size, au_size); + } else { + release_units->back().length += au_size; + } + next_offs += au_size; + } + } else { + maybe_empty = false; // micro optimization detecting we aren't empty + // even in the affected extent + } + } + } + bool empty = maybe_empty ? 
!is_not_empty() : false; + if (empty && release_units) { + release_units->clear(); + } + return empty; +} + +bool bluestore_blob_use_tracker_t::can_split() const +{ + return num_au > 0; +} + +bool bluestore_blob_use_tracker_t::can_split_at(uint32_t blob_offset) const +{ + ceph_assert(au_size); + return (blob_offset % au_size) == 0 && + blob_offset < num_au * au_size; +} + +void bluestore_blob_use_tracker_t::split( + uint32_t blob_offset, + bluestore_blob_use_tracker_t* r) +{ + ceph_assert(au_size); + ceph_assert(can_split()); + ceph_assert(can_split_at(blob_offset)); + ceph_assert(r->is_empty()); + + uint32_t new_num_au = blob_offset / au_size; + r->init( (num_au - new_num_au) * au_size, au_size); + + for (auto i = new_num_au; i < num_au; i++) { + r->get((i - new_num_au) * au_size, bytes_per_au[i]); + bytes_per_au[i] = 0; + } + if (new_num_au == 0) { + clear(); + } else if (new_num_au == 1) { + uint32_t tmp = bytes_per_au[0]; + uint32_t _au_size = au_size; + clear(); + au_size = _au_size; + total_bytes = tmp; + } else { + num_au = new_num_au; + } +} + +bool bluestore_blob_use_tracker_t::equal( + const bluestore_blob_use_tracker_t& other) const +{ + if (!num_au && !other.num_au) { + return total_bytes == other.total_bytes && au_size == other.au_size; + } else if (num_au && other.num_au) { + if (num_au != other.num_au || au_size != other.au_size) { + return false; + } + for (size_t i = 0; i < num_au; i++) { + if (bytes_per_au[i] != other.bytes_per_au[i]) { + return false; + } + } + return true; + } + + uint32_t n = num_au ? num_au : other.num_au; + uint32_t referenced = + num_au ? other.get_referenced_bytes() : get_referenced_bytes(); + auto bytes_per_au_tmp = num_au ? bytes_per_au : other.bytes_per_au; + uint32_t my_referenced = 0; + for (size_t i = 0; i < n; i++) { + my_referenced += bytes_per_au_tmp[i]; + if (my_referenced > referenced) { + return false; + } + } + return my_referenced == referenced; +} + +void bluestore_blob_use_tracker_t::dump(Formatter *f) const +{ + f->dump_unsigned("num_au", num_au); + f->dump_unsigned("au_size", au_size); + if (!num_au) { + f->dump_unsigned("total_bytes", total_bytes); + } else { + f->open_array_section("bytes_per_au"); + for (size_t i = 0; i < num_au; ++i) { + f->dump_unsigned("", bytes_per_au[i]); + } + f->close_section(); + } +} + +void bluestore_blob_use_tracker_t::generate_test_instances( + list<bluestore_blob_use_tracker_t*>& o) +{ + o.push_back(new bluestore_blob_use_tracker_t()); + o.back()->init(16, 16); + o.back()->get(10, 10); + o.back()->get(10, 5); + o.push_back(new bluestore_blob_use_tracker_t()); + o.back()->init(60, 16); + o.back()->get(18, 22); + o.back()->get(20, 20); + o.back()->get(15, 20); +} + +ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& m) +{ + out << "use_tracker(" << std::hex; + if (!m.num_au) { + out << "0x" << m.au_size + << " " + << "0x" << m.total_bytes; + } else { + out << "0x" << m.num_au + << "*0x" << m.au_size + << " 0x["; + for (size_t i = 0; i < m.num_au; ++i) { + if (i != 0) + out << ","; + out << m.bytes_per_au[i]; + } + out << "]"; + } + out << std::dec << ")"; + return out; +} + +// bluestore_pextent_t + +void bluestore_pextent_t::dump(Formatter *f) const +{ + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +ostream& operator<<(ostream& out, const bluestore_pextent_t& o) { + if (o.is_valid()) + return out << "0x" << std::hex << o.offset << "~" << o.length << std::dec; + else + return out << "!~" << std::hex << o.length << std::dec; +} + +void 
bluestore_pextent_t::generate_test_instances(list<bluestore_pextent_t*>& ls) +{ + ls.push_back(new bluestore_pextent_t); + ls.push_back(new bluestore_pextent_t(1, 2)); +} + +// bluestore_blob_t + +string bluestore_blob_t::get_flags_string(unsigned flags) +{ + string s; + if (flags & FLAG_COMPRESSED) { + if (s.length()) + s += '+'; + s += "compressed"; + } + if (flags & FLAG_CSUM) { + if (s.length()) + s += '+'; + s += "csum"; + } + if (flags & FLAG_HAS_UNUSED) { + if (s.length()) + s += '+'; + s += "has_unused"; + } + if (flags & FLAG_SHARED) { + if (s.length()) + s += '+'; + s += "shared"; + } + + return s; +} + +size_t bluestore_blob_t::get_csum_value_size() const +{ + return Checksummer::get_csum_value_size(csum_type); +} + +void bluestore_blob_t::dump(Formatter *f) const +{ + f->open_array_section("extents"); + for (auto& p : extents) { + f->dump_object("extent", p); + } + f->close_section(); + f->dump_unsigned("logical_length", logical_length); + f->dump_unsigned("compressed_length", compressed_length); + f->dump_unsigned("flags", flags); + f->dump_unsigned("csum_type", csum_type); + f->dump_unsigned("csum_chunk_order", csum_chunk_order); + f->open_array_section("csum_data"); + size_t n = get_csum_count(); + for (unsigned i = 0; i < n; ++i) + f->dump_unsigned("csum", get_csum_item(i)); + f->close_section(); + f->dump_unsigned("unused", unused); +} + +void bluestore_blob_t::generate_test_instances(list<bluestore_blob_t*>& ls) +{ + ls.push_back(new bluestore_blob_t); + ls.push_back(new bluestore_blob_t(0)); + ls.push_back(new bluestore_blob_t); + ls.back()->allocated_test(bluestore_pextent_t(111, 222)); + ls.push_back(new bluestore_blob_t); + ls.back()->init_csum(Checksummer::CSUM_XXHASH32, 16, 65536); + ls.back()->csum_data = buffer::claim_malloc(4, strdup("abcd")); + ls.back()->add_unused(0, 3); + ls.back()->add_unused(8, 8); + ls.back()->allocated_test(bluestore_pextent_t(0x40100000, 0x10000)); + ls.back()->allocated_test( + bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x1000)); + ls.back()->allocated_test(bluestore_pextent_t(0x40120000, 0x10000)); +} + +ostream& operator<<(ostream& out, const bluestore_blob_t& o) +{ + out << "blob(" << o.get_extents(); + if (o.is_compressed()) { + out << " clen 0x" << std::hex + << o.get_logical_length() + << " -> 0x" + << o.get_compressed_payload_length() + << std::dec; + } + if (o.flags) { + out << " " << o.get_flags_string(); + } + if (o.has_csum()) { + out << " " << Checksummer::get_csum_type_string(o.csum_type) + << "/0x" << std::hex << (1ull << o.csum_chunk_order) << std::dec; + } + if (o.has_unused()) + out << " unused=0x" << std::hex << o.unused << std::dec; + out << ")"; + return out; +} + +void bluestore_blob_t::calc_csum(uint64_t b_off, const bufferlist& bl) +{ + switch (csum_type) { + case Checksummer::CSUM_XXHASH32: + Checksummer::calculate<Checksummer::xxhash32>( + get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data); + break; + case Checksummer::CSUM_XXHASH64: + Checksummer::calculate<Checksummer::xxhash64>( + get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data); + break;; + case Checksummer::CSUM_CRC32C: + Checksummer::calculate<Checksummer::crc32c>( + get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data); + break; + case Checksummer::CSUM_CRC32C_16: + Checksummer::calculate<Checksummer::crc32c_16>( + get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data); + break; + case Checksummer::CSUM_CRC32C_8: + Checksummer::calculate<Checksummer::crc32c_8>( + get_csum_chunk_size(), b_off, bl.length(), bl, 
&csum_data); + break; + } +} + +int bluestore_blob_t::verify_csum(uint64_t b_off, const bufferlist& bl, + int* b_bad_off, uint64_t *bad_csum) const +{ + int r = 0; + + *b_bad_off = -1; + switch (csum_type) { + case Checksummer::CSUM_NONE: + break; + case Checksummer::CSUM_XXHASH32: + *b_bad_off = Checksummer::verify<Checksummer::xxhash32>( + get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum); + break; + case Checksummer::CSUM_XXHASH64: + *b_bad_off = Checksummer::verify<Checksummer::xxhash64>( + get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum); + break; + case Checksummer::CSUM_CRC32C: + *b_bad_off = Checksummer::verify<Checksummer::crc32c>( + get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum); + break; + case Checksummer::CSUM_CRC32C_16: + *b_bad_off = Checksummer::verify<Checksummer::crc32c_16>( + get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum); + break; + case Checksummer::CSUM_CRC32C_8: + *b_bad_off = Checksummer::verify<Checksummer::crc32c_8>( + get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum); + break; + default: + r = -EOPNOTSUPP; + break; + } + + if (r < 0) + return r; + else if (*b_bad_off >= 0) + return -1; // bad checksum + else + return 0; +} + +void bluestore_blob_t::allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs) +{ + if (extents.size() == 0) { + // if blob is compressed then logical length to be already configured + // otherwise - to be unset. + ceph_assert((is_compressed() && logical_length != 0) || + (!is_compressed() && logical_length == 0)); + + extents.reserve(allocs.size() + (b_off ? 1 : 0)); + if (b_off) { + extents.emplace_back( + bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, b_off)); + + } + uint32_t new_len = b_off; + for (auto& a : allocs) { + extents.emplace_back(a.offset, a.length); + new_len += a.length; + } + if (!is_compressed()) { + logical_length = new_len; + } + } else { + ceph_assert(!is_compressed()); // partial allocations are forbidden when + // compressed + ceph_assert(b_off < logical_length); + uint32_t cur_offs = 0; + auto start_it = extents.begin(); + size_t pos = 0; + while (true) { + ceph_assert(start_it != extents.end()); + if (cur_offs + start_it->length > b_off) { + break; + } + cur_offs += start_it->length; + ++start_it; + ++pos; + } + uint32_t head = b_off - cur_offs; + uint32_t end_off = b_off + length; + auto end_it = start_it; + + while (true) { + ceph_assert(end_it != extents.end()); + ceph_assert(!end_it->is_valid()); + if (cur_offs + end_it->length >= end_off) { + break; + } + cur_offs += end_it->length; + ++end_it; + } + ceph_assert(cur_offs + end_it->length >= end_off); + uint32_t tail = cur_offs + end_it->length - end_off; + + start_it = extents.erase(start_it, end_it + 1); + size_t count = allocs.size(); + count += head ? 1 : 0; + count += tail ? 
1 : 0; + extents.insert(start_it, + count, + bluestore_pextent_t( + bluestore_pextent_t::INVALID_OFFSET, 0)); + + // Workaround to resolve lack of proper iterator return in vector::insert + // Looks like some gcc/stl implementations still lack it despite c++11 + // support claim + start_it = extents.begin() + pos; + + if (head) { + start_it->length = head; + ++start_it; + } + for(auto& e : allocs) { + *start_it = e; + ++start_it; + } + if (tail) { + start_it->length = tail; + } + } +} + +// cut it out of extents +struct vecbuilder { + PExtentVector v; + uint64_t invalid = 0; + + void add_invalid(uint64_t length) { + invalid += length; + } + void flush() { + if (invalid) { + v.emplace_back(bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, + invalid)); + + invalid = 0; + } + } + void add(uint64_t offset, uint64_t length) { + if (offset == bluestore_pextent_t::INVALID_OFFSET) { + add_invalid(length); + } + else { + flush(); + v.emplace_back(offset, length); + } + } +}; + +void bluestore_blob_t::allocated_test(const bluestore_pextent_t& alloc) +{ + extents.emplace_back(alloc); + if (!is_compressed()) { + logical_length += alloc.length; + } +} + +bool bluestore_blob_t::release_extents(bool all, + const PExtentVector& logical, + PExtentVector* r) +{ + // common case: all of it? + if (all) { + uint64_t pos = 0; + for (auto& e : extents) { + if (e.is_valid()) { + r->push_back(e); + } + pos += e.length; + } + ceph_assert(is_compressed() || get_logical_length() == pos); + extents.resize(1); + extents[0].offset = bluestore_pextent_t::INVALID_OFFSET; + extents[0].length = pos; + return true; + } + // remove from pextents according to logical release list + vecbuilder vb; + auto loffs_it = logical.begin(); + auto lend = logical.end(); + uint32_t pext_loffs_start = 0; //starting loffset of the current pextent + uint32_t pext_loffs = 0; //current loffset + auto pext_it = extents.begin(); + auto pext_end = extents.end(); + while (pext_it != pext_end) { + if (loffs_it == lend || + pext_loffs_start + pext_it->length <= loffs_it->offset) { + int delta0 = pext_loffs - pext_loffs_start; + ceph_assert(delta0 >= 0); + if ((uint32_t)delta0 < pext_it->length) { + vb.add(pext_it->offset + delta0, pext_it->length - delta0); + } + pext_loffs_start += pext_it->length; + pext_loffs = pext_loffs_start; + ++pext_it; + } + else { + //assert(pext_loffs == pext_loffs_start); + int delta0 = pext_loffs - pext_loffs_start; + ceph_assert(delta0 >= 0); + + int delta = loffs_it->offset - pext_loffs; + ceph_assert(delta >= 0); + if (delta > 0) { + vb.add(pext_it->offset + delta0, delta); + pext_loffs += delta; + } + + PExtentVector::iterator last_r = r->end(); + if (r->begin() != last_r) { + --last_r; + } + uint32_t to_release = loffs_it->length; + do { + uint32_t to_release_part = + std::min(pext_it->length - delta0 - delta, to_release); + auto o = pext_it->offset + delta0 + delta; + if (last_r != r->end() && last_r->offset + last_r->length == o) { + last_r->length += to_release_part; + } + else { + last_r = r->emplace(r->end(), o, to_release_part); + } + to_release -= to_release_part; + pext_loffs += to_release_part; + if (pext_loffs == pext_loffs_start + pext_it->length) { + pext_loffs_start += pext_it->length; + pext_loffs = pext_loffs_start; + pext_it++; + delta0 = delta = 0; + } + } while (to_release > 0 && pext_it != pext_end); + vb.add_invalid(loffs_it->length - to_release); + ++loffs_it; + } + } + vb.flush(); + extents.swap(vb.v); + return false; +} + +void bluestore_blob_t::split(uint32_t blob_offset, 
bluestore_blob_t& rb) +{ + size_t left = blob_offset; + uint32_t llen_lb = 0; + uint32_t llen_rb = 0; + unsigned i = 0; + for (auto p = extents.begin(); p != extents.end(); ++p, ++i) { + if (p->length <= left) { + left -= p->length; + llen_lb += p->length; + continue; + } + if (left) { + if (p->is_valid()) { + rb.extents.emplace_back(bluestore_pextent_t(p->offset + left, + p->length - left)); + } + else { + rb.extents.emplace_back(bluestore_pextent_t( + bluestore_pextent_t::INVALID_OFFSET, + p->length - left)); + } + llen_rb += p->length - left; + llen_lb += left; + p->length = left; + ++i; + ++p; + } + while (p != extents.end()) { + llen_rb += p->length; + rb.extents.push_back(*p++); + } + extents.resize(i); + logical_length = llen_lb; + rb.logical_length = llen_rb; + break; + } + rb.flags = flags; + + if (has_csum()) { + rb.csum_type = csum_type; + rb.csum_chunk_order = csum_chunk_order; + size_t csum_order = get_csum_chunk_size(); + ceph_assert(blob_offset % csum_order == 0); + size_t pos = (blob_offset / csum_order) * get_csum_value_size(); + // deep copy csum data + bufferptr old; + old.swap(csum_data); + rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos); + csum_data = bufferptr(old.c_str(), pos); + } +} + +// bluestore_shared_blob_t +MEMPOOL_DEFINE_OBJECT_FACTORY(bluestore_shared_blob_t, bluestore_shared_blob_t, + bluestore_cache_other); + +void bluestore_shared_blob_t::dump(Formatter *f) const +{ + f->dump_int("sbid", sbid); + f->dump_object("ref_map", ref_map); +} + +void bluestore_shared_blob_t::generate_test_instances( + list<bluestore_shared_blob_t*>& ls) +{ + ls.push_back(new bluestore_shared_blob_t(1)); +} + +ostream& operator<<(ostream& out, const bluestore_shared_blob_t& sb) +{ + out << "(sbid 0x" << std::hex << sb.sbid << std::dec; + out << " " << sb.ref_map << ")"; + return out; +} + +// bluestore_onode_t + +void bluestore_onode_t::shard_info::dump(Formatter *f) const +{ + f->dump_unsigned("offset", offset); + f->dump_unsigned("bytes", bytes); +} + +ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si) +{ + return out << std::hex << "0x" << si.offset << "(0x" << si.bytes << " bytes" + << std::dec << ")"; +} + +void bluestore_onode_t::dump(Formatter *f) const +{ + f->dump_unsigned("nid", nid); + f->dump_unsigned("size", size); + f->open_object_section("attrs"); + for (auto p = attrs.begin(); p != attrs.end(); ++p) { + f->open_object_section("attr"); + f->dump_string("name", p->first.c_str()); // it's not quite std::string + f->dump_unsigned("len", p->second.length()); + f->close_section(); + } + f->close_section(); + f->dump_string("flags", get_flags_string()); + f->open_array_section("extent_map_shards"); + for (auto si : extent_map_shards) { + f->dump_object("shard", si); + } + f->close_section(); + f->dump_unsigned("expected_object_size", expected_object_size); + f->dump_unsigned("expected_write_size", expected_write_size); + f->dump_unsigned("alloc_hint_flags", alloc_hint_flags); +} + +void bluestore_onode_t::generate_test_instances(list<bluestore_onode_t*>& o) +{ + o.push_back(new bluestore_onode_t()); + // FIXME +} + +// bluestore_deferred_op_t + +void bluestore_deferred_op_t::dump(Formatter *f) const +{ + f->dump_unsigned("op", (int)op); + f->dump_unsigned("data_len", data.length()); + f->open_array_section("extents"); + for (auto& e : extents) { + f->dump_object("extent", e); + } + f->close_section(); +} + +void bluestore_deferred_op_t::generate_test_instances(list<bluestore_deferred_op_t*>& o) +{ + o.push_back(new 
bluestore_deferred_op_t); + o.push_back(new bluestore_deferred_op_t); + o.back()->op = OP_WRITE; + o.back()->extents.push_back(bluestore_pextent_t(1, 2)); + o.back()->extents.push_back(bluestore_pextent_t(100, 5)); + o.back()->data.append("my data"); +} + +void bluestore_deferred_transaction_t::dump(Formatter *f) const +{ + f->dump_unsigned("seq", seq); + f->open_array_section("ops"); + for (list<bluestore_deferred_op_t>::const_iterator p = ops.begin(); p != ops.end(); ++p) { + f->dump_object("op", *p); + } + f->close_section(); + + f->open_array_section("released extents"); + for (interval_set<uint64_t>::const_iterator p = released.begin(); p != released.end(); ++p) { + f->open_object_section("extent"); + f->dump_unsigned("offset", p.get_start()); + f->dump_unsigned("length", p.get_len()); + f->close_section(); + } + f->close_section(); +} + +void bluestore_deferred_transaction_t::generate_test_instances(list<bluestore_deferred_transaction_t*>& o) +{ + o.push_back(new bluestore_deferred_transaction_t()); + o.push_back(new bluestore_deferred_transaction_t()); + o.back()->seq = 123; + o.back()->ops.push_back(bluestore_deferred_op_t()); + o.back()->ops.push_back(bluestore_deferred_op_t()); + o.back()->ops.back().op = bluestore_deferred_op_t::OP_WRITE; + o.back()->ops.back().extents.push_back(bluestore_pextent_t(1,7)); + o.back()->ops.back().data.append("foodata"); +} + +void bluestore_compression_header_t::dump(Formatter *f) const +{ + f->dump_unsigned("type", type); + f->dump_unsigned("length", length); +} + +void bluestore_compression_header_t::generate_test_instances( + list<bluestore_compression_header_t*>& o) +{ + o.push_back(new bluestore_compression_header_t); + o.push_back(new bluestore_compression_header_t(1)); + o.back()->length = 1234; +} diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h new file mode 100644 index 00000000..8232801c --- /dev/null +++ b/src/os/bluestore/bluestore_types.h @@ -0,0 +1,1044 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H +#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H + +#include <ostream> +#include <bitset> +#include <type_traits> +#include "include/types.h" +#include "include/interval_set.h" +#include "include/utime.h" +#include "common/hobject.h" +#include "compressor/Compressor.h" +#include "common/Checksummer.h" +#include "include/mempool.h" + +namespace ceph { + class Formatter; +} + +/// label for block device +struct bluestore_bdev_label_t { + uuid_d osd_uuid; ///< osd uuid + uint64_t size = 0; ///< device size + utime_t btime; ///< birth time + string description; ///< device description + + map<string,string> meta; ///< {read,write}_meta() content from ObjectStore + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_bdev_label_t*>& o); +}; +WRITE_CLASS_ENCODER(bluestore_bdev_label_t) + +ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l); + +/// collection metadata +struct bluestore_cnode_t { + uint32_t bits; ///< how many bits of coll pgid are significant + + explicit bluestore_cnode_t(int b=0) : bits(b) {} + + DENC(bluestore_cnode_t, v, p) { + DENC_START(1, 1, p); + denc(v.bits, p); + DENC_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_cnode_t*>& o); +}; +WRITE_CLASS_DENC(bluestore_cnode_t) + +ostream& operator<<(ostream& out, const bluestore_cnode_t& l); + +template <typename OFFS_TYPE, typename LEN_TYPE> +struct bluestore_interval_t +{ + static const uint64_t INVALID_OFFSET = ~0ull; + + OFFS_TYPE offset = 0; + LEN_TYPE length = 0; + + bluestore_interval_t(){} + bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {} + + bool is_valid() const { + return offset != INVALID_OFFSET; + } + uint64_t end() const { + return offset != INVALID_OFFSET ? 
offset + length : INVALID_OFFSET; + } + + bool operator==(const bluestore_interval_t& other) const { + return offset == other.offset && length == other.length; + } + +}; + +/// pextent: physical extent +struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t> +{ + bluestore_pextent_t() {} + bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {} + bluestore_pextent_t(const bluestore_interval_t &ext) : + bluestore_interval_t(ext.offset, ext.length) {} + + DENC(bluestore_pextent_t, v, p) { + denc_lba(v.offset, p); + denc_varint_lowz(v.length, p); + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_pextent_t*>& ls); +}; +WRITE_CLASS_DENC(bluestore_pextent_t) + +ostream& operator<<(ostream& out, const bluestore_pextent_t& o); + +typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector; + +template<> +struct denc_traits<PExtentVector> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const PExtentVector& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const PExtentVector& v, + bufferlist::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(PExtentVector& v, bufferptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.resize(num); + for (unsigned i=0; i<num; ++i) { + denc(v[i], p); + } + } +}; + +/// extent_map: a map of reference counted extents +struct bluestore_extent_ref_map_t { + struct record_t { + uint32_t length; + uint32_t refs; + record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {} + DENC(bluestore_extent_ref_map_t::record_t, v, p) { + denc_varint_lowz(v.length, p); + denc_varint(v.refs, p); + } + }; + + typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t; + map_t ref_map; + + void _check() const; + void _maybe_merge_left(map_t::iterator& p); + + void clear() { + ref_map.clear(); + } + bool empty() const { + return ref_map.empty(); + } + + void get(uint64_t offset, uint32_t len); + void put(uint64_t offset, uint32_t len, PExtentVector *release, + bool *maybe_unshared); + + bool contains(uint64_t offset, uint32_t len) const; + bool intersects(uint64_t offset, uint32_t len) const; + + void bound_encode(size_t& p) const { + denc_varint((uint32_t)0, p); + if (!ref_map.empty()) { + size_t elem_size = 0; + denc_varint_lowz((uint64_t)0, elem_size); + ref_map.begin()->second.bound_encode(elem_size); + p += elem_size * ref_map.size(); + } + } + void encode(bufferlist::contiguous_appender& p) const { + const uint32_t n = ref_map.size(); + denc_varint(n, p); + if (n) { + auto i = ref_map.begin(); + denc_varint_lowz(i->first, p); + i->second.encode(p); + int64_t pos = i->first; + while (++i != ref_map.end()) { + denc_varint_lowz((int64_t)i->first - pos, p); + i->second.encode(p); + pos = i->first; + } + } + } + void decode(bufferptr::const_iterator& p) { + uint32_t n; + denc_varint(n, p); + if (n) { + int64_t pos; + denc_varint_lowz(pos, p); + ref_map[pos].decode(p); + while (--n) { + int64_t delta; + denc_varint_lowz(delta, p); + pos += delta; + ref_map[pos].decode(p); + } + } + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_extent_ref_map_t*>& o); +}; 
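+// Illustrative usage sketch (editorial comment; the values are hypothetical, not
+// taken from production code). Overlapping get() calls split records so that each
+// record holds a single refcount, and put() releases extents whose count reaches 0:
+//
+//   bluestore_extent_ref_map_t m;
+//   m.get(0x0, 0x1000);        // ref_map(0x0~1000=1)
+//   m.get(0x800, 0x1000);      // ref_map(0x0~800=1,0x800~800=2,0x1000~800=1)
+//   PExtentVector released;
+//   bool maybe_unshared;
+//   m.put(0x800, 0x1000, &released, &maybe_unshared);
+//   // -> releases 0x1000~800 and merges back to ref_map(0x0~1000=1)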
+WRITE_CLASS_DENC(bluestore_extent_ref_map_t) + + +ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& rm); +static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l, + const bluestore_extent_ref_map_t::record_t& r) { + return l.length == r.length && l.refs == r.refs; +} +static inline bool operator==(const bluestore_extent_ref_map_t& l, + const bluestore_extent_ref_map_t& r) { + return l.ref_map == r.ref_map; +} +static inline bool operator!=(const bluestore_extent_ref_map_t& l, + const bluestore_extent_ref_map_t& r) { + return !(l == r); +} + +/// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage +struct bluestore_blob_use_tracker_t { + // N.B.: There is no need to minimize au_size/num_au + // as much as possible (e.g. have just a single byte for au_size) since: + // 1) Struct isn't packed hence it's padded. And even if it's packed see 2) + // 2) Mem manager has its own granularity, most probably >= 8 bytes + // + uint32_t au_size; // Allocation (=tracking) unit size, + // == 0 if uninitialized + uint32_t num_au; // Amount of allocation units tracked + // == 0 if single unit or the whole blob is tracked + + union { + uint32_t* bytes_per_au; + uint32_t total_bytes; + }; + + bluestore_blob_use_tracker_t() + : au_size(0), num_au(0), bytes_per_au(nullptr) { + } + ~bluestore_blob_use_tracker_t() { + clear(); + } + + void clear() { + if (num_au != 0) { + delete[] bytes_per_au; + mempool::get_pool( + mempool::pool_index_t(mempool::mempool_bluestore_cache_other)). + adjust_count(-1, -sizeof(uint32_t) * num_au); + } + bytes_per_au = 0; + au_size = 0; + num_au = 0; + } + + uint32_t get_referenced_bytes() const { + uint32_t total = 0; + if (!num_au) { + total = total_bytes; + } else { + for (size_t i = 0; i < num_au; ++i) { + total += bytes_per_au[i]; + } + } + return total; + } + bool is_not_empty() const { + if (!num_au) { + return total_bytes != 0; + } else { + for (size_t i = 0; i < num_au; ++i) { + if (bytes_per_au[i]) { + return true; + } + } + } + return false; + } + bool is_empty() const { + return !is_not_empty(); + } + void prune_tail(uint32_t new_len) { + if (num_au) { + new_len = round_up_to(new_len, au_size); + uint32_t _num_au = new_len / au_size; + ceph_assert(_num_au <= num_au); + if (_num_au) { + num_au = _num_au; // bytes_per_au array is left unmodified + + } else { + clear(); + } + } + } + void add_tail(uint32_t new_len, uint32_t _au_size) { + auto full_size = au_size * (num_au ? num_au : 1); + ceph_assert(new_len >= full_size); + if (new_len == full_size) { + return; + } + if (!num_au) { + uint32_t old_total = total_bytes; + total_bytes = 0; + init(new_len, _au_size); + ceph_assert(num_au); + bytes_per_au[0] = old_total; + } else { + ceph_assert(_au_size == au_size); + new_len = round_up_to(new_len, au_size); + uint32_t _num_au = new_len / au_size; + ceph_assert(_num_au >= num_au); + if (_num_au > num_au) { + auto old_bytes = bytes_per_au; + auto old_num_au = num_au; + num_au = _num_au; + allocate(); + for (size_t i = 0; i < old_num_au; i++) { + bytes_per_au[i] = old_bytes[i]; + } + for (size_t i = old_num_au; i < num_au; i++) { + bytes_per_au[i] = 0; + } + delete[] old_bytes; + } + } + } + + void init( + uint32_t full_length, + uint32_t _au_size); + + void get( + uint32_t offset, + uint32_t len); + + /// put: return true if the blob has no references any more after the call, + /// no release_units is filled for the sake of performance. 
+ /// return false if there are some references to the blob, + /// in this case release_units contains pextents + /// (identified by their offsets relative to the blob start) + /// that are not used any more and can be safely deallocated. + bool put( + uint32_t offset, + uint32_t len, + PExtentVector *release); + + bool can_split() const; + bool can_split_at(uint32_t blob_offset) const; + void split( + uint32_t blob_offset, + bluestore_blob_use_tracker_t* r); + + bool equal( + const bluestore_blob_use_tracker_t& other) const; + + void bound_encode(size_t& p) const { + denc_varint(au_size, p); + if (au_size) { + denc_varint(num_au, p); + if (!num_au) { + denc_varint(total_bytes, p); + } else { + size_t elem_size = 0; + denc_varint((uint32_t)0, elem_size); + p += elem_size * num_au; + } + } + } + void encode(bufferlist::contiguous_appender& p) const { + denc_varint(au_size, p); + if (au_size) { + denc_varint(num_au, p); + if (!num_au) { + denc_varint(total_bytes, p); + } else { + size_t elem_size = 0; + denc_varint((uint32_t)0, elem_size); + for (size_t i = 0; i < num_au; ++i) { + denc_varint(bytes_per_au[i], p); + } + } + } + } + void decode(bufferptr::const_iterator& p) { + clear(); + denc_varint(au_size, p); + if (au_size) { + denc_varint(num_au, p); + if (!num_au) { + denc_varint(total_bytes, p); + } else { + allocate(); + for (size_t i = 0; i < num_au; ++i) { + denc_varint(bytes_per_au[i], p); + } + } + } + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_blob_use_tracker_t*>& o); +private: + void allocate(); +}; +WRITE_CLASS_DENC(bluestore_blob_use_tracker_t) +ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& rm); + +/// blob: a piece of data on disk +struct bluestore_blob_t { +private: + PExtentVector extents; ///< raw data position on device + uint32_t logical_length = 0; ///< original length of data stored in the blob + uint32_t compressed_length = 0; ///< compressed length if any + +public: + enum { + LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split + FLAG_COMPRESSED = 2, ///< blob is compressed + FLAG_CSUM = 4, ///< blob has checksums + FLAG_HAS_UNUSED = 8, ///< blob has unused map + FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob + }; + static string get_flags_string(unsigned flags); + + uint32_t flags = 0; ///< FLAG_* + + typedef uint16_t unused_t; + unused_t unused = 0; ///< portion that has never been written to (bitmap) + + uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_* + uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes + + bufferptr csum_data; ///< opaque vector of csum data + + bluestore_blob_t(uint32_t f = 0) : flags(f) {} + + const PExtentVector& get_extents() const { + return extents; + } + PExtentVector& dirty_extents() { + return extents; + } + + DENC_HELPERS; + void bound_encode(size_t& p, uint64_t struct_v) const { + ceph_assert(struct_v == 1 || struct_v == 2); + denc(extents, p); + denc_varint(flags, p); + denc_varint_lowz(logical_length, p); + denc_varint_lowz(compressed_length, p); + denc(csum_type, p); + denc(csum_chunk_order, p); + denc_varint(csum_data.length(), p); + p += csum_data.length(); + p += sizeof(unused_t); + } + + void encode(bufferlist::contiguous_appender& p, uint64_t struct_v) const { + ceph_assert(struct_v == 1 || struct_v == 2); + denc(extents, p); + denc_varint(flags, p); + if (is_compressed()) { + denc_varint_lowz(logical_length, p); + denc_varint_lowz(compressed_length, p); + } + if (has_csum()) { + 
denc(csum_type, p); + denc(csum_chunk_order, p); + denc_varint(csum_data.length(), p); + memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(), + csum_data.length()); + } + if (has_unused()) { + denc(unused, p); + } + } + + void decode(bufferptr::const_iterator& p, uint64_t struct_v) { + ceph_assert(struct_v == 1 || struct_v == 2); + denc(extents, p); + denc_varint(flags, p); + if (is_compressed()) { + denc_varint_lowz(logical_length, p); + denc_varint_lowz(compressed_length, p); + } else { + logical_length = get_ondisk_length(); + } + if (has_csum()) { + denc(csum_type, p); + denc(csum_chunk_order, p); + int len; + denc_varint(len, p); + csum_data = p.get_ptr(len); + csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other); + } + if (has_unused()) { + denc(unused, p); + } + } + + bool can_split() const { + return + !has_flag(FLAG_SHARED) && + !has_flag(FLAG_COMPRESSED) && + !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex + } + bool can_split_at(uint32_t blob_offset) const { + return !has_csum() || blob_offset % get_csum_chunk_size() == 0; + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_blob_t*>& ls); + + bool has_flag(unsigned f) const { + return flags & f; + } + void set_flag(unsigned f) { + flags |= f; + } + void clear_flag(unsigned f) { + flags &= ~f; + } + string get_flags_string() const { + return get_flags_string(flags); + } + + void set_compressed(uint64_t clen_orig, uint64_t clen) { + set_flag(FLAG_COMPRESSED); + logical_length = clen_orig; + compressed_length = clen; + } + bool is_mutable() const { + return !is_compressed() && !is_shared(); + } + bool is_compressed() const { + return has_flag(FLAG_COMPRESSED); + } + bool has_csum() const { + return has_flag(FLAG_CSUM); + } + bool has_unused() const { + return has_flag(FLAG_HAS_UNUSED); + } + bool is_shared() const { + return has_flag(FLAG_SHARED); + } + + /// return chunk (i.e. min readable block) size for the blob + uint64_t get_chunk_size(uint64_t dev_block_size) const { + return has_csum() ? + std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size; + } + uint32_t get_csum_chunk_size() const { + return 1 << csum_chunk_order; + } + uint32_t get_compressed_payload_length() const { + return is_compressed() ? compressed_length : 0; + } + uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const { + auto p = extents.begin(); + ceph_assert(p != extents.end()); + while (x_off >= p->length) { + x_off -= p->length; + ++p; + ceph_assert(p != extents.end()); + } + if (plen) + *plen = p->length - x_off; + return p->offset + x_off; + } + + // validate whether or not the status of pextents within the given range + // meets the requirement(allocated or unallocated). 
+ bool _validate_range(uint64_t b_off, uint64_t b_len, + bool require_allocated) const { + auto p = extents.begin(); + ceph_assert(p != extents.end()); + while (b_off >= p->length) { + b_off -= p->length; + ++p; + ceph_assert(p != extents.end()); + } + b_len += b_off; + while (b_len) { + ceph_assert(p != extents.end()); + if (require_allocated != p->is_valid()) { + return false; + } + + if (p->length >= b_len) { + return true; + } + b_len -= p->length; + ++p; + } + ceph_abort_msg("we should not get here"); + return false; + } + + /// return true if the entire range is allocated + /// (mapped to extents on disk) + bool is_allocated(uint64_t b_off, uint64_t b_len) const { + return _validate_range(b_off, b_len, true); + } + + /// return true if the entire range is unallocated + /// (not mapped to extents on disk) + bool is_unallocated(uint64_t b_off, uint64_t b_len) const { + return _validate_range(b_off, b_len, false); + } + + /// return true if the logical range has never been used + bool is_unused(uint64_t offset, uint64_t length) const { + if (!has_unused()) { + return false; + } + ceph_assert(!is_compressed()); + uint64_t blob_len = get_logical_length(); + ceph_assert((blob_len % (sizeof(unused)*8)) == 0); + ceph_assert(offset + length <= blob_len); + uint64_t chunk_size = blob_len / (sizeof(unused)*8); + uint64_t start = offset / chunk_size; + uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size; + auto i = start; + while (i < end && (unused & (1u << i))) { + i++; + } + return i >= end; + } + + /// mark a range that has never been used + void add_unused(uint64_t offset, uint64_t length) { + ceph_assert(!is_compressed()); + uint64_t blob_len = get_logical_length(); + ceph_assert((blob_len % (sizeof(unused)*8)) == 0); + ceph_assert(offset + length <= blob_len); + uint64_t chunk_size = blob_len / (sizeof(unused)*8); + uint64_t start = round_up_to(offset, chunk_size) / chunk_size; + uint64_t end = (offset + length) / chunk_size; + for (auto i = start; i < end; ++i) { + unused |= (1u << i); + } + if (start != end) { + set_flag(FLAG_HAS_UNUSED); + } + } + + /// indicate that a range has (now) been used. 
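+  /// (The unused map splits the blob's logical length into sizeof(unused)*8 == 16
+  /// equal chunks, one bit per chunk; add_unused() only marks fully covered chunks,
+  /// while is_unused() and mark_used() consider every chunk overlapping the range.)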
+ void mark_used(uint64_t offset, uint64_t length) { + if (has_unused()) { + ceph_assert(!is_compressed()); + uint64_t blob_len = get_logical_length(); + ceph_assert((blob_len % (sizeof(unused)*8)) == 0); + ceph_assert(offset + length <= blob_len); + uint64_t chunk_size = blob_len / (sizeof(unused)*8); + uint64_t start = offset / chunk_size; + uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size; + for (auto i = start; i < end; ++i) { + unused &= ~(1u << i); + } + if (unused == 0) { + clear_flag(FLAG_HAS_UNUSED); + } + } + } + + template<class F> + int map(uint64_t x_off, uint64_t x_len, F&& f) const { + static_assert(std::is_invocable_r_v<int, F, uint64_t, uint64_t>); + + auto p = extents.begin(); + ceph_assert(p != extents.end()); + while (x_off >= p->length) { + x_off -= p->length; + ++p; + ceph_assert(p != extents.end()); + } + while (x_len > 0) { + ceph_assert(p != extents.end()); + uint64_t l = std::min(p->length - x_off, x_len); + int r = f(p->offset + x_off, l); + if (r < 0) + return r; + x_off = 0; + x_len -= l; + ++p; + } + return 0; + } + template<class F> + void map_bl(uint64_t x_off, + bufferlist& bl, + F&& f) const { + static_assert(std::is_invocable_v<F, uint64_t, bufferlist&>); + + auto p = extents.begin(); + ceph_assert(p != extents.end()); + while (x_off >= p->length) { + x_off -= p->length; + ++p; + ceph_assert(p != extents.end()); + } + bufferlist::iterator it = bl.begin(); + uint64_t x_len = bl.length(); + while (x_len > 0) { + ceph_assert(p != extents.end()); + uint64_t l = std::min(p->length - x_off, x_len); + bufferlist t; + it.copy(l, t); + f(p->offset + x_off, t); + x_off = 0; + x_len -= l; + ++p; + } + } + + uint32_t get_ondisk_length() const { + uint32_t len = 0; + for (auto &p : extents) { + len += p.length; + } + return len; + } + + uint32_t get_logical_length() const { + return logical_length; + } + size_t get_csum_value_size() const; + + size_t get_csum_count() const { + size_t vs = get_csum_value_size(); + if (!vs) + return 0; + return csum_data.length() / vs; + } + uint64_t get_csum_item(unsigned i) const { + size_t cs = get_csum_value_size(); + const char *p = csum_data.c_str(); + switch (cs) { + case 0: + ceph_abort_msg("no csum data, bad index"); + case 1: + return reinterpret_cast<const uint8_t*>(p)[i]; + case 2: + return reinterpret_cast<const ceph_le16*>(p)[i]; + case 4: + return reinterpret_cast<const ceph_le32*>(p)[i]; + case 8: + return reinterpret_cast<const ceph_le64*>(p)[i]; + default: + ceph_abort_msg("unrecognized csum word size"); + } + } + const char *get_csum_item_ptr(unsigned i) const { + size_t cs = get_csum_value_size(); + return csum_data.c_str() + (cs * i); + } + char *get_csum_item_ptr(unsigned i) { + size_t cs = get_csum_value_size(); + return csum_data.c_str() + (cs * i); + } + + void init_csum(unsigned type, unsigned order, unsigned len) { + flags |= FLAG_CSUM; + csum_type = type; + csum_chunk_order = order; + csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size()); + csum_data.zero(); + csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other); + } + + /// calculate csum for the buffer at the given b_off + void calc_csum(uint64_t b_off, const bufferlist& bl); + + /// verify csum: return -EOPNOTSUPP for unsupported checksum type; + /// return -1 and valid(nonnegative) b_bad_off for checksum error; + /// return 0 if all is well. 
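+  /// A hypothetical round trip (illustration only): after
+  /// init_csum(Checksummer::CSUM_CRC32C, 12, blob_len), a writer calls
+  /// calc_csum(b_off, bl) for the data it stores, and a reader calls
+  /// verify_csum(b_off, bl, &bad_off, &bad_csum) on what it read back,
+  /// treating a return of -1 as a corrupt chunk starting at bad_off.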
+ int verify_csum(uint64_t b_off, const bufferlist& bl, int* b_bad_off, + uint64_t *bad_csum) const; + + bool can_prune_tail() const { + return + extents.size() > 1 && // if it's all invalid it's not pruning. + !extents.back().is_valid() && + !has_unused(); + } + void prune_tail() { + const auto &p = extents.back(); + logical_length -= p.length; + extents.pop_back(); + if (has_csum()) { + bufferptr t; + t.swap(csum_data); + csum_data = bufferptr(t.c_str(), + get_logical_length() / get_csum_chunk_size() * + get_csum_value_size()); + } + } + void add_tail(uint32_t new_len) { + ceph_assert(is_mutable()); + ceph_assert(!has_unused()); + ceph_assert(new_len > logical_length); + extents.emplace_back( + bluestore_pextent_t( + bluestore_pextent_t::INVALID_OFFSET, + new_len - logical_length)); + logical_length = new_len; + if (has_csum()) { + bufferptr t; + t.swap(csum_data); + csum_data = buffer::create( + get_csum_value_size() * logical_length / get_csum_chunk_size()); + csum_data.copy_in(0, t.length(), t.c_str()); + csum_data.zero(t.length(), csum_data.length() - t.length()); + } + } + uint32_t get_release_size(uint32_t min_alloc_size) const { + if (is_compressed()) { + return get_logical_length(); + } + uint32_t res = get_csum_chunk_size(); + if (!has_csum() || res < min_alloc_size) { + res = min_alloc_size; + } + return res; + } + + void split(uint32_t blob_offset, bluestore_blob_t& rb); + void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs); + void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only + + /// updates blob's pextents container and return unused pextents eligible + /// for release. + /// all - indicates that the whole blob to be released. + /// logical - specifies set of logical extents within blob's + /// to be released + /// Returns true if blob has no more valid pextents + bool release_extents( + bool all, + const PExtentVector& logical, + PExtentVector* r); +}; +WRITE_CLASS_DENC_FEATURED(bluestore_blob_t) + +ostream& operator<<(ostream& out, const bluestore_blob_t& o); + + +/// shared blob state +struct bluestore_shared_blob_t { + MEMPOOL_CLASS_HELPERS(); + uint64_t sbid; ///> shared blob id + bluestore_extent_ref_map_t ref_map; ///< shared blob extents + + bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {} + bluestore_shared_blob_t(uint64_t _sbid, + bluestore_extent_ref_map_t&& _ref_map ) + : sbid(_sbid), ref_map(std::move(_ref_map)) {} + + DENC(bluestore_shared_blob_t, v, p) { + DENC_START(1, 1, p); + denc(v.ref_map, p); + DENC_FINISH(p); + } + + + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_shared_blob_t*>& ls); + + bool empty() const { + return ref_map.empty(); + } +}; +WRITE_CLASS_DENC(bluestore_shared_blob_t) + +ostream& operator<<(ostream& out, const bluestore_shared_blob_t& o); + +/// onode: per-object metadata +struct bluestore_onode_t { + uint64_t nid = 0; ///< numeric id (locally unique) + uint64_t size = 0; ///< object size + // mempool to be assigned to buffer::ptr manually + std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs; + + struct shard_info { + uint32_t offset = 0; ///< logical offset for start of shard + uint32_t bytes = 0; ///< encoded bytes + DENC(shard_info, v, p) { + denc_varint(v.offset, p); + denc_varint(v.bytes, p); + } + void dump(Formatter *f) const; + }; + vector<shard_info> extent_map_shards; ///< extent map shards (if any) + + uint32_t expected_object_size = 0; + uint32_t expected_write_size = 0; + uint32_t alloc_hint_flags = 0; + 
+ uint8_t flags = 0; + + enum { + FLAG_OMAP = 1, ///< object may have omap data + FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix + }; + + string get_flags_string() const { + string s; + if (flags & FLAG_OMAP) { + s = "omap"; + } + return s; + } + + bool has_flag(unsigned f) const { + return flags & f; + } + + void set_flag(unsigned f) { + flags |= f; + } + + void clear_flag(unsigned f) { + flags &= ~f; + } + + bool has_omap() const { + return has_flag(FLAG_OMAP); + } + bool is_pgmeta_omap() const { + return has_flag(FLAG_PGMETA_OMAP); + } + + void set_omap_flag() { + set_flag(FLAG_OMAP); + } + + void clear_omap_flag() { + clear_flag(FLAG_OMAP); + } + + DENC(bluestore_onode_t, v, p) { + DENC_START(1, 1, p); + denc_varint(v.nid, p); + denc_varint(v.size, p); + denc(v.attrs, p); + denc(v.flags, p); + denc(v.extent_map_shards, p); + denc_varint(v.expected_object_size, p); + denc_varint(v.expected_write_size, p); + denc_varint(v.alloc_hint_flags, p); + DENC_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_onode_t*>& o); +}; +WRITE_CLASS_DENC(bluestore_onode_t::shard_info) +WRITE_CLASS_DENC(bluestore_onode_t) + +ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si); + +/// writeahead-logged op +struct bluestore_deferred_op_t { + typedef enum { + OP_WRITE = 1, + } type_t; + __u8 op = 0; + + PExtentVector extents; + bufferlist data; + + DENC(bluestore_deferred_op_t, v, p) { + DENC_START(1, 1, p); + denc(v.op, p); + denc(v.extents, p); + denc(v.data, p); + DENC_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_deferred_op_t*>& o); +}; +WRITE_CLASS_DENC(bluestore_deferred_op_t) + + +/// writeahead-logged transaction +struct bluestore_deferred_transaction_t { + uint64_t seq = 0; + list<bluestore_deferred_op_t> ops; + interval_set<uint64_t> released; ///< allocations to release after tx + + bluestore_deferred_transaction_t() : seq(0) {} + + DENC(bluestore_deferred_transaction_t, v, p) { + DENC_START(1, 1, p); + denc(v.seq, p); + denc(v.ops, p); + denc(v.released, p); + DENC_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o); +}; +WRITE_CLASS_DENC(bluestore_deferred_transaction_t) + +struct bluestore_compression_header_t { + uint8_t type = Compressor::COMP_ALG_NONE; + uint32_t length = 0; + + bluestore_compression_header_t() {} + bluestore_compression_header_t(uint8_t _type) + : type(_type) {} + + DENC(bluestore_compression_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.length, p); + DENC_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<bluestore_compression_header_t*>& o); +}; +WRITE_CLASS_DENC(bluestore_compression_header_t) + + +#endif diff --git a/src/os/bluestore/ceph_aio.h b/src/os/bluestore/ceph_aio.h new file mode 100644 index 00000000..ab033886 --- /dev/null +++ b/src/os/bluestore/ceph_aio.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "acconfig.h" + +#if defined(HAVE_LIBAIO) +#include <libaio.h> +#elif defined(HAVE_POSIXAIO) +#include <aio.h> +#include <sys/event.h> +#endif + +#include <boost/intrusive/list.hpp> +#include <boost/container/small_vector.hpp> + +#include "include/buffer.h" +#include "include/types.h" + +struct aio_t { +#if defined(HAVE_LIBAIO) + struct iocb iocb{}; // must be first element; see 
shenanigans in aio_queue_t +#elif defined(HAVE_POSIXAIO) + // static long aio_listio_max = -1; + union { + struct aiocb aiocb; + struct aiocb *aiocbp; + } aio; + int n_aiocb; +#endif + void *priv; + int fd; + boost::container::small_vector<iovec,4> iov; + uint64_t offset, length; + long rval; + bufferlist bl; ///< write payload (so that it remains stable for duration) + + boost::intrusive::list_member_hook<> queue_item; + + aio_t(void *p, int f) : priv(p), fd(f), offset(0), length(0), rval(-1000) { + } + + void pwritev(uint64_t _offset, uint64_t len) { + offset = _offset; + length = len; +#if defined(HAVE_LIBAIO) + io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset); +#elif defined(HAVE_POSIXAIO) + n_aiocb = iov.size(); + aio.aiocbp = (struct aiocb*)calloc(iov.size(), sizeof(struct aiocb)); + for (int i = 0; i < iov.size(); i++) { + aio.aiocbp[i].aio_fildes = fd; + aio.aiocbp[i].aio_offset = offset; + aio.aiocbp[i].aio_buf = iov[i].iov_base; + aio.aiocbp[i].aio_nbytes = iov[i].iov_len; + aio.aiocbp[i].aio_lio_opcode = LIO_WRITE; + offset += iov[i].iov_len; + } +#endif + } + void pread(uint64_t _offset, uint64_t len) { + offset = _offset; + length = len; + bufferptr p = buffer::create_small_page_aligned(length); +#if defined(HAVE_LIBAIO) + io_prep_pread(&iocb, fd, p.c_str(), length, offset); +#elif defined(HAVE_POSIXAIO) + n_aiocb = 1; + aio.aiocb.aio_fildes = fd; + aio.aiocb.aio_buf = p.c_str(); + aio.aiocb.aio_nbytes = length; + aio.aiocb.aio_offset = offset; +#endif + bl.append(std::move(p)); + } + + long get_return_value() { + return rval; + } +}; + +std::ostream& operator<<(std::ostream& os, const aio_t& aio); + +typedef boost::intrusive::list< + aio_t, + boost::intrusive::member_hook< + aio_t, + boost::intrusive::list_member_hook<>, + &aio_t::queue_item> > aio_list_t; + +struct aio_queue_t { + int max_iodepth; +#if defined(HAVE_LIBAIO) + io_context_t ctx; +#elif defined(HAVE_POSIXAIO) + int ctx; +#endif + + typedef list<aio_t>::iterator aio_iter; + + explicit aio_queue_t(unsigned max_iodepth) + : max_iodepth(max_iodepth), + ctx(0) { + } + ~aio_queue_t() { + ceph_assert(ctx == 0); + } + + int init() { + ceph_assert(ctx == 0); +#if defined(HAVE_LIBAIO) + int r = io_setup(max_iodepth, &ctx); + if (r < 0) { + if (ctx) { + io_destroy(ctx); + ctx = 0; + } + } + return r; +#elif defined(HAVE_POSIXAIO) + ctx = kqueue(); + if (ctx < 0) + return -errno; + else + return 0; +#endif + } + void shutdown() { + if (ctx) { +#if defined(HAVE_LIBAIO) + int r = io_destroy(ctx); +#elif defined(HAVE_POSIXAIO) + int r = close(ctx); +#endif + ceph_assert(r == 0); + ctx = 0; + } + } + + int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size, + void *priv, int *retries); + int get_next_completed(int timeout_ms, aio_t **paio, int max); +}; diff --git a/src/os/bluestore/fastbmap_allocator_impl.cc b/src/os/bluestore/fastbmap_allocator_impl.cc new file mode 100755 index 00000000..c8909655 --- /dev/null +++ b/src/os/bluestore/fastbmap_allocator_impl.cc @@ -0,0 +1,717 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Bitmap based in-memory allocator implementation. 
+ * Author: Igor Fedotov, ifedotov@suse.com + * + */ + +#include "fastbmap_allocator_impl.h" + +uint64_t AllocatorLevel::l0_dives = 0; +uint64_t AllocatorLevel::l0_iterations = 0; +uint64_t AllocatorLevel::l0_inner_iterations = 0; +uint64_t AllocatorLevel::alloc_fragments = 0; +uint64_t AllocatorLevel::alloc_fragments_fast = 0; +uint64_t AllocatorLevel::l2_allocs = 0; + +inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length) +{ + interval_t res; + if (len >= min_length) { + res.offset = p2roundup(offset, min_length); + auto delta_off = res.offset - offset; + if (len > delta_off) { + res.length = len - delta_off; + res.length = p2align<uint64_t>(res.length, min_length); + if (res.length) { + return res; + } + } + } + return interval_t(); +} + +interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0, + uint64_t pos1, uint64_t min_length, interval_t* tail) const +{ + interval_t res; + if (pos0 >= pos1) { + return res; + } + auto pos = pos0; + + interval_t res_candidate; + if (tail->length != 0) { + ceph_assert((tail->offset % l0_granularity) == 0); + ceph_assert((tail->length % l0_granularity) == 0); + res_candidate.offset = tail->offset / l0_granularity; + res_candidate.length = tail->length / l0_granularity; + } + *tail = interval_t(); + + auto d = bits_per_slot; + slot_t bits = l0[pos / d]; + bits >>= pos % d; + bool end_loop = false; + auto min_granules = min_length / l0_granularity; + + do { + if ((pos % d) == 0) { + bits = l0[pos / d]; + if (pos1 - pos >= d) { + switch(bits) { + case all_slot_set: + // slot is totally free + if (!res_candidate.length) { + res_candidate.offset = pos; + } + res_candidate.length += d; + pos += d; + end_loop = pos >= pos1; + if (end_loop) { + *tail = res_candidate; + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if(res.length < res_candidate.length) { + res = res_candidate; + } + } + continue; + case all_slot_clear: + // slot is totally allocated + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if (res.length < res_candidate.length) { + res = res_candidate; + } + res_candidate = interval_t(); + pos += d; + end_loop = pos >= pos1; + continue; + } + } + } //if ((pos % d) == 0) + + end_loop = ++pos >= pos1; + if (bits & 1) { + // item is free + if (!res_candidate.length) { + res_candidate.offset = pos - 1; + } + ++res_candidate.length; + if (end_loop) { + *tail = res_candidate; + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if (res.length < res_candidate.length) { + res = res_candidate; + } + } + } else { + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if (res.length < res_candidate.length) { + res = res_candidate; + } + res_candidate = interval_t(); + } + bits >>= 1; + } while (!end_loop); + res.offset *= l0_granularity; + res.length *= l0_granularity; + tail->offset *= l0_granularity; + tail->length *= l0_granularity; + return res; +} + +void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start, + uint64_t pos_end, uint64_t length, uint64_t min_length, int mode, + search_ctx_t* ctx) +{ + auto d = L1_ENTRIES_PER_SLOT; + ceph_assert((pos_start % d) == 0); + ceph_assert((pos_end % d) == 0); + + uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT; + + uint64_t l1_pos = pos_start; + const interval_t empty_tail; + interval_t prev_tail; + + uint64_t next_free_l1_pos = 0; + for (auto pos = pos_start / d; pos < pos_end / d; ++pos) { + 
slot_t slot_val = l1[pos]; + // FIXME minor: code below can be optimized to check slot_val against + // all_slot_set(_clear) value + + for (auto c = 0; c < d; c++) { + switch (slot_val & L1_ENTRY_MASK) { + case L1_ENTRY_FREE: + prev_tail = empty_tail; + if (!ctx->free_count) { + ctx->free_l1_pos = l1_pos; + } else if (l1_pos != next_free_l1_pos){ + auto o = ctx->free_l1_pos * l1_granularity; + auto l = ctx->free_count * l1_granularity; + // check if already found extent fits min_length after alignment + if (_align2units(o, l, min_length).length >= min_length) { + break; + } + // if not - proceed with the next one + ctx->free_l1_pos = l1_pos; + ctx->free_count = 0; + } + next_free_l1_pos = l1_pos + 1; + ++ctx->free_count; + if (mode == STOP_ON_EMPTY) { + return; + } + break; + case L1_ENTRY_FULL: + prev_tail = empty_tail; + break; + case L1_ENTRY_PARTIAL: + interval_t longest; + ++ctx->partial_count; + + longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail); + + if (longest.length >= length) { + if ((ctx->affordable_len == 0) || + ((ctx->affordable_len != 0) && + (longest.length < ctx->affordable_len))) { + ctx->affordable_len = longest.length; + ctx->affordable_offs = longest.offset; + } + } + if (longest.length >= min_length && + (ctx->min_affordable_len == 0 || + (longest.length < ctx->min_affordable_len))) { + + ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length); + ctx->min_affordable_offs = longest.offset; + } + if (mode == STOP_ON_PARTIAL) { + return; + } + break; + } + slot_val >>= L1_ENTRY_WIDTH; + ++l1_pos; + } + } + ctx->fully_processed = true; +} + +void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end) +{ + if (l0_pos == l0_pos_end) { + return; + } + auto d0 = bits_per_slotset; + uint64_t l1_w = L1_ENTRIES_PER_SLOT; + // this should be aligned with slotset boundaries + ceph_assert(0 == (l0_pos % d0)); + ceph_assert(0 == (l0_pos_end % d0)); + + int64_t idx = l0_pos / bits_per_slot; + int64_t idx_end = l0_pos_end / bits_per_slot; + slot_t mask_to_apply = L1_ENTRY_NOT_USED; + + auto l1_pos = l0_pos / d0; + + while (idx < idx_end) { + if (l0[idx] == all_slot_clear) { + // if not all prev slots are allocated then no need to check the + // current slot set, it's partial + ++idx; + if (mask_to_apply == L1_ENTRY_NOT_USED) { + mask_to_apply = L1_ENTRY_FULL; + } else if (mask_to_apply != L1_ENTRY_FULL) { + idx = p2roundup(idx, int64_t(slots_per_slotset)); + mask_to_apply = L1_ENTRY_PARTIAL; + } + } else if (l0[idx] == all_slot_set) { + // if not all prev slots are free then no need to check the + // current slot set, it's partial + ++idx; + if (mask_to_apply == L1_ENTRY_NOT_USED) { + mask_to_apply = L1_ENTRY_FREE; + } else if (mask_to_apply != L1_ENTRY_FREE) { + idx = p2roundup(idx, int64_t(slots_per_slotset)); + mask_to_apply = L1_ENTRY_PARTIAL; + } + } else { + // no need to check the current slot set, it's partial + mask_to_apply = L1_ENTRY_PARTIAL; + ++idx; + idx = p2roundup(idx, int64_t(slots_per_slotset)); + } + if ((idx % slots_per_slotset) == 0) { + ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED); + uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH; + slot_t& slot_val = l1[l1_pos / l1_w]; + auto mask = slot_t(L1_ENTRY_MASK) << shift; + + slot_t old_mask = (slot_val & mask) >> shift; + switch(old_mask) { + case L1_ENTRY_FREE: + unalloc_l1_count--; + break; + case L1_ENTRY_PARTIAL: + partial_l1_count--; + break; + } + slot_val &= ~mask; + slot_val |= slot_t(mask_to_apply) << shift; + 
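
The switch above reads two-bit L1 entries out of a 64-bit slot; each entry summarizes one L0 slotset as FULL, PARTIAL or FREE. A standalone sketch of that packing, with the constant values copied from the enum in fastbmap_allocator_impl.h later in this patch:

#include <cstdint>
#include <cassert>

using slot_t = uint64_t;
constexpr unsigned L1_ENTRY_WIDTH = 2;
constexpr slot_t   L1_ENTRY_MASK  = (slot_t(1) << L1_ENTRY_WIDTH) - 1;  // 0b11
constexpr slot_t   L1_ENTRY_FULL = 0, L1_ENTRY_PARTIAL = 1, L1_ENTRY_FREE = 3;
constexpr unsigned L1_ENTRIES_PER_SLOT = 64 / L1_ENTRY_WIDTH;
static_assert(L1_ENTRIES_PER_SLOT == 32, "one slot summarizes 32 slotsets");

static slot_t get_entry(slot_t slot, unsigned pos)
{
  return (slot >> (pos * L1_ENTRY_WIDTH)) & L1_ENTRY_MASK;
}

static void set_entry(slot_t& slot, unsigned pos, slot_t v)
{
  const unsigned shift = pos * L1_ENTRY_WIDTH;
  slot = (slot & ~(L1_ENTRY_MASK << shift)) | (v << shift);
}

int main()
{
  slot_t slot = ~slot_t(0);                       // all 32 entries FREE (0b11)
  assert(get_entry(slot, 5) == L1_ENTRY_FREE);
  set_entry(slot, 5, L1_ENTRY_PARTIAL);           // some of its L0 bits allocated
  assert(get_entry(slot, 5) == L1_ENTRY_PARTIAL);
  set_entry(slot, 5, L1_ENTRY_FULL);              // fully allocated slotset
  assert(get_entry(slot, 5) == L1_ENTRY_FULL);
  return 0;
}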
switch(mask_to_apply) { + case L1_ENTRY_FREE: + unalloc_l1_count++; + break; + case L1_ENTRY_PARTIAL: + partial_l1_count++; + break; + } + mask_to_apply = L1_ENTRY_NOT_USED; + ++l1_pos; + } + } +} + +void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start, + int64_t l0_pos_end) +{ + auto d0 = L0_ENTRIES_PER_SLOT; + + int64_t pos = l0_pos_start; + slot_t bits = (slot_t)1 << (l0_pos_start % d0); + slot_t* val_s = &l0[pos / d0]; + int64_t pos_e = std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start + 1, d0)); + while (pos < pos_e) { + (*val_s) &= ~bits; + bits <<= 1; + pos++; + } + pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0)); + while (pos < pos_e) { + *(++val_s) = all_slot_clear; + pos += d0; + } + bits = 1; + ++val_s; + while (pos < l0_pos_end) { + (*val_s) &= ~bits; + bits <<= 1; + pos++; + } +} + +interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t pos_start, uint64_t pos_end) +{ + interval_t res = { 0, 0 }; + uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT; + + if (unlikely(length <= l0_granularity)) { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity, + STOP_ON_PARTIAL, &ctx); + + // check partially free slot sets first (including neighboring), + // full length match required. + if (ctx.affordable_len) { + // allocate as specified + ceph_assert(ctx.affordable_len >= length); + auto pos = ctx.affordable_offs / l0_granularity; + _mark_alloc_l1_l0(pos, pos + 1); + res = interval_t(ctx.affordable_offs, length); + return res; + } + + // allocate from free slot sets + if (ctx.free_count) { + auto l = std::min(length, ctx.free_count * l1_granularity); + ceph_assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + return res; + } + } else if (unlikely(length == l1_granularity)) { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx); + + // allocate using contiguous extent found at l1 if any + if (ctx.free_count) { + + auto l = std::min(length, ctx.free_count * l1_granularity); + ceph_assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + + return res; + } + + // we can terminate earlier on free entry only + ceph_assert(ctx.fully_processed); + + // check partially free slot sets first (including neighboring), + // full length match required. 
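
_mark_alloc_l0() above clears a run of L0 bits using the usual head / whole-word / tail split, so that fully covered slots are written with a single store. A self-contained sketch of the same pattern on a plain std::vector<uint64_t> bitmap (set bit = free, as in the allocator); the loop bounds are simplified relative to the original:

#include <cstdint>
#include <vector>
#include <cassert>

static void clear_bits(std::vector<uint64_t>& bm, int64_t pos, int64_t end)
{
  const int64_t d = 64;
  // head: partial leading word, bit by bit
  while (pos < end && (pos % d) != 0) {
    bm[pos / d] &= ~(uint64_t(1) << (pos % d));
    ++pos;
  }
  // body: whole words at once
  while (end - pos >= d) {
    bm[pos / d] = 0;                             // all_slot_clear
    pos += d;
  }
  // tail: partial trailing word
  while (pos < end) {
    bm[pos / d] &= ~(uint64_t(1) << (pos % d));
    ++pos;
  }
}

int main()
{
  std::vector<uint64_t> bm(4, ~uint64_t(0));     // 256 bits, all free
  clear_bits(bm, 60, 200);                       // allocate bits [60, 200)
  assert(bm[0] == 0x0fffffffffffffffULL);        // bits 60..63 cleared
  assert(bm[1] == 0 && bm[2] == 0);              // fully allocated words
  assert(bm[3] == (~uint64_t(0) << 8));          // bits 192..199 cleared
  return 0;
}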
+ if (ctx.affordable_len) { + ceph_assert(ctx.affordable_len >= length); + ceph_assert((length % l0_granularity) == 0); + auto pos_start = ctx.affordable_offs / l0_granularity; + auto pos_end = (ctx.affordable_offs + length) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + res = interval_t(ctx.affordable_offs, length); + return res; + } + if (ctx.min_affordable_len) { + auto pos_start = ctx.min_affordable_offs / l0_granularity; + auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len); + } + } else { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx); + ceph_assert(ctx.fully_processed); + // check partially free slot sets first (including neighboring), + // full length match required. + if (ctx.affordable_len) { + ceph_assert(ctx.affordable_len >= length); + ceph_assert((length % l0_granularity) == 0); + auto pos_start = ctx.affordable_offs / l0_granularity; + auto pos_end = (ctx.affordable_offs + length) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + res = interval_t(ctx.affordable_offs, length); + return res; + } + // allocate using contiguous extent found at l1 if affordable + // align allocated extent with min_length + if (ctx.free_count) { + auto o = ctx.free_l1_pos * l1_granularity; + auto l = ctx.free_count * l1_granularity; + interval_t aligned_extent = _align2units(o, l, min_length); + if (aligned_extent.length > 0) { + aligned_extent.length = std::min(length, + uint64_t(aligned_extent.length)); + ceph_assert((aligned_extent.offset % l0_granularity) == 0); + ceph_assert((aligned_extent.length % l0_granularity) == 0); + + auto pos_start = aligned_extent.offset / l0_granularity; + auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity; + + _mark_alloc_l1_l0(pos_start, pos_end); + return aligned_extent; + } + } + if (ctx.min_affordable_len) { + auto pos_start = ctx.min_affordable_offs / l0_granularity; + auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len); + } + } + return res; +} + +bool AllocatorLevel01Loose::_allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l1_pos_start, uint64_t l1_pos_end, + uint64_t* allocated, + interval_vector_t* res) +{ + uint64_t d0 = L0_ENTRIES_PER_SLOT; + uint64_t d1 = L1_ENTRIES_PER_SLOT; + + ceph_assert(0 == (l1_pos_start % (slots_per_slotset * d1))); + ceph_assert(0 == (l1_pos_end % (slots_per_slotset * d1))); + if (min_length != l0_granularity) { + // probably not the most effecient way but + // don't care much about that at the moment + bool has_space = true; + while (length > *allocated && has_space) { + interval_t i = + _allocate_l1_contiguous(length - *allocated, min_length, max_length, + l1_pos_start, l1_pos_end); + if (i.length == 0) { + has_space = false; + } else { + _fragment_and_emplace(max_length, i.offset, i.length, res); + *allocated += i.length; + } + } + } else { + uint64_t l0_w = slots_per_slotset * d0; + + for (auto idx = l1_pos_start / d1; + idx < l1_pos_end / d1 && length > *allocated; + ++idx) { + slot_t& slot_val = l1[idx]; + if (slot_val == all_slot_clear) { + continue; + } else if (slot_val == all_slot_set) { + uint64_t to_alloc = std::min(length - *allocated, + l1_granularity * d1); + *allocated += to_alloc; + 
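
When min_length is larger than the L0 granularity, _allocate_l1() above simply keeps asking _allocate_l1_contiguous() for the remainder until the request is satisfied or the level runs out of space. A toy sketch of that driver loop; allocate_contiguous() below is a made-up stand-in that hands out at most 0x4000 bytes per call, not the real allocator:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct extent { uint64_t offset, length; };

// toy stand-in: hands out fixed-size chunks until a budget runs out
static extent allocate_contiguous(uint64_t want, uint64_t& budget)
{
  uint64_t got = std::min<uint64_t>({want, budget, uint64_t(0x4000)});
  budget -= got;
  static uint64_t next_off = 0;
  extent e{next_off, got};
  next_off += got;
  return e;
}

int main()
{
  uint64_t budget = 0x9000;                      // free space left in the toy device
  uint64_t length = 0x10000, allocated = 0;
  std::vector<extent> res;
  bool has_space = true;
  while (allocated < length && has_space) {
    extent e = allocate_contiguous(length - allocated, budget);
    if (e.length == 0) {
      has_space = false;                         // allocator exhausted: partial result
    } else {
      res.push_back(e);
      allocated += e.length;
    }
  }
  std::printf("allocated 0x%llx of 0x%llx in %zu fragment(s)\n",
              (unsigned long long)allocated,
              (unsigned long long)length, res.size());
  return 0;
}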
++alloc_fragments_fast; + _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc, + res); + _mark_alloc_l1_l0(idx * d1 * bits_per_slotset, + idx * d1 * bits_per_slotset + to_alloc / l0_granularity); + continue; + } + auto free_pos = find_next_set_bit(slot_val, 0); + ceph_assert(free_pos < bits_per_slot); + do { + ceph_assert(length > *allocated); + + bool empty; + empty = _allocate_l0(length, max_length, + (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w, + (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w, + allocated, + res); + + auto mask = slot_t(L1_ENTRY_MASK) << free_pos; + + slot_t old_mask = (slot_val & mask) >> free_pos; + switch(old_mask) { + case L1_ENTRY_FREE: + unalloc_l1_count--; + break; + case L1_ENTRY_PARTIAL: + partial_l1_count--; + break; + } + slot_val &= ~mask; + if (empty) { + // the next line is no op with the current L1_ENTRY_FULL but left + // as-is for the sake of uniformity and to avoid potential errors + // in future + slot_val |= slot_t(L1_ENTRY_FULL) << free_pos; + } else { + slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos; + partial_l1_count++; + } + if (length <= *allocated || slot_val == all_slot_clear) { + break; + } + free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH); + } while (free_pos < bits_per_slot); + } + } + return _is_empty_l1(l1_pos_start, l1_pos_end); +} + +void AllocatorLevel01Loose::collect_stats( + std::map<size_t, size_t>& bins_overall) +{ + size_t free_seq_cnt = 0; + for (auto slot : l0) { + if (slot == all_slot_set) { + free_seq_cnt += L0_ENTRIES_PER_SLOT; + } else if(slot != all_slot_clear) { + size_t pos = 0; + do { + auto pos1 = find_next_set_bit(slot, pos); + if (pos1 == pos) { + free_seq_cnt++; + pos = pos1 + 1; + } else { + if (free_seq_cnt) { + bins_overall[cbits(free_seq_cnt) - 1]++; + free_seq_cnt = 0; + } + if (pos1 < bits_per_slot) { + free_seq_cnt = 1; + } + pos = pos1 + 1; + } + } while (pos < bits_per_slot); + } else if (free_seq_cnt) { + bins_overall[cbits(free_seq_cnt) - 1]++; + free_seq_cnt = 0; + } + } + if (free_seq_cnt) { + bins_overall[cbits(free_seq_cnt) - 1]++; + } +} + +inline ssize_t AllocatorLevel01Loose::count_0s(slot_t slot_val, size_t start_pos) + { + #ifdef __GNUC__ + size_t pos = __builtin_ffsll(slot_val >> start_pos); + if (pos == 0) + return sizeof(slot_t)*8 - start_pos; + return pos - 1; + #else + size_t pos = start_pos; + slot_t mask = slot_t(1) << pos; + while (pos < bits_per_slot && (slot_val & mask) == 0) { + mask <<= 1; + pos++; + } + return pos - start_pos; + #endif + } + + inline ssize_t AllocatorLevel01Loose::count_1s(slot_t slot_val, size_t start_pos) + { + return count_0s(~slot_val, start_pos); + } +void AllocatorLevel01Loose::dump( + std::function<void(uint64_t offset, uint64_t length)> notify) +{ + size_t len = 0; + size_t off = 0; + for (size_t i = 0; i < l1.size(); i++) + { + for (size_t j = 0; j < L1_ENTRIES_PER_SLOT * L1_ENTRY_WIDTH; j += L1_ENTRY_WIDTH) + { + size_t w = (l1[i] >> j) & L1_ENTRY_MASK; + switch (w) { + case L1_ENTRY_FULL: + if (len > 0) { + notify(off, len); + len = 0; + } + break; + case L1_ENTRY_FREE: + if (len == 0) + off = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset * bits_per_slot; + len += bits_per_slotset; + break; + case L1_ENTRY_PARTIAL: + size_t pos = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset; + for (size_t t = 0; t < slots_per_slotset; t++) { + size_t p = 0; + slot_t allocation_pattern = l0[pos + t]; + while (p < bits_per_slot) { + if (len == 0) { + //continue to skip allocated space, 
meaning bits set to 0 + ssize_t alloc_count = count_0s(allocation_pattern, p); + p += alloc_count; + //now we are switched to expecting free space + if (p < bits_per_slot) { + //now @p are 1s + ssize_t free_count = count_1s(allocation_pattern, p); + assert(free_count > 0); + len = free_count; + off = (pos + t) * bits_per_slot + p; + p += free_count; + } + } else { + //continue free region + ssize_t free_count = count_1s(allocation_pattern, p); + if (free_count == 0) { + notify(off, len); + len = 0; + } else { + p += free_count; + len += free_count; + } + } + } + } + break; + } + } + } + if (len > 0) + notify(off, len); +} + +uint64_t AllocatorLevel01Loose::_claim_free_to_left_l0(int64_t l0_pos_start) +{ + int64_t d0 = L0_ENTRIES_PER_SLOT; + + int64_t pos = l0_pos_start - 1; + slot_t bits = (slot_t)1 << (pos % d0); + int64_t idx = pos / d0; + slot_t* val_s = l0.data() + idx; + + int64_t pos_e = p2align<int64_t>(pos, d0); + + while (pos >= pos_e) { + if (0 == ((*val_s) & bits)) + return pos + 1; + (*val_s) &= ~bits; + bits >>= 1; + --pos; + } + --idx; + val_s = l0.data() + idx; + while (idx >= 0 && (*val_s) == all_slot_set) { + *val_s = all_slot_clear; + --idx; + pos -= d0; + val_s = l0.data() + idx; + } + + if (idx >= 0 && + (*val_s) != all_slot_set && (*val_s) != all_slot_clear) { + int64_t pos_e = p2align<int64_t>(pos, d0); + slot_t bits = (slot_t)1 << (pos % d0); + while (pos >= pos_e) { + if (0 == ((*val_s) & bits)) + return pos + 1; + (*val_s) &= ~bits; + bits >>= 1; + --pos; + } + } + return pos + 1; +} + +uint64_t AllocatorLevel01Loose::_claim_free_to_right_l0(int64_t l0_pos_start) +{ + auto d0 = L0_ENTRIES_PER_SLOT; + + int64_t pos = l0_pos_start; + slot_t bits = (slot_t)1 << (pos % d0); + size_t idx = pos / d0; + if (idx >= l0.size()) { + return pos; + } + slot_t* val_s = l0.data() + idx; + + int64_t pos_e = p2roundup<int64_t>(pos + 1, d0); + + while (pos < pos_e) { + if (0 == ((*val_s) & bits)) + return pos; + (*val_s) &= ~bits; + bits <<= 1; + ++pos; + } + ++idx; + val_s = l0.data() + idx; + while (idx < l0.size() && (*val_s) == all_slot_set) { + *val_s = all_slot_clear; + ++idx; + pos += d0; + val_s = l0.data() + idx; + } + + if (idx < l0.size() && + (*val_s) != all_slot_set && (*val_s) != all_slot_clear) { + int64_t pos_e = p2roundup<int64_t>(pos + 1, d0); + slot_t bits = (slot_t)1 << (pos % d0); + while (pos < pos_e) { + if (0 == ((*val_s) & bits)) + return pos; + (*val_s) &= ~bits; + bits <<= 1; + ++pos; + } + } + return pos; +} diff --git a/src/os/bluestore/fastbmap_allocator_impl.h b/src/os/bluestore/fastbmap_allocator_impl.h new file mode 100755 index 00000000..52a1edee --- /dev/null +++ b/src/os/bluestore/fastbmap_allocator_impl.h @@ -0,0 +1,833 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Bitmap based in-memory allocator implementation. 
+ * Author: Igor Fedotov, ifedotov@suse.com + * + */ + +#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H +#define __FAST_BITMAP_ALLOCATOR_IMPL_H +#include "include/intarith.h" + +#include <vector> +#include <algorithm> +#include <mutex> + +typedef uint64_t slot_t; + +#ifdef NON_CEPH_BUILD +#include <assert.h> +struct interval_t +{ + uint64_t offset = 0; + uint64_t length = 0; + + interval_t() {} + interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {} + interval_t(const interval_t &ext) : + offset(ext.offset), length(ext.length) {} +}; +typedef std::vector<interval_t> interval_vector_t; +typedef std::vector<slot_t> slot_vector_t; +#else +#include "include/ceph_assert.h" +#include "common/likely.h" +#include "os/bluestore/bluestore_types.h" +#include "include/mempool.h" +#include "common/ceph_mutex.h" + +typedef bluestore_interval_t<uint64_t, uint64_t> interval_t; +typedef PExtentVector interval_vector_t; + +typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t; + +#endif + +// fitting into cache line on x86_64 +static const size_t slots_per_slotset = 8; // 8 slots per set +static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset; +static const size_t bits_per_slot = sizeof(slot_t) * 8; +static const size_t bits_per_slotset = slotset_bytes * 8; +static const slot_t all_slot_set = 0xffffffffffffffff; +static const slot_t all_slot_clear = 0; + +inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos) +{ +#ifdef __GNUC__ + if (start_pos == 0) { + start_pos = __builtin_ffsll(slot_val); + return start_pos ? start_pos - 1 : bits_per_slot; + } +#endif + slot_t mask = slot_t(1) << start_pos; + while (start_pos < bits_per_slot && !(slot_val & mask)) { + mask <<= 1; + ++start_pos; + } + return start_pos; +} + + +class AllocatorLevel +{ +protected: + + virtual uint64_t _children_per_slot() const = 0; + virtual uint64_t _level_granularity() const = 0; + +public: + static uint64_t l0_dives; + static uint64_t l0_iterations; + static uint64_t l0_inner_iterations; + static uint64_t alloc_fragments; + static uint64_t alloc_fragments_fast; + static uint64_t l2_allocs; + + virtual ~AllocatorLevel() + {} + + virtual void collect_stats( + std::map<size_t, size_t>& bins_overall) = 0; + +}; + +class AllocatorLevel01 : public AllocatorLevel +{ +protected: + slot_vector_t l0; // set bit means free entry + slot_vector_t l1; + uint64_t l0_granularity = 0; // space per entry + uint64_t l1_granularity = 0; // space per entry + + size_t partial_l1_count = 0; + size_t unalloc_l1_count = 0; + + double get_fragmentation() const { + double res = 0.0; + auto total = unalloc_l1_count + partial_l1_count; + if (total) { + res = double(partial_l1_count) / double(total); + } + return res; + } + + uint64_t _level_granularity() const override + { + return l1_granularity; + } + + inline bool _is_slot_fully_allocated(uint64_t idx) const { + return l1[idx] == all_slot_clear; + } +public: + inline uint64_t get_min_alloc_size() const + { + return l0_granularity; + } + +}; + +template <class T> +class AllocatorLevel02; + +class AllocatorLevel01Loose : public AllocatorLevel01 +{ + enum { + L1_ENTRY_WIDTH = 2, + L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1, + L1_ENTRY_FULL = 0x00, + L1_ENTRY_PARTIAL = 0x01, + L1_ENTRY_NOT_USED = 0x02, + L1_ENTRY_FREE = 0x03, + L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, //32 + L0_ENTRIES_PER_SLOT = bits_per_slot, // 64 + }; + uint64_t _children_per_slot() const override + { + return L1_ENTRIES_PER_SLOT; + } + + interval_t _get_longest_from_l0(uint64_t pos0, uint64_t 
pos1, + uint64_t min_length, interval_t* tail) const; + + inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset, + uint64_t len, + interval_vector_t* res) + { + auto it = res->rbegin(); + if (max_length) { + if (it != res->rend() && it->offset + it->length == offset) { + auto l = max_length - it->length; + if (l >= len) { + it->length += len; + return; + } else { + offset += l; + len -= l; + it->length += l; + } + } + + while (len > max_length) { + res->emplace_back(offset, max_length); + offset += max_length; + len -= max_length; + } + res->emplace_back(offset, len); + return; + } + + if (it != res->rend() && it->offset + it->length == offset) { + it->length += len; + } else { + res->emplace_back(offset, len); + } + } + + bool _allocate_l0(uint64_t length, + uint64_t max_length, + uint64_t l0_pos0, uint64_t l0_pos1, + uint64_t* allocated, + interval_vector_t* res) + { + uint64_t d0 = L0_ENTRIES_PER_SLOT; + + ++l0_dives; + + ceph_assert(l0_pos0 < l0_pos1); + ceph_assert(length > *allocated); + ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0))); + ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0))); + ceph_assert(((length - *allocated) % l0_granularity) == 0); + + uint64_t need_entries = (length - *allocated) / l0_granularity; + + for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated); + ++idx) { + ++l0_iterations; + slot_t& slot_val = l0[idx]; + auto base = idx * d0; + if (slot_val == all_slot_clear) { + continue; + } else if (slot_val == all_slot_set) { + uint64_t to_alloc = std::min(need_entries, d0); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + + _fragment_and_emplace(max_length, base * l0_granularity, + to_alloc * l0_granularity, res); + + if (to_alloc == d0) { + slot_val = all_slot_clear; + } else { + _mark_alloc_l0(base, base + to_alloc); + } + continue; + } + + auto free_pos = find_next_set_bit(slot_val, 0); + ceph_assert(free_pos < bits_per_slot); + auto next_pos = free_pos + 1; + while (next_pos < bits_per_slot && + (next_pos - free_pos) < need_entries) { + ++l0_inner_iterations; + + if (0 == (slot_val & (slot_t(1) << next_pos))) { + auto to_alloc = (next_pos - free_pos); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity, + to_alloc * l0_granularity, res); + _mark_alloc_l0(base + free_pos, base + next_pos); + free_pos = find_next_set_bit(slot_val, next_pos + 1); + next_pos = free_pos + 1; + } else { + ++next_pos; + } + } + if (need_entries && free_pos < bits_per_slot) { + auto to_alloc = std::min(need_entries, d0 - free_pos); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity, + to_alloc * l0_granularity, res); + _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc); + } + } + return _is_empty_l0(l0_pos0, l0_pos1); + } + +protected: + + friend class AllocatorLevel02<AllocatorLevel01Loose>; + + void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true) + { + l0_granularity = _alloc_unit; + // 512 bits at L0 mapped to L1 entry + l1_granularity = l0_granularity * bits_per_slotset; + + // capacity to have slot alignment at l1 + auto aligned_capacity = + p2roundup((int64_t)capacity, + int64_t(l1_granularity * slots_per_slotset * _children_per_slot())); + size_t slot_count = + aligned_capacity / l1_granularity / _children_per_slot(); + // we use set 
bit(s) as a marker for (partially) free entry + l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear); + + // l0 slot count + size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot; + // we use set bit(s) as a marker for (partially) free entry + l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear); + + partial_l1_count = unalloc_l1_count = 0; + if (mark_as_free) { + unalloc_l1_count = slot_count * _children_per_slot(); + auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity; + _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity); + } + } + + struct search_ctx_t + { + size_t partial_count = 0; + size_t free_count = 0; + uint64_t free_l1_pos = 0; + + uint64_t min_affordable_len = 0; + uint64_t min_affordable_offs = 0; + uint64_t affordable_len = 0; + uint64_t affordable_offs = 0; + + bool fully_processed = false; + + void reset() + { + *this = search_ctx_t(); + } + }; + enum { + NO_STOP, + STOP_ON_EMPTY, + STOP_ON_PARTIAL, + }; + void _analyze_partials(uint64_t pos_start, uint64_t pos_end, + uint64_t length, uint64_t min_length, int mode, + search_ctx_t* ctx); + + void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end); + void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end); + uint64_t _claim_free_to_left_l0(int64_t l0_pos_start); + uint64_t _claim_free_to_right_l0(int64_t l0_pos_start); + + + void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + _mark_alloc_l0(l0_pos_start, l0_pos_end); + l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset)); + l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset)); + _mark_l1_on_l0(l0_pos_start, l0_pos_end); + } + + void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + auto d0 = L0_ENTRIES_PER_SLOT; + + auto pos = l0_pos_start; + slot_t bits = (slot_t)1 << (l0_pos_start % d0); + slot_t* val_s = &l0[pos / d0]; + int64_t pos_e = std::min(l0_pos_end, + p2roundup<int64_t>(l0_pos_start + 1, d0)); + while (pos < pos_e) { + *val_s |= bits; + bits <<= 1; + pos++; + } + pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0)); + while (pos < pos_e) { + *(++val_s) = all_slot_set; + pos += d0; + } + bits = 1; + ++val_s; + while (pos < l0_pos_end) { + *val_s |= bits; + bits <<= 1; + pos++; + } + } + + void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + _mark_free_l0(l0_pos_start, l0_pos_end); + l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset)); + l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset)); + _mark_l1_on_l0(l0_pos_start, l0_pos_end); + } + + bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end) + { + bool no_free = true; + uint64_t d = slots_per_slotset * L0_ENTRIES_PER_SLOT; + ceph_assert(0 == (l0_pos % d)); + ceph_assert(0 == (l0_pos_end % d)); + + auto idx = l0_pos / L0_ENTRIES_PER_SLOT; + auto idx_end = l0_pos_end / L0_ENTRIES_PER_SLOT; + while (idx < idx_end && no_free) { + no_free = l0[idx] == all_slot_clear; + ++idx; + } + return no_free; + } + bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end) + { + bool no_free = true; + uint64_t d = slots_per_slotset * _children_per_slot(); + ceph_assert(0 == (l1_pos % d)); + ceph_assert(0 == (l1_pos_end % d)); + + auto idx = l1_pos / L1_ENTRIES_PER_SLOT; + auto idx_end = l1_pos_end / L1_ENTRIES_PER_SLOT; + while (idx < idx_end && no_free) { + no_free = _is_slot_fully_allocated(idx); + ++idx; + } + return no_free; + } + + interval_t _allocate_l1_contiguous(uint64_t length, + uint64_t min_length, uint64_t 
max_length, + uint64_t pos_start, uint64_t pos_end); + + bool _allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l1_pos_start, uint64_t l1_pos_end, + uint64_t* allocated, + interval_vector_t* res); + + uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length) + { + uint64_t l0_pos_start = offset / l0_granularity; + uint64_t l0_pos_end = p2roundup(offset + length, l0_granularity) / l0_granularity; + _mark_alloc_l1_l0(l0_pos_start, l0_pos_end); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + + uint64_t _free_l1(uint64_t offs, uint64_t len) + { + uint64_t l0_pos_start = offs / l0_granularity; + uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity; + _mark_free_l1_l0(l0_pos_start, l0_pos_end); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + + uint64_t claim_free_to_left_l1(uint64_t offs) + { + uint64_t l0_pos_end = offs / l0_granularity; + uint64_t l0_pos_start = _claim_free_to_left_l0(l0_pos_end); + if (l0_pos_start < l0_pos_end) { + _mark_l1_on_l0( + p2align(l0_pos_start, uint64_t(bits_per_slotset)), + p2roundup(l0_pos_end, uint64_t(bits_per_slotset))); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + return 0; + } + + uint64_t claim_free_to_right_l1(uint64_t offs) + { + uint64_t l0_pos_start = offs / l0_granularity; + uint64_t l0_pos_end = _claim_free_to_right_l0(l0_pos_start); + + if (l0_pos_start < l0_pos_end) { + _mark_l1_on_l0( + p2align(l0_pos_start, uint64_t(bits_per_slotset)), + p2roundup(l0_pos_end, uint64_t(bits_per_slotset))); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + return 0; + } + + +public: + uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + if (pos1 == 0) { + pos1 = l1.size() * L1_ENTRIES_PER_SLOT; + } + auto avail = debug_get_free(pos0, pos1); + return (pos1 - pos0) * l1_granularity - avail; + } + + uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0) + { + ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT)); + ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT)); + + auto idx0 = l1_pos0 * slots_per_slotset; + auto idx1 = l1_pos1 * slots_per_slotset; + + if (idx1 == 0) { + idx1 = l0.size(); + } + + uint64_t res = 0; + for (uint64_t i = idx0; i < idx1; ++i) { + auto v = l0[i]; + if (v == all_slot_set) { + res += L0_ENTRIES_PER_SLOT; + } else if (v != all_slot_clear) { + size_t cnt = 0; +#ifdef __GNUC__ + cnt = __builtin_popcountll(v); +#else + // Kernighan's Alg to count set bits + while (v) { + v &= (v - 1); + cnt++; + } +#endif + res += cnt; + } + } + return res * l0_granularity; + } + void collect_stats( + std::map<size_t, size_t>& bins_overall) override; + + static inline ssize_t count_0s(slot_t slot_val, size_t start_pos); + static inline ssize_t count_1s(slot_t slot_val, size_t start_pos); + void dump(std::function<void(uint64_t offset, uint64_t length)> notify); +}; + + +class AllocatorLevel01Compact : public AllocatorLevel01 +{ + uint64_t _children_per_slot() const override + { + return 8; + } +public: + void collect_stats( + std::map<size_t, size_t>& bins_overall) override + { + // not implemented + } +}; + +template <class L1> +class AllocatorLevel02 : public AllocatorLevel +{ +public: + uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + std::lock_guard l(lock); + return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot, + pos1 * l1._children_per_slot() * bits_per_slot); + } + uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + std::lock_guard l(lock); + return 
l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot, + pos1 * l1._children_per_slot() * bits_per_slot); + } + + uint64_t get_available() + { + std::lock_guard l(lock); + return available; + } + inline uint64_t get_min_alloc_size() const + { + return l1.get_min_alloc_size(); + } + void collect_stats( + std::map<size_t, size_t>& bins_overall) override { + + std::lock_guard l(lock); + l1.collect_stats(bins_overall); + } + uint64_t claim_free_to_left(uint64_t offset) { + std::lock_guard l(lock); + auto allocated = l1.claim_free_to_left_l1(offset); + ceph_assert(available >= allocated); + available -= allocated; + + uint64_t l2_pos = (offset - allocated) / l2_granularity; + uint64_t l2_pos_end = + p2roundup(int64_t(offset), int64_t(l2_granularity)) / l2_granularity; + _mark_l2_on_l1(l2_pos, l2_pos_end); + return allocated; + } + + uint64_t claim_free_to_right(uint64_t offset) { + std::lock_guard l(lock); + auto allocated = l1.claim_free_to_right_l1(offset); + ceph_assert(available >= allocated); + available -= allocated; + + uint64_t l2_pos = (offset) / l2_granularity; + int64_t end = offset + allocated; + uint64_t l2_pos_end = p2roundup(end, int64_t(l2_granularity)) / l2_granularity; + _mark_l2_on_l1(l2_pos, l2_pos_end); + return allocated; + } +protected: + ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock"); + L1 l1; + slot_vector_t l2; + uint64_t l2_granularity = 0; // space per entry + uint64_t available = 0; + uint64_t last_pos = 0; + + enum { + L1_ENTRIES_PER_SLOT = bits_per_slot, // 64 + }; + + uint64_t _children_per_slot() const override + { + return L1_ENTRIES_PER_SLOT; + } + uint64_t _level_granularity() const override + { + return l2_granularity; + } + + void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true) + { + ceph_assert(isp2(_alloc_unit)); + l1._init(capacity, _alloc_unit, mark_as_free); + + l2_granularity = + l1._level_granularity() * l1._children_per_slot() * slots_per_slotset; + + // capacity to have slot alignment at l2 + auto aligned_capacity = + p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT); + size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT; + // we use set bit(s) as a marker for (partially) free entry + l2.resize(elem_count, mark_as_free ? 
all_slot_set : all_slot_clear); + + if (mark_as_free) { + // capacity to have slotset alignment at l1 + auto l2_pos_no_use = + p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity; + _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity); + available = p2align(capacity, _alloc_unit); + } else { + available = 0; + } + } + + void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = L1_ENTRIES_PER_SLOT; + ceph_assert(0 <= l2_pos_end); + ceph_assert((int64_t)l2.size() >= (l2_pos_end / d)); + + while (l2_pos < l2_pos_end) { + l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d)); + ++l2_pos; + } + } + + void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = L1_ENTRIES_PER_SLOT; + ceph_assert(0 <= l2_pos_end); + ceph_assert((int64_t)l2.size() >= (l2_pos_end / d)); + + while (l2_pos < l2_pos_end) { + l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d)); + ++l2_pos; + } + } + + void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = L1_ENTRIES_PER_SLOT; + ceph_assert(0 <= l2_pos_end); + ceph_assert((int64_t)l2.size() >= (l2_pos_end / d)); + + auto idx = l2_pos * slots_per_slotset; + auto idx_end = l2_pos_end * slots_per_slotset; + bool all_allocated = true; + while (idx < idx_end) { + if (!l1._is_slot_fully_allocated(idx)) { + all_allocated = false; + idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset)); + } + else { + ++idx; + } + if ((idx % slots_per_slotset) == 0) { + if (all_allocated) { + l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d)); + } + else { + l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d)); + } + all_allocated = true; + ++l2_pos; + } + } + } + + void _allocate_l2(uint64_t length, + uint64_t min_length, + uint64_t max_length, + uint64_t hint, + + uint64_t* allocated, + interval_vector_t* res) + { + uint64_t prev_allocated = *allocated; + uint64_t d = L1_ENTRIES_PER_SLOT; + ceph_assert(min_length <= l2_granularity); + ceph_assert(max_length == 0 || max_length >= min_length); + ceph_assert(max_length == 0 || (max_length % min_length) == 0); + ceph_assert(length >= min_length); + ceph_assert((length % min_length) == 0); + + uint64_t cap = 1ull << 31; + if (max_length == 0 || max_length >= cap) { + max_length = cap; + } + + uint64_t l1_w = slots_per_slotset * l1._children_per_slot(); + + std::lock_guard l(lock); + + if (available < min_length) { + return; + } + if (hint != 0) { + last_pos = (hint / (d * l2_granularity)) < l2.size() ? p2align(hint / l2_granularity, d) : 0; + } + auto l2_pos = last_pos; + auto last_pos0 = last_pos; + auto pos = last_pos / d; + auto pos_end = l2.size(); + // outer loop below is intended to optimize the performance by + // avoiding 'modulo' operations inside the internal loop. 
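
The allocation loop that follows scans the L2 slots in two passes: from the slot derived from the hint to the end, then wrapping around to cover the prefix it skipped, so the inner loop needs no modulo arithmetic. A standalone sketch of that scan order only (the hint-to-slot mapping is simplified and the last_pos bookkeeping is omitted):

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> slots = {0, 0, 1, 0, 1, 1, 0, 1};  // 1 = has free space
  size_t start = 5;                                   // derived from the caller's hint

  size_t pos = start, end = slots.size();
  for (int pass = 0; pass < 2; ++pass) {
    for (; pos < end; ++pos) {
      if (slots[pos]) {
        std::printf("try slot %zu (pass %d)\n", pos, pass);
      }
    }
    // second pass: wrap around and stop where the first pass started
    pos = 0;
    end = start;
  }
  return 0;
}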
+ // Looks like they have negative impact on the performance + for (auto i = 0; i < 2; ++i) { + for(; length > *allocated && pos < pos_end; ++pos) { + slot_t& slot_val = l2[pos]; + size_t free_pos = 0; + bool all_set = false; + if (slot_val == all_slot_clear) { + l2_pos += d; + last_pos = l2_pos; + continue; + } else if (slot_val == all_slot_set) { + free_pos = 0; + all_set = true; + } else { + free_pos = find_next_set_bit(slot_val, 0); + ceph_assert(free_pos < bits_per_slot); + } + do { + ceph_assert(length > *allocated); + bool empty = l1._allocate_l1(length, + min_length, + max_length, + (l2_pos + free_pos) * l1_w, + (l2_pos + free_pos + 1) * l1_w, + allocated, + res); + if (empty) { + slot_val &= ~(slot_t(1) << free_pos); + } + if (length <= *allocated || slot_val == all_slot_clear) { + break; + } + ++free_pos; + if (!all_set) { + free_pos = find_next_set_bit(slot_val, free_pos); + } + } while (free_pos < bits_per_slot); + last_pos = l2_pos; + l2_pos += d; + } + l2_pos = 0; + pos = 0; + pos_end = last_pos0 / d; + } + + ++l2_allocs; + auto allocated_here = *allocated - prev_allocated; + ceph_assert(available >= allocated_here); + available -= allocated_here; + } + +#ifndef NON_CEPH_BUILD + // to provide compatibility with BlueStore's allocator interface + void _free_l2(const interval_set<uint64_t> & rr) + { + uint64_t released = 0; + std::lock_guard l(lock); + for (auto r : rr) { + released += l1._free_l1(r.first, r.second); + uint64_t l2_pos = r.first / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity; + + _mark_l2_free(l2_pos, l2_pos_end); + } + available += released; + } +#endif + + template <typename T> + void _free_l2(const T& rr) + { + uint64_t released = 0; + std::lock_guard l(lock); + for (auto r : rr) { + released += l1._free_l1(r.offset, r.length); + uint64_t l2_pos = r.offset / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity; + + _mark_l2_free(l2_pos, l2_pos_end); + } + available += released; + } + + void _mark_allocated(uint64_t o, uint64_t len) + { + uint64_t l2_pos = o / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity; + + std::lock_guard l(lock); + auto allocated = l1._mark_alloc_l1(o, len); + ceph_assert(available >= allocated); + available -= allocated; + _mark_l2_on_l1(l2_pos, l2_pos_end); + } + + void _mark_free(uint64_t o, uint64_t len) + { + uint64_t l2_pos = o / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity; + + std::lock_guard l(lock); + available += l1._free_l1(o, len); + _mark_l2_free(l2_pos, l2_pos_end); + } + void _shutdown() + { + last_pos = 0; + } + double _get_fragmentation() { + std::lock_guard l(lock); + return l1.get_fragmentation(); + } +}; + +#endif diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc new file mode 100644 index 00000000..2ff2000d --- /dev/null +++ b/src/os/filestore/BtrfsFileStoreBackend.cc @@ -0,0 +1,575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#include "include/int_types.h" +#include "include/types.h" + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/ceph_assert.h" + +#ifndef __CYGWIN__ +#include "os/fs/btrfs_ioctl.h" +#endif + +#include <iostream> +#include <fstream> +#include <sstream> + +#include "BtrfsFileStoreBackend.h" + +#include "common/errno.h" +#include "common/config.h" + +#if defined(__linux__) + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") " + +#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) +#define ALIGNED(x, by) (!((x) % (by))) +#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by))) + +BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs): + GenericFileStoreBackend(fs), has_clone_range(false), + has_snap_create(false), has_snap_destroy(false), + has_snap_create_v2(false), has_wait_sync(false), stable_commits(false), + m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range), + m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { } + +int BtrfsFileStoreBackend::detect_features() +{ + int r; + + r = GenericFileStoreBackend::detect_features(); + if (r < 0) + return r; + + // clone_range? + if (m_filestore_btrfs_clone_range) { + int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600); + if (fd >= 0) { + if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) { + r = -errno; + dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: " + << cpp_strerror(r) << dendl; + } + btrfs_ioctl_clone_range_args clone_args; + memset(&clone_args, 0, sizeof(clone_args)); + clone_args.src_fd = -1; + r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args); + if (r < 0 && errno == EBADF) { + dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl; + has_clone_range = true; + } else { + r = -errno; + dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl; + } + TEMP_FAILURE_RETRY(::close(fd)); + } else { + r = -errno; + dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: " + << cpp_strerror(r) << dendl; + } + } else { + dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl; + } + + struct btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + + // create test source volume + vol_args.fd = 0; + strcpy(vol_args.name, "test_subvol"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args); + if (r != 0) { + r = -errno; + dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl; + } + int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC); + if (srcfd < 0) { + r = -errno; + dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl; + } + + // snap_create and snap_destroy? 
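
The ALIGN_DOWN/ALIGNED/ALIGN_UP macros defined near the top of this file drive the extent trimming in clone_range() further down: the source offset is rounded up to a block boundary, the remaining length is rounded down to whole blocks, and the unaligned head and tail are handled via _copy_range() instead. A quick standalone check with a 4 KiB block size (macro definitions copied from above; the offsets are made-up example values):

#include <cstdint>
#include <cassert>

#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
#define ALIGNED(x, by) (!((x) % (by)))
#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))

int main()
{
  const uint64_t blk = 4096;
  uint64_t srcoff = 1000, len = 20000;

  uint64_t srcoffclone = ALIGN_UP(srcoff, blk);       // 4096: first cloneable byte
  uint64_t lenclone = len - (srcoffclone - srcoff);   // 16904 bytes remain
  lenclone = ALIGN_DOWN(lenclone, blk);               // 16384: whole blocks only

  assert(srcoffclone == 4096);
  assert(lenclone == 16384);
  // the unaligned head [1000, 4096) and the trailing remainder would be copied
  return 0;
}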
+ vol_args.fd = srcfd; + strcpy(vol_args.name, "sync_snap_test"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + int err = errno; + if (r == 0 || errno == EEXIST) { + dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl; + has_snap_create = true; + + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r == 0) { + dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl; + has_snap_destroy = true; + } else { + err = -errno; + dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; + + if (err == -EPERM && getuid() != 0) { + dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl; + cerr << TEXT_YELLOW + << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed" + << TEXT_NORMAL << std::endl; + } else if (err == -EOPNOTSUPP) { + derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl; + } + } + } else { + dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl; + } + + if (m_filestore_btrfs_snap) { + if (has_snap_destroy) + stable_commits = true; + else + dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl; + } + + // start_sync? + __u64 transid = 0; + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid); + if (r < 0) { + int err = errno; + dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl; + } + if (r == 0 && transid > 0) { + dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl; + + // do we have wait_sync too? + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid); + if (r == 0 || errno == ERANGE) { + dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl; + has_wait_sync = true; + } else { + int err = errno; + dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl; + } + } else { + int err = errno; + dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl; + } + + if (has_wait_sync) { + // async snap creation? 
+ struct btrfs_ioctl_vol_args_v2 async_args; + memset(&async_args, 0, sizeof(async_args)); + async_args.fd = srcfd; + async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; + strcpy(async_args.name, "async_snap_test"); + + // remove old one, first + struct stat st; + strcpy(vol_args.name, async_args.name); + if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) { + dout(0) << "detect_feature: removing old async_snap_test" << dendl; + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r != 0) { + int err = errno; + dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl; + } + } + + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); + if (r == 0 || errno == EEXIST) { + dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl; + has_snap_create_v2 = true; + + // clean up + strcpy(vol_args.name, "async_snap_test"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r != 0) { + int err = errno; + dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; + } + } else { + int err = errno; + dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl; + } + } + + // clean up test subvol + if (srcfd >= 0) + TEMP_FAILURE_RETRY(::close(srcfd)); + + strcpy(vol_args.name, "test_subvol"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r < 0) { + r = -errno; + dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl; + } + + if (m_filestore_btrfs_snap && !has_snap_create_v2) { + dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n" + << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n" + << " (added in Linux 2.6.37). 
Expect slow btrfs sync/commit\n" + << " performance.\n" + << TEXT_NORMAL; + } + + return 0; +} + +bool BtrfsFileStoreBackend::can_checkpoint() +{ + return stable_commits; +} + +int BtrfsFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "create_current: current/ exists but is not a directory" << dendl; + return -EINVAL; + } + + struct stat basest; + struct statfs currentfs; + ret = ::fstat(get_basedir_fd(), &basest); + if (ret < 0) { + ret = -errno; + dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl; + return ret; + } + ret = ::statfs(get_current_path().c_str(), ¤tfs); + if (ret < 0) { + ret = -errno; + dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl; + return ret; + } + if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) { + dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl; + stable_commits = true; + } + return 0; + } + + struct btrfs_ioctl_vol_args volargs; + memset(&volargs, 0, sizeof(volargs)); + + volargs.fd = 0; + strcpy(volargs.name, "current"); + if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) { + ret = -errno; + dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error " + << cpp_strerror(ret) << dendl; + return ret; + } + + dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl; + if (::chmod(get_current_path().c_str(), 0755) < 0) { + ret = -errno; + dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: " + << cpp_strerror(ret) << dendl; + return ret; + } + + stable_commits = true; + return 0; +} + +int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls) +{ + int ret, err = 0; + + struct stat basest; + ret = ::fstat(get_basedir_fd(), &basest); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl; + return ret; + } + + // get snap list + DIR *dir = ::opendir(get_basedir_path().c_str()); + if (!dir) { + ret = -errno; + dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: " + << cpp_strerror(ret) << dendl; + return ret; + } + + list<string> snaps; + char path[PATH_MAX]; + struct dirent *de; + while ((de = ::readdir(dir))) { + snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name); + + struct stat st; + ret = ::stat(path, &st); + if (ret < 0) { + err = -errno; + dout(0) << "list_checkpoints: stat '" << path << "' failed: " + << cpp_strerror(err) << dendl; + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + struct statfs fs; + ret = ::statfs(path, &fs); + if (ret < 0) { + err = -errno; + dout(0) << "list_checkpoints: statfs '" << path << "' failed: " + << cpp_strerror(err) << dendl; + break; + } + + if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) + snaps.push_back(string(de->d_name)); + } + + if (::closedir(dir) < 0) { + ret = -errno; + dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl; + if (!err) + err = ret; + } + + if (err) + return err; + + ls.swap(snaps); + return 0; +} + +int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (has_snap_create_v2 && transid) { + struct btrfs_ioctl_vol_args_v2 async_args; + memset(&async_args, 0, sizeof(async_args)); + 
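
create_checkpoint() copies the checkpoint name into the fixed-size ioctl name buffer with strncpy() and then writes a terminating NUL explicitly, because strncpy() leaves the buffer unterminated when the source is at least as long as the buffer (destroy_checkpoint() further down omits the explicit terminator). A standalone illustration of why that last assignment matters, using a hypothetical 16-byte stand-in for the btrfs ioctl struct:

#include <cstdio>
#include <cstring>
#include <string>

struct fake_vol_args { char name[16]; };       // stand-in for the btrfs ioctl struct

int main()
{
  std::string name(32, 'x');                   // longer than the buffer
  fake_vol_args a{};

  std::strncpy(a.name, name.c_str(), sizeof(a.name));
  a.name[sizeof(a.name) - 1] = '\0';           // without this, a.name is unterminated

  std::printf("stored %zu bytes: '%s'\n", std::strlen(a.name), a.name);  // 15 chars
  return 0;
}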
async_args.fd = get_current_fd(); + async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; + + size_t name_size = sizeof(async_args.name); + strncpy(async_args.name, name.c_str(), name_size); + async_args.name[name_size-1] = '\0'; + + int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); + if (r < 0) { + r = -errno; + dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl; + return r; + } + dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl; + *transid = async_args.transid; + } else { + struct btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = get_current_fd(); + + size_t name_size = sizeof(vol_args.name); + strncpy(vol_args.name, name.c_str(), name_size); + vol_args.name[name_size-1] = '\0'; + + int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + if (r < 0) { + r = -errno; + dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl; + return r; + } + if (transid) + *transid = 0; + } + return 0; +} + +int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid) +{ + // wait for commit + dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl; + int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid); + if (ret < 0) { + ret = -errno; + dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl; + return -errno; + } + dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl; + return 0; +} + +int BtrfsFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: to '" << name << "'" << dendl; + char s[PATH_MAX]; + btrfs_ioctl_vol_args vol_args; + + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = 0; + strcpy(vol_args.name, "current"); + + int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (ret && errno != ENOENT) { + dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl; + snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand()); + if (::rename(get_current_path().c_str(), s)) { + ret = -errno; + dout(0) << "rollback_to: error renaming old current subvol: " + << cpp_strerror(ret) << dendl; + return ret; + } + } + + snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str()); + + // roll back + vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC); + if (vol_args.fd < 0) { + ret = -errno; + dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl; + return ret; + } + ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + if (ret < 0 ) { + ret = -errno; + dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl; + } + TEMP_FAILURE_RETRY(::close(vol_args.fd)); + return ret; +} + +int BtrfsFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = 0; + strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name)); + + int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (ret) { + ret = -errno; + dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl; + return ret; + } + return 0; +} + +int BtrfsFileStoreBackend::syncfs() +{ + dout(15) << "syncfs" << dendl; + // do a full btrfs commit + int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC); + 
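
Error paths throughout this backend capture errno immediately after the failing call and return it negated. One detail worth noting: sync_checkpoint() above returns -errno again after the dout() call rather than the saved ret, so the logged and returned values can diverge if logging touches errno. A minimal standalone example of the save-first convention (plain fprintf instead of dout, hypothetical path):

#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>

static int open_ro(const char* path)
{
  int fd = ::open(path, O_RDONLY);
  if (fd < 0) {
    int r = -errno;                 // capture before any other call can reset errno
    std::fprintf(stderr, "open %s failed: %s\n", path, std::strerror(-r));
    return r;                       // negative errno, as the FileStore code does
  }
  return fd;
}

int main()
{
  int r = open_ro("/definitely/not/here");
  if (r >= 0)
    ::close(r);
  return 0;
}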
if (ret < 0) { + ret = -errno; + dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl; + } + return ret; +} + +int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl; + size_t blk_size = get_blksize(); + if (!has_clone_range || + srcoff % blk_size != dstoff % blk_size) { + dout(20) << "clone_range: using copy" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } + + int err = 0; + int r = 0; + + uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size); + uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size); + if (srcoffclone >= srcoff + len) { + dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } + + uint64_t lenclone = len - (srcoffclone - srcoff); + if (!ALIGNED(lenclone, blk_size)) { + struct stat from_stat, to_stat; + err = ::fstat(from, &from_stat); + if (err) return -errno; + err = ::fstat(to , &to_stat); + if (err) return -errno; + + if (srcoff + len != (uint64_t)from_stat.st_size || + dstoff + len < (uint64_t)to_stat.st_size) { + // Not to the end of the file, need to align length as well + lenclone = ALIGN_DOWN(lenclone, blk_size); + } + } + if (lenclone == 0) { + // too short + return _copy_range(from, to, srcoff, len, dstoff); + } + + dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone + << " to " << dstoffclone << " = " << r << dendl; + btrfs_ioctl_clone_range_args a; + a.src_fd = from; + a.src_offset = srcoffclone; + a.src_length = lenclone; + a.dest_offset = dstoffclone; + err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a); + if (err >= 0) { + r += err; + } else if (errno == EINVAL) { + // Still failed, might be compressed + dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } else { + return -errno; + } + + // Take care any trimmed from front + if (srcoffclone != srcoff) { + err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff); + if (err >= 0) { + r += err; + } else { + return err; + } + } + + // Copy end + if (srcoffclone + lenclone != srcoff + len) { + err = _copy_range(from, to, + srcoffclone + lenclone, + (srcoff + len) - (srcoffclone + lenclone), + dstoffclone + lenclone); + if (err >= 0) { + r += err; + } else { + return err; + } + } + dout(20) << "clone_range: finished " << srcoff << "~" << len + << " to " << dstoff << " = " << r << dendl; + return r; +} +#endif diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h new file mode 100644 index 00000000..0794be2d --- /dev/null +++ b/src/os/filestore/BtrfsFileStoreBackend.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_BTRFSFILESTOREBACKEDN_H +#define CEPH_BTRFSFILESTOREBACKEDN_H + +#if defined(__linux__) +#include "GenericFileStoreBackend.h" + +class BtrfsFileStoreBackend : public GenericFileStoreBackend { +private: + bool has_clone_range; ///< clone range ioctl is supported + bool has_snap_create; ///< snap create ioctl is supported + bool has_snap_destroy; ///< snap destroy ioctl is supported + bool has_snap_create_v2; ///< snap create v2 ioctl (async!) is supported + bool has_wait_sync; ///< wait sync ioctl is supported + bool stable_commits; + bool m_filestore_btrfs_clone_range; + bool m_filestore_btrfs_snap; +public: + explicit BtrfsFileStoreBackend(FileStore *fs); + ~BtrfsFileStoreBackend() override {} + const char *get_name() override { + return "btrfs"; + } + int detect_features() override; + bool can_checkpoint() override; + int create_current() override; + int list_checkpoints(list<string>& ls) override; + int create_checkpoint(const string& name, uint64_t *cid) override; + int sync_checkpoint(uint64_t cid) override; + int rollback_to(const string& name) override; + int destroy_checkpoint(const string& name) override; + int syncfs() override; + int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override; +}; +#endif +#endif diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h new file mode 100644 index 00000000..eb43e47d --- /dev/null +++ b/src/os/filestore/CollectionIndex.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef OS_COLLECTIONINDEX_H +#define OS_COLLECTIONINDEX_H + +#include <string> +#include <vector> + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/RWLock.h" + +/** + CollectionIndex provides an interface for manipulating indexed collections + */ +class CollectionIndex { +public: + CephContext* cct; +protected: + /** + * Object encapsulating a returned path. + * + * A path to an object (existent or non-existent) becomes invalid + * when a different object is created in the index. Path stores + * a shared_ptr to the CollectionIndex to keep the index alive + * during its lifetime. + * @see IndexManager + * @see self_ref + * @see set_ref + */ + class Path { + public: + /// Returned path + string full_path; + /// Ref to parent Index + CollectionIndex* parent_ref; + /// coll_t for parent Index + coll_t parent_coll; + + /// Normal Constructor + Path( + string path, ///< [in] Path to return. + CollectionIndex* ref) + : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {} + + /// Debugging Constructor + Path( + string path, ///< [in] Path to return. + const coll_t& coll) ///< [in] collection + : full_path(path), parent_coll(coll) {} + + /// Getter for the stored path. 
+ const char *path() const { return full_path.c_str(); } + + /// Getter for collection + const coll_t& coll() const { return parent_coll; } + + /// Getter for parent + CollectionIndex* get_index() const { + return parent_ref; + } + }; + public: + + RWLock access_lock; + /// Type of returned paths + typedef std::shared_ptr<Path> IndexedPath; + + static IndexedPath get_testing_path(string path, coll_t collection) { + return std::make_shared<Path>(path, collection); + } + + static const uint32_t FLAT_INDEX_TAG = 0; + static const uint32_t HASH_INDEX_TAG = 1; + static const uint32_t HASH_INDEX_TAG_2 = 2; + static const uint32_t HOBJECT_WITH_POOL = 3; + /** + * For tracking Filestore collection versions. + * + * @return Collection version represented by the Index implementation + */ + virtual uint32_t collection_version() = 0; + + /** + * Returns the collection managed by this CollectionIndex + */ + virtual coll_t coll() const = 0; + + + /** + * Initializes the index. + * + * @return Error Code, 0 for success + */ + virtual int init() = 0; + + /** + * Cleanup before replaying journal + * + * Index implementations may need to perform compound operations + * which may leave the collection unstable if interrupted. cleanup + * is called on mount to allow the CollectionIndex implementation + * to stabilize. + * + * @see HashIndex + * @return Error Code, 0 for success + */ + virtual int cleanup() = 0; + + /** + * Call when a file is created using a path returned from lookup. + * + * @return Error Code, 0 for success + */ + virtual int created( + const ghobject_t &oid, ///< [in] Created object. + const char *path ///< [in] Path to created object. + ) = 0; + + /** + * Removes oid from the collection + * + * @return Error Code, 0 for success + */ + virtual int unlink( + const ghobject_t &oid ///< [in] Object to remove + ) = 0; + + /** + * Gets the IndexedPath for oid. + * + * @return Error Code, 0 for success + */ + virtual int lookup( + const ghobject_t &oid, ///< [in] Object to lookup + IndexedPath *path, ///< [out] Path to object + int *hardlink ///< [out] number of hard links of this object. *hardlink=0 mean object no-exist. + ) = 0; + + /** + * Moves objects matching @e match in the lsb @e bits + * + * dest and this must be the same subclass + * + * @return Error Code, 0 for success + */ + virtual int split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + CollectionIndex* dest //< [in] destination index + ) { ceph_abort(); return 0; } + + virtual int merge( + uint32_t bits, //< [in] common (target) bits + CollectionIndex* dest //< [in] destination index + ) { ceph_abort(); return 0; } + + + /// List contents of collection by hash + virtual int collection_list_partial( + const ghobject_t &start, ///< [in] object at which to start + const ghobject_t &end, ///< [in] list only objects < end + int max_count, ///< [in] return at most max_count objects + vector<ghobject_t> *ls, ///< [out] Listed objects + ghobject_t *next ///< [out] Next object to list + ) = 0; + + /// Call prior to removing directory + virtual int prep_delete() { return 0; } + + CollectionIndex(CephContext* cct, const coll_t& collection) + : cct(cct), access_lock("CollectionIndex::access_lock", true, false) {} + + /* + * Pre-hash the collection, this collection should map to a PG folder. + * + * @param pg_num - pg number of the pool this collection belongs to. + * @param expected_num_objs - expected number of objects in this collection. + * @Return 0 on success, an error code otherwise. 
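   *
   * Illustrative example (numbers invented, not taken from the code): a
   * HashIndex-backed collection that expects ~100k objects in a pool with
   * pg_num = 64 can pre-create its hashed subdirectory levels up front, so
   * the directory tree is not repeatedly split while the objects are being
   * ingested; without pre-hashing, each split has to re-link existing files
   * into new subdirectories on the write path.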
+ */ + virtual int pre_hash_collection( + uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to + uint64_t expected_num_objs ///< [in] expected number of objects this collection has + ) { ceph_abort(); return 0; } + + virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; } + + /// Read index-wide settings (should be called after construction) + virtual int read_settings() { return 0; } + + /// Virtual destructor + virtual ~CollectionIndex() {} +}; + +#endif diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc new file mode 100644 index 00000000..5a057014 --- /dev/null +++ b/src/os/filestore/DBObjectMap.cc @@ -0,0 +1,1415 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "include/int_types.h" +#include "include/buffer.h" + +#include <iostream> +#include <set> +#include <map> +#include <string> +#include <vector> + +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "DBObjectMap.h" +#include <errno.h> + +#include "common/debug.h" +#include "common/config.h" +#include "include/ceph_assert.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "filestore " + +const string DBObjectMap::USER_PREFIX = "_USER_"; +const string DBObjectMap::XATTR_PREFIX = "_AXATTR_"; +const string DBObjectMap::SYS_PREFIX = "_SYS_"; +const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; +const string DBObjectMap::HEADER_KEY = "HEADER"; +const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER"; +const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER"; +const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_"; + +// Legacy +const string DBObjectMap::LEAF_PREFIX = "_LEAF_"; +const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_"; + +static void append_escaped(const string &in, string *out) +{ + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i == '%') { + out->push_back('%'); + out->push_back('p'); + } else if (*i == '.') { + out->push_back('%'); + out->push_back('e'); + } else if (*i == '_') { + out->push_back('%'); + out->push_back('u'); + } else { + out->push_back(*i); + } + } +} + +int DBObjectMap::check(std::ostream &out, bool repair, bool force) +{ + int errors = 0, comp_errors = 0; + bool repaired = false; + map<uint64_t, uint64_t> parent_to_num_children; + map<uint64_t, uint64_t> parent_to_actual_num_children; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + _Header header; + bufferlist bl = iter->value(); + while (true) { + auto bliter = bl.cbegin(); + header.decode(bliter); + if (header.seq != 0) + parent_to_actual_num_children[header.seq] = header.num_children; + + if (state.v == 2 || force) { + // Check complete table + bool complete_error = false; + boost::optional<string> prev; + KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); + for (complete_iter->seek_to_first(); complete_iter->valid(); + complete_iter->next()) { + if (prev && prev >= complete_iter->key()) { + out << "Bad complete for " << header.oid << std::endl; + complete_error = true; + break; + } + prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1); + } + if (complete_error) { + out << "Complete mapping for " << header.seq << " :" << std::endl; + for (complete_iter->seek_to_first(); complete_iter->valid(); + complete_iter->next()) { + out << 
complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl; + } + if (repair) { + repaired = true; + KeyValueDB::Transaction t = db->get_transaction(); + t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX); + db->submit_transaction(t); + out << "Cleared complete mapping to repair" << std::endl; + } else { + errors++; // Only count when not repaired + comp_errors++; // Track errors here for version update + } + } + } + + if (header.parent == 0) + break; + + if (!parent_to_num_children.count(header.parent)) + parent_to_num_children[header.parent] = 0; + parent_to_num_children[header.parent]++; + if (parent_to_actual_num_children.count(header.parent)) + break; + + set<string> to_get; + map<string, bufferlist> got; + to_get.insert(HEADER_KEY); + db->get(sys_parent_prefix(header), to_get, &got); + if (got.empty()) { + out << "Missing: seq " << header.parent << std::endl; + errors++; + break; + } else { + bl = got.begin()->second; + } + } + } + + for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin(); + i != parent_to_num_children.end(); + parent_to_num_children.erase(i++)) { + if (!parent_to_actual_num_children.count(i->first)) + continue; + if (parent_to_actual_num_children[i->first] != i->second) { + out << "Invalid: seq " << i->first << " recorded children: " + << parent_to_actual_num_children[i->first] << " found: " + << i->second << std::endl; + errors++; + } + parent_to_actual_num_children.erase(i->first); + } + + // Only advance the version from 2 to 3 here + // Mark as legacy because there are still older structures + // we don't update. The value of legacy is only used + // for internal assertions. + if (comp_errors == 0 && state.v == 2 && repair) { + state.v = 3; + state.legacy = true; + set_state(); + } + + if (errors == 0 && repaired) + return -1; + return errors; +} + +string DBObjectMap::ghobject_key(const ghobject_t &oid) +{ + string out; + append_escaped(oid.hobj.oid.name, &out); + out.push_back('.'); + append_escaped(oid.hobj.get_key(), &out); + out.push_back('.'); + append_escaped(oid.hobj.nspace, &out); + out.push_back('.'); + + char snap_with_hash[1000]; + char *t = snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + + if (oid.hobj.pool == -1) + t += snprintf(t, end - t, ".none"); + else + t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool); + t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash()); + + if (oid.generation != ghobject_t::NO_GEN || + oid.shard_id != shard_id_t::NO_SHARD) { + t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation); + t += snprintf(t, end - t, ".%x", (int)oid.shard_id); + } + out += string(snap_with_hash); + return out; +} + +// ok: pglog%u3%efs1...0.none.0017B237 +// bad: plana8923501-10...4c.3.ffffffffffffffff.2 +// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2 +// returns 0 for false, 1 for true, negative for error +int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct, + const string &in) +{ + int dots = 5; // skip 5 .'s + const char *s = in.c_str(); + do { + while (*s && *s != '.') + ++s; + if (!*s) { + derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + ++s; + } while (*s && --dots); + 
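  // Illustrative: keys emitted by ghobject_key() have the shape
  //   <name>.<key>.<nspace>.<snap>.<pool>.<hash>[.<generation>.<shard>]
  // with '.', '%' and '_' escaped inside the first three fields, so the five
  // '.'s skipped above always step over name, key, nspace, snap and pool.
  // s therefore now points either at the 8-hex-char hash (the fixed form) or,
  // for the buggy v1 form shown in the comment above, at the generation value
  // that was written without a hash.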
if (!*s) { + derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + // we are now either at a hash value (32 bits, 8 chars) or a generation + // value (64 bits) '.' and shard id. count the dots! + int len = 0; + while (*s && *s != '.') { + ++s; + ++len; + } + if (*s == '\0') { + if (len != 8) { + derr << "hash value is not 8 chars" << dendl; + return -EINVAL; // the hash value is always 8 chars. + } + return 0; + } + if (*s != '.') { // the shard follows. + derr << "missing final . and shard id at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + return 1; +} + + +string DBObjectMap::map_header_key(const ghobject_t &oid) +{ + return ghobject_key(oid); +} + +string DBObjectMap::header_key(uint64_t seq) +{ + char buf[100]; + snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); + return string(buf); +} + +string DBObjectMap::complete_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX; +} + +string DBObjectMap::user_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + USER_PREFIX; +} + +string DBObjectMap::sys_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + SYS_PREFIX; +} + +string DBObjectMap::xattr_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX; +} + +string DBObjectMap::sys_parent_prefix(_Header header) +{ + return USER_PREFIX + header_key(header.parent) + SYS_PREFIX; +} + +int DBObjectMap::DBObjectMapIteratorImpl::init() +{ + invalid = false; + if (ready) { + return 0; + } + ceph_assert(!parent_iter); + if (header->parent) { + Header parent = map->lookup_parent(header); + if (!parent) { + ceph_abort(); + return -EINVAL; + } + parent_iter = std::make_shared<DBObjectMapIteratorImpl>(map, parent); + } + key_iter = map->db->get_iterator(map->user_prefix(header)); + ceph_assert(key_iter); + complete_iter = map->db->get_iterator(map->complete_prefix(header)); + ceph_assert(complete_iter); + cur_iter = key_iter; + ceph_assert(cur_iter); + ready = true; + return 0; +} + +ObjectMap::ObjectMapIterator DBObjectMap::get_iterator( + const ghobject_t &oid) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return ObjectMapIterator(new EmptyIteratorImpl()); + DBObjectMapIterator iter = _get_iterator(header); + iter->hlock.swap(hl); + return iter; +} + +int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first() +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->seek_to_first(); + if (r < 0) + return r; + } + r = key_iter->seek_to_first(); + if (r < 0) + return r; + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last() +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->seek_to_last(); + if (r < 0) + return r; + if (parent_iter->valid()) + r = parent_iter->next(); + if (r < 0) + return r; + } + r = key_iter->seek_to_last(); + if (r < 0) + return r; + if (key_iter->valid()) + r = key_iter->next(); + if (r < 0) + return r; + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to) +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->lower_bound(to); + if (r < 0) + return r; + } + r = key_iter->lower_bound(to); + if (r < 0) + return r; + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to) +{ + int r = lower_bound(to); + if (r < 0) + return r; + if (valid() && !on_parent()) + return next_parent(); + else + return r; +} + +int 
DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after) +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->upper_bound(after); + if (r < 0) + return r; + } + r = key_iter->upper_bound(after); + if (r < 0) + return r; + return adjust(); +} + +bool DBObjectMap::DBObjectMapIteratorImpl::valid() +{ + bool valid = !invalid && ready; + ceph_assert(!valid || cur_iter->valid()); + return valid; +} + +bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent() +{ + if (parent_iter && parent_iter->valid() && + (!key_iter->valid() || key_iter->key() > parent_iter->key())) + return true; + return false; +} + +int DBObjectMap::DBObjectMapIteratorImpl::next() +{ + ceph_assert(cur_iter->valid()); + ceph_assert(valid()); + cur_iter->next(); + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::next_parent() +{ + r = next(); + if (r < 0) + return r; + while (parent_iter && parent_iter->valid() && !on_parent()) { + ceph_assert(valid()); + r = lower_bound(parent_iter->key()); + if (r < 0) + return r; + } + + if (!parent_iter || !parent_iter->valid()) { + invalid = true; + } + return 0; +} + +int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test, + string *begin, + string *end) +{ + /* This is clumsy because one cannot call prev() on end(), nor can one + * test for == begin(). + */ + complete_iter->upper_bound(to_test); + if (complete_iter->valid()) { + complete_iter->prev(); + if (!complete_iter->valid()) { + complete_iter->upper_bound(to_test); + return false; + } + } else { + complete_iter->seek_to_last(); + if (!complete_iter->valid()) + return false; + } + + ceph_assert(complete_iter->key() <= to_test); + ceph_assert(complete_iter->value().length() >= 1); + string _end(complete_iter->value().c_str(), + complete_iter->value().length() - 1); + if (_end.empty() || _end > to_test) { + if (begin) + *begin = complete_iter->key(); + if (end) + *end = _end; + return true; + } else { + complete_iter->next(); + ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test); + return false; + } +} + +/** + * Moves parent_iter to the next position both out of the complete_region and + * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and + * less than key_iter and key_iter otherwise. 
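 *
 * Worked example (keys invented for illustration): if the parent holds
 * {"a", "c", "e"}, this header holds {"b"}, and the complete mapping holds
 * the single entry "c" -> "", then every parent key >= "c" is masked and
 * iteration yields "a" (from parent_iter) followed by "b" (from key_iter).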
+ */ +int DBObjectMap::DBObjectMapIteratorImpl::adjust() +{ + string begin, end; + while (parent_iter && parent_iter->valid()) { + if (in_complete_region(parent_iter->key(), &begin, &end)) { + if (end.size() == 0) { + parent_iter->seek_to_last(); + if (parent_iter->valid()) + parent_iter->next(); + } else + parent_iter->lower_bound(end); + } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { + parent_iter->next(); + } else { + break; + } + } + if (valid_parent()) { + cur_iter = parent_iter; + } else if (key_iter->valid()) { + cur_iter = key_iter; + } else { + invalid = true; + } + ceph_assert(invalid || cur_iter->valid()); + return 0; +} + + +string DBObjectMap::DBObjectMapIteratorImpl::key() +{ + return cur_iter->key(); +} + +bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() +{ + return cur_iter->value(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::status() +{ + return r; +} + +int DBObjectMap::set_keys(const ghobject_t &oid, + const map<string, bufferlist> &set, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + + t->set(user_prefix(header), set); + + return db->submit_transaction(t); +} + +int DBObjectMap::set_header(const ghobject_t &oid, + const bufferlist &bl, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + _set_header(header, bl, t); + return db->submit_transaction(t); +} + +void DBObjectMap::_set_header(Header header, const bufferlist &bl, + KeyValueDB::Transaction t) +{ + map<string, bufferlist> to_set; + to_set[USER_HEADER_KEY] = bl; + t->set(sys_prefix(header), to_set); +} + +int DBObjectMap::get_header(const ghobject_t &oid, + bufferlist *bl) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) { + return 0; + } + return _get_header(header, bl); +} + +int DBObjectMap::_get_header(Header header, + bufferlist *bl) +{ + map<string, bufferlist> out; + while (true) { + out.clear(); + set<string> to_get; + to_get.insert(USER_HEADER_KEY); + int r = db->get(sys_prefix(header), to_get, &out); + if (r == 0 && !out.empty()) + break; + if (r < 0) + return r; + Header current(header); + if (!current->parent) + break; + header = lookup_parent(current); + } + + if (!out.empty()) + bl->swap(out.begin()->second); + return 0; +} + +int DBObjectMap::clear(const ghobject_t &oid, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + remove_map_header(hl, oid, header, t); + ceph_assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + return db->submit_transaction(t); +} + +int DBObjectMap::_clear(Header header, + KeyValueDB::Transaction t) +{ + while (1) { + if (header->num_children) { + set_header(header, t); + break; + } + clear_header(header, t); + if (!header->parent) + break; + Header parent = lookup_parent(header); + if (!parent) { + return -EINVAL; + } + ceph_assert(parent->num_children > 0); + parent->num_children--; + header.swap(parent); + } + return 0; 
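  // Illustrative summary of the loop above: headers form a chain
  // child -> parent -> ... via ->parent, with each node counting its live
  // children in num_children (already decremented by the caller for the node
  // passed in).  At each level, a node that still has children is simply
  // re-written and the walk stops; a node with none is removed via
  // clear_header(), its parent's count is dropped, and the walk moves up.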
+} + +int DBObjectMap::copy_up_header(Header header, + KeyValueDB::Transaction t) +{ + bufferlist bl; + int r = _get_header(header, &bl); + if (r < 0) + return r; + + _set_header(header, bl, t); + return 0; +} + +int DBObjectMap::rm_keys(const ghobject_t &oid, + const set<string> &to_clear, + const SequencerPosition *spos) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + KeyValueDB::Transaction t = db->get_transaction(); + if (check_spos(oid, header, spos)) + return 0; + t->rmkeys(user_prefix(header), to_clear); + if (!header->parent) { + return db->submit_transaction(t); + } + + ceph_assert(state.legacy); + + { + // We only get here for legacy (v2) stores + // Copy up all keys from parent excluding to_clear + // and remove parent + // This eliminates a v2 format use of complete for this oid only + map<string, bufferlist> to_write; + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first() ; iter->valid() ; iter->next()) { + if (iter->status()) + return iter->status(); + if (!to_clear.count(iter->key())) + to_write[iter->key()] = iter->value(); + } + t->set(user_prefix(header), to_write); + } // destruct iter which has parent in_use + + copy_up_header(header, t); + Header parent = lookup_parent(header); + if (!parent) + return -EINVAL; + parent->num_children--; + _clear(parent, t); + header->parent = 0; + set_map_header(hl, oid, *header, t); + t->rmkeys_by_prefix(complete_prefix(header)); + return db->submit_transaction(t); +} + +int DBObjectMap::clear_keys_header(const ghobject_t &oid, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + + // save old attrs + KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header)); + if (!iter) + return -EINVAL; + map<string, bufferlist> attrs; + for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) + attrs.insert(make_pair(iter->key(), iter->value())); + if (iter->status()) + return iter->status(); + + // remove current header + remove_map_header(hl, oid, header, t); + ceph_assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + + // create new header + Header newheader = generate_new_header(oid, Header()); + set_map_header(hl, oid, *newheader, t); + if (!attrs.empty()) + t->set(xattr_prefix(newheader), attrs); + return db->submit_transaction(t); +} + +int DBObjectMap::get(const ghobject_t &oid, + bufferlist *_header, + map<string, bufferlist> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + _get_header(header, _header); + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + out->insert(make_pair(iter->key(), iter->value())); + } + return 0; +} + +int DBObjectMap::get_keys(const ghobject_t &oid, + set<string> *keys) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + keys->insert(iter->key()); + } + return 0; +} + +int DBObjectMap::scan(Header header, + const set<string> &in_keys, + set<string> 
*out_keys, + map<string, bufferlist> *out_values) +{ + ObjectMapIterator db_iter = _get_iterator(header); + for (set<string>::const_iterator key_iter = in_keys.begin(); + key_iter != in_keys.end(); + ++key_iter) { + db_iter->lower_bound(*key_iter); + if (db_iter->status()) + return db_iter->status(); + if (db_iter->valid() && db_iter->key() == *key_iter) { + if (out_keys) + out_keys->insert(*key_iter); + if (out_values) + out_values->insert(make_pair(db_iter->key(), db_iter->value())); + } + } + return 0; +} + +int DBObjectMap::get_values(const ghobject_t &oid, + const set<string> &keys, + map<string, bufferlist> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return scan(header, keys, 0, out); +} + +int DBObjectMap::check_keys(const ghobject_t &oid, + const set<string> &keys, + set<string> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return scan(header, keys, out, 0); +} + +int DBObjectMap::get_xattrs(const ghobject_t &oid, + const set<string> &to_get, + map<string, bufferlist> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return db->get(xattr_prefix(header), to_get, out); +} + +int DBObjectMap::get_all_xattrs(const ghobject_t &oid, + set<string> *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header)); + if (!iter) + return -EINVAL; + for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) + out->insert(iter->key()); + return iter->status(); +} + +int DBObjectMap::set_xattrs(const ghobject_t &oid, + const map<string, bufferlist> &to_set, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + t->set(xattr_prefix(header), to_set); + return db->submit_transaction(t); +} + +int DBObjectMap::remove_xattrs(const ghobject_t &oid, + const set<string> &to_remove, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + t->rmkeys(xattr_prefix(header), to_remove); + return db->submit_transaction(t); +} + +// ONLY USED FOR TESTING +// Set version to 2 to avoid asserts +int DBObjectMap::legacy_clone(const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos) +{ + state.legacy = true; + + if (oid == target) + return 0; + + MapHeaderLock _l1(this, std::min(oid, target)); + MapHeaderLock _l2(this, std::max(oid, target)); + MapHeaderLock *lsource, *ltarget; + if (oid > target) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, target); + if (destination) { + if (check_spos(target, destination, spos)) + return 0; + destination->num_children--; + remove_map_header(*ltarget, target, destination, t); + _clear(destination, t); + } + } + + Header parent = lookup_map_header(*lsource, oid); + if (!parent) + return db->submit_transaction(t); + + Header source = generate_new_header(oid, 
parent); + Header destination = generate_new_header(target, parent); + if (spos) + destination->spos = *spos; + + parent->num_children = 2; + set_header(parent, t); + set_map_header(*lsource, oid, *source, t); + set_map_header(*ltarget, target, *destination, t); + + map<string, bufferlist> to_set; + KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent)); + for (xattr_iter->seek_to_first(); + xattr_iter->valid(); + xattr_iter->next()) + to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value())); + t->set(xattr_prefix(source), to_set); + t->set(xattr_prefix(destination), to_set); + t->rmkeys_by_prefix(xattr_prefix(parent)); + return db->submit_transaction(t); +} + +int DBObjectMap::clone(const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos) +{ + if (oid == target) + return 0; + + MapHeaderLock _l1(this, std::min(oid, target)); + MapHeaderLock _l2(this, std::max(oid, target)); + MapHeaderLock *lsource, *ltarget; + if (oid > target) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, target); + if (destination) { + if (check_spos(target, destination, spos)) + return 0; + destination->num_children--; + remove_map_header(*ltarget, target, destination, t); + _clear(destination, t); + } + } + + Header source = lookup_map_header(*lsource, oid); + if (!source) + return db->submit_transaction(t); + + Header destination = generate_new_header(target, Header()); + if (spos) + destination->spos = *spos; + + set_map_header(*ltarget, target, *destination, t); + + bufferlist bl; + int r = _get_header(source, &bl); + if (r < 0) + return r; + _set_header(destination, bl, t); + + map<string, bufferlist> to_set; + KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source)); + for (xattr_iter->seek_to_first(); + xattr_iter->valid(); + xattr_iter->next()) + to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value())); + t->set(xattr_prefix(destination), to_set); + + map<string, bufferlist> to_write; + ObjectMapIterator iter = _get_iterator(source); + for (iter->seek_to_first() ; iter->valid() ; iter->next()) { + if (iter->status()) + return iter->status(); + to_write[iter->key()] = iter->value(); + } + t->set(user_prefix(destination), to_write); + + return db->submit_transaction(t); +} + +int DBObjectMap::upgrade_to_v2() +{ + dout(1) << __func__ << " start" << dendl; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + iter->seek_to_first(); + while (iter->valid()) { + unsigned count = 0; + KeyValueDB::Transaction t = db->get_transaction(); + set<string> remove; + map<string, bufferlist> add; + for (; + iter->valid() && count < 300; + iter->next()) { + dout(20) << __func__ << " key is " << iter->key() << dendl; + int r = is_buggy_ghobject_key_v1(cct, iter->key()); + if (r < 0) { + derr << __func__ << " bad key '" << iter->key() << "'" << dendl; + return r; + } + if (!r) { + dout(20) << __func__ << " " << iter->key() << " ok" << dendl; + continue; + } + + // decode header to get oid + _Header hdr; + bufferlist bl = iter->value(); + auto bliter = bl.cbegin(); + hdr.decode(bliter); + + string newkey(ghobject_key(hdr.oid)); + dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl; + add[newkey] = iter->value(); + remove.insert(iter->key()); + ++count; + } + + if (!remove.empty()) { + dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl; + 
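      // Illustrative: each pass rewrites at most 300 buggy leaf entries per
      // transaction, replacing a v1 key that lacks the hash field with
      // ghobject_key(hdr.oid), which carries the 8-hex-char hash (e.g. a
      // "...ffffffffffffffff.2" style key gains a component like "0017B237").
      // Bounded batches keep each submit_transaction() call small.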
t->rmkeys(HOBJECT_TO_SEQ, remove); + t->set(HOBJECT_TO_SEQ, add); + int r = db->submit_transaction(t); + if (r < 0) + return r; + } + } + + state.v = 2; + + set_state(); + return 0; +} + +void DBObjectMap::set_state() +{ + Mutex::Locker l(header_lock); + KeyValueDB::Transaction t = db->get_transaction(); + write_state(t); + int ret = db->submit_transaction_sync(t); + ceph_assert(ret == 0); + dout(1) << __func__ << " done" << dendl; + return; +} + +int DBObjectMap::get_state() +{ + map<string, bufferlist> result; + set<string> to_get; + to_get.insert(GLOBAL_STATE_KEY); + int r = db->get(SYS_PREFIX, to_get, &result); + if (r < 0) + return r; + if (!result.empty()) { + auto bliter = result.begin()->second.cbegin(); + state.decode(bliter); + } else { + // New store + state.v = State::CUR_VERSION; + state.seq = 1; + state.legacy = false; + } + return 0; +} + +int DBObjectMap::init(bool do_upgrade) +{ + int ret = get_state(); + if (ret < 0) + return ret; + if (state.v < 1) { + dout(1) << "DBObjectMap is *very* old; upgrade to an older version first" + << dendl; + return -ENOTSUP; + } + if (state.v < 2) { // Needs upgrade + if (!do_upgrade) { + dout(1) << "DOBjbectMap requires an upgrade," + << " set filestore_update_to" + << dendl; + return -ENOTSUP; + } else { + int r = upgrade_to_v2(); + if (r < 0) + return r; + } + } + ostringstream ss; + int errors = check(ss, true); + if (errors) { + derr << ss.str() << dendl; + if (errors > 0) + return -EINVAL; + } + dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl; + return 0; +} + +int DBObjectMap::sync(const ghobject_t *oid, + const SequencerPosition *spos) { + KeyValueDB::Transaction t = db->get_transaction(); + if (oid) { + ceph_assert(spos); + MapHeaderLock hl(this, *oid); + Header header = lookup_map_header(hl, *oid); + if (header) { + dout(10) << "oid: " << *oid << " setting spos to " + << *spos << dendl; + header->spos = *spos; + set_map_header(hl, *oid, *header, t); + } + /* It may appear that this and the identical portion of the else + * block can combined below, but in this block, the transaction + * must be submitted under *both* the MapHeaderLock and the full + * header_lock. + * + * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891. + */ + Mutex::Locker l(header_lock); + write_state(t); + return db->submit_transaction_sync(t); + } else { + Mutex::Locker l(header_lock); + write_state(t); + return db->submit_transaction_sync(t); + } +} + +int DBObjectMap::write_state(KeyValueDB::Transaction _t) { + ceph_assert(header_lock.is_locked_by_me()); + dout(20) << "dbobjectmap: seq is " << state.seq << dendl; + KeyValueDB::Transaction t = _t ? _t : db->get_transaction(); + bufferlist bl; + state.encode(bl); + map<string, bufferlist> to_write; + to_write[GLOBAL_STATE_KEY] = bl; + t->set(SYS_PREFIX, to_write); + return _t ? 
0 : db->submit_transaction(t); +} + + +DBObjectMap::Header DBObjectMap::_lookup_map_header( + const MapHeaderLock &l, + const ghobject_t &oid) +{ + ceph_assert(l.get_locked() == oid); + + _Header *header = new _Header(); + { + Mutex::Locker l(cache_lock); + if (caches.lookup(oid, header)) { + ceph_assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + return Header(header, RemoveOnDelete(this)); + } + } + + bufferlist out; + int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out); + if (r < 0 || out.length()==0) { + delete header; + return Header(); + } + + Header ret(header, RemoveOnDelete(this)); + auto iter = out.cbegin(); + ret->decode(iter); + { + Mutex::Locker l(cache_lock); + caches.add(oid, *ret); + } + + ceph_assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + return ret; +} + +DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid, + Header parent) +{ + Header header = Header(new _Header(), RemoveOnDelete(this)); + header->seq = state.seq++; + if (parent) { + header->parent = parent->seq; + header->spos = parent->spos; + } + header->num_children = 1; + header->oid = oid; + ceph_assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + + write_state(); + return header; +} + +DBObjectMap::Header DBObjectMap::lookup_parent(Header input) +{ + Mutex::Locker l(header_lock); + while (in_use.count(input->parent)) + header_cond.Wait(header_lock); + map<string, bufferlist> out; + set<string> keys; + keys.insert(HEADER_KEY); + + dout(20) << "lookup_parent: parent " << input->parent + << " for seq " << input->seq << dendl; + int r = db->get(sys_parent_prefix(input), keys, &out); + if (r < 0) { + ceph_abort(); + return Header(); + } + if (out.empty()) { + ceph_abort(); + return Header(); + } + + Header header = Header(new _Header(), RemoveOnDelete(this)); + auto iter = out.begin()->second.cbegin(); + header->decode(iter); + ceph_assert(header->seq == input->parent); + dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent " + << header->parent << dendl; + in_use.insert(header->seq); + return header; +} + +DBObjectMap::Header DBObjectMap::lookup_create_map_header( + const MapHeaderLock &hl, + const ghobject_t &oid, + KeyValueDB::Transaction t) +{ + Mutex::Locker l(header_lock); + Header header = _lookup_map_header(hl, oid); + if (!header) { + header = _generate_new_header(oid, Header()); + set_map_header(hl, oid, *header, t); + } + return header; +} + +void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << "clear_header: clearing seq " << header->seq << dendl; + t->rmkeys_by_prefix(user_prefix(header)); + t->rmkeys_by_prefix(sys_prefix(header)); + if (state.legacy) + t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0 + t->rmkeys_by_prefix(xattr_prefix(header)); + set<string> keys; + keys.insert(header_key(header->seq)); + t->rmkeys(USER_PREFIX, keys); +} + +void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << "set_header: setting seq " << header->seq << dendl; + map<string, bufferlist> to_write; + header->encode(to_write[HEADER_KEY]); + t->set(sys_prefix(header), to_write); +} + +void DBObjectMap::remove_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + Header header, + KeyValueDB::Transaction t) +{ + ceph_assert(l.get_locked() == oid); + dout(20) << "remove_map_header: removing " << header->seq + << " oid " << oid << dendl; + set<string> to_remove; + to_remove.insert(map_header_key(oid)); + 
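  // Note: map_header_key(oid) is just ghobject_key(oid), so the key queued
  // for removal below is the same HOBJECT_TO_SEQ leaf that _lookup_map_header()
  // reads; the in-memory SimpleLRU entry for oid is dropped right after, under
  // cache_lock, so once the transaction commits later lookups see no header.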
t->rmkeys(HOBJECT_TO_SEQ, to_remove); + { + Mutex::Locker l(cache_lock); + caches.clear(oid); + } +} + +void DBObjectMap::set_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, _Header header, + KeyValueDB::Transaction t) +{ + ceph_assert(l.get_locked() == oid); + dout(20) << "set_map_header: setting " << header.seq + << " oid " << oid << " parent seq " + << header.parent << dendl; + map<string, bufferlist> to_set; + header.encode(to_set[map_header_key(oid)]); + t->set(HOBJECT_TO_SEQ, to_set); + { + Mutex::Locker l(cache_lock); + caches.add(oid, header); + } +} + +bool DBObjectMap::check_spos(const ghobject_t &oid, + Header header, + const SequencerPosition *spos) +{ + if (!spos || *spos > header->spos) { + stringstream out; + if (spos) + dout(10) << "oid: " << oid << " not skipping op, *spos " + << *spos << dendl; + else + dout(10) << "oid: " << oid << " not skipping op, *spos " + << "empty" << dendl; + dout(10) << " > header.spos " << header->spos << dendl; + return false; + } else { + dout(10) << "oid: " << oid << " skipping op, *spos " << *spos + << " <= header.spos " << header->spos << dendl; + return true; + } +} + +int DBObjectMap::list_objects(vector<ghobject_t> *out) +{ + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + auto bliter = bl.cbegin(); + _Header header; + header.decode(bliter); + out->push_back(header.oid); + } + return 0; +} + +int DBObjectMap::list_object_headers(vector<_Header> *out) +{ + int error = 0; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + auto bliter = bl.cbegin(); + _Header header; + header.decode(bliter); + out->push_back(header); + while (header.parent) { + set<string> to_get; + map<string, bufferlist> got; + to_get.insert(HEADER_KEY); + db->get(sys_parent_prefix(header), to_get, &got); + if (got.empty()) { + dout(0) << "Missing: seq " << header.parent << dendl; + error = -ENOENT; + break; + } else { + bl = got.begin()->second; + auto bliter = bl.cbegin(); + header.decode(bliter); + out->push_back(header); + } + } + } + return error; +} + +ostream& operator<<(ostream& out, const DBObjectMap::_Header& h) +{ + out << "seq=" << h.seq << " parent=" << h.parent + << " num_children=" << h.num_children + << " ghobject=" << h.oid; + return out; +} + +int DBObjectMap::rename(const ghobject_t &from, + const ghobject_t &to, + const SequencerPosition *spos) +{ + if (from == to) + return 0; + + MapHeaderLock _l1(this, std::min(from, to)); + MapHeaderLock _l2(this, std::max(from, to)); + MapHeaderLock *lsource, *ltarget; + if (from > to) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, to); + if (destination) { + if (check_spos(to, destination, spos)) + return 0; + destination->num_children--; + remove_map_header(*ltarget, to, destination, t); + _clear(destination, t); + } + } + + Header hdr = lookup_map_header(*lsource, from); + if (!hdr) + return db->submit_transaction(t); + + remove_map_header(*lsource, from, hdr, t); + hdr->oid = to; + set_map_header(*ltarget, to, *hdr, t); + + return db->submit_transaction(t); +} diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h new file mode 100644 index 00000000..e288df83 --- /dev/null +++ b/src/os/filestore/DBObjectMap.h @@ 
-0,0 +1,585 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#ifndef DBOBJECTMAP_DB_H +#define DBOBJECTMAP_DB_H + +#include "include/buffer_fwd.h" +#include <set> +#include <map> +#include <string> + +#include <vector> +#include <boost/scoped_ptr.hpp> + +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "osd/osd_types.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/simple_cache.hpp" +#include <boost/optional/optional_io.hpp> + +#include "SequencerPosition.h" + +/** + * DBObjectMap: Implements ObjectMap in terms of KeyValueDB + * + * Prefix space structure: + * + * @see complete_prefix + * @see user_prefix + * @see sys_prefix + * + * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and + * corresponding omap header + * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number + * @see State + * @see write_state + * @see init + * @see generate_new_header + * - USER_PREFIX + header_key(header->seq) + USER_PREFIX + * : key->value for header->seq + * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below + * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs + * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX + * : USER_HEADER_KEY - omap header for header->seq + * : HEADER_KEY - encoding of header for header->seq + * + * For each node (represented by a header), we + * store three mappings: the key mapping, the complete mapping, and the parent. + * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in + * this mapping indicates that the key mapping contains all entries on [x,y). + * Note, max string is represented by "", so ""->"" indicates that the parent + * is unnecessary (@see rm_keys). When looking up a key not contained in the + * the complete set, we have to check the parent if we don't find it in the + * key set. During rm_keys, we copy keys from the parent and update the + * complete set to reflect the change @see rm_keys. 
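 *
 * Concrete prefix strings for a header with seq 18 (header_key() renders the
 * seq as 16-digit zero-padded decimal):
 *   user_prefix:     "_USER_0000000000000018_USER_"
 *   complete_prefix: "_USER_0000000000000018_COMPLETE_"
 *   xattr_prefix:    "_USER_0000000000000018_AXATTR_"
 *   sys_prefix:      "_USER_0000000000000018_SYS_"  (holds the HEADER and
 *                                                    USER_HEADER keys)
 * A complete entry such as "c" -> "f" records that [c, f) is fully present in
 * this node, and "c" -> "" extends that region to the end of the keyspace.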
+ */ +class DBObjectMap : public ObjectMap { +public: + + KeyValueDB *get_db() override { return db.get(); } + + /** + * Serializes access to next_seq as well as the in_use set + */ + Mutex header_lock; + Cond header_cond; + Cond map_header_cond; + + /** + * Set of headers currently in use + */ + set<uint64_t> in_use; + set<ghobject_t> map_header_in_use; + + /** + * Takes the map_header_in_use entry in constructor, releases in + * destructor + */ + class MapHeaderLock { + DBObjectMap *db; + boost::optional<ghobject_t> locked; + + MapHeaderLock(const MapHeaderLock &); + MapHeaderLock &operator=(const MapHeaderLock &); + public: + explicit MapHeaderLock(DBObjectMap *db) : db(db) {} + MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) { + Mutex::Locker l(db->header_lock); + while (db->map_header_in_use.count(*locked)) + db->map_header_cond.Wait(db->header_lock); + db->map_header_in_use.insert(*locked); + } + + const ghobject_t &get_locked() const { + ceph_assert(locked); + return *locked; + } + + void swap(MapHeaderLock &o) { + ceph_assert(db == o.db); + + // centos6's boost optional doesn't seem to have swap :( + boost::optional<ghobject_t> _locked = o.locked; + o.locked = locked; + locked = _locked; + } + + ~MapHeaderLock() { + if (locked) { + Mutex::Locker l(db->header_lock); + ceph_assert(db->map_header_in_use.count(*locked)); + db->map_header_cond.Signal(); + db->map_header_in_use.erase(*locked); + } + } + }; + + DBObjectMap(CephContext* cct, KeyValueDB *db) + : ObjectMap(cct, db), header_lock("DBOBjectMap"), + cache_lock("DBObjectMap::CacheLock"), + caches(cct->_conf->filestore_omap_header_cache_size) + {} + + int set_keys( + const ghobject_t &oid, + const map<string, bufferlist> &set, + const SequencerPosition *spos=0 + ) override; + + int set_header( + const ghobject_t &oid, + const bufferlist &bl, + const SequencerPosition *spos=0 + ) override; + + int get_header( + const ghobject_t &oid, + bufferlist *bl + ) override; + + int clear( + const ghobject_t &oid, + const SequencerPosition *spos=0 + ) override; + + int clear_keys_header( + const ghobject_t &oid, + const SequencerPosition *spos=0 + ) override; + + int rm_keys( + const ghobject_t &oid, + const set<string> &to_clear, + const SequencerPosition *spos=0 + ) override; + + int get( + const ghobject_t &oid, + bufferlist *header, + map<string, bufferlist> *out + ) override; + + int get_keys( + const ghobject_t &oid, + set<string> *keys + ) override; + + int get_values( + const ghobject_t &oid, + const set<string> &keys, + map<string, bufferlist> *out + ) override; + + int check_keys( + const ghobject_t &oid, + const set<string> &keys, + set<string> *out + ) override; + + int get_xattrs( + const ghobject_t &oid, + const set<string> &to_get, + map<string, bufferlist> *out + ) override; + + int get_all_xattrs( + const ghobject_t &oid, + set<string> *out + ) override; + + int set_xattrs( + const ghobject_t &oid, + const map<string, bufferlist> &to_set, + const SequencerPosition *spos=0 + ) override; + + int remove_xattrs( + const ghobject_t &oid, + const set<string> &to_remove, + const SequencerPosition *spos=0 + ) override; + + int clone( + const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos=0 + ) override; + + int rename( + const ghobject_t &from, + const ghobject_t &to, + const SequencerPosition *spos=0 + ); + + int legacy_clone( + const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos=0 + ); + + /// Read initial state from backing store + int 
get_state(); + /// Write current state settings to DB + void set_state(); + /// Read initial state and upgrade or initialize state + int init(bool upgrade = false); + + /// Upgrade store to current version + int upgrade_to_v2(); + + /// Consistency check, debug, there must be no parallel writes + int check(std::ostream &out, bool repair = false, bool force = false) override; + + /// Ensure that all previous operations are durable + int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override; + + void compact() override { + ceph_assert(db); + db->compact(); + } + + /// Util, get all objects, there must be no other concurrent access + int list_objects(vector<ghobject_t> *objs ///< [out] objects + ); + + struct _Header; + // Util, get all object headers, there must be no other concurrent access + int list_object_headers(vector<_Header> *out ///< [out] headers + ); + + ObjectMapIterator get_iterator(const ghobject_t &oid) override; + + static const string USER_PREFIX; + static const string XATTR_PREFIX; + static const string SYS_PREFIX; + static const string COMPLETE_PREFIX; + static const string HEADER_KEY; + static const string USER_HEADER_KEY; + static const string GLOBAL_STATE_KEY; + static const string HOBJECT_TO_SEQ; + + /// Legacy + static const string LEAF_PREFIX; + static const string REVERSE_LEAF_PREFIX; + + /// persistent state for store @see generate_header + struct State { + static const __u8 CUR_VERSION = 3; + __u8 v; + uint64_t seq; + // legacy is false when complete regions never used + bool legacy; + State() : v(0), seq(1), legacy(false) {} + explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {} + + void encode(bufferlist &bl) const { + ENCODE_START(3, 1, bl); + encode(v, bl); + encode(seq, bl); + encode(legacy, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + DECODE_START(3, bl); + if (struct_v >= 2) + decode(v, bl); + else + v = 0; + decode(seq, bl); + if (struct_v >= 3) + decode(legacy, bl); + else + legacy = false; + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("v", v); + f->dump_unsigned("seq", seq); + f->dump_bool("legacy", legacy); + } + + static void generate_test_instances(list<State*> &o) { + o.push_back(new State(0)); + o.push_back(new State(20)); + } + } state; + + struct _Header { + uint64_t seq; + uint64_t parent; + uint64_t num_children; + + ghobject_t oid; + + SequencerPosition spos; + + void encode(bufferlist &bl) const { + coll_t unused; + ENCODE_START(2, 1, bl); + encode(seq, bl); + encode(parent, bl); + encode(num_children, bl); + encode(unused, bl); + encode(oid, bl); + encode(spos, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + coll_t unused; + DECODE_START(2, bl); + decode(seq, bl); + decode(parent, bl); + decode(num_children, bl); + decode(unused, bl); + decode(oid, bl); + if (struct_v >= 2) + decode(spos, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("parent", parent); + f->dump_unsigned("num_children", num_children); + f->dump_stream("oid") << oid; + } + + static void generate_test_instances(list<_Header*> &o) { + o.push_back(new _Header); + o.push_back(new _Header); + o.back()->parent = 20; + o.back()->seq = 30; + } + + size_t length() { + return sizeof(_Header); + } + + _Header() : seq(0), parent(0), num_children(1) {} + }; + + /// String munging (public for testing) + static string ghobject_key(const ghobject_t &oid); + static string 
ghobject_key_v0(coll_t c, const ghobject_t &oid); + static int is_buggy_ghobject_key_v1(CephContext* cct, + const string &in); +private: + /// Implicit lock on Header->seq + typedef std::shared_ptr<_Header> Header; + Mutex cache_lock; + SimpleLRU<ghobject_t, _Header> caches; + + string map_header_key(const ghobject_t &oid); + string header_key(uint64_t seq); + string complete_prefix(Header header); + string user_prefix(Header header); + string sys_prefix(Header header); + string xattr_prefix(Header header); + string sys_parent_prefix(_Header header); + string sys_parent_prefix(Header header) { + return sys_parent_prefix(*header); + } + + class EmptyIteratorImpl : public ObjectMapIteratorImpl { + public: + int seek_to_first() override { return 0; } + int seek_to_last() { return 0; } + int upper_bound(const string &after) override { return 0; } + int lower_bound(const string &to) override { return 0; } + bool valid() override { return false; } + int next() override { ceph_abort(); return 0; } + string key() override { ceph_abort(); return ""; } + bufferlist value() override { ceph_abort(); return bufferlist(); } + int status() override { return 0; } + }; + + + /// Iterator + class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl { + public: + DBObjectMap *map; + + /// NOTE: implicit lock hlock->get_locked() when returned out of the class + MapHeaderLock hlock; + /// NOTE: implicit lock on header->seq AND for all ancestors + Header header; + + /// parent_iter == NULL iff no parent + std::shared_ptr<DBObjectMapIteratorImpl> parent_iter; + KeyValueDB::Iterator key_iter; + KeyValueDB::Iterator complete_iter; + + /// cur_iter points to currently valid iterator + std::shared_ptr<ObjectMapIteratorImpl> cur_iter; + int r; + + /// init() called, key_iter, complete_iter, parent_iter filled in + bool ready; + /// past end + bool invalid; + + DBObjectMapIteratorImpl(DBObjectMap *map, Header header) : + map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {} + int seek_to_first() override; + int seek_to_last(); + int upper_bound(const string &after) override; + int lower_bound(const string &to) override; + bool valid() override; + int next() override; + string key() override; + bufferlist value() override; + int status() override; + + bool on_parent() { + return cur_iter == parent_iter; + } + + /// skips to next valid parent entry + int next_parent(); + + /// first parent() >= to + int lower_bound_parent(const string &to); + + /** + * Tests whether to_test is in complete region + * + * postcondition: complete_iter will be max s.t. 
complete_iter->value > to_test + */ + int in_complete_region(const string &to_test, ///< [in] key to test + string *begin, ///< [out] beginning of region + string *end ///< [out] end of region + ); ///< @returns true if to_test is in the complete region, else false + + private: + int init(); + bool valid_parent(); + int adjust(); + }; + + typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator; + DBObjectMapIterator _get_iterator(Header header) { + return std::make_shared<DBObjectMapIteratorImpl>(this, header); + } + + /// sys + + /// Removes node corresponding to header + void clear_header(Header header, KeyValueDB::Transaction t); + + /// Set node containing input to new contents + void set_header(Header input, KeyValueDB::Transaction t); + + /// Remove leaf node corresponding to oid in c + void remove_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + Header header, + KeyValueDB::Transaction t); + + /// Set leaf node for c and oid to the value of header + void set_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, _Header header, + KeyValueDB::Transaction t); + + /// Set leaf node for c and oid to the value of header + bool check_spos(const ghobject_t &oid, + Header header, + const SequencerPosition *spos); + + /// Lookup or create header for c oid + Header lookup_create_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + KeyValueDB::Transaction t); + + /** + * Generate new header for c oid with new seq number + * + * Has the side effect of synchronously saving the new DBObjectMap state + */ + Header _generate_new_header(const ghobject_t &oid, Header parent); + Header generate_new_header(const ghobject_t &oid, Header parent) { + Mutex::Locker l(header_lock); + return _generate_new_header(oid, parent); + } + + /// Lookup leaf header for c oid + Header _lookup_map_header( + const MapHeaderLock &l, + const ghobject_t &oid); + Header lookup_map_header( + const MapHeaderLock &l2, + const ghobject_t &oid) { + Mutex::Locker l(header_lock); + return _lookup_map_header(l2, oid); + } + + /// Lookup header node for input + Header lookup_parent(Header input); + + + /// Helpers + int _get_header(Header header, bufferlist *bl); + + /// Scan keys in header into out_keys and out_values (if nonnull) + int scan(Header header, + const set<string> &in_keys, + set<string> *out_keys, + map<string, bufferlist> *out_values); + + /// Remove header and all related prefixes + int _clear(Header header, + KeyValueDB::Transaction t); + + /* Scan complete region bumping *begin to the beginning of any + * containing region and adding all complete region keys between + * the updated begin and end to the complete_keys_to_remove set */ + int merge_new_complete(DBObjectMapIterator &iter, + string *begin, + const string &end, + set<string> *complete_keys_to_remove); + + /// Writes out State (mainly next_seq) + int write_state(KeyValueDB::Transaction _t = + KeyValueDB::Transaction()); + + /// Copies header entry from parent @see rm_keys + int copy_up_header(Header header, + KeyValueDB::Transaction t); + + /// Sets header @see set_header + void _set_header(Header header, const bufferlist &bl, + KeyValueDB::Transaction t); + + /** + * Removes header seq lock and possibly object lock + * once Header is out of scope + * @see lookup_parent + * @see generate_new_header + */ + class RemoveOnDelete { + public: + DBObjectMap *db; + explicit RemoveOnDelete(DBObjectMap *db) : + db(db) {} + void operator() (_Header *header) { + Mutex::Locker l(db->header_lock); + 
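      // header_lock is needed here because in_use and header_cond form the
      // handshake that lookup_parent() and _generate_new_header() rely on:
      // erasing this seq and signalling under the lock lets a waiter blocked
      // in lookup_parent() safely re-acquire the seq.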
ceph_assert(db->in_use.count(header->seq)); + db->in_use.erase(header->seq); + db->header_cond.Signal(); + delete header; + } + }; + friend class RemoveOnDelete; +}; +WRITE_CLASS_ENCODER(DBObjectMap::_Header) +WRITE_CLASS_ENCODER(DBObjectMap::State) + +ostream& operator<<(ostream& out, const DBObjectMap::_Header& h); + +#endif diff --git a/src/os/filestore/FDCache.h b/src/os/filestore/FDCache.h new file mode 100644 index 00000000..ee8c4fb0 --- /dev/null +++ b/src/os/filestore/FDCache.h @@ -0,0 +1,112 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_FDCACHE_H +#define CEPH_FDCACHE_H + +#include <memory> +#include <errno.h> +#include <cstdio> +#include "common/config_obs.h" +#include "common/hobject.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/shared_cache.hpp" +#include "include/compat.h" +#include "include/intarith.h" + +/** + * FD Cache + */ +class FDCache : public md_config_obs_t { +public: + /** + * FD + * + * Wrapper for an fd. Destructor closes the fd. + */ + class FD { + public: + const int fd; + explicit FD(int _fd) : fd(_fd) { + ceph_assert(_fd >= 0); + } + int operator*() const { + return fd; + } + ~FD() { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + }; + +private: + CephContext *cct; + const int registry_shards; + SharedLRU<ghobject_t, FD> *registry; + +public: + explicit FDCache(CephContext *cct) : cct(cct), + registry_shards(std::max<int64_t>(cct->_conf->filestore_fd_cache_shards, 1)) { + ceph_assert(cct); + cct->_conf.add_observer(this); + registry = new SharedLRU<ghobject_t, FD>[registry_shards]; + for (int i = 0; i < registry_shards; ++i) { + registry[i].set_cct(cct); + registry[i].set_size( + std::max<int64_t>((cct->_conf->filestore_fd_cache_size / registry_shards), 1)); + } + } + ~FDCache() override { + cct->_conf.remove_observer(this); + delete[] registry; + } + typedef std::shared_ptr<FD> FDRef; + + FDRef lookup(const ghobject_t &hoid) { + int registry_id = hoid.hobj.get_hash() % registry_shards; + return registry[registry_id].lookup(hoid); + } + + FDRef add(const ghobject_t &hoid, int fd, bool *existed) { + int registry_id = hoid.hobj.get_hash() % registry_shards; + return registry[registry_id].add(hoid, new FD(fd), existed); + } + + /// clear cached fd for hoid, subsequent lookups will get an empty FD + void clear(const ghobject_t &hoid) { + int registry_id = hoid.hobj.get_hash() % registry_shards; + registry[registry_id].purge(hoid); + } + + /// md_config_obs_t + const char** get_tracked_conf_keys() const override { + static const char* KEYS[] = { + "filestore_fd_cache_size", + NULL + }; + return KEYS; + } + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) override { + if (changed.count("filestore_fd_cache_size")) { + for (int i = 0; i < registry_shards; ++i) + registry[i].set_size( + std::max<int64_t>((conf->filestore_fd_cache_size / registry_shards), 1)); + } + } + +}; +typedef FDCache::FDRef FDRef; + +#endif diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc new file mode 100644 index 00000000..f0351fe4 --- /dev/null +++ b/src/os/filestore/FileJournal.cc @@ -0,0 +1,2216 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "acconfig.h" + +#include "common/debug.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "FileJournal.h" +#include "include/color.h" +#include "common/perf_counters.h" +#include "FileStore.h" + +#include "include/compat.h" + +#include <fcntl.h> +#include <limits.h> +#include <sstream> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mount.h> + +#include "common/blkdev.h" +#if defined(__linux__) +#include "common/linux_version.h" +#endif + +#if defined(__FreeBSD__) +#define O_DSYNC O_SYNC +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_journal +#undef dout_prefix +#define dout_prefix *_dout << "journal " + +const static int64_t ONE_MEG(1 << 20); +const static int CEPH_DIRECTIO_ALIGNMENT(4096); + + +int FileJournal::_open(bool forwrite, bool create) +{ + int flags, ret; + + if (forwrite) { + flags = O_RDWR; + if (directio) + flags |= O_DIRECT | O_DSYNC; + } else { + flags = O_RDONLY; + } + if (create) + flags |= O_CREAT; + + if (fd >= 0) { + if (TEMP_FAILURE_RETRY(::close(fd))) { + int err = errno; + derr << "FileJournal::_open: error closing old fd: " + << cpp_strerror(err) << dendl; + } + } + fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags|O_CLOEXEC, 0644)); + if (fd < 0) { + int err = errno; + dout(2) << "FileJournal::_open unable to open journal " + << fn << ": " << cpp_strerror(err) << dendl; + return -err; + } + + struct stat st; + ret = ::fstat(fd, &st); + if (ret) { + ret = errno; + derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl; + ret = -ret; + goto out_fd; + } + + if (S_ISBLK(st.st_mode)) { + ret = _open_block_device(); + } else if (S_ISREG(st.st_mode)) { + if (aio && !force_aio) { + derr << "FileJournal::_open: disabling aio for non-block journal. Use " + << "journal_force_aio to force use of aio anyway" << dendl; + aio = false; + } + ret = _open_file(st.st_size, st.st_blksize, create); + } else { + derr << "FileJournal::_open: wrong journal file type: " << st.st_mode + << dendl; + ret = -EINVAL; + } + + if (ret) + goto out_fd; + +#ifdef HAVE_LIBAIO + if (aio) { + aio_ctx = 0; + ret = io_setup(128, &aio_ctx); + if (ret < 0) { + switch (ret) { + // Contrary to naive expectations -EAGIAN means ... + case -EAGAIN: + derr << "FileJournal::_open: user's limit of aio events exceeded. " + << "Try increasing /proc/sys/fs/aio-max-nr" << dendl; + break; + default: + derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl; + break; + } + goto out_fd; + } + } +#endif + + /* We really want max_size to be a multiple of block_size. 
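+     Worked example (illustrative numbers only): with block_size = 4096 and
+     a 10,000,000 byte journal, the rounding below trims max_size to
+     2441 * 4096 = 9,998,336 bytes, i.e. it drops the trailing partial
+     block of 1,664 bytes so every write offset stays block aligned.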
*/ + max_size -= max_size % block_size; + + dout(1) << "_open " << fn << " fd " << fd + << ": " << max_size + << " bytes, block size " << block_size + << " bytes, directio = " << directio + << ", aio = " << aio + << dendl; + return 0; + + out_fd: + VOID_TEMP_FAILURE_RETRY(::close(fd)); + fd = -1; + return ret; +} + +int FileJournal::_open_block_device() +{ + int64_t bdev_sz = 0; + BlkDev blkdev(fd); + int ret = blkdev.get_size(&bdev_sz); + if (ret) { + dout(0) << __func__ << ": failed to read block device size." << dendl; + return -EIO; + } + + /* Check for bdev_sz too small */ + if (bdev_sz < ONE_MEG) { + dout(0) << __func__ << ": your block device must be at least " + << ONE_MEG << " bytes to be used for a Ceph journal." << dendl; + return -EINVAL; + } + + dout(10) << __func__ << ": ignoring osd journal size. " + << "We'll use the entire block device (size: " << bdev_sz << ")" + << dendl; + max_size = bdev_sz; + + block_size = cct->_conf->journal_block_size; + + if (cct->_conf->journal_discard) { + discard = blkdev.support_discard(); + dout(10) << fn << " support discard: " << (int)discard << dendl; + } + + return 0; +} + +int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, + bool create) +{ + int ret; + int64_t conf_journal_sz(cct->_conf->osd_journal_size); + conf_journal_sz <<= 20; + + if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) { + derr << "I'm sorry, I don't know how large of a journal to create." + << "Please specify a block device to use as the journal OR " + << "set osd_journal_size in your ceph.conf" << dendl; + return -EINVAL; + } + + if (create && (oldsize < conf_journal_sz)) { + uint64_t newsize(conf_journal_sz); + dout(10) << __func__ << " _open extending to " << newsize << " bytes" << dendl; + ret = ::ftruncate(fd, newsize); + if (ret < 0) { + int err = errno; + derr << "FileJournal::_open_file : unable to extend journal to " + << newsize << " bytes: " << cpp_strerror(err) << dendl; + return -err; + } + ret = ceph_posix_fallocate(fd, 0, newsize); + if (ret) { + derr << "FileJournal::_open_file : unable to preallocation journal to " + << newsize << " bytes: " << cpp_strerror(ret) << dendl; + return -ret; + } + max_size = newsize; + } + else { + max_size = oldsize; + } + block_size = cct->_conf->journal_block_size; + + if (create && cct->_conf->journal_zero_on_create) { + derr << "FileJournal::_open_file : zeroing journal" << dendl; + uint64_t write_size = 1 << 20; + char *buf; + ret = ::posix_memalign((void **)&buf, block_size, write_size); + if (ret != 0) { + return -ret; + } + memset(static_cast<void*>(buf), 0, write_size); + uint64_t i = 0; + for (; (i + write_size) <= (uint64_t)max_size; i += write_size) { + ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i); + if (ret < 0) { + free(buf); + return -errno; + } + } + if (i < (uint64_t)max_size) { + ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i); + if (ret < 0) { + free(buf); + return -errno; + } + } + free(buf); + } + + + dout(10) << "_open journal is not a block device, NOT checking disk " + << "write cache on '" << fn << "'" << dendl; + + return 0; +} + +// This can not be used on an active journal +int FileJournal::check() +{ + int ret; + + ceph_assert(fd == -1); + ret = _open(false, false); + if (ret) + return ret; + + ret = read_header(&header); + if (ret < 0) + goto done; + + if (header.fsid != fsid) { + derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid + << ", invalid (someone else's?) 
journal" << dendl; + ret = -EINVAL; + goto done; + } + + dout(1) << "check: header looks ok" << dendl; + ret = 0; + + done: + close(); + return ret; +} + + +int FileJournal::create() +{ + void *buf = 0; + int64_t needed_space; + int ret; + buffer::ptr bp; + dout(2) << "create " << fn << " fsid " << fsid << dendl; + + ret = _open(true, true); + if (ret) + goto done; + + // write empty header + header = header_t(); + header.flags = header_t::FLAG_CRC; // enable crcs on any new journal. + header.fsid = fsid; + header.max_size = max_size; + header.block_size = block_size; + if (cct->_conf->journal_block_align || directio) + header.alignment = block_size; + else + header.alignment = 16; // at least stay word aligned on 64bit machines... + + header.start = get_top(); + header.start_seq = 0; + + print_header(header); + + // static zeroed buffer for alignment padding + delete [] zero_buf; + zero_buf = new char[header.alignment]; + memset(zero_buf, 0, header.alignment); + + bp = prepare_header(); + if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) { + ret = -errno; + derr << "FileJournal::create : create write header error " + << cpp_strerror(ret) << dendl; + goto close_fd; + } + + // zero first little bit, too. + ret = posix_memalign(&buf, block_size, block_size); + if (ret) { + ret = -ret; + derr << "FileJournal::create: failed to allocate " << block_size + << " bytes of memory: " << cpp_strerror(ret) << dendl; + goto close_fd; + } + memset(buf, 0, block_size); + if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) { + ret = -errno; + derr << "FileJournal::create: error zeroing first " << block_size + << " bytes " << cpp_strerror(ret) << dendl; + goto free_buf; + } + + needed_space = cct->_conf->osd_max_write_size << 20; + needed_space += (2 * sizeof(entry_header_t)) + get_top(); + if (header.max_size - header.start < needed_space) { + derr << "FileJournal::create: OSD journal is not large enough to hold " + << "osd_max_write_size bytes!" << dendl; + ret = -ENOSPC; + goto free_buf; + } + + dout(2) << "create done" << dendl; + ret = 0; + +free_buf: + free(buf); + buf = 0; +close_fd: + if (TEMP_FAILURE_RETRY(::close(fd)) < 0) { + ret = -errno; + derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret) + << dendl; + } +done: + fd = -1; + return ret; +} + +// This can not be used on an active journal +int FileJournal::peek_fsid(uuid_d& fsid) +{ + ceph_assert(fd == -1); + int r = _open(false, false); + if (r) + return r; + r = read_header(&header); + if (r < 0) + goto out; + fsid = header.fsid; +out: + close(); + return r; +} + +int FileJournal::open(uint64_t fs_op_seq) +{ + dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl; + + uint64_t next_seq = fs_op_seq + 1; + uint64_t seq = -1; + + int err = _open(false); + if (err) + return err; + + // assume writeable, unless... + read_pos = 0; + write_pos = get_top(); + + // read header? + err = read_header(&header); + if (err < 0) + goto out; + + // static zeroed buffer for alignment padding + delete [] zero_buf; + zero_buf = new char[header.alignment]; + memset(zero_buf, 0, header.alignment); + + dout(10) << "open header.fsid = " << header.fsid + //<< " vs expected fsid = " << fsid + << dendl; + if (header.fsid != fsid) { + derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid + << ", invalid (someone else's?) 
journal" << dendl; + err = -EINVAL; + goto out; + } + if (header.max_size > max_size) { + dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl; + err = -EINVAL; + goto out; + } + if (header.block_size != block_size) { + dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl; + err = -EINVAL; + goto out; + } + if (header.max_size % header.block_size) { + dout(2) << "open journal max size " << header.max_size + << " not a multiple of block size " << header.block_size << dendl; + err = -EINVAL; + goto out; + } + if (header.alignment != block_size && directio) { + dout(0) << "open journal alignment " << header.alignment << " does not match block size " + << block_size << " (required for direct_io journal mode)" << dendl; + err = -EINVAL; + goto out; + } + if ((header.alignment % CEPH_DIRECTIO_ALIGNMENT) && directio) { + dout(0) << "open journal alignment " << header.alignment + << " is not multiple of minimum directio alignment " + << CEPH_DIRECTIO_ALIGNMENT << " (required for direct_io journal mode)" + << dendl; + err = -EINVAL; + goto out; + } + + // looks like a valid header. + write_pos = 0; // not writeable yet + + journaled_seq = header.committed_up_to; + + // find next entry + read_pos = header.start; + seq = header.start_seq; + + while (1) { + bufferlist bl; + off64_t old_pos = read_pos; + if (!read_entry(bl, seq)) { + dout(10) << "open reached end of journal." << dendl; + break; + } + if (seq > next_seq) { + dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq + << ", ignoring journal contents" + << dendl; + read_pos = -1; + last_committed_seq = 0; + return 0; + } + if (seq == next_seq) { + dout(10) << "open reached seq " << seq << dendl; + read_pos = old_pos; + break; + } + seq++; // next event should follow. 
+ } + + return 0; +out: + close(); + return err; +} + +void FileJournal::_close(int fd) const +{ + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + +void FileJournal::close() +{ + dout(1) << "close " << fn << dendl; + + // stop writer thread + stop_writer(); + + // close + ceph_assert(writeq_empty()); + ceph_assert(!must_write_header); + ceph_assert(fd >= 0); + _close(fd); + fd = -1; +} + + +int FileJournal::dump(ostream& out) +{ + return _dump(out, false); +} + +int FileJournal::simple_dump(ostream& out) +{ + return _dump(out, true); +} + +int FileJournal::_dump(ostream& out, bool simple) +{ + JSONFormatter f(true); + int ret = _fdump(f, simple); + f.flush(out); + return ret; +} + +int FileJournal::_fdump(Formatter &f, bool simple) +{ + dout(10) << "_fdump" << dendl; + + ceph_assert(fd == -1); + int err = _open(false, false); + if (err) + return err; + + err = read_header(&header); + if (err < 0) { + close(); + return err; + } + + off64_t next_pos = header.start; + + f.open_object_section("journal"); + + f.open_object_section("header"); + f.dump_unsigned("flags", header.flags); + ostringstream os; + os << header.fsid; + f.dump_string("fsid", os.str()); + f.dump_unsigned("block_size", header.block_size); + f.dump_unsigned("alignment", header.alignment); + f.dump_int("max_size", header.max_size); + f.dump_int("start", header.start); + f.dump_unsigned("committed_up_to", header.committed_up_to); + f.dump_unsigned("start_seq", header.start_seq); + f.close_section(); + + f.open_array_section("entries"); + uint64_t seq = header.start_seq; + while (1) { + bufferlist bl; + off64_t pos = next_pos; + + if (!pos) { + dout(2) << "_dump -- not readable" << dendl; + err = -EINVAL; + break; + } + stringstream ss; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + &ss); + if (result != SUCCESS) { + if (seq < header.committed_up_to) { + dout(2) << "Unable to read past sequence " << seq + << " but header indicates the journal has committed up through " + << header.committed_up_to << ", journal is corrupt" << dendl; + err = -EINVAL; + } + dout(25) << ss.str() << dendl; + dout(25) << "No further valid entries found, journal is most likely valid" + << dendl; + break; + } + + f.open_object_section("entry"); + f.dump_unsigned("offset", pos); + f.dump_unsigned("seq", seq); + if (simple) { + f.dump_unsigned("bl.length", bl.length()); + } else { + f.open_array_section("transactions"); + auto p = bl.cbegin(); + int trans_num = 0; + while (!p.end()) { + ObjectStore::Transaction t(p); + f.open_object_section("transaction"); + f.dump_unsigned("trans_num", trans_num); + t.dump(&f); + f.close_section(); + trans_num++; + } + f.close_section(); + } + f.close_section(); + } + + f.close_section(); + f.close_section(); + dout(10) << "dump finish" << dendl; + + close(); + return err; +} + + +void FileJournal::start_writer() +{ + write_stop = false; + aio_stop = false; + write_thread.create("journal_write"); +#ifdef HAVE_LIBAIO + if (aio) + write_finish_thread.create("journal_wrt_fin"); +#endif +} + +void FileJournal::stop_writer() +{ + // Do nothing if writer already stopped or never started + if (!write_stop) + { + { + Mutex::Locker l(write_lock); + Mutex::Locker p(writeq_lock); + write_stop = true; + writeq_cond.Signal(); + // Doesn't hurt to signal commit_cond in case thread is waiting there + // and caller didn't use committed_thru() first. 
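+      // Waking commit_cond here lets a write thread that is blocked in
+      // write_thread_entry() on a full journal notice write_stop and drain
+      // its queue instead of waiting for a commit that will never come.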
+ commit_cond.Signal(); + } + write_thread.join(); + + // write journal header now so that we have less to replay on remount + write_header_sync(); + } + +#ifdef HAVE_LIBAIO + // stop aio completeion thread *after* writer thread has stopped + // and has submitted all of its io + if (aio && !aio_stop) { + aio_lock.Lock(); + aio_stop = true; + aio_cond.Signal(); + write_finish_cond.Signal(); + aio_lock.Unlock(); + write_finish_thread.join(); + } +#endif +} + + + +void FileJournal::print_header(const header_t &header) const +{ + dout(10) << "header: block_size " << header.block_size + << " alignment " << header.alignment + << " max_size " << header.max_size + << dendl; + dout(10) << "header: start " << header.start << dendl; + dout(10) << " write_pos " << write_pos << dendl; +} + +int FileJournal::read_header(header_t *hdr) const +{ + dout(10) << "read_header" << dendl; + bufferlist bl; + + buffer::ptr bp = buffer::create_small_page_aligned(block_size); + char* bpdata = bp.c_str(); + int r = ::pread(fd, bpdata, bp.length(), 0); + + if (r < 0) { + int err = errno; + dout(0) << "read_header got " << cpp_strerror(err) << dendl; + return -err; + } + + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + if (bp.length() != (size_t)r) { + // r will be always less or equal than bp.length + bpdata += r; + memset(bpdata, 0, bp.length() - r); + } + + bl.push_back(std::move(bp)); + + try { + auto p = bl.cbegin(); + decode(*hdr, p); + } + catch (buffer::error& e) { + derr << "read_header error decoding journal header" << dendl; + return -EINVAL; + } + + + /* + * Unfortunately we weren't initializing the flags field for new + * journals! Aie. This is safe(ish) now that we have only one + * flag. Probably around when we add the next flag we need to + * remove this or else this (eventually old) code will clobber newer + * code's flags. + */ + if (hdr->flags > 3) { + derr << "read_header appears to have gibberish flags; assuming 0" << dendl; + hdr->flags = 0; + } + + print_header(*hdr); + + return 0; +} + +bufferptr FileJournal::prepare_header() +{ + bufferlist bl; + { + Mutex::Locker l(finisher_lock); + header.committed_up_to = journaled_seq; + } + encode(header, bl); + bufferptr bp = buffer::create_small_page_aligned(get_top()); + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + char* data = bp.c_str(); + memcpy(data, bl.c_str(), bl.length()); + data += bl.length(); + memset(data, 0, bp.length()-bl.length()); + return bp; +} + +void FileJournal::write_header_sync() +{ + Mutex::Locker locker(write_lock); + must_write_header = true; + bufferlist bl; + do_write(bl); + dout(20) << __func__ << " finish" << dendl; +} + +int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size) +{ + // already full? + if (full_state != FULL_NOTFULL) + return -ENOSPC; + + // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL. 
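+  // Worked example (illustrative numbers): max_size = 1048576, get_top() =
+  // 4096, header.start = 300000.  With pos = 700000 (cursor ahead of start)
+  //   room = (1048576 - 700000) + (300000 - 4096) - 1 = 644479 bytes;
+  // with pos = 100000 (cursor wrapped behind start)
+  //   room = 300000 - 100000 - 1 = 199999 bytes.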
+ off64_t room; + if (pos >= header.start) + room = (header.max_size - pos) + (header.start - get_top()) - 1; + else + room = header.start - pos - 1; + dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start + << " top " << get_top() << dendl; + + if (do_sync_cond) { + if (room >= (header.max_size >> 1) && + room - size < (header.max_size >> 1)) { + dout(10) << " passing half full mark, triggering commit" << dendl; + do_sync_cond->SloppySignal(); // initiate a real commit so we can trim + } + } + + if (room >= size) { + dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl; + if (pos + size > header.max_size) + must_write_header = true; + return 0; + } + + // full + dout(1) << "check_for_full at " << pos << " : JOURNAL FULL " + << pos << " >= " << room + << " (max_size " << header.max_size << " start " << header.start << ")" + << dendl; + + off64_t max = header.max_size - get_top(); + if (size > max) + dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl; + + return -ENOSPC; +} + +int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes) +{ + // gather queued writes + off64_t queue_pos = write_pos; + + int eleft = cct->_conf->journal_max_write_entries; + unsigned bmax = cct->_conf->journal_max_write_bytes; + + if (full_state != FULL_NOTFULL) + return -ENOSPC; + + while (!writeq_empty()) { + list<write_item> items; + batch_pop_write(items); + list<write_item>::iterator it = items.begin(); + while (it != items.end()) { + uint64_t bytes = it->bl.length(); + int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes); + if (r == 0) { // prepare ok, delete it + items.erase(it++); +#ifdef HAVE_LIBAIO + { + Mutex::Locker locker(aio_lock); + ceph_assert(aio_write_queue_ops > 0); + aio_write_queue_ops--; + ceph_assert(aio_write_queue_bytes >= bytes); + aio_write_queue_bytes -= bytes; + } +#else + (void)bytes; +#endif + } + if (r == -ENOSPC) { + // the journal maybe full, insert the left item to writeq + batch_unpop_write(items); + if (orig_ops) + goto out; // commit what we have + + if (logger) + logger->inc(l_filestore_journal_full); + + if (wait_on_full) { + dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl; + } else { + dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl; + + // throw out what we have so far + full_state = FULL_FULL; + while (!writeq_empty()) { + complete_write(1, peek_write().orig_len); + pop_write(); + } + print_header(header); + } + + return -ENOSPC; // hrm, full on first op + } + if (eleft) { + if (--eleft == 0) { + dout(20) << "prepare_multi_write hit max events per write " + << cct->_conf->journal_max_write_entries << dendl; + batch_unpop_write(items); + goto out; + } + } + if (bmax) { + if (bl.length() >= bmax) { + dout(20) << "prepare_multi_write hit max write size " + << cct->_conf->journal_max_write_bytes << dendl; + batch_unpop_write(items); + goto out; + } + } + } + } + +out: + dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl; + ceph_assert((write_pos + bl.length() == queue_pos) || + (write_pos + bl.length() - header.max_size + get_top() == queue_pos)); + return 0; +} + +/* +void FileJournal::queue_write_fin(uint64_t seq, Context *fin) +{ + writing_seq.push_back(seq); + if (!waiting_for_notfull.empty()) { + // make sure previously unjournaled stuff waiting for UNFULL triggers + // _before_ newly 
journaled stuff does + dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin + << " until after UNFULL" << dendl; + C_Gather *g = new C_Gather(writeq.front().fin); + writing_fin.push_back(g->new_sub()); + waiting_for_notfull.push_back(g->new_sub()); + } else { + writing_fin.push_back(writeq.front().fin); + dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl; + } +} +*/ + +void FileJournal::queue_completions_thru(uint64_t seq) +{ + ceph_assert(finisher_lock.is_locked()); + utime_t now = ceph_clock_now(); + list<completion_item> items; + batch_pop_completions(items); + list<completion_item>::iterator it = items.begin(); + while (it != items.end()) { + completion_item& next = *it; + if (next.seq > seq) + break; + utime_t lat = now; + lat -= next.start; + dout(10) << "queue_completions_thru seq " << seq + << " queueing seq " << next.seq + << " " << next.finish + << " lat " << lat << dendl; + if (logger) { + logger->tinc(l_filestore_journal_latency, lat); + } + if (next.finish) + finisher->queue(next.finish); + if (next.tracked_op) { + next.tracked_op->mark_event("journaled_completion_queued"); + next.tracked_op->journal_trace.event("queued completion"); + next.tracked_op->journal_trace.keyval("completed through", seq); + } + items.erase(it++); + } + batch_unpop_completions(items); + finisher_cond.Signal(); +} + + +int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes) +{ + uint64_t seq = next_write.seq; + bufferlist &ebl = next_write.bl; + off64_t size = ebl.length(); + + int r = check_for_full(seq, queue_pos, size); + if (r < 0) + return r; // ENOSPC or EAGAIN + + uint32_t orig_len = next_write.orig_len; + orig_bytes += orig_len; + orig_ops++; + + // add to write buffer + dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq + << " len " << orig_len << " -> " << size << dendl; + + unsigned seq_offset = offsetof(entry_header_t, seq); + unsigned magic1_offset = offsetof(entry_header_t, magic1); + unsigned magic2_offset = offsetof(entry_header_t, magic2); + + bufferptr headerptr = ebl.buffers().front(); + uint64_t _seq = seq; + uint64_t _queue_pos = queue_pos; + uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64()); + headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq); + headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos); + headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2); + + bufferptr footerptr = ebl.buffers().back(); + unsigned post_offset = footerptr.length() - sizeof(entry_header_t); + footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq); + footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos); + footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2); + + bl.claim_append(ebl); + if (next_write.tracked_op) { + next_write.tracked_op->mark_event("write_thread_in_journal_buffer"); + next_write.tracked_op->journal_trace.event("prepare_single_write"); + } + + journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos)); + writing_seq = seq; + + queue_pos += size; + if (queue_pos >= header.max_size) + queue_pos = queue_pos + get_top() - header.max_size; + + return 0; +} + +void FileJournal::check_align(off64_t pos, bufferlist& bl) +{ + // make sure list segments are page aligned + if (directio && !bl.is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT)) { + 
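+    // O_DIRECT requires both the target offset and every memory segment to
+    // be CEPH_DIRECTIO_ALIGNMENT (4096) aligned; prepare_entry() rebuilds
+    // the bufferlist with rebuild_aligned() when directio is set, so landing
+    // here means an upstream bug and we abort below.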
ceph_assert((bl.length() & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0); + ceph_assert((pos & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0); + ceph_abort_msg("bl was not aligned"); + } +} + +int FileJournal::write_bl(off64_t& pos, bufferlist& bl) +{ + int ret; + + off64_t spos = ::lseek64(fd, pos, SEEK_SET); + if (spos < 0) { + ret = -errno; + derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl; + return ret; + } + ret = bl.write_fd(fd); + if (ret) { + derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl; + return ret; + } + pos += bl.length(); + if (pos == header.max_size) + pos = get_top(); + return 0; +} + +void FileJournal::do_write(bufferlist& bl) +{ + // nothing to do? + if (bl.length() == 0 && !must_write_header) + return; + + buffer::ptr hbp; + if (cct->_conf->journal_write_header_frequency && + (((++journaled_since_start) % + cct->_conf->journal_write_header_frequency) == 0)) { + must_write_header = true; + } + + if (must_write_header) { + must_write_header = false; + hbp = prepare_header(); + } + + dout(15) << "do_write writing " << write_pos << "~" << bl.length() + << (hbp.length() ? " + header":"") + << dendl; + + utime_t from = ceph_clock_now(); + + // entry + off64_t pos = write_pos; + + // Adjust write_pos + write_pos += bl.length(); + if (write_pos >= header.max_size) + write_pos = write_pos - header.max_size + get_top(); + + write_lock.Unlock(); + + // split? + off64_t split = 0; + if (pos + bl.length() > header.max_size) { + bufferlist first, second; + split = header.max_size - pos; + first.substr_of(bl, 0, split); + second.substr_of(bl, split, bl.length() - split); + ceph_assert(first.length() + second.length() == bl.length()); + dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length() + << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl; + + //Save pos to write first piece second + off64_t first_pos = pos; + off64_t orig_pos; + pos = get_top(); + // header too? + if (hbp.length()) { + // be sneaky: include the header in the second fragment + bufferlist tmp; + tmp.push_back(hbp); + tmp.claim_append(second); + second.swap(tmp); + pos = 0; // we included the header + } + // Write the second portion first possible with the header, so + // do_read_entry() won't even get a valid entry_header_t if there + // is a crash between the two writes. + orig_pos = pos; + if (write_bl(pos, second)) { + derr << "FileJournal::do_write: write_bl(pos=" << orig_pos + << ") failed" << dendl; + check_align(pos, second); + ceph_abort(); + } + orig_pos = first_pos; + if (write_bl(first_pos, first)) { + derr << "FileJournal::do_write: write_bl(pos=" << orig_pos + << ") failed" << dendl; + check_align(first_pos, first); + ceph_abort(); + } + ceph_assert(first_pos == get_top()); + } else { + // header too? 
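+    // Non-wrapping path: the refreshed header (if any) is pwritten at
+    // offset 0 first, then the batched entries go out contiguously at pos.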
+ if (hbp.length()) { + if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) { + int err = errno; + derr << "FileJournal::do_write: pwrite(fd=" << fd + << ", hbp.length=" << hbp.length() << ") failed :" + << cpp_strerror(err) << dendl; + ceph_abort(); + } + } + + if (write_bl(pos, bl)) { + derr << "FileJournal::do_write: write_bl(pos=" << pos + << ") failed" << dendl; + check_align(pos, bl); + ceph_abort(); + } + } + + if (!directio) { + dout(20) << "do_write fsync" << dendl; + + /* + * We'd really love to have a fsync_range or fdatasync_range and do a: + * + * if (split) { + * ::fsync_range(fd, header.max_size - split, split)l + * ::fsync_range(fd, get_top(), bl.length() - split); + * else + * ::fsync_range(fd, write_pos, bl.length()) + * + * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be + * too hard given all the underlying infrastructure already exist. + * + * NOTE: using sync_file_range here would not be safe as it does not + * flush disk caches or commits any sort of metadata. + */ + int ret = 0; +#if defined(__APPLE__) || defined(__FreeBSD__) + ret = ::fsync(fd); +#else + ret = ::fdatasync(fd); +#endif + if (ret < 0) { + derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +#ifdef HAVE_POSIX_FADVISE + if (cct->_conf->filestore_fadvise) + posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + } + + utime_t lat = ceph_clock_now() - from; + dout(20) << "do_write latency " << lat << dendl; + + write_lock.Lock(); + + ceph_assert(write_pos == pos); + ceph_assert(write_pos % header.alignment == 0); + + { + Mutex::Locker locker(finisher_lock); + journaled_seq = writing_seq; + + // kick finisher? + // only if we haven't filled up recently! + if (full_state != FULL_NOTFULL) { + dout(10) << "do_write NOT queueing finisher seq " << journaled_seq + << ", full_commit_seq|full_restart_seq" << dendl; + } else { + if (plug_journal_completions) { + dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq + << " due to completion plug" << dendl; + } else { + dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl; + queue_completions_thru(journaled_seq); + } + } + } +} + +void FileJournal::flush() +{ + dout(10) << "waiting for completions to empty" << dendl; + { + Mutex::Locker l(finisher_lock); + while (!completions_empty()) + finisher_cond.Wait(finisher_lock); + } + dout(10) << "flush waiting for finisher" << dendl; + finisher->wait_for_empty(); + dout(10) << "flush done" << dendl; +} + + +void FileJournal::write_thread_entry() +{ + dout(10) << "write_thread_entry start" << dendl; + while (1) { + { + Mutex::Locker locker(writeq_lock); + if (writeq.empty() && !must_write_header) { + if (write_stop) + break; + dout(20) << "write_thread_entry going to sleep" << dendl; + writeq_cond.Wait(writeq_lock); + dout(20) << "write_thread_entry woke up" << dendl; + continue; + } + } + +#ifdef HAVE_LIBAIO + if (aio) { + Mutex::Locker locker(aio_lock); + // should we back off to limit aios in flight? try to do this + // adaptively so that we submit larger aios once we have lots of + // them in flight. + // + // NOTE: our condition here is based on aio_num (protected by + // aio_lock) and throttle_bytes (part of the write queue). when + // we sleep, we *only* wait for aio_num to change, and do not + // wake when more data is queued. 
this is not strictly correct, + // but should be fine given that we will have plenty of aios in + // flight if we hit this limit to ensure we keep the device + // saturated. + while (aio_num > 0) { + int exp = std::min<int>(aio_num * 2, 24); + long unsigned min_new = 1ull << exp; + uint64_t cur = aio_write_queue_bytes; + dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes + << " ... exp " << exp << " min_new " << min_new + << " ... pending " << cur << dendl; + if (cur >= min_new) + break; + dout(20) << "write_thread_entry deferring until more aios complete: " + << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new + << " bytes to start a new aio (currently " << cur << " pending)" << dendl; + aio_cond.Wait(aio_lock); + dout(20) << "write_thread_entry woke up" << dendl; + } + } +#endif + + Mutex::Locker locker(write_lock); + uint64_t orig_ops = 0; + uint64_t orig_bytes = 0; + + bufferlist bl; + int r = prepare_multi_write(bl, orig_ops, orig_bytes); + // Don't care about journal full if stoppping, so drop queue and + // possibly let header get written and loop above to notice stop + if (r == -ENOSPC) { + if (write_stop) { + dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl; + while (!writeq_empty()) { + complete_write(1, peek_write().orig_len); + pop_write(); + } + print_header(header); + r = 0; + } else { + dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl; + commit_cond.Wait(write_lock); + dout(20) << "write_thread_entry woke up" << dendl; + continue; + } + } + ceph_assert(r == 0); + + if (logger) { + logger->inc(l_filestore_journal_wr); + logger->inc(l_filestore_journal_wr_bytes, bl.length()); + } + +#ifdef HAVE_LIBAIO + if (aio) + do_aio_write(bl); + else + do_write(bl); +#else + do_write(bl); +#endif + complete_write(orig_ops, orig_bytes); + } + + dout(10) << "write_thread_entry finish" << dendl; +} + +#ifdef HAVE_LIBAIO +void FileJournal::do_aio_write(bufferlist& bl) +{ + + if (cct->_conf->journal_write_header_frequency && + (((++journaled_since_start) % + cct->_conf->journal_write_header_frequency) == 0)) { + must_write_header = true; + } + + // nothing to do? + if (bl.length() == 0 && !must_write_header) + return; + + buffer::ptr hbp; + if (must_write_header) { + must_write_header = false; + hbp = prepare_header(); + } + + // entry + off64_t pos = write_pos; + + dout(15) << "do_aio_write writing " << pos << "~" << bl.length() + << (hbp.length() ? " + header":"") + << dendl; + + // split? 
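+  // Wrap example (illustrative numbers): with header.max_size = 1048576,
+  // pos = 1040000 and bl.length() = 20000, split = 8576: the first 8576
+  // bytes are submitted at pos and the remaining 11424 bytes at get_top(),
+  // or at offset 0 when the refreshed header is prepended to that second
+  // fragment.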
+ off64_t split = 0; + if (pos + bl.length() > header.max_size) { + bufferlist first, second; + split = header.max_size - pos; + first.substr_of(bl, 0, split); + second.substr_of(bl, split, bl.length() - split); + ceph_assert(first.length() + second.length() == bl.length()); + dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl; + + if (write_aio_bl(pos, first, 0)) { + derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + ceph_assert(pos == header.max_size); + if (hbp.length()) { + // be sneaky: include the header in the second fragment + bufferlist tmp; + tmp.push_back(hbp); + tmp.claim_append(second); + second.swap(tmp); + pos = 0; // we included the header + } else + pos = get_top(); // no header, start after that + if (write_aio_bl(pos, second, writing_seq)) { + derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + } else { + // header too? + if (hbp.length()) { + bufferlist hbl; + hbl.push_back(hbp); + loff_t pos = 0; + if (write_aio_bl(pos, hbl, 0)) { + derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl; + ceph_abort(); + } + } + + if (write_aio_bl(pos, bl, writing_seq)) { + derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + } + + write_pos = pos; + if (write_pos == header.max_size) + write_pos = get_top(); + ceph_assert(write_pos % header.alignment == 0); +} + +/** + * write a buffer using aio + * + * @param seq seq to trigger when this aio completes. if 0, do not update any state + * on completion. + */ +int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq) +{ + dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl; + + while (bl.length() > 0) { + int max = std::min<int>(bl.get_num_buffers(), IOV_MAX-1); + iovec *iov = new iovec[max]; + int n = 0; + unsigned len = 0; + for (auto p = std::cbegin(bl.buffers()); n < max; ++p, ++n) { + ceph_assert(p != std::cend(bl.buffers())); + iov[n].iov_base = const_cast<void*>(static_cast<const void*>(p->c_str())); + iov[n].iov_len = p->length(); + len += p->length(); + } + + bufferlist tbl; + bl.splice(0, len, &tbl); // move bytes from bl -> tbl + + // lock only aio_queue, current aio, aio_num, aio_bytes, which may be + // modified in check_aio_completion + aio_lock.Lock(); + aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq)); + aio_info& aio = aio_queue.back(); + aio.iov = iov; + + io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos); + + dout(20) << "write_aio_bl .. 
" << aio.off << "~" << aio.len + << " in " << n << dendl; + + aio_num++; + aio_bytes += aio.len; + + // need to save current aio len to update write_pos later because current + // aio could be ereased from aio_queue once it is done + uint64_t cur_len = aio.len; + // unlock aio_lock because following io_submit might take time to return + aio_lock.Unlock(); + + iocb *piocb = &aio.iocb; + + // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds + int attempts = 16; + int delay = 125; + do { + int r = io_submit(aio_ctx, 1, &piocb); + dout(20) << "write_aio_bl io_submit return value: " << r << dendl; + if (r < 0) { + derr << "io_submit to " << aio.off << "~" << cur_len + << " got " << cpp_strerror(r) << dendl; + if (r == -EAGAIN && attempts-- > 0) { + usleep(delay); + delay *= 2; + continue; + } + check_align(pos, tbl); + ceph_abort_msg("io_submit got unexpected error"); + } else { + break; + } + } while (true); + pos += cur_len; + } + aio_lock.Lock(); + write_finish_cond.Signal(); + aio_lock.Unlock(); + return 0; +} +#endif + +void FileJournal::write_finish_thread_entry() +{ +#ifdef HAVE_LIBAIO + dout(10) << __func__ << " enter" << dendl; + while (true) { + { + Mutex::Locker locker(aio_lock); + if (aio_queue.empty()) { + if (aio_stop) + break; + dout(20) << __func__ << " sleeping" << dendl; + write_finish_cond.Wait(aio_lock); + continue; + } + } + + dout(20) << __func__ << " waiting for aio(s)" << dendl; + io_event event[16]; + int r = io_getevents(aio_ctx, 1, 16, event, NULL); + if (r < 0) { + if (r == -EINTR) { + dout(0) << "io_getevents got " << cpp_strerror(r) << dendl; + continue; + } + derr << "io_getevents got " << cpp_strerror(r) << dendl; + if (r == -EIO) { + note_io_error_event(devname.c_str(), fn.c_str(), -EIO, 0, 0, 0); + } + ceph_abort_msg("got unexpected error from io_getevents"); + } + + { + Mutex::Locker locker(aio_lock); + for (int i=0; i<r; i++) { + aio_info *ai = (aio_info *)event[i].obj; + if (event[i].res != ai->len) { + derr << "aio to " << ai->off << "~" << ai->len + << " returned: " << (int)event[i].res << dendl; + ceph_abort_msg("unexpected aio error"); + } + dout(10) << __func__ << " aio " << ai->off + << "~" << ai->len << " done" << dendl; + ai->done = true; + } + check_aio_completion(); + } + } + dout(10) << __func__ << " exit" << dendl; +#endif +} + +#ifdef HAVE_LIBAIO +/** + * check aio_wait for completed aio, and update state appropriately. + */ +void FileJournal::check_aio_completion() +{ + ceph_assert(aio_lock.is_locked()); + dout(20) << "check_aio_completion" << dendl; + + bool completed_something = false, signal = false; + uint64_t new_journaled_seq = 0; + + list<aio_info>::iterator p = aio_queue.begin(); + while (p != aio_queue.end() && p->done) { + dout(20) << "check_aio_completion completed seq " << p->seq << " " + << p->off << "~" << p->len << dendl; + if (p->seq) { + new_journaled_seq = p->seq; + completed_something = true; + } + aio_num--; + aio_bytes -= p->len; + aio_queue.erase(p++); + signal = true; + } + + if (completed_something) { + // kick finisher? + // only if we haven't filled up recently! 
+ Mutex::Locker locker(finisher_lock); + journaled_seq = new_journaled_seq; + if (full_state != FULL_NOTFULL) { + dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq + << ", full_commit_seq|full_restart_seq" << dendl; + } else { + if (plug_journal_completions) { + dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq + << " due to completion plug" << dendl; + } else { + dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl; + queue_completions_thru(journaled_seq); + } + } + } + if (signal) { + // maybe write queue was waiting for aio count to drop? + aio_cond.Signal(); + } +} +#endif + +int FileJournal::prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) { + dout(10) << "prepare_entry " << tls << dendl; + int data_len = cct->_conf->journal_align_min_size - 1; + int data_align = -1; // -1 indicates that we don't care about the alignment + bufferlist bl; + for (vector<ObjectStore::Transaction>::iterator p = tls.begin(); + p != tls.end(); ++p) { + if ((int)(*p).get_data_length() > data_len) { + data_len = (*p).get_data_length(); + data_align = ((*p).get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK; + } + encode(*p, bl); + } + if (tbl->length()) { + bl.claim_append(*tbl); + } + // add it this entry + entry_header_t h; + unsigned head_size = sizeof(entry_header_t); + off64_t base_size = 2*head_size + bl.length(); + memset(&h, 0, sizeof(h)); + if (data_align >= 0) + h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK; + off64_t size = round_up_to(base_size + h.pre_pad, header.alignment); + unsigned post_pad = size - base_size - h.pre_pad; + h.len = bl.length(); + h.post_pad = post_pad; + h.crc32c = bl.crc32c(0); + dout(10) << " len " << bl.length() << " -> " << size + << " (head " << head_size << " pre_pad " << h.pre_pad + << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")" + << " (bl alignment " << data_align << ")" + << dendl; + bufferlist ebl; + // header + ebl.append((const char*)&h, sizeof(h)); + if (h.pre_pad) { + ebl.push_back(buffer::create_static(h.pre_pad, zero_buf)); + } + // payload + ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy + if (h.post_pad) { + ebl.push_back(buffer::create_static(h.post_pad, zero_buf)); + } + // footer + ebl.append((const char*)&h, sizeof(h)); + if (directio) + ebl.rebuild_aligned(CEPH_DIRECTIO_ALIGNMENT); + tbl->claim(ebl); + return h.len; +} + +void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, + Context *oncommit, TrackedOpRef osd_op) +{ + // dump on queue + dout(5) << "submit_entry seq " << seq + << " len " << e.length() + << " (" << oncommit << ")" << dendl; + ceph_assert(e.length() > 0); + ceph_assert(e.length() < header.max_size); + + if (logger) { + logger->inc(l_filestore_journal_queue_bytes, orig_len); + logger->inc(l_filestore_journal_queue_ops, 1); + } + + throttle.register_throttle_seq(seq, e.length()); + if (logger) { + logger->inc(l_filestore_journal_ops, 1); + logger->inc(l_filestore_journal_bytes, e.length()); + } + + if (osd_op) { + osd_op->mark_event("commit_queued_for_journal_write"); + if (osd_op->store_trace) { + osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace); + osd_op->journal_trace.event("submit_entry"); + osd_op->journal_trace.keyval("seq", seq); + } + } + { + Mutex::Locker l1(writeq_lock); +#ifdef HAVE_LIBAIO + Mutex::Locker l2(aio_lock); +#endif + 
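+    // The write queue, aio accounting and completion list are all updated
+    // inside one critical section so the write thread and the aio throttle
+    // see this entry either everywhere or nowhere.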
Mutex::Locker l3(completions_lock); + +#ifdef HAVE_LIBAIO + aio_write_queue_ops++; + aio_write_queue_bytes += e.length(); + aio_cond.Signal(); +#endif + + completions.push_back( + completion_item( + seq, oncommit, ceph_clock_now(), osd_op)); + if (writeq.empty()) + writeq_cond.Signal(); + writeq.push_back(write_item(seq, e, orig_len, osd_op)); + if (osd_op) + osd_op->journal_trace.keyval("queue depth", writeq.size()); + } +} + +bool FileJournal::writeq_empty() +{ + Mutex::Locker locker(writeq_lock); + return writeq.empty(); +} + +FileJournal::write_item &FileJournal::peek_write() +{ + ceph_assert(write_lock.is_locked()); + Mutex::Locker locker(writeq_lock); + return writeq.front(); +} + +void FileJournal::pop_write() +{ + ceph_assert(write_lock.is_locked()); + Mutex::Locker locker(writeq_lock); + if (logger) { + logger->dec(l_filestore_journal_queue_bytes, writeq.front().orig_len); + logger->dec(l_filestore_journal_queue_ops, 1); + } + writeq.pop_front(); +} + +void FileJournal::batch_pop_write(list<write_item> &items) +{ + ceph_assert(write_lock.is_locked()); + { + Mutex::Locker locker(writeq_lock); + writeq.swap(items); + } + for (auto &&i : items) { + if (logger) { + logger->dec(l_filestore_journal_queue_bytes, i.orig_len); + logger->dec(l_filestore_journal_queue_ops, 1); + } + } +} + +void FileJournal::batch_unpop_write(list<write_item> &items) +{ + ceph_assert(write_lock.is_locked()); + for (auto &&i : items) { + if (logger) { + logger->inc(l_filestore_journal_queue_bytes, i.orig_len); + logger->inc(l_filestore_journal_queue_ops, 1); + } + } + Mutex::Locker locker(writeq_lock); + writeq.splice(writeq.begin(), items); +} + +void FileJournal::commit_start(uint64_t seq) +{ + dout(10) << "commit_start" << dendl; + + // was full? + switch (full_state) { + case FULL_NOTFULL: + break; // all good + + case FULL_FULL: + if (seq >= journaled_seq) { + dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq " + << seq << " > journaled_seq " << journaled_seq + << ", moving to FULL_WAIT." + << dendl; + full_state = FULL_WAIT; + } else { + dout(1) << "FULL_FULL commit_start on seq " + << seq << " < journaled_seq " << journaled_seq + << ", remaining in FULL_FULL" + << dendl; + } + break; + + case FULL_WAIT: + dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." 
<< dendl; + full_state = FULL_NOTFULL; + plug_journal_completions = true; + break; + } +} + +/* + *send discard command to joural block deivce + */ +void FileJournal::do_discard(int64_t offset, int64_t end) +{ + dout(10) << __func__ << " trim(" << offset << ", " << end << dendl; + + offset = round_up_to(offset, block_size); + if (offset >= end) + return; + end = round_up_to(end - block_size, block_size); + ceph_assert(end >= offset); + if (offset < end) { + BlkDev blkdev(fd); + if (blkdev.discard(offset, end - offset) < 0) { + dout(1) << __func__ << "ioctl(BLKDISCARD) error:" << cpp_strerror(errno) << dendl; + } + } +} + +void FileJournal::committed_thru(uint64_t seq) +{ + Mutex::Locker locker(write_lock); + + auto released = throttle.flush(seq); + if (logger) { + logger->dec(l_filestore_journal_ops, released.first); + logger->dec(l_filestore_journal_bytes, released.second); + } + + if (seq < last_committed_seq) { + dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl; + ceph_assert(seq >= last_committed_seq); + return; + } + if (seq == last_committed_seq) { + dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl; + return; + } + + dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl; + last_committed_seq = seq; + + // completions! + { + Mutex::Locker locker(finisher_lock); + queue_completions_thru(seq); + if (plug_journal_completions && seq >= header.start_seq) { + dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl; + plug_journal_completions = false; + queue_completions_thru(journaled_seq); + } + } + + // adjust start pointer + while (!journalq.empty() && journalq.front().first <= seq) { + journalq.pop_front(); + } + + int64_t old_start = header.start; + if (!journalq.empty()) { + header.start = journalq.front().second; + header.start_seq = journalq.front().first; + } else { + header.start = write_pos; + header.start_seq = seq + 1; + } + + if (discard) { + dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl; + if (old_start < header.start) + do_discard(old_start, header.start - 1); + else { + do_discard(old_start, header.max_size - 1); + do_discard(get_top(), header.start - 1); + } + } + + must_write_header = true; + print_header(header); + + // committed but unjournaled items + while (!writeq_empty() && peek_write().seq <= seq) { + dout(15) << " dropping committed but unwritten seq " << peek_write().seq + << " len " << peek_write().bl.length() + << dendl; + complete_write(1, peek_write().orig_len); + pop_write(); + } + + commit_cond.Signal(); + + dout(10) << "committed_thru done" << dendl; +} + + +void FileJournal::complete_write(uint64_t ops, uint64_t bytes) +{ + dout(5) << __func__ << " finished " << ops << " ops and " + << bytes << " bytes" << dendl; +} + +int FileJournal::make_writeable() +{ + dout(10) << __func__ << dendl; + int r = set_throttle_params(); + if (r < 0) + return r; + + r = _open(true); + if (r < 0) + return r; + + if (read_pos > 0) + write_pos = read_pos; + else + write_pos = get_top(); + read_pos = 0; + + must_write_header = true; + + start_writer(); + return 0; +} + +int FileJournal::set_throttle_params() +{ + stringstream ss; + bool valid = throttle.set_params( + cct->_conf->journal_throttle_low_threshhold, + cct->_conf->journal_throttle_high_threshhold, + cct->_conf->filestore_expected_throughput_bytes, + 
cct->_conf->journal_throttle_high_multiple, + cct->_conf->journal_throttle_max_multiple, + header.max_size - get_top(), + &ss); + + if (!valid) { + derr << "tried to set invalid params: " + << ss.str() + << dendl; + } + return valid ? 0 : -EINVAL; +} + +const char** FileJournal::get_tracked_conf_keys() const +{ + static const char *KEYS[] = { + "journal_throttle_low_threshhold", + "journal_throttle_high_threshhold", + "journal_throttle_high_multiple", + "journal_throttle_max_multiple", + "filestore_expected_throughput_bytes", + NULL}; + return KEYS; +} + +void FileJournal::wrap_read_bl( + off64_t pos, + int64_t olen, + bufferlist* bl, + off64_t *out_pos + ) const +{ + while (olen > 0) { + while (pos >= header.max_size) + pos = pos + get_top() - header.max_size; + + int64_t len; + if (pos + olen > header.max_size) + len = header.max_size - pos; // partial + else + len = olen; // rest + + int64_t actual = ::lseek64(fd, pos, SEEK_SET); + ceph_assert(actual == pos); + + bufferptr bp = buffer::create(len); + int r = safe_read_exact(fd, bp.c_str(), len); + if (r) { + derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned " + << cpp_strerror(r) << dendl; + ceph_abort(); + } + bl->push_back(std::move(bp)); + pos += len; + olen -= len; + } + if (pos >= header.max_size) + pos = pos + get_top() - header.max_size; + if (out_pos) + *out_pos = pos; +} + +bool FileJournal::read_entry( + bufferlist &bl, + uint64_t &next_seq, + bool *corrupt) +{ + if (corrupt) + *corrupt = false; + uint64_t seq = next_seq; + + if (!read_pos) { + dout(2) << "read_entry -- not readable" << dendl; + return false; + } + + off64_t pos = read_pos; + off64_t next_pos = pos; + stringstream ss; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + &ss); + if (result == SUCCESS) { + journalq.push_back( pair<uint64_t,off64_t>(seq, pos)); + uint64_t amount_to_take = + next_pos > pos ? 
+ next_pos - pos : + (header.max_size - pos) + (next_pos - get_top()); + throttle.take(amount_to_take); + throttle.register_throttle_seq(next_seq, amount_to_take); + if (logger) { + logger->inc(l_filestore_journal_ops, 1); + logger->inc(l_filestore_journal_bytes, amount_to_take); + } + if (next_seq > seq) { + return false; + } else { + read_pos = next_pos; + next_seq = seq; + if (seq > journaled_seq) + journaled_seq = seq; + return true; + } + } else { + derr << "do_read_entry(" << pos << "): " << ss.str() << dendl; + } + + if (seq && seq < header.committed_up_to) { + derr << "Unable to read past sequence " << seq + << " but header indicates the journal has committed up through " + << header.committed_up_to << ", journal is corrupt" << dendl; + if (cct->_conf->journal_ignore_corruption) { + if (corrupt) + *corrupt = true; + return false; + } else { + ceph_abort(); + } + } + + dout(2) << "No further valid entries found, journal is most likely valid" + << dendl; + return false; +} + +FileJournal::read_entry_result FileJournal::do_read_entry( + off64_t init_pos, + off64_t *next_pos, + bufferlist *bl, + uint64_t *seq, + ostream *ss, + entry_header_t *_h) const +{ + off64_t cur_pos = init_pos; + bufferlist _bl; + if (!bl) + bl = &_bl; + + // header + entry_header_t *h; + bufferlist hbl; + off64_t _next_pos; + wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos); + h = reinterpret_cast<entry_header_t *>(hbl.c_str()); + + if (!h->check_magic(cur_pos, header.get_fsid64())) { + dout(25) << "read_entry " << init_pos + << " : bad header magic, end of journal" << dendl; + if (ss) + *ss << "bad header magic"; + if (next_pos) + *next_pos = init_pos + (4<<10); // check 4k ahead + return MAYBE_CORRUPT; + } + cur_pos = _next_pos; + + // pad + body + pad + if (h->pre_pad) + cur_pos += h->pre_pad; + + bl->clear(); + wrap_read_bl(cur_pos, h->len, bl, &cur_pos); + + if (h->post_pad) + cur_pos += h->post_pad; + + // footer + entry_header_t *f; + bufferlist fbl; + wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos); + f = reinterpret_cast<entry_header_t *>(fbl.c_str()); + if (memcmp(f, h, sizeof(*f))) { + if (ss) + *ss << "bad footer magic, partial entry"; + if (next_pos) + *next_pos = cur_pos; + return MAYBE_CORRUPT; + } + + if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal) + h->crc32c != 0) { // newer entry in old journal + uint32_t actual_crc = bl->crc32c(0); + if (actual_crc != h->crc32c) { + if (ss) + *ss << "header crc (" << h->crc32c + << ") doesn't match body crc (" << actual_crc << ")"; + if (next_pos) + *next_pos = cur_pos; + return MAYBE_CORRUPT; + } + } + + // yay! + dout(2) << "read_entry " << init_pos << " : seq " << h->seq + << " " << h->len << " bytes" + << dendl; + + // ok! 
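+  // The entry has now fully validated.  On-disk layout, as produced by
+  // prepare_entry():
+  //   [entry_header_t][pre_pad][payload of h->len bytes][post_pad][footer]
+  // where the footer is a byte-for-byte copy of the header, the total is
+  // rounded up to header.alignment, and crc32c covers the payload only.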
+ if (seq) + *seq = h->seq; + + + if (next_pos) + *next_pos = cur_pos; + + if (_h) + *_h = *h; + + ceph_assert(cur_pos % header.alignment == 0); + return SUCCESS; +} + +void FileJournal::reserve_throttle_and_backoff(uint64_t count) +{ + throttle.get(count); +} + +void FileJournal::get_header( + uint64_t wanted_seq, + off64_t *_pos, + entry_header_t *h) +{ + off64_t pos = header.start; + off64_t next_pos = pos; + bufferlist bl; + uint64_t seq = 0; + dout(2) << __func__ << dendl; + while (1) { + bl.clear(); + pos = next_pos; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + 0, + h); + if (result == FAILURE || result == MAYBE_CORRUPT) + ceph_abort(); + if (seq == wanted_seq) { + if (_pos) + *_pos = pos; + return; + } + } + ceph_abort(); // not reachable +} + +void FileJournal::corrupt( + int wfd, + off64_t corrupt_at) +{ + dout(2) << __func__ << dendl; + if (corrupt_at >= header.max_size) + corrupt_at = corrupt_at + get_top() - header.max_size; + + int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET); + ceph_assert(actual == corrupt_at); + + char buf[10]; + int r = safe_read_exact(fd, buf, 1); + ceph_assert(r == 0); + + actual = ::lseek64(wfd, corrupt_at, SEEK_SET); + ceph_assert(actual == corrupt_at); + + buf[0]++; + r = safe_write(wfd, buf, 1); + ceph_assert(r == 0); +} + +void FileJournal::corrupt_payload( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + sizeof(entry_header_t) + h.pre_pad; + corrupt(wfd, corrupt_at); +} + + +void FileJournal::corrupt_footer_magic( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + sizeof(entry_header_t) + h.pre_pad + + h.len + h.post_pad + + (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h)); + corrupt(wfd, corrupt_at); +} + + +void FileJournal::corrupt_header_magic( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + + (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h)); + corrupt(wfd, corrupt_at); +} + +off64_t FileJournal::get_journal_size_estimate() +{ + off64_t size, start = header.start; + if (write_pos < start) { + size = (max_size - start) + write_pos; + } else { + size = write_pos - start; + } + dout(20) << __func__ << " journal size=" << size << dendl; + return size; +} + +void FileJournal::get_devices(set<string> *ls) +{ + string dev_node; + BlkDev blkdev(fd); + if (int rc = blkdev.wholedisk(&dev_node); rc) { + return; + } + get_raw_devices(dev_node, ls); +} + +void FileJournal::collect_metadata(map<string,string> *pm) +{ + BlkDev blkdev(fd); + char partition_path[PATH_MAX]; + char dev_node[PATH_MAX]; + if (blkdev.partition(partition_path, PATH_MAX)) { + (*pm)["backend_filestore_journal_partition_path"] = "unknown"; + } else { + (*pm)["backend_filestore_journal_partition_path"] = string(partition_path); + } + if (blkdev.wholedisk(dev_node, PATH_MAX)) { + (*pm)["backend_filestore_journal_dev_node"] = "unknown"; + } else { + (*pm)["backend_filestore_journal_dev_node"] = string(dev_node); + devname = dev_node; + } +} diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h new file mode 100644 index 00000000..2313b4b8 --- /dev/null +++ b/src/os/filestore/FileJournal.h @@ -0,0 +1,556 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILEJOURNAL_H +#define CEPH_FILEJOURNAL_H + +#include <stdlib.h> +#include <deque> +using std::deque; + +#include "Journal.h" +#include "common/config_fwd.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Thread.h" +#include "common/Throttle.h" +#include "JournalThrottle.h" +#include "common/zipkin_trace.h" + +#ifdef HAVE_LIBAIO +# include <libaio.h> +#endif + +// re-include our assert to clobber the system one; fix dout: +#include "include/ceph_assert.h" + +/** + * Implements journaling on top of block device or file. + * + * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock) + */ +class FileJournal : + public Journal, + public md_config_obs_t { +public: + /// Protected by finisher_lock + struct completion_item { + uint64_t seq; + Context *finish; + utime_t start; + TrackedOpRef tracked_op; + completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref) + : seq(o), finish(c), start(s), tracked_op(opref) {} + completion_item() : seq(0), finish(0), start(0) {} + }; + struct write_item { + uint64_t seq; + bufferlist bl; + uint32_t orig_len; + TrackedOpRef tracked_op; + ZTracer::Trace trace; + write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) : + seq(s), orig_len(ol), tracked_op(opref) { + bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy + } + write_item() : seq(0), orig_len(0) {} + }; + + Mutex finisher_lock; + Cond finisher_cond; + uint64_t journaled_seq; + bool plug_journal_completions; + + Mutex writeq_lock; + Cond writeq_cond; + list<write_item> writeq; + bool writeq_empty(); + write_item &peek_write(); + void pop_write(); + void batch_pop_write(list<write_item> &items); + void batch_unpop_write(list<write_item> &items); + + Mutex completions_lock; + list<completion_item> completions; + bool completions_empty() { + Mutex::Locker l(completions_lock); + return completions.empty(); + } + void batch_pop_completions(list<completion_item> &items) { + Mutex::Locker l(completions_lock); + completions.swap(items); + } + void batch_unpop_completions(list<completion_item> &items) { + Mutex::Locker l(completions_lock); + completions.splice(completions.begin(), items); + } + completion_item completion_peek_front() { + Mutex::Locker l(completions_lock); + ceph_assert(!completions.empty()); + return completions.front(); + } + void completion_pop_front() { + Mutex::Locker l(completions_lock); + ceph_assert(!completions.empty()); + completions.pop_front(); + } + + int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) override; + + void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len, + Context *oncommit, + TrackedOpRef osd_op = TrackedOpRef()) override; + /// End protected by finisher_lock + + /* + * journal header + */ + struct header_t { + enum { + FLAG_CRC = (1<<0), + // NOTE: remove kludgey weirdness in read_header() next time a flag is added. 
+ }; + + uint64_t flags; + uuid_d fsid; + __u32 block_size; + __u32 alignment; + int64_t max_size; // max size of journal ring buffer + int64_t start; // offset of first entry + uint64_t committed_up_to; // committed up to + + /** + * start_seq + * + * entry at header.start has sequence >= start_seq + * + * Generally, the entry at header.start will have sequence + * start_seq if it exists. The only exception is immediately + * after journal creation since the first sequence number is + * not known. + * + * If the first read on open fails, we can assume corruption + * if start_seq > committed_up_to because the entry would have + * a sequence >= start_seq and therefore > committed_up_to. + */ + uint64_t start_seq; + + header_t() : + flags(0), block_size(0), alignment(0), max_size(0), start(0), + committed_up_to(0), start_seq(0) {} + + void clear() { + start = block_size; + } + + uint64_t get_fsid64() const { + return *(uint64_t*)fsid.bytes(); + } + + void encode(bufferlist& bl) const { + using ceph::encode; + __u32 v = 4; + encode(v, bl); + bufferlist em; + { + encode(flags, em); + encode(fsid, em); + encode(block_size, em); + encode(alignment, em); + encode(max_size, em); + encode(start, em); + encode(committed_up_to, em); + encode(start_seq, em); + } + encode(em, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + __u32 v; + decode(v, bl); + if (v < 2) { // normally 0, but conceivably 1 + // decode old header_t struct (pre v0.40). + bl.advance(4u); // skip __u32 flags (it was unused by any old code) + flags = 0; + uint64_t tfsid; + decode(tfsid, bl); + *(uint64_t*)&fsid.bytes()[0] = tfsid; + *(uint64_t*)&fsid.bytes()[8] = tfsid; + decode(block_size, bl); + decode(alignment, bl); + decode(max_size, bl); + decode(start, bl); + committed_up_to = 0; + start_seq = 0; + return; + } + bufferlist em; + decode(em, bl); + auto t = em.cbegin(); + decode(flags, t); + decode(fsid, t); + decode(block_size, t); + decode(alignment, t); + decode(max_size, t); + decode(start, t); + + if (v > 2) + decode(committed_up_to, t); + else + committed_up_to = 0; + + if (v > 3) + decode(start_seq, t); + else + start_seq = 0; + } + } header; + + struct entry_header_t { + uint64_t seq; // fs op seq # + uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer. 
+ uint32_t len; + uint32_t pre_pad, post_pad; + uint64_t magic1; + uint64_t magic2; + + static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) { + return (fsid ^ seq ^ len); + } + bool check_magic(off64_t pos, uint64_t fsid) { + return + magic1 == (uint64_t)pos && + magic2 == (fsid ^ seq ^ len); + } + } __attribute__((__packed__, aligned(4))); + + bool journalq_empty() { return journalq.empty(); } + +private: + string fn; + + char *zero_buf; + off64_t max_size; + size_t block_size; + bool directio, aio, force_aio; + bool must_write_header; + off64_t write_pos; // byte where the next entry to be written will go + off64_t read_pos; // + bool discard; //for block journal whether support discard + +#ifdef HAVE_LIBAIO + /// state associated with an in-flight aio request + /// Protected by aio_lock + struct aio_info { + struct iocb iocb {}; + bufferlist bl; + struct iovec *iov; + bool done; + uint64_t off, len; ///< these are for debug only + uint64_t seq; ///< seq number to complete on aio completion, if non-zero + + aio_info(bufferlist& b, uint64_t o, uint64_t s) + : iov(NULL), done(false), off(o), len(b.length()), seq(s) { + bl.claim(b); + } + ~aio_info() { + delete[] iov; + } + }; + Mutex aio_lock; + Cond aio_cond; + Cond write_finish_cond; + io_context_t aio_ctx; + list<aio_info> aio_queue; + int aio_num, aio_bytes; + uint64_t aio_write_queue_ops; + uint64_t aio_write_queue_bytes; + /// End protected by aio_lock +#endif + + uint64_t last_committed_seq; + uint64_t journaled_since_start; + + string devname; + + /* + * full states cycle at the beginnging of each commit epoch, when commit_start() + * is called. + * FULL - we just filled up during this epoch. + * WAIT - we filled up last epoch; now we have to wait until everything during + * that epoch commits to the fs before we can start writing over it. + * NOTFULL - all good, journal away. + */ + enum { + FULL_NOTFULL = 0, + FULL_FULL = 1, + FULL_WAIT = 2, + } full_state; + + int fd; + + // in journal + deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later. 
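// ---- editorial aside (illustrative sketch; not part of this change) ----
// The FULL_NOTFULL/FULL_FULL/FULL_WAIT comment above describes a small state
// machine stepped once per commit epoch: a journal that filled up must sit out
// one full commit cycle before its space can be overwritten. The real
// transitions live in commit_start()/committed_thru(), which are not in this
// hunk, so treat advance() below as an illustrative reading of that comment
// rather than the actual implementation.
#include <iostream>

enum class JournalFull { NOTFULL, FULL, WAIT };

// Stepped at the start of a commit epoch.
static JournalFull advance(JournalFull s) {
  switch (s) {
  case JournalFull::FULL:    return JournalFull::WAIT;     // filled up last epoch
  case JournalFull::WAIT:    return JournalFull::NOTFULL;  // old entries now committed
  case JournalFull::NOTFULL: return JournalFull::NOTFULL;  // nothing to do
  }
  return s;
}

int main() {
  JournalFull s = JournalFull::FULL;
  for (int epoch = 0; epoch < 3; ++epoch) {
    s = advance(s);
    std::cout << "epoch " << epoch << " -> state " << static_cast<int>(s) << "\n";
  }
}
// ---- end editorial aside ----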
+ uint64_t writing_seq; + + + // throttle + int set_throttle_params(); + const char** get_tracked_conf_keys() const override; + void handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) override { + for (const char **i = get_tracked_conf_keys(); + *i; + ++i) { + if (changed.count(string(*i))) { + set_throttle_params(); + return; + } + } + } + + void complete_write(uint64_t ops, uint64_t bytes); + JournalThrottle throttle; + + // write thread + Mutex write_lock; + bool write_stop; + bool aio_stop; + + Cond commit_cond; + + int _open(bool wr, bool create=false); + int _open_block_device(); + void _close(int fd) const; + int _open_file(int64_t oldsize, blksize_t blksize, bool create); + int _dump(ostream& out, bool simple); + void print_header(const header_t &hdr) const; + int read_header(header_t *hdr) const; + bufferptr prepare_header(); + void start_writer(); + void stop_writer(); + void write_thread_entry(); + + void queue_completions_thru(uint64_t seq); + + int check_for_full(uint64_t seq, off64_t pos, off64_t size); + int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee); + int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, + uint64_t& orig_ops, uint64_t& orig_bytes); + void do_write(bufferlist& bl); + + void write_finish_thread_entry(); + void check_aio_completion(); + void do_aio_write(bufferlist& bl); + int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq); + + + void check_align(off64_t pos, bufferlist& bl); + int write_bl(off64_t& pos, bufferlist& bl); + + /// read len from journal starting at in_pos and wrapping up to len + void wrap_read_bl( + off64_t in_pos, ///< [in] start position + int64_t len, ///< [in] length to read + bufferlist* bl, ///< [out] result + off64_t *out_pos ///< [out] next position to read, will be wrapped + ) const; + + void do_discard(int64_t offset, int64_t end); + + class Writer : public Thread { + FileJournal *journal; + public: + explicit Writer(FileJournal *fj) : journal(fj) {} + void *entry() override { + journal->write_thread_entry(); + return 0; + } + } write_thread; + + class WriteFinisher : public Thread { + FileJournal *journal; + public: + explicit WriteFinisher(FileJournal *fj) : journal(fj) {} + void *entry() override { + journal->write_finish_thread_entry(); + return 0; + } + } write_finish_thread; + + off64_t get_top() const { + return round_up_to(sizeof(header), block_size); + } + + ZTracer::Endpoint trace_endpoint; + + public: + FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond, + const char *f, bool dio=false, bool ai=true, bool faio=false) : + Journal(cct, fsid, fin, sync_cond), + finisher_lock("FileJournal::finisher_lock", false, true, false), + journaled_seq(0), + plug_journal_completions(false), + writeq_lock("FileJournal::writeq_lock", false, true, false), + completions_lock( + "FileJournal::completions_lock", false, true, false), + fn(f), + zero_buf(NULL), + max_size(0), block_size(0), + directio(dio), aio(ai), force_aio(faio), + must_write_header(false), + write_pos(0), read_pos(0), + discard(false), +#ifdef HAVE_LIBAIO + aio_lock("FileJournal::aio_lock"), + aio_ctx(0), + aio_num(0), aio_bytes(0), + aio_write_queue_ops(0), + aio_write_queue_bytes(0), +#endif + last_committed_seq(0), + journaled_since_start(0), + full_state(FULL_NOTFULL), + fd(-1), + writing_seq(0), + throttle(cct->_conf->filestore_caller_concurrency), + write_lock("FileJournal::write_lock", false, true, false), + write_stop(true), + 
aio_stop(true), + write_thread(this), + write_finish_thread(this), + trace_endpoint("0.0.0.0", 0, "FileJournal") { + + if (aio && !directio) { + lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl; + aio = false; + } +#ifndef HAVE_LIBAIO + if (aio && ::getenv("CEPH_DEV") == NULL) { + lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl; + aio = false; + } +#endif + + cct->_conf.add_observer(this); + } + ~FileJournal() override { + ceph_assert(fd == -1); + delete[] zero_buf; + cct->_conf.remove_observer(this); + } + + int check() override; + int create() override; + int open(uint64_t fs_op_seq) override; + void close() override; + int peek_fsid(uuid_d& fsid); + + int dump(ostream& out) override; + int simple_dump(ostream& out); + int _fdump(Formatter &f, bool simple); + + void flush() override; + + void get_devices(set<string> *ls) override; + void collect_metadata(map<string,string> *pm) override; + + void reserve_throttle_and_backoff(uint64_t count) override; + + bool is_writeable() override { + return read_pos == 0; + } + int make_writeable() override; + + // writes + void commit_start(uint64_t seq) override; + void committed_thru(uint64_t seq) override; + bool should_commit_now() override { + return full_state != FULL_NOTFULL && !write_stop; + } + + void write_header_sync(); + + void set_wait_on_full(bool b) { wait_on_full = b; } + + off64_t get_journal_size_estimate(); + + // reads + + /// Result code for read_entry + enum read_entry_result { + SUCCESS, + FAILURE, + MAYBE_CORRUPT + }; + + /** + * read_entry + * + * Reads next entry starting at pos. If the entry appears + * clean, *bl will contain the payload, *seq will contain + * the sequence number, and *out_pos will reflect the next + * read position. If the entry is invalid *ss will contain + * debug text, while *seq, *out_pos, and *bl will be unchanged. + * + * If the entry suggests a corrupt log, *ss will contain debug + * text, *out_pos will contain the next index to check. If + * we find an entry in this way that returns SUCCESS, the journal + * is most likely corrupt. + */ + read_entry_result do_read_entry( + off64_t pos, ///< [in] position to read + off64_t *next_pos, ///< [out] next position to read + bufferlist* bl, ///< [out] payload for successful read + uint64_t *seq, ///< [out] seq of successful read + ostream *ss, ///< [out] error output + entry_header_t *h = 0 ///< [out] header + ) const; ///< @return result code + + bool read_entry( + bufferlist &bl, + uint64_t &last_seq, + bool *corrupt + ); + + bool read_entry( + bufferlist &bl, + uint64_t &last_seq) override { + return read_entry(bl, last_seq, 0); + } + + // Debug/Testing + void get_header( + uint64_t wanted_seq, + off64_t *_pos, + entry_header_t *h); + void corrupt( + int wfd, + off64_t corrupt_at); + void corrupt_payload( + int wfd, + uint64_t seq); + void corrupt_footer_magic( + int wfd, + uint64_t seq); + void corrupt_header_magic( + int wfd, + uint64_t seq); +}; + +WRITE_CLASS_ENCODER(FileJournal::header_t) + +#endif diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc new file mode 100644 index 00000000..d387947e --- /dev/null +++ b/src/os/filestore/FileStore.cc @@ -0,0 +1,6425 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (c) 2015 Hewlett-Packard Development Company, L.P. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "include/compat.h" +#include "include/int_types.h" +#include "boost/tuple/tuple.hpp" + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/file.h> +#include <errno.h> +#include <dirent.h> +#include <sys/ioctl.h> + +#if defined(__linux__) +#include <linux/fs.h> +#include <linux/falloc.h> +#endif + +#include <iostream> +#include <map> + +#include "include/linux_fiemap.h" + +#include "common/xattr.h" +#include "chain_xattr.h" + +#if defined(__APPLE__) || defined(__FreeBSD__) +#include <sys/param.h> +#include <sys/mount.h> +#endif + + +#include <fstream> +#include <sstream> + +#include "FileStore.h" +#include "GenericFileStoreBackend.h" +#include "BtrfsFileStoreBackend.h" +#include "XfsFileStoreBackend.h" +#include "ZFSFileStoreBackend.h" +#include "common/BackTrace.h" +#include "include/types.h" +#include "FileJournal.h" + +#include "osd/osd_types.h" +#include "include/color.h" +#include "include/buffer.h" + +#include "common/Timer.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/run_cmd.h" +#include "common/safe_io.h" +#include "common/perf_counters.h" +#include "common/sync_filesystem.h" +#include "common/fd.h" +#include "HashIndex.h" +#include "DBObjectMap.h" +#include "kv/KeyValueDB.h" + +#include "common/ceph_crypto.h" +using ceph::crypto::SHA1; + +#include "include/ceph_assert.h" + +#include "common/config.h" +#include "common/blkdev.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/objectstore.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "filestore(" << basedir << ") " + +#define COMMIT_SNAP_ITEM "snap_%llu" +#define CLUSTER_SNAP_ITEM "clustersnap_%s" + +#define REPLAY_GUARD_XATTR "user.cephos.seq" +#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq" + +// XATTR_SPILL_OUT_NAME as a xattr is used to maintain that indicates whether +// xattrs spill over into DBObjectMap, if XATTR_SPILL_OUT_NAME exists in file +// xattrs and the value is "no", it indicates no xattrs in DBObjectMap +#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out" +#define XATTR_NO_SPILL_OUT "0" +#define XATTR_SPILL_OUT "1" +#define __FUNC__ __func__ << "(" << __LINE__ << ")" + +//Initial features in new superblock. +static CompatSet get_fs_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this FileStore supports. +static CompatSet get_fs_supported_compat_set() { + CompatSet compat = get_fs_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + return compat; +} + +int FileStore::validate_hobject_key(const hobject_t &obj) const +{ + unsigned len = LFNIndex::get_max_escaped_name_len(obj); + return len > m_filestore_max_xattr_value_size ? 
-ENAMETOOLONG : 0; +} + +int FileStore::get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid) +{ + // make sure we don't try to use aio or direct_io (and get annoying + // error messages from failing to do so); performance implications + // should be irrelevant for this use + FileJournal j(cct, *fsid, 0, 0, path.c_str(), false, false); + return j.peek_fsid(*fsid); +} + +void FileStore::FSPerfTracker::update_from_perfcounters( + PerfCounters &logger) +{ + os_commit_latency_ns.consume_next( + logger.get_tavg_ns( + l_filestore_journal_latency)); + os_apply_latency_ns.consume_next( + logger.get_tavg_ns( + l_filestore_apply_latency)); +} + + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s) +{ + return out << "osr(" << s.cid << ")"; +} + +int FileStore::get_cdir(const coll_t& cid, char *s, int len) +{ + const string &cid_str(cid.to_str()); + return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str()); +} + +void FileStore::handle_eio() +{ + // don't try to map this back to an offset; too hard since there is + // a file system in between. we also don't really know whether this + // was a read or a write, since we have so many layers beneath us. + // don't even try. + note_io_error_event(devname.c_str(), basedir.c_str(), -EIO, 0, 0, 0); + ceph_abort_msg("unexpected eio error"); +} + +int FileStore::get_index(const coll_t& cid, Index *index) +{ + int r = index_manager.get_index(cid, basedir, index); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::init_index(const coll_t& cid) +{ + char path[PATH_MAX]; + get_cdir(cid, path, sizeof(path)); + int r = index_manager.init_index(cid, path, target_version); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path) +{ + IndexedPath path2; + if (!path) + path = &path2; + int r, exist; + ceph_assert(index.index); + r = (index.index)->lookup(oid, path, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (!exist) + return -ENOENT; + return 0; +} + +int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length) +{ + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) + return r; + r = ::ftruncate(**fd, length); + if (r < 0) + r = -errno; + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_truncate(**fd, length); + ceph_assert(rc >= 0); + } + lfn_close(fd); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf) +{ + IndexedPath path; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = lfn_find(oid, index, &path); + if (r < 0) + return r; + r = ::stat(path->path(), buf); + if (r < 0) + r = -errno; + return r; +} + +int FileStore::lfn_open(const coll_t& cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index) +{ + ceph_assert(outfd); + int r = 0; + bool need_lock = true; + int flags = O_RDWR; + + if (create) + flags |= O_CREAT; + if (cct->_conf->filestore_odsync_write) { + flags |= O_DSYNC; + } + + Index index2; + if (!index) { + index = &index2; + } + if (!((*index).index)) { + r = get_index(cid, index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + return r; + } + } else { + need_lock = false; + } + + 
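// ---- editorial aside (illustrative sketch; not part of this change) ----
// lfn_truncate(), lfn_stat() and the other helpers earlier in this file follow
// one error convention: failing syscalls are reported as negative errno, and a
// resulting -EIO is optionally escalated (handle_eio() records the event and
// aborts). A self-contained sketch of that pattern; the demo_* names and the
// path used in main() are assumptions for illustration.
#include <cerrno>
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>

static void demo_handle_eio() {
  // stand-in for FileStore::handle_eio(), which notes the error and aborts
  std::fprintf(stderr, "unrecoverable EIO\n");
}

static int demo_truncate(const char* path, off_t length, bool fail_eio) {
  int fd = ::open(path, O_RDWR | O_CLOEXEC);
  if (fd < 0)
    return -errno;                 // report failure as negative errno
  int r = ::ftruncate(fd, length);
  if (r < 0)
    r = -errno;
  ::close(fd);
  if (r == -EIO && fail_eio)
    demo_handle_eio();             // mirrors: if (r == -EIO && m_filestore_fail_eio) handle_eio();
  return r;
}

int main() {
  int r = demo_truncate("/tmp/demo_truncate_target", 0, true);
  std::printf("result: %d\n", r);  // 0 on success, e.g. -ENOENT if the file is missing
}
// ---- end editorial aside ----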
int fd, exist; + ceph_assert((*index).index); + if (need_lock) { + ((*index).index)->access_lock.get_write(); + } + if (!replaying) { + *outfd = fdcache.lookup(oid); + if (*outfd) { + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + return 0; + } + } + + + IndexedPath path2; + IndexedPath *path = &path2; + + r = (*index)->lookup(oid, path, &exist); + if (r < 0) { + derr << "could not find " << oid << " in index: " + << cpp_strerror(-r) << dendl; + goto fail; + } + + r = ::open((*path)->path(), flags|O_CLOEXEC, 0644); + if (r < 0) { + r = -errno; + dout(10) << "error opening file " << (*path)->path() << " with flags=" + << flags << ": " << cpp_strerror(-r) << dendl; + goto fail; + } + fd = r; + if (create && (!exist)) { + r = (*index)->created(oid, (*path)->path()); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error creating " << oid << " (" << (*path)->path() + << ") in index: " << cpp_strerror(-r) << dendl; + goto fail; + } + r = chain_fsetxattr<true, true>( + fd, XATTR_SPILL_OUT_NAME, + XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path() + << "):" << cpp_strerror(-r) << dendl; + goto fail; + } + } + + if (!replaying) { + bool existed; + *outfd = fdcache.add(oid, fd, &existed); + if (existed) { + TEMP_FAILURE_RETRY(::close(fd)); + } + } else { + *outfd = std::make_shared<FDCache::FD>(fd); + } + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + return 0; + + fail: + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +void FileStore::lfn_close(FDRef fd) +{ +} + +int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) +{ + Index index_new, index_old; + IndexedPath path_new, path_old; + int exist; + int r; + bool index_same = false; + if (c < newcid) { + r = get_index(newcid, &index_new); + if (r < 0) + return r; + r = get_index(c, &index_old); + if (r < 0) + return r; + } else if (c == newcid) { + r = get_index(c, &index_old); + if (r < 0) + return r; + index_new = index_old; + index_same = true; + } else { + r = get_index(c, &index_old); + if (r < 0) + return r; + r = get_index(newcid, &index_new); + if (r < 0) + return r; + } + + ceph_assert(index_old.index); + ceph_assert(index_new.index); + + if (!index_same) { + + RWLock::RLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (!exist) + return -ENOENT; + + RWLock::WLocker l2((index_new.index)->access_lock); + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << __FUNC__ << ": path_old: " << path_old << dendl; + dout(25) << __FUNC__ << ": path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + } else { + RWLock::WLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (!exist) + return -ENOENT; + + r = 
index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << __FUNC__ << ": path_old: " << path_old << dendl; + dout(25) << __FUNC__ << ": path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + // make sure old fd for unlinked/overwritten file is gone + fdcache.clear(newoid); + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + } + return 0; +} + +int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o, + const SequencerPosition &spos, + bool force_clear_omap) +{ + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl; + return r; + } + + ceph_assert(index.index); + RWLock::WLocker l((index.index)->access_lock); + + { + IndexedPath path; + int hardlink; + r = index->lookup(o, &path, &hardlink); + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + + if (!force_clear_omap) { + if (hardlink == 0 || hardlink == 1) { + force_clear_omap = true; + } + } + if (force_clear_omap) { + dout(20) << __FUNC__ << ": clearing omap on " << o + << " in cid " << cid << dendl; + r = object_map->clear(o, &spos); + if (r < 0 && r != -ENOENT) { + dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + if (cct->_conf->filestore_debug_inject_read_err) { + debug_obj_on_delete(o); + } + if (!m_disable_wbthrottle) { + wbthrottle.clear_object(o); // should be only non-cache ref + } + fdcache.clear(o); + } else { + /* Ensure that replay of this op doesn't result in the object_map + * going away. 
+ */ + if (!backend->can_checkpoint()) + object_map->sync(&o, &spos); + } + if (hardlink == 0) { + if (!m_disable_wbthrottle) { + wbthrottle.clear_object(o); // should be only non-cache ref + } + return 0; + } + } + r = index->unlink(o); + if (r < 0) { + dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +FileStore::FileStore(CephContext* cct, const std::string &base, + const std::string &jdev, osflagbits_t flags, + const char *name, bool do_update) : + JournalingObjectStore(cct, base), + internal_name(name), + basedir(base), journalpath(jdev), + generic_flags(flags), + blk_size(0), + fsid_fd(-1), op_fd(-1), + basedir_fd(-1), current_fd(-1), + backend(nullptr), + index_manager(cct, do_update), + lock("FileStore::lock"), + force_sync(false), + sync_entry_timeo_lock("FileStore::sync_entry_timeo_lock"), + timer(cct, sync_entry_timeo_lock), + stop(false), sync_thread(this), + coll_lock("FileStore::coll_lock"), + fdcache(cct), + wbthrottle(cct), + next_osr_id(0), + m_disable_wbthrottle(cct->_conf->filestore_odsync_write || + !cct->_conf->filestore_wbthrottle_enable), + throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency), + throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency), + m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads), + m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads), + op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"), + op_wq(this, cct->_conf->filestore_op_thread_timeout, + cct->_conf->filestore_op_thread_suicide_timeout, &op_tp), + logger(nullptr), + trace_endpoint("0.0.0.0", 0, "FileStore"), + read_error_lock("FileStore::read_error_lock"), + m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout), + m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ), + m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing), + m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead), + m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold), + m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval), + m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval), + m_filestore_fail_eio(cct->_conf->filestore_fail_eio), + m_filestore_fadvise(cct->_conf->filestore_fadvise), + do_update(do_update), + m_journal_dio(cct->_conf->journal_dio), + m_journal_aio(cct->_conf->journal_aio), + m_journal_force_aio(cct->_conf->journal_force_aio), + m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap), + m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap), + m_filestore_do_dump(false), + m_filestore_dump_fmt(true), + m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc), + m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size), + m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size), + m_fs_type(0), + m_filestore_max_inline_xattr_size(0), + m_filestore_max_inline_xattrs(0), + m_filestore_max_xattr_value_size(0) +{ + m_filestore_kill_at = cct->_conf->filestore_kill_at; + for (int i = 0; i < m_ondisk_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-ondisk-" << i; + Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore"); + ondisk_finishers.push_back(f); + } + for (int i = 0; i < m_apply_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-apply-" << i; + Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore"); + 
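// ---- editorial aside (illustrative sketch; not part of this change) ----
// In lfn_unlink() earlier in this hunk, omap data, the wbthrottle entry and
// the cached fd are only cleared when the last references to the object are
// going away: a hardlink count of 1 means this unlink removes the final link,
// and 0 means the object is already gone (e.g. during replay). A compact
// sketch of that decision; the demo_* name is illustrative.
#include <iostream>

static bool demo_should_clear_omap(int hardlink, bool force_clear_omap) {
  return force_clear_omap || hardlink == 0 || hardlink == 1;
}

int main() {
  std::cout << demo_should_clear_omap(3, false) << "\n";  // 0: other links remain
  std::cout << demo_should_clear_omap(1, false) << "\n";  // 1: last link going away
  std::cout << demo_should_clear_omap(0, false) << "\n";  // 1: already unlinked
}
// ---- end editorial aside ----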
apply_finishers.push_back(f); + } + + ostringstream oss; + oss << basedir << "/current"; + current_fn = oss.str(); + + ostringstream sss; + sss << basedir << "/current/commit_op_seq"; + current_op_seq_fn = sss.str(); + + ostringstream omss; + if (cct->_conf->filestore_omap_backend_path != "") { + omap_dir = cct->_conf->filestore_omap_backend_path; + } else { + omss << basedir << "/current/omap"; + omap_dir = omss.str(); + } + + // initialize logger + PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last); + + plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue"); + plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied"); + plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue"); + plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied"); + plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency", + NULL, PerfCountersBuilder::PRIO_USEFUL); + plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs"); + plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written"); + plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue"); + plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue"); + plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store"); + plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue"); + plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue"); + plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store"); + plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency"); + plb.add_u64(l_filestore_committing, "committing", "Is currently committing"); + + plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles"); + plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits"); + plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit"); + plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full"); + plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg", + "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL); + plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs"); + + logger = plb.create_perf_counters(); + + cct->get_perfcounters_collection()->add(logger); + cct->_conf.add_observer(this); + + superblock.compat_features = get_fs_initial_compat_set(); +} + +FileStore::~FileStore() +{ + for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + delete *it; + *it = nullptr; + } + for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + delete *it; + *it = nullptr; + } + cct->_conf.remove_observer(this); + cct->get_perfcounters_collection()->remove(logger); + + if (journal) + journal->logger = nullptr; + delete logger; + logger = nullptr; + + if (m_filestore_do_dump) { + dump_stop(); + } +} + +static void get_attrname(const char *name, char *buf, int len) +{ + snprintf(buf, len, "user.ceph.%s", name); +} + +bool parse_attrname(char **name) +{ + if 
(strncmp(*name, "user.ceph.", 10) == 0) { + *name += 10; + return true; + } + return false; +} + +void FileStore::collect_metadata(map<string,string> *pm) +{ + char partition_path[PATH_MAX]; + char dev_node[PATH_MAX]; + + (*pm)["filestore_backend"] = backend->get_name(); + ostringstream ss; + ss << "0x" << std::hex << m_fs_type << std::dec; + (*pm)["filestore_f_type"] = ss.str(); + + if (cct->_conf->filestore_collect_device_partition_information) { + int rc = 0; + BlkDev blkdev(fsid_fd); + if (rc = blkdev.partition(partition_path, PATH_MAX); rc) { + (*pm)["backend_filestore_partition_path"] = "unknown"; + } else { + (*pm)["backend_filestore_partition_path"] = string(partition_path); + } + if (rc = blkdev.wholedisk(dev_node, PATH_MAX); rc) { + (*pm)["backend_filestore_dev_node"] = "unknown"; + } else { + (*pm)["backend_filestore_dev_node"] = string(dev_node); + devname = dev_node; + } + if (rc == 0 && vdo_fd >= 0) { + (*pm)["vdo"] = "true"; + (*pm)["vdo_physical_size"] = + stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks")); + } + if (journal) { + journal->collect_metadata(pm); + } + } +} + +int FileStore::get_devices(set<string> *ls) +{ + string dev_node; + BlkDev blkdev(fsid_fd); + if (int rc = blkdev.wholedisk(&dev_node); rc) { + return rc; + } + get_raw_devices(dev_node, ls); + if (journal) { + journal->get_devices(ls); + } + return 0; +} + +int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts) +{ + struct statfs buf; + buf0->reset(); + if (alerts) { + alerts->clear(); // returns nothing for now + } + if (::statfs(basedir.c_str(), &buf) < 0) { + int r = -errno; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + ceph_assert(r != -ENOENT); + return r; + } + + uint64_t bfree = buf.f_bavail * buf.f_bsize; + + // assume all of leveldb/rocksdb is omap. + { + map<string,uint64_t> kv_usage; + buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage); + } + + uint64_t thin_total, thin_avail; + if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) { + buf0->total = thin_total; + bfree = std::min(bfree, thin_avail); + buf0->allocated = thin_total - thin_avail; + buf0->data_stored = bfree; + } else { + buf0->total = buf.f_blocks * buf.f_bsize; + buf0->allocated = bfree; + buf0->data_stored = bfree; + } + buf0->available = bfree; + + // FIXME: we don't know how to populate buf->internal_metadata; XFS doesn't + // tell us what its internal overhead is. 
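// ---- editorial aside (illustrative sketch; not part of this change) ----
// A minimal sketch of the free-space arithmetic in statfs() here: statfs(2)
// block counts are converted to bytes, and (as in the adjustment immediately
// below) space the journal still has queued is held back from "available",
// clamping at zero. Linux-only and purely illustrative; the demo_* names are
// not Ceph API, and the omap/VDO contributions handled above are omitted.
#include <sys/vfs.h>
#include <cerrno>
#include <cstdint>
#include <cstdio>

struct demo_space {
  uint64_t total = 0;
  uint64_t available = 0;
  uint64_t internally_reserved = 0;
};

static int demo_fill_space(const char* path, uint64_t journal_estimate,
                           demo_space* out) {
  struct statfs st;
  if (::statfs(path, &st) < 0)
    return -errno;
  out->total = (uint64_t)st.f_blocks * st.f_bsize;
  uint64_t bfree = (uint64_t)st.f_bavail * st.f_bsize;
  out->internally_reserved = journal_estimate;
  out->available = bfree > journal_estimate ? bfree - journal_estimate : 0;
  return 0;
}

int main() {
  demo_space s;
  if (demo_fill_space("/tmp", 16ull << 20, &s) == 0)
    std::printf("total=%llu available=%llu reserved=%llu\n",
                (unsigned long long)s.total,
                (unsigned long long)s.available,
                (unsigned long long)s.internally_reserved);
}
// ---- end editorial aside ----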
+ + // Adjust for writes pending in the journal + if (journal) { + uint64_t estimate = journal->get_journal_size_estimate(); + buf0->internally_reserved = estimate; + if (buf0->available > estimate) + buf0->available -= estimate; + else + buf0->available = 0; + } + + return 0; +} + +int FileStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) +{ + return -ENOTSUP; +} + +void FileStore::new_journal() +{ + if (journalpath.length()) { + dout(10) << "open_journal at " << journalpath << dendl; + journal = new FileJournal(cct, fsid, &finisher, &sync_cond, + journalpath.c_str(), + m_journal_dio, m_journal_aio, + m_journal_force_aio); + if (journal) + journal->logger = logger; + } + return; +} + +int FileStore::dump_journal(ostream& out) +{ + int r; + + if (!journalpath.length()) + return -EINVAL; + + FileJournal *journal = new FileJournal(cct, fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio); + r = journal->dump(out); + delete journal; + journal = nullptr; + return r; +} + +FileStoreBackend *FileStoreBackend::create(unsigned long f_type, FileStore *fs) +{ + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + return new BtrfsFileStoreBackend(fs); +# ifdef HAVE_LIBXFS + case XFS_SUPER_MAGIC: + return new XfsFileStoreBackend(fs); +# endif +#endif +#ifdef HAVE_LIBZFS + case ZFS_SUPER_MAGIC: + return new ZFSFileStoreBackend(fs); +#endif + default: + return new GenericFileStoreBackend(fs); + } +} + +void FileStore::create_backend(unsigned long f_type) +{ + m_fs_type = f_type; + + ceph_assert(!backend); + backend = FileStoreBackend::create(f_type, this); + + dout(0) << "backend " << backend->get_name() + << " (magic 0x" << std::hex << f_type << std::dec << ")" + << dendl; + + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + if (!m_disable_wbthrottle){ + wbthrottle.set_fs(WBThrottle::BTRFS); + } + break; + + case XFS_SUPER_MAGIC: + // wbthrottle is constructed with fs(WBThrottle::XFS) + break; +#endif + } + + set_xattr_limits_via_conf(); +} + +int FileStore::mkfs() +{ + int ret = 0; + char fsid_fn[PATH_MAX]; + char fsid_str[40]; + uuid_d old_fsid; + uuid_d old_omap_fsid; + + dout(1) << "mkfs in " << basedir << dendl; + basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC); + if (basedir_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + // open+lock fsid + snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + if (lock_fsid() < 0) { + ret = -EBUSY; + goto close_fsid_fd; + } + + if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl; + } else { + dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl; + } + + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << __FUNC__ << ": failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (::fsync(fsid_fd) < 0) { + ret = -errno; + derr << __FUNC__ << ": close 
failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + dout(10) << __FUNC__ << ": fsid is " << fsid << dendl; + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + fsid = old_fsid; + dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl; + } + + // version stamp + ret = write_version_stamp(); + if (ret < 0) { + derr << __FUNC__ << ": write_version_stamp() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // superblock + superblock.omap_backend = cct->_conf->filestore_omap_backend; + ret = write_superblock(); + if (ret < 0) { + derr << __FUNC__ << ": write_superblock() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + struct statfs basefs; + ret = ::fstatfs(basedir_fd, &basefs); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": cannot fstatfs basedir " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + +#if defined(__linux__) + if (basefs.f_type == BTRFS_SUPER_MAGIC && + !g_ceph_context->check_experimental_feature_enabled("btrfs")) { + derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl; + goto close_fsid_fd; + } +#endif + + create_backend(basefs.f_type); + + ret = backend->create_current(); + if (ret < 0) { + derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // write initial op_seq + { + uint64_t initial_seq = 0; + int fd = read_op_seq(&initial_seq); + if (fd < 0) { + ret = fd; + derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (initial_seq == 0) { + ret = write_op_seq(fd, 1); + if (ret < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (backend->can_checkpoint()) { + // create snap_1 too + current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC); + ceph_assert(current_fd >= 0); + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull); + ret = backend->create_checkpoint(s, nullptr); + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + if (ret < 0 && ret != -EEXIST) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << __FUNC__ << ": failed to create snap_1: " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + } + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir); + if (ret < 0) { + derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl; + goto close_fsid_fd; + } + // create fsid under omap + // open+lock fsid + int omap_fsid_fd; + char omap_fsid_fn[PATH_MAX]; + snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str()); + omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644); + if (omap_fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) { + ceph_assert(!fsid.is_zero()); + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(omap_fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + ret = 
safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << __FUNC__ << ": failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl; + if (::fsync(omap_fsid_fd) < 0) { + ret = -errno; + derr << __FUNC__ << ": close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + dout(10) << "mkfs omap fsid is " << fsid << dendl; + } else { + if (fsid != old_omap_fsid) { + derr << __FUNC__ << ": " << omap_fsid_fn + << " has existed omap fsid " << old_omap_fsid + << " != expected osd fsid " << fsid + << dendl; + ret = -EINVAL; + goto close_omap_fsid_fd; + } + dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl; + } + + dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl; + + // journal? + ret = mkjournal(); + if (ret) + goto close_omap_fsid_fd; + + ret = write_meta("type", "filestore"); + if (ret) + goto close_omap_fsid_fd; + + dout(1) << "mkfs done in " << basedir << dendl; + ret = 0; + + close_omap_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd)); + close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + delete backend; + backend = nullptr; + return ret; +} + +int FileStore::mkjournal() +{ + // read fsid + int ret; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC, 0644); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl; + return -err; + } + ret = read_fsid(fd, &fsid); + if (ret < 0) { + derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + ret = 0; + + new_journal(); + if (journal) { + ret = journal->check(); + if (ret < 0) { + ret = journal->create(); + if (ret) + derr << __FUNC__ << ": error creating journal on " << journalpath + << ": " << cpp_strerror(ret) << dendl; + else + dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl; + } + delete journal; + journal = nullptr; + } + return ret; +} + +int FileStore::read_fsid(int fd, uuid_d *uuid) +{ + char fsid_str[40]; + memset(fsid_str, 0, sizeof(fsid_str)); + int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) + return ret; + if (ret == 8) { + // old 64-bit fsid... mirror it. + *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; + *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; + return 0; + } + + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) + return -EINVAL; + return 0; +} + +int FileStore::lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? " + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool FileStore::test_mount_in_use() +{ + dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + + // verify fs isn't in use + + fsid_fd = ::open(fn, O_RDWR|O_CLOEXEC, 0644); + if (fsid_fd < 0) + return 0; // no fsid, ok. 
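// ---- editorial aside (illustrative sketch; not part of this change) ----
// lock_fsid() above takes a whole-file advisory write lock with
// fcntl(F_SETLK); a second process attempting the same lock gets
// EAGAIN/EACCES, which is exactly what test_mount_in_use() checks for just
// below. A self-contained sketch of that primitive; the demo_* name and the
// /tmp path are assumptions for illustration.
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>

// Returns 0 if we now hold the lock, negative errno if another process does.
static int demo_try_lock_whole_file(int fd) {
  struct flock l;
  std::memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;   // exclusive (write) lock
  l.l_whence = SEEK_SET;
  l.l_start = 0;
  l.l_len = 0;          // length 0 means "to end of file", i.e. the whole file
  return ::fcntl(fd, F_SETLK, &l) < 0 ? -errno : 0;
}

int main() {
  int fd = ::open("/tmp/demo_fsid_lock", O_RDWR | O_CREAT | O_CLOEXEC, 0644);
  if (fd < 0)
    return 1;
  int r = demo_try_lock_whole_file(fd);
  if (r == 0)
    std::printf("locked; this process owns the store\n");
  else
    std::printf("already in use: %s\n", std::strerror(-r));
  ::close(fd);          // closing the descriptor releases the advisory lock
  return 0;
}
// ---- end editorial aside ----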
+ bool inuse = lock_fsid() < 0; + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + return inuse; +} + +bool FileStore::is_rotational() +{ + bool rotational; + if (backend) { + rotational = backend->is_rotational(); + } else { + int fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return true; + struct statfs st; + int r = ::fstatfs(fd, &st); + ::close(fd); + if (r < 0) { + return true; + } + create_backend(st.f_type); + rotational = backend->is_rotational(); + delete backend; + backend = nullptr; + } + dout(10) << __func__ << " " << (int)rotational << dendl; + return rotational; +} + +bool FileStore::is_journal_rotational() +{ + bool journal_rotational; + if (backend) { + journal_rotational = backend->is_journal_rotational(); + } else { + int fd = ::open(journalpath.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return true; + struct statfs st; + int r = ::fstatfs(fd, &st); + ::close(fd); + if (r < 0) { + return true; + } + create_backend(st.f_type); + journal_rotational = backend->is_journal_rotational(); + delete backend; + backend = nullptr; + } + dout(10) << __func__ << " " << (int)journal_rotational << dendl; + return journal_rotational; +} + +int FileStore::_detect_fs() +{ + struct statfs st; + int r = ::fstatfs(basedir_fd, &st); + if (r < 0) + return -errno; + + blk_size = st.f_bsize; + +#if defined(__linux__) + if (st.f_type == BTRFS_SUPER_MAGIC && + !g_ceph_context->check_experimental_feature_enabled("btrfs")) { + derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl; + return -EPERM; + } +#endif + + create_backend(st.f_type); + + r = backend->detect_features(); + if (r < 0) { + derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl; + return r; + } + + // vdo + { + char dev_node[PATH_MAX]; + if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) { + vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name); + if (vdo_fd >= 0) { + dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node + << dendl; + } + } + } + + // test xattrs + char fn[PATH_MAX]; + int x = rand(); + int y = x+1; + snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str()); + int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0700); + if (tmpfd < 0) { + int ret = -errno; + derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x)); + if (ret >= 0) + ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y)); + if ((ret < 0) || (x != y)) { + derr << "Extended attributes don't appear to work. "; + if (ret) + *_dout << "Got error " + cpp_strerror(ret) + ". "; + *_dout << "If you are using ext3 or ext4, be sure to mount the underlying " + << "file system with the 'user_xattr' option." 
<< dendl; + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + return -ENOTSUP; + } + + char buf[1000]; + memset(buf, 0, sizeof(buf)); // shut up valgrind + chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); + ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); + if (ret == -ENOSPC) { + dout(0) << "limited size xattrs" << dendl; + } + chain_fremovexattr(tmpfd, "user.test"); + chain_fremovexattr(tmpfd, "user.test2"); + chain_fremovexattr(tmpfd, "user.test3"); + chain_fremovexattr(tmpfd, "user.test4"); + chain_fremovexattr(tmpfd, "user.test5"); + + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + + return 0; +} + +int FileStore::_sanity_check_fs() +{ + // sanity check(s) + + if (((int)m_filestore_journal_writeahead + + (int)m_filestore_journal_parallel + + (int)m_filestore_journal_trailing) > 1) { + dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl; + cerr << TEXT_RED + << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n" + << " is enabled in ceph.conf. You must choose a single journal mode." + << TEXT_NORMAL << std::endl; + return -EINVAL; + } + + if (!backend->can_checkpoint()) { + if (!journal || !m_filestore_journal_writeahead) { + dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl; + cerr << TEXT_RED + << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n" + << " For non-btrfs volumes, a writeahead journal is required to\n" + << " maintain on-disk consistency in the event of a crash. Your conf\n" + << " should include something like:\n" + << " osd journal = /path/to/journal_device_or_file\n" + << " filestore journal writeahead = true\n" + << TEXT_NORMAL; + } + } + + if (!journal) { + dout(0) << "mount WARNING: no journal" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: No osd journal is configured: write latency may be high.\n" + << " If you will not be using an osd journal, write latency may be\n" + << " relatively high. 
It can be reduced somewhat by lowering\n" + << " filestore_max_sync_interval, but lower values mean lower write\n" + << " throughput, especially with spinning disks.\n" + << TEXT_NORMAL; + } + + return 0; +} + +int FileStore::write_superblock() +{ + bufferlist bl; + encode(superblock, bl); + return safe_write_file(basedir.c_str(), "superblock", + bl.c_str(), bl.length(), 0600); +} + +int FileStore::read_superblock() +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "superblock", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) { + // If the file doesn't exist write initial CompatSet + return write_superblock(); + } + return ret; + } + + bufferlist bl; + bl.push_back(std::move(bp)); + auto i = bl.cbegin(); + decode(superblock, i); + return 0; +} + +int FileStore::update_version_stamp() +{ + return write_version_stamp(); +} + +int FileStore::version_stamp_is_valid(uint32_t *version) +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "store_version", + bp.c_str(), bp.length()); + if (ret < 0) { + return ret; + } + bufferlist bl; + bl.push_back(std::move(bp)); + auto i = bl.cbegin(); + decode(*version, i); + dout(10) << __FUNC__ << ": was " << *version << " vs target " + << target_version << dendl; + if (*version == target_version) + return 1; + else + return 0; +} + +int FileStore::flush_cache(ostream *os) +{ + string drop_caches_file = "/proc/sys/vm/drop_caches"; + int drop_caches_fd = ::open(drop_caches_file.c_str(), O_WRONLY|O_CLOEXEC), ret = 0; + char buf[2] = "3"; + size_t len = strlen(buf); + + if (drop_caches_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << drop_caches_file << ": " << cpp_strerror(ret) << dendl; + if (os) { + *os << "FileStore flush_cache: failed to open " << drop_caches_file << ": " << cpp_strerror(ret); + } + return ret; + } + + if (::write(drop_caches_fd, buf, len) < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to write to " << drop_caches_file << ": " << cpp_strerror(ret) << dendl; + if (os) { + *os << "FileStore flush_cache: failed to write to " << drop_caches_file << ": " << cpp_strerror(ret); + } + goto out; + } + +out: + ::close(drop_caches_fd); + return ret; +} + +int FileStore::write_version_stamp() +{ + dout(1) << __FUNC__ << ": " << target_version << dendl; + bufferlist bl; + encode(target_version, bl); + + return safe_write_file(basedir.c_str(), "store_version", + bl.c_str(), bl.length(), 0600); +} + +int FileStore::upgrade() +{ + dout(1) << __FUNC__ << dendl; + uint32_t version; + int r = version_stamp_is_valid(&version); + + if (r == -ENOENT) { + derr << "The store_version file doesn't exist." << dendl; + return -EINVAL; + } + if (r < 0) + return r; + if (r == 1) + return 0; + + if (version < 3) { + derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl; + return -EINVAL; + } + + // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to + // open up DBObjectMap with the do_upgrade flag, which we already did. 
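// ---- editorial aside (illustrative sketch; not part of this change) ----
// A compact restatement of the gating in upgrade()/version_stamp_is_valid()
// above: a stamp check of 1 means the store already matches target_version, 0
// means it is older and needs conversion, and anything below v3 must go
// through the older (firefly-era) conversion path before this code will touch
// it. Purely illustrative; the enum and demo_* names are not Ceph API.
#include <cstdint>
#include <iostream>

enum class UpgradeAction { UpToDate, UpgradeInPlace, NeedOlderTool, Error };

static UpgradeAction demo_upgrade_action(int stamp_check, uint32_t version,
                                         uint32_t target) {
  if (stamp_check < 0)
    return UpgradeAction::Error;          // store_version missing or unreadable
  if (stamp_check == 1 || version == target)
    return UpgradeAction::UpToDate;
  if (version < 3)
    return UpgradeAction::NeedOlderTool;  // pre-v3 stores convert with older tools
  return UpgradeAction::UpgradeInPlace;   // v3 -> v4: DBObjectMap upgrade + restamp
}

int main() {
  std::cout << static_cast<int>(demo_upgrade_action(0, 3, 4)) << "\n";  // 1: in place
  std::cout << static_cast<int>(demo_upgrade_action(1, 4, 4)) << "\n";  // 0: up to date
  std::cout << static_cast<int>(demo_upgrade_action(0, 2, 4)) << "\n";  // 2: older tool
}
// ---- end editorial aside ----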
+ update_version_stamp(); + return 0; +} + +int FileStore::read_op_seq(uint64_t *seq) +{ + int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR|O_CLOEXEC, 0644); + if (op_fd < 0) { + int r = -errno; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + char s[40]; + memset(s, 0, sizeof(s)); + int ret = safe_read(op_fd, s, sizeof(s) - 1); + if (ret < 0) { + derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(op_fd)); + ceph_assert(!m_filestore_fail_eio || ret != -EIO); + return ret; + } + *seq = atoll(s); + return op_fd; +} + +int FileStore::write_op_seq(int fd, uint64_t seq) +{ + char s[30]; + snprintf(s, sizeof(s), "%" PRId64 "\n", seq); + int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0)); + if (ret < 0) { + ret = -errno; + ceph_assert(!m_filestore_fail_eio || ret != -EIO); + } + return ret; +} + +int FileStore::mount() +{ + int ret; + char buf[PATH_MAX]; + uint64_t initial_op_seq; + uuid_d omap_fsid; + set<string> cluster_snaps; + CompatSet supported_compat_set = get_fs_supported_compat_set(); + + dout(5) << "basedir " << basedir << " journal " << journalpath << dendl; + + ret = set_throttle_params(); + if (ret != 0) + goto done; + + // make sure global base dir exists + if (::access(basedir.c_str(), R_OK | W_OK)) { + ret = -errno; + derr << __FUNC__ << ": unable to access basedir '" << basedir << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + // get fsid + snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(buf, O_RDWR|O_CLOEXEC, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": error opening '" << buf << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + ret = read_fsid(fsid_fd, &fsid); + if (ret < 0) { + derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret) + << dendl; + goto close_fsid_fd; + } + + if (lock_fsid() < 0) { + derr << __FUNC__ << ": lock_fsid failed" << dendl; + ret = -EBUSY; + goto close_fsid_fd; + } + + dout(10) << "mount fsid is " << fsid << dendl; + + + uint32_t version_stamp; + ret = version_stamp_is_valid(&version_stamp); + if (ret < 0) { + derr << __FUNC__ << ": error in version_stamp_is_valid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } else if (ret == 0) { + if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) { + derr << __FUNC__ << ": stale version stamp detected: " + << version_stamp + << ". Proceeding, do_update " + << "is set, performing disk format upgrade." + << dendl; + do_update = true; + } else { + ret = -EINVAL; + derr << __FUNC__ << ": stale version stamp " << version_stamp + << ". 
Please run the FileStore update script before starting the " + << "OSD, or set filestore_update_to to " << target_version + << " (currently " << cct->_conf->filestore_update_to << ")" + << dendl; + goto close_fsid_fd; + } + } + + ret = read_superblock(); + if (ret < 0) { + goto close_fsid_fd; + } + + // Check if this FileStore supports all the necessary features to mount + if (supported_compat_set.compare(superblock.compat_features) == -1) { + derr << __FUNC__ << ": Incompatible features set " + << superblock.compat_features << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + + // open some dir handles + basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC); + if (basedir_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << basedir << ": " + << cpp_strerror(ret) << dendl; + basedir_fd = -1; + goto close_fsid_fd; + } + + // test for btrfs, xattrs, etc. + ret = _detect_fs(); + if (ret < 0) { + derr << __FUNC__ << ": error in _detect_fs: " + << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + { + list<string> ls; + ret = backend->list_checkpoints(ls); + if (ret < 0) { + derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + long long unsigned c, prev = 0; + char clustersnap[NAME_MAX]; + for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) { + if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) { + ceph_assert(c > prev); + prev = c; + snaps.push_back(c); + } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1) + cluster_snaps.insert(*it); + } + } + + if (m_osd_rollback_to_cluster_snap.length() && + cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) { + derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl; + ret = -ENOENT; + goto close_basedir_fd; + } + + char nosnapfn[200]; + snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str()); + + if (backend->can_checkpoint()) { + if (snaps.empty()) { + dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl; + } else { + char s[NAME_MAX]; + uint64_t curr_seq = 0; + + if (m_osd_rollback_to_cluster_snap.length()) { + derr << TEXT_RED + << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **" + << TEXT_NORMAL + << dendl; + ceph_assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap)); + snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str()); + } else { + { + int fd = read_op_seq(&curr_seq); + if (fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + } + if (curr_seq) + dout(10) << " current/ seq was " << curr_seq << dendl; + else + dout(10) << " current/ missing entirely (unusual, but okay)" << dendl; + + uint64_t cp = snaps.back(); + dout(10) << " most recent snap from " << snaps << " is " << cp << dendl; + + // if current/ is marked as non-snapshotted, refuse to roll + // back (without clear direction) to avoid throwing out new + // data. + struct stat st; + if (::stat(nosnapfn, &st) == 0) { + if (!m_osd_use_stale_snap) { + derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl; + derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl; + derr << "config option for --osd-use-stale-snap startup argument." 
<< dendl; + ret = -ENOTSUP; + goto close_basedir_fd; + } + derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq + << ", newest snap is " << cp << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: forcing the use of stale snapshot data **" + << TEXT_NORMAL << std::endl; + } + + dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + } + + // drop current? + ret = backend->rollback_to(s); + if (ret) { + derr << __FUNC__ << ": error rolling back to " << s << ": " + << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + } + } + initial_op_seq = 0; + + current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC); + if (current_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + ceph_assert(current_fd >= 0); + + op_fd = read_op_seq(&initial_op_seq); + if (op_fd < 0) { + ret = op_fd; + derr << __FUNC__ << ": read_op_seq failed" << dendl; + goto close_current_fd; + } + + dout(5) << "mount op_seq is " << initial_op_seq << dendl; + if (initial_op_seq == 0) { + derr << "mount initial op seq is 0; something is wrong" << dendl; + ret = -EINVAL; + goto close_current_fd; + } + + if (!backend->can_checkpoint()) { + // mark current/ as non-snapshotted so that we don't rollback away + // from it. + int r = ::creat(nosnapfn, 0644); + if (r < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to create current/nosnap" << dendl; + goto close_current_fd; + } + VOID_TEMP_FAILURE_RETRY(::close(r)); + } else { + // clear nosnap marker, if present. + ::unlink(nosnapfn); + } + + // check fsid with omap + // get omap fsid + char omap_fsid_buf[PATH_MAX]; + struct ::stat omap_fsid_stat; + snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str()); + // if osd_uuid not exists, assume as this omap matchs corresponding osd + if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0){ + dout(10) << __FUNC__ << ": osd_uuid not found under omap, " + << "assume as matched." 
+            << dendl;
+  } else {
+    int omap_fsid_fd;
+    // if osd_uuid exists, compare it with the osd fsid
+    omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY|O_CLOEXEC, 0644);
+    if (omap_fsid_fd < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': "
+           << cpp_strerror(ret)
+           << dendl;
+      goto close_current_fd;
+    }
+    ret = read_fsid(omap_fsid_fd, &omap_fsid);
+    VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+    if (ret < 0) {
+      derr << __FUNC__ << ": error reading omap_fsid_fd"
+           << ", omap_fsid = " << omap_fsid
+           << ": " << cpp_strerror(ret)
+           << dendl;
+      goto close_current_fd;
+    }
+    if (fsid != omap_fsid) {
+      derr << __FUNC__ << ": " << omap_fsid_buf
+           << " contains omap fsid " << omap_fsid
+           << " != expected osd fsid " << fsid
+           << dendl;
+      ret = -EINVAL;
+      goto close_current_fd;
+    }
+  }
+
+  dout(0) << "starting omap initialization" << dendl;
+  if (!(generic_flags & SKIP_MOUNT_OMAP)) {
+    KeyValueDB * omap_store = KeyValueDB::create(cct,
+                                                 superblock.omap_backend,
+                                                 omap_dir);
+    if (!omap_store)
+    {
+      derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl;
+      ret = -1;
+      goto close_current_fd;
+    }
+
+    if (superblock.omap_backend == "rocksdb")
+      ret = omap_store->init(cct->_conf->filestore_rocksdb_options);
+    else
+      ret = omap_store->init();
+
+    if (ret < 0) {
+      derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl;
+      goto close_current_fd;
+    }
+
+    stringstream err;
+    if (omap_store->create_and_open(err)) {
+      delete omap_store;
+      omap_store = nullptr;
+      derr << __FUNC__ << ": Error initializing " << superblock.omap_backend
+           << " : " << err.str() << dendl;
+      ret = -1;
+      goto close_current_fd;
+    }
+
+    DBObjectMap *dbomap = new DBObjectMap(cct, omap_store);
+    ret = dbomap->init(do_update);
+    if (ret < 0) {
+      delete dbomap;
+      dbomap = nullptr;
+      derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl;
+      goto close_current_fd;
+    }
+    stringstream err2;
+
+    if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
+      derr << err2.str() << dendl;
+      delete dbomap;
+      dbomap = nullptr;
+      ret = -EINVAL;
+      goto close_current_fd;
+    }
+    object_map.reset(dbomap);
+  }
+
+  // journal
+  new_journal();
+
+  // select journal mode?
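// --- illustrative sketch, not part of FileStore.cc ---------------------
// The block below picks exactly one journal mode.  As a compact stand-alone
// summary of that decision (a restatement for illustration, not the real
// config plumbing): parallel journaling is only safe when the backend can
// take consistent checkpoints (e.g. btrfs snapshots); otherwise the journal
// must be written ahead of the filesystem, and trailing mode is used only
// when explicitly forced.
enum class demo_journal_mode { writeahead, parallel, trailing };

static demo_journal_mode demo_pick_journal_mode(bool can_checkpoint,
                                                bool force_writeahead,
                                                bool force_parallel,
                                                bool force_trailing)
{
  if (force_writeahead) return demo_journal_mode::writeahead;
  if (force_parallel)   return demo_journal_mode::parallel;
  if (force_trailing)   return demo_journal_mode::trailing;
  // nothing forced in the configuration: fall back on backend capability
  return can_checkpoint ? demo_journal_mode::parallel
                        : demo_journal_mode::writeahead;
}
// --- end sketch ---------------------------------------------------------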
+ if (journal) { + if (!m_filestore_journal_writeahead && + !m_filestore_journal_parallel && + !m_filestore_journal_trailing) { + if (!backend->can_checkpoint()) { + m_filestore_journal_writeahead = true; + dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl; + } else { + m_filestore_journal_parallel = true; + dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl; + } + } else { + if (m_filestore_journal_writeahead) + dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl; + if (m_filestore_journal_parallel) + dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl; + if (m_filestore_journal_trailing) + dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl; + } + if (m_filestore_journal_writeahead) + journal->set_wait_on_full(true); + } else { + dout(0) << __FUNC__ << ": no journal" << dendl; + } + + ret = _sanity_check_fs(); + if (ret) { + derr << __FUNC__ << ": _sanity_check_fs failed with error " + << ret << dendl; + goto close_current_fd; + } + + // Cleanup possibly invalid collections + { + vector<coll_t> collections; + ret = list_collections(collections, true); + if (ret < 0) { + derr << "Error " << ret << " while listing collections" << dendl; + goto close_current_fd; + } + for (vector<coll_t>::iterator i = collections.begin(); + i != collections.end(); + ++i) { + Index index; + ret = get_index(*i, &index); + if (ret < 0) { + derr << "Unable to mount index " << *i + << " with error: " << ret << dendl; + goto close_current_fd; + } + ceph_assert(index.index); + RWLock::WLocker l((index.index)->access_lock); + + index->cleanup(); + } + } + if (!m_disable_wbthrottle) { + wbthrottle.start(); + } else { + dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl; + if (cct->_conf->filestore_odsync_write) { + dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl; + } + } + sync_thread.create("filestore_sync"); + + if (!(generic_flags & SKIP_JOURNAL_REPLAY)) { + ret = journal_replay(initial_op_seq); + if (ret < 0) { + derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl; + if (ret == -ENOTTY) { + derr << "maybe journal is not pointing to a block device and its size " + << "wasn't configured?" << dendl; + } + + goto stop_sync; + } + } + + { + stringstream err2; + if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) { + derr << err2.str() << dendl; + ret = -EINVAL; + goto stop_sync; + } + } + + init_temp_collections(); + + journal_start(); + + op_tp.start(); + for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->start(); + } + for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->start(); + } + + timer.init(); + + // upgrade? + if (cct->_conf->filestore_update_to >= (int)get_target_version()) { + int err = upgrade(); + if (err < 0) { + derr << "error converting store" << dendl; + umount(); + return err; + } + } + + // all okay. 
+ return 0; + +stop_sync: + // stop sync thread + lock.Lock(); + stop = true; + sync_cond.Signal(); + lock.Unlock(); + sync_thread.join(); + if (!m_disable_wbthrottle) { + wbthrottle.stop(); + } +close_current_fd: + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; +close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + basedir_fd = -1; +close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +done: + ceph_assert(!m_filestore_fail_eio || ret != -EIO); + delete backend; + backend = nullptr; + object_map.reset(); + return ret; +} + +void FileStore::init_temp_collections() +{ + dout(10) << __FUNC__ << dendl; + vector<coll_t> ls; + int r = list_collections(ls, true); + ceph_assert(r >= 0); + + dout(20) << " ls " << ls << dendl; + + SequencerPosition spos; + + set<coll_t> temps; + for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) + if (p->is_temp()) + temps.insert(*p); + dout(20) << " temps " << temps << dendl; + + for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) { + if (p->is_temp()) + continue; + coll_map[*p] = new OpSequencer(cct, ++next_osr_id, *p); + if (p->is_meta()) + continue; + coll_t temp = p->get_temp(); + if (temps.count(temp)) { + temps.erase(temp); + } else { + dout(10) << __FUNC__ << ": creating " << temp << dendl; + r = _create_collection(temp, 0, spos); + ceph_assert(r == 0); + } + } + + for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) { + dout(10) << __FUNC__ << ": removing stray " << *p << dendl; + r = _collection_remove_recursive(*p, spos); + ceph_assert(r == 0); + } +} + +int FileStore::umount() +{ + dout(5) << __FUNC__ << ": " << basedir << dendl; + + flush(); + sync(); + do_force_sync(); + + { + Mutex::Locker l(coll_lock); + coll_map.clear(); + } + + lock.Lock(); + stop = true; + sync_cond.Signal(); + lock.Unlock(); + sync_thread.join(); + if (!m_disable_wbthrottle){ + wbthrottle.stop(); + } + op_tp.stop(); + + journal_stop(); + if (!(generic_flags & SKIP_JOURNAL_REPLAY)) + journal_write_close(); + + for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->stop(); + } + for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->stop(); + } + + if (vdo_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(vdo_fd)); + vdo_fd = -1; + } + if (fsid_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + } + if (op_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(op_fd)); + op_fd = -1; + } + if (current_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; + } + if (basedir_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + basedir_fd = -1; + } + + force_sync = false; + + delete backend; + backend = nullptr; + + object_map.reset(); + + { + Mutex::Locker l(sync_entry_timeo_lock); + timer.shutdown(); + } + + // nothing + return 0; +} + + +/// ----------------------------- + +// keep OpSequencer handles alive for all time so that a sequence +// that removes a collection and creates a new one will not allow +// two sequencers for the same collection to be alive at once. 
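// --- illustrative sketch, not part of FileStore.cc ---------------------
// The comment above (and create_new_collection() below) boils down to a map
// whose entries are never dropped while the store is mounted: asking for the
// same collection twice must hand back the same sequencer, so op ordering
// survives a remove/recreate of the collection.  Reduced to its essentials,
// with placeholder types standing in for coll_t and OpSequencer:
#include <map>
#include <memory>
#include <string>

struct demo_sequencer {
  explicit demo_sequencer(std::string c) : cid(std::move(c)) {}
  std::string cid;                // collection this sequencer orders ops for
};

static std::map<std::string, std::shared_ptr<demo_sequencer>> demo_coll_map;

static std::shared_ptr<demo_sequencer> demo_get_or_create(const std::string& cid)
{
  auto p = demo_coll_map.find(cid);
  if (p != demo_coll_map.end())
    return p->second;             // reuse: never two live sequencers per cid
  auto r = std::make_shared<demo_sequencer>(cid);
  demo_coll_map[cid] = r;
  return r;
}
// --- end sketch ---------------------------------------------------------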
+ +ObjectStore::CollectionHandle FileStore::open_collection(const coll_t& c) +{ + Mutex::Locker l(coll_lock); + auto p = coll_map.find(c); + if (p == coll_map.end()) { + return CollectionHandle(); + } + return p->second; +} + +ObjectStore::CollectionHandle FileStore::create_new_collection(const coll_t& c) +{ + Mutex::Locker l(coll_lock); + auto p = coll_map.find(c); + if (p == coll_map.end()) { + auto *r = new OpSequencer(cct, ++next_osr_id, c); + coll_map[c] = r; + return r; + } else { + return p->second; + } +} + + +/// ----------------------------- + +FileStore::Op *FileStore::build_op(vector<Transaction>& tls, + Context *onreadable, + Context *onreadable_sync, + TrackedOpRef osd_op) +{ + uint64_t bytes = 0, ops = 0; + for (vector<Transaction>::iterator p = tls.begin(); + p != tls.end(); + ++p) { + bytes += (*p).get_num_bytes(); + ops += (*p).get_num_ops(); + } + + Op *o = new Op; + o->start = ceph_clock_now(); + o->tls = std::move(tls); + o->onreadable = onreadable; + o->onreadable_sync = onreadable_sync; + o->ops = ops; + o->bytes = bytes; + o->osd_op = osd_op; + return o; +} + + + +void FileStore::queue_op(OpSequencer *osr, Op *o) +{ + // queue op on sequencer, then queue sequencer for the threadpool, + // so that regardless of which order the threads pick up the + // sequencer, the op order will be preserved. + + osr->queue(o); + o->trace.event("queued"); + + logger->inc(l_filestore_ops); + logger->inc(l_filestore_bytes, o->bytes); + + dout(5) << __FUNC__ << ": " << o << " seq " << o->op + << " " << *osr + << " " << o->bytes << " bytes" + << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)" + << dendl; + op_wq.queue(osr); +} + +void FileStore::op_queue_reserve_throttle(Op *o) +{ + throttle_ops.get(); + throttle_bytes.get(o->bytes); + + logger->set(l_filestore_op_queue_ops, throttle_ops.get_current()); + logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current()); +} + +void FileStore::op_queue_release_throttle(Op *o) +{ + throttle_ops.put(); + throttle_bytes.put(o->bytes); + logger->set(l_filestore_op_queue_ops, throttle_ops.get_current()); + logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current()); +} + +void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) +{ + if (!m_disable_wbthrottle) { + wbthrottle.throttle(); + } + // inject a stall? 
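// --- illustrative sketch, not part of FileStore.cc ---------------------
// op_queue_reserve_throttle()/op_queue_release_throttle() above follow the
// classic counting-throttle pattern: a submitter is held back once the
// queued ops/bytes exceed a budget, and returns its share when the op has
// been applied.  A minimal stand-alone version of that pattern with a mutex
// and condition variable (the real code uses Ceph's Throttle class and
// separate ops/bytes budgets):
#include <condition_variable>
#include <cstdint>
#include <mutex>

class demo_throttle {
  std::mutex m;
  std::condition_variable cv;
  uint64_t used = 0;
  uint64_t max;
public:
  explicit demo_throttle(uint64_t budget) : max(budget) {}
  void get(uint64_t n) {          // block until n units fit under the budget
    std::unique_lock<std::mutex> l(m);
    cv.wait(l, [&] { return used + n <= max; });
    used += n;
  }
  void put(uint64_t n) {          // return the units and wake blocked callers
    std::lock_guard<std::mutex> l(m);
    used -= n;
    cv.notify_all();
  }
};
// --- end sketch ---------------------------------------------------------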
+ if (cct->_conf->filestore_inject_stall) { + int orig = cct->_conf->filestore_inject_stall; + dout(5) << __FUNC__ << ": filestore_inject_stall " << orig << ", sleeping" << dendl; + sleep(orig); + cct->_conf.set_val("filestore_inject_stall", "0"); + dout(5) << __FUNC__ << ": done stalling" << dendl; + } + + osr->apply_lock.Lock(); + Op *o = osr->peek_queue(); + o->trace.event("op_apply_start"); + apply_manager.op_apply_start(o->op); + dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " start" << dendl; + o->trace.event("_do_transactions start"); + int r = _do_transactions(o->tls, o->op, &handle, osr->osr_name); + o->trace.event("op_apply_finish"); + apply_manager.op_apply_finish(o->op); + dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " r = " << r + << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; +} + +void FileStore::_finish_op(OpSequencer *osr) +{ + list<Context*> to_queue; + Op *o = osr->dequeue(&to_queue); + + o->tls.clear(); + + utime_t lat = ceph_clock_now(); + lat -= o->start; + + dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " lat " << lat << dendl; + osr->apply_lock.Unlock(); // locked in _do_op + o->trace.event("_finish_op"); + + // called with tp lock held + op_queue_release_throttle(o); + + logger->tinc(l_filestore_apply_latency, lat); + + if (o->onreadable_sync) { + o->onreadable_sync->complete(0); + } + if (o->onreadable) { + apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable); + } + if (!to_queue.empty()) { + apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue); + } + delete o; + o = nullptr; +} + +struct C_JournaledAhead : public Context { + FileStore *fs; + FileStore::OpSequencer *osr; + FileStore::Op *o; + Context *ondisk; + + C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk): + fs(f), osr(os), o(o), ondisk(ondisk) { } + void finish(int r) override { + fs->_journaled_ahead(osr, o, ondisk); + } +}; + +int FileStore::queue_transactions(CollectionHandle& ch, vector<Transaction>& tls, + TrackedOpRef osd_op, + ThreadPool::TPHandle *handle) +{ + Context *onreadable; + Context *ondisk; + Context *onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + + if (cct->_conf->objectstore_blackhole) { + dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction" + << dendl; + delete ondisk; + ondisk = nullptr; + delete onreadable; + onreadable = nullptr; + delete onreadable_sync; + onreadable_sync = nullptr; + return 0; + } + + utime_t start = ceph_clock_now(); + + OpSequencer *osr = static_cast<OpSequencer*>(ch.get()); + dout(5) << __FUNC__ << ": osr " << osr << " " << *osr << dendl; + + ZTracer::Trace trace; + if (osd_op && osd_op->pg_trace) { + osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace); + trace = osd_op->store_trace; + } + + if (journal && journal->is_writeable() && !m_filestore_journal_trailing) { + Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); + + //prepare and encode transactions data out of lock + bufferlist tbl; + int orig_len = journal->prepare_entry(o->tls, &tbl); + + if (handle) + handle->suspend_tp_timeout(); + + op_queue_reserve_throttle(o); + journal->reserve_throttle_and_backoff(tbl.length()); + + if (handle) + handle->reset_tp_timeout(); + + uint64_t op_num = submit_manager.op_submit_start(); + o->op = op_num; + trace.keyval("opnum", op_num); + + if (m_filestore_do_dump) + 
dump_transactions(o->tls, o->op, osr); + + if (m_filestore_journal_parallel) { + dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl; + + trace.keyval("journal mode", "parallel"); + trace.event("journal started"); + _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op); + + // queue inside submit_manager op submission lock + queue_op(osr, o); + trace.event("op queued"); + } else if (m_filestore_journal_writeahead) { + dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl; + + osr->queue_journal(o); + + trace.keyval("journal mode", "writeahead"); + trace.event("journal started"); + _op_journal_transactions(tbl, orig_len, o->op, + new C_JournaledAhead(this, osr, o, ondisk), + osd_op); + } else { + ceph_abort(); + } + submit_manager.op_submit_finish(op_num); + utime_t end = ceph_clock_now(); + logger->tinc(l_filestore_queue_transaction_latency_avg, end - start); + return 0; + } + + if (!journal) { + Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); + dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl; + + if (handle) + handle->suspend_tp_timeout(); + + op_queue_reserve_throttle(o); + + if (handle) + handle->reset_tp_timeout(); + + uint64_t op_num = submit_manager.op_submit_start(); + o->op = op_num; + + if (m_filestore_do_dump) + dump_transactions(o->tls, o->op, osr); + + queue_op(osr, o); + trace.keyval("opnum", op_num); + trace.keyval("journal mode", "none"); + trace.event("op queued"); + + if (ondisk) + apply_manager.add_waiter(op_num, ondisk); + submit_manager.op_submit_finish(op_num); + utime_t end = ceph_clock_now(); + logger->tinc(l_filestore_queue_transaction_latency_avg, end - start); + return 0; + } + + ceph_assert(journal); + //prepare and encode transactions data out of lock + bufferlist tbl; + int orig_len = -1; + if (journal->is_writeable()) { + orig_len = journal->prepare_entry(tls, &tbl); + } + uint64_t op = submit_manager.op_submit_start(); + dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl; + + if (m_filestore_do_dump) + dump_transactions(tls, op, osr); + + trace.event("op_apply_start"); + trace.keyval("opnum", op); + trace.keyval("journal mode", "trailing"); + apply_manager.op_apply_start(op); + trace.event("do_transactions"); + int r = do_transactions(tls, op); + + if (r >= 0) { + trace.event("journal started"); + _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op); + } else { + delete ondisk; + ondisk = nullptr; + } + + // start on_readable finisher after we queue journal item, as on_readable callback + // is allowed to delete the Transaction + if (onreadable_sync) { + onreadable_sync->complete(r); + } + apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r); + + submit_manager.op_submit_finish(op); + trace.event("op_apply_finish"); + apply_manager.op_apply_finish(op); + + utime_t end = ceph_clock_now(); + logger->tinc(l_filestore_queue_transaction_latency_avg, end - start); + return r; +} + +void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk) +{ + dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl; + + o->trace.event("writeahead journal finished"); + + // this should queue in order because the journal does it's completions in order. + queue_op(osr, o); + + list<Context*> to_queue; + osr->dequeue_journal(&to_queue); + + // do ondisk completions async, to prevent any onreadable_sync completions + // getting blocked behind an ondisk completion. 
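// --- illustrative sketch, not part of FileStore.cc ---------------------
// The completions below are handed to ondisk_finishers[osr->id % N].  The
// point of the modulo sharding is that callbacks for one sequencer always
// land on the same finisher (so they complete in order) while different
// sequencers can complete in parallel.  In miniature, with std::function
// standing in for Ceph's Context and a plain queue for the Finisher:
#include <cstdint>
#include <functional>
#include <queue>
#include <vector>

struct demo_finisher {
  std::queue<std::function<void()>> q;  // drained by a dedicated worker thread
  void queue(std::function<void()> fn) { q.push(std::move(fn)); }
};

static void demo_queue_ondisk(std::vector<demo_finisher>& finishers,
                              uint64_t sequencer_id,
                              std::function<void()> on_disk)
{
  // same sequencer -> same finisher -> per-sequencer completion ordering
  finishers[sequencer_id % finishers.size()].queue(std::move(on_disk));
}
// --- end sketch ---------------------------------------------------------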
+ if (ondisk) { + dout(10) << " queueing ondisk " << ondisk << dendl; + ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk); + } + if (!to_queue.empty()) { + ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue); + } +} + +int FileStore::_do_transactions( + vector<Transaction> &tls, + uint64_t op_seq, + ThreadPool::TPHandle *handle, + const char *osr_name) +{ + int trans_num = 0; + + for (vector<Transaction>::iterator p = tls.begin(); + p != tls.end(); + ++p, trans_num++) { + _do_transaction(*p, op_seq, trans_num, handle, osr_name); + if (handle) + handle->reset_tp_timeout(); + } + + return 0; +} + +void FileStore::_set_global_replay_guard(const coll_t& cid, + const SequencerPosition &spos) +{ + if (backend->can_checkpoint()) + return; + + // sync all previous operations on this sequencer + int ret = object_map->sync(); + if (ret < 0) { + derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl; + ceph_abort_msg("_set_global_replay_guard failed"); + } + ret = sync_filesystem(basedir_fd); + if (ret < 0) { + derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl; + ceph_abort_msg("_set_global_replay_guard failed"); + } + + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + ceph_abort_msg("_set_global_replay_guard failed"); + } + + _inject_failure(); + + // then record that we did it + bufferlist v; + encode(spos, v); + int r = chain_fsetxattr<true, true>( + fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length()); + if (r < 0) { + derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR + << " got " << cpp_strerror(r) << dendl; + ceph_abort_msg("fsetxattr failed"); + } + + // and make sure our xattr is durable. + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + _inject_failure(); + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + dout(10) << __FUNC__ << ": " << spos << " done" << dendl; +} + +int FileStore::_check_global_replay_guard(const coll_t& cid, + const SequencerPosition& spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + dout(10) << __FUNC__ << ": " << cid << " dne" << dendl; + return 1; // if collection does not exist, there is no guard, and we can replay. + } + + char buf[100]; + int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf)); + if (r < 0) { + dout(20) << __FUNC__ << ": no xattr" << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return 1; // no xattr + } + bufferlist bl; + bl.append(buf, r); + + SequencerPosition opos; + auto p = bl.cbegin(); + decode(opos, p); + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return spos >= opos ? 
1 : -1; +} + + +void FileStore::_set_replay_guard(const coll_t& cid, + const SequencerPosition &spos, + bool in_progress=false) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + ceph_abort_msg("_set_replay_guard failed"); + } + _set_replay_guard(fd, spos, 0, in_progress); + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + + +void FileStore::_set_replay_guard(int fd, + const SequencerPosition& spos, + const ghobject_t *hoid, + bool in_progress) +{ + if (backend->can_checkpoint()) + return; + + dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl; + + _inject_failure(); + + // first make sure the previous operation commits + int r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + if (!in_progress) { + // sync object_map too. even if this object has a header or keys, + // it have had them in the past and then removed them, so always + // sync. + object_map->sync(hoid, &spos); + } + + _inject_failure(); + + // then record that we did it + bufferlist v(40); + encode(spos, v); + encode(in_progress, v); + r = chain_fsetxattr<true, true>( + fd, REPLAY_GUARD_XATTR, v.c_str(), v.length()); + if (r < 0) { + derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; + ceph_abort_msg("fsetxattr failed"); + } + + // and make sure our xattr is durable. + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + _inject_failure(); + + dout(10) << __FUNC__ << ": " << spos << " done" << dendl; +} + +void FileStore::_close_replay_guard(const coll_t& cid, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + ceph_abort_msg("_close_replay_guard failed"); + } + _close_replay_guard(fd, spos); + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + +void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos, + const ghobject_t *hoid) +{ + if (backend->can_checkpoint()) + return; + + dout(10) << __FUNC__ << ": " << spos << dendl; + + _inject_failure(); + + // sync object_map too. even if this object has a header or keys, + // it have had them in the past and then removed them, so always + // sync. + object_map->sync(hoid, &spos); + + // then record that we are done with this operation + bufferlist v(40); + encode(spos, v); + bool in_progress = false; + encode(in_progress, v); + int r = chain_fsetxattr<true, true>( + fd, REPLAY_GUARD_XATTR, v.c_str(), v.length()); + if (r < 0) { + derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; + ceph_abort_msg("fsetxattr failed"); + } + + // and make sure our xattr is durable. 
+ r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + _inject_failure(); + + dout(10) << __FUNC__ << ": " << spos << " done" << dendl; +} + +int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid, + const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + int r = _check_global_replay_guard(cid, spos); + if (r < 0) + return r; + + FDRef fd; + r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl; + return 1; // if file does not exist, there is no guard, and we can replay. + } + int ret = _check_replay_guard(**fd, spos); + lfn_close(fd); + return ret; +} + +int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + dout(10) << __FUNC__ << ": " << cid << " dne" << dendl; + return 1; // if collection does not exist, there is no guard, and we can replay. + } + int ret = _check_replay_guard(fd, spos); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; +} + +int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + char buf[100]; + int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf)); + if (r < 0) { + dout(20) << __FUNC__ << ": no xattr" << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return 1; // no xattr + } + bufferlist bl; + bl.append(buf, r); + + SequencerPosition opos; + auto p = bl.cbegin(); + decode(opos, p); + bool in_progress = false; + if (!p.end()) // older journals don't have this + decode(in_progress, p); + if (opos > spos) { + dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos + << ", now or in future, SKIPPING REPLAY" << dendl; + return -1; + } else if (opos == spos) { + if (in_progress) { + dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos + << ", in_progress=true, CONDITIONAL REPLAY" << dendl; + return 0; + } else { + dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos + << ", in_progress=false, SKIPPING REPLAY" << dendl; + return -1; + } + } else { + dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos + << ", in past, will replay" << dendl; + return 1; + } +} + +void FileStore::_do_transaction( + Transaction& t, uint64_t op_seq, int trans_num, + ThreadPool::TPHandle *handle, + const char *osr_name) +{ + dout(10) << __FUNC__ << ": on " << &t << dendl; + + Transaction::iterator i = t.begin(); + + SequencerPosition spos(op_seq, trans_num, 0); + while (i.have_op()) { + if (handle) + handle->reset_tp_timeout(); + + Transaction::Op *op = i.decode_op(); + int r = 0; + + _inject_failure(); + + switch (op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_TOUCH: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + tracepoint(objectstore, touch_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _touch(cid, oid); + tracepoint(objectstore, touch_exit, r); + } + break; + + case Transaction::OP_WRITE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, write_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _write(cid, oid, off, len, bl, fadvise_flags); + tracepoint(objectstore, write_exit, r); + } + break; + + case Transaction::OP_ZERO: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, zero_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _zero(cid, oid, off, len); + tracepoint(objectstore, zero_exit, r); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + tracepoint(objectstore, truncate_enter, osr_name, off); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _truncate(cid, oid, off); + tracepoint(objectstore, truncate_exit, r); + } + break; + + case Transaction::OP_REMOVE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, remove_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _remove(cid, oid, spos); + tracepoint(objectstore, remove_exit, r); + } + break; + + case Transaction::OP_SETATTR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, setattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) { + map<string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set, spos); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid + << " name " << name << " size " << bl.length() << dendl; + } + tracepoint(objectstore, setattr_exit, r); + } + break; + + case Transaction::OP_SETATTRS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + map<string, bufferptr> aset; + i.decode_attrset(aset); + tracepoint(objectstore, setattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _setattrs(cid, oid, aset, spos); + tracepoint(objectstore, setattrs_exit, r); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; + } + break; + + case Transaction::OP_RMATTR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string name = i.decode_string(); + tracepoint(objectstore, rmattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattr(cid, oid, name.c_str(), spos); + tracepoint(objectstore, rmattr_exit, r); + } + break; + + case Transaction::OP_RMATTRS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, rmattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattrs(cid, oid, spos); + tracepoint(objectstore, rmattrs_exit, r); + } + break; + + case Transaction::OP_CLONE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const ghobject_t &noid = i.get_oid(op->dest_oid); + tracepoint(objectstore, clone_enter, osr_name); + r = _clone(cid, oid, noid, spos); + tracepoint(objectstore, clone_exit, r); + } + break; + + case Transaction::OP_CLONERANGE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, clone_range_enter, osr_name, len); + r = _clone_range(cid, oid, ncid, noid, off, len, off, spos); + tracepoint(objectstore, clone_range_exit, r); + } + break; + + case Transaction::OP_CLONERANGE2: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ? 
+ _cid : _cid.get_temp(); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + tracepoint(objectstore, clone_range2_enter, osr_name, len); + r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos); + tracepoint(objectstore, clone_range2_exit, r); + } + break; + + case Transaction::OP_MKCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + tracepoint(objectstore, mkcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _create_collection(cid, op->split_bits, spos); + tracepoint(objectstore, mkcoll_exit, r); + } + break; + + case Transaction::OP_COLL_SET_BITS: + { + const coll_t &cid = i.get_cid(op->cid); + int bits = op->split_bits; + r = _collection_set_bits(cid, bits); + } + break; + + case Transaction::OP_COLL_HINT: + { + const coll_t &cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + if (_check_replay_guard(cid, spos) > 0) { + r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos); + } + } else { + // Ignore the hint + dout(10) << "Unrecognized collection hint type: " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + tracepoint(objectstore, rmcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _destroy_collection(cid); + tracepoint(objectstore, rmcoll_exit, r); + } + break; + + case Transaction::OP_COLL_ADD: + { + const coll_t &ocid = i.get_cid(op->cid); + const coll_t &ncid = i.get_cid(op->dest_cid); + const ghobject_t &oid = i.get_oid(op->oid); + + ceph_assert(oid.hobj.pool >= -1); + + // always followed by OP_COLL_REMOVE + Transaction::Op *op2 = i.decode_op(); + const coll_t &ocid2 = i.get_cid(op2->cid); + const ghobject_t &oid2 = i.get_oid(op2->oid); + ceph_assert(op2->op == Transaction::OP_COLL_REMOVE); + ceph_assert(ocid2 == ocid); + ceph_assert(oid2 == oid); + + tracepoint(objectstore, coll_add_enter); + r = _collection_add(ncid, ocid, oid, spos); + tracepoint(objectstore, coll_add_exit, r); + spos.op++; + if (r < 0) + break; + tracepoint(objectstore, coll_remove_enter, osr_name); + if (_check_replay_guard(ocid, oid, spos) > 0) + r = _remove(ocid, oid, spos); + tracepoint(objectstore, coll_remove_exit, r); + } + break; + + case Transaction::OP_COLL_MOVE: + { + // WARNING: this is deprecated and buggy; only here to replay old journals. + const coll_t &ocid = i.get_cid(op->cid); + const coll_t &ncid = i.get_cid(op->dest_cid); + const ghobject_t &oid = i.get_oid(op->oid); + tracepoint(objectstore, coll_move_enter); + r = _collection_add(ocid, ncid, oid, spos); + if (r == 0 && + (_check_replay_guard(ocid, oid, spos) > 0)) + r = _remove(ocid, oid, spos); + tracepoint(objectstore, coll_move_exit, r); + } + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + const coll_t &_oldcid = i.get_cid(op->cid); + const ghobject_t &oldoid = i.get_oid(op->oid); + const coll_t &_newcid = i.get_cid(op->dest_cid); + const ghobject_t &newoid = i.get_oid(op->dest_oid); + const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ? + _oldcid : _oldcid.get_temp(); + const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ? 
+ _oldcid : _newcid.get_temp(); + tracepoint(objectstore, coll_move_rename_enter); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos); + tracepoint(objectstore, coll_move_rename_exit, r); + } + break; + + case Transaction::OP_TRY_RENAME: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oldoid = i.get_oid(op->oid); + const ghobject_t &newoid = i.get_oid(op->dest_oid); + const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ? + _cid : _cid.get_temp(); + const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, coll_try_rename_enter); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true); + tracepoint(objectstore, coll_try_rename_exit, r); + } + break; + + case Transaction::OP_COLL_SETATTR: + case Transaction::OP_COLL_RMATTR: + ceph_abort_msg("collection attr methods no longer implemented"); + break; + + case Transaction::OP_COLL_RENAME: + { + r = -EOPNOTSUPP; + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, omap_clear_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_clear(cid, oid, spos); + tracepoint(objectstore, omap_clear_exit, r); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + map<string, bufferlist> aset; + i.decode_attrset(aset); + tracepoint(objectstore, omap_setkeys_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_setkeys(cid, oid, aset, spos); + tracepoint(objectstore, omap_setkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + set<string> keys; + i.decode_keyset(keys); + tracepoint(objectstore, omap_rmkeys_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_rmkeys(cid, oid, keys, spos); + tracepoint(objectstore, omap_rmkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + tracepoint(objectstore, omap_rmkeyrange_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_rmkeyrange(cid, oid, first, last, spos); + tracepoint(objectstore, omap_rmkeyrange_exit, r); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, omap_setheader_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _omap_setheader(cid, oid, bl, spos); + tracepoint(objectstore, omap_setheader_exit, r); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + { + ceph_abort_msg("not legacy journal; upgrade to firefly first"); + } + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + tracepoint(objectstore, split_coll2_enter, osr_name); + r = _split_collection(cid, bits, rem, dest, spos); + tracepoint(objectstore, split_coll2_exit, r); + } + break; + + case Transaction::OP_MERGE_COLLECTION: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + coll_t dest = i.get_cid(op->dest_cid); + tracepoint(objectstore, merge_coll_enter, osr_name); + r = _merge_collection(cid, bits, dest, spos); + tracepoint(objectstore, merge_coll_exit, r); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + tracepoint(objectstore, setallochint_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _set_alloc_hint(cid, oid, expected_object_size, + expected_write_size); + tracepoint(objectstore, setallochint_exit, r); + } + break; + + default: + derr << "bad op " << op->op << dendl; + ceph_abort(); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD || + op->op == Transaction::OP_SETATTR || + op->op == Transaction::OP_SETATTRS || + op->op == Transaction::OP_RMATTR || + op->op == Transaction::OP_OMAP_SETKEYS || + op->op == Transaction::OP_OMAP_RMKEYS || + op->op == Transaction::OP_OMAP_RMKEYRANGE || + op->op == Transaction::OP_OMAP_SETHEADER)) + // -ENOENT is normally okay + // ...including on a replayed OP_RMCOLL with checkpoint mode + ok = true; + if (r == -ENODATA) + ok = true; + + if (op->op == Transaction::OP_SETALLOCHINT) + // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most + // cases means invalid hint size (e.g. too big, not a multiple + // of block size, etc) or, at least on xfs, an attempt to set + // or change it when the file is not empty. However, + // OP_SETALLOCHINT is advisory, so ignore all errors. 
+ ok = true; + + if (replaying && !backend->can_checkpoint()) { + if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -ERANGE) { + dout(10) << "tolerating ERANGE on replay" << dendl; + ok = true; + } + if (r == -ENOENT) { + dout(10) << "tolerating ENOENT on replay" << dendl; + ok = true; + } + } + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) { + msg = "ENOENT on clone suggests osd bug"; + } else if (r == -ENOSPC) { + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. + msg = "ENOSPC from disk filesystem, misconfigured cluster"; + } else if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } else if (r == -EPERM) { + msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption"; + } + + derr << " error " << cpp_strerror(r) << " not handled on operation " << op + << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + + if (r == -EMFILE) { + dump_open_fds(cct); + } + + ceph_abort_msg("unexpected error"); + } + } + + spos.op++; + } + + _inject_failure(); +} + + /*********************************************/ + + + +// -------------------- +// objects + +bool FileStore::exists(CollectionHandle& ch, const ghobject_t& oid) +{ + tracepoint(objectstore, exists_enter, ch->cid.c_str()); + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + struct stat st; + bool retval = stat(ch, oid, &st) == 0; + tracepoint(objectstore, exists_exit, retval); + return retval; +} + +int FileStore::stat( + CollectionHandle& ch, const ghobject_t& oid, struct stat *st, bool allow_eio) +{ + tracepoint(objectstore, stat_enter, ch->cid.c_str()); + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? 
ch->cid : ch->cid.get_temp(); + int r = lfn_stat(cid, oid, st); + ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + if (r < 0) { + dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid + << " = " << r << dendl; + } else { + dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid + << " = " << r + << " (size " << st->st_size << ")" << dendl; + } + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, stat_exit, r); + return r; + } +} + +int FileStore::set_collection_opts( + CollectionHandle& ch, + const pool_opts_t& opts) +{ + return -EOPNOTSUPP; +} + +int FileStore::read( + CollectionHandle& ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags) +{ + int got; + tracepoint(objectstore, read_enter, ch->cid.c_str(), offset, len); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp(); + + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: " + << cpp_strerror(r) << dendl; + return r; + } + + if (offset == 0 && len == 0) { + struct stat st; + memset(&st, 0, sizeof(struct stat)); + int r = ::fstat(**fd, &st); + ceph_assert(r == 0); + len = st.st_size; + } + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM) + posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM); + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) + posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL); +#endif + + bufferptr bptr(len); // prealloc space for entire read + got = safe_pread(**fd, bptr.c_str(), len, offset); + if (got < 0) { + dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl; + lfn_close(fd); + return got; + } + bptr.set_length(got); // properly size the buffer + bl.clear(); + bl.push_back(std::move(bptr)); // put it in the target bufferlist + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED); + if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)) + posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL); +#endif + + if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) { + ostringstream ss; + int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss); + if (errors != 0) { + dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" + << got << " ... 
BAD CRC:\n" << ss.str() << dendl; + ceph_abort_msg("bad crc on read"); + } + } + + lfn_close(fd); + + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" + << got << "/" << len << dendl; + if (cct->_conf->filestore_debug_inject_read_err && + debug_data_eio(oid)) { + return -EIO; + } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */ + cct->_conf->filestore_debug_random_read_err && + (rand() % (int)(cct->_conf->filestore_debug_random_read_err * + 100.0)) == 0) { + dout(0) << __func__ << ": inject random EIO" << dendl; + return -EIO; + } else { + tracepoint(objectstore, read_exit, got); + return got; + } +} + +int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len, + map<uint64_t, uint64_t> *m) +{ + uint64_t i; + struct fiemap_extent *extent = nullptr; + struct fiemap *fiemap = nullptr; + int r = 0; + +more: + r = backend->do_fiemap(fd, offset, len, &fiemap); + if (r < 0) + return r; + + if (fiemap->fm_mapped_extents == 0) { + free(fiemap); + return r; + } + + extent = &fiemap->fm_extents[0]; + + /* start where we were asked to start */ + if (extent->fe_logical < offset) { + extent->fe_length -= offset - extent->fe_logical; + extent->fe_logical = offset; + } + + i = 0; + + struct fiemap_extent *last = nullptr; + while (i < fiemap->fm_mapped_extents) { + struct fiemap_extent *next = extent + 1; + + dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents + << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl; + + /* try to merge extents */ + while ((i < fiemap->fm_mapped_extents - 1) && + (extent->fe_logical + extent->fe_length == next->fe_logical)) { + next->fe_length += extent->fe_length; + next->fe_logical = extent->fe_logical; + extent = next; + next = extent + 1; + i++; + } + + if (extent->fe_logical + extent->fe_length > offset + len) + extent->fe_length = offset + len - extent->fe_logical; + (*m)[extent->fe_logical] = extent->fe_length; + i++; + last = extent++; + } + uint64_t xoffset = last->fe_logical + last->fe_length - offset; + offset = last->fe_logical + last->fe_length; + len -= xoffset; + const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0); + free(fiemap); + if (!is_last) { + goto more; + } + + return r; +} + +int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len, + map<uint64_t, uint64_t> *m) +{ +#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) + off_t hole_pos, data_pos; + int r = 0; + + // If lseek fails with errno setting to be ENXIO, this means the current + // file offset is beyond the end of the file. 
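// --- illustrative sketch, not part of FileStore.cc ---------------------
// The loop below walks the file with SEEK_DATA/SEEK_HOLE.  A stand-alone,
// Linux-only version of the same idea, printing each data extent of a file
// (no Ceph types involved; error handling reduced to "stop on ENXIO/error"):
#include <cerrno>
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>

static int demo_list_data_extents(const char *path)
{
#if defined(SEEK_DATA) && defined(SEEK_HOLE)
  int fd = ::open(path, O_RDONLY);
  if (fd < 0)
    return -errno;
  off_t pos = 0;
  for (;;) {
    off_t data = ::lseek(fd, pos, SEEK_DATA);
    if (data < 0)
      break;                          // ENXIO: no data at or after pos
    off_t hole = ::lseek(fd, data, SEEK_HOLE);
    if (hole < 0)
      break;
    printf("data extent: offset=%lld len=%lld\n",
           (long long)data, (long long)(hole - data));
    pos = hole;                       // continue the scan after this extent
  }
  ::close(fd);
  return 0;
#else
  (void)path;
  return -EOPNOTSUPP;                 // caller falls back to "all data"
#endif
}
// --- end sketch ---------------------------------------------------------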
+ off_t start = offset; + while(start < (off_t)(offset + len)) { + data_pos = lseek(fd, start, SEEK_DATA); + if (data_pos < 0) { + if (errno == ENXIO) + break; + else { + r = -errno; + dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; + return r; + } + } else if (data_pos > (off_t)(offset + len)) { + break; + } + + hole_pos = lseek(fd, data_pos, SEEK_HOLE); + if (hole_pos < 0) { + if (errno == ENXIO) { + break; + } else { + r = -errno; + dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (hole_pos >= (off_t)(offset + len)) { + (*m)[data_pos] = offset + len - data_pos; + break; + } + (*m)[data_pos] = hole_pos - data_pos; + start = hole_pos; + } + + return r; +#else + (*m)[offset] = len; + return 0; +#endif +} + +int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid, + uint64_t offset, size_t len, + bufferlist& bl) +{ + map<uint64_t, uint64_t> exomap; + int r = fiemap(ch, oid, offset, len, exomap); + if (r >= 0) { + encode(exomap, bl); + } + return r; +} + +int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid, + uint64_t offset, size_t len, + map<uint64_t, uint64_t>& destmap) +{ + tracepoint(objectstore, fiemap_enter, ch->cid.c_str(), offset, len); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp(); + destmap.clear(); + + if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) || + len <= (size_t)m_filestore_fiemap_threshold) { + destmap[offset] = len; + return 0; + } + + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl; + goto done; + } + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_seek_hole_data(**fd, offset, len, &destmap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_fiemap(**fd, offset, len, &destmap); + } + + lfn_close(fd); + +done: + + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + tracepoint(objectstore, fiemap_exit, r); + return r; +} + +int FileStore::_remove(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + int r = lfn_unlink(cid, oid, spos); + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl; + int r = lfn_truncate(cid, oid, size); + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << r << dendl; + return r; +} + + +int FileStore::_touch(const coll_t& cid, const ghobject_t& oid) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + FDRef fd; + int r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + return r; + } else { + lfn_close(fd); + } + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_write(const coll_t& cid, 
const ghobject_t& oid, + uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int r; + + FDRef fd; + r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + dout(0) << __FUNC__ << ": couldn't open " << cid << "/" + << oid << ": " + << cpp_strerror(r) << dendl; + goto out; + } + + // write + r = bl.write_fd(**fd, offset); + if (r < 0) { + derr << __FUNC__ << ": write_fd on " << cid << "/" << oid + << " error: " << cpp_strerror(r) << dendl; + lfn_close(fd); + goto out; + } + r = bl.length(); + + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_write(**fd, offset, len, bl); + ceph_assert(rc >= 0); + } + + if (replaying || m_disable_wbthrottle) { + if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) { +#ifdef HAVE_POSIX_FADVISE + posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + } + } else { + wbthrottle.queue_wb(fd, oid, offset, len, + fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + } + + lfn_close(fd); + + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl; + return r; +} + +int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int ret = 0; + + if (cct->_conf->filestore_punch_hole) { +#ifdef CEPH_HAVE_FALLOCATE +# if !defined(__APPLE__) && !defined(__FreeBSD__) +# ifdef FALLOC_FL_KEEP_SIZE + // first try to punch a hole. + FDRef fd; + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) { + goto out; + } + + struct stat st; + ret = ::fstat(**fd, &st); + if (ret < 0) { + ret = -errno; + lfn_close(fd); + goto out; + } + + // first try fallocate + ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + offset, len); + if (ret < 0) { + ret = -errno; + } else { + // ensure we extend file size, if needed + if (len > 0 && offset + len > (uint64_t)st.st_size) { + ret = ::ftruncate(**fd, offset + len); + if (ret < 0) { + ret = -errno; + lfn_close(fd); + goto out; + } + } + } + lfn_close(fd); + + if (ret >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_zero(**fd, offset, len); + ceph_assert(rc >= 0); + } + + if (ret == 0) + goto out; // yay! + if (ret != -EOPNOTSUPP) + goto out; // some other error +# endif +# endif +#endif + } + + // lame, kernel is old and doesn't support it. + // write zeros.. yuck! 
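+ // We only reach this point when hole punching is disabled by config,
+ // unavailable at build time, or fallocate() returned EOPNOTSUPP.  The
+ // fallback below builds a zero-filled bufferlist of 'len' bytes and pushes
+ // it through the normal _write() path, so the range reads back as zeros but
+ // real blocks are consumed instead of being deallocated.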
+ dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl; + { + bufferlist bl; + bl.append_zero(len); + ret = _write(cid, oid, offset, len, bl); + } + +#ifdef CEPH_HAVE_FALLOCATE +# if !defined(__APPLE__) && !defined(__FreeBSD__) +# ifdef FALLOC_FL_KEEP_SIZE + out: +# endif +# endif +#endif + dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl; + return ret; +} + +int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl; + + if (_check_replay_guard(cid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + { + Index index; + r = lfn_open(cid, oldoid, false, &o, &index); + if (r < 0) { + goto out2; + } + ceph_assert(index.index); + RWLock::WLocker l((index.index)->access_lock); + + r = lfn_open(cid, newoid, true, &n, &index); + if (r < 0) { + goto out; + } + r = ::ftruncate(**n, 0); + if (r < 0) { + r = -errno; + goto out3; + } + struct stat st; + r = ::fstat(**o, &st); + if (r < 0) { + r = -errno; + goto out3; + } + + r = _do_clone_range(**o, **n, 0, st.st_size, 0); + if (r < 0) { + goto out3; + } + + dout(20) << "objectmap clone" << dendl; + r = object_map->clone(oldoid, newoid, &spos); + if (r < 0 && r != -ENOENT) + goto out3; + } + + { + char buf[2]; + map<string, bufferptr> aset; + r = _fgetattrs(**o, aset); + if (r < 0) + goto out3; + + r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT)); + } else { + r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT)); + } + if (r < 0) + goto out3; + + r = _fsetattrs(**n, aset); + if (r < 0) + goto out3; + } + + // clone is non-idempotent; record our work. 
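+ // Stamp the destination object with the current SequencerPosition (kept in
+ // an xattr) so that if we crash and the journal replays this transaction,
+ // the guard check at the top of this function sees the clone already
+ // completed and skips the non-idempotent work.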
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; +} + +int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl; + return backend->clone_range(from, to, srcoff, len, dstoff); +} + +int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + map<uint64_t, uint64_t> exomap; + // fiemap doesn't allow zero length + if (len == 0) + return 0; + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl; + r = _do_seek_hole_data(from, srcoff, len, &exomap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl; + r = _do_fiemap(from, srcoff, len, &exomap); + } + + + int64_t written = 0; + if (r < 0) + goto out; + + for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) { + uint64_t it_off = miter->first - srcoff + dstoff; + r = _do_copy_range(from, to, miter->first, miter->second, it_off, true); + if (r < 0) { + derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second + << " to " << it_off << ", " << cpp_strerror(r) << dendl; + break; + } + written += miter->second; + } + + if (r >= 0) { + if (m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + ceph_assert(rc >= 0); + } + struct stat st; + r = ::fstat(to, &st); + if (r < 0) { + r = -errno; + derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl; + goto out; + } + if (st.st_size < (int)(dstoff + len)) { + r = ::ftruncate(to, dstoff + len); + if (r < 0) { + r = -errno; + derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl; + goto out; + } + } + r = written; + } + + out: + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc) +{ + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + loff_t pos = srcoff; + loff_t end = srcoff + len; + int buflen = 4096 * 16; //limit by pipe max size.see fcntl + +#ifdef CEPH_HAVE_SPLICE + if (backend->has_splice()) { + int pipefd[2]; + if (pipe_cloexec(pipefd) < 0) { + int e = errno; + derr << " pipe " << " got " << cpp_strerror(e) << dendl; + return -e; + } + + loff_t dstpos = dstoff; + while (pos < end) { + int l = std::min<int>(end-pos, buflen); + r = safe_splice(from, &pos, pipefd[1], nullptr, l, SPLICE_F_NONBLOCK); + dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + if (r == 0) { + // hrm, bad source range, wtf. 
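+ // A zero-length splice read means the source object ended before the
+ // requested range was satisfied.  Report -ERANGE here; during journal
+ // replay a short source is tolerated at the bottom of this function.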
+ r = -ERANGE; + derr << __FUNC__ << ": got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + + r = safe_splice(pipefd[0], nullptr, to, &dstpos, r, 0); + dout(10) << " safe_splice write to " << to << " len " << r + << " got " << r << dendl; + if (r < 0) { + derr << __FUNC__ << ": write error at " << pos << "~" + << r << ", " << cpp_strerror(r) << dendl; + break; + } + } + close(pipefd[0]); + close(pipefd[1]); + } else +#endif + { + int64_t actual; + + actual = ::lseek64(from, srcoff, SEEK_SET); + if (actual != (int64_t)srcoff) { + if (actual < 0) + r = -errno; + else + r = -EINVAL; + derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl; + return r; + } + actual = ::lseek64(to, dstoff, SEEK_SET); + if (actual != (int64_t)dstoff) { + if (actual < 0) + r = -errno; + else + r = -EINVAL; + derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl; + return r; + } + + char buf[buflen]; + while (pos < end) { + int l = std::min<int>(end-pos, buflen); + r = ::read(from, buf, l); + dout(25) << " read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + if (errno == EINTR) { + continue; + } else { + r = -errno; + derr << __FUNC__ << ": read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + } + if (r == 0) { + // hrm, bad source range, wtf. + r = -ERANGE; + derr << __FUNC__ << ": got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + int op = 0; + while (op < r) { + int r2 = safe_write(to, buf+op, r-op); + dout(25) << " write to " << to << " len " << (r-op) + << " got " << r2 << dendl; + if (r2 < 0) { + r = r2; + derr << __FUNC__ << ": write error at " << pos << "~" + << r-op << ", " << cpp_strerror(r) << dendl; + + break; + } + op += (r-op); + } + if (r < 0) + break; + pos += r; + } + } + + if (r < 0 && replaying) { + ceph_assert(r == -ERANGE); + derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl; + r = len; + } + ceph_assert(replaying || pos == end); + if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + ceph_assert(rc >= 0); + } + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl; + + if (_check_replay_guard(newcid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + r = lfn_open(oldcid, oldoid, false, &o); + if (r < 0) { + goto out2; + } + r = lfn_open(newcid, newoid, true, &n); + if (r < 0) { + goto out; + } + r = _do_clone_range(**o, **n, srcoff, len, dstoff); + if (r < 0) { + goto out3; + } + + // clone is non-idempotent; record our work. 
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " + << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +class SyncEntryTimeout : public Context { +public: + CephContext* cct; + explicit SyncEntryTimeout(CephContext* cct, int commit_timeo) + : cct(cct), m_commit_timeo(commit_timeo) + { + } + + void finish(int r) override { + BackTrace *bt = new BackTrace(1); + generic_dout(-1) << "FileStore: sync_entry timed out after " + << m_commit_timeo << " seconds.\n"; + bt->print(*_dout); + *_dout << dendl; + delete bt; + bt = nullptr; + ceph_abort(); + } +private: + int m_commit_timeo; +}; + +void FileStore::sync_entry() +{ + lock.Lock(); + while (!stop) { + utime_t max_interval; + max_interval.set_from_double(m_filestore_max_sync_interval); + utime_t min_interval; + min_interval.set_from_double(m_filestore_min_sync_interval); + + utime_t startwait = ceph_clock_now(); + if (!force_sync) { + dout(20) << __FUNC__ << ": waiting for max_interval " << max_interval << dendl; + sync_cond.WaitInterval(lock, max_interval); + } else { + dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl; + } + + if (force_sync) { + dout(20) << __FUNC__ << ": force_sync set" << dendl; + force_sync = false; + } else if (stop) { + dout(20) << __FUNC__ << ": stop set" << dendl; + break; + } else { + // wait for at least the min interval + utime_t woke = ceph_clock_now(); + woke -= startwait; + dout(20) << __FUNC__ << ": woke after " << woke << dendl; + if (woke < min_interval) { + utime_t t = min_interval; + t -= woke; + dout(20) << __FUNC__ << ": waiting for another " << t + << " to reach min interval " << min_interval << dendl; + sync_cond.WaitInterval(lock, t); + } + } + + list<Context*> fin; + again: + fin.swap(sync_waiters); + lock.Unlock(); + + op_tp.pause(); + if (apply_manager.commit_start()) { + utime_t start = ceph_clock_now(); + uint64_t cp = apply_manager.get_committing_seq(); + + sync_entry_timeo_lock.Lock(); + SyncEntryTimeout *sync_entry_timeo = + new SyncEntryTimeout(cct, m_filestore_commit_timeout); + if (!timer.add_event_after(m_filestore_commit_timeout, + sync_entry_timeo)) { + sync_entry_timeo = nullptr; + } + sync_entry_timeo_lock.Unlock(); + + logger->set(l_filestore_committing, 1); + + dout(15) << __FUNC__ << ": committing " << cp << dendl; + stringstream errstream; + if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) { + derr << errstream.str() << dendl; + ceph_abort(); + } + + if (backend->can_checkpoint()) { + int err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + ceph_abort_msg("error during write_op_seq"); + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + uint64_t cid = 0; + err = backend->create_checkpoint(s, &cid); + if (err < 0) { + int err = errno; + derr << "snap create '" << s << "' got error " << err << dendl; + ceph_assert(err == 0); + } + + snaps.push_back(cp); + apply_manager.commit_started(); + op_tp.unpause(); + + if (cid > 0) { + dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl; + err = backend->sync_checkpoint(cid); + if (err < 0) { + derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; + ceph_abort_msg("wait_sync got error"); + } + dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl; + } + } else 
{ + apply_manager.commit_started(); + op_tp.unpause(); + + int err = object_map->sync(); + if (err < 0) { + derr << "object_map sync got " << cpp_strerror(err) << dendl; + ceph_abort_msg("object_map sync returned error"); + } + + err = backend->syncfs(); + if (err < 0) { + derr << "syncfs got " << cpp_strerror(err) << dendl; + ceph_abort_msg("syncfs returned error"); + } + + err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + ceph_abort_msg("error during write_op_seq"); + } + err = ::fsync(op_fd); + if (err < 0) { + derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl; + ceph_abort_msg("error during fsync of op_seq"); + } + } + + utime_t done = ceph_clock_now(); + utime_t lat = done - start; + utime_t dur = done - startwait; + dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl; + utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat); + if (max_pause_lat < dur - lat) { + logger->tinc(l_filestore_sync_pause_max_lat, dur - lat); + } + + logger->inc(l_filestore_commitcycle); + logger->tinc(l_filestore_commitcycle_latency, lat); + logger->tinc(l_filestore_commitcycle_interval, dur); + + apply_manager.commit_finish(); + if (!m_disable_wbthrottle) { + wbthrottle.clear(); + } + + logger->set(l_filestore_committing, 0); + + // remove old snaps? + if (backend->can_checkpoint()) { + char s[NAME_MAX]; + while (snaps.size() > 2) { + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front()); + snaps.pop_front(); + dout(10) << "removing snap '" << s << "'" << dendl; + int r = backend->destroy_checkpoint(s); + if (r) { + int err = errno; + derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl; + } + } + } + + dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl; + + if (sync_entry_timeo) { + Mutex::Locker lock(sync_entry_timeo_lock); + timer.cancel_event(sync_entry_timeo); + } + } else { + op_tp.unpause(); + } + + lock.Lock(); + finish_contexts(cct, fin, 0); + fin.clear(); + if (!sync_waiters.empty()) { + dout(10) << __FUNC__ << ": more waiters, committing again" << dendl; + goto again; + } + if (!stop && journal && journal->should_commit_now()) { + dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl; + goto again; + } + } + stop = false; + lock.Unlock(); +} + +void FileStore::do_force_sync() +{ + dout(10) << __FUNC__ << dendl; + Mutex::Locker l(lock); + force_sync = true; + sync_cond.Signal(); +} + +void FileStore::start_sync(Context *onsafe) +{ + Mutex::Locker l(lock); + sync_waiters.push_back(onsafe); + sync_cond.Signal(); + force_sync = true; + dout(10) << __FUNC__ << dendl; +} + +void FileStore::sync() +{ + Mutex l("FileStore::sync"); + Cond c; + bool done; + C_SafeCond *fin = new C_SafeCond(&l, &c, &done); + + start_sync(fin); + + l.Lock(); + while (!done) { + dout(10) << "sync waiting" << dendl; + c.Wait(l); + } + l.Unlock(); + dout(10) << "sync done" << dendl; +} + +void FileStore::_flush_op_queue() +{ + dout(10) << __FUNC__ << ": draining op tp" << dendl; + op_wq.drain(); + dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl; + for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } +} + +/* + * flush - make every queued write readable + */ +void FileStore::flush() +{ + dout(10) << __FUNC__ << dendl; + + if (cct->_conf->filestore_blackhole) { + // wait forever + Mutex 
lock("FileStore::flush::lock"); + Cond cond; + lock.Lock(); + while (true) + cond.Wait(lock); + ceph_abort(); + } + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl; + for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } + } + + _flush_op_queue(); + dout(10) << __FUNC__ << ": complete" << dendl; +} + +/* + * sync_and_flush - make every queued write readable AND committed to disk + */ +void FileStore::sync_and_flush() +{ + dout(10) << __FUNC__ << dendl; + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + _flush_op_queue(); + } else { + // includes m_filestore_journal_parallel + _flush_op_queue(); + sync(); + } + dout(10) << __FUNC__ << ": done" << dendl; +} + +int FileStore::flush_journal() +{ + dout(10) << __FUNC__ << dendl; + sync_and_flush(); + sync(); + return 0; +} + +int FileStore::snapshot(const string& name) +{ + dout(10) << __FUNC__ << ": " << name << dendl; + sync_and_flush(); + + if (!backend->can_checkpoint()) { + dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl; + return -EOPNOTSUPP; + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str()); + + int r = backend->create_checkpoint(s, nullptr); + if (r) { + derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(r) << dendl; + } + + return r; +} + +// ------------------------------- +// attributes + +int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp) +{ + char val[CHAIN_XATTR_MAX_BLOCK_LEN]; + int l = chain_fgetxattr(fd, name, val, sizeof(val)); + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), val, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, name, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, name, bp.c_str(), l); + } + } + ceph_assert(!m_filestore_fail_eio || l != -EIO); + return l; +} + +int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset) +{ + // get attr list + char names1[100]; + int len = chain_flistxattr(fd, names1, sizeof(names1)-1); + char *names2 = 0; + char *name = 0; + if (len == -ERANGE) { + len = chain_flistxattr(fd, 0, 0); + if (len < 0) { + ceph_assert(!m_filestore_fail_eio || len != -EIO); + return len; + } + dout(10) << " -ERANGE, len is " << len << dendl; + names2 = new char[len+1]; + len = chain_flistxattr(fd, names2, len); + dout(10) << " -ERANGE, got " << len << dendl; + if (len < 0) { + ceph_assert(!m_filestore_fail_eio || len != -EIO); + delete[] names2; + return len; + } + name = names2; + } else if (len < 0) { + ceph_assert(!m_filestore_fail_eio || len != -EIO); + return len; + } else { + name = names1; + } + name[len] = 0; + + char *end = name + len; + while (name < end) { + char *attrname = name; + if (parse_attrname(&name)) { + if (*name) { + dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl; + int r = _fgetattr(fd, attrname, aset[name]); + if (r < 0) { + delete[] names2; + return r; + } + } + } + name += strlen(name) + 1; + } + + delete[] names2; + return 0; +} + +int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset) +{ + for (map<string, bufferptr>::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + const char *val; + if (p->second.length()) + val = p->second.c_str(); + else + val = ""; + // ??? 
Why do we skip setting all the other attrs if one fails? + int r = chain_fsetxattr(fd, n, val, p->second.length()); + if (r < 0) { + derr << __FUNC__ << ": chain_setxattr returned " << r << dendl; + return r; + } + } + return 0; +} + +// debug EIO injection +void FileStore::inject_data_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": init error on " << oid << dendl; + data_error_set.insert(oid); +} +void FileStore::inject_mdata_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": init error on " << oid << dendl; + mdata_error_set.insert(oid); +} + +void FileStore::debug_obj_on_delete(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": clear error on " << oid << dendl; + data_error_set.erase(oid); + mdata_error_set.erase(oid); +} +bool FileStore::debug_data_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (data_error_set.count(oid)) { + dout(10) << __FUNC__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} +bool FileStore::debug_mdata_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (mdata_error_set.count(oid)) { + dout(10) << __FUNC__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} + + +// objects + +int FileStore::getattr(CollectionHandle& ch, const ghobject_t& oid, const char *name, bufferptr &bp) +{ + tracepoint(objectstore, getattr_enter, ch->cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = _fgetattr(**fd, n, bp); + lfn_close(fd); + if (r == -ENODATA) { + map<string, bufferlist> got; + set<string> to_get; + to_get.insert(string(name)); + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out; + } + r = object_map->get_xattrs(oid, to_get, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl; + goto out; + } + if (got.empty()) { + dout(10) << __FUNC__ << ": got.size() is 0" << dendl; + return -ENODATA; + } + bp = bufferptr(got.begin()->second.c_str(), + got.begin()->second.length()); + r = bp.length(); + } + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattr_exit, r); + return r < 0 ? r : 0; + } +} + +int FileStore::getattrs(CollectionHandle& ch, const ghobject_t& oid, map<string,bufferptr>& aset) +{ + tracepoint(objectstore, getattrs_enter, ch->cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? 
ch->cid : ch->cid.get_temp(); + set<string> omap_attrs; + map<string, bufferlist> omap_aset; + Index index; + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + + FDRef fd; + bool spill_out = true; + char buf[2]; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = false; + + r = _fgetattrs(**fd, aset); + lfn_close(fd); + fd = FDRef(); // defensive + if (r < 0) { + goto out; + } + + if (!spill_out) { + dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl; + goto out; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + goto out; + } + + r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + goto out; + } + if (r == -ENOENT) + r = 0; + } + ceph_assert(omap_attrs.size() == omap_aset.size()); + for (map<string, bufferlist>::iterator i = omap_aset.begin(); + i != omap_aset.end(); + ++i) { + string key(i->first); + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); + } + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattrs_exit, r); + return r; + } +} + +int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset, + const SequencerPosition &spos) +{ + map<string, bufferlist> omap_set; + set<string> omap_remove; + map<string, bufferptr> inline_set; + map<string, bufferptr> inline_to_set; + FDRef fd; + int spill_out = -1; + bool incomplete_inline = false; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = 0; + else + spill_out = 1; + + r = _fgetattrs(**fd, inline_set); + incomplete_inline = (r == -E2BIG); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + dout(15) << __FUNC__ << ": " << cid << "/" << oid + << (incomplete_inline ? 
" (incomplete_inline, forcing omap)" : "") + << dendl; + + for (map<string,bufferptr>::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + + if (incomplete_inline) { + chain_fremovexattr(**fd, n); // ignore any error + omap_set[p->first].push_back(p->second); + continue; + } + + if (p->second.length() > m_filestore_max_inline_xattr_size) { + if (inline_set.count(p->first)) { + inline_set.erase(p->first); + r = chain_fremovexattr(**fd, n); + if (r < 0) + goto out_close; + } + omap_set[p->first].push_back(p->second); + continue; + } + + if (!inline_set.count(p->first) && + inline_set.size() >= m_filestore_max_inline_xattrs) { + omap_set[p->first].push_back(p->second); + continue; + } + omap_remove.insert(p->first); + inline_set.insert(*p); + + inline_to_set.insert(*p); + } + + if (spill_out != 1 && !omap_set.empty()) { + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT)); + } + + r = _fsetattrs(**fd, inline_to_set); + if (r < 0) + goto out_close; + + if (spill_out && !omap_remove.empty()) { + r = object_map->remove_xattrs(oid, omap_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove_xattrs r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } else { + r = 0; // don't confuse the debug output + } + } + + if (!omap_set.empty()) { + r = object_map->set_xattrs(oid, omap_set, &spos); + if (r < 0) { + dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + +int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl; + FDRef fd; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r == -ENODATA && spill_out) { + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out_close; + } + set<string> to_remove; + to_remove.insert(string(name)); + r = object_map->remove_xattrs(oid, to_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + return r; +} + +int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + map<string,bufferptr> aset; + FDRef fd; + set<string> omap_attrs; + Index index; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && 
!strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + r = _fgetattrs(**fd, aset); + if (r >= 0) { + for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r < 0) { + dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl; + goto out_close; + } + } + } + + if (!spill_out) { + dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl; + goto out_close; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out_close; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + goto out_close; + } + r = object_map->remove_xattrs(oid, omap_attrs, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl; + goto out_close; + } + if (r == -ENOENT) + r = 0; + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT)); + } + + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + + + +int FileStore::_collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos) +{ + struct stat st; + int r = collection_stat(cid, &st); + if (r < 0) { + if (r == -ENOENT) + return 0; + return r; + } + + vector<ghobject_t> objects; + ghobject_t max; + while (!max.is_max()) { + r = collection_list(cid, max, ghobject_t::get_max(), + 300, &objects, &max); + if (r < 0) + return r; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + ceph_assert(_check_replay_guard(cid, *i, spos)); + r = _remove(cid, *i, spos); + if (r < 0) + return r; + } + objects.clear(); + } + return _destroy_collection(cid); +} + +// -------------------------- +// collections + +int FileStore::list_collections(vector<coll_t>& ls) +{ + return list_collections(ls, false); +} + +int FileStore::list_collections(vector<coll_t>& ls, bool include_temp) +{ + tracepoint(objectstore, list_collections_enter); + dout(10) << __FUNC__ << dendl; + + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/current", basedir.c_str()); + + int r = 0; + DIR *dir = ::opendir(fn); + if (!dir) { + r = -errno; + derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + + struct dirent *de = nullptr; + while ((de = ::readdir(dir))) { + if (de->d_type == DT_UNKNOWN) { + // d_type not supported (non-ext[234], btrfs), must stat + struct stat sb; + char filename[PATH_MAX]; + if (int n = snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name); + n >= static_cast<int>(sizeof(filename))) { + derr << __func__ << " path length overrun: " << n << dendl; + ceph_abort(); + } + + r = ::stat(filename, &sb); + if (r < 0) { + r = -errno; + derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + break; + } + if (!S_ISDIR(sb.st_mode)) { + continue; + } + } else if (de->d_type != DT_DIR) { + continue; + } + if (strcmp(de->d_name, "omap") == 0) { + continue; + } + if (de->d_name[0] == '.' && + (de->d_name[1] == '\0' || + (de->d_name[1] == '.' 
&& + de->d_name[2] == '\0'))) + continue; + coll_t cid; + if (!cid.parse(de->d_name)) { + derr << "ignoring invalid collection '" << de->d_name << "'" << dendl; + continue; + } + if (!cid.is_temp() || include_temp) + ls.push_back(cid); + } + + if (r > 0) { + derr << "trying readdir " << fn << ": " << cpp_strerror(r) << dendl; + r = -r; + } + + ::closedir(dir); + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + tracepoint(objectstore, list_collections_exit, r); + return r; +} + +int FileStore::collection_stat(const coll_t& c, struct stat *st) +{ + tracepoint(objectstore, collection_stat_enter, c.c_str()); + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r = ::stat(fn, st); + if (r < 0) + r = -errno; + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + tracepoint(objectstore, collection_stat_exit, r); + return r; +} + +bool FileStore::collection_exists(const coll_t& c) +{ + tracepoint(objectstore, collection_exists_enter, c.c_str()); + struct stat st; + bool ret = collection_stat(c, &st) == 0; + tracepoint(objectstore, collection_exists_exit, ret); + return ret; +} + +int FileStore::collection_empty(const coll_t& cid, bool *empty) +{ + tracepoint(objectstore, collection_empty_enter, cid.c_str()); + dout(15) << __FUNC__ << ": " << cid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) { + derr << __FUNC__ << ": get_index returned: " << cpp_strerror(r) + << dendl; + return r; + } + + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + + vector<ghobject_t> ls; + r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), + 1, &ls, nullptr); + if (r < 0) { + derr << __FUNC__ << ": collection_list_partial returned: " + << cpp_strerror(r) << dendl; + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + *empty = ls.empty(); + tracepoint(objectstore, collection_empty_exit, *empty); + return 0; +} + +int FileStore::_collection_set_bits(const coll_t& c, int bits) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(10) << __FUNC__ << ": " << fn << " " << bits << dendl; + char n[PATH_MAX]; + int r; + int32_t v = bits; + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto out; + } + get_attrname("bits", n, PATH_MAX); + r = chain_fsetxattr(fd, n, (char*)&v, sizeof(v)); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << __FUNC__ << ": " << fn << " " << bits << " = " << r << dendl; + return r; +} + +int FileStore::collection_bits(CollectionHandle& ch) +{ + char fn[PATH_MAX]; + get_cdir(ch->cid, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r; + char n[PATH_MAX]; + int32_t bits; + int fd = ::open(fn, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + bits = r = -errno; + goto out; + } + get_attrname("bits", n, PATH_MAX); + r = chain_fgetxattr(fd, n, (char*)&bits, sizeof(bits)); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + if (r < 0) { + bits = r; + goto out; + } + out: + dout(10) << __FUNC__ << ": " << fn << " = " << bits << dendl; + return bits; +} + +int FileStore::collection_list(const coll_t& c, + const ghobject_t& orig_start, + const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) +{ + ghobject_t start = orig_start; + if (start.is_max()) + return 0; + + ghobject_t temp_next; + if (!next) + next = &temp_next; + // figure out the pool id. we need this in order to generate a + // meaningful 'next' value. 
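+ // HashIndex cannot fill in the pool when it builds the 'next' cursor, so
+ // reconstruct pool and shard from the collection itself: a temp collection
+ // for pool P encodes as pool = -2 - P (e.g. pool 3 -> -5), the meta
+ // collection uses -1, and an ordinary PG collection uses its own pool id;
+ // the shard comes from the pgid.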
+ int64_t pool = -1; + shard_id_t shard; + { + spg_t pgid; + if (c.is_temp(&pgid)) { + pool = -2 - pgid.pool(); + shard = pgid.shard; + } else if (c.is_pg(&pgid)) { + pool = pgid.pool(); + shard = pgid.shard; + } else if (c.is_meta()) { + pool = -1; + shard = shard_id_t::NO_SHARD; + } else { + // hrm, the caller is test code! we should get kill it off. for now, + // tolerate it. + pool = 0; + shard = shard_id_t::NO_SHARD; + } + dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard + << " pgid " << pgid << dendl; + } + ghobject_t sep; + sep.hobj.pool = -1; + sep.set_shard(shard); + if (!c.is_temp() && !c.is_meta()) { + if (start < sep) { + dout(10) << __FUNC__ << ": first checking temp pool" << dendl; + coll_t temp = c.get_temp(); + int r = collection_list(temp, start, end, max, ls, next); + if (r < 0) + return r; + if (*next != ghobject_t::get_max()) + return r; + start = sep; + dout(10) << __FUNC__ << ": fall through to non-temp collection, start " + << start << dendl; + } else { + dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl; + } + } + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = index->collection_list_partial(start, end, max, ls, next); + + if (r < 0) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + dout(20) << "objects: " << *ls << dendl; + + // HashIndex doesn't know the pool when constructing a 'next' value + if (!next->is_max()) { + next->hobj.pool = pool; + next->set_shard(shard); + dout(20) << " next " << *next << dendl; + } + + return 0; +} + +int FileStore::omap_get(CollectionHandle& ch, const ghobject_t &hoid, + bufferlist *header, + map<string, bufferlist> *out) +{ + tracepoint(objectstore, omap_get_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get(hoid, header, out); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + tracepoint(objectstore, omap_get_exit, 0); + return 0; +} + +int FileStore::omap_get_header( + CollectionHandle& ch, + const ghobject_t &hoid, + bufferlist *bl, + bool allow_eio) +{ + tracepoint(objectstore, omap_get_header_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? 
ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_header(hoid, bl); + if (r < 0 && r != -ENOENT) { + ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_header_exit, 0); + return 0; +} + +int FileStore::omap_get_keys(CollectionHandle& ch, const ghobject_t &hoid, set<string> *keys) +{ + tracepoint(objectstore, omap_get_keys_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_keys(hoid, keys); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + tracepoint(objectstore, omap_get_keys_exit, 0); + return 0; +} + +int FileStore::omap_get_values(CollectionHandle& ch, const ghobject_t &hoid, + const set<string> &keys, + map<string, bufferlist> *out) +{ + tracepoint(objectstore, omap_get_values_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + const char *where = "()"; + int r = get_index(c, &index); + if (r < 0) { + where = " (get_index)"; + goto out; + } + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + where = " (lfn_find)"; + goto out; + } + } + r = object_map->get_values(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + where = " (get_values)"; + goto out; + } + r = 0; + out: + tracepoint(objectstore, omap_get_values_exit, r); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r + << where << dendl; + return r; +} + +int FileStore::omap_check_keys(CollectionHandle& ch, const ghobject_t &hoid, + const set<string> &keys, + set<string> *out) +{ + tracepoint(objectstore, omap_check_keys_enter, ch->cid.c_str()); + const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? 
ch->cid : ch->cid.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(hoid); + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->check_keys(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + if (r == -EIO && m_filestore_fail_eio) handle_eio(); + return r; + } + tracepoint(objectstore, omap_check_keys_exit, 0); + return 0; +} + +ObjectMap::ObjectMapIterator FileStore::get_omap_iterator( + CollectionHandle& ch, + const ghobject_t &oid) +{ + auto osr = static_cast<OpSequencer*>(ch.get()); + osr->wait_for_apply(oid); + return get_omap_iterator(ch->cid, oid); +} + +ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c, + const ghobject_t &hoid) +{ + tracepoint(objectstore, get_omap_iterator, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 " + << "(get_index failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 " + << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + } + return object_map->get_iterator(hoid); +} + +int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": collection: " << c << " pg number: " + << pg_num << " expected number of objects: " << expected_num_objs << dendl; + + bool empty; + int ret = collection_empty(c, &empty); + if (ret < 0) + return ret; + if (!empty && !replaying) { + dout(0) << "Failed to give an expected number of objects hint to collection : " + << c << ", only empty collection can take such type of hint. 
" << dendl; + return 0; + } + + Index index; + ret = get_index(c, &index); + if (ret < 0) + return ret; + // Pre-hash the collection + ret = index->pre_hash_collection(pg_num, expected_num_objs); + dout(10) << "pre_hash_collection " << c << " = " << ret << dendl; + if (ret < 0) + return ret; + _set_replay_guard(c, spos); + + return 0; +} + +int FileStore::_create_collection( + const coll_t& c, + int bits, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r == -EEXIST && replaying) + r = 0; + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + + if (r < 0) + return r; + r = init_index(c); + if (r < 0) + return r; + r = _collection_set_bits(c, bits); + if (r < 0) + return r; + // create parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + r = _create_collection(temp, 0, spos); + if (r < 0) + return r; + } + + _set_replay_guard(c, spos); + return 0; +} + +int FileStore::_destroy_collection(const coll_t& c) +{ + int r = 0; + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + { + Index from; + r = get_index(c, &from); + if (r < 0) + goto out; + ceph_assert(from.index); + RWLock::WLocker l((from.index)->access_lock); + + r = from->prep_delete(); + if (r < 0) + goto out; + } + r = ::rmdir(fn); + if (r < 0) { + r = -errno; + goto out; + } + + out: + // destroy parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + int r2 = _destroy_collection(temp); + if (r2 < 0) { + r = r2; + goto out_final; + } + } + + out_final: + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + return r; +} + + +int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << dendl; + + int dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + return 0; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + int srccmp = _check_replay_guard(oldcid, o, spos); + if (srccmp < 0) + return 0; + + // open guard on object so we don't any previous operations on the + // new name that will modify the source inode. + FDRef fd; + int r = lfn_open(oldcid, o, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. 
+ ceph_assert(replaying); + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << o << " (dne, continue replay) " << dendl; + return 0; + } + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, o, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + _inject_failure(); + + // close guard on object so we don't do this again + if (r == 0) { + _close_replay_guard(**fd, spos); + } + lfn_close(fd); + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl; + return r; +} + +int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos, + bool allow_enoent) +{ + dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl; + int r = 0; + int dstcmp, srccmp; + + if (replaying) { + /* If the destination collection doesn't exist during replay, + * we need to delete the src object and continue on + */ + if (!collection_exists(c)) + goto out_rm_src; + } + + dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + goto out_rm_src; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + srccmp = _check_replay_guard(oldcid, oldoid, spos); + if (srccmp < 0) + return 0; + + { + // open guard on object so we don't any previous operations on the + // new name that will modify the source inode. + FDRef fd; + r = lfn_open(oldcid, oldoid, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + if (replaying) { + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl; + } else if (allow_enoent) { + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << oldoid << " (dne, ignoring enoent)" + << dendl; + } else { + ceph_abort_msg("ERROR: source must exist"); + } + + if (!replaying) { + return 0; + } + if (allow_enoent && dstcmp > 0) { // if dstcmp == 0, try_rename was started. 
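+ // dstcmp > 0 means the destination carries no replay guard at or beyond
+ // this op's position, i.e. the rename never started under the new name, so
+ // with allow_enoent a missing source can safely be reported as success.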
+ return 0; + } + + r = 0; // don't know if object_map was cloned + } else { + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, oldoid, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + lfn_close(fd); + fd = FDRef(); + + _inject_failure(); + } + + if (r == 0) { + // the name changed; link the omap content + r = object_map->rename(oldoid, o, &spos); + if (r == -ENOENT) + r = 0; + } + + _inject_failure(); + + if (r == 0) + r = lfn_unlink(oldcid, oldoid, spos, true); + + if (r == 0) + r = lfn_open(c, o, 0, &fd); + + // close guard on object so we don't do this again + if (r == 0) { + _close_replay_guard(**fd, spos, &o); + lfn_close(fd); + } + } + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; + + out_rm_src: + // remove source + if (_check_replay_guard(oldcid, oldoid, spos) > 0) { + r = lfn_unlink(oldcid, oldoid, spos, true); + } + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; +} + +void FileStore::_inject_failure() +{ + if (m_filestore_kill_at) { + int final = --m_filestore_kill_at; + dout(5) << __FUNC__ << ": " << (final+1) << " -> " << final << dendl; + if (final == 0) { + derr << __FUNC__ << ": KILLING" << dendl; + cct->_log->flush(); + _exit(1); + } + } +} + +int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->clear_keys_header(hoid, &spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid, + const map<string, bufferlist> &aset, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) { + dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(r) << dendl; + return r; + } + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(r) << dendl; + return r; + } + } +skip: + if (g_conf()->subsys.should_gather<ceph_subsys_filestore, 20>()) { + for (auto& p : aset) { + dout(20) << __FUNC__ << ": set " << p.first << dendl; + } + } + r = object_map->set_keys(hoid, aset, &spos); + dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << r << dendl; + return r; +} + +int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid, + const set<string> &keys, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } +skip: + r = object_map->rm_keys(hoid, keys, 
&spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid, + const string& first, const string& last, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl; + set<string> keys; + { + ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); + if (!iter) + return -ENOENT; + for (iter->lower_bound(first); iter->valid() && iter->key() < last; + iter->next()) { + keys.insert(iter->key()); + } + } + return _omap_rmkeys(cid, hoid, keys, spos); +} + +int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid, + const bufferlist &bl, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + ceph_assert(index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + return object_map->set_header(hoid, bl, &spos); +} + +int FileStore::_merge_collection(const coll_t& cid, + uint32_t bits, + coll_t dest, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << " " << dest + << " bits " << bits << dendl; + int r = 0; + + if (!collection_exists(cid)) { + dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + if (!collection_exists(dest)) { + dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + + // set bits + if (_check_replay_guard(cid, spos) > 0) + _collection_set_bits(dest, bits); + + spg_t pgid; + bool is_pg = dest.is_pg(&pgid); + ceph_assert(is_pg); + + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_global_replay_guard(cid, spos); + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + // main collection + { + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) { + ceph_assert(from.index); + RWLock::WLocker l1((from.index)->access_lock); + + ceph_assert(to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->merge(bits, to.index); + } + } + + // temp too + { + Index from; + r = get_index(cid.get_temp(), &from); + + Index to; + if (!r) + r = get_index(dest.get_temp(), &to); + + if (!r) { + ceph_assert(from.index); + RWLock::WLocker l1((from.index)->access_lock); + + ceph_assert(to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->merge(bits, to.index); + } + } + + // remove source + _destroy_collection(cid); + + _close_replay_guard(dest, spos); + _close_replay_guard(dest.get_temp(), spos); + // no need to close guards on cid... it's removed. 
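+ // Optional debug pass (filestore_debug_verify_split): list the merged
+ // destination in chunks and assert that every object hashes into this PG
+ // under the new bit count.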
+ + if (!r && cct->_conf->filestore_debug_verify_split) { + vector<ghobject_t> objects; + ghobject_t next; + while (1) { + collection_list( + dest, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (!i->match(bits, pgid.pgid.ps())) { + dout(20) << __FUNC__ << ": " << *i << " does not belong in " + << cid << dendl; + ceph_assert(i->match(bits, pgid.pgid.ps())); + } + } + objects.clear(); + } + } + + dout(15) << __FUNC__ << ": " << cid << " " << dest << " bits " << bits + << " = " << r << dendl; + return r; +} + +int FileStore::_split_collection(const coll_t& cid, + uint32_t bits, + uint32_t rem, + coll_t dest, + const SequencerPosition &spos) +{ + int r; + { + dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl; + if (!collection_exists(cid)) { + dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + if (!collection_exists(dest)) { + dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl; + ceph_assert(replaying); + return 0; + } + + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_global_replay_guard(cid, spos); + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) { + ceph_assert(from.index); + RWLock::WLocker l1((from.index)->access_lock); + + ceph_assert(to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->split(rem, bits, to.index); + } + + _close_replay_guard(cid, spos); + _close_replay_guard(dest, spos); + } + _collection_set_bits(cid, bits); + if (!r && cct->_conf->filestore_debug_verify_split) { + vector<ghobject_t> objects; + ghobject_t next; + while (1) { + collection_list( + cid, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __FUNC__ << ": " << *i << " still in source " + << cid << dendl; + ceph_assert(!i->match(bits, rem)); + } + objects.clear(); + } + next = ghobject_t(); + while (1) { + collection_list( + dest, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __FUNC__ << ": " << *i << " now in dest " + << *i << dendl; + ceph_assert(i->match(bits, rem)); + } + objects.clear(); + } + } + return r; +} + +int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl; + + FDRef fd; + int ret = 0; + + if (expected_object_size == 0 || expected_write_size == 0) + goto out; + + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) + goto out; + + { + // TODO: a more elaborate hint calculation + uint64_t hint = std::min<uint64_t>(expected_write_size, m_filestore_max_alloc_hint_size); + + ret = backend->set_alloc_hint(**fd, hint); + dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl; + } + + lfn_close(fd); +out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid 
<< " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl; + ceph_assert(!m_filestore_fail_eio || ret != -EIO); + return ret; +} + +const char** FileStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "filestore_max_inline_xattr_size", + "filestore_max_inline_xattr_size_xfs", + "filestore_max_inline_xattr_size_btrfs", + "filestore_max_inline_xattr_size_other", + "filestore_max_inline_xattrs", + "filestore_max_inline_xattrs_xfs", + "filestore_max_inline_xattrs_btrfs", + "filestore_max_inline_xattrs_other", + "filestore_max_xattr_value_size", + "filestore_max_xattr_value_size_xfs", + "filestore_max_xattr_value_size_btrfs", + "filestore_max_xattr_value_size_other", + "filestore_min_sync_interval", + "filestore_max_sync_interval", + "filestore_queue_max_ops", + "filestore_queue_max_bytes", + "filestore_expected_throughput_bytes", + "filestore_expected_throughput_ops", + "filestore_queue_low_threshhold", + "filestore_queue_high_threshhold", + "filestore_queue_high_delay_multiple", + "filestore_queue_max_delay_multiple", + "filestore_commit_timeout", + "filestore_dump_file", + "filestore_kill_at", + "filestore_fail_eio", + "filestore_fadvise", + "filestore_sloppy_crc", + "filestore_sloppy_crc_block_size", + "filestore_max_alloc_hint_size", + NULL + }; + return KEYS; +} + +void FileStore::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("filestore_max_inline_xattr_size") || + changed.count("filestore_max_inline_xattr_size_xfs") || + changed.count("filestore_max_inline_xattr_size_btrfs") || + changed.count("filestore_max_inline_xattr_size_other") || + changed.count("filestore_max_inline_xattrs") || + changed.count("filestore_max_inline_xattrs_xfs") || + changed.count("filestore_max_inline_xattrs_btrfs") || + changed.count("filestore_max_inline_xattrs_other") || + changed.count("filestore_max_xattr_value_size") || + changed.count("filestore_max_xattr_value_size_xfs") || + changed.count("filestore_max_xattr_value_size_btrfs") || + changed.count("filestore_max_xattr_value_size_other")) { + if (backend) { + Mutex::Locker l(lock); + set_xattr_limits_via_conf(); + } + } + + if (changed.count("filestore_queue_max_bytes") || + changed.count("filestore_queue_max_ops") || + changed.count("filestore_expected_throughput_bytes") || + changed.count("filestore_expected_throughput_ops") || + changed.count("filestore_queue_low_threshhold") || + changed.count("filestore_queue_high_threshhold") || + changed.count("filestore_queue_high_delay_multiple") || + changed.count("filestore_queue_max_delay_multiple")) { + Mutex::Locker l(lock); + set_throttle_params(); + } + + if (changed.count("filestore_min_sync_interval") || + changed.count("filestore_max_sync_interval") || + changed.count("filestore_kill_at") || + changed.count("filestore_fail_eio") || + changed.count("filestore_sloppy_crc") || + changed.count("filestore_sloppy_crc_block_size") || + changed.count("filestore_max_alloc_hint_size") || + changed.count("filestore_fadvise")) { + Mutex::Locker l(lock); + m_filestore_min_sync_interval = conf->filestore_min_sync_interval; + m_filestore_max_sync_interval = conf->filestore_max_sync_interval; + m_filestore_kill_at = conf->filestore_kill_at; + m_filestore_fail_eio = conf->filestore_fail_eio; + m_filestore_fadvise = conf->filestore_fadvise; + m_filestore_sloppy_crc = conf->filestore_sloppy_crc; + m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size; + 
m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size; + } + if (changed.count("filestore_commit_timeout")) { + Mutex::Locker l(sync_entry_timeo_lock); + m_filestore_commit_timeout = conf->filestore_commit_timeout; + } + if (changed.count("filestore_dump_file")) { + if (conf->filestore_dump_file.length() && + conf->filestore_dump_file != "-") { + dump_start(conf->filestore_dump_file); + } else { + dump_stop(); + } + } +} + +int FileStore::set_throttle_params() +{ + stringstream ss; + bool valid = throttle_bytes.set_params( + cct->_conf->filestore_queue_low_threshhold, + cct->_conf->filestore_queue_high_threshhold, + cct->_conf->filestore_expected_throughput_bytes, + cct->_conf->filestore_queue_high_delay_multiple? + cct->_conf->filestore_queue_high_delay_multiple: + cct->_conf->filestore_queue_high_delay_multiple_bytes, + cct->_conf->filestore_queue_max_delay_multiple? + cct->_conf->filestore_queue_max_delay_multiple: + cct->_conf->filestore_queue_max_delay_multiple_bytes, + cct->_conf->filestore_queue_max_bytes, + &ss); + + valid &= throttle_ops.set_params( + cct->_conf->filestore_queue_low_threshhold, + cct->_conf->filestore_queue_high_threshhold, + cct->_conf->filestore_expected_throughput_ops, + cct->_conf->filestore_queue_high_delay_multiple? + cct->_conf->filestore_queue_high_delay_multiple: + cct->_conf->filestore_queue_high_delay_multiple_ops, + cct->_conf->filestore_queue_max_delay_multiple? + cct->_conf->filestore_queue_max_delay_multiple: + cct->_conf->filestore_queue_max_delay_multiple_ops, + cct->_conf->filestore_queue_max_ops, + &ss); + + logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max()); + logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max()); + + if (!valid) { + derr << "tried to set invalid params: " + << ss.str() + << dendl; + } + return valid ? 
0 : -EINVAL; +} + +void FileStore::dump_start(const std::string& file) +{ + dout(10) << __FUNC__ << ": " << file << dendl; + if (m_filestore_do_dump) { + dump_stop(); + } + m_filestore_dump_fmt.reset(); + m_filestore_dump_fmt.open_array_section("dump"); + m_filestore_dump.open(file.c_str()); + m_filestore_do_dump = true; +} + +void FileStore::dump_stop() +{ + dout(10) << __FUNC__ << dendl; + m_filestore_do_dump = false; + if (m_filestore_dump.is_open()) { + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); + m_filestore_dump.close(); + } +} + +void FileStore::dump_transactions(vector<ObjectStore::Transaction>& ls, uint64_t seq, OpSequencer *osr) +{ + m_filestore_dump_fmt.open_array_section("transactions"); + unsigned trans_num = 0; + for (vector<ObjectStore::Transaction>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { + m_filestore_dump_fmt.open_object_section("transaction"); + m_filestore_dump_fmt.dump_stream("osr") << osr->cid; + m_filestore_dump_fmt.dump_unsigned("seq", seq); + m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num); + (*i).dump(&m_filestore_dump_fmt); + m_filestore_dump_fmt.close_section(); + } + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); +} + +void FileStore::get_db_statistics(Formatter* f) +{ + object_map->db->get_statistics(f); +} + +void FileStore::set_xattr_limits_via_conf() +{ + uint32_t fs_xattr_size; + uint32_t fs_xattrs; + uint32_t fs_xattr_max_value_size; + + switch (m_fs_type) { +#if defined(__linux__) + case XFS_SUPER_MAGIC: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_xfs; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_xfs; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_xfs; + break; + case BTRFS_SUPER_MAGIC: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_btrfs; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_btrfs; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_btrfs; + break; +#endif + default: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_other; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_other; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_other; + break; + } + + // Use override value if set + if (cct->_conf->filestore_max_inline_xattr_size) + m_filestore_max_inline_xattr_size = cct->_conf->filestore_max_inline_xattr_size; + else + m_filestore_max_inline_xattr_size = fs_xattr_size; + + // Use override value if set + if (cct->_conf->filestore_max_inline_xattrs) + m_filestore_max_inline_xattrs = cct->_conf->filestore_max_inline_xattrs; + else + m_filestore_max_inline_xattrs = fs_xattrs; + + // Use override value if set + if (cct->_conf->filestore_max_xattr_value_size) + m_filestore_max_xattr_value_size = cct->_conf->filestore_max_xattr_value_size; + else + m_filestore_max_xattr_value_size = fs_xattr_max_value_size; + + if (m_filestore_max_xattr_value_size < cct->_conf->osd_max_object_name_len) { + derr << "WARNING: max attr value size (" + << m_filestore_max_xattr_value_size + << ") is smaller than osd_max_object_name_len (" + << cct->_conf->osd_max_object_name_len + << "). Your backend filesystem appears to not support attrs large " + << "enough to handle the configured max rados name size. 
You may get " + << "unexpected ENAMETOOLONG errors on rados operations or buggy " + << "behavior" + << dendl; + } +} + +uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects) +{ + uint64_t res = num_objects * blk_size / 2; //assumes that each object uses ( in average ) additional 1/2 block due to FS allocation granularity. + return res; +} + +int FileStore::apply_layout_settings(const coll_t &cid, int target_level) +{ + dout(20) << __FUNC__ << ": " << cid << " target level: " + << target_level << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r) + << dendl; + return r; + } + + return index->apply_layout_settings(target_level); +} + + +// -- FSSuperblock -- + +void FSSuperblock::encode(bufferlist &bl) const +{ + ENCODE_START(2, 1, bl); + compat_features.encode(bl); + encode(omap_backend, bl); + ENCODE_FINISH(bl); +} + +void FSSuperblock::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(2, bl); + compat_features.decode(bl); + if (struct_v >= 2) + decode(omap_backend, bl); + else + omap_backend = "leveldb"; + DECODE_FINISH(bl); +} + +void FSSuperblock::dump(Formatter *f) const +{ + f->open_object_section("compat"); + compat_features.dump(f); + f->dump_string("omap_backend", omap_backend); + f->close_section(); +} + +void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o) +{ + FSSuperblock z; + o.push_back(new FSSuperblock(z)); + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + z.compat_features = CompatSet(feature_compat, feature_ro_compat, + feature_incompat); + o.push_back(new FSSuperblock(z)); + z.omap_backend = "rocksdb"; + o.push_back(new FSSuperblock(z)); +} + +#undef dout_prefix +#define dout_prefix *_dout << "filestore.osr(" << this << ") " + +void FileStore::OpSequencer::_register_apply(Op *o) +{ + if (o->registered_apply) { + dout(20) << __func__ << " " << o << " already registered" << dendl; + return; + } + o->registered_apply = true; + for (auto& t : o->tls) { + for (auto& i : t.get_object_index()) { + uint32_t key = i.first.hobj.get_hash(); + applying.emplace(make_pair(key, &i.first)); + dout(20) << __func__ << " " << o << " " << i.first << " (" + << &i.first << ")" << dendl; + } + } +} + +void FileStore::OpSequencer::_unregister_apply(Op *o) +{ + ceph_assert(o->registered_apply); + for (auto& t : o->tls) { + for (auto& i : t.get_object_index()) { + uint32_t key = i.first.hobj.get_hash(); + auto p = applying.find(key); + bool removed = false; + while (p != applying.end() && + p->first == key) { + if (p->second == &i.first) { + dout(20) << __func__ << " " << o << " " << i.first << " (" + << &i.first << ")" << dendl; + applying.erase(p); + removed = true; + break; + } + ++p; + } + ceph_assert(removed); + } + } +} + +void FileStore::OpSequencer::wait_for_apply(const ghobject_t& oid) +{ + Mutex::Locker l(qlock); + uint32_t key = oid.hobj.get_hash(); +retry: + while (true) { + // search all items in hash slot for a matching object + auto p = applying.find(key); + while (p != applying.end() && + p->first == key) { + if (*p->second == oid) { + dout(20) << __func__ << " " << oid << " waiting on " << p->second + << dendl; + cond.Wait(qlock); + goto retry; + } + ++p; + } + break; + } + dout(20) << __func__ << " " << oid << " done" << dendl; +} diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h new file mode 
100644 index 00000000..e09b9e04 --- /dev/null +++ b/src/os/filestore/FileStore.h @@ -0,0 +1,938 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILESTORE_H +#define CEPH_FILESTORE_H + +#include "include/types.h" + +#include <map> +#include <deque> +#include <atomic> +#include <fstream> + + +#include <boost/scoped_ptr.hpp> + +#include "include/unordered_map.h" + +#include "include/ceph_assert.h" + +#include "os/ObjectStore.h" +#include "JournalingObjectStore.h" + +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "common/perf_counters.h" +#include "common/zipkin_trace.h" + +#include "common/Mutex.h" +#include "HashIndex.h" +#include "IndexManager.h" +#include "os/ObjectMap.h" +#include "SequencerPosition.h" +#include "FDCache.h" +#include "WBThrottle.h" + +#include "include/uuid.h" + +#if defined(__linux__) +# ifndef BTRFS_SUPER_MAGIC +#define BTRFS_SUPER_MAGIC 0x9123683EUL +# endif +# ifndef XFS_SUPER_MAGIC +#define XFS_SUPER_MAGIC 0x58465342UL +# endif +# ifndef ZFS_SUPER_MAGIC +#define ZFS_SUPER_MAGIC 0x2fc12fc1UL +# endif +#endif + + +class FileStoreBackend; + +#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") + +enum { + l_filestore_first = 84000, + l_filestore_journal_queue_ops, + l_filestore_journal_queue_bytes, + l_filestore_journal_ops, + l_filestore_journal_bytes, + l_filestore_journal_latency, + l_filestore_journal_wr, + l_filestore_journal_wr_bytes, + l_filestore_journal_full, + l_filestore_committing, + l_filestore_commitcycle, + l_filestore_commitcycle_interval, + l_filestore_commitcycle_latency, + l_filestore_op_queue_max_ops, + l_filestore_op_queue_ops, + l_filestore_ops, + l_filestore_op_queue_max_bytes, + l_filestore_op_queue_bytes, + l_filestore_bytes, + l_filestore_apply_latency, + l_filestore_queue_transaction_latency_avg, + l_filestore_sync_pause_max_lat, + l_filestore_last, +}; + +class FSSuperblock { +public: + CompatSet compat_features; + string omap_backend; + + FSSuperblock() { } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<FSSuperblock*>& o); +}; +WRITE_CLASS_ENCODER(FSSuperblock) + +inline ostream& operator<<(ostream& out, const FSSuperblock& sb) +{ + return out << "sb(" << sb.compat_features << "): " + << sb.omap_backend; +} + +class FileStore : public JournalingObjectStore, + public md_config_obs_t +{ + static const uint32_t target_version = 4; +public: + uint32_t get_target_version() { + return target_version; + } + + static int get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid); + struct FSPerfTracker { + PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns; + PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns; + + objectstore_perf_stat_t get_cur_stats() const { + objectstore_perf_stat_t ret; + ret.os_commit_latency_ns = os_commit_latency_ns.current_avg(); + ret.os_apply_latency_ns = os_apply_latency_ns.current_avg(); + return ret; + } + + void update_from_perfcounters(PerfCounters &logger); + } perf_tracker; + objectstore_perf_stat_t get_cur_stats() override { 
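+ // refresh the cached commit/apply latency averages from the perf counters so the
+ // snapshot returned below reflects current values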
+ perf_tracker.update_from_perfcounters(*logger); + return perf_tracker.get_cur_stats(); + } + const PerfCounters* get_perf_counters() const override { + return logger; + } + +private: + string internal_name; ///< internal name, used to name the perfcounter instance + string basedir, journalpath; + osflagbits_t generic_flags; + std::string current_fn; + std::string current_op_seq_fn; + std::string omap_dir; + uuid_d fsid; + + size_t blk_size; ///< fs block size + + int fsid_fd, op_fd, basedir_fd, current_fd; + + FileStoreBackend *backend; + + void create_backend(unsigned long f_type); + + string devname; + + int vdo_fd = -1; + string vdo_name; + + deque<uint64_t> snaps; + + // Indexed Collections + IndexManager index_manager; + int get_index(const coll_t& c, Index *index); + int init_index(const coll_t& c); + + bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) { + // - normal temp case: cid is pg, object is temp (pool < -1) + // - hammer temp case: cid is pg (or already temp), object pool is -1 + return cid.is_pg() && oid.hobj.pool <= -1; + } + void init_temp_collections(); + + void handle_eio(); + + // ObjectMap + boost::scoped_ptr<ObjectMap> object_map; + + // helper fns + int get_cdir(const coll_t& cid, char *s, int len); + + /// read a uuid from fd + int read_fsid(int fd, uuid_d *uuid); + + /// lock fsid_fd + int lock_fsid(); + + // sync thread + Mutex lock; + bool force_sync; + Cond sync_cond; + + Mutex sync_entry_timeo_lock; + SafeTimer timer; + + list<Context*> sync_waiters; + bool stop; + void sync_entry(); + struct SyncThread : public Thread { + FileStore *fs; + explicit SyncThread(FileStore *f) : fs(f) {} + void *entry() override { + fs->sync_entry(); + return 0; + } + } sync_thread; + + // -- op workqueue -- + struct Op { + utime_t start; + uint64_t op; + vector<Transaction> tls; + Context *onreadable, *onreadable_sync; + uint64_t ops, bytes; + TrackedOpRef osd_op; + ZTracer::Trace trace; + bool registered_apply = false; + }; + class OpSequencer : public CollectionImpl { + CephContext *cct; + Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) + list<Op*> q; + list<uint64_t> jq; + list<pair<uint64_t, Context*> > flush_commit_waiters; + Cond cond; + string osr_name_str; + /// hash of pointers to ghobject_t's for in-flight writes + unordered_multimap<uint32_t,const ghobject_t*> applying; + public: + Mutex apply_lock; // for apply mutual exclusion + int id; + const char *osr_name; + + /// get_max_uncompleted + bool _get_max_uncompleted( + uint64_t *seq ///< [out] max uncompleted seq + ) { + ceph_assert(qlock.is_locked()); + ceph_assert(seq); + *seq = 0; + if (q.empty() && jq.empty()) + return true; + + if (!q.empty()) + *seq = q.back()->op; + if (!jq.empty() && jq.back() > *seq) + *seq = jq.back(); + + return false; + } /// @returns true if both queues are empty + + /// get_min_uncompleted + bool _get_min_uncompleted( + uint64_t *seq ///< [out] min uncompleted seq + ) { + ceph_assert(qlock.is_locked()); + ceph_assert(seq); + *seq = 0; + if (q.empty() && jq.empty()) + return true; + + if (!q.empty()) + *seq = q.front()->op; + if (!jq.empty() && jq.front() < *seq) + *seq = jq.front(); + + return false; + } /// @returns true if both queues are empty + + void _wake_flush_waiters(list<Context*> *to_queue) { + uint64_t seq; + if (_get_min_uncompleted(&seq)) + seq = -1; + + for (list<pair<uint64_t, Context*> >::iterator i = + flush_commit_waiters.begin(); + i != flush_commit_waiters.end() && i->first < seq; + 
flush_commit_waiters.erase(i++)) { + to_queue->push_back(i->second); + } + } + + void queue_journal(Op *o) { + Mutex::Locker l(qlock); + jq.push_back(o->op); + _register_apply(o); + } + void dequeue_journal(list<Context*> *to_queue) { + Mutex::Locker l(qlock); + jq.pop_front(); + cond.Signal(); + _wake_flush_waiters(to_queue); + } + void queue(Op *o) { + Mutex::Locker l(qlock); + q.push_back(o); + _register_apply(o); + o->trace.keyval("queue depth", q.size()); + } + void _register_apply(Op *o); + void _unregister_apply(Op *o); + void wait_for_apply(const ghobject_t& oid); + Op *peek_queue() { + Mutex::Locker l(qlock); + ceph_assert(apply_lock.is_locked()); + return q.front(); + } + + Op *dequeue(list<Context*> *to_queue) { + ceph_assert(to_queue); + ceph_assert(apply_lock.is_locked()); + Mutex::Locker l(qlock); + Op *o = q.front(); + q.pop_front(); + cond.Signal(); + _unregister_apply(o); + _wake_flush_waiters(to_queue); + return o; + } + + void flush() override { + Mutex::Locker l(qlock); + + while (cct->_conf->filestore_blackhole) + cond.Wait(qlock); // wait forever + + + // get max for journal _or_ op queues + uint64_t seq = 0; + if (!q.empty()) + seq = q.back()->op; + if (!jq.empty() && jq.back() > seq) + seq = jq.back(); + + if (seq) { + // everything prior to our watermark to drain through either/both queues + while ((!q.empty() && q.front()->op <= seq) || + (!jq.empty() && jq.front() <= seq)) + cond.Wait(qlock); + } + } + bool flush_commit(Context *c) override { + Mutex::Locker l(qlock); + uint64_t seq = 0; + if (_get_max_uncompleted(&seq)) { + return true; + } else { + flush_commit_waiters.push_back(make_pair(seq, c)); + return false; + } + } + + OpSequencer(CephContext* cct, int i, coll_t cid) + : CollectionImpl(cid), + cct(cct), + qlock("FileStore::OpSequencer::qlock", false, false), + osr_name_str(stringify(cid)), + apply_lock("FileStore::OpSequencer::apply_lock", false, false), + id(i), + osr_name(osr_name_str.c_str()) {} + ~OpSequencer() override { + ceph_assert(q.empty()); + } + }; + typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef; + + Mutex coll_lock; + map<coll_t,OpSequencerRef> coll_map; + + friend ostream& operator<<(ostream& out, const OpSequencer& s); + + FDCache fdcache; + WBThrottle wbthrottle; + + std::atomic<int64_t> next_osr_id = { 0 }; + bool m_disable_wbthrottle; + deque<OpSequencer*> op_queue; + BackoffThrottle throttle_ops, throttle_bytes; + const int m_ondisk_finisher_num; + const int m_apply_finisher_num; + vector<Finisher*> ondisk_finishers; + vector<Finisher*> apply_finishers; + + ThreadPool op_tp; + struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> { + FileStore *store; + OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp) + : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {} + + bool _enqueue(OpSequencer *osr) override { + store->op_queue.push_back(osr); + return true; + } + void _dequeue(OpSequencer *o) override { + ceph_abort(); + } + bool _empty() override { + return store->op_queue.empty(); + } + OpSequencer *_dequeue() override { + if (store->op_queue.empty()) + return nullptr; + OpSequencer *osr = store->op_queue.front(); + store->op_queue.pop_front(); + return osr; + } + void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override { + store->_do_op(osr, handle); + } + void _process_finish(OpSequencer *osr) override { + store->_finish_op(osr); + } + void _clear() override { + ceph_assert(store->op_queue.empty()); + } + } op_wq; + + void 
_do_op(OpSequencer *o, ThreadPool::TPHandle &handle); + void _finish_op(OpSequencer *o); + Op *build_op(vector<Transaction>& tls, + Context *onreadable, Context *onreadable_sync, + TrackedOpRef osd_op); + void queue_op(OpSequencer *osr, Op *o); + void op_queue_reserve_throttle(Op *o); + void op_queue_release_throttle(Op *o); + void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk); + friend struct C_JournaledAhead; + + void new_journal(); + + PerfCounters *logger; + + ZTracer::Endpoint trace_endpoint; + +public: + int lfn_find(const ghobject_t& oid, const Index& index, + IndexedPath *path = nullptr); + int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length); + int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf); + int lfn_open( + const coll_t& cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index = nullptr); + + void lfn_close(FDRef fd); + int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ; + int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos, + bool force_clear_omap=false); + +public: + FileStore(CephContext* cct, const std::string &base, const std::string &jdev, + osflagbits_t flags = 0, + const char *internal_name = "filestore", bool update_to=false); + ~FileStore() override; + + string get_type() override { + return "filestore"; + } + + int _detect_fs(); + int _sanity_check_fs(); + + bool test_mount_in_use() override; + int read_op_seq(uint64_t *seq); + int write_op_seq(int, uint64_t seq); + int mount() override; + int umount() override; + + int validate_hobject_key(const hobject_t &obj) const override; + + unsigned get_max_attr_name_length() override { + // xattr limit is 128; leave room for our prefixes (user.ceph._), + // some margin, and cap at 100 + return 100; + } + int mkfs() override; + int mkjournal() override; + bool wants_journal() override { + return true; + } + bool allows_journal() override { + return true; + } + bool needs_journal() override { + return false; + } + + bool is_sync_onreadable() const override { + return false; + } + + bool is_rotational() override; + bool is_journal_rotational() override; + + void dump_perf_counters(Formatter *f) override { + f->open_object_section("perf_counters"); + logger->dump_formatted(f, false); + f->close_section(); + } + + int flush_cache(ostream *os = NULL) override; + int write_version_stamp(); + int version_stamp_is_valid(uint32_t *version); + int update_version_stamp(); + int upgrade() override; + + bool can_sort_nibblewise() override { + return true; // i support legacy sort order + } + + void collect_metadata(map<string,string> *pm) override; + int get_devices(set<string> *ls) override; + + int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) override; + int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override; + + int _do_transactions( + vector<Transaction> &tls, uint64_t op_seq, + ThreadPool::TPHandle *handle, + const char *osr_name); + int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override { + return _do_transactions(tls, op_seq, nullptr, "replay"); + } + void _do_transaction( + Transaction& t, uint64_t op_seq, int trans_num, + ThreadPool::TPHandle *handle, const char *osr_name); + + CollectionHandle open_collection(const coll_t& c) override; + CollectionHandle create_new_collection(const coll_t& c) override; + void set_collection_commit_queue(const coll_t& cid, + ContextQueue *commit_queue) override { + } + + int 
queue_transactions(CollectionHandle& ch, vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = nullptr) override; + + /** + * set replay guard xattr on given file + * + * This will ensure that we will not replay this (or any previous) operation + * against this particular inode/object. + * + * @param fd open file descriptor for the file/object + * @param spos sequencer position of the last operation we should not replay + */ + void _set_replay_guard(int fd, + const SequencerPosition& spos, + const ghobject_t *oid=0, + bool in_progress=false); + void _set_replay_guard(const coll_t& cid, + const SequencerPosition& spos, + bool in_progress); + void _set_global_replay_guard(const coll_t& cid, + const SequencerPosition &spos); + + /// close a replay guard opened with in_progress=true + void _close_replay_guard(int fd, const SequencerPosition& spos, + const ghobject_t *oid=0); + void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos); + + /** + * check replay guard xattr on given file + * + * Check the current position against any marker on the file that + * indicates which operations have already been applied. If the + * current or a newer operation has been marked as applied, we + * should not replay the current operation again. + * + * If we are not replaying the journal, we already return true. It + * is only on replay that we might return false, indicated that the + * operation should not be performed (again). + * + * @param fd open fd on the file/object in question + * @param spos sequencerposition for an operation we could apply/replay + * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress + */ + int _check_replay_guard(int fd, const SequencerPosition& spos); + int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos); + int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos); + int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos); + + // ------------------ + // objects + int pick_object_revision_lt(ghobject_t& oid) { + return 0; + } + using ObjectStore::exists; + bool exists(CollectionHandle& c, const ghobject_t& oid) override; + using ObjectStore::stat; + int stat( + CollectionHandle& c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) override; + using ObjectStore::set_collection_opts; + int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) override; + using ObjectStore::read; + int read( + CollectionHandle& c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) override; + int _do_fiemap(int fd, uint64_t offset, size_t len, + map<uint64_t, uint64_t> *m); + int _do_seek_hole_data(int fd, uint64_t offset, size_t len, + map<uint64_t, uint64_t> *m); + using ObjectStore::fiemap; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override; + + int _touch(const coll_t& cid, const ghobject_t& oid); + int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags = 0); + int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len); + int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); + int _clone(const coll_t& cid, 
const ghobject_t& oldoid, const ghobject_t& newoid, + const SequencerPosition& spos); + int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos); + int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); + int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); + int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false); + int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos); + + int _fgetattr(int fd, const char *name, bufferptr& bp); + int _fgetattrs(int fd, map<string,bufferptr>& aset); + int _fsetattrs(int fd, map<string, bufferptr> &aset); + + void do_force_sync(); + void start_sync(Context *onsafe); + void sync(); + void _flush_op_queue(); + void flush(); + void sync_and_flush(); + + int flush_journal() override; + int dump_journal(ostream& out) override; + + void set_fsid(uuid_d u) override { + fsid = u; + } + uuid_d get_fsid() override { return fsid; } + + uint64_t estimate_objects_overhead(uint64_t num_objects) override; + + // DEBUG read error injection, an object is removed from both on delete() + Mutex read_error_lock; + set<ghobject_t> data_error_set; // read() will return -EIO + set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO + void inject_data_error(const ghobject_t &oid) override; + void inject_mdata_error(const ghobject_t &oid) override; + + void compact() override { + ceph_assert(object_map); + object_map->compact(); + } + + bool has_builtin_csum() const override { + return false; + } + + void debug_obj_on_delete(const ghobject_t &oid); + bool debug_data_eio(const ghobject_t &oid); + bool debug_mdata_eio(const ghobject_t &oid); + + int snapshot(const string& name) override; + + // attrs + using ObjectStore::getattr; + using ObjectStore::getattrs; + int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr &bp) override; + int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override; + + int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset, + const SequencerPosition &spos); + int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos); + int _rmattrs(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos); + + int _collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos); + + int _collection_set_bits(const coll_t& cid, int bits); + + // collections + using ObjectStore::collection_list; + int collection_bits(CollectionHandle& c) override; + int collection_list(CollectionHandle& c, + const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *next) override { + c->flush(); + return collection_list(c->cid, start, end, max, ls, next); + } + int collection_list(const coll_t& cid, + const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *next); + int list_collections(vector<coll_t>& ls) override; + int list_collections(vector<coll_t>& ls, bool include_temp); + int collection_stat(const coll_t& c, struct stat *st); + bool collection_exists(const coll_t& c) override; + int collection_empty(CollectionHandle& c, bool *empty) override { + c->flush(); + return collection_empty(c->cid, empty); + } + int collection_empty(const 
coll_t& cid, bool *empty); + + // omap (see ObjectStore.h for documentation) + using ObjectStore::omap_get; + int omap_get(CollectionHandle& c, const ghobject_t &oid, bufferlist *header, + map<string, bufferlist> *out) override; + using ObjectStore::omap_get_header; + int omap_get_header( + CollectionHandle& c, + const ghobject_t &oid, + bufferlist *out, + bool allow_eio = false) override; + using ObjectStore::omap_get_keys; + int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, set<string> *keys) override; + using ObjectStore::omap_get_values; + int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys, + map<string, bufferlist> *out) override; + using ObjectStore::omap_check_keys; + int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys, + set<string> *out) override; + using ObjectStore::get_omap_iterator; + ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override; + ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid); + + int _create_collection(const coll_t& c, int bits, + const SequencerPosition &spos); + int _destroy_collection(const coll_t& c); + /** + * Give an expected number of objects hint to the collection. + * + * @param c - collection id. + * @param pg_num - pg number of the pool this collection belongs to + * @param expected_num_objs - expected number of objects in this collection + * @param spos - sequence position + * + * @return 0 on success, an error code otherwise + */ + int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos); + int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid, + const SequencerPosition& spos); + int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos, + bool ignore_enoent = false); + + int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size); + + void dump_start(const std::string& file); + void dump_stop(); + void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr); + + virtual int apply_layout_settings(const coll_t &cid, int target_level); + + void get_db_statistics(Formatter* f) override; + +private: + void _inject_failure(); + + // omap + int _omap_clear(const coll_t& cid, const ghobject_t &oid, + const SequencerPosition &spos); + int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, + const map<string, bufferlist> &aset, + const SequencerPosition &spos); + int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys, + const SequencerPosition &spos); + int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid, + const string& first, const string& last, + const SequencerPosition &spos); + int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl, + const SequencerPosition &spos); + int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest, + const SequencerPosition &spos); + int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest, + const SequencerPosition &spos); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + int set_throttle_params(); + float m_filestore_commit_timeout; + bool m_filestore_journal_parallel; + bool 
m_filestore_journal_trailing; + bool m_filestore_journal_writeahead; + int m_filestore_fiemap_threshold; + double m_filestore_max_sync_interval; + double m_filestore_min_sync_interval; + bool m_filestore_fail_eio; + bool m_filestore_fadvise; + int do_update; + bool m_journal_dio, m_journal_aio, m_journal_force_aio; + std::string m_osd_rollback_to_cluster_snap; + bool m_osd_use_stale_snap; + bool m_filestore_do_dump; + std::ofstream m_filestore_dump; + JSONFormatter m_filestore_dump_fmt; + std::atomic<int64_t> m_filestore_kill_at = { 0 }; + bool m_filestore_sloppy_crc; + int m_filestore_sloppy_crc_block_size; + uint64_t m_filestore_max_alloc_hint_size; + unsigned long m_fs_type; + + //Determined xattr handling based on fs type + void set_xattr_limits_via_conf(); + uint32_t m_filestore_max_inline_xattr_size; + uint32_t m_filestore_max_inline_xattrs; + uint32_t m_filestore_max_xattr_value_size; + + FSSuperblock superblock; + + /** + * write_superblock() + * + * Write superblock to persisent storage + * + * return value: 0 on success, otherwise negative errno + */ + int write_superblock(); + + /** + * read_superblock() + * + * Fill in FileStore::superblock by reading persistent storage + * + * return value: 0 on success, otherwise negative errno + */ + int read_superblock(); + + friend class FileStoreBackend; + friend class TestFileStore; +}; + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s); + +struct fiemap; + +class FileStoreBackend { +private: + FileStore *filestore; +protected: + int get_basedir_fd() { + return filestore->basedir_fd; + } + int get_current_fd() { + return filestore->current_fd; + } + int get_op_fd() { + return filestore->op_fd; + } + size_t get_blksize() { + return filestore->blk_size; + } + const string& get_basedir_path() { + return filestore->basedir; + } + const string& get_journal_path() { + return filestore->journalpath; + } + const string& get_current_path() { + return filestore->current_fn; + } + int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { + if (has_fiemap() || has_seek_data_hole()) { + return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff); + } else { + return filestore->_do_copy_range(from, to, srcoff, len, dstoff); + } + } + int get_crc_block_size() { + return filestore->m_filestore_sloppy_crc_block_size; + } + +public: + explicit FileStoreBackend(FileStore *fs) : filestore(fs) {} + virtual ~FileStoreBackend() {} + + CephContext* cct() const { + return filestore->cct; + } + + static FileStoreBackend *create(unsigned long f_type, FileStore *fs); + + virtual const char *get_name() = 0; + virtual int detect_features() = 0; + virtual int create_current() = 0; + virtual bool can_checkpoint() = 0; + virtual int list_checkpoints(list<string>& ls) = 0; + virtual int create_checkpoint(const string& name, uint64_t *cid) = 0; + virtual int sync_checkpoint(uint64_t id) = 0; + virtual int rollback_to(const string& name) = 0; + virtual int destroy_checkpoint(const string& name) = 0; + virtual int syncfs() = 0; + virtual bool has_fiemap() = 0; + virtual bool has_seek_data_hole() = 0; + virtual bool is_rotational() = 0; + virtual bool is_journal_rotational() = 0; + virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0; + virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0; + virtual int set_alloc_hint(int fd, uint64_t hint) = 0; + virtual bool has_splice() const = 0; + + // hooks for (sloppy) crc tracking + virtual int 
_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0; + virtual int _crc_update_truncate(int fd, loff_t off) = 0; + virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0; + virtual int _crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) = 0; + virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) = 0; +}; + +#endif diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc new file mode 100644 index 00000000..a75d501f --- /dev/null +++ b/src/os/filestore/GenericFileStoreBackend.cc @@ -0,0 +1,468 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" +#include "include/types.h" + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> + +#if defined(__linux__) +#include <linux/fs.h> +#endif + +#include "include/compat.h" +#include "include/linux_fiemap.h" + +#include <iostream> +#include <fstream> +#include <sstream> + +#include "GenericFileStoreBackend.h" + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" +#include "common/blkdev.h" + +#include "common/SloppyCRCMap.h" +#include "os/filestore/chain_xattr.h" + +#define SLOPPY_CRC_XATTR "user.cephos.scrc" + + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " + +#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) +#define ALIGNED(x, by) (!((x) % (by))) +#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by))) + +GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): + FileStoreBackend(fs), + ioctl_fiemap(false), + seek_data_hole(false), + use_splice(false), + m_filestore_fiemap(cct()->_conf->filestore_fiemap), + m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole), + m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data), + m_filestore_splice(cct()->_conf->filestore_splice) +{ + // rotational? + { + // NOTE: the below won't work on btrfs; we'll assume rotational. + string fn = get_basedir_path(); + int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) { + return; + } + BlkDev blkdev(fd); + m_rotational = blkdev.is_rotational(); + dout(20) << __func__ << " basedir " << fn + << " rotational " << (int)m_rotational << dendl; + ::close(fd); + } + // journal rotational? + { + // NOTE: the below won't work on btrfs; we'll assume rotational. 
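+ // same blkdev probe as the basedir above, but against the journal path, so the
+ // journal device's rotational status is tracked independently of the data device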
+ string fn = get_journal_path(); + int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) { + return; + } + BlkDev blkdev(fd); + m_journal_rotational = blkdev.is_rotational(); + dout(20) << __func__ << " journal filename " << fn.c_str() + << " journal rotational " << (int)m_journal_rotational << dendl; + ::close(fd); + } +} + +int GenericFileStoreBackend::detect_features() +{ + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); + + int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644); + if (fd < 0) { + fd = -errno; + derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; + return fd; + } + + // ext4 has a bug in older kernels where fiemap will return an empty + // result in some cases. this is a file layout that triggers the bug + // on 2.6.34-rc5. + int v[] = { + 0x0000000000016000, 0x0000000000007000, + 0x000000000004a000, 0x0000000000007000, + 0x0000000000060000, 0x0000000000001000, + 0x0000000000061000, 0x0000000000008000, + 0x0000000000069000, 0x0000000000007000, + 0x00000000000a3000, 0x000000000000c000, + 0x000000000024e000, 0x000000000000c000, + 0x000000000028b000, 0x0000000000009000, + 0x00000000002b1000, 0x0000000000003000, + 0, 0 + }; + for (int i=0; v[i]; i++) { + int off = v[i++]; + int len = v[i]; + + // write a large extent + char buf[len]; + memset(buf, 1, sizeof(buf)); + int r = ::lseek(fd, off, SEEK_SET); + if (r < 0) { + r = -errno; + derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + r = write(fd, buf, sizeof(buf)); + if (r < 0) { + derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + } + + // fiemap an extent inside that + if (!m_filestore_fiemap) { + dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; + ioctl_fiemap = false; + } else { + struct fiemap *fiemap; + int r = do_fiemap(fd, 2430421, 59284, &fiemap); + if (r < 0) { + dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; + ioctl_fiemap = false; + } else { + if (fiemap->fm_mapped_extents == 0) { + dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; + ioctl_fiemap = false; + } else { + dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; + ioctl_fiemap = true; + } + free(fiemap); + } + } + + // SEEK_DATA/SEEK_HOLE detection + if (!m_filestore_seek_data_hole) { + dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; + seek_data_hole = false; + } else { +#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) + // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running + // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. + // Fall back to use fiemap. 
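+ // probe with lseek(fd, 0, SEEK_HOLE): EINVAL means the running kernel lacks
+ // SEEK_HOLE/SEEK_DATA support, any other error is fatal to detection, and success
+ // enables the hole-seeking path for sparse copies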
+ off_t hole_pos; + + hole_pos = lseek(fd, 0, SEEK_HOLE); + if (hole_pos < 0) { + if (errno == EINVAL) { + dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; + seek_data_hole = false; + } else { + derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return -errno; + } + } else { + dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; + seek_data_hole = true; + } +#endif + } + + //splice detection +#ifdef CEPH_HAVE_SPLICE + if (!m_filestore_splice) { + dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl; + use_splice = false; + } else { + int pipefd[2]; + loff_t off_in = 0; + int r; + if (pipe_cloexec(pipefd) < 0) { + int e = errno; + dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl; + } else { + lseek(fd, 0, SEEK_SET); + r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); + if (!(r < 0 && errno == EINVAL)) { + use_splice = true; + dout(0) << "detect_features: splice is supported" << dendl; + } else + dout(0) << "detect_features: splice is NOT supported" << dendl; + close(pipefd[0]); + close(pipefd[1]); + } + } +#endif + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + + bool have_syncfs = false; +#ifdef HAVE_SYS_SYNCFS + if (::syncfs(get_basedir_fd()) == 0) { + dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; + } +#elif defined(SYS_syncfs) + if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { + dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#elif defined(__NR_syncfs) + if (syscall(__NR_syncfs, get_basedir_fd()) == 0) { + dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#endif + if (!have_syncfs) { + dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; + if (m_filestore_fsync_flushes_journal_data) { + dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; + } else { + dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; + dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; + } + } + + return 0; +} + +int GenericFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "_create_current: current/ exists but is not a directory" << dendl; + ret = -EINVAL; + } + } else { + ret = ::mkdir(get_current_path().c_str(), 0755); + if (ret < 0) { + ret = -errno; + dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; + } + } + return ret; +} + +int GenericFileStoreBackend::syncfs() +{ + int ret; + if (m_filestore_fsync_flushes_journal_data) { + dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; + // make the file system's journal commit. 
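+ // (the filestore_fsync_flushes_journal_data option asserts that one fsync is enough
+ // to flush everything pending in the filesystem journal)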
+ // this works with ext3, but NOT ext4 + ret = ::fsync(get_op_fd()); + if (ret < 0) + ret = -errno; + } else { + dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; + ret = sync_filesystem(get_current_fd()); + } + return ret; +} + +int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) +{ + struct fiemap *fiemap = NULL; + struct fiemap *_realloc_fiemap = NULL; + int size; + int ret; + + fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); + if (!fiemap) + return -ENOMEM; + /* + * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), + * the result is (logical=4096, len=4096). It leak the [3990, 4096). + * Commit:"xfs: fix rounding error of fiemap length parameter + * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. + * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. + */ + fiemap->fm_start = start - start % CEPH_PAGE_SIZE; + fiemap->fm_length = len + start % CEPH_PAGE_SIZE; + fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ + +#if defined(__APPLE__) || defined(__FreeBSD__) + ret = -ENOTSUP; + goto done_err; +#else + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + ret = -errno; + goto done_err; + } +#endif + size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); + + _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); + if (!_realloc_fiemap) { + ret = -ENOMEM; + goto done_err; + } else { + fiemap = _realloc_fiemap; + } + + memset(fiemap->fm_extents, 0, size); + + fiemap->fm_extent_count = fiemap->fm_mapped_extents; + fiemap->fm_mapped_extents = 0; + +#if defined(__APPLE__) || defined(__FreeBSD__) + ret = -ENOTSUP; + goto done_err; +#else + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + ret = -errno; + goto done_err; + } + *pfiemap = fiemap; +#endif + return 0; + +done_err: + *pfiemap = NULL; + free(fiemap); + return ret; +} + + +int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) +{ + char buf[100]; + bufferptr bp; + int r = 0; + int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); + if (l == -ENODATA) { + return 0; + } + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), buf, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); + } + } + bufferlist bl; + bl.append(std::move(bp)); + auto p = bl.cbegin(); + try { + decode(*cm, p); + } + catch (buffer::error &e) { + r = -EIO; + } + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; +} + +int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) +{ + bufferlist bl; + encode(*cm, bl); + int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; +} + +int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + ostringstream ss; + scm.write(off, len, bl, &ss); + dout(30) << __func__ << "\n" << ss.str() << dendl; + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + scm.truncate(off); + r = _crc_save(fd, &scm); + return r; +} + +int 
GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + scm.zero(off, len); + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) +{ + SloppyCRCMap scm_src(get_crc_block_size()); + SloppyCRCMap scm_dst(get_crc_block_size()); + int r = _crc_load_or_init(srcfd, &scm_src); + if (r < 0) + return r; + r = _crc_load_or_init(destfd, &scm_dst); + if (r < 0) + return r; + ostringstream ss; + scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); + dout(30) << __func__ << "\n" << ss.str() << dendl; + r = _crc_save(destfd, &scm_dst); + return r; +} + +int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + return scm.read(off, len, bl, out); +} diff --git a/src/os/filestore/GenericFileStoreBackend.h b/src/os/filestore/GenericFileStoreBackend.h new file mode 100644 index 00000000..207c3d0d --- /dev/null +++ b/src/os/filestore/GenericFileStoreBackend.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_GENERICFILESTOREBACKEDN_H +#define CEPH_GENERICFILESTOREBACKEDN_H + +#include "FileStore.h" + +class SloppyCRCMap; + +class GenericFileStoreBackend : public FileStoreBackend { +private: + bool ioctl_fiemap; + bool seek_data_hole; + bool use_splice; + bool m_filestore_fiemap; + bool m_filestore_seek_data_hole; + bool m_filestore_fsync_flushes_journal_data; + bool m_filestore_splice; + bool m_rotational = true; + bool m_journal_rotational = true; +public: + explicit GenericFileStoreBackend(FileStore *fs); + ~GenericFileStoreBackend() override {} + + const char *get_name() override { + return "generic"; + } + int detect_features() override; + int create_current() override; + bool can_checkpoint() override { return false; } + bool is_rotational() override { + return m_rotational; + } + bool is_journal_rotational() override { + return m_journal_rotational; + } + int list_checkpoints(list<string>& ls) override { return 0; } + int create_checkpoint(const string& name, uint64_t *cid) override { return -EOPNOTSUPP; } + int sync_checkpoint(uint64_t id) override { return -EOPNOTSUPP; } + int rollback_to(const string& name) override { return -EOPNOTSUPP; } + int destroy_checkpoint(const string& name) override { return -EOPNOTSUPP; } + int syncfs() override; + bool has_fiemap() override { return ioctl_fiemap; } + bool has_seek_data_hole() override { return seek_data_hole; } + int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) override; + int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override { + return _copy_range(from, to, srcoff, len, dstoff); + } + int set_alloc_hint(int fd, uint64_t hint) override { return -EOPNOTSUPP; } + bool has_splice() const override { return use_splice; } +private: + int _crc_load_or_init(int fd, SloppyCRCMap *cm); + 
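  // Note on the two private helpers declared above and below (implemented in
  // GenericFileStoreBackend.cc): they are the persistence layer for the
  // "sloppy" CRC tracking.  _crc_load_or_init() reads the SLOPPY_CRC_XATTR
  // xattr with chain_fgetxattr(), treats -ENODATA as "no map recorded yet"
  // (the caller's freshly constructed SloppyCRCMap is used as-is), retries
  // with a correctly sized buffer on -ERANGE, and decodes the result;
  // _crc_save() re-encodes the map and writes it back with chain_fsetxattr().
  // Every public _crc_update_*() method is therefore a load -> mutate -> save
  // round trip on that single xattr, and _crc_verify_read() is a load
  // followed by a check of the supplied buffer against the stored map.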
int _crc_save(int fd, SloppyCRCMap *cm); +public: + int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) override; + int _crc_update_truncate(int fd, loff_t off) override; + int _crc_update_zero(int fd, loff_t off, size_t len) override; + int _crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) override; + int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) override; +}; +#endif diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc new file mode 100644 index 00000000..ab56b43c --- /dev/null +++ b/src/os/filestore/HashIndex.cc @@ -0,0 +1,1195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "include/types.h" +#include "include/buffer.h" +#include "osd/osd_types.h" +#include <errno.h> + +#include "HashIndex.h" + +#include "common/errno.h" +#include "common/debug.h" +#define dout_context cct +#define dout_subsys ceph_subsys_filestore + +const string HashIndex::SUBDIR_ATTR = "contents"; +const string HashIndex::SETTINGS_ATTR = "settings"; +const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op"; + +/// hex digit to integer value +int hex_to_int(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + ceph_abort(); +} + +/// int value to hex digit +char int_to_hex(int v) +{ + ceph_assert(v < 16); + if (v < 10) + return '0' + v; + return 'A' + v - 10; +} + +/// reverse bits in a nibble (0..15) +int reverse_nibble_bits(int in) +{ + ceph_assert(in < 16); + return + ((in & 8) >> 3) | + ((in & 4) >> 1) | + ((in & 2) << 1) | + ((in & 1) << 3); +} + +/// reverse nibble bits in a hex digit +char reverse_hexdigit_bits(char c) +{ + return int_to_hex(reverse_nibble_bits(hex_to_int(c))); +} + +/// reverse nibble bits in a hex string +string reverse_hexdigit_bits_string(string s) +{ + for (unsigned i=0; i<s.size(); ++i) + s[i] = reverse_hexdigit_bits(s[i]); + return s; +} + +/// compare hex digit (as length 1 string) bitwise +bool cmp_hexdigit_bitwise(const string& l, const string& r) +{ + ceph_assert(l.length() == 1 && r.length() == 1); + int lv = hex_to_int(l[0]); + int rv = hex_to_int(r[0]); + ceph_assert(lv < 16); + ceph_assert(rv < 16); + return reverse_nibble_bits(lv) < reverse_nibble_bits(rv); +} + +/// compare hex digit string bitwise +bool cmp_hexdigit_string_bitwise(const string& l, const string& r) +{ + string ll = reverse_hexdigit_bits_string(l); + string rr = reverse_hexdigit_bits_string(r); + return ll < rr; +} + +int HashIndex::cleanup() { + bufferlist bl; + int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) { + // No in progress operations! 
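// Worked example (standalone; duplicates the hex/nibble helpers defined above
// purely for illustration): cmp_hexdigit_bitwise orders hex digits by the
// bit-reversed value of their nibble, which lets list_by_hash_bitwise()
// further down visit subdirectories in the same order as the bitwise object
// sort.  Running this prints:  0 8 4 C 2 A 6 E 1 9 5 D 3 B 7 F
#include <algorithm>
#include <cstdio>
#include <vector>

static int rev_nibble(int in)            // same bit swap as reverse_nibble_bits
{
  return ((in & 8) >> 3) | ((in & 4) >> 1) | ((in & 2) << 1) | ((in & 1) << 3);
}

int main()
{
  std::vector<char> digits;
  for (int v = 0; v < 16; ++v)
    digits.push_back(v < 10 ? '0' + v : 'A' + v - 10);
  std::sort(digits.begin(), digits.end(), [](char l, char r) {
    auto val = [](char c) { return c <= '9' ? c - '0' : c - 'A' + 10; };
    return rev_nibble(val(l)) < rev_nibble(val(r));   // cmp_hexdigit_bitwise
  });
  for (char c : digits)
    std::printf("%c ", c);
  std::printf("\n");
  return 0;
}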
+ return 0; + } + auto i = bl.cbegin(); + InProgressOp in_progress(i); + subdir_info_s info; + r = get_info(in_progress.path, &info); + if (r == -ENOENT) { + return end_split_or_merge(in_progress.path); + } else if (r < 0) { + return r; + } + + if (in_progress.is_split()) + return complete_split(in_progress.path, info); + else if (in_progress.is_merge()) + return complete_merge(in_progress.path, info); + else if (in_progress.is_col_split()) { + for (vector<string>::iterator i = in_progress.path.begin(); + i != in_progress.path.end(); + ++i) { + vector<string> path(in_progress.path.begin(), i); + int r = reset_attr(path); + if (r < 0) + return r; + } + return 0; + } + else + return -EINVAL; +} + +int HashIndex::reset_attr( + const vector<string> &path) +{ + int exists = 0; + int r = path_exists(path, &exists); + if (r < 0) + return r; + if (!exists) + return 0; + map<string, ghobject_t> objects; + vector<string> subdirs; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + subdir_info_s info; + info.hash_level = path.size(); + info.objs = objects.size(); + info.subdirs = subdirs.size(); + return set_info(path, info); +} + +int HashIndex::col_split_level( + HashIndex &from, + HashIndex &to, + const vector<string> &path, + uint32_t inbits, + uint32_t match, + unsigned *mkdirred) +{ + /* For each subdir, move, recurse, or ignore based on comparing the low order + * bits of the hash represented by the subdir path with inbits, match passed + * in. + */ + vector<string> subdirs; + int r = from.list_subdirs(path, &subdirs); + if (r < 0) + return r; + map<string, ghobject_t> objects; + r = from.list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + + set<string> to_move; + for (vector<string>::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + uint32_t bits = 0; + uint32_t hash = 0; + vector<string> sub_path(path.begin(), path.end()); + sub_path.push_back(*i); + path_to_hobject_hash_prefix(sub_path, &bits, &hash); + if (bits < inbits) { + if (hobject_t::match_hash(hash, bits, match)) { + r = col_split_level( + from, + to, + sub_path, + inbits, + match, + mkdirred); + if (r < 0) + return r; + if (*mkdirred > path.size()) + *mkdirred = path.size(); + } // else, skip, doesn't need to be moved or recursed into + } else { + if (hobject_t::match_hash(hash, inbits, match)) { + to_move.insert(*i); + } + } // else, skip, doesn't need to be moved or recursed into + } + + /* Then, do the same for each object */ + map<string, ghobject_t> objs_to_move; + for (map<string, ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->second.match(inbits, match)) { + objs_to_move.insert(*i); + } + } + + if (objs_to_move.empty() && to_move.empty()) + return 0; + + // Make parent directories as needed + while (*mkdirred < path.size()) { + ++*mkdirred; + int exists = 0; + vector<string> creating_path(path.begin(), path.begin()+*mkdirred); + r = to.path_exists(creating_path, &exists); + if (r < 0) + return r; + if (exists) + continue; + subdir_info_s info; + info.objs = 0; + info.subdirs = 0; + info.hash_level = creating_path.size(); + if (*mkdirred < path.size() - 1) + info.subdirs = 1; + r = to.start_col_split(creating_path); + if (r < 0) + return r; + r = to.create_path(creating_path); + if (r < 0) + return r; + r = to.set_info(creating_path, info); + if (r < 0) + return r; + r = to.end_split_or_merge(creating_path); + if (r < 0) + return r; + } + + subdir_info_s from_info; + subdir_info_s 
to_info; + r = from.get_info(path, &from_info); + if (r < 0) + return r; + r = to.get_info(path, &to_info); + if (r < 0) + return r; + + from.start_col_split(path); + to.start_col_split(path); + + // Do subdir moves + for (set<string>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + from_info.subdirs--; + to_info.subdirs++; + r = move_subdir(from, to, path, *i); + if (r < 0) + return r; + } + + for (map<string, ghobject_t>::iterator i = objs_to_move.begin(); + i != objs_to_move.end(); + ++i) { + from_info.objs--; + to_info.objs++; + r = move_object(from, to, path, *i); + if (r < 0) + return r; + } + + + r = to.set_info(path, to_info); + if (r < 0) + return r; + r = from.set_info(path, from_info); + if (r < 0) + return r; + from.end_split_or_merge(path); + to.end_split_or_merge(path); + return 0; +} + +int HashIndex::_merge( + uint32_t bits, + CollectionIndex* dest) { + dout(20) << __func__ << " bits " << bits << dendl; + ceph_assert(collection_version() == dest->collection_version()); + + vector<string> emptypath; + + // pre-split to common/target level so that any shared prefix DIR_? + // directories already exist at the destination. Since each + // directory is a nibble (4 bits), + unsigned shared = bits / 4; + dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl; + if (shared) { + split_dirs(emptypath, shared); + ((HashIndex*)dest)->split_dirs(emptypath, shared); + } + + // now merge the contents + _merge_dirs(*this, *(HashIndex*)dest, emptypath); + + return 0; +} + +int HashIndex::_merge_dirs( + HashIndex& from, + HashIndex& to, + const vector<string>& path) +{ + dout(20) << __func__ << " path " << path << dendl; + int r; + + vector<string> src_subs, dst_subs; + r = from.list_subdirs(path, &src_subs); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "from.list_subdirs" + << dendl; + return r; + } + r = to.list_subdirs(path, &dst_subs); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "to.list_subdirs" + << dendl; + return r; + } + + for (auto& i : src_subs) { + if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) { + // move it + r = move_subdir(from, to, path, i); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "move_subdir(...," + << path << "," << i << ")" + << dendl; + return r; + } + } else { + // common, recurse! 
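// Sketch (standalone; helper names are invented and it relies on one stated
// assumption): col_split_level() above keys every move decision on
// hobject_t::match_hash(hash, bits, match).  The code below reproduces that
// test under the assumption that match_hash() compares the low `bits` bits of
// both values -- the usual "which child PG owns this hash" rule -- and shows
// it for a subdir path of nibbles 3/A, which stands for all hashes whose low
// byte is 0xA3 (least-significant nibble first).
#include <cstdint>
#include <cstdio>

static uint32_t low_bits(uint32_t v, uint32_t bits)
{
  return bits >= 32 ? v : (v & ((1u << bits) - 1));
}

static bool hash_matches(uint32_t hash, uint32_t bits, uint32_t match)
{
  return low_bits(hash, bits) == low_bits(match, bits);
}

int main()
{
  uint32_t subdir_hash = 0xA3;   // encoded by the path components "3", "A"
  uint32_t inbits = 6, match = 0x23;
  std::printf("subdir 3/A (hash bits 0x%02X) %s child 0x%02X/%u\n",
              subdir_hash,
              hash_matches(subdir_hash, inbits, match) ? "belongs to"
                                                       : "does not belong to",
              match, inbits);
  return 0;
}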
+ vector<string> nested = path; + nested.push_back(i); + r = _merge_dirs(from, to, nested); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "rec _merge_dirs" + << dendl; + return r; + } + + // now remove it + r = remove_path(nested); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "remove_path " + << nested + << dendl; + return r; + } + } + } + + // objects + map<string, ghobject_t> objects; + r = from.list_objects(path, 0, 0, &objects); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "from.list_objects" + << dendl; + return r; + } + + for (auto& i : objects) { + r = move_object(from, to, path, i); + if (r < 0) { + lgeneric_subdout(g_ceph_context,filestore,20) << __func__ + << " r " << r << " from " + << "move_object(...," + << path << "," << i << ")" + << dendl; + return r; + } + } + + return 0; +} + + +int HashIndex::_split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest) { + ceph_assert(collection_version() == dest->collection_version()); + unsigned mkdirred = 0; + + return col_split_level( + *this, + *static_cast<HashIndex*>(dest), + vector<string>(), + bits, + match, + &mkdirred); +} + +int HashIndex::split_dirs(const vector<string> &path, int target_level) { + dout(20) << __func__ << " " << path << " target level: " + << target_level << dendl; + subdir_info_s info; + int r = get_info(path, &info); + if (r < 0) { + dout(10) << "error looking up info for " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + + if (must_split(info, target_level)) { + dout(1) << __func__ << " " << path << " has " << info.objs + << " objects, " << info.hash_level + << " level, starting split in pg " << coll() << "." << dendl; + r = initiate_split(path, info); + if (r < 0) { + dout(10) << "error initiating split on " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + + r = complete_split(path, info); + dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "." 
+ << dendl; + if (r < 0) { + dout(10) << "error completing split on " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + } + + vector<string> subdirs; + r = list_subdirs(path, &subdirs); + if (r < 0) { + dout(10) << "error listing subdirs of " << path << ": " + << cpp_strerror(r) << dendl; + return r; + } + for (vector<string>::const_iterator it = subdirs.begin(); + it != subdirs.end(); ++it) { + vector<string> subdir_path(path); + subdir_path.push_back(*it); + r = split_dirs(subdir_path, target_level); + if (r < 0) { + return r; + } + } + + return r; +} + +int HashIndex::apply_layout_settings(int target_level) { + vector<string> path; + dout(10) << __func__ << " split multiple = " << split_multiplier + << " merge threshold = " << merge_threshold + << " split rand factor = " << cct->_conf->filestore_split_rand_factor + << " target level = " << target_level + << dendl; + int r = write_settings(); + if (r < 0) + return r; + return split_dirs(path, target_level); +} + +int HashIndex::_init() { + subdir_info_s info; + vector<string> path; + int r = set_info(path, info); + if (r < 0) + return r; + return write_settings(); +} + +int HashIndex::write_settings() { + if (cct->_conf->filestore_split_rand_factor > 0) { + settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor; + } else { + settings.split_rand_factor = 0; + } + vector<string> path; + bufferlist bl; + settings.encode(bl); + return add_attr_path(path, SETTINGS_ATTR, bl); +} + +int HashIndex::read_settings() { + vector<string> path; + bufferlist bl; + int r = get_attr_path(path, SETTINGS_ATTR, bl); + if (r == -ENODATA) + return 0; + if (r < 0) { + derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl; + return r; + } + auto it = bl.cbegin(); + settings.decode(it); + dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl; + return 0; +} + +/* LFNIndex virtual method implementations */ +int HashIndex::_created(const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name) { + subdir_info_s info; + int r; + r = get_info(path, &info); + if (r < 0) + return r; + info.objs++; + r = set_info(path, info); + if (r < 0) + return r; + + if (must_split(info)) { + dout(1) << __func__ << " " << path << " has " << info.objs + << " objects, starting split in pg " << coll() << "." << dendl; + int r = initiate_split(path, info); + if (r < 0) + return r; + r = complete_split(path, info); + dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "." 
+ << dendl; + return r; + } else { + return 0; + } +} + +int HashIndex::_remove(const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name) { + int r; + r = remove_object(path, oid); + if (r < 0) + return r; + subdir_info_s info; + r = get_info(path, &info); + if (r < 0) + return r; + info.objs--; + r = set_info(path, info); + if (r < 0) + return r; + if (must_merge(info)) { + r = initiate_merge(path, info); + if (r < 0) + return r; + return complete_merge(path, info); + } else { + return 0; + } +} + +int HashIndex::_lookup(const ghobject_t &oid, + vector<string> *path, + string *mangled_name, + int *hardlink) { + vector<string> path_comp; + get_path_components(oid, &path_comp); + vector<string>::iterator next = path_comp.begin(); + int exists; + while (1) { + int r = path_exists(*path, &exists); + if (r < 0) + return r; + if (!exists) { + if (path->empty()) + return -ENOENT; + path->pop_back(); + break; + } + if (next == path_comp.end()) + break; + path->push_back(*(next++)); + } + return get_mangled_name(*path, oid, mangled_name, hardlink); +} + +int HashIndex::_collection_list_partial(const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next) { + vector<string> path; + ghobject_t _next; + if (!next) + next = &_next; + *next = start; + dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl; + return list_by_hash(path, end, max_count, next, ls); +} + +int HashIndex::prep_delete() { + return recursive_remove(vector<string>()); +} + +int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) { + int ret; + vector<string> path; + subdir_info_s root_info; + // Make sure there is neither objects nor sub-folders + // in this collection + ret = get_info(path, &root_info); + if (ret < 0) + return ret; + + // Do the folder splitting first + ret = pre_split_folder(pg_num, expected_num_objs); + if (ret < 0) + return ret; + // Initialize the folder info starting from root + return init_split_folder(path, 0); +} + +int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs) +{ + // If folder merging is enabled (by setting the threshold positive), + // no need to split + if (merge_threshold > 0) + return 0; + const coll_t c = coll(); + // Do not split if the expected number of objects in this collection is zero (by default) + if (expected_num_objs == 0) + return 0; + + // Calculate the number of leaf folders (which actually store files) + // need to be created + const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16; + uint64_t leavies = expected_num_objs / objs_per_folder ; + // No need to split + if (leavies == 0 || expected_num_objs == objs_per_folder) + return 0; + + spg_t spgid; + if (!c.is_pg_prefix(&spgid)) + return -EINVAL; + const ps_t ps = spgid.pgid.ps(); + + // the most significant bits of pg_num + const int pg_num_bits = calc_num_bits(pg_num - 1); + ps_t tmp_id = ps; + // calculate the number of levels we only create one sub folder + int num = pg_num_bits / 4; + // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111, + // so that splitting starts at level 3 + if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) { + --num; + } + + int ret; + // Start with creation that only has one subfolder + vector<string> paths; + int dump_num = num; + while (num-- > 0) { + ps_t v = tmp_id & 0x0000000f; + 
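// Worked numbers (standalone sketch) for the sizing arithmetic a few lines
// above: objs_per_folder = 16 * (|merge_threshold| * split_multiplier +
// split_rand_factor), and dividing the expected object count by it gives the
// number of leaf directories to pre-create.  The values below assume
// merge_threshold = 10 and split_multiple = 2 (historical FileStore defaults;
// newer releases ship a negative merge threshold, which abs() maps to the
// same magnitude) and no random factor -- treat them as assumptions, not as
// the values in any particular cluster.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main()
{
  int merge_threshold = 10;          // assumed filestore_merge_threshold
  int split_multiple  = 2;           // assumed filestore_split_multiple
  uint64_t rand_factor = 0;          // settings.split_rand_factor
  uint64_t expected_num_objs = 1000000;

  uint64_t objs_per_folder =
      (uint64_t(std::abs(merge_threshold)) * split_multiple + rand_factor) * 16;
  uint64_t leaves = expected_num_objs / objs_per_folder;

  std::printf("split threshold per directory: %llu objects\n",
              (unsigned long long)objs_per_folder);          // 320
  std::printf("leaf directories to pre-create for %llu objects: %llu\n",
              (unsigned long long)expected_num_objs,
              (unsigned long long)leaves);                    // 3125
  return 0;
}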
paths.push_back(to_hex(v)); + ret = create_path(paths); + if (ret < 0 && ret != -EEXIST) + return ret; + tmp_id = tmp_id >> 4; + } + + // Starting from here, we can split by creating multiple subfolders + const int left_bits = pg_num_bits - dump_num * 4; + // this variable denotes how many bits (for this level) that can be + // used for sub folder splitting + int split_bits = 4 - left_bits; + // the below logic is inspired by rados.h#ceph_stable_mod, + // it basically determines how many sub-folders should we + // create for splitting + ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT + if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) { + ++split_bits; + } + const uint32_t subs = (1 << split_bits); + // Calculate how many levels we create starting from here + int level = 0; + int level_limit = MAX_HASH_LEVEL - dump_num - 1; + uint64_t actual_leaves = subs; + while (actual_leaves < leavies && level < level_limit) { + ++level; + actual_leaves <<= 4; + } + for (uint32_t i = 0; i < subs; ++i) { + ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT + int v = tmp_id | (i << ((4 - split_bits) % 4)); + paths.push_back(to_hex(v)); + ret = create_path(paths); + if (ret < 0 && ret != -EEXIST) + return ret; + ret = recursive_create_path(paths, level); + if (ret < 0) + return ret; + paths.pop_back(); + } + return 0; +} + +int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level) +{ + // Get the number of sub directories for the current path + vector<string> subdirs; + int ret = list_subdirs(path, &subdirs); + if (ret < 0) + return ret; + subdir_info_s info; + info.subdirs = subdirs.size(); + info.hash_level = hash_level; + ret = set_info(path, info); + if (ret < 0) + return ret; + ret = fsync_dir(path); + if (ret < 0) + return ret; + + // Do the same for subdirs + vector<string>::const_iterator iter; + for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) { + path.push_back(*iter); + ret = init_split_folder(path, hash_level + 1); + if (ret < 0) + return ret; + path.pop_back(); + } + return 0; +} + +int HashIndex::recursive_create_path(vector<string>& path, int level) +{ + if (level == 0) + return 0; + for (int i = 0; i < 16; ++i) { + path.push_back(to_hex(i)); + int ret = create_path(path); + if (ret < 0 && ret != -EEXIST) + return ret; + ret = recursive_create_path(path, level - 1); + if (ret < 0) + return ret; + path.pop_back(); + } + return 0; +} + +int HashIndex::recursive_remove(const vector<string> &path) { + return _recursive_remove(path, true); +} + +int HashIndex::_recursive_remove(const vector<string> &path, bool top) { + vector<string> subdirs; + dout(20) << __func__ << " path=" << path << dendl; + int r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + map<string, ghobject_t> objects; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + if (!objects.empty()) + return -ENOTEMPTY; + vector<string> subdir(path); + for (vector<string>::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + subdir.push_back(*i); + r = _recursive_remove(subdir, false); + if (r < 0) + return r; + subdir.pop_back(); + } + if (top) + return 0; + else + return remove_path(path); +} + +int HashIndex::start_col_split(const vector<string> &path) { + bufferlist bl; + InProgressOp op_tag(InProgressOp::COL_SPLIT, path); + op_tag.encode(bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); +} + +int HashIndex::start_split(const vector<string> &path) { + bufferlist bl; + InProgressOp 
op_tag(InProgressOp::SPLIT, path); + op_tag.encode(bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); +} + +int HashIndex::start_merge(const vector<string> &path) { + bufferlist bl; + InProgressOp op_tag(InProgressOp::MERGE, path); + op_tag.encode(bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); +} + +int HashIndex::end_split_or_merge(const vector<string> &path) { + return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG); +} + +int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) { + bufferlist buf; + int r = get_attr_path(path, SUBDIR_ATTR, buf); + if (r < 0) + return r; + auto bufiter = buf.cbegin(); + info->decode(bufiter); + ceph_assert(path.size() == (unsigned)info->hash_level); + return 0; +} + +int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) { + bufferlist buf; + ceph_assert(path.size() == (unsigned)info.hash_level); + info.encode(buf); + return add_attr_path(path, SUBDIR_ATTR, buf); +} + +bool HashIndex::must_merge(const subdir_info_s &info) { + return (info.hash_level > 0 && + merge_threshold > 0 && + info.objs < (unsigned)merge_threshold && + info.subdirs == 0); +} + +bool HashIndex::must_split(const subdir_info_s &info, int target_level) { + // target_level is used for ceph-objectstore-tool to split dirs offline. + // if it is set (defalult is 0) and current hash level < target_level, + // this dir would be split no matters how many objects it has. + return (info.hash_level < (unsigned)MAX_HASH_LEVEL && + ((target_level > 0 && info.hash_level < (unsigned)target_level) || + (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16)))); +} + +int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) { + return start_merge(path); +} + +int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) { + vector<string> dst = path; + dst.pop_back(); + subdir_info_s dstinfo; + int r, exists; + r = path_exists(path, &exists); + if (r < 0) + return r; + r = get_info(dst, &dstinfo); + if (r < 0) + return r; + if (exists) { + r = move_objects(path, dst); + if (r < 0) + return r; + r = reset_attr(dst); + if (r < 0) + return r; + r = remove_path(path); + if (r < 0) + return r; + } + if (must_merge(dstinfo)) { + r = initiate_merge(dst, dstinfo); + if (r < 0) + return r; + r = fsync_dir(dst); + if (r < 0) + return r; + return complete_merge(dst, dstinfo); + } + r = fsync_dir(dst); + if (r < 0) + return r; + return end_split_or_merge(path); +} + +int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) { + return start_split(path); +} + +int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) { + int level = info.hash_level; + map<string, ghobject_t> objects; + vector<string> dst = path; + int r; + dst.push_back(""); + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + vector<string> subdirs_vec; + r = list_subdirs(path, &subdirs_vec); + if (r < 0) + return r; + set<string> subdirs; + subdirs.insert(subdirs_vec.begin(), subdirs_vec.end()); + map<string, map<string, ghobject_t> > mapped; + map<string, ghobject_t> moved; + int num_moved = 0; + for (map<string, ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + vector<string> new_path; + get_path_components(i->second, &new_path); + mapped[new_path[level]][i->first] = 
i->second; + } + for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin(); + i != mapped.end(); + ) { + dst[level] = i->first; + /* If the info already exists, it must be correct, + * we may be picking up a partially finished split */ + subdir_info_s temp; + // subdir has already been fully copied + if (subdirs.count(i->first) && !get_info(dst, &temp)) { + for (map<string, ghobject_t>::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + moved[j->first] = j->second; + num_moved++; + objects.erase(j->first); + } + ++i; + continue; + } + + subdir_info_s info_new; + info_new.objs = i->second.size(); + info_new.subdirs = 0; + info_new.hash_level = level + 1; + if (must_merge(info_new) && !subdirs.count(i->first)) { + mapped.erase(i++); + continue; + } + + // Subdir doesn't yet exist + if (!subdirs.count(i->first)) { + info.subdirs += 1; + r = create_path(dst); + if (r < 0) + return r; + } // else subdir has been created but only partially copied + + for (map<string, ghobject_t>::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + moved[j->first] = j->second; + num_moved++; + objects.erase(j->first); + r = link_object(path, dst, j->second, j->first); + // May be a partially finished split + if (r < 0 && r != -EEXIST) { + return r; + } + } + + r = fsync_dir(dst); + if (r < 0) + return r; + + // Presence of info must imply that all objects have been copied + r = set_info(dst, info_new); + if (r < 0) + return r; + + r = fsync_dir(dst); + if (r < 0) + return r; + + ++i; + } + r = remove_objects(path, moved, &objects); + if (r < 0) + return r; + info.objs = objects.size(); + r = reset_attr(path); + if (r < 0) + return r; + r = fsync_dir(path); + if (r < 0) + return r; + return end_split_or_merge(path); +} + +void HashIndex::get_path_components(const ghobject_t &oid, + vector<string> *path) { + char buf[MAX_HASH_LEVEL + 1]; + snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key()); + + // Path components are the hex characters of oid.hobj.hash, least + // significant first + for (int i = 0; i < MAX_HASH_LEVEL; ++i) { + path->push_back(string(&buf[i], 1)); + } +} + +string HashIndex::get_hash_str(uint32_t hash) { + char buf[MAX_HASH_LEVEL + 1]; + snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash); + string retval; + for (int i = 0; i < MAX_HASH_LEVEL; ++i) { + retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]); + } + return retval; +} + +string HashIndex::get_path_str(const ghobject_t &oid) { + ceph_assert(!oid.is_max()); + return get_hash_str(oid.hobj.get_hash()); +} + +uint32_t HashIndex::hash_prefix_to_hash(string prefix) { + while (prefix.size() < sizeof(uint32_t) * 2) { + prefix.push_back('0'); + } + uint32_t hash; + sscanf(prefix.c_str(), "%x", &hash); + // nibble reverse + hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4); + hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8); + hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16); + return hash; +} + +int HashIndex::get_path_contents_by_hash_bitwise( + const vector<string> &path, + const ghobject_t *next_object, + set<string, CmpHexdigitStringBitwise> *hash_prefixes, + set<pair<string, ghobject_t>, CmpPairBitwise> *objects) +{ + map<string, ghobject_t> rev_objects; + int r; + r = list_objects(path, 0, 0, &rev_objects); + if (r < 0) + return r; + // bitwise sort + for (map<string, ghobject_t>::iterator i = rev_objects.begin(); + i != rev_objects.end(); + ++i) { + if (next_object && i->second < *next_object) + continue; + string 
hash_prefix = get_path_str(i->second); + hash_prefixes->insert(hash_prefix); + objects->insert(pair<string, ghobject_t>(hash_prefix, i->second)); + } + vector<string> subdirs; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + // sort subdirs bitwise (by reversing hex digit nibbles) + std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise); + + // Local to this function, we will convert the prefix strings + // (previously simply the reversed hex digits) to also have each + // digit's nibbles reversed. This will make the strings sort + // bitwise. + string cur_prefix; + for (vector<string>::const_iterator i = path.begin(); + i != path.end(); + ++i) { + cur_prefix.append(reverse_hexdigit_bits_string(*i)); + } + string next_object_string; + if (next_object) + next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object)); + for (vector<string>::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + string candidate = cur_prefix + reverse_hexdigit_bits_string(*i); + if (next_object) { + if (next_object->is_max()) + continue; + if (candidate < next_object_string.substr(0, candidate.size())) + continue; + } + // re-reverse the hex digit nibbles for the caller + hash_prefixes->insert(reverse_hexdigit_bits_string(candidate)); + } + return 0; +} + +int HashIndex::list_by_hash(const vector<string> &path, + const ghobject_t &end, + int max_count, + ghobject_t *next, + vector<ghobject_t> *out) +{ + ceph_assert(out); + return list_by_hash_bitwise(path, end, max_count, next, out); +} + +int HashIndex::list_by_hash_bitwise( + const vector<string> &path, + const ghobject_t& end, + int max_count, + ghobject_t *next, + vector<ghobject_t> *out) +{ + vector<string> next_path = path; + next_path.push_back(""); + set<string, CmpHexdigitStringBitwise> hash_prefixes; + set<pair<string, ghobject_t>, CmpPairBitwise> objects; + int r = get_path_contents_by_hash_bitwise(path, + next, + &hash_prefixes, + &objects); + if (r < 0) + return r; + for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin(); + i != hash_prefixes.end(); + ++i) { + dout(20) << __func__ << " prefix " << *i << dendl; + set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound( + make_pair(*i, ghobject_t())); + if (j == objects.end() || j->first != *i) { + *(next_path.rbegin()) = *(i->rbegin()); + ghobject_t next_recurse; + if (next) + next_recurse = *next; + r = list_by_hash_bitwise(next_path, + end, + max_count, + &next_recurse, + out); + + if (r < 0) + return r; + if (!next_recurse.is_max()) { + if (next) + *next = next_recurse; + return 0; + } + } else { + while (j != objects.end() && j->first == *i) { + if (max_count > 0 && out->size() == (unsigned)max_count) { + if (next) + *next = j->second; + return 0; + } + if (j->second >= end) { + if (next) + *next = j->second; + return 0; + } + if (!next || j->second >= *next) { + dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl; + out->push_back(j->second); + } + ++j; + } + } + } + if (next) + *next = ghobject_t::get_max(); + return 0; +} + + diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h new file mode 100644 index 00000000..7e34d155 --- /dev/null +++ b/src/os/filestore/HashIndex.h @@ -0,0 +1,462 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or 
+ * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_HASHINDEX_H +#define CEPH_HASHINDEX_H + +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "LFNIndex.h" + +extern string reverse_hexdigit_bits_string(string l); + +/** + * Implements collection prehashing. + * + * @verbatim + * (root) - 0 - 0 + * - 1 + * - E + * - 1 + * - 2 - D - 0 + * . + * . + * . + * - F - 0 + * @endverbatim + * + * A file is located at the longest existing directory from the root + * given by the hex characters in the hash beginning with the least + * significant. + * + * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2) + * would be located in (root)/2/D/0/ + * + * Subdirectories are created when the number of objects in a + * directory exceed 16 * (abs(merge_threshhold) * split_multiplier + + * split_rand_factor). The number of objects in a directory is encoded + * as subdir_info_s in an xattr on the directory. + */ +class HashIndex : public LFNIndex { +private: + /// Attribute name for storing subdir info @see subdir_info_s + static const string SUBDIR_ATTR; + /// Attribute name for storing index-wide settings + static const string SETTINGS_ATTR; + /// Attribute name for storing in progress op tag + static const string IN_PROGRESS_OP_TAG; + /// Size (bits) in object hash + static const int PATH_HASH_LEN = 32; + /// Max length of hashed path + static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4); + + /** + * Merges occur when the number of object drops below + * merge_threshold and splits occur when the number of objects + * exceeds: + * + * 16 * (abs(merge_threshold) * split_multiplier + split_rand_factor) + * + * Please note if merge_threshold is less than zero, it will never + * do merging + */ + int merge_threshold; + int split_multiplier; + + /// Encodes current subdir state for determining when to split/merge. + struct subdir_info_s { + uint64_t objs; ///< Objects in subdir. + uint32_t subdirs; ///< Subdirs in subdir. + uint32_t hash_level; ///< Hashlevel of subdir. 
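  // Worked example for the class comment above (standalone, not part of this
  // header): the path components are the hex digits of the hash taken
  // least-significant nibble first, and an object lives in the deepest
  // directory that already exists along that chain.  For hash 0xA4CEE0D2 this
  // prints 2 D 0 E E C 4 A, i.e. the full chain is (root)/2/D/0/E/E/C/4/A and
  // the example object from the comment sits at (root)/2/D/0/.
  #include <cstdint>
  #include <cstdio>

  int main()
  {
    uint32_t hash = 0xA4CEE0D2;
    for (int i = 0; i < 8; ++i) {
      std::printf("%X ", (unsigned)(hash & 0xf));  // least significant first
      hash >>= 4;
    }
    std::printf("\n");
    return 0;
  }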
+ + subdir_info_s() : objs(0), subdirs(0), hash_level(0) {} + + void encode(bufferlist &bl) const + { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(objs, bl); + encode(subdirs, bl); + encode(hash_level, bl); + } + + void decode(bufferlist::const_iterator &bl) + { + using ceph::decode; + __u8 v; + decode(v, bl); + ceph_assert(v == 1); + decode(objs, bl); + decode(subdirs, bl); + decode(hash_level, bl); + } + }; + + struct settings_s { + uint32_t split_rand_factor; ///< random factor added to split threshold (only on root of collection) + settings_s() : split_rand_factor(0) {} + void encode(bufferlist &bl) const + { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(split_rand_factor, bl); + } + void decode(bufferlist::const_iterator &bl) + { + using ceph::decode; + __u8 v; + decode(v, bl); + decode(split_rand_factor, bl); + } + } settings; + + /// Encodes in progress split or merge + struct InProgressOp { + static const int SPLIT = 0; + static const int MERGE = 1; + static const int COL_SPLIT = 2; + int op; + vector<string> path; + + InProgressOp(int op, const vector<string> &path) + : op(op), path(path) {} + + explicit InProgressOp(bufferlist::const_iterator &bl) { + decode(bl); + } + + bool is_split() const { return op == SPLIT; } + bool is_col_split() const { return op == COL_SPLIT; } + bool is_merge() const { return op == MERGE; } + + void encode(bufferlist &bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(op, bl); + encode(path, bl); + } + + void decode(bufferlist::const_iterator &bl) { + using ceph::decode; + __u8 v; + decode(v, bl); + ceph_assert(v == 1); + decode(op, bl); + decode(path, bl); + } + }; + + +public: + /// Constructor. + HashIndex( + CephContext* cct, + coll_t collection, ///< [in] Collection + const char *base_path, ///< [in] Path to the index root. + int merge_at, ///< [in] Merge threshold. + int split_multiple, ///< [in] Split threshold. + uint32_t index_version,///< [in] Index version + double retry_probability=0) ///< [in] retry probability + : LFNIndex(cct, collection, base_path, index_version, retry_probability), + merge_threshold(merge_at), + split_multiplier(split_multiple) + {} + + int read_settings() override; + + /// @see CollectionIndex + uint32_t collection_version() override { return index_version; } + + /// @see CollectionIndex + int cleanup() override; + + /// @see CollectionIndex + int prep_delete() override; + + /// @see CollectionIndex + int _split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest + ) override; + + /// @see CollectionIndex + int _merge( + uint32_t bits, + CollectionIndex* dest + ) override; + + int _merge_dirs( + HashIndex& from, + HashIndex& to, + const vector<string>& path); + + /// @see CollectionIndex + int apply_layout_settings(int target_level) override; + +protected: + int _init() override; + + int _created( + const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name + ) override; + int _remove( + const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name + ) override; + int _lookup( + const ghobject_t &oid, + vector<string> *path, + string *mangled_name, + int *hardlink + ) override; + + /** + * Pre-hash the collection to create folders according to the expected number + * of objects in this collection. 
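   * Folders are only pre-created when folder merging is disabled
   * (merge_threshold <= 0) and expected_num_objs is non-zero; see
   * pre_split_folder() below.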
+ */ + int _pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ) override; + + int _collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next + ) override; +private: + /// Internal recursively remove path and its subdirs + int _recursive_remove( + const vector<string> &path, ///< [in] path to remove + bool top ///< [in] internal tracking of first caller + ); /// @return Error Code, 0 on success + /// Recursively remove path and its subdirs + int recursive_remove( + const vector<string> &path ///< [in] path to remove + ); /// @return Error Code, 0 on success + /// Tag root directory at beginning of col_split + int start_col_split( + const vector<string> &path ///< [in] path to split + ); ///< @return Error Code, 0 on success + /// Tag root directory at beginning of split + int start_split( + const vector<string> &path ///< [in] path to split + ); ///< @return Error Code, 0 on success + /// Tag root directory at beginning of split + int start_merge( + const vector<string> &path ///< [in] path to merge + ); ///< @return Error Code, 0 on success + /// Remove tag at end of split or merge + int end_split_or_merge( + const vector<string> &path ///< [in] path to split or merged + ); ///< @return Error Code, 0 on success + /// Gets info from the xattr on the subdir represented by path + int get_info( + const vector<string> &path, ///< [in] Path from which to read attribute. + subdir_info_s *info ///< [out] Attribute value + ); /// @return Error Code, 0 on success + + /// Sets info to the xattr on the subdir represented by path + int set_info( + const vector<string> &path, ///< [in] Path on which to set attribute. + const subdir_info_s &info ///< [in] Value to set + ); /// @return Error Code, 0 on success + + /// Encapsulates logic for when to split. + bool must_merge( + const subdir_info_s &info ///< [in] Info to check + ); /// @return True if info must be merged, False otherwise + + /// Encapsulates logic for when to merge. + bool must_split( + const subdir_info_s &info, ///< [in] Info to check + int target_level = 0 + ); /// @return True if info must be split, False otherwise + + /// Initiates merge + int initiate_merge( + const vector<string> &path, ///< [in] Subdir to merge + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Completes merge + int complete_merge( + const vector<string> &path, ///< [in] Subdir to merge + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Resets attr to match actual subdir contents + int reset_attr( + const vector<string> &path ///< [in] path to cleanup + ); + + /// Initiate Split + int initiate_split( + const vector<string> &path, ///< [in] Subdir to split + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Completes Split + int complete_split( + const vector<string> &path, ///< [in] Subdir to split + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Determine path components from hoid hash + void get_path_components( + const ghobject_t &oid, ///< [in] Object for which to get path components + vector<string> *path ///< [out] Path components for hoid. + ); + + /// Pre-hash and split folders to avoid runtime splitting + /// according to the given expected object number. 
+ int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs); + + /// Initialize the folder (dir info) with the given hash + /// level and number of its subdirs. + int init_split_folder(vector<string> &path, uint32_t hash_level); + + /// do collection split for path + static int col_split_level( + HashIndex &from, ///< [in] from index + HashIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path to split + uint32_t bits, ///< [in] num bits to match + uint32_t match, ///< [in] bits to match + unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred + ); + + + /** + * Get string representation of ghobject_t/hash + * + * e.g: 0x01234567 -> "76543210" + */ + static string get_path_str( + const ghobject_t &oid ///< [in] Object to get hash string for + ); ///< @return Hash string for hoid. + + /// Get string from hash, @see get_path_str + static string get_hash_str( + uint32_t hash ///< [in] Hash to convert to a string. + ); ///< @return String representation of hash + + /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00 + static uint32_t hash_prefix_to_hash( + string prefix ///< [in] string to convert + ); ///< @return Hash + + /// Get hash mod from path + static void path_to_hobject_hash_prefix( + const vector<string> &path,///< [in] path to convert + uint32_t *bits, ///< [out] bits + uint32_t *hash ///< [out] hash + ) { + string hash_str; + for (vector<string>::const_iterator i = path.begin(); + i != path.end(); + ++i) { + hash_str.push_back(*i->begin()); + } + uint32_t rev_hash = hash_prefix_to_hash(hash_str); + if (hash) + *hash = rev_hash; + if (bits) + *bits = path.size() * 4; + } + + /// Calculate the number of bits. + static int calc_num_bits(uint64_t n) { + int ret = 0; + while (n > 0) { + n = n >> 1; + ret++; + } + return ret; + } + + /// Convert a number to hex string (upper case). + static string to_hex(int n) { + ceph_assert(n >= 0 && n < 16); + char c = (n <= 9 ? 
('0' + n) : ('A' + n - 10)); + string str; + str.append(1, c); + return str; + } + + struct CmpPairBitwise { + bool operator()(const pair<string, ghobject_t>& l, + const pair<string, ghobject_t>& r) const + { + if (l.first < r.first) + return true; + if (l.first > r.first) + return false; + if (cmp(l.second, r.second) < 0) + return true; + return false; + } + }; + + struct CmpHexdigitStringBitwise { + bool operator()(const string& l, const string& r) const { + return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r); + } + }; + + /// Get path contents by hash + int get_path_contents_by_hash_bitwise( + const vector<string> &path, /// [in] Path to list + const ghobject_t *next_object, /// [in] list > *next_object + set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir + set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects + ); + + /// List objects in collection in ghobject_t order + int list_by_hash( + const vector<string> &path, /// [in] Path to list + const ghobject_t &end, /// [in] List only objects < end + int max_count, /// [in] List at most max_count + ghobject_t *next, /// [in,out] List objects >= *next + vector<ghobject_t> *out /// [out] Listed objects + ); ///< @return Error Code, 0 on success + /// List objects in collection in ghobject_t order + int list_by_hash_bitwise( + const vector<string> &path, /// [in] Path to list + const ghobject_t &end, /// [in] List only objects < end + int max_count, /// [in] List at most max_count + ghobject_t *next, /// [in,out] List objects >= *next + vector<ghobject_t> *out /// [out] Listed objects + ); ///< @return Error Code, 0 on success + + /// Create the given levels of sub directories from the given root. + /// The contents of *path* is not changed after calling this function. + int recursive_create_path(vector<string>& path, int level); + + /// split each dir below the given path + int split_dirs(const vector<string> &path, int target_level = 0); + + int write_settings(); +}; + +#endif diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc new file mode 100644 index 00000000..73095026 --- /dev/null +++ b/src/os/filestore/IndexManager.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/unordered_map.h" + +#if defined(__FreeBSD__) +#include <sys/param.h> +#endif + +#include <errno.h> + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/buffer.h" + +#include "IndexManager.h" +#include "HashIndex.h" +#include "CollectionIndex.h" + +#include "chain_xattr.h" + +static int set_version(const char *path, uint32_t version) { + bufferlist bl; + encode(version, bl); + return chain_setxattr<true, true>( + path, "user.cephos.collection_version", bl.c_str(), + bl.length()); +} + +static int get_version(const char *path, uint32_t *version) { + bufferptr bp(PATH_MAX); + int r = chain_getxattr(path, "user.cephos.collection_version", + bp.c_str(), bp.length()); + if (r < 0) { + if (r != -ENOENT) { + *version = 0; + return 0; + } else { + return r; + } + } + bp.set_length(r); + bufferlist bl; + bl.push_back(bp); + auto i = bl.cbegin(); + decode(*version, i); + return 0; +} + +IndexManager::~IndexManager() { + + for (ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.begin(); + it != col_indices.end(); ++it) { + + delete it->second; + it->second = NULL; + } + col_indices.clear(); +} + + +int IndexManager::init_index(coll_t c, const char *path, uint32_t version) { + RWLock::WLocker l(lock); + int r = set_version(path, version); + if (r < 0) + return r; + HashIndex index(cct, c, path, cct->_conf->filestore_merge_threshold, + cct->_conf->filestore_split_multiple, + version, + cct->_conf->filestore_index_retry_probability); + r = index.init(); + if (r < 0) + return r; + return index.read_settings(); +} + +int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) { + if (upgrade) { + // Need to check the collection generation + int r; + uint32_t version = 0; + r = get_version(path, &version); + if (r < 0) + return r; + + switch (version) { + case CollectionIndex::FLAT_INDEX_TAG: + case CollectionIndex::HASH_INDEX_TAG: // fall through + case CollectionIndex::HASH_INDEX_TAG_2: // fall through + case CollectionIndex::HOBJECT_WITH_POOL: { + // Must be a HashIndex + *index = new HashIndex(cct, c, path, + cct->_conf->filestore_merge_threshold, + cct->_conf->filestore_split_multiple, + version); + return (*index)->read_settings(); + } + default: ceph_abort(); + } + + } else { + // No need to check + *index = new HashIndex(cct, c, path, cct->_conf->filestore_merge_threshold, + cct->_conf->filestore_split_multiple, + CollectionIndex::HOBJECT_WITH_POOL, + cct->_conf->filestore_index_retry_probability); + return (*index)->read_settings(); + } +} + +bool IndexManager::get_index_optimistic(coll_t c, Index *index) { + RWLock::RLocker l(lock); + ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c); + if (it == col_indices.end()) + return false; + index->index = it->second; + return true; +} + +int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) { + if (get_index_optimistic(c, index)) + return 0; + RWLock::WLocker l(lock); + ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c); + if (it == col_indices.end()) { + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str()); + CollectionIndex* colIndex = NULL; + int r = build_index(c, path, &colIndex); + if (r < 0) + return r; + col_indices[c] = colIndex; + index->index = colIndex; + } else { + index->index = it->second; + } + return 0; +} diff --git a/src/os/filestore/IndexManager.h 
b/src/os/filestore/IndexManager.h new file mode 100644 index 00000000..19cd2926 --- /dev/null +++ b/src/os/filestore/IndexManager.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef OS_INDEXMANAGER_H +#define OS_INDEXMANAGER_H + +#include "include/unordered_map.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/config.h" +#include "common/debug.h" + +#include "CollectionIndex.h" +#include "HashIndex.h" + + +/// Public type for Index +struct Index { + CollectionIndex *index; + + Index() : index(NULL) {} + explicit Index(CollectionIndex* index) : index(index) {} + + CollectionIndex *operator->() { return index; } + CollectionIndex &operator*() { return *index; } +}; + + +/** + * Encapsulates mutual exclusion for CollectionIndexes. + * + * Allowing a modification (removal or addition of an object) to occur + * while a read is occurring (lookup of an object's path and use of + * that path) may result in the path becoming invalid. Thus, during + * the lifetime of a CollectionIndex object and any paths returned + * by it, no other concurrent accesses may be allowed. + * This is enforced by using CollectionIndex::access_lock + */ +class IndexManager { + CephContext* cct; + RWLock lock; ///< Lock for Index Manager + bool upgrade; + ceph::unordered_map<coll_t, CollectionIndex* > col_indices; + + /** + * Index factory + * + * Encapsulates logic for handling legacy FileStore + * layouts + * + * @param [in] c Collection for which to get index + * @param [in] path Path to collection + * @param [out] index Index for c + * @return error code + */ + int build_index(coll_t c, const char *path, CollectionIndex **index); + bool get_index_optimistic(coll_t c, Index *index); +public: + /// Constructor + explicit IndexManager(CephContext* cct, + bool upgrade) : cct(cct), + lock("IndexManager lock"), + upgrade(upgrade) {} + + ~IndexManager(); + + /** + * Reserve and return index for c + * + * @param [in] c Collection for which to get index + * @param [in] baseDir base directory of collections + * @param [out] index Index for c + * @return error code + */ + int get_index(coll_t c, const string& baseDir, Index *index); + + /** + * Initialize index for collection c at path + * + * @param [in] c Collection for which to init Index + * @param [in] path Path to collection + * @param [in] filestore_version version of containing FileStore + * @return error code + */ + int init_index(coll_t c, const char *path, uint32_t filestore_version); +}; + +#endif diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h new file mode 100644 index 00000000..cfb667d8 --- /dev/null +++ b/src/os/filestore/Journal.h @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_JOURNAL_H +#define CEPH_JOURNAL_H + +#include <errno.h> + +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "common/Finisher.h" +#include "common/TrackedOp.h" +#include "os/ObjectStore.h" +#include "common/zipkin_trace.h" + +class PerfCounters; + +class Journal { +protected: + uuid_d fsid; + Finisher *finisher; +public: + CephContext* cct; + PerfCounters *logger; +protected: + Cond *do_sync_cond; + bool wait_on_full; + +public: + Journal(CephContext* cct, uuid_d f, Finisher *fin, Cond *c=0) : + fsid(f), finisher(fin), cct(cct), logger(NULL), + do_sync_cond(c), + wait_on_full(false) { } + virtual ~Journal() { } + + virtual int check() = 0; ///< check if journal appears valid + virtual int create() = 0; ///< create a fresh journal + virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal + virtual void close() = 0; ///< close an open journal + + virtual void flush() = 0; + + virtual void get_devices(set<string> *ls) {} + virtual void collect_metadata(map<string,string> *pm) {} + /** + * reserve_throttle_and_backoff + * + * Implementation may throttle or backoff based on ops + * reserved here but not yet released using committed_thru. + */ + virtual void reserve_throttle_and_backoff(uint64_t count) = 0; + + virtual int dump(ostream& out) { return -EOPNOTSUPP; } + + void set_wait_on_full(bool b) { wait_on_full = b; } + + // writes + virtual bool is_writeable() = 0; + virtual int make_writeable() = 0; + virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, + Context *oncommit, + TrackedOpRef osd_op = TrackedOpRef()) = 0; + virtual void commit_start(uint64_t seq) = 0; + virtual void committed_thru(uint64_t seq) = 0; + + /// Read next journal entry - asserts on invalid journal + virtual bool read_entry( + bufferlist &bl, ///< [out] payload on successful read + uint64_t &seq ///< [in,out] sequence number on last successful read + ) = 0; ///< @return true on successful read, false on journal end + + virtual bool should_commit_now() = 0; + + virtual int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) = 0; + + virtual off64_t get_journal_size_estimate() { return 0; } + + // reads/recovery + +}; + +#endif diff --git a/src/os/filestore/JournalThrottle.cc b/src/os/filestore/JournalThrottle.cc new file mode 100644 index 00000000..8475bbbf --- /dev/null +++ b/src/os/filestore/JournalThrottle.cc @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "JournalThrottle.h" +#include "include/ceph_assert.h" + +bool JournalThrottle::set_params( + double _low_threshhold, + double _high_threshhold, + double _expected_throughput, + double _high_multiple, + double _max_multiple, + uint64_t _throttle_max, + std::ostream *errstream) +{ + return throttle.set_params( + _low_threshhold, + _high_threshhold, + _expected_throughput, + _high_multiple, + _max_multiple, + _throttle_max, + errstream); +} + +std::chrono::duration<double> JournalThrottle::get(uint64_t c) +{ + return throttle.get(c); +} + +uint64_t JournalThrottle::take(uint64_t c) +{ + return throttle.take(c); +} + +void JournalThrottle::register_throttle_seq(uint64_t seq, uint64_t c) +{ + locker l(lock); + journaled_ops.push_back(std::make_pair(seq, c)); +} + +std::pair<uint64_t, uint64_t> JournalThrottle::flush(uint64_t mono_id) +{ + uint64_t to_put_bytes = 0; + uint64_t to_put_ops = 0; + { + locker l(lock); + while (!journaled_ops.empty() && + journaled_ops.front().first <= 
mono_id) { + to_put_bytes += journaled_ops.front().second; + to_put_ops++; + journaled_ops.pop_front(); + } + } + throttle.put(to_put_bytes); + return make_pair(to_put_ops, to_put_bytes); +} + +uint64_t JournalThrottle::get_current() +{ + return throttle.get_current(); +} + +uint64_t JournalThrottle::get_max() +{ + return throttle.get_max(); +} diff --git a/src/os/filestore/JournalThrottle.h b/src/os/filestore/JournalThrottle.h new file mode 100644 index 00000000..75485d6d --- /dev/null +++ b/src/os/filestore/JournalThrottle.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_JOURNAL_THROTTLE_H +#define CEPH_JOURNAL_THROTTLE_H + +#include "common/Throttle.h" + +#include <list> +#include <deque> +#include <condition_variable> +#include <thread> +#include <vector> +#include <chrono> +#include <iostream> + +/** + * JournalThrottle + * + * Throttle designed to implement dynamic throttling as the journal fills + * up. The goal is to not delay ops at all when the journal is relatively + * empty, delay ops somewhat as the journal begins to fill (with the delay + * getting linearly longer as the journal fills up to a high water mark), + * and to delay much more aggressively (though still linearly with usage) + * until we hit the max value. + * + * The implementation simply wraps BackoffThrottle with a queue of + * journaled but not synced ops. + * + * The usage pattern is as follows: + * 1) Call get(seq, bytes) before taking the op_queue_throttle + * 2) Once the journal is flushed, flush(max_op_id_flushed) + */ +class JournalThrottle { + BackoffThrottle throttle; + + std::mutex lock; + /// deque<id, count> + std::deque<std::pair<uint64_t, uint64_t> > journaled_ops; + using locker = std::unique_lock<std::mutex>; + +public: + /** + * set_params + * + * Sets params. If the params are invalid, returns false + * and populates errstream (if non-null) with a user compreshensible + * explanation. 
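+   *
+   * (All of these parameters are forwarded unchanged to the wrapped
+   * BackoffThrottle's set_params().)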
+ */ + bool set_params( + double low_threshhold, + double high_threshhold, + double expected_throughput, + double high_multiple, + double max_multiple, + uint64_t throttle_max, + std::ostream *errstream); + + /** + * gets specified throttle for id mono_id, waiting as necessary + * + * @param c [in] amount to take + * @return duration waited + */ + std::chrono::duration<double> get(uint64_t c); + + /** + * take + * + * Takes specified throttle without waiting + */ + uint64_t take(uint64_t c); + + /** + * register_throttle_seq + * + * Registers a sequence number with an amount of throttle to + * release upon flush() + * + * @param seq [in] seq + */ + void register_throttle_seq(uint64_t seq, uint64_t c); + + + /** + * Releases throttle held by ids <= mono_id + * + * @param mono_id [in] id up to which to flush + * @returns pair<ops_flushed, bytes_flushed> + */ + std::pair<uint64_t, uint64_t> flush(uint64_t mono_id); + + uint64_t get_current(); + uint64_t get_max(); + + JournalThrottle( + unsigned expected_concurrency ///< [in] determines size of conds + ) : throttle(g_ceph_context, "filestore_journal", expected_concurrency) {} +}; + +#endif diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc new file mode 100644 index 00000000..714d0935 --- /dev/null +++ b/src/os/filestore/JournalingObjectStore.cc @@ -0,0 +1,271 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "JournalingObjectStore.h" + +#include "common/errno.h" +#include "common/debug.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_journal +#undef dout_prefix +#define dout_prefix *_dout << "journal " + + + +void JournalingObjectStore::journal_start() +{ + dout(10) << "journal_start" << dendl; + finisher.start(); +} + +void JournalingObjectStore::journal_stop() +{ + dout(10) << "journal_stop" << dendl; + finisher.wait_for_empty(); + finisher.stop(); +} + +// A journal_replay() makes journal writeable, this closes that out. +void JournalingObjectStore::journal_write_close() +{ + if (journal) { + journal->close(); + delete journal; + journal = 0; + } + apply_manager.reset(); +} + +int JournalingObjectStore::journal_replay(uint64_t fs_op_seq) +{ + dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl; + + if (cct->_conf->journal_replay_from) { + dout(0) << "journal_replay forcing replay from " + << cct->_conf->journal_replay_from + << " instead of " << fs_op_seq << dendl; + // the previous op is the last one committed + fs_op_seq = cct->_conf->journal_replay_from - 1; + } + + uint64_t op_seq = fs_op_seq; + apply_manager.init_seq(fs_op_seq); + + if (!journal) { + submit_manager.set_op_seq(op_seq); + return 0; + } + + int err = journal->open(op_seq); + if (err < 0) { + dout(3) << "journal_replay open failed with " + << cpp_strerror(err) << dendl; + delete journal; + journal = 0; + return err; + } + + replaying = true; + + int count = 0; + while (1) { + bufferlist bl; + uint64_t seq = op_seq + 1; + if (!journal->read_entry(bl, seq)) { + dout(3) << "journal_replay: end of journal, done." 
<< dendl; + break; + } + + if (seq <= op_seq) { + dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl; + continue; + } + ceph_assert(op_seq == seq-1); + + dout(3) << "journal_replay: applying op seq " << seq << dendl; + auto p = bl.cbegin(); + vector<ObjectStore::Transaction> tls; + while (!p.end()) { + tls.emplace_back(Transaction(p)); + } + + apply_manager.op_apply_start(seq); + int r = do_transactions(tls, seq); + apply_manager.op_apply_finish(seq); + + op_seq = seq; + count++; + + dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl; + } + + if (count) + dout(3) << "journal_replay: total = " << count << dendl; + + replaying = false; + + submit_manager.set_op_seq(op_seq); + + // done reading, make writeable. + err = journal->make_writeable(); + if (err < 0) + return err; + + if (!count) + journal->committed_thru(fs_op_seq); + + return count; +} + + +// ------------------------------------ + +uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op) +{ + Mutex::Locker l(apply_lock); + while (blocked) { + dout(10) << "op_apply_start blocked, waiting" << dendl; + blocked_cond.Wait(apply_lock); + } + dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> " + << (open_ops+1) << dendl; + ceph_assert(!blocked); + ceph_assert(op > committed_seq); + open_ops++; + return op; +} + +void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op) +{ + Mutex::Locker l(apply_lock); + dout(10) << "op_apply_finish " << op << " open_ops " << open_ops << " -> " + << (open_ops-1) << ", max_applied_seq " << max_applied_seq << " -> " + << std::max(op, max_applied_seq) << dendl; + --open_ops; + ceph_assert(open_ops >= 0); + + // signal a blocked commit_start + if (blocked) { + blocked_cond.Signal(); + } + + // there can be multiple applies in flight; track the max value we + // note. note that we can't _read_ this value and learn anything + // meaningful unless/until we've quiesced all in-flight applies. 
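+  // (commit_start() only snapshots max_applied_seq as committing_seq after it
+  // has blocked new applies and waited for open_ops to drain to zero.)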
+ if (op > max_applied_seq) + max_applied_seq = op; +} + +uint64_t JournalingObjectStore::SubmitManager::op_submit_start() +{ + lock.Lock(); + uint64_t op = ++op_seq; + dout(10) << "op_submit_start " << op << dendl; + return op; +} + +void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op) +{ + dout(10) << "op_submit_finish " << op << dendl; + if (op != op_submitted + 1) { + dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1) + << ", OUT OF ORDER" << dendl; + ceph_abort_msg("out of order op_submit_finish"); + } + op_submitted = op; + lock.Unlock(); +} + + +// ------------------------------------------ + +void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c) +{ + Mutex::Locker l(com_lock); + ceph_assert(c); + commit_waiters[op].push_back(c); +} + +bool JournalingObjectStore::ApplyManager::commit_start() +{ + bool ret = false; + + { + Mutex::Locker l(apply_lock); + dout(10) << "commit_start max_applied_seq " << max_applied_seq + << ", open_ops " << open_ops << dendl; + blocked = true; + while (open_ops > 0) { + dout(10) << "commit_start waiting for " << open_ops + << " open ops to drain" << dendl; + blocked_cond.Wait(apply_lock); + } + ceph_assert(open_ops == 0); + dout(10) << "commit_start blocked, all open_ops have completed" << dendl; + { + Mutex::Locker l(com_lock); + if (max_applied_seq == committed_seq) { + dout(10) << "commit_start nothing to do" << dendl; + blocked = false; + ceph_assert(commit_waiters.empty()); + goto out; + } + + committing_seq = max_applied_seq; + + dout(10) << "commit_start committing " << committing_seq + << ", still blocked" << dendl; + } + } + ret = true; + + if (journal) + journal->commit_start(committing_seq); // tell the journal too + out: + return ret; +} + +void JournalingObjectStore::ApplyManager::commit_started() +{ + Mutex::Locker l(apply_lock); + // allow new ops. 
(underlying fs should now be committing all prior ops) + dout(10) << "commit_started committing " << committing_seq << ", unblocking" + << dendl; + blocked = false; + blocked_cond.Signal(); +} + +void JournalingObjectStore::ApplyManager::commit_finish() +{ + Mutex::Locker l(com_lock); + dout(10) << "commit_finish thru " << committing_seq << dendl; + + if (journal) + journal->committed_thru(committing_seq); + + committed_seq = committing_seq; + + map<version_t, vector<Context*> >::iterator p = commit_waiters.begin(); + while (p != commit_waiters.end() && + p->first <= committing_seq) { + finisher.queue(p->second); + commit_waiters.erase(p++); + } +} + +void JournalingObjectStore::_op_journal_transactions( + bufferlist& tbl, uint32_t orig_len, uint64_t op, + Context *onjournal, TrackedOpRef osd_op) +{ + if (osd_op.get()) + dout(10) << "op_journal_transactions " << op << " reqid_t " + << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl; + else + dout(10) << "op_journal_transactions " << op << dendl; + + if (journal && journal->is_writeable()) { + journal->submit_entry(op, tbl, orig_len, onjournal, osd_op); + } else if (onjournal) { + apply_manager.add_waiter(op, onjournal); + } +} diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h new file mode 100644 index 00000000..a289d0e8 --- /dev/null +++ b/src/os/filestore/JournalingObjectStore.h @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_JOURNALINGOBJECTSTORE_H +#define CEPH_JOURNALINGOBJECTSTORE_H + +#include "os/ObjectStore.h" +#include "Journal.h" +#include "FileJournal.h" +#include "common/RWLock.h" +#include "osd/OpRequest.h" + +class JournalingObjectStore : public ObjectStore { +protected: + Journal *journal; + Finisher finisher; + + + class SubmitManager { + CephContext* cct; + Mutex lock; + uint64_t op_seq; + uint64_t op_submitted; + public: + SubmitManager(CephContext* cct) : + cct(cct), lock("JOS::SubmitManager::lock", false, true, false), + op_seq(0), op_submitted(0) + {} + uint64_t op_submit_start(); + void op_submit_finish(uint64_t op); + void set_op_seq(uint64_t seq) { + Mutex::Locker l(lock); + op_submitted = op_seq = seq; + } + uint64_t get_op_seq() { + return op_seq; + } + } submit_manager; + + class ApplyManager { + CephContext* cct; + Journal *&journal; + Finisher &finisher; + + Mutex apply_lock; + bool blocked; + Cond blocked_cond; + int open_ops; + uint64_t max_applied_seq; + + Mutex com_lock; + map<version_t, vector<Context*> > commit_waiters; + uint64_t committing_seq, committed_seq; + + public: + ApplyManager(CephContext* cct, Journal *&j, Finisher &f) : + cct(cct), journal(j), finisher(f), + apply_lock("JOS::ApplyManager::apply_lock", false, true, false), + blocked(false), + open_ops(0), + max_applied_seq(0), + com_lock("JOS::ApplyManager::com_lock", false, true, false), + committing_seq(0), committed_seq(0) {} + void reset() { + ceph_assert(open_ops == 0); + ceph_assert(blocked == false); + max_applied_seq = 0; + committing_seq = 0; + committed_seq = 0; + } + void add_waiter(uint64_t, Context*); + uint64_t op_apply_start(uint64_t op); + void op_apply_finish(uint64_t op); + bool commit_start(); + void commit_started(); + void commit_finish(); + bool is_committing() { + Mutex::Locker l(com_lock); + return committing_seq != committed_seq; + } + uint64_t get_committed_seq() { + Mutex::Locker l(com_lock); + return committed_seq; + } + uint64_t get_committing_seq() { + Mutex::Locker l(com_lock); + return committing_seq; + } + void init_seq(uint64_t fs_op_seq) { + { + Mutex::Locker l(com_lock); + committed_seq = fs_op_seq; + committing_seq = fs_op_seq; + } + { + Mutex::Locker l(apply_lock); + max_applied_seq = fs_op_seq; + } + } + } apply_manager; + + bool replaying; + +protected: + void journal_start(); + void journal_stop(); + void journal_write_close(); + int journal_replay(uint64_t fs_op_seq); + + void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op, + Context *onjournal, TrackedOpRef osd_op); + + virtual int do_transactions(vector<ObjectStore::Transaction>& tls, uint64_t op_seq) = 0; + +public: + bool is_committing() { + return apply_manager.is_committing(); + } + uint64_t get_committed_seq() { + return apply_manager.get_committed_seq(); + } + +public: + JournalingObjectStore(CephContext* cct, const std::string& path) + : ObjectStore(cct, path), + journal(NULL), + finisher(cct, "JournalObjectStore", "fn_jrn_objstore"), + submit_manager(cct), + apply_manager(cct, journal, finisher), + replaying(false) {} + + ~JournalingObjectStore() override { + } +}; + +#endif diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc new file mode 100644 index 00000000..2451ae8c --- /dev/null +++ b/src/os/filestore/LFNIndex.cc @@ -0,0 +1,1407 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <string> +#include <map> +#include <set> +#include <vector> +#include <errno.h> +#include <string.h> + +#if defined(__FreeBSD__) +#include <sys/param.h> +#endif + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/buffer.h" +#include "common/ceph_crypto.h" +#include "common/errno.h" +#include "include/compat.h" +#include "chain_xattr.h" + +#include "LFNIndex.h" +using ceph::crypto::SHA1; + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") " + + +const string LFNIndex::LFN_ATTR = "user.cephos.lfn"; +const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash."; +const string LFNIndex::SUBDIR_PREFIX = "DIR_"; +const string LFNIndex::FILENAME_COOKIE = "long"; +const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN - + FILENAME_COOKIE.size() - + FILENAME_EXTRA; +void LFNIndex::maybe_inject_failure() +{ + if (error_injection_enabled) { + if (current_failure > last_failure && + (((double)(rand() % 10000))/((double)(10000)) + < error_injection_probability)) { + last_failure = current_failure; + current_failure = 0; + throw RetryException(); + } + ++current_failure; + } +} + +// Helper to close fd's when we leave scope. This is useful when used +// in combination with RetryException, thrown by the above. +struct FDCloser { + int fd; + explicit FDCloser(int f) : fd(f) {} + ~FDCloser() { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } +}; + + +/* Public methods */ + +uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj) +{ + ghobject_t ghobj(obj); + ghobj.shard_id = shard_id_t(0); + ghobj.generation = 0; + ghobj.hobj.snap = 0; + return lfn_generate_object_name_current(ghobj).size(); +} + +int LFNIndex::init() +{ + return _init(); +} + +int LFNIndex::created(const ghobject_t &oid, const char *path) +{ + WRAP_RETRY( + vector<string> path_comp; + string short_name; + r = decompose_full_path(path, &path_comp, 0, &short_name); + if (r < 0) + goto out; + r = lfn_created(path_comp, oid, short_name); + if (r < 0) { + if (failed) { + /* This is hacky, but the only way we get ENOENT from lfn_created here is + * if we did a failure injection in _created below AND actually started the + * split or merge. In that case, lfn_created already suceeded, and + * WRAP_RETRY already cleaned it up and we are actually done. In a real + * failure, the filestore itself would have ended up calling this with + * the new path, not the old one, so we'd find it. 
+ */ + r = 0; + } + goto out; + } + r = _created(path_comp, oid, short_name); + if (r < 0) + goto out; + ); +} + +int LFNIndex::unlink(const ghobject_t &oid) +{ + WRAP_RETRY( + vector<string> path; + string short_name; + r = _lookup(oid, &path, &short_name, NULL); + if (r < 0) { + goto out; + } + r = _remove(path, oid, short_name); + if (r < 0) { + goto out; + } + ); +} + +int LFNIndex::lookup(const ghobject_t &oid, + IndexedPath *out_path, + int *hardlink) +{ + WRAP_RETRY( + vector<string> path; + string short_name; + r = _lookup(oid, &path, &short_name, hardlink); + if (r < 0) + goto out; + string full_path = get_full_path(path, short_name); + *out_path = std::make_shared<Path>(full_path, this); + r = 0; + ); +} + +int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) +{ + return _pre_hash_collection(pg_num, expected_num_objs); +} + + +int LFNIndex::collection_list_partial(const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next) +{ + return _collection_list_partial(start, end, max_count, ls, next); +} + +/* Derived class utility methods */ + +int LFNIndex::fsync_dir(const vector<string> &path) +{ + maybe_inject_failure(); + int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + FDCloser f(fd); + maybe_inject_failure(); + int r = ::fsync(fd); + maybe_inject_failure(); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + return 0; +} + +int LFNIndex::link_object(const vector<string> &from, + const vector<string> &to, + const ghobject_t &oid, + const string &from_short_name) +{ + int r; + string from_path = get_full_path(from, from_short_name); + string to_path; + maybe_inject_failure(); + r = lfn_get_name(to, oid, 0, &to_path, 0); + if (r < 0) + return r; + maybe_inject_failure(); + r = ::link(from_path.c_str(), to_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::remove_objects(const vector<string> &dir, + const map<string, ghobject_t> &to_remove, + map<string, ghobject_t> *remaining) +{ + set<string> clean_chains; + for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin(); + to_clean != to_remove.end(); + ++to_clean) { + if (!lfn_is_hashed_filename(to_clean->first)) { + maybe_inject_failure(); + int r = ::unlink(get_full_path(dir, to_clean->first).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + continue; + } + if (clean_chains.count(lfn_get_short_name(to_clean->second, 0))) + continue; + set<int> holes; + map<int, pair<string, ghobject_t> > chain; + for (int i = 0; ; ++i) { + string short_name = lfn_get_short_name(to_clean->second, i); + if (remaining->count(short_name)) { + chain[i] = *(remaining->find(short_name)); + } else if (to_remove.count(short_name)) { + holes.insert(i); + } else { + break; + } + } + + map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin(); + for (set<int>::iterator i = holes.begin(); + i != holes.end(); + ++i) { + if (candidate == chain.rend() || *i > candidate->first) { + string remove_path_name = + get_full_path(dir, lfn_get_short_name(to_clean->second, *i)); + maybe_inject_failure(); + int r = ::unlink(remove_path_name.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + continue; + } + string from = get_full_path(dir, candidate->second.first); + string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i)); + 
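+      // fill the hole at chain index *i by renaming a higher-indexed
+      // surviving link into it, keeping the hashed-name chain contiguous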
maybe_inject_failure(); + int r = ::rename(from.c_str(), to.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + remaining->erase(candidate->second.first); + remaining->insert(pair<string, ghobject_t>( + lfn_get_short_name(candidate->second.second, *i), + candidate->second.second)); + ++candidate; + } + if (!holes.empty()) + clean_chains.insert(lfn_get_short_name(to_clean->second, 0)); + } + return 0; +} + +int LFNIndex::move_objects(const vector<string> &from, + const vector<string> &to) +{ + map<string, ghobject_t> to_move; + int r; + r = list_objects(from, 0, NULL, &to_move); + if (r < 0) + return r; + for (map<string,ghobject_t>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + string from_path = get_full_path(from, i->first); + string to_path, to_name; + r = lfn_get_name(to, i->second, &to_name, &to_path, 0); + if (r < 0) + return r; + maybe_inject_failure(); + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0 && errno != EEXIST) + return -errno; + maybe_inject_failure(); + r = lfn_created(to, i->second, to_name); + maybe_inject_failure(); + if (r < 0) + return r; + } + r = fsync_dir(to); + if (r < 0) + return r; + for (map<string,ghobject_t>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + maybe_inject_failure(); + r = ::unlink(get_full_path(from, i->first).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + return fsync_dir(from); +} + +int LFNIndex::remove_object(const vector<string> &from, + const ghobject_t &oid) +{ + string short_name; + int r, exist; + maybe_inject_failure(); + r = get_mangled_name(from, oid, &short_name, &exist); + maybe_inject_failure(); + if (r < 0) + return r; + if (exist == 0) + return -ENOENT; + return lfn_unlink(from, oid, short_name); +} + +int LFNIndex::get_mangled_name(const vector<string> &from, + const ghobject_t &oid, + string *mangled_name, int *hardlink) +{ + return lfn_get_name(from, oid, mangled_name, 0, hardlink); +} + +int LFNIndex::move_subdir( + LFNIndex &from, + LFNIndex &dest, + const vector<string> &path, + string dir + ) +{ + vector<string> sub_path(path.begin(), path.end()); + sub_path.push_back(dir); + string from_path(from.get_full_path_subdir(sub_path)); + string to_path(dest.get_full_path_subdir(sub_path)); + int r = ::rename(from_path.c_str(), to_path.c_str()); + if (r < 0) + return -errno; + return 0; +} + +int LFNIndex::move_object( + LFNIndex &from, + LFNIndex &dest, + const vector<string> &path, + const pair<string, ghobject_t> &obj + ) +{ + string from_path(from.get_full_path(path, obj.first)); + string to_path; + string to_name; + int exists; + int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists); + if (r < 0) + return r; + if (!exists) { + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0) + return r; + } + r = dest.lfn_created(path, obj.second, to_name); + if (r < 0) + return r; + r = dest.fsync_dir(path); + if (r < 0) + return r; + r = from.remove_object(path, obj.second); + if (r < 0) + return r; + return from.fsync_dir(path); +} + + +static int get_hobject_from_oinfo(const char *dir, const char *file, + ghobject_t *o) +{ + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/%s", dir, file); + // Hack, user.ceph._ is the attribute used to store the object info + bufferptr bp; + int r = chain_getxattr_buf( + path, + "user.ceph._", + &bp); + if (r < 0) + return r; + bufferlist bl; + if (r > 0) + bl.push_back(bp); + object_info_t oi(bl); + *o = ghobject_t(oi.soid); + return 0; +} + + +int LFNIndex::list_objects(const 
vector<string> &to_list, int max_objs, + long *handle, map<string, ghobject_t> *out) +{ + string to_list_path = get_full_path_subdir(to_list); + DIR *dir = ::opendir(to_list_path.c_str()); + if (!dir) { + return -errno; + } + + if (handle && *handle) { + seekdir(dir, *handle); + } + + struct dirent *de = nullptr; + int r = 0; + int listed = 0; + bool end = true; + while ((de = ::readdir(dir))) { + end = false; + if (max_objs > 0 && listed >= max_objs) { + break; + } + if (de->d_name[0] == '.') + continue; + string short_name(de->d_name); + ghobject_t obj; + if (lfn_is_object(short_name)) { + r = lfn_translate(to_list, short_name, &obj); + if (r == -EINVAL) { + continue; + } else if (r < 0) { + goto cleanup; + } else { + string long_name = lfn_generate_object_name(obj); + if (!lfn_must_hash(long_name)) { + ceph_assert(long_name == short_name); + } + if (index_version == HASH_INDEX_TAG) + get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj); + + out->insert(pair<string, ghobject_t>(short_name, obj)); + ++listed; + } + } + } + + if (handle && !end) { + *handle = telldir(dir); + } + + r = 0; + cleanup: + ::closedir(dir); + return r; +} + +int LFNIndex::list_subdirs(const vector<string> &to_list, + vector<string> *out) +{ + string to_list_path = get_full_path_subdir(to_list); + DIR *dir = ::opendir(to_list_path.c_str()); + if (!dir) + return -errno; + + struct dirent *de = nullptr; + while ((de = ::readdir(dir))) { + string short_name(de->d_name); + string demangled_name; + if (lfn_is_subdir(short_name, &demangled_name)) { + out->push_back(demangled_name); + } + } + + ::closedir(dir); + return 0; +} + +int LFNIndex::create_path(const vector<string> &to_create) +{ + maybe_inject_failure(); + int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::remove_path(const vector<string> &to_remove) +{ + maybe_inject_failure(); + int r = ::rmdir(get_full_path_subdir(to_remove).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::path_exists(const vector<string> &to_check, int *exists) +{ + string full_path = get_full_path_subdir(to_check); + struct stat buf; + if (::stat(full_path.c_str(), &buf)) { + int r = -errno; + if (r == -ENOENT) { + *exists = 0; + return 0; + } else { + return r; + } + } else { + *exists = 1; + return 0; + } +} + +int LFNIndex::add_attr_path(const vector<string> &path, + const string &attr_name, + bufferlist &attr_value) +{ + string full_path = get_full_path_subdir(path); + maybe_inject_failure(); + return chain_setxattr<false, true>( + full_path.c_str(), mangle_attr_name(attr_name).c_str(), + reinterpret_cast<void *>(attr_value.c_str()), + attr_value.length()); +} + +int LFNIndex::get_attr_path(const vector<string> &path, + const string &attr_name, + bufferlist &attr_value) +{ + string full_path = get_full_path_subdir(path); + bufferptr bp; + int r = chain_getxattr_buf( + full_path.c_str(), + mangle_attr_name(attr_name).c_str(), + &bp); + if (r > 0) + attr_value.push_back(bp); + return r; +} + +int LFNIndex::remove_attr_path(const vector<string> &path, + const string &attr_name) +{ + string full_path = get_full_path_subdir(path); + string mangled_attr_name = mangle_attr_name(attr_name); + maybe_inject_failure(); + return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str()); +} + +string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid) +{ + char s[FILENAME_MAX_LEN]; + char *end = s + 
sizeof(s); + char *t = s; + + ceph_assert(oid.generation == ghobject_t::NO_GEN); + const char *i = oid.hobj.oid.name.c_str(); + // Escape subdir prefix + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + *t++ = '\\'; + *t++ = 'd'; + i += 4; + } + while (*i && t < end) { + if (*i == '\\') { + *t++ = '\\'; + *t++ = '\\'; + } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading . + *t++ = '\\'; + *t++ = '.'; + } else if (*i == '/') { + *t++ = '\\'; + *t++ = 's'; + } else + *t++ = *i; + i++; + } + + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "_head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "_snapdir"); + else + t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + + return string(s); +} + +static void append_escaped(string::const_iterator begin, + string::const_iterator end, + string *out) +{ + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '\\') { + out->append("\\\\"); + } else if (*i == '/') { + out->append("\\s"); + } else if (*i == '_') { + out->append("\\u"); + } else if (*i == '\0') { + out->append("\\n"); + } else { + out->append(i, i+1); + } + } +} + +string LFNIndex::lfn_generate_object_name_current(const ghobject_t &oid) +{ + string full_name; + string::const_iterator i = oid.hobj.oid.name.begin(); + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + full_name.append("\\d"); + i += 4; + } else if (oid.hobj.oid.name[0] == '.') { + full_name.append("\\."); + ++i; + } + append_escaped(i, oid.hobj.oid.name.end(), &full_name); + full_name.append("_"); + append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name); + full_name.append("_"); + + char buf[PATH_MAX]; + char *t = buf; + const char *end = t + sizeof(buf); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + t += snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + full_name.append(buf, t); + full_name.append("_"); + + append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name); + full_name.append("_"); + + t = buf; + if (oid.hobj.pool == -1) + t += snprintf(t, end - t, "none"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool); + full_name.append(buf, t); + + if (oid.generation != ghobject_t::NO_GEN || + oid.shard_id != shard_id_t::NO_SHARD) { + full_name.append("_"); + + t = buf; + t += snprintf(t, end - buf, "%llx", (long long unsigned)oid.generation); + full_name.append(buf, t); + + full_name.append("_"); + + t = buf; + t += snprintf(t, end - buf, "%x", (int)oid.shard_id); + full_name.append(buf, t); + } + + return full_name; +} + +string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid) +{ + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(oid); + + ceph_assert(oid.generation == ghobject_t::NO_GEN); + string full_name; + string::const_iterator i = oid.hobj.oid.name.begin(); + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + full_name.append("\\d"); + i += 4; + } else if (oid.hobj.oid.name[0] == '.') { + full_name.append("\\."); + ++i; + } + append_escaped(i, oid.hobj.oid.name.end(), &full_name); + full_name.append("_"); + append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), 
&full_name); + full_name.append("_"); + + char snap_with_hash[PATH_MAX]; + char *t = snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + full_name += string(snap_with_hash); + return full_name; +} + +int LFNIndex::lfn_get_name(const vector<string> &path, + const ghobject_t &oid, + string *mangled_name, string *out_path, + int *hardlink) +{ + string full_name = lfn_generate_object_name(oid); + int r; + + if (!lfn_must_hash(full_name)) { + if (mangled_name) + *mangled_name = full_name; + if (out_path) + *out_path = get_full_path(path, full_name); + if (hardlink) { + struct stat buf; + string full_path = get_full_path(path, full_name); + maybe_inject_failure(); + r = ::stat(full_path.c_str(), &buf); + if (r < 0) { + if (errno == ENOENT) + *hardlink = 0; + else + return -errno; + } else { + *hardlink = buf.st_nlink; + } + } + return 0; + } + + int i = 0; + string candidate; + string candidate_path; + for ( ; ; ++i) { + candidate = lfn_get_short_name(oid, i); + candidate_path = get_full_path(path, candidate); + bufferptr bp; + r = chain_getxattr_buf( + candidate_path.c_str(), + get_lfn_attr().c_str(), + &bp); + if (r < 0) { + if (errno != ENODATA && errno != ENOENT) + return -errno; + if (errno == ENODATA) { + // Left over from incomplete transaction, it'll be replayed + maybe_inject_failure(); + r = ::unlink(candidate_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) + *hardlink = 0; + return 0; + } + ceph_assert(r > 0); + string lfn(bp.c_str(), bp.length()); + if (lfn == full_name) { + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) { + struct stat st; + r = ::stat(candidate_path.c_str(), &st); + if (r < 0) { + if (errno == ENOENT) + *hardlink = 0; + else + return -errno; + } else { + *hardlink = st.st_nlink; + } + } + return 0; + } + bp = bufferptr(); + r = chain_getxattr_buf( + candidate_path.c_str(), + get_alt_lfn_attr().c_str(), + &bp); + if (r > 0) { + // only consider alt name if nlink > 1 + struct stat st; + int rc = ::stat(candidate_path.c_str(), &st); + if (rc < 0) + return -errno; + if (st.st_nlink <= 1) { + // left over from incomplete unlink, remove + maybe_inject_failure(); + dout(20) << __func__ << " found extra alt attr for " << candidate_path + << ", long name " << string(bp.c_str(), bp.length()) << dendl; + rc = chain_removexattr(candidate_path.c_str(), + get_alt_lfn_attr().c_str()); + maybe_inject_failure(); + if (rc < 0) + return rc; + continue; + } + string lfn(bp.c_str(), bp.length()); + if (lfn == full_name) { + dout(20) << __func__ << " used alt attr for " << full_name << dendl; + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) + *hardlink = st.st_nlink; + return 0; + } + } + } + ceph_abort(); // Unreachable + return 0; +} + +int LFNIndex::lfn_created(const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name) +{ + if (!lfn_is_hashed_filename(mangled_name)) + return 0; + string full_path = get_full_path(path, mangled_name); + string full_name = 
lfn_generate_object_name(oid); + maybe_inject_failure(); + + // if the main attr exists and is different, move it to the alt attr. + bufferptr bp; + int r = chain_getxattr_buf( + full_path.c_str(), + get_lfn_attr().c_str(), + &bp); + if (r > 0) { + string lfn(bp.c_str(), bp.length()); + if (lfn != full_name) { + dout(20) << __func__ << " " << mangled_name + << " moving old name to alt attr " + << lfn + << ", new name is " << full_name << dendl; + r = chain_setxattr<false, true>( + full_path.c_str(), get_alt_lfn_attr().c_str(), + bp.c_str(), bp.length()); + if (r < 0) + return r; + } + } + + return chain_setxattr<false, true>( + full_path.c_str(), get_lfn_attr().c_str(), + full_name.c_str(), full_name.size()); +} + +int LFNIndex::lfn_unlink(const vector<string> &path, + const ghobject_t &oid, + const string &mangled_name) +{ + if (!lfn_is_hashed_filename(mangled_name)) { + string full_path = get_full_path(path, mangled_name); + maybe_inject_failure(); + int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + return 0; + } + + int i = 0; + for ( ; ; ++i) { + string candidate = lfn_get_short_name(oid, i); + if (candidate == mangled_name) + break; + } + int removed_index = i; + ++i; + for ( ; ; ++i) { + struct stat buf; + string to_check = lfn_get_short_name(oid, i); + string to_check_path = get_full_path(path, to_check); + int r = ::stat(to_check_path.c_str(), &buf); + if (r < 0) { + if (errno == ENOENT) { + break; + } else { + return -errno; + } + } + } + string full_path = get_full_path(path, mangled_name); + int fd = ::open(full_path.c_str(), O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + FDCloser f(fd); + if (i == removed_index + 1) { + maybe_inject_failure(); + int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } else { + string& rename_to = full_path; + string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1)); + maybe_inject_failure(); + int r = ::rename(rename_from.c_str(), rename_to.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + struct stat st; + int r = ::fstat(fd, &st); + if (r == 0 && st.st_nlink > 0) { + // remove alt attr + dout(20) << __func__ << " removing alt attr from " << full_path << dendl; + fsync_dir(path); + chain_fremovexattr(fd, get_alt_lfn_attr().c_str()); + } + return r; +} + +int LFNIndex::lfn_translate(const vector<string> &path, + const string &short_name, + ghobject_t *out) +{ + if (!lfn_is_hashed_filename(short_name)) { + return lfn_parse_object_name(short_name, out); + } + string full_path = get_full_path(path, short_name); + // First, check alt attr + bufferptr bp; + int r = chain_getxattr_buf( + full_path.c_str(), + get_alt_lfn_attr().c_str(), + &bp); + if (r > 0) { + // There is an alt attr, does it match? 
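+    // (lfn_created() parks a superseded long name in the alt attr when the
+    // main attr is rewritten, so the object may still resolve via it here.)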
+ string lfn(bp.c_str(), bp.length()); + if (short_name_matches(short_name.c_str(), lfn.c_str())) { + return lfn_parse_object_name(lfn, out); + } + } + + // Get lfn_attr + bp = bufferptr(); + r = chain_getxattr_buf( + full_path.c_str(), + get_lfn_attr().c_str(), + &bp); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + string long_name(bp.c_str(), bp.length()); + return lfn_parse_object_name(long_name, out); +} + +bool LFNIndex::lfn_is_object(const string &short_name) +{ + return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0); +} + +bool LFNIndex::lfn_is_subdir(const string &name, string *demangled) +{ + if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) { + if (demangled) + *demangled = demangle_path_component(name); + return 1; + } + return 0; +} + +static int parse_object(const char *s, ghobject_t& o) +{ + const char *hash = s + strlen(s) - 1; + while (*hash != '_' && + hash > s) + hash--; + const char *bar = hash - 1; + while (*bar != '_' && + bar > s) + bar--; + if (*bar == '_') { + char buf[bar-s + 1]; + char *t = buf; + const char *i = s; + while (i < bar) { + if (*i == '\\') { + i++; + switch (*i) { + case '\\': *t++ = '\\'; break; + case '.': *t++ = '.'; break; + case 's': *t++ = '/'; break; + case 'd': { + *t++ = 'D'; + *t++ = 'I'; + *t++ = 'R'; + *t++ = '_'; + break; + } + default: ceph_abort(); + } + } else { + *t++ = *i; + } + i++; + } + *t = 0; + o.hobj.oid.name = string(buf, t-buf); + if (strncmp(bar+1, "head", 4) == 0) + o.hobj.snap = CEPH_NOSNAP; + else if (strncmp(bar+1, "snapdir", 7) == 0) + o.hobj.snap = CEPH_SNAPDIR; + else + o.hobj.snap = strtoull(bar+1, NULL, 16); + + uint32_t hobject_hash_input; + sscanf(hash, "_%X", &hobject_hash_input); + o.hobj.set_hash(hobject_hash_input); + + return 1; + } + return 0; +} + +int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out) +{ + int r = parse_object(long_name.c_str(), *out); + int64_t pool = -1; + spg_t pg; + if (coll().is_pg_prefix(&pg)) + pool = (int64_t)pg.pgid.pool(); + out->hobj.pool = pool; + if (!r) return -EINVAL; + string temp = lfn_generate_object_name(*out); + return 0; +} + +static bool append_unescaped(string::const_iterator begin, + string::const_iterator end, + string *out) +{ + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '\\') { + ++i; + if (*i == '\\') + out->append("\\"); + else if (*i == 's') + out->append("/"); + else if (*i == 'n') + (*out) += '\0'; + else if (*i == 'u') + out->append("_"); + else + return false; + } else { + out->append(i, i+1); + } + } + return true; +} + +int LFNIndex::lfn_parse_object_name_poolless(const string &long_name, + ghobject_t *out) +{ + string name; + string key; + uint32_t hash; + snapid_t snap; + + string::const_iterator current = long_name.begin(); + if (*current == '\\') { + ++current; + if (current == long_name.end()) { + return -EINVAL; + } else if (*current == 'd') { + name.append("DIR_"); + ++current; + } else if (*current == '.') { + name.append("."); + ++current; + } else { + --current; + } + } + + string::const_iterator end = current; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &name)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &key)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + 
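+  // third '_'-delimited field: the snapshot id ("head", "snapdir", or hex)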
if (end == long_name.end()) + return -EINVAL; + string snap_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end != long_name.end()) + return -EINVAL; + string hash_str(current, end); + + if (snap_str == "head") + snap = CEPH_NOSNAP; + else if (snap_str == "snapdir") + snap = CEPH_SNAPDIR; + else + snap = strtoull(snap_str.c_str(), NULL, 16); + sscanf(hash_str.c_str(), "%X", &hash); + + + int64_t pool = -1; + spg_t pg; + if (coll().is_pg_prefix(&pg)) + pool = (int64_t)pg.pgid.pool(); + (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, "")); + return 0; +} + + +int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out) +{ + string name; + string key; + string ns; + uint32_t hash; + snapid_t snap; + uint64_t pool; + gen_t generation = ghobject_t::NO_GEN; + shard_id_t shard_id = shard_id_t::NO_SHARD; + + if (index_version == HASH_INDEX_TAG) + return lfn_parse_object_name_keyless(long_name, out); + if (index_version == HASH_INDEX_TAG_2) + return lfn_parse_object_name_poolless(long_name, out); + + string::const_iterator current = long_name.begin(); + if (*current == '\\') { + ++current; + if (current == long_name.end()) { + return -EINVAL; + } else if (*current == 'd') { + name.append("DIR_"); + ++current; + } else if (*current == '.') { + name.append("."); + ++current; + } else { + --current; + } + } + + string::const_iterator end = current; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &name)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &key)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + string snap_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + string hash_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + if (!append_unescaped(current, end, &ns)) + return -EINVAL; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + string pstring(current, end); + + // Optional generation/shard_id + string genstring, shardstring; + if (end != long_name.end()) { + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return -EINVAL; + genstring = string(current, end); + + generation = (gen_t)strtoull(genstring.c_str(), NULL, 16); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end != long_name.end()) + return -EINVAL; + shardstring = string(current, end); + + shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16); + } + + if (snap_str == "head") + snap = CEPH_NOSNAP; + else if (snap_str == "snapdir") + snap = CEPH_SNAPDIR; + else + snap = strtoull(snap_str.c_str(), NULL, 16); + sscanf(hash_str.c_str(), "%X", &hash); + + if (pstring == "none") + pool = (uint64_t)-1; + else + pool = strtoull(pstring.c_str(), NULL, 16); + + (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id); + return 0; +} + +bool LFNIndex::lfn_is_hashed_filename(const string &name) +{ + if (name.size() < (unsigned)FILENAME_SHORT_LEN) { + return 0; + } + if 
(name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size()) + == FILENAME_COOKIE) { + return 1; + } else { + return 0; + } +} + +bool LFNIndex::lfn_must_hash(const string &long_name) +{ + return (int)long_name.size() >= FILENAME_SHORT_LEN; +} + +static inline void buf_to_hex(const unsigned char *buf, int len, char *str) +{ + int i; + str[0] = '\0'; + for (i = 0; i < len; i++) { + sprintf(&str[i*2], "%02x", (int)buf[i]); + } +} + +int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len) +{ + if (buf_len < FILENAME_HASH_LEN + 1) + return -EINVAL; + + char buf[FILENAME_LFN_DIGEST_SIZE]; + char hex[FILENAME_LFN_DIGEST_SIZE * 2]; + + SHA1 h; + h.Update((const unsigned char *)filename, strlen(filename)); + h.Final((unsigned char *)buf); + + buf_to_hex((unsigned char *)buf, (FILENAME_HASH_LEN + 1) / 2, hex); + strncpy(hash, hex, FILENAME_HASH_LEN); + hash[FILENAME_HASH_LEN] = '\0'; + return 0; +} + +void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len) +{ + char hash[FILENAME_HASH_LEN + 1]; + + ceph_assert(len >= FILENAME_SHORT_LEN + 4); + + strncpy(filename, old_filename, FILENAME_PREFIX_LEN); + filename[FILENAME_PREFIX_LEN] = '\0'; + if ((int)strlen(filename) < FILENAME_PREFIX_LEN) + return; + if (old_filename[FILENAME_PREFIX_LEN] == '\0') + return; + + hash_filename(old_filename, hash, sizeof(hash)); + int ofs = FILENAME_PREFIX_LEN; + while (1) { + int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str()); + if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs) + break; + ofs--; + } +} + +bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name) +{ + const char *end = short_name; + while (*end) ++end; + const char *suffix = end; + if (suffix > short_name) --suffix; // last char + while (suffix > short_name && *suffix != '_') --suffix; // back to first _ + if (suffix > short_name) --suffix; // one behind that + while (suffix > short_name && *suffix != '_') --suffix; // back to second _ + + int index = -1; + char buf[FILENAME_SHORT_LEN + 4]; + ceph_assert((end - suffix) < (int)sizeof(buf)); + int r = sscanf(suffix, "_%d_%s", &index, buf); + if (r < 2) + return false; + if (strcmp(buf, FILENAME_COOKIE.c_str()) != 0) + return false; + build_filename(cand_long_name, index, buf, sizeof(buf)); + return strcmp(short_name, buf) == 0; +} + +string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i) +{ + string long_name = lfn_generate_object_name(oid); + ceph_assert(lfn_must_hash(long_name)); + char buf[FILENAME_SHORT_LEN + 4]; + build_filename(long_name.c_str(), i, buf, sizeof(buf)); + return string(buf); +} + +const string &LFNIndex::get_base_path() +{ + return base_path; +} + +string LFNIndex::get_full_path_subdir(const vector<string> &rel) +{ + string retval = get_base_path(); + for (vector<string>::const_iterator i = rel.begin(); + i != rel.end(); + ++i) { + retval += "/"; + retval += mangle_path_component(*i); + } + return retval; +} + +string LFNIndex::get_full_path(const vector<string> &rel, const string &name) +{ + return get_full_path_subdir(rel) + "/" + name; +} + +string LFNIndex::mangle_path_component(const string &component) +{ + return SUBDIR_PREFIX + component; +} + +string LFNIndex::demangle_path_component(const string &component) +{ + return component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size()); +} + +int LFNIndex::decompose_full_path(const char *in, vector<string> *out, + ghobject_t *oid, string *shortname) +{ + const char 
*beginning = in + get_base_path().size(); + const char *end = beginning; + while (1) { + end++; + beginning = end++; + for ( ; *end != '\0' && *end != '/'; ++end) ; + if (*end != '\0') { + out->push_back(demangle_path_component(string(beginning, end - beginning))); + continue; + } else { + break; + } + } + *shortname = string(beginning, end - beginning); + if (oid) { + int r = lfn_translate(*out, *shortname, oid); + if (r < 0) + return r; + } + return 0; +} + +string LFNIndex::mangle_attr_name(const string &attr) +{ + return PHASH_ATTR_PREFIX + attr; +} diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h new file mode 100644 index 00000000..149ed10f --- /dev/null +++ b/src/os/filestore/LFNIndex.h @@ -0,0 +1,614 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef OS_LFNINDEX_H +#define OS_LFNINDEX_H + +#include <string> +#include <map> +#include <set> +#include <vector> +#include <exception> + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/ceph_crypto.h" + +#include "CollectionIndex.h" + +/** + * LFNIndex also encapsulates logic for manipulating + * subdirectories of a collection as well as the long filename + * logic. + * + * The protected methods provide machinery for derived classes to + * manipulate subdirectories and objects. + * + * The virtual methods are to be overridden to provide the actual + * hashed layout. + * + * User must call created when an object is created. + * + * Synchronization: Calling code must ensure that there are no object + * creations or deletions during the lifetime of a Path object (except + * of an object at that path). + * + * Unless otherwise noted, methods which return an int return 0 on success + * and a negative error code on failure. + */ +#define WRAP_RETRY(x) { \ + bool failed = false; \ + int r = 0; \ + init_inject_failure(); \ + while (1) { \ + try { \ + if (failed) { \ + r = cleanup(); \ + ceph_assert(r == 0); \ + } \ + { x } \ + out: \ + complete_inject_failure(); \ + return r; \ + } catch (RetryException&) { \ + failed = true; \ + } catch (...) { \ + ceph_abort(); \ + } \ + } \ + return -1; \ + } \ + + + +class LFNIndex : public CollectionIndex { + /// Hash digest output size. + static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE; + /// Length of filename hash. + static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE; + /// Max filename size. + static const int FILENAME_MAX_LEN = 4096; + /// Length of hashed filename. + static const int FILENAME_SHORT_LEN = 255; + /// Length of hashed filename prefix. + static const int FILENAME_PREFIX_LEN; + /// Length of hashed filename cookie. + static const int FILENAME_EXTRA = 4; + /// Lfn cookie value. + static const string FILENAME_COOKIE; + /// Name of LFN attribute for storing full name. + static const string LFN_ATTR; + /// Prefix for subdir index attributes. + static const string PHASH_ATTR_PREFIX; + /// Prefix for index subdirectories. + static const string SUBDIR_PREFIX; + + /// Path to Index base. 
+ const string base_path; + +protected: + const uint32_t index_version; + + /// true if retry injection is enabled + struct RetryException : public exception {}; + bool error_injection_enabled; + bool error_injection_on; + double error_injection_probability; + uint64_t last_failure; + uint64_t current_failure; + void init_inject_failure() { + if (error_injection_on) { + error_injection_enabled = true; + last_failure = current_failure = 0; + } + } + void maybe_inject_failure(); + void complete_inject_failure() { + error_injection_enabled = false; + } + +private: + string lfn_attribute, lfn_alt_attribute; + coll_t collection; + +public: + /// Constructor + LFNIndex( + CephContext* cct, + coll_t collection, + const char *base_path, ///< [in] path to Index root + uint32_t index_version, + double _error_injection_probability=0) + : CollectionIndex(cct, collection), + base_path(base_path), + index_version(index_version), + error_injection_enabled(false), + error_injection_on(_error_injection_probability != 0), + error_injection_probability(_error_injection_probability), + last_failure(0), current_failure(0), + collection(collection) { + if (index_version == HASH_INDEX_TAG) { + lfn_attribute = LFN_ATTR; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", index_version); + lfn_attribute = LFN_ATTR + string(buf); + lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt"; + } + } + + coll_t coll() const override { return collection; } + + /// Virtual destructor + ~LFNIndex() override {} + + /// @see CollectionIndex + int init() override; + + /// @see CollectionIndex + int cleanup() override = 0; + + /// @see CollectionIndex + int created( + const ghobject_t &oid, + const char *path + ) override; + + /// @see CollectionIndex + int unlink( + const ghobject_t &oid + ) override; + + /// @see CollectionIndex + int lookup( + const ghobject_t &oid, + IndexedPath *path, + int *hardlink + ) override; + + /// @see CollectionIndex; + int pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ) override; + + /// @see CollectionIndex + int collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next + ) override; + + virtual int _split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + CollectionIndex* dest //< [in] destination index + ) = 0; + virtual int _merge( + uint32_t bits, //< [in] bits for target + CollectionIndex* dest //< [in] destination index + ) = 0; + + /// @see CollectionIndex + int split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest + ) override { + WRAP_RETRY( + r = _split(match, bits, dest); + goto out; + ); + } + + /// @see CollectionIndex + int merge( + uint32_t bits, + CollectionIndex* dest + ) override { + WRAP_RETRY( + r = _merge(bits, dest); + goto out; + ); + } + + /** + * Returns the length of the longest escaped name which could result + * from any clone, shard, or rollback object of this object + */ + static uint64_t get_max_escaped_name_len(const hobject_t &obj); + +protected: + virtual int _init() = 0; + + /// Will be called upon object creation + virtual int _created( + const vector<string> &path, ///< [in] Path to subdir. + const ghobject_t &oid, ///< [in] Object created. + const string &mangled_name ///< [in] Mangled filename. + ) = 0; + + /// Will be called to remove an object + virtual int _remove( + const vector<string> &path, ///< [in] Path to subdir. + const ghobject_t &oid, ///< [in] Object to remove. 
+ const string &mangled_name ///< [in] Mangled filename. + ) = 0; + + /// Return the path and mangled_name for oid. + virtual int _lookup( + const ghobject_t &oid,///< [in] Object for lookup. + vector<string> *path, ///< [out] Path to the object. + string *mangled_name, ///< [out] Mangled filename. + int *exists ///< [out] True if the object exists. + ) = 0; + + /// Pre-hash the collection with the given pg number and + /// expected number of objects in the collection. + virtual int _pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ) = 0; + + /// @see CollectionIndex + virtual int _collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + int max_count, + vector<ghobject_t> *ls, + ghobject_t *next + ) = 0; + +protected: + + /* Non-virtual utility methods */ + + /// Sync a subdirectory + int fsync_dir( + const vector<string> &path ///< [in] Path to sync + ); ///< @return Error Code, 0 on success + + /// Link an object from from into to + int link_object( + const vector<string> &from, ///< [in] Source subdirectory. + const vector<string> &to, ///< [in] Dest subdirectory. + const ghobject_t &oid, ///< [in] Object to move. + const string &from_short_name ///< [in] Mangled filename of oid. + ); ///< @return Error Code, 0 on success + + /** + * Efficiently remove objects from a subdirectory + * + * remove_object invalidates mangled names in the directory requiring + * the mangled name of each additional object to be looked up a second + * time. remove_objects removes the need for additional lookups + * + * @param [in] dir Directory from which to remove. + * @param [in] map of objects to remove to mangle names + * @param [in,out] map of filenames to objects + * @return Error Code, 0 on success. + */ + int remove_objects( + const vector<string> &dir, + const map<string, ghobject_t> &to_remove, + map<string, ghobject_t> *remaining + ); + + + /** + * Moves contents of from into to. + * + * Invalidates mangled names in to. If interrupted, all objects will be + * present in to before objects are removed from from. Ignores EEXIST + * while linking into to. + * @return Error Code, 0 on success + */ + int move_objects( + const vector<string> &from, ///< [in] Source subdirectory. + const vector<string> &to ///< [in] Dest subdirectory. + ); + + /** + * Remove an object from from. + * + * Invalidates mangled names in from. + * @return Error Code, 0 on success + */ + int remove_object( + const vector<string> &from, ///< [in] Directory from which to remove. + const ghobject_t &to_remove ///< [in] Object to remove. + ); + + /** + * Gets the filename corresponding to oid in from. + * + * The filename may differ between subdirectories. Furthermore, + * file creations ore removals in from may invalidate the name. 
+ * @return Error code on failure, 0 on success + */ + int get_mangled_name( + const vector<string> &from, ///< [in] Subdirectory + const ghobject_t &oid, ///< [in] Object + string *mangled_name, ///< [out] Filename + int *hardlink ///< [out] hardlink for this file, hardlink=0 mean no-exist + ); + + /// do move subdir from from to dest + static int move_subdir( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path containing dir + string dir ///< [in] dir to move + ); + + /// do move object from from to dest + static int move_object( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path to split + const pair<string, ghobject_t> &obj ///< [in] obj to move + ); + + /** + * Lists objects in to_list. + * + * @param [in] to_list Directory to list. + * @param [in] max_objects Max number to list. + * @param [in,out] handle Cookie for continuing the listing. + * Initialize to zero to start at the beginning of the directory. + * @param [out] out Mapping of listed object filenames to objects. + * @return Error code on failure, 0 on success + */ + int list_objects( + const vector<string> &to_list, + int max_objects, + long *handle, + map<string, ghobject_t> *out + ); + + /// Lists subdirectories. + int list_subdirs( + const vector<string> &to_list, ///< [in] Directory to list. + vector<string> *out ///< [out] Subdirectories listed. + ); + + /// Create subdirectory. + int create_path( + const vector<string> &to_create ///< [in] Subdirectory to create. + ); + + /// Remove subdirectory. + int remove_path( + const vector<string> &to_remove ///< [in] Subdirectory to remove. + ); + + /// Check whether to_check exists. + int path_exists( + const vector<string> &to_check, ///< [in] Subdirectory to check. + int *exists ///< [out] 1 if it exists, 0 else + ); + + /// Save attr_value to attr_name attribute on path. + int add_attr_path( + const vector<string> &path, ///< [in] Path to modify. + const string &attr_name, ///< [in] Name of attribute. + bufferlist &attr_value ///< [in] Value to save. + ); + + /// Read into attr_value attribute attr_name on path. + int get_attr_path( + const vector<string> &path, ///< [in] Path to read. + const string &attr_name, ///< [in] Attribute to read. + bufferlist &attr_value ///< [out] Attribute value read. + ); + + /// Remove attr from path + int remove_attr_path( + const vector<string> &path, ///< [in] path from which to remove attr + const string &attr_name ///< [in] attr to remove + ); ///< @return Error code, 0 on success + +private: + /* lfn translation functions */ + + /** + * Gets the version specific lfn attribute tag + */ + const string &get_lfn_attr() const { + return lfn_attribute; + } + const string &get_alt_lfn_attr() const { + return lfn_alt_attribute; + } + + /** + * Gets the filename corresponding to oid in path. + * + * @param [in] path Path in which to get filename for oid. + * @param [in] oid Object for which to get filename. + * @param [out] mangled_name Filename for oid, pass NULL if not needed. + * @param [out] full_path Fullpath for oid, pass NULL if not needed. + * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if + * not needed + * @return Error Code, 0 on success. + */ + int lfn_get_name( + const vector<string> &path, + const ghobject_t &oid, + string *mangled_name, + string *full_path, + int *hardlink + ); + + /// Adjusts path contents when oid is created at name mangled_name. 
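+ /// For hashed (long) names this typically records the full object name in
+ /// the file's LFN xattr (see get_lfn_attr()); plain short names need no
+ /// extra bookkeeping.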
+ int lfn_created( + const vector<string> &path, ///< [in] Path to adjust. + const ghobject_t &oid, ///< [in] Object created. + const string &mangled_name ///< [in] Filename of created object. + ); + + /// Removes oid from path while adjusting path contents + int lfn_unlink( + const vector<string> &path, ///< [in] Path containing oid. + const ghobject_t &oid, ///< [in] Object to remove. + const string &mangled_name ///< [in] Filename of object to remove. + ); + + ///Transate a file into and ghobject_t. + int lfn_translate( + const vector<string> &path, ///< [in] Path containing the file. + const string &short_name, ///< [in] Filename to translate. + ghobject_t *out ///< [out] Object found. + ); ///< @return Negative error code on error, 0 if not an object, 1 else + + /* manglers/demanglers */ + /// Filters object filenames + bool lfn_is_object( + const string &short_name ///< [in] Filename to check + ); ///< True if short_name is an object, false otherwise + + /// Filters subdir filenames + bool lfn_is_subdir( + const string &short_name, ///< [in] Filename to check. + string *demangled_name ///< [out] Demangled subdir name. + ); ///< @return True if short_name is a subdir, false otherwise + + /// Generate object name + string lfn_generate_object_name_keyless( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + string lfn_generate_object_name_poolless( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + static string lfn_generate_object_name_current( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + string lfn_generate_object_name( + const ghobject_t &oid ///< [in] Object for which to generate. + ) { + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(oid); + if (index_version == HASH_INDEX_TAG_2) + return lfn_generate_object_name_poolless(oid); + else + return lfn_generate_object_name_current(oid); + } ///< @return Generated object name. + + /// Parse object name + int lfn_parse_object_name_keyless( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Parse object name + int lfn_parse_object_name_poolless( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Parse object name + int lfn_parse_object_name( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Checks whether short_name is a hashed filename. + bool lfn_is_hashed_filename( + const string &short_name ///< [in] Name to check. + ); ///< @return True if short_name is hashed, False otherwise. + + /// Checks whether long_name must be hashed. + bool lfn_must_hash( + const string &long_name ///< [in] Name to check. + ); ///< @return True if long_name must be hashed, False otherwise. + + /// Generate hashed name. + string lfn_get_short_name( + const ghobject_t &oid, ///< [in] Object for which to generate. + int i ///< [in] Index of hashed name to generate. + ); ///< @return Hashed filename. 
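+ /*
+ * Mangling pipeline, roughly: lfn_generate_object_name() produces the
+ * escaped long name for an oid; when lfn_must_hash() reports that name is
+ * too long for a directory entry, lfn_get_short_name() derives a hashed
+ * short name via hash_filename()/build_filename(), using index i to
+ * resolve hash collisions; lfn_translate() recovers the ghobject_t from a
+ * directory entry of either form.
+ */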
+ + /* other common methods */ + /// Gets the base path + const string &get_base_path(); ///< @return Index base_path + + /// Get full path the subdir + string get_full_path_subdir( + const vector<string> &rel ///< [in] The subdir. + ); ///< @return Full path to rel. + + /// Get full path to object + string get_full_path( + const vector<string> &rel, ///< [in] Path to object. + const string &name ///< [in] Filename of object. + ); ///< @return Fullpath to object at name in rel. + + /// Get mangled path component + string mangle_path_component( + const string &component ///< [in] Component to mangle + ); /// @return Mangled component + + /// Demangle component + string demangle_path_component( + const string &component ///< [in] Subdir name to demangle + ); ///< @return Demangled path component. + + /// Decompose full path into object name and filename. + int decompose_full_path( + const char *in, ///< [in] Full path to object. + vector<string> *out, ///< [out] Path to object at in. + ghobject_t *oid, ///< [out] Object at in. + string *shortname ///< [out] Filename of object at in. + ); ///< @return Error Code, 0 on success. + + /// Mangle attribute name + string mangle_attr_name( + const string &attr ///< [in] Attribute to mangle. + ); ///< @return Mangled attribute name. + + /// checks whether long_name could hash to short_name + bool short_name_matches( + const char *short_name, ///< [in] name to check against + const char *cand_long_name ///< [in] candidate long name + ); + + /// Builds hashed filename + void build_filename( + const char *old_filename, ///< [in] Filename to convert. + int i, ///< [in] Index of hash. + char *filename, ///< [out] Resulting filename. + int len ///< [in] Size of buffer for filename + ); ///< @return Error Code, 0 on success + + /// Get hash of filename + int hash_filename( + const char *filename, ///< [in] Filename to hash. + char *hash, ///< [out] Hash of filename. + int len ///< [in] Size of hash buffer. + ); ///< @return Error Code, 0 on success. 
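+ /*
+ * Illustrative shape of the helpers above: get_full_path({"A","B"}, name)
+ * yields "<base_path>/<mangled A>/<mangled B>/<name>", and
+ * decompose_full_path() splits such a path back into the subdir vector,
+ * the oid and the short filename.
+ */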
+ + friend class TestWrapLFNIndex; +}; +typedef LFNIndex::IndexedPath IndexedPath; + +#endif diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h new file mode 100644 index 00000000..164112ee --- /dev/null +++ b/src/os/filestore/SequencerPosition.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CEPH_OS_SEQUENCERPOSITION_H +#define __CEPH_OS_SEQUENCERPOSITION_H + +#include "include/types.h" +#include "include/cmp.h" +#include "include/encoding.h" +#include "common/Formatter.h" + +#include <ostream> + +/** + * transaction and op offset + */ +struct SequencerPosition { + uint64_t seq; ///< seq + uint32_t trans; ///< transaction in that seq (0-based) + uint32_t op; ///< op in that transaction (0-based) + + SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(seq, bl); + encode(trans, bl); + encode(op, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(seq, p); + decode(trans, p); + decode(op, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("trans", trans); + f->dump_unsigned("op", op); + } + static void generate_test_instances(list<SequencerPosition*>& o) { + o.push_back(new SequencerPosition); + o.push_back(new SequencerPosition(1, 2, 3)); + o.push_back(new SequencerPosition(4, 5, 6)); + } +}; +WRITE_CLASS_ENCODER(SequencerPosition) + +inline ostream& operator<<(ostream& out, const SequencerPosition& t) { + return out << t.seq << "." << t.trans << "." << t.op; +} + +WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op) +WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op) + + +#endif diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc new file mode 100644 index 00000000..ba2ed131 --- /dev/null +++ b/src/os/filestore/WBThrottle.cc @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "acconfig.h" + +#include "os/filestore/WBThrottle.h" +#include "common/perf_counters.h" +#include "common/errno.h" + +WBThrottle::WBThrottle(CephContext *cct) : + cur_ios(0), cur_size(0), + cct(cct), + logger(NULL), + stopping(true), + lock("WBThrottle::lock", false, true, false), + fs(XFS) +{ + { + Mutex::Locker l(lock); + set_from_conf(); + } + ceph_assert(cct); + PerfCountersBuilder b( + cct, string("WBThrottle"), + l_wbthrottle_first, l_wbthrottle_last); + b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(UNIT_BYTES)); + b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations"); + b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations"); + b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write"); + b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries"); + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i) + logger->set(i, 0); + + cct->_conf.add_observer(this); +} + +WBThrottle::~WBThrottle() { + ceph_assert(cct); + cct->get_perfcounters_collection()->remove(logger); + delete logger; + cct->_conf.remove_observer(this); +} + +void WBThrottle::start() +{ + { + Mutex::Locker 
l(lock); + stopping = false; + } + create("wb_throttle"); +} + +void WBThrottle::stop() +{ + { + Mutex::Locker l(lock); + stopping = true; + cond.Signal(); + } + + join(); +} + +const char** WBThrottle::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "filestore_wbthrottle_btrfs_bytes_start_flusher", + "filestore_wbthrottle_btrfs_bytes_hard_limit", + "filestore_wbthrottle_btrfs_ios_start_flusher", + "filestore_wbthrottle_btrfs_ios_hard_limit", + "filestore_wbthrottle_btrfs_inodes_start_flusher", + "filestore_wbthrottle_btrfs_inodes_hard_limit", + "filestore_wbthrottle_xfs_bytes_start_flusher", + "filestore_wbthrottle_xfs_bytes_hard_limit", + "filestore_wbthrottle_xfs_ios_start_flusher", + "filestore_wbthrottle_xfs_ios_hard_limit", + "filestore_wbthrottle_xfs_inodes_start_flusher", + "filestore_wbthrottle_xfs_inodes_hard_limit", + NULL + }; + return KEYS; +} + +void WBThrottle::set_from_conf() +{ + ceph_assert(lock.is_locked()); + if (fs == BTRFS) { + size_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher; + size_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit; + io_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher; + io_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit; + fd_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher; + fd_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit; + } else if (fs == XFS) { + size_limits.first = + cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher; + size_limits.second = + cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit; + io_limits.first = + cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher; + io_limits.second = + cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit; + fd_limits.first = + cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher; + fd_limits.second = + cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit; + } else { + ceph_abort_msg("invalid value for fs"); + } + cond.Signal(); +} + +void WBThrottle::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + Mutex::Locker l(lock); + for (const char** i = get_tracked_conf_keys(); *i; ++i) { + if (changed.count(*i)) { + set_from_conf(); + return; + } + } +} + +bool WBThrottle::get_next_should_flush( + boost::tuple<ghobject_t, FDRef, PendingWB> *next) +{ + ceph_assert(lock.is_locked()); + ceph_assert(next); + while (!stopping && (!beyond_limit() || pending_wbs.empty())) + cond.Wait(lock); + if (stopping) + return false; + ceph_assert(!pending_wbs.empty()); + ghobject_t obj(pop_object()); + + ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i = + pending_wbs.find(obj); + *next = boost::make_tuple(obj, i->second.second, i->second.first); + pending_wbs.erase(i); + return true; +} + + +void *WBThrottle::entry() +{ + Mutex::Locker l(lock); + boost::tuple<ghobject_t, FDRef, PendingWB> wb; + while (get_next_should_flush(&wb)) { + clearing = wb.get<0>(); + cur_ios -= wb.get<2>().ios; + logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios); + logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios); + cur_size -= wb.get<2>().size; + logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size); + logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size); + logger->dec(l_wbthrottle_inodes_dirtied); + logger->inc(l_wbthrottle_inodes_wb); + lock.Unlock(); +#if defined(HAVE_FDATASYNC) + int r = ::fdatasync(**wb.get<1>()); +#else + int r = ::fsync(**wb.get<1>()); +#endif + if (r < 0) { + 
lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +#ifdef HAVE_POSIX_FADVISE + if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) { + int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED); + ceph_assert(fa_r == 0); + } +#endif + lock.Lock(); + clearing = ghobject_t(); + cond.Signal(); + wb = boost::tuple<ghobject_t, FDRef, PendingWB>(); + } + return 0; +} + +void WBThrottle::queue_wb( + FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len, + bool nocache) +{ + Mutex::Locker l(lock); + ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter = + pending_wbs.find(hoid); + if (wbiter == pending_wbs.end()) { + wbiter = pending_wbs.insert( + make_pair(hoid, + make_pair( + PendingWB(), + fd))).first; + logger->inc(l_wbthrottle_inodes_dirtied); + } else { + remove_object(hoid); + } + + cur_ios++; + logger->inc(l_wbthrottle_ios_dirtied); + cur_size += len; + logger->inc(l_wbthrottle_bytes_dirtied, len); + + wbiter->second.first.add(nocache, len, 1); + insert_object(hoid); + if (beyond_limit()) + cond.Signal(); +} + +void WBThrottle::clear() +{ + Mutex::Locker l(lock); + for (ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i = + pending_wbs.begin(); + i != pending_wbs.end(); + ++i) { +#ifdef HAVE_POSIX_FADVISE + if (cct->_conf->filestore_fadvise && i->second.first.nocache) { + int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED); + ceph_assert(fa_r == 0); + } +#endif + + } + cur_ios = cur_size = 0; + logger->set(l_wbthrottle_ios_dirtied, 0); + logger->set(l_wbthrottle_bytes_dirtied, 0); + logger->set(l_wbthrottle_inodes_dirtied, 0); + pending_wbs.clear(); + lru.clear(); + rev_lru.clear(); + cond.Signal(); +} + +void WBThrottle::clear_object(const ghobject_t &hoid) +{ + Mutex::Locker l(lock); + while (clearing == hoid) + cond.Wait(lock); + ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i = + pending_wbs.find(hoid); + if (i == pending_wbs.end()) + return; + + cur_ios -= i->second.first.ios; + logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios); + cur_size -= i->second.first.size; + logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size); + logger->dec(l_wbthrottle_inodes_dirtied); + + pending_wbs.erase(i); + remove_object(hoid); + cond.Signal(); +} + +void WBThrottle::throttle() +{ + Mutex::Locker l(lock); + while (!stopping && need_flush()) + cond.Wait(lock); +} diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h new file mode 100644 index 00000000..ef809ea4 --- /dev/null +++ b/src/os/filestore/WBThrottle.h @@ -0,0 +1,187 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef WBTHROTTLE_H +#define WBTHROTTLE_H + +#include "include/unordered_map.h" +#include <boost/tuple/tuple.hpp> +#include "common/Formatter.h" +#include "common/hobject.h" +#include "include/interval_set.h" +#include "FDCache.h" +#include "common/Thread.h" +#include "common/ceph_context.h" + +class PerfCounters; +enum { + l_wbthrottle_first = 999090, + l_wbthrottle_bytes_dirtied, + l_wbthrottle_bytes_wb, + l_wbthrottle_ios_dirtied, + l_wbthrottle_ios_wb, + l_wbthrottle_inodes_dirtied, + l_wbthrottle_inodes_wb, + l_wbthrottle_last +}; + +/** + * WBThrottle + * + * Tracks, throttles, and flushes outstanding IO + */ +class WBThrottle : Thread, public md_config_obs_t { + ghobject_t clearing; + /* *_limits.first is the start_flusher limit and + * *_limits.second is the hard limit + */ + + /// Limits on unflushed bytes + pair<uint64_t, uint64_t> size_limits; + + /// Limits on unflushed ios + pair<uint64_t, uint64_t> io_limits; + + /// Limits on unflushed objects + pair<uint64_t, uint64_t> fd_limits; + + uint64_t cur_ios; /// Currently unflushed IOs + uint64_t cur_size; /// Currently unflushed bytes + + /** + * PendingWB tracks the ios pending on an object. + */ + class PendingWB { + public: + bool nocache; + uint64_t size; + uint64_t ios; + PendingWB() : nocache(true), size(0), ios(0) {} + void add(bool _nocache, uint64_t _size, uint64_t _ios) { + if (!_nocache) + nocache = false; // only nocache if all writes are nocache + size += _size; + ios += _ios; + } + }; + + CephContext *cct; + PerfCounters *logger; + bool stopping; + Mutex lock; + Cond cond; + + + /** + * Flush objects in lru order + */ + list<ghobject_t> lru; + ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru; + void remove_object(const ghobject_t &oid) { + ceph_assert(lock.is_locked()); + ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator>::iterator iter = + rev_lru.find(oid); + if (iter == rev_lru.end()) + return; + + lru.erase(iter->second); + rev_lru.erase(iter); + } + ghobject_t pop_object() { + ceph_assert(!lru.empty()); + ghobject_t oid(lru.front()); + lru.pop_front(); + rev_lru.erase(oid); + return oid; + } + void insert_object(const ghobject_t &oid) { + ceph_assert(rev_lru.find(oid) == rev_lru.end()); + lru.push_back(oid); + rev_lru.insert(make_pair(oid, --lru.end())); + } + + ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs; + + /// get next flush to perform + bool get_next_should_flush( + boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush + ); ///< @return false if we are shutting down +public: + enum FS { + BTRFS, + XFS + }; + +private: + FS fs; + + void set_from_conf(); + bool beyond_limit() const { + if (cur_ios < io_limits.first && + pending_wbs.size() < fd_limits.first && + cur_size < size_limits.first) + return false; + else + return true; + } + bool need_flush() const { + if (cur_ios < io_limits.second && + pending_wbs.size() < fd_limits.second && + cur_size < size_limits.second) + return false; + else + return true; + } + +public: + explicit WBThrottle(CephContext *cct); + ~WBThrottle() override; + + void start(); + void stop(); + /// Set fs as XFS or BTRFS + void set_fs(FS new_fs) { + Mutex::Locker l(lock); + fs = new_fs; + set_from_conf(); + } + + /// Queue wb on oid, fd taking throttle (does not block) + void queue_wb( + FDRef fd, ///< [in] FDRef to oid + const ghobject_t &oid, ///< [in] object + uint64_t offset, ///< [in] offset written + uint64_t len, ///< [in] length written + bool nocache ///< [in] try to clear out of cache 
after write + ); + + /// Clear all wb (probably due to sync) + void clear(); + + /// Clear object + void clear_object(const ghobject_t &oid); + + /// Block until there is throttle available + void throttle(); + + /// md_config_obs_t + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) override; + + /// Thread + void *entry() override; +}; + +#endif diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc new file mode 100644 index 00000000..1081d146 --- /dev/null +++ b/src/os/filestore/XfsFileStoreBackend.cc @@ -0,0 +1,149 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "XfsFileStoreBackend.h" + +#include <errno.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sys/utsname.h> + +#include <xfs/xfs.h> + +#include "common/errno.h" +#include "common/linux_version.h" +#include "include/ceph_assert.h" +#include "include/compat.h" + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") " + +XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs): + GenericFileStoreBackend(fs), m_has_extsize(false) { } + +/* + * Set extsize attr on a file to val. Should be a free-standing + * function, but dout_prefix expanding to a call to get_basedir_path() + * protected member function won't let it. + */ +int XfsFileStoreBackend::set_extsize(int fd, unsigned int val) +{ + struct fsxattr fsx; + struct stat sb; + int ret; + + if (fstat(fd, &sb) < 0) { + ret = -errno; + dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl; + return ret; + } + if (!S_ISREG(sb.st_mode)) { + dout(0) << "set_extsize: invalid target file type" << dendl; + return -EINVAL; + } + + if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) { + ret = -errno; + dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl; + return ret; + } + + // already set? + if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val) + return 0; + + // xfs won't change extent size if any extents are allocated + if (fsx.fsx_nextents != 0) + return 0; + + fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE; + fsx.fsx_extsize = val; + + if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) { + ret = -errno; + dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl; + return ret; + } + + return 0; +} + +int XfsFileStoreBackend::detect_features() +{ + int ret; + + ret = GenericFileStoreBackend::detect_features(); + if (ret < 0) + return ret; + + // extsize? 
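+ // Probe for extent-size hint support: create a scratch file, unlink it at
+ // once (only the open fd is needed), then try to set a small extsize hint
+ // on it; a failure, or a pre-3.5 kernel, leaves m_has_extsize false.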
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600); + if (fd < 0) { + ret = -errno; + dout(0) << "detect_feature: failed to create test file for extsize attr: " + << cpp_strerror(ret) << dendl; + goto out; + } + if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) { + ret = -errno; + dout(0) << "detect_feature: failed to unlink test file for extsize attr: " + << cpp_strerror(ret) << dendl; + goto out_close; + } + + if (cct()->_conf->filestore_xfs_extsize) { + ret = set_extsize(fd, 1U << 15); // a few pages + if (ret) { + ret = 0; + dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl; + goto out_close; + } + + // make sure we have 3.5 or newer, which includes this fix + // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d + // for this set_extsize bug + // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874 + int ver = get_linux_version(); + if (ver == 0) { + dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl; + m_has_extsize = false; + } else if (ver < KERNEL_VERSION(3, 5, 0)) { + dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl; + m_has_extsize = false; + } else { + dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl; + m_has_extsize = true; + } + } else { + dout(0) << "detect_feature: extsize is disabled by conf" << dendl; + } + +out_close: + TEMP_FAILURE_RETRY(::close(fd)); +out: + return ret; +} + +int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint) +{ + if (!m_has_extsize) + return -EOPNOTSUPP; + + ceph_assert(hint < UINT_MAX); + return set_extsize(fd, hint); +} diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h new file mode 100644 index 00000000..e8b81f9a --- /dev/null +++ b/src/os/filestore/XfsFileStoreBackend.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_XFSFILESTOREBACKEND_H +#define CEPH_XFSFILESTOREBACKEND_H + +#include "GenericFileStoreBackend.h" + +#include "include/int_types.h" + +class XfsFileStoreBackend : public GenericFileStoreBackend { +private: + bool m_has_extsize; + int set_extsize(int fd, unsigned int val); +public: + explicit XfsFileStoreBackend(FileStore *fs); + ~XfsFileStoreBackend() override {} + const char *get_name() override { + return "xfs"; + } + int detect_features() override; + int set_alloc_hint(int fd, uint64_t hint) override; +}; + +#endif /* CEPH_XFSFILESTOREBACKEND_H */ diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc new file mode 100644 index 00000000..e85dbd52 --- /dev/null +++ b/src/os/filestore/ZFSFileStoreBackend.cc @@ -0,0 +1,258 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/int_types.h" +#include "include/types.h" + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> + +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/ceph_assert.h" + +#include <iostream> +#include <fstream> +#include <sstream> + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" + +#include "ZFSFileStoreBackend.h" + +#define dout_context cct() +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") " + +ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) : + GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL), + m_filestore_zfs_snap(cct()->_conf->filestore_zfs_snap) +{ + int ret = zfs.init(); + if (ret < 0) { + dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl; + return; + } + + base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (!base_zh) { + dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl; + return; + } + + update_current_zh(); +} + +ZFSFileStoreBackend::~ZFSFileStoreBackend() +{ + if (base_zh) + zfs.close(base_zh); + if (current_zh) + zfs.close(current_zh); +} + +int ZFSFileStoreBackend::update_current_zh() +{ + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM); + if (zh) { + char *mnt; + if (zfs.is_mounted(zh, &mnt)) { + int ret = get_current_path() == mnt; + free(mnt); + if (ret) { + current_zh = zh; + return 0; + } + } else { + int ret = zfs.mount(zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh) + << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + zfs.close(zh); + } else { + dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (zh) { + if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) { + current_zh = zh; + return 0; + } + zfs.close(zh); + dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl; + } else { + dout(0) << "update_current_zh: current/ not exist" << dendl; + } + return -ENOENT; +} + +int ZFSFileStoreBackend::detect_features() +{ + if (!current_zh) + dout(0) << "detect_features: null zfs handle for current/" << dendl; + return 
0; +} + +bool ZFSFileStoreBackend::can_checkpoint() +{ + return m_filestore_zfs_snap && current_zh != NULL; +} + +int ZFSFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "create_current: current/ exists but is not a directory" << dendl; + return -ENOTDIR; + } + return 0; + } else if (errno != ENOENT) { + ret = -errno; + dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ret = zfs.create(path, ZFS::TYPE_FILESYSTEM); + if (ret < 0 && errno != EEXIST) { + ret = -errno; + dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + + ret = update_current_zh(); + return ret; +} + +static int list_checkpoints_callback(ZFS::Handle *zh, void *data) +{ + list<string> *ls = static_cast<list<string> *>(data); + string str = ZFS::get_name(zh); + size_t pos = str.find('@'); + ceph_assert(pos != string::npos && pos + 1 != str.length()); + ls->push_back(str.substr(pos + 1)); + return 0; +} + +int ZFSFileStoreBackend::list_checkpoints(list<string>& ls) +{ + dout(10) << "list_checkpoints:" << dendl; + if (!current_zh) + return -EINVAL; + + list<string> snaps; + int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl; + return ret; + } + ls.swap(snaps); + return 0; +} + +int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // looks like zfsonlinux doesn't flush dirty data when taking snapshot + int ret = sync_filesystem(get_current_fd()); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + ret = zfs.snapshot(path, false); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << dendl; + return ret; + } + if (cid) + *cid = 0; + return 0; +} + +int ZFSFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // umount current to avoid triggering online rollback deadlock + int ret; + if (zfs.is_mounted(current_zh, NULL)) { + ret = zfs.umount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl; + } + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + + ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT); + if (!snap_zh) { + dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + ret = zfs.rollback(current_zh, snap_zh, false); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl; + } + + if (!zfs.is_mounted(current_zh, NULL)) { + int ret = zfs.mount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << 
zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + + zfs.close(snap_zh); + return ret; +} + +int ZFSFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + int ret = zfs.destroy_snaps(current_zh, name.c_str(), true); + if (ret < 0) { + ret = -errno; + dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl; + } + return ret; +} diff --git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h new file mode 100644 index 00000000..b1fa9887 --- /dev/null +++ b/src/os/filestore/ZFSFileStoreBackend.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_ZFSFILESTOREBACKEND_H +#define CEPH_ZFSFILESTOREBACKEND_H + +#ifdef HAVE_LIBZFS +#include "GenericFileStoreBackend.h" +#include "os/fs/ZFS.h" + +class ZFSFileStoreBackend : public GenericFileStoreBackend { +private: + ZFS zfs; + ZFS::Handle *base_zh; + ZFS::Handle *current_zh; + bool m_filestore_zfs_snap; + int update_current_zh(); +public: + explicit ZFSFileStoreBackend(FileStore *fs); + ~ZFSFileStoreBackend(); + const char *get_name() override { + return "zfs"; + } + int detect_features(); + bool can_checkpoint(); + int create_current(); + int list_checkpoints(list<string>& ls); + int create_checkpoint(const string& name, uint64_t *cid); + int rollback_to(const string& name); + int destroy_checkpoint(const string& name); +}; +#endif +#endif diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc new file mode 100644 index 00000000..e4dedd29 --- /dev/null +++ b/src/os/filestore/chain_xattr.cc @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "chain_xattr.h" +#include <errno.h> // for ERANGE, ENODATA, ENOMEM +#include <stdio.h> // for size_t, snprintf +#include <stdlib.h> // for free, malloc +#include <string.h> // for strcpy, strlen +#include "include/ceph_assert.h" // for assert +#include "include/buffer.h" + +#if defined(__linux__) +#include <linux/fs.h> +#endif + +#include "include/ceph_assert.h" + +/* + * chaining xattrs + * + * In order to support xattrs that are larger than the xattr size limit that some file systems + * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys + * are set as follows: + * The first xattr in the chain, has a key that holds the original xattr name, with any '@' char + * being esacped ("@@"). + * The chained keys will have the first xattr's key (with the escaping), and a suffix: "@<id>" + * where <id> marks the num of xattr in the chain. 
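+ * For example, a value for "user.foo" that spans three chunks is stored
+ * under the keys "user.foo", "user.foo@1" and "user.foo@2"; a name that
+ * itself contains '@', such as "user.a@b", is escaped to "user.a@@b",
+ * "user.a@@b@1", and so on.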
+ */ + +void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len) +{ + int pos = 0; + + while (*name) { + switch (*name) { + case '@': /* escape it */ + pos += 2; + ceph_assert (pos < raw_len - 1); + *raw_name = '@'; + raw_name++; + *raw_name = '@'; + break; + default: + pos++; + ceph_assert(pos < raw_len - 1); + *raw_name = *name; + break; + } + name++; + raw_name++; + } + + if (!i) { + *raw_name = '\0'; + } else { + int r = snprintf(raw_name, raw_len - pos, "@%d", i); + ceph_assert(r < raw_len - pos); + } +} + +static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first) +{ + int pos = 0; + + *is_first = true; + while (*raw_name) { + switch (*raw_name) { + case '@': /* escape it */ + raw_name++; + if (!*raw_name) + break; + if (*raw_name != '@') { + *is_first = false; + goto done; + } + + /* fall through */ + default: + *name = *raw_name; + break; + } + pos++; + ceph_assert(pos < name_len); + name++; + raw_name++; + } +done: + *name = '\0'; + return pos; +} + + +// setxattr + +static int getxattr_len(const char *fn, const char *name) +{ + int i = 0, total = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_getxattr(fn, raw_name, 0, 0); + if (!i && r < 0) + return r; + if (r < 0) + break; + total += r; + i++; + } while (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN); + + return total; +} + +int chain_getxattr(const char *fn, const char *name, void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + int r; + size_t chunk_size; + + if (!size) + return getxattr_len(fn, name); + + do { + chunk_size = size; + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + + r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size); + if (i && r == -ENODATA) { + ret = pos; + break; + } + if (r < 0) { + ret = r; + break; + } + + if (r > 0) { + pos += r; + size -= r; + } + + i++; + } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN)); + + if (r >= 0) { + ret = pos; + /* is there another chunk? that can happen if the last read size span over + exactly one block */ + if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN || + chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_getxattr(fn, raw_name, 0, 0); + if (r > 0) { // there's another chunk.. 
the original buffer was too small + ret = -ERANGE; + } + } + } + return ret; +} + +int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp) +{ + size_t size = 1024; // Initial + while (1) { + bufferptr buf(size); + int r = chain_getxattr( + fn, + name, + buf.c_str(), + size); + if (r > 0) { + buf.set_length(r); + if (bp) + bp->swap(buf); + return r; + } else if (r == 0) { + return 0; + } else { + if (r == -ERANGE) { + size *= 2; + } else { + return r; + } + } + } + ceph_abort_msg("unreachable"); + return 0; +} + +static int chain_fgetxattr_len(int fd, const char *name) +{ + int i = 0, total = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fgetxattr(fd, raw_name, 0, 0); + if (!i && r < 0) + return r; + if (r < 0) + break; + total += r; + i++; + } while (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN); + + return total; +} + +int chain_fgetxattr(int fd, const char *name, void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + int r; + size_t chunk_size; + + if (!size) + return chain_fgetxattr_len(fd, name); + + do { + chunk_size = size; + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + + r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size); + if (i && r == -ENODATA) { + ret = pos; + break; + } + if (r < 0) { + ret = r; + break; + } + + if (r > 0) { + pos += r; + size -= r; + } + + i++; + } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN)); + + if (r >= 0) { + ret = pos; + /* is there another chunk? that can happen if the last read size span over + exactly one block */ + if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN || + chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fgetxattr(fd, raw_name, 0, 0); + if (r > 0) { // there's another chunk.. the original buffer was too small + ret = -ERANGE; + } + } + } + return ret; +} + + +// setxattr + +int get_xattr_block_size(size_t size) +{ + if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD) + // this may fit in the inode; stripe over short attrs so that XFS + // won't kick it out. 
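+ // For example, a 600-byte value (below CHAIN_XATTR_SHORT_LEN_THRESHOLD)
+ // is written by chain_setxattr() in 250-byte chunks, while a 5000-byte
+ // value is split into 2048-byte blocks (2048 + 2048 + 904).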
+ return CHAIN_XATTR_SHORT_BLOCK_LEN; + return CHAIN_XATTR_MAX_BLOCK_LEN; +} + +// removexattr + +int chain_removexattr(const char *fn, const char *name) +{ + int i = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_removexattr(fn, raw_name); + if (!i && r < 0) { + return r; + } + i++; + } while (r >= 0); + return 0; +} + +int chain_fremovexattr(int fd, const char *name) +{ + int i = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fremovexattr(fd, raw_name); + if (!i && r < 0) { + return r; + } + i++; + } while (r >= 0); + return 0; +} + + +// listxattr + +int chain_listxattr(const char *fn, char *names, size_t len) { + int r; + + if (!len) + return sys_listxattr(fn, names, len) * 2; + + r = sys_listxattr(fn, 0, 0); + if (r < 0) + return r; + + size_t total_len = r * 2; // should be enough + char *full_buf = (char *)malloc(total_len); + if (!full_buf) + return -ENOMEM; + + r = sys_listxattr(fn, full_buf, total_len); + if (r < 0) { + free(full_buf); + return r; + } + + char *p = full_buf; + const char *end = full_buf + r; + char *dest = names; + char *dest_end = names + len; + + while (p < end) { + char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int attr_len = strlen(p); + bool is_first; + int name_len = translate_raw_name(p, name, sizeof(name), &is_first); + if (is_first) { + if (dest + name_len > dest_end) { + r = -ERANGE; + goto done; + } + strcpy(dest, name); + dest += name_len + 1; + } + p += attr_len + 1; + } + r = dest - names; + +done: + free(full_buf); + return r; +} + +int chain_flistxattr(int fd, char *names, size_t len) { + int r; + char *p; + const char * end; + char *dest; + char *dest_end; + + if (!len) + return sys_flistxattr(fd, names, len) * 2; + + r = sys_flistxattr(fd, 0, 0); + if (r < 0) + return r; + + size_t total_len = r * 2; // should be enough + char *full_buf = (char *)malloc(total_len); + if (!full_buf) + return -ENOMEM; + + r = sys_flistxattr(fd, full_buf, total_len); + if (r < 0) + goto done; + + p = full_buf; + end = full_buf + r; + dest = names; + dest_end = names + len; + + while (p < end) { + char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int attr_len = strlen(p); + bool is_first; + int name_len = translate_raw_name(p, name, sizeof(name), &is_first); + if (is_first) { + if (dest + name_len > dest_end) { + r = -ERANGE; + goto done; + } + strcpy(dest, name); + dest += name_len + 1; + } + p += attr_len + 1; + } + r = dest - names; + +done: + free(full_buf); + return r; +} diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h new file mode 100644 index 00000000..a2d17fa6 --- /dev/null +++ b/src/os/filestore/chain_xattr.h @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CEPH_OSD_CHAIN_XATTR_H +#define __CEPH_OSD_CHAIN_XATTR_H + +#include "include/compat.h" +#include <errno.h> +#include <stdio.h> +#include "common/xattr.h" +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" + +#if defined(__linux__) +#include <linux/limits.h> +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2) +#elif defined(__APPLE__) +#include <sys/xattr.h> +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2) +#else +#define CHAIN_XATTR_MAX_NAME_LEN 128 +#endif + +#define CHAIN_XATTR_MAX_BLOCK_LEN 2048 + +/* + * XFS will only inline xattrs < 255 bytes, so for xattrs that 
are + * likely to fit in the inode, stripe over short xattrs. + */ +#define CHAIN_XATTR_SHORT_BLOCK_LEN 250 +#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000 + +// wrappers to hide annoying errno handling. + +static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size) +{ + int r = ::ceph_os_fgetxattr(fd, name, val, size); + return (r < 0 ? -errno : r); +} +static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size) +{ + int r = ::ceph_os_getxattr(fn, name, val, size); + return (r < 0 ? -errno : r); +} + +static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size) +{ + int r = ::ceph_os_setxattr(fn, name, val, size); + return (r < 0 ? -errno : r); +} +static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size) +{ + int r = ::ceph_os_fsetxattr(fd, name, val, size); + return (r < 0 ? -errno : r); +} + +static inline int sys_listxattr(const char *fn, char *names, size_t len) +{ + int r = ::ceph_os_listxattr(fn, names, len); + return (r < 0 ? -errno : r); +} +static inline int sys_flistxattr(int fd, char *names, size_t len) +{ + int r = ::ceph_os_flistxattr(fd, names, len); + return (r < 0 ? -errno : r); +} + +static inline int sys_removexattr(const char *fn, const char *name) +{ + int r = ::ceph_os_removexattr(fn, name); + return (r < 0 ? -errno : r); +} +static inline int sys_fremovexattr(int fd, const char *name) +{ + int r = ::ceph_os_fremovexattr(fd, name); + return (r < 0 ? -errno : r); +} + + +// wrappers to chain large values across multiple xattrs + +int chain_getxattr(const char *fn, const char *name, void *val, size_t size); +int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp); +int chain_fgetxattr(int fd, const char *name, void *val, size_t size); + +int get_xattr_block_size(size_t size); +void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len); + +template <bool skip_chain_cleanup=false, bool ensure_single_attr=false> +int chain_setxattr( + const char *fn, const char *name, const void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + size_t max_chunk_size = + ensure_single_attr ? size : get_xattr_block_size(size); + + static_assert( + !skip_chain_cleanup || ensure_single_attr, + "skip_chain_cleanup must imply ensure_single_attr"); + + do { + size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + size -= chunk_size; + + int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size); + if (r < 0) { + ret = r; + break; + } + pos += chunk_size; + ret = pos; + i++; + ceph_assert(size == 0 || !ensure_single_attr); + } while (size); + + if (ret >= 0 && !skip_chain_cleanup) { + int r; + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_removexattr(fn, raw_name); + if (r < 0 && r != -ENODATA) + ret = r; + i++; + } while (r != -ENODATA); + } + + return ret; +} + +template <bool skip_chain_cleanup=false, bool ensure_single_attr=false> +int chain_fsetxattr( + int fd, const char *name, const void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + size_t max_chunk_size = + ensure_single_attr ? size : get_xattr_block_size(size); + + static_assert( + !skip_chain_cleanup || ensure_single_attr, + "skip_chain_cleanup must imply ensure_single_attr"); + + do { + size_t chunk_size = (size < max_chunk_size ? 
size : max_chunk_size); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + size -= chunk_size; + + int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size); + if (r < 0) { + ret = r; + break; + } + pos += chunk_size; + ret = pos; + i++; + ceph_assert(size == 0 || !ensure_single_attr); + } while (size); + + if (ret >= 0 && !skip_chain_cleanup) { + int r; + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fremovexattr(fd, raw_name); + if (r < 0 && r != -ENODATA) + ret = r; + i++; + } while (r != -ENODATA); + } + + return ret; +} + +int chain_listxattr(const char *fn, char *names, size_t len); +int chain_flistxattr(int fd, char *names, size_t len); +int chain_removexattr(const char *fn, const char *name); +int chain_fremovexattr(int fd, const char *name); + +#endif diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc new file mode 100644 index 00000000..c40fd0de --- /dev/null +++ b/src/os/fs/FS.cc @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#ifdef __linux__ +#include <linux/falloc.h> +#endif + +#include "FS.h" + +#include "acconfig.h" + +#ifdef HAVE_LIBXFS +#include "XFS.h" +#endif + +#if defined(__APPLE__) || defined(__FreeBSD__) +#include <sys/mount.h> +#else +#include <sys/vfs.h> +#endif +#include "include/compat.h" + +// --------------- + +FS *FS::create(uint64_t f_type) +{ + switch (f_type) { +#ifdef HAVE_LIBXFS + case XFS_SUPER_MAGIC: + return new XFS; +#endif + default: + return new FS; + } +} + +FS *FS::create_by_fd(int fd) +{ + struct statfs st; + ::fstatfs(fd, &st); + return create(st.f_type); +} + +// --------------- + +int FS::set_alloc_hint(int fd, uint64_t hint) +{ + return 0; // no-op +} + +#ifdef HAVE_NAME_TO_HANDLE_AT +int FS::get_handle(int fd, std::string *h) +{ + char buf[sizeof(struct file_handle) + MAX_HANDLE_SZ]; + struct file_handle *fh = (struct file_handle *)buf; + int mount_id; + + fh->handle_bytes = MAX_HANDLE_SZ; + int r = name_to_handle_at(fd, "", fh, &mount_id, AT_EMPTY_PATH); + if (r < 0) { + return -errno; + } + *h = std::string(buf, fh->handle_bytes + sizeof(struct file_handle)); + return 0; +} + +int FS::open_handle(int mount_fd, const std::string& h, int flags) +{ + if (h.length() < sizeof(struct file_handle)) { + return -EINVAL; + } + struct file_handle *fh = (struct file_handle *)h.data(); + if (fh->handle_bytes > h.length()) { + return -ERANGE; + } + int fd = open_by_handle_at(mount_fd, fh, flags); + if (fd < 0) + return -errno; + return fd; +} + +#else // HAVE_NAME_TO_HANDLE_AT + +int FS::get_handle(int fd, std::string *h) +{ + return -EOPNOTSUPP; +} + +int FS::open_handle(int mount_fd, const std::string& h, int flags) +{ + return -EOPNOTSUPP; +} + +#endif // HAVE_NAME_TO_HANDLE_AT + +int FS::copy_file_range(int to_fd, uint64_t to_offset, + int from_fd, + uint64_t from_offset, uint64_t from_len) +{ + ceph_abort_msg("write me"); +} + +int FS::zero(int fd, uint64_t offset, uint64_t length) +{ + int r; + + /* + + From the fallocate(2) man page: + + Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38) + in mode deallocates space 
(i.e., creates a hole) in the byte range + starting at offset and continuing for len bytes. Within the specified + range, partial filesystem blocks are zeroed, and whole filesystem + blocks are removed from the file. After a successful call, subsequent + reads from this range will return zeroes. + + The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in + mode; in other words, even when punching off the end of the file, the + file size (as reported by stat(2)) does not change. + + Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem + doesn't support the operation, an error is returned. The operation is + supported on at least the following filesystems: + + * XFS (since Linux 2.6.38) + + * ext4 (since Linux 3.0) + + * Btrfs (since Linux 3.7) + + * tmpfs (since Linux 3.5) + + So: we only do this is PUNCH_HOLE *and* KEEP_SIZE are defined. + + */ +#if !defined(__APPLE__) && !defined(__FreeBSD__) +# ifdef CEPH_HAVE_FALLOCATE +# ifdef FALLOC_FL_KEEP_SIZE + // first try fallocate + r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length); + if (r < 0) { + r = -errno; + } + if (r != -EOPNOTSUPP) { + goto out; // a real error + } + // if that failed (-EOPNOTSUPP), fall back to writing zeros. +# endif +# endif +#endif + + { + // fall back to writing zeros + bufferlist bl; + bl.append_zero(length); + r = ::lseek64(fd, offset, SEEK_SET); + if (r < 0) { + r = -errno; + goto out; + } + r = bl.write_fd(fd); + } + + out: + return r; +} + +// --------------- + diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h new file mode 100644 index 00000000..aafa64e5 --- /dev/null +++ b/src/os/fs/FS.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OS_FS_H +#define CEPH_OS_FS_H + +#include <errno.h> +#include <time.h> + +#include <string> + +#include "include/types.h" +#include "common/Mutex.h" +#include "common/Cond.h" + +class FS { +public: + virtual ~FS() { } + + static FS *create(uint64_t f_type); + static FS *create_by_fd(int fd); + + virtual const char *get_name() { + return "generic"; + } + + virtual int set_alloc_hint(int fd, uint64_t hint); + + virtual int get_handle(int fd, std::string *h); + virtual int open_handle(int mount_fd, const std::string& h, int flags); + + virtual int copy_file_range(int to_fd, uint64_t to_offset, + int from_fd, + uint64_t from_offset, uint64_t from_len); + virtual int zero(int fd, uint64_t offset, uint64_t length); + + // -- aio -- +}; + +#endif diff --git a/src/os/fs/XFS.cc b/src/os/fs/XFS.cc new file mode 100644 index 00000000..c72ee1a0 --- /dev/null +++ b/src/os/fs/XFS.cc @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "XFS.h" + +#include <xfs/xfs.h> + +int XFS::set_alloc_hint(int fd, uint64_t val) +{ + struct fsxattr fsx; + struct stat sb; + int ret; + + if (fstat(fd, &sb) < 0) { + ret = -errno; + return ret; + } + if (!S_ISREG(sb.st_mode)) { + return -EINVAL; + } + + if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) { + ret = -errno; + return ret; + } + + // already set? + if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val) + return 0; + + // xfs won't change extent size if any extents are allocated + if (fsx.fsx_nextents != 0) + return 0; + + fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE; + fsx.fsx_extsize = val; + + if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) { + ret = -errno; + return ret; + } + + return 0; +} diff --git a/src/os/fs/XFS.h b/src/os/fs/XFS.h new file mode 100644 index 00000000..f0ea717e --- /dev/null +++ b/src/os/fs/XFS.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OS_XFS_H +#define CEPH_OS_XFS_H + +#include "FS.h" + +# ifndef XFS_SUPER_MAGIC +#define XFS_SUPER_MAGIC 0x58465342 +# endif + +class XFS : public FS { + const char *get_name() override { + return "xfs"; + } + int set_alloc_hint(int fd, uint64_t hint) override; +}; + +#endif diff --git a/src/os/fs/ZFS.cc b/src/os/fs/ZFS.cc new file mode 100644 index 00000000..02520796 --- /dev/null +++ b/src/os/fs/ZFS.cc @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#define HAVE_IOCTL_IN_SYS_IOCTL_H +#include <libzfs.h> +#include "ZFS.h" + +const int ZFS::TYPE_FILESYSTEM = ZFS_TYPE_FILESYSTEM; +const int ZFS::TYPE_SNAPSHOT = ZFS_TYPE_SNAPSHOT; +const int ZFS::TYPE_VOLUME = ZFS_TYPE_VOLUME; +const int ZFS::TYPE_DATASET = ZFS_TYPE_DATASET; + +ZFS::~ZFS() +{ + if (g_zfs) + ::libzfs_fini((libzfs_handle_t*)g_zfs); +} + +int ZFS::init() +{ + g_zfs = ::libzfs_init(); + return g_zfs ? 
0 : -EINVAL; +} + +ZFS::Handle *ZFS::open(const char *n, int t) +{ + return (ZFS::Handle*)::zfs_open((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t); +} + +void ZFS::close(ZFS::Handle *h) +{ + ::zfs_close((zfs_handle_t*)h); +} + +const char *ZFS::get_name(ZFS::Handle *h) +{ + return ::zfs_get_name((zfs_handle_t*)h); +} + +ZFS::Handle *ZFS::path_to_zhandle(const char *p, int t) +{ + return ::zfs_path_to_zhandle((libzfs_handle_t*)g_zfs, (char *)p, (zfs_type_t)t); +} + +int ZFS::create(const char *n, int t) +{ + return ::zfs_create((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t, NULL); +} + +int ZFS::snapshot(const char *n, bool r) +{ + return ::zfs_snapshot((libzfs_handle_t*)g_zfs, n, (boolean_t)r, NULL); +} + +int ZFS::rollback(ZFS::Handle *h, ZFS::Handle *snap, bool f) +{ + return ::zfs_rollback((zfs_handle_t*)h, (zfs_handle_t*)snap, (boolean_t)f); +} + +int ZFS::destroy_snaps(ZFS::Handle *h, const char *n, bool d) +{ + return ::zfs_destroy_snaps((zfs_handle_t*)h, (char *)n, (boolean_t)d); +} + +bool ZFS::is_mounted(ZFS::Handle *h, char **p) +{ + return (bool)::zfs_is_mounted((zfs_handle_t*)h, p); +} + +int ZFS::mount(ZFS::Handle *h, const char *o, int f) +{ + return ::zfs_mount((zfs_handle_t*)h, o, f); +} + +int ZFS::umount(ZFS::Handle *h, const char *o, int f) +{ + return ::zfs_unmount((zfs_handle_t*)h, o, f); +} + +int ZFS::iter_snapshots_sorted(ZFS::Handle *h, ZFS::iter_func f, void *d) +{ + return ::zfs_iter_snapshots_sorted((zfs_handle_t*)h, (zfs_iter_f)f, d); +} diff --git a/src/os/fs/ZFS.h b/src/os/fs/ZFS.h new file mode 100644 index 00000000..3ebe1110 --- /dev/null +++ b/src/os/fs/ZFS.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_ZFS_H +#define CEPH_ZFS_H + +// Simple wrapper to hide libzfs.h. (it conflicts with standard linux headers) +class ZFS { + void *g_zfs; +public: + + static const int TYPE_FILESYSTEM; + static const int TYPE_SNAPSHOT; + static const int TYPE_VOLUME; + static const int TYPE_POOL; + static const int TYPE_DATASET; + + typedef void Handle; + typedef int (*iter_func)(Handle *, void *); + + static const char *get_name(Handle *); + + ZFS() : g_zfs(NULL) {} + ~ZFS(); + int init(); + Handle *open(const char *, int); + void close(Handle *); + Handle *path_to_zhandle(const char *, int); + int create(const char *, int); + int snapshot(const char *, bool); + int rollback(Handle *, Handle *, bool); + int destroy_snaps(Handle *, const char *, bool); + int iter_snapshots_sorted(Handle *, iter_func, void *); + int mount(Handle *, const char *, int); + int umount(Handle *, const char *, int); + bool is_mounted(Handle *, char **); +}; + +#endif diff --git a/src/os/fs/btrfs_ioctl.h b/src/os/fs/btrfs_ioctl.h new file mode 100644 index 00000000..277498ca --- /dev/null +++ b/src/os/fs/btrfs_ioctl.h @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __IOCTL_ +#define __IOCTL_ + +#if defined(__linux__) +#include <linux/ioctl.h> +#elif defined(__FreeBSD__) +#include <sys/ioctl.h> +#endif + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 + +/* this should be 4k */ +#define BTRFS_PATH_NAME_MAX 4087 +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) + +#define BTRFS_SUBVOL_NAME_MAX 4039 +struct btrfs_ioctl_vol_args_v2 { + __s64 fd; + __u64 transid; + __u64 flags; + __u64 unused[4]; + char name[BTRFS_SUBVOL_NAME_MAX + 1]; +}; + +#define BTRFS_INO_LOOKUP_PATH_MAX 4080 +struct btrfs_ioctl_ino_lookup_args { + __u64 treeid; + __u64 objectid; + char name[BTRFS_INO_LOOKUP_PATH_MAX]; +}; + +struct btrfs_ioctl_search_key { + /* which root are we searching. 0 is the tree of tree roots */ + __u64 tree_id; + + /* keys returned will be >= min and <= max */ + __u64 min_objectid; + __u64 max_objectid; + + /* keys returned will be >= min and <= max */ + __u64 min_offset; + __u64 max_offset; + + /* max and min transids to search for */ + __u64 min_transid; + __u64 max_transid; + + /* keys returned will be >= min and <= max */ + __u32 min_type; + __u32 max_type; + + /* + * how many items did userland ask for, and how many are we + * returning + */ + __u32 nr_items; + + /* align to 64 bits */ + __u32 unused; + + /* some extra for later */ + __u64 unused1; + __u64 unused2; + __u64 unused3; + __u64 unused4; +}; + +struct btrfs_ioctl_search_header { + __u64 transid; + __u64 objectid; + __u64 offset; + __u32 type; + __u32 len; +}; + +#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) +/* + * the buf is an array of search headers where + * each header is followed by the actual item + * the type field is expanded to 32 bits for alignment + */ +struct btrfs_ioctl_search_args { + struct btrfs_ioctl_search_key key; + char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; +}; + +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +/* flags for the defrag range ioctl */ +#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +#define BTRFS_DEFRAG_RANGE_START_IO 2 + +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. 
Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* spare for later */ + __u32 unused[5]; +}; + +struct btrfs_ioctl_space_info { + __u64 flags; + __u64 total_bytes; + __u64 used_bytes; +}; + +struct btrfs_ioctl_space_args { + __u64 space_slots; + __u64 total_spaces; + struct btrfs_ioctl_space_info spaces[0]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ + struct btrfs_ioctl_defrag_range_args) +#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args) +#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ + struct btrfs_ioctl_ino_lookup_args) +#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) +#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ + struct btrfs_ioctl_space_args) +#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) +#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) +#endif diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc new file mode 100644 index 00000000..bc352bb0 --- /dev/null +++ b/src/os/kstore/KStore.cc @@ -0,0 +1,3436 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#if defined(__FreeBSD__) +#include <sys/param.h> +#include <sys/mount.h> +#endif + +#include "KStore.h" +#include "osd/osd_types.h" +#include "os/kv.h" +#include "include/compat.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "common/Formatter.h" + + +#define dout_context cct +#define dout_subsys ceph_subsys_kstore + +/* + + TODO: + + * superblock, features + * refcounted extents (for efficient clone) + + */ + +const string PREFIX_SUPER = "S"; // field -> value +const string PREFIX_COLL = "C"; // collection name -> (nothing) +const string PREFIX_OBJ = "O"; // object name -> onode +const string PREFIX_DATA = "D"; // nid + offset -> data +const string PREFIX_OMAP = "M"; // u64 + keyname -> value + +/* + * object name key structure + * + * 2 chars: shard (-- for none, or hex digit, so that we sort properly) + * encoded u64: poolid + 2^63 (so that it sorts properly) + * encoded u32: hash (bit reversed) + * + * 1 char: '.' + * + * escaped string: namespace + * + * 1 char: '<', '=', or '>'. if =, then object key == object name, and + * we are followed just by the key. otherwise, we are followed by + * the key and then the object name. + * escaped string: key + * escaped string: object name (unless '=' above) + * + * encoded u64: snap + * encoded u64: generation + */ + +/* + * string encoding in the key + * + * The key string needs to lexicographically sort the same way that + * ghobject_t does. We do this by escaping anything <= to '#' with # + * plus a 2 digit hex string, and anything >= '~' with ~ plus the two + * hex digits. + * + * We use ! as a terminator for strings; this works because it is < # + * and will get escaped if it is present in the string. 
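+ *
+ * For example (see append_escaped() below), the string "a b#c" is written
+ * as "a#20b#23c!": the space (0x20) and '#' (0x23) are both <= '#' and get
+ * #-escaped, every other byte is copied through unchanged, and '!'
+ * terminates the string.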
+ * + */ + +static void append_escaped(const string &in, string *out) +{ + char hexbyte[8]; + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if ((unsigned char)*i <= '#') { + snprintf(hexbyte, sizeof(hexbyte), "#%02x", (uint8_t)*i); + out->append(hexbyte); + } else if ((unsigned char)*i >= '~') { + snprintf(hexbyte, sizeof(hexbyte), "~%02x", (uint8_t)*i); + out->append(hexbyte); + } else { + out->push_back(*i); + } + } + out->push_back('!'); +} + +static int decode_escaped(const char *p, string *out) +{ + const char *orig_p = p; + while (*p && *p != '!') { + if (*p == '#' || *p == '~') { + unsigned hex; + int r = sscanf(++p, "%2x", &hex); + if (r < 1) + return -EINVAL; + out->push_back((char)hex); + p += 2; + } else { + out->push_back(*p++); + } + } + return p - orig_p; +} + +// some things we encode in binary (as le32 or le64); print the +// resulting key strings nicely +static string pretty_binary_string(const string& in) +{ + char buf[10]; + string out; + out.reserve(in.length() * 3); + enum { NONE, HEX, STRING } mode = NONE; + unsigned from = 0, i; + for (i=0; i < in.length(); ++i) { + if ((in[i] < 32 || (unsigned char)in[i] > 126) || + (mode == HEX && in.length() - i >= 4 && + ((in[i] < 32 || (unsigned char)in[i] > 126) || + (in[i+1] < 32 || (unsigned char)in[i+1] > 126) || + (in[i+2] < 32 || (unsigned char)in[i+2] > 126) || + (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) { + if (mode == STRING) { + out.append(in.substr(from, i - from)); + out.push_back('\''); + } + if (mode != HEX) { + out.append("0x"); + mode = HEX; + } + if (in.length() - i >= 4) { + // print a whole u32 at once + snprintf(buf, sizeof(buf), "%08x", + (uint32_t)(((unsigned char)in[i] << 24) | + ((unsigned char)in[i+1] << 16) | + ((unsigned char)in[i+2] << 8) | + ((unsigned char)in[i+3] << 0))); + i += 3; + } else { + snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]); + } + out.append(buf); + } else { + if (mode != STRING) { + out.push_back('\''); + mode = STRING; + from = i; + } + } + } + if (mode == STRING) { + out.append(in.substr(from, i - from)); + out.push_back('\''); + } + return out; +} + +static void _key_encode_shard(shard_id_t shard, string *key) +{ + // make field ordering match with ghobject_t compare operations + if (shard == shard_id_t::NO_SHARD) { + // otherwise ff will sort *after* 0, not before. 
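+    // ('-' is 0x2d and sorts before '0'..'f', so "--" always sorts ahead
+    // of any two-hex-digit shard id, matching NO_SHARD < shard 0.)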
+ key->append("--"); + } else { + char buf[32]; + snprintf(buf, sizeof(buf), "%02x", (int)shard); + key->append(buf); + } +} +static const char *_key_decode_shard(const char *key, shard_id_t *pshard) +{ + if (key[0] == '-') { + *pshard = shard_id_t::NO_SHARD; + } else { + unsigned shard; + int r = sscanf(key, "%x", &shard); + if (r < 1) + return NULL; + *pshard = shard_id_t(shard); + } + return key + 2; +} + +static void get_coll_key_range(const coll_t& cid, int bits, + string *temp_start, string *temp_end, + string *start, string *end) +{ + temp_start->clear(); + temp_end->clear(); + start->clear(); + end->clear(); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + _key_encode_shard(pgid.shard, start); + *end = *start; + *temp_start = *start; + *temp_end = *start; + + _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start); + _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start); + _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), start); + _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), temp_start); + start->append("."); + temp_start->append("."); + + _key_encode_u64(pgid.pool() + 0x8000000000000000ull, end); + _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_end); + + uint64_t end_hash = + hobject_t::_reverse_bits(pgid.ps()) + (1ull << (32-bits)); + if (end_hash <= 0xffffffffull) { + _key_encode_u32(end_hash, end); + _key_encode_u32(end_hash, temp_end); + end->append("."); + temp_end->append("."); + } else { + _key_encode_u32(0xffffffff, end); + _key_encode_u32(0xffffffff, temp_end); + end->append(":"); + temp_end->append(":"); + } + } else { + _key_encode_shard(shard_id_t::NO_SHARD, start); + _key_encode_u64(-1ull + 0x8000000000000000ull, start); + *end = *start; + _key_encode_u32(0, start); + start->append("."); + _key_encode_u32(0xffffffff, end); + end->append(":"); + + // no separate temp section + *temp_start = *end; + *temp_end = *end; + } +} + +static int get_key_object(const string& key, ghobject_t *oid); + +static void get_object_key(CephContext* cct, const ghobject_t& oid, + string *key) +{ + key->clear(); + + _key_encode_shard(oid.shard_id, key); + _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); + _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); + key->append("."); + + append_escaped(oid.hobj.nspace, key); + + if (oid.hobj.get_key().length()) { + // is a key... could be < = or >. 
+ // (ASCII chars < = and > sort in that order, yay) + if (oid.hobj.get_key() < oid.hobj.oid.name) { + key->append("<"); + append_escaped(oid.hobj.get_key(), key); + append_escaped(oid.hobj.oid.name, key); + } else if (oid.hobj.get_key() > oid.hobj.oid.name) { + key->append(">"); + append_escaped(oid.hobj.get_key(), key); + append_escaped(oid.hobj.oid.name, key); + } else { + // same as no key + key->append("="); + append_escaped(oid.hobj.oid.name, key); + } + } else { + // no key + key->append("="); + append_escaped(oid.hobj.oid.name, key); + } + + _key_encode_u64(oid.hobj.snap, key); + _key_encode_u64(oid.generation, key); + + // sanity check + if (true) { + ghobject_t t; + int r = get_key_object(*key, &t); + if (r || t != oid) { + derr << " r " << r << dendl; + derr << "key " << pretty_binary_string(*key) << dendl; + derr << "oid " << oid << dendl; + derr << " t " << t << dendl; + ceph_assert(t == oid); + } + } +} + +static int get_key_object(const string& key, ghobject_t *oid) +{ + int r; + const char *p = key.c_str(); + + p = _key_decode_shard(p, &oid->shard_id); + + uint64_t pool; + p = _key_decode_u64(p, &pool); + oid->hobj.pool = pool - 0x8000000000000000ull; + + unsigned hash; + p = _key_decode_u32(p, &hash); + oid->hobj.set_bitwise_key_u32(hash); + if (*p != '.') + return -5; + ++p; + + r = decode_escaped(p, &oid->hobj.nspace); + if (r < 0) + return -6; + p += r + 1; + + if (*p == '=') { + // no key + ++p; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -7; + p += r + 1; + } else if (*p == '<' || *p == '>') { + // key + name + ++p; + string okey; + r = decode_escaped(p, &okey); + if (r < 0) + return -8; + p += r + 1; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -9; + p += r + 1; + oid->hobj.set_key(okey); + } else { + // malformed + return -10; + } + + p = _key_decode_u64(p, &oid->hobj.snap.val); + p = _key_decode_u64(p, &oid->generation); + if (*p) { + // if we get something other than a null terminator here, + // something goes wrong. + return -12; + } + + return 0; +} + + +static void get_data_key(uint64_t nid, uint64_t offset, string *out) +{ + _key_encode_u64(nid, out); + _key_encode_u64(offset, out); +} + +// '-' < '.' < '~' +static void get_omap_header(uint64_t id, string *out) +{ + _key_encode_u64(id, out); + out->push_back('-'); +} + +// hmm, I don't think there's any need to escape the user key since we +// have a clean prefix. 
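+// For a given omap_head id, the header, per-key, and tail helpers here all
+// emit the same 8-byte encoded id prefix, so within one object the keys
+// sort as
+//
+//   <id> '-'              omap header
+//   <id> '.' <user key>   one record per omap entry
+//   <id> '~'              tail sentinel
+//
+// A minimal sketch of how that is used (illustration only, not KStore code):
+//
+//   string header, entry, tail;
+//   get_omap_header(id, &header);     // <id>-
+//   get_omap_key(id, "foo", &entry);  // <id>.foo
+//   get_omap_tail(id, &tail);         // <id>~
+//   // Since '-' < '.' < '~', header < entry < tail, so a single iterator
+//   // scan from lower_bound(header) up to (but not including) tail sees
+//   // the header first and then every user key in order.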
+static void get_omap_key(uint64_t id, const string& key, string *out) +{ + _key_encode_u64(id, out); + out->push_back('.'); + out->append(key); +} + +static void rewrite_omap_key(uint64_t id, string old, string *out) +{ + _key_encode_u64(id, out); + out->append(old.substr(out->length())); +} + +static void decode_omap_key(const string& key, string *user_key) +{ + *user_key = key.substr(sizeof(uint64_t) + 1); +} + +static void get_omap_tail(uint64_t id, string *out) +{ + _key_encode_u64(id, out); + out->push_back('~'); +} + + + +// Onode + +#undef dout_prefix +#define dout_prefix *_dout << "kstore.onode(" << this << ") " + +void KStore::Onode::flush() +{ + std::unique_lock<std::mutex> l(flush_lock); + dout(20) << __func__ << " " << flush_txns << dendl; + while (!flush_txns.empty()) + flush_cond.wait(l); + dout(20) << __func__ << " done" << dendl; +} + +// OnodeHashLRU + +#undef dout_prefix +#define dout_prefix *_dout << "kstore.lru(" << this << ") " + +void KStore::OnodeHashLRU::_touch(OnodeRef o) +{ + lru_list_t::iterator p = lru.iterator_to(*o); + lru.erase(p); + lru.push_front(*o); +} + +void KStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o) +{ + std::lock_guard<std::mutex> l(lock); + dout(30) << __func__ << " " << oid << " " << o << dendl; + ceph_assert(onode_map.count(oid) == 0); + onode_map[oid] = o; + lru.push_front(*o); +} + +KStore::OnodeRef KStore::OnodeHashLRU::lookup(const ghobject_t& oid) +{ + std::lock_guard<std::mutex> l(lock); + dout(30) << __func__ << dendl; + ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + dout(30) << __func__ << " " << oid << " miss" << dendl; + return OnodeRef(); + } + dout(30) << __func__ << " " << oid << " hit " << p->second << dendl; + _touch(p->second); + return p->second; +} + +void KStore::OnodeHashLRU::clear() +{ + std::lock_guard<std::mutex> l(lock); + dout(10) << __func__ << dendl; + lru.clear(); + onode_map.clear(); +} + +void KStore::OnodeHashLRU::rename(const ghobject_t& old_oid, + const ghobject_t& new_oid) +{ + std::lock_guard<std::mutex> l(lock); + dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl; + ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn; + po = onode_map.find(old_oid); + pn = onode_map.find(new_oid); + + ceph_assert(po != onode_map.end()); + if (pn != onode_map.end()) { + lru_list_t::iterator p = lru.iterator_to(*pn->second); + lru.erase(p); + onode_map.erase(pn); + } + OnodeRef o = po->second; + + // install a non-existent onode it its place + po->second.reset(new Onode(cct, old_oid, o->key)); + lru.push_back(*po->second); + + // fix oid, key + onode_map.insert(make_pair(new_oid, o)); + _touch(o); + o->oid = new_oid; + get_object_key(cct, new_oid, &o->key); +} + +bool KStore::OnodeHashLRU::get_next( + const ghobject_t& after, + pair<ghobject_t,OnodeRef> *next) +{ + std::lock_guard<std::mutex> l(lock); + dout(20) << __func__ << " after " << after << dendl; + + if (after == ghobject_t()) { + if (lru.empty()) { + return false; + } + ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.begin(); + ceph_assert(p != onode_map.end()); + next->first = p->first; + next->second = p->second; + return true; + } + + ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(after); + ceph_assert(p != onode_map.end()); // for now + lru_list_t::iterator pi = lru.iterator_to(*p->second); + ++pi; + if (pi == lru.end()) { + return false; + } + next->first = pi->oid; + next->second = onode_map[pi->oid]; + return true; +} + 
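+// Trim the cold end of the LRU down to at most max onodes.  Only onodes
+// whose sole reference is the map itself (nref == 1) are evicted; the scan
+// stops at the first onode that is still referenced elsewhere.  Returns the
+// number of onodes evicted.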
+int KStore::OnodeHashLRU::trim(int max) +{ + std::lock_guard<std::mutex> l(lock); + dout(20) << __func__ << " max " << max + << " size " << onode_map.size() << dendl; + int trimmed = 0; + int num = onode_map.size() - max; + if (onode_map.size() == 0 || num <= 0) + return 0; // don't even try + + lru_list_t::iterator p = lru.end(); + if (num) + --p; + while (num > 0) { + Onode *o = &*p; + int refs = o->nref.load(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs; stopping with " << num << " left to trim" << dendl; + break; + } + dout(30) << __func__ << " trim " << o->oid << dendl; + if (p != lru.begin()) { + lru.erase(p--); + } else { + lru.erase(p); + ceph_assert(num == 1); + } + o->get(); // paranoia + onode_map.erase(o->oid); + o->put(); + --num; + ++trimmed; + } + return trimmed; +} + +// ======================================================= + +// Collection + +#undef dout_prefix +#define dout_prefix *_dout << "kstore(" << store->path << ").collection(" << cid << ") " + +KStore::Collection::Collection(KStore *ns, coll_t cid) + : CollectionImpl(cid), + store(ns), + lock("KStore::Collection::lock", true, false), + osr(new OpSequencer()), + onode_map(store->cct) +{ +} + +void KStore::Collection::flush() +{ + osr->flush(); +} + +bool KStore::Collection::flush_commit(Context *c) +{ + return osr->flush_commit(c); +} + + +KStore::OnodeRef KStore::Collection::get_onode( + const ghobject_t& oid, + bool create) +{ + ceph_assert(create ? lock.is_wlocked() : lock.is_locked()); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + if (!oid.match(cnode.bits, pgid.ps())) { + lderr(store->cct) << __func__ << " oid " << oid << " not part of " + << pgid << " bits " << cnode.bits << dendl; + ceph_abort(); + } + } + + OnodeRef o = onode_map.lookup(oid); + if (o) + return o; + + string key; + get_object_key(store->cct, oid, &key); + + ldout(store->cct, 20) << __func__ << " oid " << oid << " key " + << pretty_binary_string(key) << dendl; + + bufferlist v; + int r = store->db->get(PREFIX_OBJ, key, &v); + ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl; + Onode *on; + if (v.length() == 0) { + ceph_assert(r == -ENOENT); + if (!create) + return OnodeRef(); + + // new + on = new Onode(store->cct, oid, key); + on->dirty = true; + } else { + // loaded + ceph_assert(r >=0); + on = new Onode(store->cct, oid, key); + on->exists = true; + auto p = v.cbegin(); + decode(on->onode, p); + } + o.reset(on); + onode_map.add(oid, o); + return o; +} + + + +// ======================================================= + +#undef dout_prefix +#define dout_prefix *_dout << "kstore(" << path << ") " + +KStore::KStore(CephContext *cct, const string& path) + : ObjectStore(cct, path), + db(NULL), + basedir(path), + path_fd(-1), + fsid_fd(-1), + mounted(false), + coll_lock("KStore::coll_lock"), + nid_last(0), + nid_max(0), + throttle_ops(cct, "kstore_max_ops", cct->_conf->kstore_max_ops), + throttle_bytes(cct, "kstore_max_bytes", cct->_conf->kstore_max_bytes), + finisher(cct), + kv_sync_thread(this), + kv_stop(false), + logger(nullptr) +{ + _init_logger(); +} + +KStore::~KStore() +{ + _shutdown_logger(); + ceph_assert(!mounted); + ceph_assert(db == NULL); + ceph_assert(fsid_fd < 0); +} + +void KStore::_init_logger() +{ + // XXX + PerfCountersBuilder b(cct, "KStore", + l_kstore_first, l_kstore_last); + b.add_time_avg(l_kstore_state_prepare_lat, "state_prepare_lat", "Average prepare state latency"); + b.add_time_avg(l_kstore_state_kv_queued_lat, "state_kv_queued_lat", "Average 
kv_queued state latency"); + b.add_time_avg(l_kstore_state_kv_done_lat, "state_kv_done_lat", "Average kv_done state latency"); + b.add_time_avg(l_kstore_state_finishing_lat, "state_finishing_lat", "Average finishing state latency"); + b.add_time_avg(l_kstore_state_done_lat, "state_done_lat", "Average done state latency"); + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); +} + +void KStore::_shutdown_logger() +{ + // XXX + cct->get_perfcounters_collection()->remove(logger); + delete logger; +} + +int KStore::_open_path() +{ + ceph_assert(path_fd < 0); + path_fd = ::open(path.c_str(), O_DIRECTORY|O_CLOEXEC); + if (path_fd < 0) { + int r = -errno; + derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r) + << dendl; + return r; + } + return 0; +} + +void KStore::_close_path() +{ + VOID_TEMP_FAILURE_RETRY(::close(path_fd)); + path_fd = -1; +} + +int KStore::_open_fsid(bool create) +{ + ceph_assert(fsid_fd < 0); + int flags = O_RDWR; + if (create) + flags |= O_CREAT; + fsid_fd = ::openat(path_fd, "fsid", flags, 0644); + if (fsid_fd < 0) { + int err = -errno; + derr << __func__ << " " << cpp_strerror(err) << dendl; + return err; + } + return 0; +} + +int KStore::_read_fsid(uuid_d *uuid) +{ + char fsid_str[40]; + memset(fsid_str, 0, sizeof(fsid_str)); + int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) { + derr << __func__ << " failed: " << cpp_strerror(ret) << dendl; + return ret; + } + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) { + derr << __func__ << " unparsable uuid " << fsid_str << dendl; + return -EINVAL; + } + return 0; +} + +int KStore::_write_fsid() +{ + int r = ::ftruncate(fsid_fd, 0); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl; + return r; + } + string str = stringify(fsid) + "\n"; + r = safe_write(fsid_fd, str.c_str(), str.length()); + if (r < 0) { + derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl; + return r; + } + r = ::fsync(fsid_fd); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +void KStore::_close_fsid() +{ + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +} + +int KStore::_lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + derr << __func__ << " failed to lock " << path << "/fsid" + << " (is another ceph-osd still running?)" + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool KStore::test_mount_in_use() +{ + // most error conditions mean the mount is not in use (e.g., because + // it doesn't exist). only if we fail to lock do we conclude it is + // in use. 
+ bool ret = false; + int r = _open_path(); + if (r < 0) + return false; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _lock_fsid(); + if (r < 0) + ret = true; // if we can't lock, it is in use + _close_fsid(); + out_path: + _close_path(); + return ret; +} + +int KStore::_open_db(bool create) +{ + int r; + ceph_assert(!db); + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/db", path.c_str()); + + string kv_backend; + if (create) { + kv_backend = cct->_conf->kstore_backend; + } else { + r = read_meta("kv_backend", &kv_backend); + if (r < 0) { + derr << __func__ << " uanble to read 'kv_backend' meta" << dendl; + return -EIO; + } + } + dout(10) << __func__ << " kv_backend = " << kv_backend << dendl; + + if (create) { + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r) + << dendl; + return r; + } + + // wal_dir, too! + char walfn[PATH_MAX]; + snprintf(walfn, sizeof(walfn), "%s/db.wal", path.c_str()); + r = ::mkdir(walfn, 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << walfn + << ": " << cpp_strerror(r) + << dendl; + return r; + } + } + + db = KeyValueDB::create(cct, kv_backend, fn); + if (!db) { + derr << __func__ << " error creating db" << dendl; + return -EIO; + } + string options; + if (kv_backend == "rocksdb") + options = cct->_conf->kstore_rocksdb_options; + db->init(options); + stringstream err; + if (create) + r = db->create_and_open(err); + else + r = db->open(err); + if (r) { + derr << __func__ << " erroring opening db: " << err.str() << dendl; + delete db; + db = NULL; + return -EIO; + } + dout(1) << __func__ << " opened " << kv_backend + << " path " << fn << " options " << options << dendl; + return 0; +} + +void KStore::_close_db() +{ + ceph_assert(db); + delete db; + db = NULL; +} + +int KStore::_open_collections(int *errors) +{ + ceph_assert(coll_map.empty()); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (cid.parse(it->key())) { + CollectionRef c(new Collection(this, cid)); + bufferlist bl = it->value(); + auto p = bl.cbegin(); + try { + decode(c->cnode, p); + } catch (buffer::error& e) { + derr << __func__ << " failed to decode cnode, key:" + << pretty_binary_string(it->key()) << dendl; + return -EIO; + } + dout(20) << __func__ << " opened " << cid << dendl; + coll_map[cid] = c; + } else { + derr << __func__ << " unrecognized collection " << it->key() << dendl; + if (errors) + (*errors)++; + } + } + return 0; +} + +int KStore::mkfs() +{ + dout(1) << __func__ << " path " << path << dendl; + int r; + uuid_d old_fsid; + + r = _open_path(); + if (r < 0) + return r; + + r = _open_fsid(true); + if (r < 0) + goto out_path_fd; + + r = _lock_fsid(); + if (r < 0) + goto out_close_fsid; + + r = _read_fsid(&old_fsid); + if (r < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __func__ << " generated fsid " << fsid << dendl; + } else { + dout(1) << __func__ << " using provided fsid " << fsid << dendl; + } + // we'll write it last. 
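+    // (the fsid file is only written once everything else in mkfs has
+    // succeeded, so an interrupted mkfs leaves no fsid behind and the next
+    // attempt simply starts over.)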
+ } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __func__ << " on-disk fsid " << old_fsid + << " != provided " << fsid << dendl; + r = -EINVAL; + goto out_close_fsid; + } + fsid = old_fsid; + dout(1) << __func__ << " already created, fsid is " << fsid << dendl; + goto out_close_fsid; + } + + r = _open_db(true); + if (r < 0) + goto out_close_fsid; + + r = write_meta("kv_backend", cct->_conf->kstore_backend); + if (r < 0) + goto out_close_db; + + r = write_meta("type", "kstore"); + if (r < 0) + goto out_close_db; + + // indicate mkfs completion/success by writing the fsid file + r = _write_fsid(); + if (r == 0) + dout(10) << __func__ << " success" << dendl; + else + derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl; + + out_close_db: + _close_db(); + out_close_fsid: + _close_fsid(); + out_path_fd: + _close_path(); + return r; +} + +int KStore::mount() +{ + dout(1) << __func__ << " path " << path << dendl; + + if (cct->_conf->kstore_fsck_on_mount) { + int rc = fsck(cct->_conf->kstore_fsck_on_mount_deep); + if (rc < 0) + return rc; + } + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_db(false); + if (r < 0) + goto out_fsid; + + r = _open_super_meta(); + if (r < 0) + goto out_db; + + r = _open_collections(); + if (r < 0) + goto out_db; + + finisher.start(); + kv_sync_thread.create("kstore_kv_sync"); + + mounted = true; + return 0; + + out_db: + _close_db(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + return r; +} + +int KStore::umount() +{ + ceph_assert(mounted); + dout(1) << __func__ << dendl; + + _sync(); + _reap_collections(); + coll_map.clear(); + + dout(20) << __func__ << " stopping kv thread" << dendl; + _kv_stop(); + dout(20) << __func__ << " draining finisher" << dendl; + finisher.wait_for_empty(); + dout(20) << __func__ << " stopping finisher" << dendl; + finisher.stop(); + dout(20) << __func__ << " closing" << dendl; + + mounted = false; + _close_db(); + _close_fsid(); + _close_path(); + return 0; +} + +int KStore::fsck(bool deep) +{ + dout(1) << __func__ << dendl; + int errors = 0; + dout(1) << __func__ << " finish with " << errors << " errors" << dendl; + return errors; +} + +void KStore::_sync() +{ + dout(10) << __func__ << dendl; + + std::unique_lock<std::mutex> l(kv_lock); + while (!kv_committing.empty() || + !kv_queue.empty()) { + dout(20) << " waiting for kv to commit" << dendl; + kv_sync_cond.wait(l); + } + + dout(10) << __func__ << " done" << dendl; +} + +int KStore::statfs(struct store_statfs_t* buf0, osd_alert_list_t* alerts) +{ + struct statfs buf; + buf0->reset(); + if (alerts) { + alerts->clear(); // returns nothing for now + } + if (::statfs(basedir.c_str(), &buf) < 0) { + int r = -errno; + ceph_assert(r != -ENOENT); + return r; + } + + buf0->total = buf.f_blocks * buf.f_bsize; + buf0->available = buf.f_bavail * buf.f_bsize; + + return 0; +} + +ObjectStore::CollectionHandle KStore::open_collection(const coll_t& cid) +{ + return _get_collection(cid); +} + +ObjectStore::CollectionHandle KStore::create_new_collection(const coll_t& cid) +{ + auto *c = new Collection(this, cid); + RWLock::WLocker l(coll_lock); + new_coll_map[cid] = c; + return c; +} + +int KStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) +{ + return -ENOTSUP; +} + +// --------------- +// cache + +KStore::CollectionRef KStore::_get_collection(coll_t cid) +{ + 
RWLock::RLocker l(coll_lock); + ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return CollectionRef(); + return cp->second; +} + +void KStore::_queue_reap_collection(CollectionRef& c) +{ + dout(10) << __func__ << " " << c->cid << dendl; + std::lock_guard<std::mutex> l(reap_lock); + removed_collections.push_back(c); +} + +void KStore::_reap_collections() +{ + list<CollectionRef> removed_colls; + std::lock_guard<std::mutex> l(reap_lock); + removed_colls.swap(removed_collections); + + for (list<CollectionRef>::iterator p = removed_colls.begin(); + p != removed_colls.end(); + ++p) { + CollectionRef c = *p; + dout(10) << __func__ << " " << c->cid << dendl; + { + pair<ghobject_t,OnodeRef> next; + while (c->onode_map.get_next(next.first, &next)) { + ceph_assert(!next.second->exists); + if (!next.second->flush_txns.empty()) { + dout(10) << __func__ << " " << c->cid << " " << next.second->oid + << " flush_txns " << next.second->flush_txns << dendl; + return; + } + } + } + c->onode_map.clear(); + dout(10) << __func__ << " " << c->cid << " done" << dendl; + } + + dout(10) << __func__ << " all reaped" << dendl; +} + +// --------------- +// read operations + +bool KStore::exists(CollectionHandle& ch, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return false; + return true; +} + +int KStore::stat( + CollectionHandle& ch, + const ghobject_t& oid, + struct stat *st, + bool allow_eio) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return -ENOENT; + st->st_size = o->onode.size; + st->st_blksize = 4096; + st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; + st->st_nlink = 1; + return 0; +} + +int KStore::set_collection_opts( + CollectionHandle& ch, + const pool_opts_t& opts) +{ + return -EOPNOTSUPP; +} + +int KStore::read( + CollectionHandle& ch, + const ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags) +{ + dout(15) << __func__ << " " << ch->cid << " " << oid + << " " << offset << "~" << length + << dendl; + bl.clear(); + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + + int r; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (offset == length && offset == 0) + length = o->onode.size; + + r = _do_read(o, offset, length, bl, op_flags); + + out: + dout(10) << __func__ << " " << ch->cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; +} + +int KStore::_do_read( + OnodeRef o, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags) +{ + int r = 0; + uint64_t stripe_size = o->onode.stripe_size; + uint64_t stripe_off; + + dout(20) << __func__ << " " << offset << "~" << length << " size " + << o->onode.size << " nid " << o->onode.nid << dendl; + bl.clear(); + + if (offset > o->onode.size) { + goto out; + } + if (offset + length > o->onode.size) { + length = o->onode.size - offset; + } + if (stripe_size == 0) { + bl.append_zero(length); + r = length; + goto out; + } + + o->flush(); + + stripe_off = offset % stripe_size; + while (length > 0) { + bufferlist stripe; + _do_read_stripe(o, 
offset - stripe_off, &stripe); + dout(30) << __func__ << " stripe " << offset - stripe_off << " got " + << stripe.length() << dendl; + unsigned swant = std::min<unsigned>(stripe_size - stripe_off, length); + if (stripe.length()) { + if (swant == stripe.length()) { + bl.claim_append(stripe); + dout(30) << __func__ << " taking full stripe" << dendl; + } else { + unsigned l = 0; + if (stripe_off < stripe.length()) { + l = std::min<uint64_t>(stripe.length() - stripe_off, swant); + bufferlist t; + t.substr_of(stripe, stripe_off, l); + bl.claim_append(t); + dout(30) << __func__ << " taking " << stripe_off << "~" << l << dendl; + } + if (l < swant) { + bl.append_zero(swant - l); + dout(30) << __func__ << " adding " << swant - l << " zeros" << dendl; + } + } + } else { + dout(30) << __func__ << " generating " << swant << " zeros" << dendl; + bl.append_zero(swant); + } + offset += swant; + length -= swant; + stripe_off = 0; + } + r = bl.length(); + dout(30) << " result:\n"; + bl.hexdump(*_dout); + *_dout << dendl; + + out: + return r; +} + +int KStore::fiemap( + CollectionHandle& ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl) +{ + map<uint64_t, uint64_t> m; + int r = fiemap(ch, oid, offset, len, m); + if (r >= 0) { + encode(m, bl); + } + return r; +} + +int KStore::fiemap( + CollectionHandle& ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + map<uint64_t, uint64_t>& destmap) +{ + CollectionRef c = static_cast<Collection*>(ch.get()); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + return -ENOENT; + } + + if (offset > o->onode.size) + goto out; + + if (offset + len > o->onode.size) { + len = o->onode.size - offset; + } + + dout(20) << __func__ << " " << offset << "~" << len << " size " + << o->onode.size << dendl; + + // FIXME: do something smarter here + destmap[0] = o->onode.size; + + out: + dout(20) << __func__ << " " << offset << "~" << len + << " size = 0 (" << destmap << ")" << dendl; + return 0; +} + +int KStore::getattr( + CollectionHandle& ch, + const ghobject_t& oid, + const char *name, + bufferptr& value) +{ + dout(15) << __func__ << " " << ch->cid << " " << oid << " " << name << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r; + string k(name); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (!o->onode.attrs.count(k)) { + r = -ENODATA; + goto out; + } + value = o->onode.attrs[k]; + r = 0; + out: + dout(10) << __func__ << " " << ch->cid << " " << oid << " " << name + << " = " << r << dendl; + return r; +} + +int KStore::getattrs( + CollectionHandle& ch, + const ghobject_t& oid, + map<string,bufferptr>& aset) +{ + dout(15) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + aset = o->onode.attrs; + r = 0; + out: + dout(10) << __func__ << " " << ch->cid << " " << oid + << " = " << r << dendl; + return r; +} + +int KStore::list_collections(vector<coll_t>& ls) +{ + RWLock::RLocker l(coll_lock); + for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) + ls.push_back(p->first); + return 0; +} + +bool KStore::collection_exists(const coll_t& c) +{ + RWLock::RLocker l(coll_lock); + return coll_map.count(c); +} + +int 
KStore::collection_empty(CollectionHandle& ch, bool *empty) +{ + dout(15) << __func__ << " " << ch->cid << dendl; + vector<ghobject_t> ls; + ghobject_t next; + int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1, + &ls, &next); + if (r < 0) { + derr << __func__ << " collection_list returned: " << cpp_strerror(r) + << dendl; + return r; + } + *empty = ls.empty(); + dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl; + return 0; +} + +int KStore::collection_bits(CollectionHandle& ch) +{ + dout(15) << __func__ << " " << ch->cid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl; + return c->cnode.bits; +} + +int KStore::collection_list( + CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext) + +{ + Collection *c = static_cast<Collection*>(c_.get()); + c->flush(); + dout(15) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max << dendl; + int r; + { + RWLock::RLocker l(c->lock); + r = _collection_list(c, start, end, max, ls, pnext); + } + + dout(10) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max + << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; + return r; +} + +int KStore::_collection_list( + Collection* c, const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext) +{ + int r = 0; + KeyValueDB::Iterator it; + string temp_start_key, temp_end_key; + string start_key, end_key; + bool set_next = false; + string pend; + bool temp; + + ghobject_t static_next; + if (!pnext) + pnext = &static_next; + + if (start == ghobject_t::get_max() || + start.hobj.is_max()) { + goto out; + } + get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key, + &start_key, &end_key); + dout(20) << __func__ + << " range " << pretty_binary_string(temp_start_key) + << " to " << pretty_binary_string(temp_end_key) + << " and " << pretty_binary_string(start_key) + << " to " << pretty_binary_string(end_key) + << " start " << start << dendl; + it = db->get_iterator(PREFIX_OBJ); + if (start == ghobject_t() || start == c->cid.get_min_hobj()) { + it->upper_bound(temp_start_key); + temp = true; + } else { + string k; + get_object_key(cct, start, &k); + if (start.hobj.is_temp()) { + temp = true; + ceph_assert(k >= temp_start_key && k < temp_end_key); + } else { + temp = false; + ceph_assert(k >= start_key && k < end_key); + } + dout(20) << " start from " << pretty_binary_string(k) + << " temp=" << (int)temp << dendl; + it->lower_bound(k); + } + if (end.hobj.is_max()) { + pend = temp ? 
temp_end_key : end_key; + } else { + if (end.hobj.is_temp()) { + if (temp) + get_object_key(cct, end, &pend); + else + goto out; + } else { + if (temp) + pend = temp_end_key; + else + get_object_key(cct, end, &pend); + } + } + dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl; + while (true) { + if (!it->valid() || it->key() >= pend) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " key " << pretty_binary_string(it->key()) + << " > " << end << dendl; + if (temp) { + if (end.hobj.is_temp()) { + if (it->valid() && it->key() < temp_end_key) { + int r = get_key_object(it->key(), pnext); + ceph_assert(r == 0); + set_next = true; + } + break; + } + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(start_key); + if (end.hobj.is_max()) + pend = end_key; + else + get_object_key(cct, end, &pend); + dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl; + continue; + } + if (it->valid() && it->key() < end_key) { + int r = get_key_object(it->key(), pnext); + ceph_assert(r == 0); + set_next = true; + } + break; + } + dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl; + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + ceph_assert(r == 0); + if (ls->size() >= (unsigned)max) { + dout(20) << __func__ << " reached max " << max << dendl; + *pnext = oid; + set_next = true; + break; + } + ls->push_back(oid); + it->next(); + } +out: + if (!set_next) { + *pnext = ghobject_t::get_max(); + } + return r; +} + +// omap reads + +KStore::OmapIteratorImpl::OmapIteratorImpl( + CollectionRef c, OnodeRef o, KeyValueDB::Iterator it) + : c(c), o(o), it(it) +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + get_omap_key(o->onode.omap_head, string(), &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + } +} + +int KStore::OmapIteratorImpl::seek_to_first() +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + it->lower_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + return 0; +} + +int KStore::OmapIteratorImpl::upper_bound(const string& after) +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + string key; + get_omap_key(o->onode.omap_head, after, &key); + it->upper_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + return 0; +} + +int KStore::OmapIteratorImpl::lower_bound(const string& to) +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + string key; + get_omap_key(o->onode.omap_head, to, &key); + it->lower_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + return 0; +} + +bool KStore::OmapIteratorImpl::valid() +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head && it->valid() && it->raw_key().second <= tail) { + return true; + } else { + return false; + } +} + +int KStore::OmapIteratorImpl::next() +{ + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + it->next(); + return 0; + } else { + return -1; + } +} + +string KStore::OmapIteratorImpl::key() +{ + RWLock::RLocker l(c->lock); + ceph_assert(it->valid()); + string db_key = it->raw_key().second; + string user_key; + decode_omap_key(db_key, &user_key); + return user_key; +} + +bufferlist KStore::OmapIteratorImpl::value() +{ + RWLock::RLocker l(c->lock); + ceph_assert(it->valid()); + return it->value(); +} + +int KStore::omap_get( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist 
*header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) +{ + dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(o->onode.omap_head, &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + *header = it->value(); + } else if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + ceph_assert(it->key() < tail); + (*out)[user_key] = it->value(); + } + it->next(); + } + } + out: + dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int KStore::omap_get_header( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) +{ + dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + { + string head; + get_omap_header(o->onode.omap_head, &head); + if (db->get(PREFIX_OMAP, head, header) >= 0) { + dout(30) << __func__ << " got header" << dendl; + } else { + dout(30) << __func__ << " no header" << dendl; + } + } + out: + dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int KStore::omap_get_keys( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) +{ + dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_key(o->onode.omap_head, string(), &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + ceph_assert(it->key() < tail); + keys->insert(user_key); + it->next(); + } + } + out: + dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int KStore::omap_get_values( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned 
keys and values + ) +{ + dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + bufferlist val; + if (db->get(PREFIX_OMAP, key, &val) >= 0) { + dout(30) << __func__ << " got " << pretty_binary_string(key) + << " -> " << *p << dendl; + out->insert(make_pair(*p, val)); + } + } + out: + dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl; + return r; +} + +int KStore::omap_check_keys( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) +{ + dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + bufferlist val; + if (db->get(PREFIX_OMAP, key, &val) >= 0) { + dout(30) << __func__ << " have " << pretty_binary_string(key) + << " -> " << *p << dendl; + out->insert(*p); + } else { + dout(30) << __func__ << " miss " << pretty_binary_string(key) + << " -> " << *p << dendl; + } + } + out: + dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl; + return r; +} + +ObjectMap::ObjectMapIterator KStore::get_omap_iterator( + CollectionHandle& ch, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) +{ + + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return ObjectMap::ObjectMapIterator(); + } + o->flush(); + dout(10) << __func__ << " header = " << o->onode.omap_head <<dendl; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); +} + + +// ----------------- +// write helpers + +int KStore::_open_super_meta() +{ + // nid + { + nid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "nid_max", &bl); + auto p = bl.cbegin(); + try { + decode(nid_max, p); + } catch (buffer::error& e) { + } + dout(10) << __func__ << " old nid_max " << nid_max << dendl; + nid_last = nid_max; + } + return 0; +} + +void KStore::_assign_nid(TransContext *txc, OnodeRef o) +{ + if (o->onode.nid) + return; + std::lock_guard<std::mutex> l(nid_lock); + o->onode.nid = ++nid_last; + dout(20) << __func__ << " " << o->oid << " nid " << o->onode.nid << dendl; + if (nid_last > nid_max) { + nid_max += cct->_conf->kstore_nid_prealloc; + bufferlist bl; + encode(nid_max, bl); + txc->t->set(PREFIX_SUPER, "nid_max", bl); + dout(10) << __func__ << " nid_max now " << nid_max << dendl; + } +} + +KStore::TransContext *KStore::_txc_create(OpSequencer *osr) +{ + TransContext *txc = new TransContext(osr); + txc->t = 
db->get_transaction(); + osr->queue_new(txc); + dout(20) << __func__ << " osr " << osr << " = " << txc << dendl; + return txc; +} + +void KStore::_txc_state_proc(TransContext *txc) +{ + while (true) { + dout(10) << __func__ << " txc " << txc + << " " << txc->get_state_name() << dendl; + switch (txc->state) { + case TransContext::STATE_PREPARE: + txc->log_state_latency(logger, l_kstore_state_prepare_lat); + txc->state = TransContext::STATE_KV_QUEUED; + if (!cct->_conf->kstore_sync_transaction) { + std::lock_guard<std::mutex> l(kv_lock); + if (cct->_conf->kstore_sync_submit_transaction) { + int r = db->submit_transaction(txc->t); + ceph_assert(r == 0); + } + kv_queue.push_back(txc); + kv_cond.notify_one(); + return; + } + { + int r = db->submit_transaction_sync(txc->t); + ceph_assert(r == 0); + } + break; + + case TransContext::STATE_KV_QUEUED: + txc->log_state_latency(logger, l_kstore_state_kv_queued_lat); + txc->state = TransContext::STATE_KV_DONE; + _txc_finish_kv(txc); + // ** fall-thru ** + + case TransContext::STATE_KV_DONE: + txc->log_state_latency(logger, l_kstore_state_kv_done_lat); + txc->state = TransContext::STATE_FINISHING; + // ** fall-thru ** + + case TransContext::TransContext::STATE_FINISHING: + txc->log_state_latency(logger, l_kstore_state_finishing_lat); + _txc_finish(txc); + return; + + default: + derr << __func__ << " unexpected txc " << txc + << " state " << txc->get_state_name() << dendl; + ceph_abort_msg("unexpected txc state"); + return; + } + } +} + +void KStore::_txc_finalize(OpSequencer *osr, TransContext *txc) +{ + dout(20) << __func__ << " osr " << osr << " txc " << txc + << " onodes " << txc->onodes << dendl; + + // finalize onodes + for (set<OnodeRef>::iterator p = txc->onodes.begin(); + p != txc->onodes.end(); + ++p) { + bufferlist bl; + encode((*p)->onode, bl); + dout(20) << " onode size is " << bl.length() << dendl; + txc->t->set(PREFIX_OBJ, (*p)->key, bl); + + std::lock_guard<std::mutex> l((*p)->flush_lock); + (*p)->flush_txns.insert(txc); + } +} + +void KStore::_txc_finish_kv(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << dendl; + + // warning: we're calling onreadable_sync inside the sequencer lock + if (txc->onreadable_sync) { + txc->onreadable_sync->complete(0); + txc->onreadable_sync = NULL; + } + if (txc->onreadable) { + finisher.queue(txc->onreadable); + txc->onreadable = NULL; + } + if (txc->oncommit) { + finisher.queue(txc->oncommit); + txc->oncommit = NULL; + } + if (!txc->oncommits.empty()) { + finisher.queue(txc->oncommits); + } + + throttle_ops.put(txc->ops); + throttle_bytes.put(txc->bytes); +} + +void KStore::_txc_finish(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; + ceph_assert(txc->state == TransContext::STATE_FINISHING); + + for (set<OnodeRef>::iterator p = txc->onodes.begin(); + p != txc->onodes.end(); + ++p) { + std::lock_guard<std::mutex> l((*p)->flush_lock); + dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns + << dendl; + ceph_assert((*p)->flush_txns.count(txc)); + (*p)->flush_txns.erase(txc); + if ((*p)->flush_txns.empty()) { + (*p)->flush_cond.notify_all(); + (*p)->clear_pending_stripes(); + } + } + + // clear out refs + txc->onodes.clear(); + + while (!txc->removed_collections.empty()) { + _queue_reap_collection(txc->removed_collections.front()); + txc->removed_collections.pop_front(); + } + + OpSequencerRef osr = txc->osr; + { + std::lock_guard<std::mutex> l(osr->qlock); + txc->state = TransContext::STATE_DONE; + } + + 
_osr_reap_done(osr.get()); +} + +void KStore::_osr_reap_done(OpSequencer *osr) +{ + std::lock_guard<std::mutex> l(osr->qlock); + dout(20) << __func__ << " osr " << osr << dendl; + while (!osr->q.empty()) { + TransContext *txc = &osr->q.front(); + dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() + << dendl; + if (txc->state != TransContext::STATE_DONE) { + break; + } + + if (txc->first_collection) { + txc->first_collection->onode_map.trim(cct->_conf->kstore_onode_map_size); + } + + osr->q.pop_front(); + txc->log_state_latency(logger, l_kstore_state_done_lat); + delete txc; + osr->qcond.notify_all(); + if (osr->q.empty()) + dout(20) << __func__ << " osr " << osr << " q now empty" << dendl; + } +} + +void KStore::_kv_sync_thread() +{ + dout(10) << __func__ << " start" << dendl; + std::unique_lock<std::mutex> l(kv_lock); + while (true) { + ceph_assert(kv_committing.empty()); + if (kv_queue.empty()) { + if (kv_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + kv_sync_cond.notify_all(); + kv_cond.wait(l); + dout(20) << __func__ << " wake" << dendl; + } else { + dout(20) << __func__ << " committing " << kv_queue.size() << dendl; + kv_committing.swap(kv_queue); + utime_t start = ceph_clock_now(); + l.unlock(); + + dout(30) << __func__ << " committing txc " << kv_committing << dendl; + + // one transaction to force a sync + KeyValueDB::Transaction t = db->get_transaction(); + if (!cct->_conf->kstore_sync_submit_transaction) { + for (std::deque<TransContext *>::iterator it = kv_committing.begin(); + it != kv_committing.end(); + ++it) { + int r = db->submit_transaction((*it)->t); + ceph_assert(r == 0); + } + } + int r = db->submit_transaction_sync(t); + ceph_assert(r == 0); + utime_t finish = ceph_clock_now(); + utime_t dur = finish - start; + dout(20) << __func__ << " committed " << kv_committing.size() + << " in " << dur << dendl; + while (!kv_committing.empty()) { + TransContext *txc = kv_committing.front(); + _txc_state_proc(txc); + kv_committing.pop_front(); + } + + // this is as good a place as any ... 
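          // reap any collections whose removal has now been committed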
+ _reap_collections(); + + l.lock(); + } + } + dout(10) << __func__ << " finish" << dendl; +} + + +// --------------------------- +// transactions + +int KStore::queue_transactions( + CollectionHandle& ch, + vector<Transaction>& tls, + TrackedOpRef op, + ThreadPool::TPHandle *handle) +{ + Context *onreadable; + Context *ondisk; + Context *onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + + // set up the sequencer + Collection *c = static_cast<Collection*>(ch.get()); + OpSequencer *osr = c->osr.get(); + dout(10) << __func__ << " ch " << ch.get() << " " << c->cid << dendl; + + // prepare + TransContext *txc = _txc_create(osr); + txc->onreadable = onreadable; + txc->onreadable_sync = onreadable_sync; + txc->oncommit = ondisk; + + for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) { + txc->ops += (*p).get_num_ops(); + txc->bytes += (*p).get_num_bytes(); + _txc_add_transaction(txc, &(*p)); + } + + _txc_finalize(osr, txc); + + throttle_ops.get(txc->ops); + throttle_bytes.get(txc->bytes); + + // execute (start) + _txc_state_proc(txc); + return 0; +} + +void KStore::_txc_add_transaction(TransContext *txc, Transaction *t) +{ + Transaction::iterator i = t->begin(); + + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + + vector<CollectionRef> cvec(i.colls.size()); + unsigned j = 0; + for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end(); + ++p, ++j) { + cvec[j] = _get_collection(*p); + + // note first collection we reference + if (!j && !txc->first_collection) + txc->first_collection = cvec[j]; + } + vector<OnodeRef> ovec(i.objects.size()); + + for (int pos = 0; i.have_op(); ++pos) { + Transaction::Op *op = i.decode_op(); + int r = 0; + + // no coll or obj + if (op->op == Transaction::OP_NOP) + continue; + + // collection operations + CollectionRef &c = cvec[op->cid]; + switch (op->op) { + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _remove_collection(txc, cid, &c); + if (!r) + continue; + } + break; + + case Transaction::OP_MKCOLL: + { + ceph_assert(!c); + coll_t cid = i.get_cid(op->cid); + r = _create_collection(txc, cid, op->split_bits, &c); + if (!r) + continue; + } + break; + + case Transaction::OP_SPLIT_COLLECTION: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_SPLIT_COLLECTION2: + { + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem); + if (!r) + continue; + } + break; + + case Transaction::OP_MERGE_COLLECTION: + { + uint32_t bits = op->split_bits; + r = _merge_collection(txc, &c, cvec[op->dest_cid], bits); + if (!r) + continue; + } + break; + + case Transaction::OP_COLL_HINT: + { + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + dout(10) << __func__ << " collection hint objects is a no-op, " + << " pg_num " << pg_num << " num_objects " << num_objs + << dendl; + } else { + // Ignore the hint + dout(10) << __func__ << " unknown collection hint " << type << dendl; + } + continue; + } + break; + + case Transaction::OP_COLL_SETATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RMATTR: + r = -EOPNOTSUPP; + break; + + case 
Transaction::OP_COLL_RENAME: + ceph_abort_msg("not implemented"); + break; + } + if (r < 0) { + derr << " error " << cpp_strerror(r) + << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + ceph_abort_msg("unexpected error"); + } + + // object operations + RWLock::WLocker l(c->lock); + OnodeRef &o = ovec[op->oid]; + if (!o) { + // these operations implicity create the object + bool create = false; + if (op->op == Transaction::OP_TOUCH || + op->op == Transaction::OP_WRITE || + op->op == Transaction::OP_ZERO) { + create = true; + } + ghobject_t oid = i.get_oid(op->oid); + o = c->get_onode(oid, create); + if (!create) { + if (!o || !o->exists) { + dout(10) << __func__ << " op " << op->op << " got ENOENT on " + << oid << dendl; + r = -ENOENT; + goto endop; + } + } + } + + switch (op->op) { + case Transaction::OP_TOUCH: + r = _touch(txc, c, o); + break; + + case Transaction::OP_WRITE: + { + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(txc, c, o, off, len, bl, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(txc, c, o, off, len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + uint64_t off = op->off; + r = _truncate(txc, c, o, off); + } + break; + + case Transaction::OP_REMOVE: + r = _remove(txc, c, o); + break; + + case Transaction::OP_SETATTR: + { + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + map<string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(txc, c, o, to_set); + } + break; + + case Transaction::OP_SETATTRS: + { + map<string, bufferptr> aset; + i.decode_attrset(aset); + r = _setattrs(txc, c, o, aset); + } + break; + + case Transaction::OP_RMATTR: + { + string name = i.decode_string(); + r = _rmattr(txc, c, o, name); + } + break; + + case Transaction::OP_RMATTRS: + { + r = _rmattrs(txc, c, o); + } + break; + + case Transaction::OP_CLONE: + { + const ghobject_t& noid = i.get_oid(op->dest_oid); + OnodeRef no = c->get_onode(noid, true); + r = _clone(txc, c, o, no); + } + break; + + case Transaction::OP_CLONERANGE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_CLONERANGE2: + { + const ghobject_t& noid = i.get_oid(op->dest_oid); + OnodeRef no = c->get_onode(noid, true); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + r = _clone_range(txc, c, o, no, srcoff, len, dstoff); + } + break; + + case Transaction::OP_COLL_ADD: + ceph_abort_msg("not implemented"); + break; + + case Transaction::OP_COLL_REMOVE: + ceph_abort_msg("not implemented"); + break; + + case Transaction::OP_COLL_MOVE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + ceph_assert(op->cid == op->dest_cid); + const ghobject_t& noid = i.get_oid(op->dest_oid); + OnodeRef no = c->get_onode(noid, true); + r = _rename(txc, c, o, no, noid); + o.reset(); + } + break; + + case Transaction::OP_TRY_RENAME: + { + const ghobject_t& noid = i.get_oid(op->dest_oid); + OnodeRef no = c->get_onode(noid, true); + r = _rename(txc, c, o, no, noid); + if (r == -ENOENT) + r = 0; + o.reset(); + } + break; + + 
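      // omap ops: entries live under PREFIX_OMAP, keyed by the onode's
      // omap_head id (see get_omap_key()).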
case Transaction::OP_OMAP_CLEAR: + { + r = _omap_clear(txc, c, o); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + bufferlist aset_bl; + i.decode_attrset_bl(&aset_bl); + r = _omap_setkeys(txc, c, o, aset_bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + bufferlist keys_bl; + i.decode_keyset_bl(&keys_bl); + r = _omap_rmkeys(txc, c, o, keys_bl); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkey_range(txc, c, o, first, last); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(txc, c, o, bl); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + uint32_t flags = op->alloc_hint_flags; + r = _setallochint(txc, c, o, + expected_object_size, + expected_write_size, + flags); + } + break; + + default: + derr << "bad op " << op->op << dendl; + ceph_abort(); + } + + endop: + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD)) + // -ENOENT is usually okay + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. + msg = "ENOSPC from key value store, misconfigured cluster"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + ceph_abort_msg("unexpected error"); + } + } + } +} + + + +// ----------------- +// write operations + +int KStore::_touch(TransContext *txc, + CollectionRef& c, + OnodeRef &o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + o->exists = true; + _assign_nid(txc, o); + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +void KStore::_dump_onode(OnodeRef o) +{ + dout(30) << __func__ << " " << o + << " nid " << o->onode.nid + << " size " << o->onode.size + << " expected_object_size " << o->onode.expected_object_size + << " expected_write_size " << o->onode.expected_write_size + << dendl; + for (map<string,bufferptr>::iterator p = o->onode.attrs.begin(); + p != o->onode.attrs.end(); + ++p) { + dout(30) << __func__ << " attr " << p->first + << " len " << p->second.length() << dendl; + } +} + +void KStore::_do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl) +{ + map<uint64_t,bufferlist>::iterator p = o->pending_stripes.find(offset); + if (p == o->pending_stripes.end()) { + string key; + get_data_key(o->onode.nid, offset, &key); + db->get(PREFIX_DATA, key, pbl); + o->pending_stripes[offset] = *pbl; + } else { + *pbl = p->second; + } +} + +void 
KStore::_do_write_stripe(TransContext *txc, OnodeRef o, + uint64_t offset, bufferlist& bl) +{ + o->pending_stripes[offset] = bl; + string key; + get_data_key(o->onode.nid, offset, &key); + txc->t->set(PREFIX_DATA, key, bl); +} + +void KStore::_do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset) +{ + o->pending_stripes.erase(offset); + string key; + get_data_key(o->onode.nid, offset, &key); + txc->t->rmkey(PREFIX_DATA, key); +} + +int KStore::_do_write(TransContext *txc, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist& orig_bl, + uint32_t fadvise_flags) +{ + int r = 0; + + dout(20) << __func__ + << " " << o->oid << " " << offset << "~" << length + << " - have " << o->onode.size + << " bytes, nid " << o->onode.nid << dendl; + _dump_onode(o); + o->exists = true; + + if (length == 0) { + return 0; + } + + uint64_t stripe_size = o->onode.stripe_size; + if (!stripe_size) { + o->onode.stripe_size = cct->_conf->kstore_default_stripe_size; + stripe_size = o->onode.stripe_size; + } + + unsigned bl_off = 0; + while (length > 0) { + uint64_t offset_rem = offset % stripe_size; + uint64_t end_rem = (offset + length) % stripe_size; + if (offset_rem == 0 && end_rem == 0) { + bufferlist bl; + bl.substr_of(orig_bl, bl_off, stripe_size); + dout(30) << __func__ << " full stripe " << offset << dendl; + _do_write_stripe(txc, o, offset, bl); + offset += stripe_size; + length -= stripe_size; + bl_off += stripe_size; + continue; + } + uint64_t stripe_off = offset - offset_rem; + bufferlist prev; + _do_read_stripe(o, stripe_off, &prev); + dout(20) << __func__ << " read previous stripe " << stripe_off + << ", got " << prev.length() << dendl; + bufferlist bl; + if (offset_rem) { + unsigned p = std::min<uint64_t>(prev.length(), offset_rem); + if (p) { + dout(20) << __func__ << " reuse leading " << p << " bytes" << dendl; + bl.substr_of(prev, 0, p); + } + if (p < offset_rem) { + dout(20) << __func__ << " add leading " << offset_rem - p << " zeros" << dendl; + bl.append_zero(offset_rem - p); + } + } + unsigned use = stripe_size - offset_rem; + if (use > length) + use -= stripe_size - end_rem; + dout(20) << __func__ << " using " << use << " for this stripe" << dendl; + bufferlist t; + t.substr_of(orig_bl, bl_off, use); + bl.claim_append(t); + bl_off += use; + if (end_rem) { + if (end_rem < prev.length()) { + unsigned l = prev.length() - end_rem; + dout(20) << __func__ << " reuse trailing " << l << " bytes" << dendl; + bufferlist t; + t.substr_of(prev, end_rem, l); + bl.claim_append(t); + } + } + dout(30) << " writing:\n"; + bl.hexdump(*_dout); + *_dout << dendl; + _do_write_stripe(txc, o, stripe_off, bl); + offset += use; + length -= use; + } + + if (offset > o->onode.size) { + dout(20) << __func__ << " extending size to " << offset + length + << dendl; + o->onode.size = offset; + } + + return r; +} + +int KStore::_write(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length, + bufferlist& bl, + uint32_t fadvise_flags) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << offset << "~" << length + << dendl; + _assign_nid(txc, o); + int r = _do_write(txc, o, offset, length, bl, fadvise_flags); + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; +} + +int KStore::_zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << offset << "~" << 
length + << dendl; + int r = 0; + o->exists = true; + + _dump_onode(o); + _assign_nid(txc, o); + + uint64_t stripe_size = o->onode.stripe_size; + if (stripe_size) { + uint64_t end = offset + length; + uint64_t pos = offset; + uint64_t stripe_off = pos % stripe_size; + while (pos < offset + length) { + if (stripe_off || end - pos < stripe_size) { + bufferlist stripe; + _do_read_stripe(o, pos - stripe_off, &stripe); + dout(30) << __func__ << " stripe " << pos - stripe_off << " got " + << stripe.length() << dendl; + bufferlist bl; + bl.substr_of(stripe, 0, std::min<uint64_t>(stripe.length(), stripe_off)); + if (end >= pos - stripe_off + stripe_size || + end >= o->onode.size) { + dout(20) << __func__ << " truncated stripe " << pos - stripe_off + << " to " << bl.length() << dendl; + } else { + auto len = end - (pos - stripe_off + bl.length()); + bl.append_zero(len); + dout(20) << __func__ << " adding " << len << " of zeros" << dendl; + if (stripe.length() > bl.length()) { + unsigned l = stripe.length() - bl.length(); + bufferlist t; + t.substr_of(stripe, stripe.length() - l, l); + dout(20) << __func__ << " keeping tail " << l << " of stripe" << dendl; + bl.claim_append(t); + } + } + _do_write_stripe(txc, o, pos - stripe_off, bl); + pos += stripe_size - stripe_off; + stripe_off = 0; + } else { + dout(20) << __func__ << " rm stripe " << pos << dendl; + _do_remove_stripe(txc, o, pos - stripe_off); + pos += stripe_size; + } + } + } + if (offset + length > o->onode.size) { + o->onode.size = offset + length; + dout(20) << __func__ << " extending size to " << offset + length + << dendl; + } + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; +} + +int KStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset) +{ + uint64_t stripe_size = o->onode.stripe_size; + + o->flush(); + + // trim down stripes + if (stripe_size) { + uint64_t pos = offset; + uint64_t stripe_off = pos % stripe_size; + while (pos < o->onode.size) { + if (stripe_off) { + bufferlist stripe; + _do_read_stripe(o, pos - stripe_off, &stripe); + dout(30) << __func__ << " stripe " << pos - stripe_off << " got " + << stripe.length() << dendl; + bufferlist t; + t.substr_of(stripe, 0, std::min<uint64_t>(stripe_off, stripe.length())); + _do_write_stripe(txc, o, pos - stripe_off, t); + dout(20) << __func__ << " truncated stripe " << pos - stripe_off + << " to " << t.length() << dendl; + pos += stripe_size - stripe_off; + stripe_off = 0; + } else { + dout(20) << __func__ << " rm stripe " << pos << dendl; + _do_remove_stripe(txc, o, pos - stripe_off); + pos += stripe_size; + } + } + + // trim down cached tail + if (o->tail_bl.length()) { + if (offset / stripe_size != o->onode.size / stripe_size) { + dout(20) << __func__ << " clear cached tail" << dendl; + o->clear_tail(); + } + } + } + + o->onode.size = offset; + dout(10) << __func__ << " truncate size to " << offset << dendl; + + txc->write_onode(o); + return 0; +} + +int KStore::_truncate(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << offset + << dendl; + int r = _do_truncate(txc, o, offset); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << offset + << " = " << r << dendl; + return r; +} + +int KStore::_do_remove(TransContext *txc, + OnodeRef o) +{ + string key; + + _do_truncate(txc, o, 0); + + o->onode.size = 0; + if (o->onode.omap_head) { + _do_omap_clear(txc, 
o->onode.omap_head); + } + o->exists = false; + o->onode = kstore_onode_t(); + txc->onodes.erase(o); + get_object_key(cct, o->oid, &key); + txc->t->rmkey(PREFIX_OBJ, key); + return 0; +} + +int KStore::_remove(TransContext *txc, + CollectionRef& c, + OnodeRef &o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = _do_remove(txc, o); + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int KStore::_setattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name, + bufferptr& val) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " (" << val.length() << " bytes)" + << dendl; + int r = 0; + o->onode.attrs[name] = val; + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " (" << val.length() << " bytes)" + << " = " << r << dendl; + return r; +} + +int KStore::_setattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const map<string,bufferptr>& aset) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << aset.size() << " keys" + << dendl; + int r = 0; + for (map<string,bufferptr>::const_iterator p = aset.begin(); + p != aset.end(); ++p) { + if (p->second.is_partial()) + o->onode.attrs[p->first] = bufferptr(p->second.c_str(), p->second.length()); + else + o->onode.attrs[p->first] = p->second; + } + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << aset.size() << " keys" + << " = " << r << dendl; + return r; +} + + +int KStore::_rmattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << name << dendl; + int r = 0; + o->onode.attrs.erase(name); + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " = " << r << dendl; + return r; +} + +int KStore::_rmattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + o->onode.attrs.clear(); + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +void KStore::_do_omap_clear(TransContext *txc, uint64_t id) +{ + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string prefix, tail; + get_omap_header(id, &prefix); + get_omap_tail(id, &tail); + it->lower_bound(prefix); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " stop at " << tail << dendl; + break; + } + txc->t->rmkey(PREFIX_OMAP, it->key()); + dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl; + it->next(); + } +} + +int KStore::_omap_clear(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + if (o->onode.omap_head != 0) { + _do_omap_clear(txc, o->onode.omap_head); + } + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int KStore::_omap_setkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist &bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r; + auto p = bl.cbegin(); + __u32 num; + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } + decode(num, p); + while (num--) { + string key; + bufferlist value; + decode(key, p); + decode(value, p); + string final_key; + get_omap_key(o->onode.omap_head, key, 
&final_key); + dout(30) << __func__ << " " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->set(PREFIX_OMAP, final_key, value); + } + r = 0; + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int KStore::_omap_setheader(TransContext *txc, + CollectionRef& c, + OnodeRef &o, + bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r; + string key; + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } + get_omap_header(o->onode.omap_head, &key); + txc->t->set(PREFIX_OMAP, key, bl); + r = 0; + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int KStore::_omap_rmkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + auto p = bl.cbegin(); + __u32 num; + + if (!o->onode.omap_head) { + r = 0; + goto out; + } + decode(num, p); + while (num--) { + string key; + decode(key, p); + string final_key; + get_omap_key(o->onode.omap_head, key, &final_key); + dout(30) << __func__ << " rm " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->rmkey(PREFIX_OMAP, final_key); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int KStore::_omap_rmkey_range(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& first, const string& last) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + KeyValueDB::Iterator it; + string key_first, key_last; + int r = 0; + + if (!o->onode.omap_head) { + goto out; + } + it = db->get_iterator(PREFIX_OMAP); + get_omap_key(o->onode.omap_head, first, &key_first); + get_omap_key(o->onode.omap_head, last, &key_last); + it->lower_bound(key_first); + while (it->valid()) { + if (it->key() >= key_last) { + dout(30) << __func__ << " stop at " << pretty_binary_string(key_last) + << dendl; + break; + } + txc->t->rmkey(PREFIX_OMAP, it->key()); + dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl; + it->next(); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int KStore::_setallochint(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " flags " << flags + << dendl; + int r = 0; + o->onode.expected_object_size = expected_object_size; + o->onode.expected_write_size = expected_write_size; + o->onode.alloc_hint_flags = flags; + + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " = " << r << dendl; + return r; +} + +int KStore::_clone(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << dendl; + int r = 0; + if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) { + derr << __func__ << " mismatched hash on " << oldo->oid + << " and " << newo->oid << dendl; + return -EINVAL; + } + + bufferlist bl; + newo->exists = true; + _assign_nid(txc, newo); + + // data + oldo->flush(); + + r = _do_read(oldo, 0, 
oldo->onode.size, bl, 0); + if (r < 0) + goto out; + + // truncate any old data + r = _do_truncate(txc, newo, 0); + if (r < 0) + goto out; + + r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + + newo->onode.attrs = oldo->onode.attrs; + + // clone omap + if (newo->onode.omap_head) { + dout(20) << __func__ << " clearing old omap data" << dendl; + _do_omap_clear(txc, newo->onode.omap_head); + } + if (oldo->onode.omap_head) { + dout(20) << __func__ << " copying omap data" << dendl; + if (!newo->onode.omap_head) { + newo->onode.omap_head = newo->onode.nid; + } + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(oldo->onode.omap_head, &head); + get_omap_tail(oldo->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + string key; + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + dout(30) << __func__ << " got header/data " + << pretty_binary_string(it->key()) << dendl; + ceph_assert(it->key() < tail); + rewrite_omap_key(newo->onode.omap_head, it->key(), &key); + txc->t->set(PREFIX_OMAP, key, it->value()); + } + it->next(); + } + } + + txc->write_onode(newo); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " = " << r << dendl; + return r; +} + +int KStore::_clone_range(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, uint64_t length, uint64_t dstoff) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " from " << srcoff << "~" << length + << " to offset " << dstoff << dendl; + int r = 0; + + bufferlist bl; + newo->exists = true; + _assign_nid(txc, newo); + + r = _do_read(oldo, srcoff, length, bl, 0); + if (r < 0) + goto out; + + r = _do_write(txc, newo, dstoff, bl.length(), bl, 0); + if (r < 0) + goto out; + + txc->write_onode(newo); + + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " from " << srcoff << "~" << length + << " to offset " << dstoff + << " = " << r << dendl; + return r; +} + +int KStore::_rename(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + const ghobject_t& new_oid) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << new_oid << dendl; + int r; + ghobject_t old_oid = oldo->oid; + bufferlist bl; + string old_key, new_key; + + if (newo && newo->exists) { + // destination object already exists, remove it first + r = _do_remove(txc, newo); + if (r < 0) + goto out; + } + + txc->t->rmkey(PREFIX_OBJ, oldo->key); + txc->write_onode(oldo); + c->onode_map.rename(old_oid, new_oid); // this adjusts oldo->{oid,key} + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; +} + +// collections + +int KStore::_create_collection( + TransContext *txc, + coll_t cid, + unsigned bits, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << " bits " << bits << dendl; + int r; + bufferlist bl; + + { + RWLock::WLocker l(coll_lock); + if (*c) { + r = -EEXIST; + goto out; + } + auto p = new_coll_map.find(cid); + ceph_assert(p != new_coll_map.end()); + *c = p->second; + ceph_assert((*c)->cid == cid); + (*c)->cnode.bits = bits; + coll_map[cid] = *c; + new_coll_map.erase(p); + } + encode((*c)->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(cid), bl); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " bits " << bits << " 
= " << r << dendl; + return r; +} + +int KStore::_remove_collection(TransContext *txc, coll_t cid, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << dendl; + int r; + + { + RWLock::WLocker l(coll_lock); + if (!*c) { + r = -ENOENT; + goto out; + } + size_t nonexistent_count = 0; + pair<ghobject_t,OnodeRef> next_onode; + while ((*c)->onode_map.get_next(next_onode.first, &next_onode)) { + if (next_onode.second->exists) { + r = -ENOTEMPTY; + goto out; + } + ++nonexistent_count; + } + vector<ghobject_t> ls; + ghobject_t next; + // Enumerate onodes in db, up to nonexistent_count + 1 + // then check if all of them are marked as non-existent. + // Bypass the check if returned number is greater than nonexistent_count + r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(), + nonexistent_count + 1, &ls, &next); + if (r >= 0) { + bool exists = false; //ls.size() > nonexistent_count; + for (auto it = ls.begin(); !exists && it < ls.end(); ++it) { + dout(10) << __func__ << " oid " << *it << dendl; + auto onode = (*c)->onode_map.lookup(*it); + exists = !onode || onode->exists; + if (exists) { + dout(10) << __func__ << " " << *it + << " exists in db" << dendl; + } + } + if (!exists) { + coll_map.erase(cid); + txc->removed_collections.push_back(*c); + c->reset(); + txc->t->rmkey(PREFIX_COLL, stringify(cid)); + r = 0; + } else { + dout(10) << __func__ << " " << cid + << " is non-empty" << dendl; + r = -ENOTEMPTY; + } + } + } + + out: + dout(10) << __func__ << " " << cid << " = " << r << dendl; + return r; +} + +int KStore::_split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem) +{ + dout(15) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << dendl; + int r; + RWLock::WLocker l(c->lock); + RWLock::WLocker l2(d->lock); + c->onode_map.clear(); + d->onode_map.clear(); + c->cnode.bits = bits; + ceph_assert(d->cnode.bits == bits); + r = 0; + + bufferlist bl; + encode(c->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(c->cid), bl); + + dout(10) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +int KStore::_merge_collection(TransContext *txc, + CollectionRef *c, + CollectionRef& d, + unsigned bits) +{ + dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid << " " + << " bits " << bits << dendl; + int r; + RWLock::WLocker l((*c)->lock); + RWLock::WLocker l2(d->lock); + (*c)->onode_map.clear(); + d->onode_map.clear(); + d->cnode.bits = bits; + r = 0; + + coll_t cid = (*c)->cid; + + bufferlist bl; + encode(d->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(d->cid), bl); + + coll_map.erase((*c)->cid); + txc->removed_collections.push_back(*c); + c->reset(); + txc->t->rmkey(PREFIX_COLL, stringify(cid)); + + dout(10) << __func__ << " " << cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +// =========================================== diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h new file mode 100644 index 00000000..227227fb --- /dev/null +++ b/src/os/kstore/KStore.h @@ -0,0 +1,692 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OSD_KSTORE_H +#define CEPH_OSD_KSTORE_H + +#include "acconfig.h" + +#include <unistd.h> + +#include <atomic> +#include <mutex> +#include <condition_variable> + +#include "include/ceph_assert.h" +#include "include/unordered_map.h" +#include "common/Finisher.h" +#include "common/RWLock.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" +#include "os/ObjectStore.h" +#include "common/perf_counters.h" +#include "os/fs/FS.h" +#include "kv/KeyValueDB.h" + +#include "kstore_types.h" + +#include "boost/intrusive/list.hpp" + +enum { + l_kstore_first = 832430, + l_kstore_state_prepare_lat, + l_kstore_state_kv_queued_lat, + l_kstore_state_kv_done_lat, + l_kstore_state_finishing_lat, + l_kstore_state_done_lat, + l_kstore_last +}; + +class KStore : public ObjectStore { + // ----------------------------------------------------- + // types +public: + + class TransContext; + + /// an in-memory object + struct Onode { + CephContext* cct; + std::atomic_int nref; ///< reference count + + ghobject_t oid; + string key; ///< key under PREFIX_OBJ where we are stored + boost::intrusive::list_member_hook<> lru_item; + + kstore_onode_t onode; ///< metadata stored as value in kv store + bool dirty; // ??? + bool exists; + + std::mutex flush_lock; ///< protect flush_txns + std::condition_variable flush_cond; ///< wait here for unapplied txns + set<TransContext*> flush_txns; ///< committing txns + + uint64_t tail_offset; + bufferlist tail_bl; + + map<uint64_t,bufferlist> pending_stripes; ///< unwritten stripes + + Onode(CephContext* cct, const ghobject_t& o, const string& k) + : cct(cct), + nref(0), + oid(o), + key(k), + dirty(false), + exists(false), + tail_offset(0) { + } + + void flush(); + void get() { + ++nref; + } + void put() { + if (--nref == 0) + delete this; + } + + void clear_tail() { + tail_offset = 0; + tail_bl.clear(); + } + void clear_pending_stripes() { + pending_stripes.clear(); + } + }; + typedef boost::intrusive_ptr<Onode> OnodeRef; + + struct OnodeHashLRU { + CephContext* cct; + typedef boost::intrusive::list< + Onode, + boost::intrusive::member_hook< + Onode, + boost::intrusive::list_member_hook<>, + &Onode::lru_item> > lru_list_t; + + std::mutex lock; + ceph::unordered_map<ghobject_t,OnodeRef> onode_map; ///< forward lookups + lru_list_t lru; ///< lru + + OnodeHashLRU(CephContext* cct) : cct(cct) {} + + void add(const ghobject_t& oid, OnodeRef o); + void _touch(OnodeRef o); + OnodeRef lookup(const ghobject_t& o); + void rename(const ghobject_t& old_oid, const ghobject_t& new_oid); + void clear(); + bool get_next(const ghobject_t& after, pair<ghobject_t,OnodeRef> *next); + int trim(int max=-1); + }; + + class OpSequencer; + typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef; + + struct Collection : public CollectionImpl { + KStore *store; + kstore_cnode_t cnode; + RWLock lock; + + OpSequencerRef osr; + + // cache onodes on a per-collection basis to avoid lock + // contention. 
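    // The cache is an LRU, trimmed back to kstore_onode_map_size as
    // transactions complete (see _osr_reap_done()).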
+ OnodeHashLRU onode_map; + + OnodeRef get_onode(const ghobject_t& oid, bool create); + + bool contains(const ghobject_t& oid) { + if (cid.is_meta()) + return oid.hobj.pool == -1; + spg_t spgid; + if (cid.is_pg(&spgid)) + return + spgid.pgid.contains(cnode.bits, oid) && + oid.shard_id == spgid.shard; + return false; + } + + void flush() override; + bool flush_commit(Context *c) override; + + Collection(KStore *ns, coll_t c); + }; + typedef boost::intrusive_ptr<Collection> CollectionRef; + + class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { + CollectionRef c; + OnodeRef o; + KeyValueDB::Iterator it; + string head, tail; + public: + OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it); + int seek_to_first() override; + int upper_bound(const string &after) override; + int lower_bound(const string &to) override; + bool valid() override; + int next() override; + string key() override; + bufferlist value() override; + int status() override { + return 0; + } + }; + + struct TransContext { + typedef enum { + STATE_PREPARE, + STATE_AIO_WAIT, + STATE_IO_DONE, + STATE_KV_QUEUED, + STATE_KV_COMMITTING, + STATE_KV_DONE, + STATE_FINISHING, + STATE_DONE, + } state_t; + + state_t state; + + const char *get_state_name() { + switch (state) { + case STATE_PREPARE: return "prepare"; + case STATE_AIO_WAIT: return "aio_wait"; + case STATE_IO_DONE: return "io_done"; + case STATE_KV_QUEUED: return "kv_queued"; + case STATE_KV_COMMITTING: return "kv_committing"; + case STATE_KV_DONE: return "kv_done"; + case STATE_FINISHING: return "finishing"; + case STATE_DONE: return "done"; + } + return "???"; + } + + void log_state_latency(PerfCounters *logger, int state) { + utime_t lat, now = ceph_clock_now(); + lat = now - start; + logger->tinc(state, lat); + start = now; + } + + CollectionRef ch; + OpSequencerRef osr; + boost::intrusive::list_member_hook<> sequencer_item; + + uint64_t ops, bytes; + + set<OnodeRef> onodes; ///< these onodes need to be updated/written + KeyValueDB::Transaction t; ///< then we will commit this + Context *oncommit; ///< signal on commit + Context *onreadable; ///< signal on readable + Context *onreadable_sync; ///< signal on readable + list<Context*> oncommits; ///< more commit completions + list<CollectionRef> removed_collections; ///< colls we removed + + CollectionRef first_collection; ///< first referenced collection + utime_t start; + explicit TransContext(OpSequencer *o) + : state(STATE_PREPARE), + osr(o), + ops(0), + bytes(0), + oncommit(NULL), + onreadable(NULL), + onreadable_sync(NULL), + start(ceph_clock_now()){ + //cout << "txc new " << this << std::endl; + } + ~TransContext() { + //cout << "txc del " << this << std::endl; + } + + void write_onode(OnodeRef &o) { + onodes.insert(o); + } + }; + + class OpSequencer : public RefCountedObject { + public: + std::mutex qlock; + std::condition_variable qcond; + typedef boost::intrusive::list< + TransContext, + boost::intrusive::member_hook< + TransContext, + boost::intrusive::list_member_hook<>, + &TransContext::sequencer_item> > q_list_t; + q_list_t q; ///< transactions + + ~OpSequencer() { + ceph_assert(q.empty()); + } + + void queue_new(TransContext *txc) { + std::lock_guard<std::mutex> l(qlock); + q.push_back(*txc); + } + + void flush() { + std::unique_lock<std::mutex> l(qlock); + while (!q.empty()) + qcond.wait(l); + } + + bool flush_commit(Context *c) { + std::lock_guard<std::mutex> l(qlock); + if (q.empty()) { + return true; + } + TransContext *txc = &q.back(); + if (txc->state >= 
TransContext::STATE_KV_DONE) { + return true; + } + ceph_assert(txc->state < TransContext::STATE_KV_DONE); + txc->oncommits.push_back(c); + return false; + } + }; + + struct KVSyncThread : public Thread { + KStore *store; + explicit KVSyncThread(KStore *s) : store(s) {} + void *entry() override { + store->_kv_sync_thread(); + return NULL; + } + }; + + // -------------------------------------------------------- + // members +private: + KeyValueDB *db; + uuid_d fsid; + string basedir; + int path_fd; ///< open handle to $path + int fsid_fd; ///< open handle (locked) to $path/fsid + bool mounted; + + RWLock coll_lock; ///< rwlock to protect coll_map + ceph::unordered_map<coll_t, CollectionRef> coll_map; + map<coll_t,CollectionRef> new_coll_map; + + std::mutex nid_lock; + uint64_t nid_last; + uint64_t nid_max; + + Throttle throttle_ops, throttle_bytes; ///< submit to commit + + Finisher finisher; + + KVSyncThread kv_sync_thread; + std::mutex kv_lock; + std::condition_variable kv_cond, kv_sync_cond; + bool kv_stop; + deque<TransContext*> kv_queue, kv_committing; + + //Logger *logger; + PerfCounters *logger; + std::mutex reap_lock; + list<CollectionRef> removed_collections; + + + // -------------------------------------------------------- + // private methods + + void _init_logger(); + void _shutdown_logger(); + + int _open_path(); + void _close_path(); + int _open_fsid(bool create); + int _lock_fsid(); + int _read_fsid(uuid_d *f); + int _write_fsid(); + void _close_fsid(); + int _open_db(bool create); + void _close_db(); + int _open_collections(int *errors=0); + void _close_collections(); + + int _open_super_meta(); + + CollectionRef _get_collection(coll_t cid); + void _queue_reap_collection(CollectionRef& c); + void _reap_collections(); + + void _assign_nid(TransContext *txc, OnodeRef o); + + void _dump_onode(OnodeRef o); + + TransContext *_txc_create(OpSequencer *osr); + void _txc_release(TransContext *txc, uint64_t offset, uint64_t length); + void _txc_add_transaction(TransContext *txc, Transaction *t); + void _txc_finalize(OpSequencer *osr, TransContext *txc); + void _txc_state_proc(TransContext *txc); + void _txc_finish_kv(TransContext *txc); + void _txc_finish(TransContext *txc); + + void _osr_reap_done(OpSequencer *osr); + + void _kv_sync_thread(); + void _kv_stop() { + { + std::lock_guard<std::mutex> l(kv_lock); + kv_stop = true; + kv_cond.notify_all(); + } + kv_sync_thread.join(); + kv_stop = false; + } + + void _do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl); + void _do_write_stripe(TransContext *txc, OnodeRef o, + uint64_t offset, bufferlist& bl); + void _do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset); + + int _collection_list( + Collection *c, const ghobject_t& start, const ghobject_t& end, + int max, vector<ghobject_t> *ls, ghobject_t *next); + +public: + KStore(CephContext *cct, const string& path); + ~KStore() override; + + string get_type() override { + return "kstore"; + } + + bool needs_journal() override { return false; }; + bool wants_journal() override { return false; }; + bool allows_journal() override { return false; }; + + static int get_block_device_fsid(const string& path, uuid_d *fsid); + + bool test_mount_in_use() override; + + int mount() override; + int umount() override; + void _sync(); + + int fsck(bool deep) override; + + + int validate_hobject_key(const hobject_t &obj) const override { + return 0; + } + unsigned get_max_attr_name_length() override { + return 256; // arbitrary; there is no real limit internally + } + + int mkfs() 
override; + int mkjournal() override { + return 0; + } + void dump_perf_counters(Formatter *f) override { + f->open_object_section("perf_counters"); + logger->dump_formatted(f, false); + f->close_section(); + } + void get_db_statistics(Formatter *f) override { + db->get_statistics(f); + } + int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) override; + int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override; + + CollectionHandle open_collection(const coll_t& c) override; + CollectionHandle create_new_collection(const coll_t& c) override; + void set_collection_commit_queue(const coll_t& cid, + ContextQueue *commit_queue) override { + } + + using ObjectStore::exists; + bool exists(CollectionHandle& c, const ghobject_t& oid) override; + using ObjectStore::stat; + int stat( + CollectionHandle& c, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false) override; // struct stat? + int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) override; + using ObjectStore::read; + int read( + CollectionHandle& c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) override; + int _do_read( + OnodeRef o, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0); + + using ObjectStore::fiemap; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& outbl) override; + using ObjectStore::getattr; + int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr& value) override; + using ObjectStore::getattrs; + int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override; + + int list_collections(vector<coll_t>& ls) override; + bool collection_exists(const coll_t& c) override; + int collection_empty(CollectionHandle& c, bool *empty) override; + int collection_bits(CollectionHandle& c) override; + int collection_list( + CollectionHandle &c, const ghobject_t& start, const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) override; + + using ObjectStore::omap_get; + int omap_get( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) override; + + using ObjectStore::omap_get_header; + /// Get omap header + int omap_get_header( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ) override; + + using ObjectStore::omap_get_keys; + /// Get keys defined on oid + int omap_get_keys( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) override; + + using ObjectStore::omap_get_values; + /// Get key values + int omap_get_values( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) override; + + using ObjectStore::omap_check_keys; + /// Filters keys into out which are defined on oid + int omap_check_keys( + 
CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) override; + + using ObjectStore::get_omap_iterator; + ObjectMap::ObjectMapIterator get_omap_iterator( + CollectionHandle& c, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) override; + + void set_fsid(uuid_d u) override { + fsid = u; + } + uuid_d get_fsid() override { + return fsid; + } + + uint64_t estimate_objects_overhead(uint64_t num_objects) override { + return num_objects * 300; //assuming per-object overhead is 300 bytes + } + + objectstore_perf_stat_t get_cur_stats() override { + return objectstore_perf_stat_t(); + } + const PerfCounters* get_perf_counters() const override { + return logger; + } + + + int queue_transactions( + CollectionHandle& ch, + vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) override; + + void compact () override { + ceph_assert(db); + db->compact(); + } + +private: + // -------------------------------------------------------- + // write ops + + int _write(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t len, + bufferlist& bl, + uint32_t fadvise_flags); + int _do_write(TransContext *txc, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist& bl, + uint32_t fadvise_flags); + int _touch(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + int _zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t len); + int _do_truncate(TransContext *txc, + OnodeRef o, + uint64_t offset); + int _truncate(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset); + int _remove(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + int _do_remove(TransContext *txc, + OnodeRef o); + int _setattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name, + bufferptr& val); + int _setattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const map<string,bufferptr>& aset); + int _rmattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name); + int _rmattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + void _do_omap_clear(TransContext *txc, uint64_t id); + int _omap_clear(TransContext *txc, + CollectionRef& c, + OnodeRef& o); + int _omap_setkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& bl); + int _omap_setheader(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& header); + int _omap_rmkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const bufferlist& bl); + int _omap_rmkey_range(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& first, const string& last); + int _setallochint(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + int _clone(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo); + int _clone_range(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, uint64_t length, uint64_t dstoff); + int _rename(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + const ghobject_t& new_oid); + int _create_collection(TransContext *txc, coll_t cid, unsigned bits, + CollectionRef *c); + int _remove_collection(TransContext *txc, coll_t cid, CollectionRef *c); + int _split_collection(TransContext *txc, + 
CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem); + int _merge_collection(TransContext *txc, + CollectionRef *c, + CollectionRef& d, + unsigned bits); + +}; + +static inline void intrusive_ptr_add_ref(KStore::Onode *o) { + o->get(); +} +static inline void intrusive_ptr_release(KStore::Onode *o) { + o->put(); +} + +static inline void intrusive_ptr_add_ref(KStore::OpSequencer *o) { + o->get(); +} +static inline void intrusive_ptr_release(KStore::OpSequencer *o) { + o->put(); +} + +#endif diff --git a/src/os/kstore/kstore_types.cc b/src/os/kstore/kstore_types.cc new file mode 100644 index 00000000..07270374 --- /dev/null +++ b/src/os/kstore/kstore_types.cc @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "kstore_types.h" +#include "common/Formatter.h" +#include "include/stringify.h" + +// cnode_t + +void kstore_cnode_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(bits, bl); + ENCODE_FINISH(bl); +} + +void kstore_cnode_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(bits, p); + DECODE_FINISH(p); +} + +void kstore_cnode_t::dump(Formatter *f) const +{ + f->dump_unsigned("bits", bits); +} + +void kstore_cnode_t::generate_test_instances(list<kstore_cnode_t*>& o) +{ + o.push_back(new kstore_cnode_t()); + o.push_back(new kstore_cnode_t(0)); + o.push_back(new kstore_cnode_t(123)); +} + + +// kstore_onode_t + +void kstore_onode_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(nid, bl); + encode(size, bl); + encode(attrs, bl); + encode(omap_head, bl); + encode(stripe_size, bl); + encode(expected_object_size, bl); + encode(expected_write_size, bl); + encode(alloc_hint_flags, bl); + ENCODE_FINISH(bl); +} + +void kstore_onode_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(nid, p); + decode(size, p); + decode(attrs, p); + decode(omap_head, p); + decode(stripe_size, p); + decode(expected_object_size, p); + decode(expected_write_size, p); + decode(alloc_hint_flags, p); + DECODE_FINISH(p); +} + +void kstore_onode_t::dump(Formatter *f) const +{ + f->dump_unsigned("nid", nid); + f->dump_unsigned("size", size); + f->open_object_section("attrs"); + for (map<string,bufferptr>::const_iterator p = attrs.begin(); + p != attrs.end(); ++p) { + f->open_object_section("attr"); + f->dump_string("name", p->first); + f->dump_unsigned("len", p->second.length()); + f->close_section(); + } + f->close_section(); + f->dump_unsigned("omap_head", omap_head); + f->dump_unsigned("stripe_size", stripe_size); + f->dump_unsigned("expected_object_size", expected_object_size); + f->dump_unsigned("expected_write_size", expected_write_size); + f->dump_unsigned("alloc_hint_flags", alloc_hint_flags); +} + +void kstore_onode_t::generate_test_instances(list<kstore_onode_t*>& o) +{ + o.push_back(new kstore_onode_t()); + // FIXME +} diff --git a/src/os/kstore/kstore_types.h b/src/os/kstore/kstore_types.h new file mode 100644 index 00000000..13c33fb6 --- /dev/null +++ b/src/os/kstore/kstore_types.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed 
file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_KSTORE_TYPES_H +#define CEPH_OSD_KSTORE_TYPES_H + +#include <ostream> +#include "include/types.h" +#include "include/interval_set.h" +#include "include/utime.h" +#include "common/hobject.h" + +namespace ceph { + class Formatter; +} +/// collection metadata +struct kstore_cnode_t { + uint32_t bits; ///< how many bits of coll pgid are significant + + explicit kstore_cnode_t(int b=0) : bits(b) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<kstore_cnode_t*>& o); +}; +WRITE_CLASS_ENCODER(kstore_cnode_t) + +/// onode: per-object metadata +struct kstore_onode_t { + uint64_t nid; ///< numeric id (locally unique) + uint64_t size; ///< object size + map<string, bufferptr> attrs; ///< attrs + uint64_t omap_head; ///< id for omap root node + uint32_t stripe_size; ///< stripe size + + uint32_t expected_object_size; + uint32_t expected_write_size; + uint32_t alloc_hint_flags; + + kstore_onode_t() + : nid(0), + size(0), + omap_head(0), + stripe_size(0), + expected_object_size(0), + expected_write_size(0), + alloc_hint_flags(0) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<kstore_onode_t*>& o); +}; +WRITE_CLASS_ENCODER(kstore_onode_t) + +#endif diff --git a/src/os/kv.h b/src/os/kv.h new file mode 100644 index 00000000..64048b08 --- /dev/null +++ b/src/os/kv.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_KV_H +#define CEPH_OS_KV_H + +#include <string> +#include "include/byteorder.h" + +// some key encoding helpers +template<typename T> +inline static void _key_encode_u32(uint32_t u, T *key) { + uint32_t bu; +#ifdef CEPH_BIG_ENDIAN + bu = u; +#elif defined(CEPH_LITTLE_ENDIAN) + bu = swab(u); +#else +# error wtf +#endif + key->append((char*)&bu, 4); +} + +template<typename T> +inline static void _key_encode_u32(uint32_t u, size_t pos, T *key) { + uint32_t bu; +#ifdef CEPH_BIG_ENDIAN + bu = u; +#elif defined(CEPH_LITTLE_ENDIAN) + bu = swab(u); +#else +# error wtf +#endif + key->replace(pos, sizeof(bu), (char*)&bu, sizeof(bu)); +} + +inline static const char *_key_decode_u32(const char *key, uint32_t *pu) { + uint32_t bu; + memcpy(&bu, key, 4); +#ifdef CEPH_BIG_ENDIAN + *pu = bu; +#elif defined(CEPH_LITTLE_ENDIAN) + *pu = swab(bu); +#else +# error wtf +#endif + return key + 4; +} + +template<typename T> +inline static void _key_encode_u64(uint64_t u, T *key) { + uint64_t bu; +#ifdef CEPH_BIG_ENDIAN + bu = u; +#elif defined(CEPH_LITTLE_ENDIAN) + bu = swab(u); +#else +# error wtf +#endif + key->append((char*)&bu, 8); +} + +inline static const char *_key_decode_u64(const char *key, uint64_t *pu) { + uint64_t bu; + memcpy(&bu, key, 8); +#ifdef CEPH_BIG_ENDIAN + *pu = bu; +#elif defined(CEPH_LITTLE_ENDIAN) + *pu = swab(bu); +#else +# error wtf +#endif + return key + 8; +} + +#endif diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc new file mode 100644 index 00000000..dc1d5ff5 --- /dev/null +++ b/src/os/memstore/MemStore.cc @@ -0,0 +1,1801 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "acconfig.h" + +#ifdef HAVE_SYS_MOUNT_H +#include <sys/mount.h> +#endif + +#ifdef HAVE_SYS_PARAM_H +#include <sys/param.h> +#endif + +#include "include/types.h" +#include "include/stringify.h" +#include "include/unordered_map.h" +#include "common/errno.h" +#include "MemStore.h" +#include "include/compat.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "memstore(" << path << ") " + +// for comparing collections for lock ordering +bool operator>(const MemStore::CollectionRef& l, + const MemStore::CollectionRef& r) +{ + return (unsigned long)l.get() > (unsigned long)r.get(); +} + + +int MemStore::mount() +{ + int r = _load(); + if (r < 0) + return r; + finisher.start(); + return 0; +} + +int MemStore::umount() +{ + finisher.wait_for_empty(); + finisher.stop(); + return _save(); +} + +int MemStore::_save() +{ + dout(10) << __func__ << dendl; + dump_all(); + set<coll_t> collections; + for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) { + dout(20) << __func__ << " coll " << p->first << " " << p->second << dendl; + collections.insert(p->first); + bufferlist bl; + ceph_assert(p->second); + p->second->encode(bl); + string fn = path + "/" + stringify(p->first); + int r = bl.write_file(fn.c_str()); + if (r < 0) + return r; + } + + string fn = path + "/collections"; + bufferlist bl; + encode(collections, bl); + int r = bl.write_file(fn.c_str()); + if (r < 0) + return r; + + return 0; +} + +void MemStore::dump_all() +{ + Formatter *f = Formatter::create("json-pretty"); + f->open_object_section("store"); + dump(f); + f->close_section(); + dout(0) << "dump:"; + f->flush(*_dout); + *_dout << dendl; + delete f; +} + +void MemStore::dump(Formatter *f) +{ + f->open_array_section("collections"); + for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) { + f->open_object_section("collection"); + f->dump_string("name", stringify(p->first)); + + f->open_array_section("xattrs"); + for (map<string,bufferptr>::iterator q = p->second->xattr.begin(); + q != p->second->xattr.end(); + ++q) { + f->open_object_section("xattr"); + f->dump_string("name", q->first); + f->dump_int("length", q->second.length()); + f->close_section(); + } + f->close_section(); + + f->open_array_section("objects"); + for (map<ghobject_t,ObjectRef>::iterator q = p->second->object_map.begin(); + q != p->second->object_map.end(); + ++q) { + f->open_object_section("object"); + f->dump_string("name", stringify(q->first)); + if (q->second) + q->second->dump(f); + f->close_section(); + } + f->close_section(); + + f->close_section(); + } + f->close_section(); +} + +int MemStore::_load() +{ + dout(10) << __func__ << dendl; + bufferlist bl; + string fn = path + "/collections"; + string err; + int r = bl.read_file(fn.c_str(), &err); + if (r < 0) + return r; + + set<coll_t> collections; + auto p = bl.cbegin(); + decode(collections, p); + + for (set<coll_t>::iterator q = collections.begin(); + q != collections.end(); + ++q) { + string fn = path + "/" + stringify(*q); + bufferlist cbl; + int 
r = cbl.read_file(fn.c_str(), &err); + if (r < 0) + return r; + CollectionRef c(new Collection(cct, *q)); + auto p = cbl.cbegin(); + c->decode(p); + coll_map[*q] = c; + used_bytes += c->used_bytes(); + } + + dump_all(); + + return 0; +} + +void MemStore::set_fsid(uuid_d u) +{ + int r = write_meta("fsid", stringify(u)); + ceph_assert(r >= 0); +} + +uuid_d MemStore::get_fsid() +{ + string fsid_str; + int r = read_meta("fsid", &fsid_str); + ceph_assert(r >= 0); + uuid_d uuid; + bool b = uuid.parse(fsid_str.c_str()); + ceph_assert(b); + return uuid; +} + +int MemStore::mkfs() +{ + string fsid_str; + int r = read_meta("fsid", &fsid_str); + if (r == -ENOENT) { + uuid_d fsid; + fsid.generate_random(); + fsid_str = stringify(fsid); + r = write_meta("fsid", fsid_str); + if (r < 0) + return r; + dout(1) << __func__ << " new fsid " << fsid_str << dendl; + } else if (r < 0) { + return r; + } else { + dout(1) << __func__ << " had fsid " << fsid_str << dendl; + } + + string fn = path + "/collections"; + derr << path << dendl; + bufferlist bl; + set<coll_t> collections; + encode(collections, bl); + r = bl.write_file(fn.c_str()); + if (r < 0) + return r; + + r = write_meta("type", "memstore"); + if (r < 0) + return r; + + return 0; +} + +int MemStore::statfs(struct store_statfs_t *st, osd_alert_list_t* alerts) +{ + dout(10) << __func__ << dendl; + if (alerts) { + alerts->clear(); // returns nothing for now + } + st->reset(); + st->total = cct->_conf->memstore_device_bytes; + st->available = std::max<int64_t>(st->total - used_bytes, 0); + dout(10) << __func__ << ": used_bytes: " << used_bytes + << "/" << cct->_conf->memstore_device_bytes << dendl; + return 0; +} + +int MemStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) +{ + return -ENOTSUP; +} + +objectstore_perf_stat_t MemStore::get_cur_stats() +{ + // fixme + return objectstore_perf_stat_t(); +} + +MemStore::CollectionRef MemStore::get_collection(const coll_t& cid) +{ + std::shared_lock l{coll_lock}; + ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return CollectionRef(); + return cp->second; +} + +ObjectStore::CollectionHandle MemStore::create_new_collection(const coll_t& cid) +{ + std::lock_guard l{coll_lock}; + Collection *c = new Collection(cct, cid); + new_coll_map[cid] = c; + return c; +} + + +// --------------- +// read operations + +bool MemStore::exists(CollectionHandle &c_, const ghobject_t& oid) +{ + Collection *c = static_cast<Collection*>(c_.get()); + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + if (!c->exists) + return false; + + // Perform equivalent of c->get_object_(oid) != NULL. In C++11 the + // shared_ptr needs to be compared to nullptr. 
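
Editor's note: a minimal, self-contained sketch (not Ceph code; CollMap, Coll and std::shared_ptr stand in for the real coll_map/CollectionRef types) of the reader-writer locking pattern that get_collection() and create_new_collection() above apply to coll_map: lookups take a shared lock, inserts take an exclusive one.

  #include <map>
  #include <memory>
  #include <shared_mutex>
  #include <string>

  struct Coll { std::string name; };
  using CollRef = std::shared_ptr<Coll>;

  class CollMap {
    std::map<std::string, CollRef> colls;
    mutable std::shared_mutex lock;      // plays the role of coll_lock
  public:
    CollRef get(const std::string& cid) const {
      std::shared_lock l{lock};          // many concurrent readers
      auto it = colls.find(cid);
      return it == colls.end() ? nullptr : it->second;
    }
    CollRef create(const std::string& cid) {
      std::unique_lock l{lock};          // writers are exclusive
      auto& ref = colls[cid];
      if (!ref) ref = std::make_shared<Coll>(Coll{cid});
      return ref;
    }
  };

  int main() {
    CollMap m;
    CollRef c = m.create("pg_1.0");
    return m.get("pg_1.0") == c ? 0 : 1;
  }
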
+ return (bool)c->get_object(oid); +} + +int MemStore::stat( + CollectionHandle &c_, + const ghobject_t& oid, + struct stat *st, + bool allow_eio) +{ + Collection *c = static_cast<Collection*>(c_.get()); + dout(10) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return -ENOENT; + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + st->st_size = o->get_size(); + st->st_blksize = 4096; + st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; + st->st_nlink = 1; + return 0; +} + +int MemStore::set_collection_opts( + CollectionHandle& ch, + const pool_opts_t& opts) +{ + return -EOPNOTSUPP; +} + +int MemStore::read( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags) +{ + Collection *c = static_cast<Collection*>(c_.get()); + dout(10) << __func__ << " " << c->cid << " " << oid << " " + << offset << "~" << len << dendl; + if (!c->exists) + return -ENOENT; + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + if (offset >= o->get_size()) + return 0; + size_t l = len; + if (l == 0 && offset == 0) // note: len == 0 means read the entire object + l = o->get_size(); + else if (offset + l > o->get_size()) + l = o->get_size() - offset; + bl.clear(); + return o->read(offset, l, bl); +} + +int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid, + uint64_t offset, size_t len, bufferlist& bl) +{ + map<uint64_t, uint64_t> destmap; + int r = fiemap(ch, oid, offset, len, destmap); + if (r >= 0) + encode(destmap, bl); + return r; +} + +int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid, + uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << " " << offset << "~" + << len << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + size_t l = len; + if (offset + l > o->get_size()) + l = o->get_size() - offset; + if (offset >= o->get_size()) + goto out; + destmap[offset] = l; + out: + return 0; +} + +int MemStore::getattr(CollectionHandle &c_, const ghobject_t& oid, + const char *name, bufferptr& value) +{ + Collection *c = static_cast<Collection*>(c_.get()); + dout(10) << __func__ << " " << c->cid << " " << oid << " " << name << dendl; + if (!c->exists) + return -ENOENT; + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + string k(name); + std::lock_guard lock{o->xattr_mutex}; + if (!o->xattr.count(k)) { + return -ENODATA; + } + value = o->xattr[k]; + return 0; +} + +int MemStore::getattrs(CollectionHandle &c_, const ghobject_t& oid, + map<string,bufferptr>& aset) +{ + Collection *c = static_cast<Collection*>(c_.get()); + dout(10) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->xattr_mutex}; + aset = o->xattr; + return 0; +} + +int MemStore::list_collections(vector<coll_t>& ls) +{ + dout(10) << __func__ << dendl; + std::shared_lock l{coll_lock}; + for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) { + ls.push_back(p->first); + } + return 0; +} + +bool MemStore::collection_exists(const coll_t& cid) +{ + dout(10) << __func__ << " " << cid << dendl; + std::shared_lock l{coll_lock}; + return coll_map.count(cid); +} + +int MemStore::collection_empty(CollectionHandle& ch, bool *empty) +{ + dout(10) 
<< __func__ << " " << ch->cid << dendl; + CollectionRef c = static_cast<Collection*>(ch.get()); + std::shared_lock l{c->lock}; + *empty = c->object_map.empty(); + return 0; +} + +int MemStore::collection_bits(CollectionHandle& ch) +{ + dout(10) << __func__ << " " << ch->cid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + std::shared_lock l{c->lock}; + return c->bits; +} + +int MemStore::collection_list(CollectionHandle& ch, + const ghobject_t& start, + const ghobject_t& end, + int max, + vector<ghobject_t> *ls, ghobject_t *next) +{ + Collection *c = static_cast<Collection*>(ch.get()); + std::shared_lock l{c->lock}; + + dout(10) << __func__ << " cid " << ch->cid << " start " << start + << " end " << end << dendl; + map<ghobject_t,ObjectRef>::iterator p = c->object_map.lower_bound(start); + while (p != c->object_map.end() && + ls->size() < (unsigned)max && + p->first < end) { + ls->push_back(p->first); + ++p; + } + if (next != NULL) { + if (p == c->object_map.end()) + *next = ghobject_t::get_max(); + else + *next = p->first; + } + dout(10) << __func__ << " cid " << ch->cid << " got " << ls->size() << dendl; + return 0; +} + +int MemStore::omap_get( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + *header = o->omap_header; + *out = o->omap; + return 0; +} + +int MemStore::omap_get_header( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + *header = o->omap_header; + return 0; +} + +int MemStore::omap_get_keys( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + for (map<string,bufferlist>::iterator p = o->omap.begin(); + p != o->omap.end(); + ++p) + keys->insert(p->first); + return 0; +} + +int MemStore::omap_get_values( + CollectionHandle& ch, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + for (set<string>::const_iterator p = keys.begin(); + p != keys.end(); + ++p) { + map<string,bufferlist>::iterator q = o->omap.find(*p); + if (q != o->omap.end()) + out->insert(*q); + } + return 0; +} + +int MemStore::omap_check_keys( + CollectionHandle& ch, ///< [in] Collection 
containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + for (set<string>::const_iterator p = keys.begin(); + p != keys.end(); + ++p) { + map<string,bufferlist>::iterator q = o->omap.find(*p); + if (q != o->omap.end()) + out->insert(*p); + } + return 0; +} + +class MemStore::OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { + CollectionRef c; + ObjectRef o; + map<string,bufferlist>::iterator it; +public: + OmapIteratorImpl(CollectionRef c, ObjectRef o) + : c(c), o(o), it(o->omap.begin()) {} + + int seek_to_first() override { + std::lock_guard lock{o->omap_mutex}; + it = o->omap.begin(); + return 0; + } + int upper_bound(const string &after) override { + std::lock_guard lock{o->omap_mutex}; + it = o->omap.upper_bound(after); + return 0; + } + int lower_bound(const string &to) override { + std::lock_guard lock{o->omap_mutex}; + it = o->omap.lower_bound(to); + return 0; + } + bool valid() override { + std::lock_guard lock{o->omap_mutex}; + return it != o->omap.end(); + } + int next() override { + std::lock_guard lock{o->omap_mutex}; + ++it; + return 0; + } + string key() override { + std::lock_guard lock{o->omap_mutex}; + return it->first; + } + bufferlist value() override { + std::lock_guard lock{o->omap_mutex}; + return it->second; + } + int status() override { + return 0; + } +}; + +ObjectMap::ObjectMapIterator MemStore::get_omap_iterator( + CollectionHandle& ch, + const ghobject_t& oid) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) + return ObjectMap::ObjectMapIterator(); + return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o)); +} + + +// --------------- +// write operations + +int MemStore::queue_transactions( + CollectionHandle& ch, + vector<Transaction>& tls, + TrackedOpRef op, + ThreadPool::TPHandle *handle) +{ + // because memstore operations are synchronous, we can implement the + // Sequencer with a mutex. 
this guarantees ordering on a given sequencer, + // while allowing operations on different sequencers to happen in parallel + Collection *c = static_cast<Collection*>(ch.get()); + std::unique_lock lock{c->sequencer_mutex}; + + for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) { + // poke the TPHandle heartbeat just to exercise that code path + if (handle) + handle->reset_tp_timeout(); + + _do_transaction(*p); + } + + Context *on_apply = NULL, *on_apply_sync = NULL, *on_commit = NULL; + ObjectStore::Transaction::collect_contexts(tls, &on_apply, &on_commit, + &on_apply_sync); + if (on_apply_sync) + on_apply_sync->complete(0); + if (on_apply) + finisher.queue(on_apply); + if (on_commit) + finisher.queue(on_commit); + return 0; +} + +void MemStore::_do_transaction(Transaction& t) +{ + Transaction::iterator i = t.begin(); + int pos = 0; + + while (i.have_op()) { + Transaction::Op *op = i.decode_op(); + int r = 0; + + switch (op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_TOUCH: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _touch(cid, oid); + } + break; + + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(cid, oid, off, len, bl, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(cid, oid, off, len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + r = _truncate(cid, oid, off); + } + break; + + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid); + } + break; + + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + map<string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set); + } + break; + + case Transaction::OP_SETATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + map<string, bufferptr> aset; + i.decode_attrset(aset); + r = _setattrs(cid, oid, aset); + } + break; + + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + r = _rmattr(cid, oid, name.c_str()); + } + break; + + case Transaction::OP_RMATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _rmattrs(cid, oid); + } + break; + + case Transaction::OP_CLONE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + r = _clone(cid, oid, noid); + } + break; + + case Transaction::OP_CLONERANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _clone_range(cid, oid, noid, off, len, off); + } + break; + + case Transaction::OP_CLONERANGE2: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + 
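
Editor's note on the surrounding _do_transaction() loop: it decodes one op at a time and dispatches on the op code in a switch, replaying each op synchronously. Below is a toy, standalone sketch of that replay pattern; OpCode, Op and replay() are made-up names, not the real Transaction wire format.

  #include <cstdint>
  #include <iostream>
  #include <vector>

  enum class OpCode : uint8_t { Nop, Touch, Write, Remove };
  struct Op { OpCode op; uint64_t off = 0, len = 0; };

  // walk a flat op list the way _do_transaction() walks Transaction ops
  void replay(const std::vector<Op>& ops) {
    for (const Op& op : ops) {
      switch (op.op) {
      case OpCode::Nop:
        break;
      case OpCode::Touch:
        std::cout << "touch\n";
        break;
      case OpCode::Write:
        std::cout << "write " << op.off << "~" << op.len << "\n";
        break;
      case OpCode::Remove:
        std::cout << "remove\n";
        break;
      }
    }
  }

  int main() {
    replay({{OpCode::Touch}, {OpCode::Write, 0, 4096}, {OpCode::Remove}});
    return 0;
  }
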
ghobject_t noid = i.get_oid(op->dest_oid); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + r = _clone_range(cid, oid, noid, srcoff, len, dstoff); + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _create_collection(cid, op->split_bits); + } + break; + + case Transaction::OP_COLL_HINT: + { + coll_t cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + r = _collection_hint_expected_num_objs(cid, pg_num, num_objs); + } else { + // Ignore the hint + dout(10) << "Unrecognized collection hint type: " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _destroy_collection(cid); + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + r = _collection_add(ncid, ocid, oid); + } + break; + + case Transaction::OP_COLL_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid); + } + break; + + case Transaction::OP_COLL_MOVE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + coll_t oldcid = i.get_cid(op->cid); + ghobject_t oldoid = i.get_oid(op->oid); + coll_t newcid = i.get_cid(op->dest_cid); + ghobject_t newoid = i.get_oid(op->dest_oid); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid); + if (r == -ENOENT) + r = 0; + } + break; + + case Transaction::OP_TRY_RENAME: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oldoid = i.get_oid(op->oid); + ghobject_t newoid = i.get_oid(op->dest_oid); + r = _collection_move_rename(cid, oldoid, cid, newoid); + if (r == -ENOENT) + r = 0; + } + break; + + case Transaction::OP_COLL_SETATTR: + { + ceph_abort_msg("not implemented"); + } + break; + + case Transaction::OP_COLL_RMATTR: + { + ceph_abort_msg("not implemented"); + } + break; + + case Transaction::OP_COLL_RENAME: + { + ceph_abort_msg("not implemented"); + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(cid, oid); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + bufferlist aset_bl; + i.decode_attrset_bl(&aset_bl); + r = _omap_setkeys(cid, oid, aset_bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + bufferlist keys_bl; + i.decode_keyset_bl(&keys_bl); + r = _omap_rmkeys(cid, oid, keys_bl); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkeyrange(cid, oid, first, last); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(cid, oid, bl); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + ceph_abort_msg("deprecated"); + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = 
op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + r = _split_collection(cid, bits, rem, dest); + } + break; + case Transaction::OP_MERGE_COLLECTION: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + coll_t dest = i.get_cid(op->dest_cid); + r = _merge_collection(cid, bits, dest); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + r = 0; + } + break; + + case Transaction::OP_COLL_SET_BITS: + { + r = 0; + } + break; + + default: + derr << "bad op " << op->op << dendl; + ceph_abort(); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD)) + // -ENOENT is usually okay + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. + msg = "ENOSPC from MemStore, misconfigured cluster or insufficient memory"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + dump_all(); + } + + derr << " error " << cpp_strerror(r) << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + ceph_abort_msg("unexpected error"); + } + } + + ++pos; + } +} + +int MemStore::_touch(const coll_t& cid, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + c->get_or_create_object(oid); + return 0; +} + +int MemStore::_write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, const bufferlist& bl, + uint32_t fadvise_flags) +{ + dout(10) << __func__ << " " << cid << " " << oid << " " + << offset << "~" << len << dendl; + ceph_assert(len == bl.length()); + + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + if (len > 0) { + const ssize_t old_size = o->get_size(); + o->write(offset, bl); + used_bytes += (o->get_size() - old_size); + } + + return 0; +} + +int MemStore::_zero(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len) +{ + dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~" + << len << dendl; + bufferlist bl; + bl.append_zero(len); + return _write(cid, oid, offset, len, bl); +} + +int MemStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size) +{ + dout(10) << __func__ << " " << cid << " " << oid << " " << size << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + const ssize_t old_size = o->get_size(); + int r = o->truncate(size); + used_bytes += (o->get_size() - old_size); + return r; +} + +int MemStore::_remove(const coll_t& cid, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + std::lock_guard l{c->lock}; + + auto i = 
c->object_hash.find(oid); + if (i == c->object_hash.end()) + return -ENOENT; + used_bytes -= i->second->get_size(); + c->object_hash.erase(i); + c->object_map.erase(oid); + + return 0; +} + +int MemStore::_setattrs(const coll_t& cid, const ghobject_t& oid, + map<string,bufferptr>& aset) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->xattr_mutex}; + for (map<string,bufferptr>::const_iterator p = aset.begin(); p != aset.end(); ++p) + o->xattr[p->first] = p->second; + return 0; +} + +int MemStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) +{ + dout(10) << __func__ << " " << cid << " " << oid << " " << name << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->xattr_mutex}; + auto i = o->xattr.find(name); + if (i == o->xattr.end()) + return -ENODATA; + o->xattr.erase(i); + return 0; +} + +int MemStore::_rmattrs(const coll_t& cid, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->xattr_mutex}; + o->xattr.clear(); + return 0; +} + +int MemStore::_clone(const coll_t& cid, const ghobject_t& oldoid, + const ghobject_t& newoid) +{ + dout(10) << __func__ << " " << cid << " " << oldoid + << " -> " << newoid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef oo = c->get_object(oldoid); + if (!oo) + return -ENOENT; + ObjectRef no = c->get_or_create_object(newoid); + used_bytes += oo->get_size() - no->get_size(); + no->clone(oo.get(), 0, oo->get_size(), 0); + + // take xattr and omap locks with std::lock() + std::scoped_lock l{oo->xattr_mutex, + no->xattr_mutex, + oo->omap_mutex, + no->omap_mutex}; + + no->omap_header = oo->omap_header; + no->omap = oo->omap; + no->xattr = oo->xattr; + return 0; +} + +int MemStore::_clone_range(const coll_t& cid, const ghobject_t& oldoid, + const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(10) << __func__ << " " << cid << " " + << oldoid << " " << srcoff << "~" << len << " -> " + << newoid << " " << dstoff << "~" << len + << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef oo = c->get_object(oldoid); + if (!oo) + return -ENOENT; + ObjectRef no = c->get_or_create_object(newoid); + if (srcoff >= oo->get_size()) + return 0; + if (srcoff + len >= oo->get_size()) + len = oo->get_size() - srcoff; + + const ssize_t old_size = no->get_size(); + no->clone(oo.get(), srcoff, len, dstoff); + used_bytes += (no->get_size() - old_size); + + return len; +} + +int MemStore::_omap_clear(const coll_t& cid, const ghobject_t &oid) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + o->omap.clear(); + o->omap_header.clear(); + return 0; +} + +int MemStore::_omap_setkeys(const coll_t& cid, const ghobject_t &oid, + bufferlist& aset_bl) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = 
c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + auto p = aset_bl.cbegin(); + __u32 num; + decode(num, p); + while (num--) { + string key; + decode(key, p); + decode(o->omap[key], p); + } + return 0; +} + +int MemStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &oid, + bufferlist& keys_bl) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + auto p = keys_bl.cbegin(); + __u32 num; + decode(num, p); + while (num--) { + string key; + decode(key, p); + o->omap.erase(key); + } + return 0; +} + +int MemStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid, + const string& first, const string& last) +{ + dout(10) << __func__ << " " << cid << " " << oid << " " << first + << " " << last << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + map<string,bufferlist>::iterator p = o->omap.lower_bound(first); + map<string,bufferlist>::iterator e = o->omap.lower_bound(last); + o->omap.erase(p, e); + return 0; +} + +int MemStore::_omap_setheader(const coll_t& cid, const ghobject_t &oid, + const bufferlist &bl) +{ + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + std::lock_guard lock{o->omap_mutex}; + o->omap_header = bl; + return 0; +} + +int MemStore::_create_collection(const coll_t& cid, int bits) +{ + dout(10) << __func__ << " " << cid << dendl; + std::lock_guard l{coll_lock}; + auto result = coll_map.insert(std::make_pair(cid, CollectionRef())); + if (!result.second) + return -EEXIST; + auto p = new_coll_map.find(cid); + ceph_assert(p != new_coll_map.end()); + result.first->second = p->second; + result.first->second->bits = bits; + new_coll_map.erase(p); + return 0; +} + +int MemStore::_destroy_collection(const coll_t& cid) +{ + dout(10) << __func__ << " " << cid << dendl; + std::lock_guard l{coll_lock}; + ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return -ENOENT; + { + std::shared_lock l2{cp->second->lock}; + if (!cp->second->object_map.empty()) + return -ENOTEMPTY; + cp->second->exists = false; + } + used_bytes -= cp->second->used_bytes(); + coll_map.erase(cp); + return 0; +} + +int MemStore::_collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << cid << " " << ocid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + CollectionRef oc = get_collection(ocid); + if (!oc) + return -ENOENT; + + std::scoped_lock l{std::min(&(*c), &(*oc))->lock, + std::max(&(*c), &(*oc))->lock}; + + if (c->object_hash.count(oid)) + return -EEXIST; + if (oc->object_hash.count(oid) == 0) + return -ENOENT; + ObjectRef o = oc->object_hash[oid]; + c->object_map[oid] = o; + c->object_hash[oid] = o; + return 0; +} + +int MemStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t cid, const ghobject_t& oid) +{ + dout(10) << __func__ << " " << oldcid << " " << oldoid << " -> " + << cid << " " << oid << dendl; + CollectionRef c = get_collection(cid); + if (!c) + return -ENOENT; + CollectionRef oc = get_collection(oldcid); + 
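
Editor's note: _collection_add() and _split_collection() above lock two collections ordered by object address, so any two callers agree on the acquisition order. A standalone sketch of that idiom follows (Coll and with_both are illustrative names only); std::scoped_lock also applies its own deadlock-avoidance algorithm when handed several mutexes, and the two collections must be distinct.

  #include <algorithm>
  #include <mutex>

  struct Coll { std::mutex lock; /* object maps would live here */ };

  // acquire two distinct collections' locks in a stable (address) order,
  // mirroring the std::min/std::max scoped_lock pattern above
  void with_both(Coll& a, Coll& b) {
    Coll* first  = std::min(&a, &b);
    Coll* second = std::max(&a, &b);
    std::scoped_lock l{first->lock, second->lock};
    // ... move objects between the two collections here ...
  }

  int main() {
    Coll a, b;
    with_both(a, b);
    return 0;
  }
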
if (!oc) + return -ENOENT; + + // note: c and oc may be the same + ceph_assert(&(*c) == &(*oc)); + + std::lock_guard l{c->lock}; + if (c->object_hash.count(oid)) + return -EEXIST; + if (oc->object_hash.count(oldoid) == 0) + return -ENOENT; + { + ObjectRef o = oc->object_hash[oldoid]; + c->object_map[oid] = o; + c->object_hash[oid] = o; + oc->object_map.erase(oldoid); + oc->object_hash.erase(oldoid); + } + return 0; +} + +int MemStore::_split_collection(const coll_t& cid, uint32_t bits, uint32_t match, + coll_t dest) +{ + dout(10) << __func__ << " " << cid << " " << bits << " " << match << " " + << dest << dendl; + CollectionRef sc = get_collection(cid); + if (!sc) + return -ENOENT; + CollectionRef dc = get_collection(dest); + if (!dc) + return -ENOENT; + + std::scoped_lock l{std::min(&(*sc), &(*dc))->lock, + std::max(&(*sc), &(*dc))->lock}; + + map<ghobject_t,ObjectRef>::iterator p = sc->object_map.begin(); + while (p != sc->object_map.end()) { + if (p->first.match(bits, match)) { + dout(20) << " moving " << p->first << dendl; + dc->object_map.insert(make_pair(p->first, p->second)); + dc->object_hash.insert(make_pair(p->first, p->second)); + sc->object_hash.erase(p->first); + sc->object_map.erase(p++); + } else { + ++p; + } + } + + sc->bits = bits; + ceph_assert(dc->bits == (int)bits); + + return 0; +} + +int MemStore::_merge_collection(const coll_t& cid, uint32_t bits, coll_t dest) +{ + dout(10) << __func__ << " " << cid << " " << bits << " " + << dest << dendl; + CollectionRef sc = get_collection(cid); + if (!sc) + return -ENOENT; + CollectionRef dc = get_collection(dest); + if (!dc) + return -ENOENT; + { + std::scoped_lock l{std::min(&(*sc), &(*dc))->lock, + std::max(&(*sc), &(*dc))->lock}; + + map<ghobject_t,ObjectRef>::iterator p = sc->object_map.begin(); + while (p != sc->object_map.end()) { + dout(20) << " moving " << p->first << dendl; + dc->object_map.insert(make_pair(p->first, p->second)); + dc->object_hash.insert(make_pair(p->first, p->second)); + sc->object_hash.erase(p->first); + sc->object_map.erase(p++); + } + + dc->bits = bits; + } + + { + std::lock_guard l{coll_lock}; + ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid); + ceph_assert(cp != coll_map.end()); + used_bytes -= cp->second->used_bytes(); + coll_map.erase(cp); + } + + return 0; +} + +namespace { +struct BufferlistObject : public MemStore::Object { + ceph::spinlock mutex; + bufferlist data; + + size_t get_size() const override { return data.length(); } + + int read(uint64_t offset, uint64_t len, bufferlist &bl) override; + int write(uint64_t offset, const bufferlist &bl) override; + int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) override; + int truncate(uint64_t offset) override; + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + encode(data, bl); + encode_base(bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) override { + DECODE_START(1, p); + decode(data, p); + decode_base(p); + DECODE_FINISH(p); + } +}; +} +// BufferlistObject +int BufferlistObject::read(uint64_t offset, uint64_t len, + bufferlist &bl) +{ + std::lock_guard<decltype(mutex)> lock(mutex); + bl.substr_of(data, offset, len); + return bl.length(); +} + +int BufferlistObject::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + + std::lock_guard<decltype(mutex)> lock(mutex); + + // before + bufferlist newdata; + if (get_size() >= offset) { + newdata.substr_of(data, 0, offset); + } else { + if (get_size()) { + 
newdata.substr_of(data, 0, get_size()); + } + newdata.append_zero(offset - get_size()); + } + + newdata.append(src); + + // after + if (get_size() > offset + len) { + bufferlist tail; + tail.substr_of(data, offset + len, get_size() - (offset + len)); + newdata.append(tail); + } + + data.claim(newdata); + return 0; +} + +int BufferlistObject::clone(Object *src, uint64_t srcoff, + uint64_t len, uint64_t dstoff) +{ + auto srcbl = dynamic_cast<BufferlistObject*>(src); + if (srcbl == nullptr) + return -ENOTSUP; + + bufferlist bl; + { + std::lock_guard<decltype(srcbl->mutex)> lock(srcbl->mutex); + if (srcoff == dstoff && len == src->get_size()) { + data = srcbl->data; + return 0; + } + bl.substr_of(srcbl->data, srcoff, len); + } + return write(dstoff, bl); +} + +int BufferlistObject::truncate(uint64_t size) +{ + std::lock_guard<decltype(mutex)> lock(mutex); + if (get_size() > size) { + bufferlist bl; + bl.substr_of(data, 0, size); + data.claim(bl); + } else if (get_size() == size) { + // do nothing + } else { + data.append_zero(size - get_size()); + } + return 0; +} + +// PageSetObject + +struct MemStore::PageSetObject : public Object { + PageSet data; + uint64_t data_len; +#if defined(__GLIBCXX__) + // use a thread-local vector for the pages returned by PageSet, so we + // can avoid allocations in read/write() + static thread_local PageSet::page_vector tls_pages; +#endif + + explicit PageSetObject(size_t page_size) : data(page_size), data_len(0) {} + + size_t get_size() const override { return data_len; } + + int read(uint64_t offset, uint64_t len, bufferlist &bl) override; + int write(uint64_t offset, const bufferlist &bl) override; + int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) override; + int truncate(uint64_t offset) override; + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + encode(data_len, bl); + data.encode(bl); + encode_base(bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) override { + DECODE_START(1, p); + decode(data_len, p); + data.decode(p); + decode_base(p); + DECODE_FINISH(p); + } +}; + +#if defined(__GLIBCXX__) +// use a thread-local vector for the pages returned by PageSet, so we +// can avoid allocations in read/write() +thread_local PageSet::page_vector MemStore::PageSetObject::tls_pages; +#define DEFINE_PAGE_VECTOR(name) +#else +#define DEFINE_PAGE_VECTOR(name) PageSet::page_vector name; +#endif + +int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, bufferlist& bl) +{ + const auto start = offset; + const auto end = offset + len; + auto remaining = len; + + DEFINE_PAGE_VECTOR(tls_pages); + data.get_range(offset, len, tls_pages); + + // allocate a buffer for the data + buffer::ptr buf(len); + + auto p = tls_pages.begin(); + while (remaining) { + // no more pages in range + if (p == tls_pages.end() || (*p)->offset >= end) { + buf.zero(offset - start, remaining); + break; + } + auto page = *p; + + // fill any holes between pages with zeroes + if (page->offset > offset) { + const auto count = std::min(remaining, page->offset - offset); + buf.zero(offset - start, count); + remaining -= count; + offset = page->offset; + if (!remaining) + break; + } + + // read from page + const auto page_offset = offset - page->offset; + const auto count = min(remaining, data.get_page_size() - page_offset); + + buf.copy_in(offset - start, count, page->data + page_offset); + + remaining -= count; + offset += count; + + ++p; + } + + tls_pages.clear(); // drop page refs + + bl.append(std::move(buf)); + return 
len; +} + +int MemStore::PageSetObject::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + + DEFINE_PAGE_VECTOR(tls_pages); + // make sure the page range is allocated + data.alloc_range(offset, src.length(), tls_pages); + + auto page = tls_pages.begin(); + + auto p = src.begin(); + while (len > 0) { + unsigned page_offset = offset - (*page)->offset; + unsigned pageoff = data.get_page_size() - page_offset; + unsigned count = min(len, pageoff); + p.copy(count, (*page)->data + page_offset); + offset += count; + len -= count; + if (count == pageoff) + ++page; + } + if (data_len < offset) + data_len = offset; + tls_pages.clear(); // drop page refs + return 0; +} + +int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff, + uint64_t len, uint64_t dstoff) +{ + const int64_t delta = dstoff - srcoff; + + auto &src_data = static_cast<PageSetObject*>(src)->data; + const uint64_t src_page_size = src_data.get_page_size(); + + auto &dst_data = data; + const auto dst_page_size = dst_data.get_page_size(); + + DEFINE_PAGE_VECTOR(tls_pages); + PageSet::page_vector dst_pages; + + while (len) { + // limit to 16 pages at a time so tls_pages doesn't balloon in size + auto count = std::min(len, (uint64_t)src_page_size * 16); + src_data.get_range(srcoff, count, tls_pages); + + // allocate the destination range + // TODO: avoid allocating pages for holes in the source range + dst_data.alloc_range(srcoff + delta, count, dst_pages); + auto dst_iter = dst_pages.begin(); + + for (auto &src_page : tls_pages) { + auto sbegin = std::max(srcoff, src_page->offset); + auto send = std::min(srcoff + count, src_page->offset + src_page_size); + + // zero-fill holes before src_page + if (srcoff < sbegin) { + while (dst_iter != dst_pages.end()) { + auto &dst_page = *dst_iter; + auto dbegin = std::max(srcoff + delta, dst_page->offset); + auto dend = std::min(sbegin + delta, dst_page->offset + dst_page_size); + std::fill(dst_page->data + dbegin - dst_page->offset, + dst_page->data + dend - dst_page->offset, 0); + if (dend < dst_page->offset + dst_page_size) + break; + ++dst_iter; + } + const auto c = sbegin - srcoff; + count -= c; + len -= c; + } + + // copy data from src page to dst pages + while (dst_iter != dst_pages.end()) { + auto &dst_page = *dst_iter; + auto dbegin = std::max(sbegin + delta, dst_page->offset); + auto dend = std::min(send + delta, dst_page->offset + dst_page_size); + + std::copy(src_page->data + (dbegin - delta) - src_page->offset, + src_page->data + (dend - delta) - src_page->offset, + dst_page->data + dbegin - dst_page->offset); + if (dend < dst_page->offset + dst_page_size) + break; + ++dst_iter; + } + + const auto c = send - sbegin; + count -= c; + len -= c; + srcoff = send; + dstoff = send + delta; + } + tls_pages.clear(); // drop page refs + + // zero-fill holes after the last src_page + if (count > 0) { + while (dst_iter != dst_pages.end()) { + auto &dst_page = *dst_iter; + auto dbegin = std::max(dstoff, dst_page->offset); + auto dend = std::min(dstoff + count, dst_page->offset + dst_page_size); + std::fill(dst_page->data + dbegin - dst_page->offset, + dst_page->data + dend - dst_page->offset, 0); + ++dst_iter; + } + srcoff += count; + dstoff += count; + len -= count; + } + dst_pages.clear(); // drop page refs + } + + // update object size + if (data_len < dstoff) + data_len = dstoff; + return 0; +} + +int MemStore::PageSetObject::truncate(uint64_t size) +{ + data.free_pages_after(size); + data_len = size; + + const auto page_size = data.get_page_size(); + const 
auto page_offset = size & ~(page_size-1); + if (page_offset == size) + return 0; + + DEFINE_PAGE_VECTOR(tls_pages); + // write zeroes to the rest of the last page + data.get_range(page_offset, page_size, tls_pages); + if (tls_pages.empty()) + return 0; + + auto page = tls_pages.begin(); + auto data = (*page)->data; + std::fill(data + (size - page_offset), data + page_size, 0); + tls_pages.clear(); // drop page ref + return 0; +} + + +MemStore::ObjectRef MemStore::Collection::create_object() const { + if (use_page_set) + return new PageSetObject(cct->_conf->memstore_page_size); + return new BufferlistObject(); +} diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h new file mode 100644 index 00000000..3d361631 --- /dev/null +++ b/src/os/memstore/MemStore.h @@ -0,0 +1,414 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013- Sage Weil <sage@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_MEMSTORE_H +#define CEPH_MEMSTORE_H + +#include <mutex> +#include <boost/intrusive_ptr.hpp> + +#include "include/unordered_map.h" +#include "common/Finisher.h" +#include "common/RefCountedObj.h" +#include "common/RWLock.h" +#include "os/ObjectStore.h" +#include "PageSet.h" +#include "include/ceph_assert.h" + +class MemStore : public ObjectStore { +public: + struct Object : public RefCountedObject { + ceph::mutex xattr_mutex{ceph::make_mutex("MemStore::Object::xattr_mutex")}; + ceph::mutex omap_mutex{ceph::make_mutex("MemStore::Object::omap_mutex")}; + map<string,bufferptr> xattr; + bufferlist omap_header; + map<string,bufferlist> omap; + + typedef boost::intrusive_ptr<Object> Ref; + friend void intrusive_ptr_add_ref(Object *o) { o->get(); } + friend void intrusive_ptr_release(Object *o) { o->put(); } + + Object() : RefCountedObject(nullptr, 0) {} + // interface for object data + virtual size_t get_size() const = 0; + virtual int read(uint64_t offset, uint64_t len, bufferlist &bl) = 0; + virtual int write(uint64_t offset, const bufferlist &bl) = 0; + virtual int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) = 0; + virtual int truncate(uint64_t offset) = 0; + virtual void encode(bufferlist& bl) const = 0; + virtual void decode(bufferlist::const_iterator& p) = 0; + + void encode_base(bufferlist& bl) const { + using ceph::encode; + encode(xattr, bl); + encode(omap_header, bl); + encode(omap, bl); + } + void decode_base(bufferlist::const_iterator& p) { + using ceph::decode; + decode(xattr, p); + decode(omap_header, p); + decode(omap, p); + } + + void dump(Formatter *f) const { + f->dump_int("data_len", get_size()); + f->dump_int("omap_header_len", omap_header.length()); + + f->open_array_section("xattrs"); + for (map<string,bufferptr>::const_iterator p = xattr.begin(); + p != xattr.end(); + ++p) { + f->open_object_section("xattr"); + f->dump_string("name", p->first); + f->dump_int("length", p->second.length()); + f->close_section(); + } + f->close_section(); + + f->open_array_section("omap"); + for (map<string,bufferlist>::const_iterator p = omap.begin(); + p != omap.end(); + ++p) { + f->open_object_section("pair"); + f->dump_string("key", p->first); + f->dump_int("length", p->second.length()); + f->close_section(); + } + f->close_section(); + } 
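
Editor's note: the friend intrusive_ptr_add_ref/intrusive_ptr_release hooks on Object and Collection here (and on the KStore types earlier) let boost::intrusive_ptr drive a reference count embedded in the object itself, rather than a separate control block. A minimal standalone sketch of the pattern, using an illustrative Node type:

  #include <atomic>
  #include <boost/intrusive_ptr.hpp>

  struct Node {
    std::atomic<int> nref{0};
    // found via ADL by boost::intrusive_ptr
    friend void intrusive_ptr_add_ref(Node* n) { ++n->nref; }
    friend void intrusive_ptr_release(Node* n) { if (--n->nref == 0) delete n; }
  };
  using NodeRef = boost::intrusive_ptr<Node>;

  int main() {
    NodeRef a(new Node);   // nref == 1
    NodeRef b = a;         // nref == 2
    return 0;              // both refs drop; Node deleted exactly once
  }
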
+ }; + typedef Object::Ref ObjectRef; + + struct PageSetObject; + struct Collection : public CollectionImpl { + int bits = 0; + CephContext *cct; + bool use_page_set; + ceph::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup + map<ghobject_t, ObjectRef> object_map; ///< for iteration + map<string,bufferptr> xattr; + /// for object_{map,hash} + ceph::shared_mutex lock{ + ceph::make_shared_mutex("MemStore::Collection::lock", true, false)}; + + bool exists = true; + ceph::mutex sequencer_mutex{ + ceph::make_mutex("MemStore::Collection::sequencer_mutex")}; + + typedef boost::intrusive_ptr<Collection> Ref; + friend void intrusive_ptr_add_ref(Collection *c) { c->get(); } + friend void intrusive_ptr_release(Collection *c) { c->put(); } + + ObjectRef create_object() const; + + // NOTE: The lock only needs to protect the object_map/hash, not the + // contents of individual objects. The osd is already sequencing + // reads and writes, so we will never see them concurrently at this + // level. + + ObjectRef get_object(ghobject_t oid) { + std::shared_lock l{lock}; + auto o = object_hash.find(oid); + if (o == object_hash.end()) + return ObjectRef(); + return o->second; + } + + ObjectRef get_or_create_object(ghobject_t oid) { + std::lock_guard l{lock}; + auto result = object_hash.emplace(oid, ObjectRef()); + if (result.second) + object_map[oid] = result.first->second = create_object(); + return result.first->second; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(xattr, bl); + encode(use_page_set, bl); + uint32_t s = object_map.size(); + encode(s, bl); + for (map<ghobject_t, ObjectRef>::const_iterator p = object_map.begin(); + p != object_map.end(); + ++p) { + encode(p->first, bl); + p->second->encode(bl); + } + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(xattr, p); + decode(use_page_set, p); + uint32_t s; + decode(s, p); + while (s--) { + ghobject_t k; + decode(k, p); + auto o = create_object(); + o->decode(p); + object_map.insert(make_pair(k, o)); + object_hash.insert(make_pair(k, o)); + } + DECODE_FINISH(p); + } + + uint64_t used_bytes() const { + uint64_t result = 0; + for (map<ghobject_t, ObjectRef>::const_iterator p = object_map.begin(); + p != object_map.end(); + ++p) { + result += p->second->get_size(); + } + + return result; + } + + void flush() override { + } + bool flush_commit(Context *c) override { + return true; + } + + explicit Collection(CephContext *cct, coll_t c) + : CollectionImpl(c), + cct(cct), + use_page_set(cct->_conf->memstore_page_set) {} + }; + typedef Collection::Ref CollectionRef; + +private: + class OmapIteratorImpl; + + + ceph::unordered_map<coll_t, CollectionRef> coll_map; + /// rwlock to protect coll_map + ceph::shared_mutex coll_lock{ + ceph::make_shared_mutex("MemStore::coll_lock")}; + map<coll_t,CollectionRef> new_coll_map; + + CollectionRef get_collection(const coll_t& cid); + + Finisher finisher; + + uint64_t used_bytes; + + void _do_transaction(Transaction& t); + + int _touch(const coll_t& cid, const ghobject_t& oid); + int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags = 0); + int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len); + int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); + int _remove(const coll_t& cid, const ghobject_t& oid); + int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset); + int 
_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name); + int _rmattrs(const coll_t& cid, const ghobject_t& oid); + int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid); + int _clone_range(const coll_t& cid, const ghobject_t& oldoid, + const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff); + int _omap_clear(const coll_t& cid, const ghobject_t &oid); + int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, bufferlist& aset_bl); + int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, bufferlist& keys_bl); + int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid, + const string& first, const string& last); + int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl); + + int _collection_hint_expected_num_objs(const coll_t& cid, uint32_t pg_num, + uint64_t num_objs) const { return 0; } + int _create_collection(const coll_t& c, int bits); + int _destroy_collection(const coll_t& c); + int _collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid); + int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t cid, const ghobject_t& o); + int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest); + int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest); + + int _save(); + int _load(); + + void dump(Formatter *f); + void dump_all(); + +public: + MemStore(CephContext *cct, const string& path) + : ObjectStore(cct, path), + finisher(cct), + used_bytes(0) {} + ~MemStore() override { } + + string get_type() override { + return "memstore"; + } + + bool test_mount_in_use() override { + return false; + } + + int mount() override; + int umount() override; + + int fsck(bool deep) override { + return 0; + } + + int validate_hobject_key(const hobject_t &obj) const override { + return 0; + } + unsigned get_max_attr_name_length() override { + return 256; // arbitrary; there is no real limit internally + } + + int mkfs() override; + int mkjournal() override { + return 0; + } + bool wants_journal() override { + return false; + } + bool allows_journal() override { + return false; + } + bool needs_journal() override { + return false; + } + + int get_devices(set<string> *ls) override { + // no devices for us! 
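
Editor's note: Collection::get_or_create_object(), shown a little earlier, uses a single emplace() to either find an existing object or claim a fresh slot, and fills the slot only when the insert actually happened. A standalone sketch of that get-or-create idiom with plain std types (Object and ObjectRef here are stand-ins, not the MemStore classes):

  #include <map>
  #include <memory>
  #include <string>

  struct Object { std::string data; };
  using ObjectRef = std::shared_ptr<Object>;

  // one lookup: emplace() returns {iterator, inserted}; allocate the payload
  // only when the key was freshly inserted
  ObjectRef get_or_create(std::map<std::string, ObjectRef>& objects,
                          const std::string& oid) {
    auto result = objects.emplace(oid, ObjectRef());
    if (result.second)
      result.first->second = std::make_shared<Object>();
    return result.first->second;
  }

  int main() {
    std::map<std::string, ObjectRef> objects;
    ObjectRef a = get_or_create(objects, "obj1");
    ObjectRef b = get_or_create(objects, "obj1");   // same object second time
    return a == b ? 0 : 1;
  }
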
+ return 0; + } + + int statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts = nullptr) override; + int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override; + + bool exists(CollectionHandle &c, const ghobject_t& oid) override; + int stat(CollectionHandle &c, const ghobject_t& oid, + struct stat *st, bool allow_eio = false) override; + int set_collection_opts( + CollectionHandle& c, + const pool_opts_t& opts) override; + int read( + CollectionHandle &c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0) override; + using ObjectStore::fiemap; + int fiemap(CollectionHandle& c, const ghobject_t& oid, + uint64_t offset, size_t len, bufferlist& bl) override; + int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, + size_t len, map<uint64_t, uint64_t>& destmap) override; + int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name, + bufferptr& value) override; + int getattrs(CollectionHandle &c, const ghobject_t& oid, + map<string,bufferptr>& aset) override; + + int list_collections(vector<coll_t>& ls) override; + + CollectionHandle open_collection(const coll_t& c) override { + return get_collection(c); + } + CollectionHandle create_new_collection(const coll_t& c) override; + + void set_collection_commit_queue(const coll_t& cid, + ContextQueue *commit_queue) override { + } + + bool collection_exists(const coll_t& c) override; + int collection_empty(CollectionHandle& c, bool *empty) override; + int collection_bits(CollectionHandle& c) override; + int collection_list(CollectionHandle& cid, + const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *next) override; + + using ObjectStore::omap_get; + int omap_get( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) override; + + using ObjectStore::omap_get_header; + /// Get omap header + int omap_get_header( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio = false ///< [in] don't assert on eio + ) override; + + using ObjectStore::omap_get_keys; + /// Get keys defined on oid + int omap_get_keys( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) override; + + using ObjectStore::omap_get_values; + /// Get key values + int omap_get_values( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) override; + + using ObjectStore::omap_check_keys; + /// Filters keys into out which are defined on oid + int omap_check_keys( + CollectionHandle& c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) override; + + using ObjectStore::get_omap_iterator; + ObjectMap::ObjectMapIterator get_omap_iterator( + CollectionHandle& c, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) override; + + void set_fsid(uuid_d u) override; + uuid_d get_fsid() override; + + 
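
Editor's note: the omap accessors declared here are implemented earlier in MemStore.cc as plain filters over the object's in-memory map. A standalone sketch of the omap_get_values()/omap_check_keys() semantics, using std::string values in place of bufferlist (OmapData and the helper names are illustrative):

  #include <map>
  #include <set>
  #include <string>

  using OmapData = std::map<std::string, std::string>;

  // copy back only the requested keys that exist on the object
  OmapData get_values(const OmapData& omap, const std::set<std::string>& keys) {
    OmapData out;
    for (const auto& k : keys) {
      auto it = omap.find(k);
      if (it != omap.end())
        out.insert(*it);
    }
    return out;
  }

  // report which of the requested keys are present
  std::set<std::string> check_keys(const OmapData& omap,
                                   const std::set<std::string>& keys) {
    std::set<std::string> present;
    for (const auto& k : keys)
      if (omap.count(k))
        present.insert(k);
    return present;
  }

  int main() {
    OmapData omap{{"a", "1"}, {"b", "2"}};
    auto vals = get_values(omap, {"a", "c"});       // only "a" comes back
    auto present = check_keys(omap, {"a", "c"});    // {"a"}
    return (vals.size() == 1 && present.count("a")) ? 0 : 1;
  }
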
uint64_t estimate_objects_overhead(uint64_t num_objects) override { + return 0; //do not care + } + + objectstore_perf_stat_t get_cur_stats() override; + + const PerfCounters* get_perf_counters() const override { + return nullptr; + } + + + int queue_transactions( + CollectionHandle& ch, + vector<Transaction>& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL) override; +}; + + + + +#endif diff --git a/src/os/memstore/PageSet.h b/src/os/memstore/PageSet.h new file mode 100644 index 00000000..8e243281 --- /dev/null +++ b/src/os/memstore/PageSet.h @@ -0,0 +1,232 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013- Sage Weil <sage@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_PAGESET_H +#define CEPH_PAGESET_H + +#include <algorithm> +#include <atomic> +#include <cassert> +#include <mutex> +#include <vector> +#include <boost/intrusive/avl_set.hpp> +#include <boost/intrusive_ptr.hpp> + +#include "include/encoding.h" + +struct Page { + char *const data; + boost::intrusive::avl_set_member_hook<> hook; + uint64_t offset; + + // avoid RefCountedObject because it has a virtual destructor + std::atomic<uint16_t> nrefs; + void get() { ++nrefs; } + void put() { if (--nrefs == 0) delete this; } + + typedef boost::intrusive_ptr<Page> Ref; + friend void intrusive_ptr_add_ref(Page *p) { p->get(); } + friend void intrusive_ptr_release(Page *p) { p->put(); } + + // key-value comparison functor for avl + struct Less { + bool operator()(uint64_t offset, const Page &page) const { + return offset < page.offset; + } + bool operator()(const Page &page, uint64_t offset) const { + return page.offset < offset; + } + bool operator()(const Page &lhs, const Page &rhs) const { + return lhs.offset < rhs.offset; + } + }; + void encode(bufferlist &bl, size_t page_size) const { + using ceph::encode; + bl.append(buffer::copy(data, page_size)); + encode(offset, bl); + } + void decode(bufferlist::const_iterator &p, size_t page_size) { + using ceph::decode; + p.copy(page_size, data); + decode(offset, p); + } + + static Ref create(size_t page_size, uint64_t offset = 0) { + // ensure proper alignment of the Page + const auto align = alignof(Page); + page_size = (page_size + align - 1) & ~(align - 1); + // allocate the Page and its data in a single buffer + auto buffer = new char[page_size + sizeof(Page)]; + // place the Page structure at the end of the buffer + return new (buffer + page_size) Page(buffer, offset); + } + + // copy disabled + Page(const Page&) = delete; + const Page& operator=(const Page&) = delete; + + private: // private constructor, use create() instead + Page(char *data, uint64_t offset) : data(data), offset(offset), nrefs(1) {} + + static void operator delete(void *p) { + delete[] reinterpret_cast<Page*>(p)->data; + } +}; + +class PageSet { + public: + // alloc_range() and get_range() return page refs in a vector + typedef std::vector<Page::Ref> page_vector; + + private: + // store pages in a boost intrusive avl_set + typedef Page::Less page_cmp; + typedef boost::intrusive::member_hook<Page, + boost::intrusive::avl_set_member_hook<>, + &Page::hook> member_option; + typedef boost::intrusive::avl_set<Page, + boost::intrusive::compare<page_cmp>, 
member_option> page_set; + + typedef typename page_set::iterator iterator; + + page_set pages; + uint64_t page_size; + + typedef std::mutex lock_type; + lock_type mutex; + + void free_pages(iterator cur, iterator end) { + while (cur != end) { + Page *page = &*cur; + cur = pages.erase(cur); + page->put(); + } + } + + int count_pages(uint64_t offset, uint64_t len) const { + // count the overlapping pages + int count = 0; + if (offset % page_size) { + count++; + size_t rem = page_size - offset % page_size; + len = len <= rem ? 0 : len - rem; + } + count += len / page_size; + if (len % page_size) + count++; + return count; + } + + public: + explicit PageSet(size_t page_size) : page_size(page_size) {} + PageSet(PageSet &&rhs) + : pages(std::move(rhs.pages)), page_size(rhs.page_size) {} + ~PageSet() { + free_pages(pages.begin(), pages.end()); + } + + // disable copy + PageSet(const PageSet&) = delete; + const PageSet& operator=(const PageSet&) = delete; + + bool empty() const { return pages.empty(); } + size_t size() const { return pages.size(); } + size_t get_page_size() const { return page_size; } + + // allocate all pages that intersect the range [offset,length) + void alloc_range(uint64_t offset, uint64_t length, page_vector &range) { + // loop in reverse so we can provide hints to avl_set::insert_check() + // and get O(1) insertions after the first + uint64_t position = offset + length - 1; + + range.resize(count_pages(offset, length)); + auto out = range.rbegin(); + + std::lock_guard<lock_type> lock(mutex); + iterator cur = pages.end(); + while (length) { + const uint64_t page_offset = position & ~(page_size-1); + + typename page_set::insert_commit_data commit; + auto insert = pages.insert_check(cur, page_offset, page_cmp(), commit); + if (insert.second) { + auto page = Page::create(page_size, page_offset); + cur = pages.insert_commit(*page, commit); + + // assume that the caller will write to the range [offset,length), + // so we only need to zero memory outside of this range + + // zero end of page past offset + length + if (offset + length < page->offset + page_size) + std::fill(page->data + offset + length - page->offset, + page->data + page_size, 0); + // zero front of page between page_offset and offset + if (offset > page->offset) + std::fill(page->data, page->data + offset - page->offset, 0); + } else { // exists + cur = insert.first; + } + // add a reference to output vector + out->reset(&*cur); + ++out; + + auto c = std::min(length, (position & (page_size-1)) + 1); + position -= c; + length -= c; + } + // make sure we sized the vector correctly + ceph_assert(out == range.rend()); + } + + // return all allocated pages that intersect the range [offset,length) + void get_range(uint64_t offset, uint64_t length, page_vector &range) { + auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp()); + while (cur != pages.end() && cur->offset < offset + length) + range.push_back(&*cur++); + } + + void free_pages_after(uint64_t offset) { + std::lock_guard<lock_type> lock(mutex); + auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp()); + if (cur == pages.end()) + return; + if (cur->offset < offset) + cur++; + free_pages(cur, pages.end()); + } + + void encode(bufferlist &bl) const { + using ceph::encode; + encode(page_size, bl); + unsigned count = pages.size(); + encode(count, bl); + for (auto p = pages.rbegin(); p != pages.rend(); ++p) + p->encode(bl, page_size); + } + void decode(bufferlist::const_iterator &p) { + using ceph::decode; + ceph_assert(empty()); + 
decode(page_size, p);
+    unsigned count;
+    decode(count, p);
+    auto cur = pages.end();
+    for (unsigned i = 0; i < count; i++) {
+      auto page = Page::create(page_size);
+      page->decode(p, page_size);
+      cur = pages.insert_before(cur, *page);
+    }
+  }
+};
+
+#endif // CEPH_PAGESET_H
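
Editor's note on the PageSet added above: it is a sparse, page-granular in-memory store. Each Page is carved out of a single new[] allocation (the Page header is placement-constructed at the end of its own data buffer), pages live in a boost::intrusive avl_set keyed by page-aligned offset, and alloc_range() zero-fills only the bytes of a newly created page that fall outside the requested range, on the assumption that the caller writes the rest. The sketch below is not part of the patch; it shows one plausible way to drive alloc_range()/get_range() for a sparse write followed by a read-back. The helper names write_to_pageset and read_from_pageset are hypothetical, and compiling it assumes the surrounding Ceph tree (PageSet.h pulls in include/encoding.h and the bufferlist types), with the include path guessed from this patch's layout.

// Sketch only (not part of this patch): exercising PageSet from
// src/os/memstore/PageSet.h. Helper names are hypothetical.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>

#include "os/memstore/PageSet.h"  // path assumed from the patch layout

// Copy len bytes from src into the sparse page store at offset.
// alloc_range() creates any missing pages and zero-fills only the bytes
// outside [offset, offset+len); the caller is expected to fill the rest.
static void write_to_pageset(PageSet &pages, uint64_t offset,
                             const char *src, uint64_t len)
{
  PageSet::page_vector range;
  pages.alloc_range(offset, len, range);

  const uint64_t page_size = pages.get_page_size();
  uint64_t pos = offset;
  uint64_t remaining = len;
  for (auto &page : range) {
    const uint64_t page_off = pos - page->offset;  // start within this page
    const uint64_t count = std::min(remaining, page_size - page_off);
    std::memcpy(page->data + page_off, src + (pos - offset), count);
    pos += count;
    remaining -= count;
  }
}

// Read len bytes starting at offset; pages that were never allocated are
// holes and read back as zeros, so the output is zero-filled up front.
static std::string read_from_pageset(PageSet &pages, uint64_t offset,
                                     uint64_t len)
{
  std::string out(len, '\0');
  PageSet::page_vector range;
  pages.get_range(offset, len, range);  // returns only pages that exist

  const uint64_t page_size = pages.get_page_size();
  for (auto &page : range) {
    // clamp the copy to the intersection of this page and [offset, offset+len)
    const uint64_t begin = std::max(offset, page->offset);
    const uint64_t end = std::min(offset + len, page->offset + page_size);
    std::memcpy(&out[begin - offset], page->data + (begin - page->offset),
                end - begin);
  }
  return out;
}

int main()
{
  // the offset masks in PageSet assume a power-of-two page size
  PageSet pages(64);
  const std::string msg = "hello, pageset";
  write_to_pageset(pages, 100, msg.data(), msg.size());
  return read_from_pageset(pages, 100, msg.size()) == msg ? 0 : 1;
}

Because unallocated pages never appear in get_range(), the reader zero-fills its output buffer before copying; this is presumably the same convention MemStore's page-backed object implementation relies on when reading sparse object data.
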