Diffstat (limited to 'src/os/bluestore/BlueStore.cc')
-rw-r--r-- | src/os/bluestore/BlueStore.cc | 19631
1 files changed, 19631 insertions, 0 deletions
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
new file mode 100644
index 000000000..aa14d0204
--- /dev/null
+++ b/src/os/bluestore/BlueStore.cc
@@ -0,0 +1,19631 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <bit>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <algorithm>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "BlueStore.h"
+#include "bluestore_common.h"
+#include "simple_bitmap.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/intarith.h"
+#include "include/stringify.h"
+#include "include/str_map.h"
+#include "include/util.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/PriorityCache.h"
+#include "common/url_escape.h"
+#include "Allocator.h"
+#include "FreelistManager.h"
+#include "BlueFS.h"
+#include "BlueRocksEnv.h"
+#include "auth/Crypto.h"
+#include "common/EventTrace.h"
+#include "perfglue/heap_profiler.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+#include "common/pretty_binary.h"
+#include "kv/KeyValueHistogram.h"
+
+#ifdef HAVE_LIBZBD
+#include "ZonedAllocator.h"
+#include "ZonedFreelistManager.h"
+#endif
+
+#if defined(WITH_LTTNG)
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/bluestore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore + +using bid_t = decltype(BlueStore::Blob::id); + +// bluestore_cache_onode +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode, + bluestore_cache_onode); + +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer, + bluestore_cache_buffer); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent, + bluestore_extent); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob, + bluestore_blob); +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob, + bluestore_shared_blob); + +// bluestore_txc +MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext, + bluestore_txc); +using std::byte; +using std::deque; +using std::min; +using std::make_pair; +using std::numeric_limits; +using std::pair; +using std::less; +using std::list; +using std::make_unique; +using std::map; +using std::max; +using std::ostream; +using std::ostringstream; +using std::set; +using std::string; +using std::stringstream; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::bufferptr; +using ceph::coarse_mono_clock; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_timespan; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; + +// kv store prefixes +const string PREFIX_SUPER = "S"; // field -> value +const string PREFIX_STAT = "T"; // field -> value(int64 array) +const string PREFIX_COLL = "C"; // collection name -> cnode_t +const string PREFIX_OBJ = "O"; // object name -> onode_t +const string PREFIX_OMAP = "M"; // u64 + keyname -> value +const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll) +const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value +const string PREFIX_PERPG_OMAP = "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value +const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t +const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist) +const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager) +const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t + +#ifdef HAVE_LIBZBD +const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager) +const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager) +const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata) +#endif + +const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs"; + +// write a label in the first block. always use this size. note that +// bluefs makes a matching assumption about the location of its +// superblock (always the second block of the device). +#define BDEV_LABEL_BLOCK_SIZE 4096 + +// reserve: label (4k) + bluefs super (4k), which means we start at 8k. +#define SUPER_RESERVED 8192 + +#define OBJECT_MAX_SIZE 0xffffffff // 32 bits + + +/* + * extent map blob encoding + * + * we use the low bits of the blobid field to indicate some common scenarios + * and spanning vs local ids. See ExtentMap::{encode,decode}_some(). 
+ */ +#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous +#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0 +#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent +#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id +#define BLOBID_SHIFT_BITS 4 + +/* + * object name key structure + * + * encoded u8: shard + 2^7 (so that it sorts properly) + * encoded u64: poolid + 2^63 (so that it sorts properly) + * encoded u32: hash (bit reversed) + * + * escaped string: namespace + * + * escaped string: key or object name + * 1 char: '<', '=', or '>'. if =, then object key == object name, and + * we are done. otherwise, we are followed by the object name. + * escaped string: object name (unless '=' above) + * + * encoded u64: snap + * encoded u64: generation + * 'o' + */ +#define ONODE_KEY_SUFFIX 'o' + +/* + * extent shard key + * + * object prefix key + * u32 + * 'x' + */ +#define EXTENT_SHARD_KEY_SUFFIX 'x' + +/* + * string encoding in the key + * + * The key string needs to lexicographically sort the same way that + * ghobject_t does. We do this by escaping anything <= to '#' with # + * plus a 2 digit hex string, and anything >= '~' with ~ plus the two + * hex digits. + * + * We use ! as a terminator for strings; this works because it is < # + * and will get escaped if it is present in the string. + * + * NOTE: There is a bug in this implementation: due to implicit + * character type conversion in comparison it may produce unexpected + * ordering. Unfortunately fixing the bug would mean invalidating the + * keys in existing deployments. Instead we do additional sorting + * where it is needed. + */ +template<typename S> +static void append_escaped(const string &in, S *out) +{ + char hexbyte[in.length() * 3 + 1]; + char* ptr = &hexbyte[0]; + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i <= '#') { // bug: unexpected result for *i > 0x7f + *ptr++ = '#'; + *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f]; + *ptr++ = "0123456789abcdef"[*i & 0x0f]; + } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f + *ptr++ = '~'; + *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f]; + *ptr++ = "0123456789abcdef"[*i & 0x0f]; + } else { + *ptr++ = *i; + } + } + *ptr++ = '!'; + out->append(hexbyte, ptr - &hexbyte[0]); +} + +inline unsigned h2i(char c) +{ + if ((c >= '0') && (c <= '9')) { + return c - 0x30; + } else if ((c >= 'a') && (c <= 'f')) { + return c - 'a' + 10; + } else if ((c >= 'A') && (c <= 'F')) { + return c - 'A' + 10; + } else { + return 256; // make it always larger than 255 + } +} + +static int decode_escaped(const char *p, string *out) +{ + char buff[256]; + char* ptr = &buff[0]; + char* max = &buff[252]; + const char *orig_p = p; + while (*p && *p != '!') { + if (*p == '#' || *p == '~') { + unsigned hex = 0; + p++; + hex = h2i(*p++) << 4; + if (hex > 255) { + return -EINVAL; + } + hex |= h2i(*p++); + if (hex > 255) { + return -EINVAL; + } + *ptr++ = hex; + } else { + *ptr++ = *p++; + } + if (ptr > max) { + out->append(buff, ptr-buff); + ptr = &buff[0]; + } + } + if (ptr != buff) { + out->append(buff, ptr-buff); + } + return p - orig_p; +} + +template<typename T> +static void _key_encode_shard(shard_id_t shard, T *key) +{ + key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80)); +} + +static const char *_key_decode_shard(const char *key, shard_id_t *pshard) +{ + pshard->id = (uint8_t)*key - (uint8_t)0x80; + return key + 1; +} + +static void get_coll_range(const coll_t& cid, int bits, + ghobject_t *temp_start, ghobject_t 
*temp_end, + ghobject_t *start, ghobject_t *end, bool legacy) +{ + spg_t pgid; + constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max(); + // use different nspaces due to we use different schemes when encoding + // keys for listing objects + const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff"; + if (cid.is_pg(&pgid)) { + start->shard_id = pgid.shard; + *temp_start = *start; + + start->hobj.pool = pgid.pool(); + temp_start->hobj.pool = -2ll - pgid.pool(); + + *end = *start; + *temp_end = *temp_start; + + uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps()); + start->hobj.set_bitwise_key_u32(reverse_hash); + temp_start->hobj.set_bitwise_key_u32(reverse_hash); + + uint64_t end_hash = reverse_hash + (1ull << (32 - bits)); + if (end_hash > MAX_HASH) { + // make sure end hobj is even greater than the maximum possible hobj + end->hobj.set_bitwise_key_u32(MAX_HASH); + temp_end->hobj.set_bitwise_key_u32(MAX_HASH); + end->hobj.nspace = MAX_NSPACE; + } else { + end->hobj.set_bitwise_key_u32(end_hash); + temp_end->hobj.set_bitwise_key_u32(end_hash); + } + } else { + start->shard_id = shard_id_t::NO_SHARD; + start->hobj.pool = -1ull; + + *end = *start; + start->hobj.set_bitwise_key_u32(0); + end->hobj.set_bitwise_key_u32(MAX_HASH); + end->hobj.nspace = MAX_NSPACE; + // no separate temp section + *temp_start = *end; + *temp_end = *end; + } + + start->generation = 0; + end->generation = 0; + temp_start->generation = 0; + temp_end->generation = 0; +} + +static void get_shared_blob_key(uint64_t sbid, string *key) +{ + key->clear(); + _key_encode_u64(sbid, key); +} + +static int get_key_shared_blob(const string& key, uint64_t *sbid) +{ + const char *p = key.c_str(); + if (key.length() < sizeof(uint64_t)) + return -1; + _key_decode_u64(p, sbid); + return 0; +} + +template<typename S> +static void _key_encode_prefix(const ghobject_t& oid, S *key) +{ + _key_encode_shard(oid.shard_id, key); + _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); + _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); +} + +static const char *_key_decode_prefix(const char *p, ghobject_t *oid) +{ + p = _key_decode_shard(p, &oid->shard_id); + + uint64_t pool; + p = _key_decode_u64(p, &pool); + oid->hobj.pool = pool - 0x8000000000000000ull; + + unsigned hash; + p = _key_decode_u32(p, &hash); + + oid->hobj.set_bitwise_key_u32(hash); + + return p; +} + + +#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4) + +static int _get_key_object(const char *p, ghobject_t *oid) +{ + int r; + + p = _key_decode_prefix(p, oid); + + r = decode_escaped(p, &oid->hobj.nspace); + if (r < 0) + return -2; + p += r + 1; + + string k; + r = decode_escaped(p, &k); + if (r < 0) + return -3; + p += r + 1; + if (*p == '=') { + // no key + ++p; + oid->hobj.oid.name = k; + } else if (*p == '<' || *p == '>') { + // key + name + ++p; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -5; + p += r + 1; + oid->hobj.set_key(k); + } else { + // malformed + return -6; + } + + p = _key_decode_u64(p, &oid->hobj.snap.val); + p = _key_decode_u64(p, &oid->generation); + + if (*p != ONODE_KEY_SUFFIX) { + return -7; + } + p++; + if (*p) { + // if we get something other than a null terminator here, + // something goes wrong. 
+ return -8; + } + + return 0; +} + +template<typename S> +static int get_key_object(const S& key, ghobject_t *oid) +{ + if (key.length() < ENCODED_KEY_PREFIX_LEN) + return -1; + if (key.length() == ENCODED_KEY_PREFIX_LEN) + return -2; + const char *p = key.c_str(); + return _get_key_object(p, oid); +} + +template<typename S> +static void _get_object_key(const ghobject_t& oid, S *key) +{ + size_t max_len = ENCODED_KEY_PREFIX_LEN + + (oid.hobj.nspace.length() * 3 + 1) + + (oid.hobj.get_key().length() * 3 + 1) + + 1 + // for '<', '=', or '>' + (oid.hobj.oid.name.length() * 3 + 1) + + 8 + 8 + 1; + key->reserve(max_len); + + _key_encode_prefix(oid, key); + + append_escaped(oid.hobj.nspace, key); + + if (oid.hobj.get_key().length()) { + // is a key... could be < = or >. + append_escaped(oid.hobj.get_key(), key); + // (ASCII chars < = and > sort in that order, yay) + int r = oid.hobj.get_key().compare(oid.hobj.oid.name); + if (r) { + key->append(r > 0 ? ">" : "<"); + append_escaped(oid.hobj.oid.name, key); + } else { + // same as no key + key->append("="); + } + } else { + // no key + append_escaped(oid.hobj.oid.name, key); + key->append("="); + } + + _key_encode_u64(oid.hobj.snap, key); + _key_encode_u64(oid.generation, key); + + key->push_back(ONODE_KEY_SUFFIX); +} + +template<typename S> +static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key) +{ + key->clear(); + _get_object_key(oid, key); + + // sanity check + if (true) { + ghobject_t t; + int r = get_key_object(*key, &t); + if (r || t != oid) { + derr << " r " << r << dendl; + derr << "key " << pretty_binary_string(*key) << dendl; + derr << "oid " << oid << dendl; + derr << " t " << t << dendl; + ceph_assert(r == 0 && t == oid); + } + } +} + +// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing +// char lets us quickly test whether it is a shard key without decoding any +// of the prefix bytes. 
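A minimal usage sketch of the helpers that follow (illustrative only; "onode_key" stands for an already-encoded onode key and the 0x30000 offset is made up):

  std::string shard_key;
  get_extent_shard_key(onode_key, 0x30000, &shard_key);    // onode_key + encoded u32 + 'x'
  ceph_assert(is_extent_shard_key(shard_key));              // looks only at the trailing byte
  std::string decoded_key;
  uint32_t offset = 0;
  get_key_extent_shard(shard_key, &decoded_key, &offset);   // decoded_key == onode_key, offset == 0x30000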
+template<typename S> +static void get_extent_shard_key(const S& onode_key, uint32_t offset, + string *key) +{ + key->clear(); + key->reserve(onode_key.length() + 4 + 1); + key->append(onode_key.c_str(), onode_key.size()); + _key_encode_u32(offset, key); + key->push_back(EXTENT_SHARD_KEY_SUFFIX); +} + +static void rewrite_extent_shard_key(uint32_t offset, string *key) +{ + ceph_assert(key->size() > sizeof(uint32_t) + 1); + ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX); + _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key); +} + +template<typename S> +static void generate_extent_shard_key_and_apply( + const S& onode_key, + uint32_t offset, + string *key, + std::function<void(const string& final_key)> apply) +{ + if (key->empty()) { // make full key + ceph_assert(!onode_key.empty()); + get_extent_shard_key(onode_key, offset, key); + } else { + rewrite_extent_shard_key(offset, key); + } + apply(*key); +} + +int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset) +{ + ceph_assert(key.size() > sizeof(uint32_t) + 1); + ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX); + int okey_len = key.size() - sizeof(uint32_t) - 1; + *onode_key = key.substr(0, okey_len); + const char *p = key.data() + okey_len; + _key_decode_u32(p, offset); + return 0; +} + +static bool is_extent_shard_key(const string& key) +{ + return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX; +} + +static void get_deferred_key(uint64_t seq, string *out) +{ + _key_encode_u64(seq, out); +} + +static void get_pool_stat_key(int64_t pool_id, string *key) +{ + key->clear(); + _key_encode_u64(pool_id, key); +} + +static int get_key_pool_stat(const string& key, uint64_t* pool_id) +{ + const char *p = key.c_str(); + if (key.length() < sizeof(uint64_t)) + return -1; + _key_decode_u64(p, pool_id); + return 0; +} + +#ifdef HAVE_LIBZBD +static void get_zone_offset_object_key( + uint32_t zone, + uint64_t offset, + ghobject_t oid, + std::string *key) +{ + key->clear(); + _key_encode_u32(zone, key); + _key_encode_u64(offset, key); + _get_object_key(oid, key); +} + +static int get_key_zone_offset_object( + const string& key, + uint32_t *zone, + uint64_t *offset, + ghobject_t *oid) +{ + const char *p = key.c_str(); + if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1) + return -1; + p = _key_decode_u32(p, zone); + p = _key_decode_u64(p, offset); + int r = _get_key_object(p, oid); + if (r < 0) { + return r; + } + return 0; +} +#endif + +template <int LogLevelV> +void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em) +{ + uint64_t pos = 0; + for (auto& s : em.shards) { + dout(LogLevelV) << __func__ << " shard " << *s.shard_info + << (s.loaded ? " (loaded)" : "") + << (s.dirty ? 
" (dirty)" : "") + << dendl; + } + for (auto& e : em.extent_map) { + dout(LogLevelV) << __func__ << " " << e << dendl; + ceph_assert(e.logical_offset >= pos); + pos = e.logical_offset + e.length; + const bluestore_blob_t& blob = e.blob->get_blob(); + if (blob.has_csum()) { + vector<uint64_t> v; + unsigned n = blob.get_csum_count(); + for (unsigned i = 0; i < n; ++i) + v.push_back(blob.get_csum_item(i)); + dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec + << dendl; + } + std::lock_guard l(e.blob->shared_blob->get_cache()->lock); + for (auto& i : e.blob->shared_blob->bc.buffer_map) { + dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first + << "~" << i.second->length << std::dec + << " " << *i.second << dendl; + } + } +} + +template <int LogLevelV> +void _dump_onode(CephContext *cct, const BlueStore::Onode& o) +{ + if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>()) + return; + dout(LogLevelV) << __func__ << " " << &o << " " << o.oid + << " nid " << o.onode.nid + << " size 0x" << std::hex << o.onode.size + << " (" << std::dec << o.onode.size << ")" + << " expected_object_size " << o.onode.expected_object_size + << " expected_write_size " << o.onode.expected_write_size + << " in " << o.onode.extent_map_shards.size() << " shards" + << ", " << o.extent_map.spanning_blob_map.size() + << " spanning blobs" + << dendl; + for (auto& [zone, offset] : o.onode.zone_offset_refs) { + dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone + << " offset 0x" << offset << std::dec << dendl; + } + for (auto p = o.onode.attrs.begin(); + p != o.onode.attrs.end(); + ++p) { + dout(LogLevelV) << __func__ << " attr " << p->first + << " len " << p->second.length() << dendl; + } + _dump_extent_map<LogLevelV>(cct, o.extent_map); +} + +template <int LogLevelV> +void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t) +{ + dout(LogLevelV) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; +} + +// Buffer + +ostream& operator<<(ostream& out, const BlueStore::Buffer& b) +{ + out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex + << b.offset << "~" << b.length << std::dec + << " " << BlueStore::Buffer::get_state_name(b.state); + if (b.flags) + out << " " << BlueStore::Buffer::get_flag_name(b.flags); + return out << ")"; +} + +namespace { + +/* + * Due to a bug in key string encoding (see a comment for append_escaped) + * the KeyValueDB iterator does not lexicographically sort the same + * way that ghobject_t does: objects with the same hash may have wrong order. + * + * This is the iterator wrapper that fixes the keys order. 
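 * (One way this can show up: append_escaped compares signed chars, so bytes
 * above 0x7f get '#'-escaped and sort near the low end of the raw keyspace,
 * while ghobject_t compares the original names; two objects with the same
 * hash may therefore come back from the KV iterator in a different relative
 * order.)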
+ */ + +class CollectionListIterator { +public: + CollectionListIterator(const KeyValueDB::Iterator &it) + : m_it(it) { + } + virtual ~CollectionListIterator() { + } + + virtual bool valid() const = 0; + virtual const ghobject_t &oid() const = 0; + virtual void lower_bound(const ghobject_t &oid) = 0; + virtual void upper_bound(const ghobject_t &oid) = 0; + virtual void next() = 0; + + virtual int cmp(const ghobject_t &oid) const = 0; + + bool is_ge(const ghobject_t &oid) const { + return cmp(oid) >= 0; + } + + bool is_lt(const ghobject_t &oid) const { + return cmp(oid) < 0; + } + +protected: + KeyValueDB::Iterator m_it; +}; + +class SimpleCollectionListIterator : public CollectionListIterator { +public: + SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it) + : CollectionListIterator(it), m_cct(cct) { + } + + bool valid() const override { + return m_it->valid(); + } + + const ghobject_t &oid() const override { + ceph_assert(valid()); + + return m_oid; + } + + void lower_bound(const ghobject_t &oid) override { + string key; + get_object_key(m_cct, oid, &key); + + m_it->lower_bound(key); + get_oid(); + } + + void upper_bound(const ghobject_t &oid) override { + string key; + get_object_key(m_cct, oid, &key); + + m_it->upper_bound(key); + get_oid(); + } + + void next() override { + ceph_assert(valid()); + + m_it->next(); + get_oid(); + } + + int cmp(const ghobject_t &oid) const override { + ceph_assert(valid()); + + string key; + get_object_key(m_cct, oid, &key); + + return m_it->key().compare(key); + } + +private: + CephContext *m_cct; + ghobject_t m_oid; + + void get_oid() { + m_oid = ghobject_t(); + while (m_it->valid() && is_extent_shard_key(m_it->key())) { + m_it->next(); + } + if (!valid()) { + return; + } + + int r = get_key_object(m_it->key(), &m_oid); + ceph_assert(r == 0); + } +}; + +class SortedCollectionListIterator : public CollectionListIterator { +public: + SortedCollectionListIterator(const KeyValueDB::Iterator &it) + : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) { + } + + bool valid() const override { + return m_chunk_iter != m_chunk.end(); + } + + const ghobject_t &oid() const override { + ceph_assert(valid()); + + return m_chunk_iter->first; + } + + void lower_bound(const ghobject_t &oid) override { + std::string key; + _key_encode_prefix(oid, &key); + + m_it->lower_bound(key); + m_chunk_iter = m_chunk.end(); + if (!get_next_chunk()) { + return; + } + + if (this->oid().shard_id != oid.shard_id || + this->oid().hobj.pool != oid.hobj.pool || + this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { + return; + } + + m_chunk_iter = m_chunk.lower_bound(oid); + if (m_chunk_iter == m_chunk.end()) { + get_next_chunk(); + } + } + + void upper_bound(const ghobject_t &oid) override { + lower_bound(oid); + + if (valid() && this->oid() == oid) { + next(); + } + } + + void next() override { + ceph_assert(valid()); + + m_chunk_iter++; + if (m_chunk_iter == m_chunk.end()) { + get_next_chunk(); + } + } + + int cmp(const ghobject_t &oid) const override { + ceph_assert(valid()); + + if (this->oid() < oid) { + return -1; + } + if (this->oid() > oid) { + return 1; + } + return 0; + } + +private: + std::map<ghobject_t, std::string> m_chunk; + std::map<ghobject_t, std::string>::iterator m_chunk_iter; + + bool get_next_chunk() { + while (m_it->valid() && is_extent_shard_key(m_it->key())) { + m_it->next(); + } + + if (!m_it->valid()) { + return false; + } + + ghobject_t oid; + int r = get_key_object(m_it->key(), &oid); + ceph_assert(r == 0); 
+ + m_chunk.clear(); + while (true) { + m_chunk.insert({oid, m_it->key()}); + + do { + m_it->next(); + } while (m_it->valid() && is_extent_shard_key(m_it->key())); + + if (!m_it->valid()) { + break; + } + + ghobject_t next; + r = get_key_object(m_it->key(), &next); + ceph_assert(r == 0); + if (next.shard_id != oid.shard_id || + next.hobj.pool != oid.hobj.pool || + next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { + break; + } + oid = next; + } + + m_chunk_iter = m_chunk.begin(); + return true; + } +}; + +} // anonymous namespace + +// Garbage Collector + +void BlueStore::GarbageCollector::process_protrusive_extents( + const BlueStore::ExtentMap& extent_map, + uint64_t start_offset, + uint64_t end_offset, + uint64_t start_touch_offset, + uint64_t end_touch_offset, + uint64_t min_alloc_size) +{ + ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset); + + uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size); + uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size); + + dout(30) << __func__ << " (hex): [" << std::hex + << lookup_start_offset << ", " << lookup_end_offset + << ")" << std::dec << dendl; + + for (auto it = extent_map.seek_lextent(lookup_start_offset); + it != extent_map.extent_map.end() && + it->logical_offset < lookup_end_offset; + ++it) { + uint64_t alloc_unit_start = it->logical_offset / min_alloc_size; + uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size; + + dout(30) << __func__ << " " << *it + << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end + << dendl; + + Blob* b = it->blob.get(); + + if (it->logical_offset >=start_touch_offset && + it->logical_end() <= end_touch_offset) { + // Process extents within the range affected by + // the current write request. + // Need to take into account if existing extents + // can be merged with them (uncompressed case) + if (!b->get_blob().is_compressed()) { + if (blob_info_counted && used_alloc_unit == alloc_unit_start) { + --blob_info_counted->expected_allocations; // don't need to allocate + // new AU for compressed + // data since another + // collocated uncompressed + // blob already exists + dout(30) << __func__ << " --expected:" + << alloc_unit_start << dendl; + } + used_alloc_unit = alloc_unit_end; + blob_info_counted = nullptr; + } + } else if (b->get_blob().is_compressed()) { + + // additionally we take compressed blobs that were not impacted + // by the write into account too + BlobInfo& bi = + affected_blobs.emplace( + b, BlobInfo(b->get_referenced_bytes())).first->second; + + int adjust = + (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1; + bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust; + dout(30) << __func__ << " expected_allocations=" + << bi.expected_allocations << " end_au:" + << alloc_unit_end << dendl; + + blob_info_counted = &bi; + used_alloc_unit = alloc_unit_end; + + ceph_assert(it->length <= bi.referenced_bytes); + bi.referenced_bytes -= it->length; + dout(30) << __func__ << " affected_blob:" << *b + << " unref 0x" << std::hex << it->length + << " referenced = 0x" << bi.referenced_bytes + << std::dec << dendl; + // NOTE: we can't move specific blob to resulting GC list here + // when reference counter == 0 since subsequent extents might + // decrement its expected_allocation. + // Hence need to enumerate all the extents first. 
+ if (!bi.collect_candidate) { + bi.first_lextent = it; + bi.collect_candidate = true; + } + bi.last_lextent = it; + } else { + if (blob_info_counted && used_alloc_unit == alloc_unit_start) { + // don't need to allocate new AU for compressed data since another + // collocated uncompressed blob already exists + --blob_info_counted->expected_allocations; + dout(30) << __func__ << " --expected_allocations:" + << alloc_unit_start << dendl; + } + used_alloc_unit = alloc_unit_end; + blob_info_counted = nullptr; + } + } + + for (auto b_it = affected_blobs.begin(); + b_it != affected_blobs.end(); + ++b_it) { + Blob* b = b_it->first; + BlobInfo& bi = b_it->second; + if (bi.referenced_bytes == 0) { + uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length(); + int64_t blob_expected_for_release = + round_up_to(len_on_disk, min_alloc_size) / min_alloc_size; + + dout(30) << __func__ << " " << *(b_it->first) + << " expected4release=" << blob_expected_for_release + << " expected_allocations=" << bi.expected_allocations + << dendl; + int64_t benefit = blob_expected_for_release - bi.expected_allocations; + if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) { + if (bi.collect_candidate) { + auto it = bi.first_lextent; + bool bExit = false; + do { + if (it->blob.get() == b) { + extents_to_collect.insert(it->logical_offset, it->length); + } + bExit = it == bi.last_lextent; + ++it; + } while (!bExit); + } + expected_for_release += blob_expected_for_release; + expected_allocations += bi.expected_allocations; + } + } + } +} + +int64_t BlueStore::GarbageCollector::estimate( + uint64_t start_offset, + uint64_t length, + const BlueStore::ExtentMap& extent_map, + const BlueStore::old_extent_map_t& old_extents, + uint64_t min_alloc_size) +{ + + affected_blobs.clear(); + extents_to_collect.clear(); + used_alloc_unit = boost::optional<uint64_t >(); + blob_info_counted = nullptr; + + uint64_t gc_start_offset = start_offset; + uint64_t gc_end_offset = start_offset + length; + + uint64_t end_offset = start_offset + length; + + for (auto it = old_extents.begin(); it != old_extents.end(); ++it) { + Blob* b = it->e.blob.get(); + if (b->get_blob().is_compressed()) { + + // update gc_start_offset/gc_end_offset if needed + gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start()); + gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end()); + + auto o = it->e.logical_offset; + auto l = it->e.length; + + uint64_t ref_bytes = b->get_referenced_bytes(); + // micro optimization to bypass blobs that have no more references + if (ref_bytes != 0) { + dout(30) << __func__ << " affected_blob:" << *b + << " unref 0x" << std::hex << o << "~" << l + << std::dec << dendl; + affected_blobs.emplace(b, BlobInfo(ref_bytes)); + } + } + } + dout(30) << __func__ << " gc range(hex): [" << std::hex + << gc_start_offset << ", " << gc_end_offset + << ")" << std::dec << dendl; + + // enumerate preceeding extents to check if they reference affected blobs + if (gc_start_offset < start_offset || gc_end_offset > end_offset) { + process_protrusive_extents(extent_map, + gc_start_offset, + gc_end_offset, + start_offset, + end_offset, + min_alloc_size); + } + return expected_for_release - expected_allocations; +} + +// LruOnodeCacheShard +struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { + typedef boost::intrusive::list< + BlueStore::Onode, + boost::intrusive::member_hook< + BlueStore::Onode, + boost::intrusive::list_member_hook<>, + &BlueStore::Onode::lru_item> > list_t; + + list_t lru; + + explicit 
LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {} + + void _add(BlueStore::Onode* o, int level) override + { + o->set_cached(); + if (o->pin_nref == 1) { + (level > 0) ? lru.push_front(*o) : lru.push_back(*o); + o->cache_age_bin = age_bins.front(); + *(o->cache_age_bin) += 1; + } + ++num; // we count both pinned and unpinned entries + dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" + << num << dendl; + } + void _rm(BlueStore::Onode* o) override + { + o->clear_cached(); + if (o->lru_item.is_linked()) { + *(o->cache_age_bin) -= 1; + lru.erase(lru.iterator_to(*o)); + } + ceph_assert(num); + --num; + dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl; + } + + void maybe_unpin(BlueStore::Onode* o) override + { + OnodeCacheShard* ocs = this; + ocs->lock.lock(); + // It is possible that during waiting split_cache moved us to different OnodeCacheShard. + while (ocs != o->c->get_onode_cache()) { + ocs->lock.unlock(); + ocs = o->c->get_onode_cache(); + ocs->lock.lock(); + } + if (o->is_cached() && o->pin_nref == 1) { + if(!o->lru_item.is_linked()) { + if (o->exists) { + lru.push_front(*o); + o->cache_age_bin = age_bins.front(); + *(o->cache_age_bin) += 1; + dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" + << dendl; + } else { + ceph_assert(num); + --num; + o->clear_cached(); + dout(20) << __func__ << " " << this << " " << o->oid << " removed" + << dendl; + // remove will also decrement nref + o->c->onode_space._remove(o->oid); + } + } else if (o->exists) { + // move onode within LRU + lru.erase(lru.iterator_to(*o)); + lru.push_front(*o); + if (o->cache_age_bin != age_bins.front()) { + *(o->cache_age_bin) -= 1; + o->cache_age_bin = age_bins.front(); + *(o->cache_age_bin) += 1; + } + dout(20) << __func__ << " " << this << " " << o->oid << " touched" + << dendl; + } + } + ocs->lock.unlock(); + } + + void _trim_to(uint64_t new_size) override + { + if (new_size >= lru.size()) { + return; // don't even try + } + uint64_t n = num - new_size; // note: we might get empty LRU + // before n == 0 due to pinned + // entries. And hence being unable + // to reach new_size target. 
+ while (n-- > 0 && lru.size() > 0) { + BlueStore::Onode *o = &lru.back(); + lru.pop_back(); + + dout(20) << __func__ << " rm " << o->oid << " " + << o->nref << " " << o->cached << dendl; + + *(o->cache_age_bin) -= 1; + if (o->pin_nref > 1) { + dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << dendl; + } else { + ceph_assert(num); + --num; + o->clear_cached(); + o->c->onode_space._remove(o->oid); + } + } + } + void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override + { + if (to == this) { + return; + } + _rm(o); + ceph_assert(o->nref > 1); + to->_add(o, 0); + } + void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override + { + std::lock_guard l(lock); + *onodes += num; + *pinned_onodes += num - lru.size(); + } +}; + +// OnodeCacheShard +BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create( + CephContext* cct, + string type, + PerfCounters *logger) +{ + BlueStore::OnodeCacheShard *c = nullptr; + // Currently we only implement an LRU cache for onodes + c = new LruOnodeCacheShard(cct); + c->logger = logger; + return c; +} + +// LruBufferCacheShard +struct LruBufferCacheShard : public BlueStore::BufferCacheShard { + typedef boost::intrusive::list< + BlueStore::Buffer, + boost::intrusive::member_hook< + BlueStore::Buffer, + boost::intrusive::list_member_hook<>, + &BlueStore::Buffer::lru_item> > list_t; + list_t lru; + + explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {} + + void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override { + if (near) { + auto q = lru.iterator_to(*near); + lru.insert(q, *b); + } else if (level > 0) { + lru.push_front(*b); + } else { + lru.push_back(*b); + } + buffer_bytes += b->length; + b->cache_age_bin = age_bins.front(); + *(b->cache_age_bin) += b->length; + num = lru.size(); + } + void _rm(BlueStore::Buffer *b) override { + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; + auto q = lru.iterator_to(*b); + lru.erase(q); + num = lru.size(); + } + void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override { + src->_rm(b); + _add(b, 0, nullptr); + } + void _adjust_size(BlueStore::Buffer *b, int64_t delta) override { + ceph_assert((int64_t)buffer_bytes + delta >= 0); + buffer_bytes += delta; + assert(*(b->cache_age_bin) + delta >= 0); + *(b->cache_age_bin) += delta; + } + void _touch(BlueStore::Buffer *b) override { + auto p = lru.iterator_to(*b); + lru.erase(p); + lru.push_front(*b); + *(b->cache_age_bin) -= b->length; + b->cache_age_bin = age_bins.front(); + *(b->cache_age_bin) += b->length; + num = lru.size(); + _audit("_touch_buffer end"); + } + + void _trim_to(uint64_t max) override + { + while (buffer_bytes > max) { + auto i = lru.rbegin(); + if (i == lru.rend()) { + // stop if lru is now empty + break; + } + + BlueStore::Buffer *b = &*i; + ceph_assert(b->is_clean()); + dout(20) << __func__ << " rm " << *b << dendl; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; + b->space->_rm_buffer(this, b); + } + num = lru.size(); + } + + void add_stats(uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) override { + *extents += num_extents; + *blobs += num_blobs; + *buffers += num; + *bytes += buffer_bytes; + } +#ifdef DEBUG_CACHE + void _audit(const char *s) override + { + dout(10) << __func__ << " " << when << " start" << dendl; + uint64_t s = 0; + for (auto i = lru.begin(); i != lru.end(); ++i) { 
+ s += i->length; + } + if (s != buffer_bytes) { + derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s + << dendl; + for (auto i = lru.begin(); i != lru.end(); ++i) { + derr << __func__ << " " << *i << dendl; + } + ceph_assert(s == buffer_bytes); + } + dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes + << " ok" << dendl; + } +#endif +}; + +// TwoQBufferCacheShard + +struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard { + typedef boost::intrusive::list< + BlueStore::Buffer, + boost::intrusive::member_hook< + BlueStore::Buffer, + boost::intrusive::list_member_hook<>, + &BlueStore::Buffer::lru_item> > list_t; + list_t hot; ///< "Am" hot buffers + list_t warm_in; ///< "A1in" newly warm buffers + list_t warm_out; ///< "A1out" empty buffers we've evicted + + enum { + BUFFER_NEW = 0, + BUFFER_WARM_IN, ///< in warm_in + BUFFER_WARM_OUT, ///< in warm_out + BUFFER_HOT, ///< in hot + BUFFER_TYPE_MAX + }; + + uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type + +public: + explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {} + + void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override + { + dout(20) << __func__ << " level " << level << " near " << near + << " on " << *b + << " which has cache_private " << b->cache_private << dendl; + if (near) { + b->cache_private = near->cache_private; + switch (b->cache_private) { + case BUFFER_WARM_IN: + warm_in.insert(warm_in.iterator_to(*near), *b); + break; + case BUFFER_WARM_OUT: + ceph_assert(b->is_empty()); + warm_out.insert(warm_out.iterator_to(*near), *b); + break; + case BUFFER_HOT: + hot.insert(hot.iterator_to(*near), *b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + } else if (b->cache_private == BUFFER_NEW) { + b->cache_private = BUFFER_WARM_IN; + if (level > 0) { + warm_in.push_front(*b); + } else { + // take caller hint to start at the back of the warm queue + warm_in.push_back(*b); + } + } else { + // we got a hint from discard + switch (b->cache_private) { + case BUFFER_WARM_IN: + // stay in warm_in. move to front, even though 2Q doesn't actually + // do this. + dout(20) << __func__ << " move to front of warm " << *b << dendl; + warm_in.push_front(*b); + break; + case BUFFER_WARM_OUT: + b->cache_private = BUFFER_HOT; + // move to hot. 
fall-thru + case BUFFER_HOT: + dout(20) << __func__ << " move to front of hot " << *b << dendl; + hot.push_front(*b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + } + b->cache_age_bin = age_bins.front(); + if (!b->is_empty()) { + buffer_bytes += b->length; + list_bytes[b->cache_private] += b->length; + *(b->cache_age_bin) += b->length; + } + num = hot.size() + warm_in.size(); + } + + void _rm(BlueStore::Buffer *b) override + { + dout(20) << __func__ << " " << *b << dendl; + if (!b->is_empty()) { + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + ceph_assert(list_bytes[b->cache_private] >= b->length); + list_bytes[b->cache_private] -= b->length; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; + } + switch (b->cache_private) { + case BUFFER_WARM_IN: + warm_in.erase(warm_in.iterator_to(*b)); + break; + case BUFFER_WARM_OUT: + warm_out.erase(warm_out.iterator_to(*b)); + break; + case BUFFER_HOT: + hot.erase(hot.iterator_to(*b)); + break; + default: + ceph_abort_msg("bad cache_private"); + } + num = hot.size() + warm_in.size(); + } + + void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override + { + TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc); + src->_rm(b); + + // preserve which list we're on (even if we can't preserve the order!) + switch (b->cache_private) { + case BUFFER_WARM_IN: + ceph_assert(!b->is_empty()); + warm_in.push_back(*b); + break; + case BUFFER_WARM_OUT: + ceph_assert(b->is_empty()); + warm_out.push_back(*b); + break; + case BUFFER_HOT: + ceph_assert(!b->is_empty()); + hot.push_back(*b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + if (!b->is_empty()) { + buffer_bytes += b->length; + list_bytes[b->cache_private] += b->length; + *(b->cache_age_bin) += b->length; + } + num = hot.size() + warm_in.size(); + } + + void _adjust_size(BlueStore::Buffer *b, int64_t delta) override + { + dout(20) << __func__ << " delta " << delta << " on " << *b << dendl; + if (!b->is_empty()) { + ceph_assert((int64_t)buffer_bytes + delta >= 0); + buffer_bytes += delta; + ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0); + list_bytes[b->cache_private] += delta; + assert(*(b->cache_age_bin) + delta >= 0); + *(b->cache_age_bin) += delta; + } + } + + void _touch(BlueStore::Buffer *b) override { + switch (b->cache_private) { + case BUFFER_WARM_IN: + // do nothing (somewhat counter-intuitively!) 
+ break; + case BUFFER_WARM_OUT: + // move from warm_out to hot LRU + ceph_abort_msg("this happens via discard hint"); + break; + case BUFFER_HOT: + // move to front of hot LRU + hot.erase(hot.iterator_to(*b)); + hot.push_front(*b); + break; + } + *(b->cache_age_bin) -= b->length; + b->cache_age_bin = age_bins.front(); + *(b->cache_age_bin) += b->length; + num = hot.size() + warm_in.size(); + _audit("_touch_buffer end"); + } + + void _trim_to(uint64_t max) override + { + if (buffer_bytes > max) { + uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio; + uint64_t khot = max - kin; + + // pre-calculate kout based on average buffer size too, + // which is typical(the warm_in and hot lists may change later) + uint64_t kout = 0; + uint64_t buffer_num = hot.size() + warm_in.size(); + if (buffer_num) { + uint64_t avg_size = buffer_bytes / buffer_num; + ceph_assert(avg_size); + uint64_t calculated_num = max / avg_size; + kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio; + } + + if (list_bytes[BUFFER_HOT] < khot) { + // hot is small, give slack to warm_in + kin += khot - list_bytes[BUFFER_HOT]; + } else if (list_bytes[BUFFER_WARM_IN] < kin) { + // warm_in is small, give slack to hot + khot += kin - list_bytes[BUFFER_WARM_IN]; + } + + // adjust warm_in list + int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin; + uint64_t evicted = 0; + + while (to_evict_bytes > 0) { + auto p = warm_in.rbegin(); + if (p == warm_in.rend()) { + // stop if warm_in list is now empty + break; + } + + BlueStore::Buffer *b = &*p; + ceph_assert(b->is_clean()); + dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl; + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length); + list_bytes[BUFFER_WARM_IN] -= b->length; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; + to_evict_bytes -= b->length; + evicted += b->length; + b->state = BlueStore::Buffer::STATE_EMPTY; + b->data.clear(); + warm_in.erase(warm_in.iterator_to(*b)); + warm_out.push_front(*b); + b->cache_private = BUFFER_WARM_OUT; + } + + if (evicted > 0) { + dout(20) << __func__ << " evicted " << byte_u_t(evicted) + << " from warm_in list, done evicting warm_in buffers" + << dendl; + } + + // adjust hot list + to_evict_bytes = list_bytes[BUFFER_HOT] - khot; + evicted = 0; + + while (to_evict_bytes > 0) { + auto p = hot.rbegin(); + if (p == hot.rend()) { + // stop if hot list is now empty + break; + } + + BlueStore::Buffer *b = &*p; + dout(20) << __func__ << " buffer_hot rm " << *b << dendl; + ceph_assert(b->is_clean()); + // adjust evict size before buffer goes invalid + to_evict_bytes -= b->length; + evicted += b->length; + b->space->_rm_buffer(this, b); + } + + if (evicted > 0) { + dout(20) << __func__ << " evicted " << byte_u_t(evicted) + << " from hot list, done evicting hot buffers" + << dendl; + } + + // adjust warm out list too, if necessary + int64_t n = warm_out.size() - kout; + while (n-- > 0) { + BlueStore::Buffer *b = &*warm_out.rbegin(); + ceph_assert(b->is_empty()); + dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl; + b->space->_rm_buffer(this, b); + } + } + num = hot.size() + warm_in.size(); + } + + void add_stats(uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) override { + *extents += num_extents; + *blobs += num_blobs; + *buffers += num; + *bytes += buffer_bytes; + } + +#ifdef DEBUG_CACHE + void _audit(const char *s) override + { + dout(10) << __func__ << " " << when 
<< " start" << dendl; + uint64_t s = 0; + for (auto i = hot.begin(); i != hot.end(); ++i) { + s += i->length; + } + + uint64_t hot_bytes = s; + if (hot_bytes != list_bytes[BUFFER_HOT]) { + derr << __func__ << " hot_list_bytes " + << list_bytes[BUFFER_HOT] + << " != actual " << hot_bytes + << dendl; + ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]); + } + + for (auto i = warm_in.begin(); i != warm_in.end(); ++i) { + s += i->length; + } + + uint64_t warm_in_bytes = s - hot_bytes; + if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) { + derr << __func__ << " warm_in_list_bytes " + << list_bytes[BUFFER_WARM_IN] + << " != actual " << warm_in_bytes + << dendl; + ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]); + } + + if (s != buffer_bytes) { + derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s + << dendl; + ceph_assert(s == buffer_bytes); + } + + dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes + << " ok" << dendl; + } +#endif +}; + +// BuferCacheShard + +BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create( + CephContext* cct, + string type, + PerfCounters *logger) +{ + BufferCacheShard *c = nullptr; + if (type == "lru") + c = new LruBufferCacheShard(cct); + else if (type == "2q") + c = new TwoQBufferCacheShard(cct); + else + ceph_abort_msg("unrecognized cache type"); + c->logger = logger; + return c; +} + +// BufferSpace + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") " + +void BlueStore::BufferSpace::_clear(BufferCacheShard* cache) +{ + // note: we already hold cache->lock + ldout(cache->cct, 20) << __func__ << dendl; + while (!buffer_map.empty()) { + _rm_buffer(cache, buffer_map.begin()); + } +} + +int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) +{ + // note: we already hold cache->lock + ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length + << std::dec << dendl; + int cache_private = 0; + cache->_audit("discard start"); + auto i = _data_lower_bound(offset); + uint32_t end = offset + length; + while (i != buffer_map.end()) { + Buffer *b = i->second.get(); + if (b->offset >= end) { + break; + } + if (b->cache_private > cache_private) { + cache_private = b->cache_private; + } + if (b->offset < offset) { + int64_t front = offset - b->offset; + if (b->end() > end) { + // drop middle (split) + uint32_t tail = b->end() - end; + if (b->data.length()) { + bufferlist bl; + bl.substr_of(b->data, b->length - tail, tail); + Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags); + nb->maybe_rebuild(); + _add_buffer(cache, nb, 0, b); + } else { + _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail, + b->flags), + 0, b); + } + if (!b->is_writing()) { + cache->_adjust_size(b, front - (int64_t)b->length); + } + b->truncate(front); + b->maybe_rebuild(); + cache->_audit("discard end 1"); + break; + } else { + // drop tail + if (!b->is_writing()) { + cache->_adjust_size(b, front - (int64_t)b->length); + } + b->truncate(front); + b->maybe_rebuild(); + ++i; + continue; + } + } + if (b->end() <= end) { + // drop entire buffer + _rm_buffer(cache, i++); + continue; + } + // drop front + uint32_t keep = b->end() - end; + if (b->data.length()) { + bufferlist bl; + bl.substr_of(b->data, b->length - keep, keep); + Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags); + nb->maybe_rebuild(); + _add_buffer(cache, nb, 0, b); + } else { + _add_buffer(cache, new Buffer(this, b->state, 
b->seq, end, keep, + b->flags), + 0, b); + } + _rm_buffer(cache, i); + cache->_audit("discard end 2"); + break; + } + return cache_private; +} + +void BlueStore::BufferSpace::read( + BufferCacheShard* cache, + uint32_t offset, + uint32_t length, + BlueStore::ready_regions_t& res, + interval_set<uint32_t>& res_intervals, + int flags) +{ + res.clear(); + res_intervals.clear(); + uint32_t want_bytes = length; + uint32_t end = offset + length; + + { + std::lock_guard l(cache->lock); + for (auto i = _data_lower_bound(offset); + i != buffer_map.end() && offset < end && i->first < end; + ++i) { + Buffer *b = i->second.get(); + ceph_assert(b->end() > offset); + + bool val = false; + if (flags & BYPASS_CLEAN_CACHE) + val = b->is_writing(); + else + val = b->is_writing() || b->is_clean(); + if (val) { + if (b->offset < offset) { + uint32_t skip = offset - b->offset; + uint32_t l = min(length, b->length - skip); + res[offset].substr_of(b->data, skip, l); + res_intervals.insert(offset, l); + offset += l; + length -= l; + if (!b->is_writing()) { + cache->_touch(b); + } + continue; + } + if (b->offset > offset) { + uint32_t gap = b->offset - offset; + if (length <= gap) { + break; + } + offset += gap; + length -= gap; + } + if (!b->is_writing()) { + cache->_touch(b); + } + if (b->length > length) { + res[offset].substr_of(b->data, 0, length); + res_intervals.insert(offset, length); + break; + } else { + res[offset].append(b->data); + res_intervals.insert(offset, b->length); + if (b->length == length) + break; + offset += b->length; + length -= b->length; + } + } + } + } + + uint64_t hit_bytes = res_intervals.size(); + ceph_assert(hit_bytes <= want_bytes); + uint64_t miss_bytes = want_bytes - hit_bytes; + cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes); + cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes); +} + +void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq) +{ + auto i = writing.begin(); + while (i != writing.end()) { + if (i->seq > seq) { + break; + } + if (i->seq < seq) { + ++i; + continue; + } + + Buffer *b = &*i; + ceph_assert(b->is_writing()); + + if (b->flags & Buffer::FLAG_NOCACHE) { + writing.erase(i++); + ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl; + buffer_map.erase(b->offset); + } else { + b->state = Buffer::STATE_CLEAN; + writing.erase(i++); + b->maybe_rebuild(); + b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); + cache->_add(b, 1, nullptr); + ldout(cache->cct, 20) << __func__ << " added " << *b << dendl; + } + } + cache->_trim(); + cache->_audit("finish_write end"); +} + +void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r) +{ + std::lock_guard lk(cache->lock); + if (buffer_map.empty()) + return; + + auto p = --buffer_map.end(); + while (true) { + if (p->second->end() <= pos) + break; + + if (p->second->offset < pos) { + ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl; + size_t left = pos - p->second->offset; + size_t right = p->second->length - left; + if (p->second->data.length()) { + bufferlist bl; + bl.substr_of(p->second->data, left, right); + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, + 0, bl, p->second->flags), + 0, p->second.get()); + } else { + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, + 0, right, p->second->flags), + 0, p->second.get()); + } + cache->_adjust_size(p->second.get(), -right); + p->second->truncate(left); + break; + } + + ceph_assert(p->second->end() > 
pos); + ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl; + if (p->second->data.length()) { + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->data, p->second->flags), + 0, p->second.get()); + } else { + r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->length, p->second->flags), + 0, p->second.get()); + } + if (p == buffer_map.begin()) { + _rm_buffer(cache, p); + break; + } else { + _rm_buffer(cache, p--); + } + } + ceph_assert(writing.empty()); + cache->_trim(); +} + +// OnodeSpace + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") " + +BlueStore::OnodeRef BlueStore::OnodeSpace::add_onode(const ghobject_t& oid, + OnodeRef& o) +{ + std::lock_guard l(cache->lock); + // add entry or return existing one + auto p = onode_map.emplace(oid, o); + if (!p.second) { + ldout(cache->cct, 30) << __func__ << " " << oid << " " << o + << " raced, returning existing " << p.first->second + << dendl; + return p.first->second; + } + ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl; + cache->_add(o.get(), 1); + cache->_trim(); + return o; +} + +void BlueStore::OnodeSpace::_remove(const ghobject_t& oid) +{ + ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl; + onode_map.erase(oid); +} + +BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) +{ + ldout(cache->cct, 30) << __func__ << dendl; + OnodeRef o; + + { + std::lock_guard l(cache->lock); + ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl; + cache->logger->inc(l_bluestore_onode_misses); + } else { + ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second + << " " << p->second->nref + << " " << p->second->cached + << dendl; + // This will pin onode and implicitly touch the cache when Onode + // eventually will become unpinned + o = p->second; + + cache->logger->inc(l_bluestore_onode_hits); + } + } + + return o; +} + +void BlueStore::OnodeSpace::clear() +{ + std::lock_guard l(cache->lock); + ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl; + for (auto &p : onode_map) { + cache->_rm(p.second.get()); + } + onode_map.clear(); +} + +bool BlueStore::OnodeSpace::empty() +{ + std::lock_guard l(cache->lock); + return onode_map.empty(); +} + +void BlueStore::OnodeSpace::rename( + OnodeRef& oldo, + const ghobject_t& old_oid, + const ghobject_t& new_oid, + const mempool::bluestore_cache_meta::string& new_okey) +{ + std::lock_guard l(cache->lock); + ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid + << dendl; + ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn; + po = onode_map.find(old_oid); + pn = onode_map.find(new_oid); + ceph_assert(po != pn); + + ceph_assert(po != onode_map.end()); + if (pn != onode_map.end()) { + ldout(cache->cct, 30) << __func__ << " removing target " << pn->second + << dendl; + cache->_rm(pn->second.get()); + onode_map.erase(pn); + } + OnodeRef o = po->second; + + // install a non-existent onode at old location + oldo.reset(new Onode(o->c, old_oid, o->key)); + po->second = oldo; + cache->_add(oldo.get(), 1); + // add at new position and fix oid, key. 
+ // This will pin 'o' and implicitly touch cache + // when it will eventually become unpinned + onode_map.insert(make_pair(new_oid, o)); + + o->oid = new_oid; + o->key = new_okey; + cache->_trim(); +} + +bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f) +{ + std::lock_guard l(cache->lock); + ldout(cache->cct, 20) << __func__ << dendl; + for (auto& i : onode_map) { + if (f(i.second.get())) { + return true; + } + } + return false; +} + +template <int LogLevelV = 30> +void BlueStore::OnodeSpace::dump(CephContext *cct) +{ + for (auto& i : onode_map) { + ldout(cct, LogLevelV) << i.first << " : " << i.second + << " " << i.second->nref + << " " << i.second->cached + << dendl; + } +} + +// SharedBlob + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") " +#undef dout_context +#define dout_context coll->store->cct + +void BlueStore::SharedBlob::dump(Formatter* f) const +{ + f->dump_bool("loaded", loaded); + if (loaded) { + persistent->dump(f); + } else { + f->dump_unsigned("sbid_unloaded", sbid_unloaded); + } +} + +ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb) +{ + out << "SharedBlob(" << &sb; + + if (sb.loaded) { + out << " loaded " << *sb.persistent; + } else { + out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec; + } + return out << ")"; +} + +BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll) + : coll(_coll), sbid_unloaded(i) +{ + ceph_assert(sbid_unloaded > 0); + if (get_cache()) { + get_cache()->add_blob(); + } +} + +BlueStore::SharedBlob::~SharedBlob() +{ + if (loaded && persistent) { + delete persistent; + } +} + +void BlueStore::SharedBlob::put() +{ + if (--nref == 0) { + dout(20) << __func__ << " " << this + << " removing self from set " << get_parent() + << dendl; + again: + auto coll_snap = coll; + if (coll_snap) { + std::lock_guard l(coll_snap->cache->lock); + if (coll_snap != coll) { + goto again; + } + if (!coll_snap->shared_blob_set.remove(this, true)) { + // race with lookup + return; + } + bc._clear(coll_snap->cache); + coll_snap->cache->rm_blob(); + } + delete this; + } +} + +void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length) +{ + ceph_assert(persistent); + persistent->ref_map.get(offset, length); +} + +void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length, + PExtentVector *r, + bool *unshare) +{ + ceph_assert(persistent); + persistent->ref_map.put(offset, length, r, + unshare && !*unshare ? 
unshare : nullptr);
+}
+
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+  while (true) {
+    BufferCacheShard *cache = coll->cache;
+    std::lock_guard l(cache->lock);
+    if (coll->cache != cache) {
+      dout(20) << __func__
+               << " raced with sb cache update, was " << cache
+               << ", now " << coll->cache << ", retrying"
+               << dendl;
+      continue;
+    }
+    bc._finish_write(cache, seq);
+    break;
+  }
+}
+
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+template <int LogLevelV = 30>
+void BlueStore::SharedBlobSet::dump(CephContext *cct)
+{
+  std::lock_guard l(lock);
+  for (auto& i : sb_map) {
+    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
+  }
+}
+
+// Blob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
+
+void BlueStore::Blob::dump(Formatter* f) const
+{
+  if (is_spanning()) {
+    f->dump_unsigned("spanning_id ", id);
+  }
+  blob.dump(f);
+  if (shared_blob) {
+    f->dump_object("shared", *shared_blob);
+  }
+}
+
+ostream& operator<<(ostream& out, const BlueStore::Blob& b)
+{
+  out << "Blob(" << &b;
+  if (b.is_spanning()) {
+    out << " spanning " << b.id;
+  }
+  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
+  if (b.shared_blob) {
+    out << " " << *b.shared_blob;
+  } else {
+    out << " (shared_blob=NULL)";
+  }
+  out << ")";
+  return out;
+}
+
+void BlueStore::Blob::discard_unallocated(Collection *coll)
+{
+  if (get_blob().is_shared()) {
+    return;
+  }
+  if (get_blob().is_compressed()) {
+    bool discard = false;
+    bool all_invalid = true;
+    for (auto e : get_blob().get_extents()) {
+      if (!e.is_valid()) {
+        discard = true;
+      } else {
+        all_invalid = false;
+      }
+    }
+    ceph_assert(discard == all_invalid); // for a compressed blob either all
+                                         // pextents are invalid or none are.
+    if (discard) {
+      shared_blob->bc.discard(shared_blob->get_cache(), 0,
+                              get_blob().get_logical_length());
+    }
+  } else {
+    size_t pos = 0;
+    for (auto e : get_blob().get_extents()) {
+      if (!e.is_valid()) {
+        dout(20) << __func__ << " 0x" << std::hex << pos
+                 << "~" << e.length
+                 << std::dec << dendl;
+        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
+      }
+      pos += e.length;
+    }
+    if (get_blob().can_prune_tail()) {
+      dirty_blob().prune_tail();
+      used_in_blob.prune_tail(get_blob().get_ondisk_length());
+      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
+    }
+  }
+}
+
+void BlueStore::Blob::get_ref(
+  Collection *coll,
+  uint32_t offset,
+  uint32_t length)
+{
+  // The caller has to initialize the Blob's logical length before
+  // incrementing references. Otherwise it is impossible to determine the
+  // required number of counters for per-au tracking, or to obtain
+  // min_release_size for single-counter mode.
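+  // A hedged illustration (numbers are assumptions, not defaults): a blob
+  // with logical length 0x40000 and min_release_size 0x10000 ends up with
+  // four per-au counters, while a blob no larger than min_release_size
+  // falls back to a single counter covering the whole blob.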
+ ceph_assert(get_blob().get_logical_length() != 0); + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << " " << *this << dendl; + + if (used_in_blob.is_empty()) { + uint32_t min_release_size = + get_blob().get_release_size(coll->store->min_alloc_size); + uint64_t l = get_blob().get_logical_length(); + dout(20) << __func__ << " init 0x" << std::hex << l << ", " + << min_release_size << std::dec << dendl; + used_in_blob.init(l, min_release_size); + } + used_in_blob.get( + offset, + length); +} + +bool BlueStore::Blob::put_ref( + Collection *coll, + uint32_t offset, + uint32_t length, + PExtentVector *r) +{ + PExtentVector logical; + + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << " " << *this << dendl; + + bool empty = used_in_blob.put( + offset, + length, + &logical); + r->clear(); + // nothing to release + if (!empty && logical.empty()) { + return false; + } + + bluestore_blob_t& b = dirty_blob(); + return b.release_extents(empty, logical, r); +} + +bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size, + uint32_t target_blob_size, + uint32_t b_offset, + uint32_t *length0) { + ceph_assert(min_alloc_size); + ceph_assert(target_blob_size); + if (!get_blob().is_mutable()) { + return false; + } + + uint32_t length = *length0; + uint32_t end = b_offset + length; + + // Currently for the sake of simplicity we omit blob reuse if data is + // unaligned with csum chunk. Later we can perform padding if needed. + if (get_blob().has_csum() && + ((b_offset % get_blob().get_csum_chunk_size()) != 0 || + (end % get_blob().get_csum_chunk_size()) != 0)) { + return false; + } + + auto blen = get_blob().get_logical_length(); + uint32_t new_blen = blen; + + // make sure target_blob_size isn't less than current blob len + target_blob_size = std::max(blen, target_blob_size); + + if (b_offset >= blen) { + // new data totally stands out of the existing blob + new_blen = end; + } else { + // new data overlaps with the existing blob + new_blen = std::max(blen, end); + + uint32_t overlap = 0; + if (new_blen > blen) { + overlap = blen - b_offset; + } else { + overlap = length; + } + + if (!get_blob().is_unallocated(b_offset, overlap)) { + // abort if any piece of the overlap has already been allocated + return false; + } + } + + if (new_blen > blen) { + int64_t overflow = int64_t(new_blen) - target_blob_size; + // Unable to decrease the provided length to fit into max_blob_size + if (overflow >= length) { + return false; + } + + // FIXME: in some cases we could reduce unused resolution + if (get_blob().has_unused()) { + return false; + } + + if (overflow > 0) { + new_blen -= overflow; + length -= overflow; + *length0 = length; + } + + if (new_blen > blen) { + dirty_blob().add_tail(new_blen); + used_in_blob.add_tail(new_blen, + get_blob().get_release_size(min_alloc_size)); + } + } + return true; +} + +void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r) +{ + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " start " << *this << dendl; + ceph_assert(blob.can_split()); + ceph_assert(used_in_blob.can_split()); + bluestore_blob_t &lb = dirty_blob(); + bluestore_blob_t &rb = r->dirty_blob(); + + used_in_blob.split( + blob_offset, + &(r->used_in_blob)); + + lb.split(blob_offset, rb); + shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc); + + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " finish " << *this << dendl; + dout(10) << __func__ 
<< " 0x" << std::hex << blob_offset << std::dec + << " and " << *r << dendl; +} + +#ifndef CACHE_BLOB_BL +void BlueStore::Blob::decode( + bufferptr::const_iterator& p, + uint64_t struct_v, + uint64_t* sbid, + bool include_ref_map, + Collection *coll) +{ + denc(blob, p, struct_v); + if (blob.is_shared()) { + denc(*sbid, p); + } + if (include_ref_map) { + if (struct_v > 1) { + used_in_blob.decode(p); + } else { + used_in_blob.clear(); + bluestore_extent_ref_map_t legacy_ref_map; + legacy_ref_map.decode(p); + if (coll) { + for (auto r : legacy_ref_map.ref_map) { + get_ref( + coll, + r.first, + r.second.refs * r.second.length); + } + } + } + } +} +#endif + +// Extent + +void BlueStore::Extent::dump(Formatter* f) const +{ + f->dump_unsigned("logical_offset", logical_offset); + f->dump_unsigned("length", length); + f->dump_unsigned("blob_offset", blob_offset); + f->dump_object("blob", *blob); +} + +ostream& operator<<(ostream& out, const BlueStore::Extent& e) +{ + return out << std::hex << "0x" << e.logical_offset << "~" << e.length + << ": 0x" << e.blob_offset << "~" << e.length << std::dec + << " " << *e.blob; +} + +// OldExtent +BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c, + uint32_t lo, + uint32_t o, + uint32_t l, + BlobRef& b) { + OldExtent* oe = new OldExtent(lo, o, l, b); + b->put_ref(c.get(), o, l, &(oe->r)); + oe->blob_empty = !b->is_referenced(); + return oe; +} + +// ExtentMap + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") " +#undef dout_context +#define dout_context onode->c->store->cct + +BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size) + : onode(o), + inline_bl(inline_shard_prealloc_size) { +} + +void BlueStore::ExtentMap::dump(Formatter* f) const +{ + f->open_array_section("extents"); + + for (auto& e : extent_map) { + f->dump_object("extent", e); + } + f->close_section(); +} + +void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc, + CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff, + uint64_t& length, uint64_t& dstoff) { + + auto cct = onode->c->store->cct; + bool inject_21040 = + cct->_conf->bluestore_debug_inject_bug21040; + vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size()); + for (auto& e : oldo->extent_map.extent_map) { + e.blob->last_encoded_id = -1; + } + + int n = 0; + uint64_t end = srcoff + length; + uint32_t dirty_range_begin = 0; + uint32_t dirty_range_end = 0; + bool src_dirty = false; + for (auto ep = oldo->extent_map.seek_lextent(srcoff); + ep != oldo->extent_map.extent_map.end(); + ++ep) { + auto& e = *ep; + if (e.logical_offset >= end) { + break; + } + dout(20) << __func__ << " src " << e << dendl; + BlobRef cb; + bool blob_duped = true; + if (e.blob->last_encoded_id >= 0) { + cb = id_to_blob[e.blob->last_encoded_id]; + blob_duped = false; + } else { + // dup the blob + const bluestore_blob_t& blob = e.blob->get_blob(); + // make sure it is shared + if (!blob.is_shared()) { + c->make_blob_shared(b->_assign_blobid(txc), e.blob); + if (!inject_21040 && !src_dirty) { + src_dirty = true; + dirty_range_begin = e.logical_offset; + } else if (inject_21040 && + dirty_range_begin == 0 && dirty_range_end == 0) { + dirty_range_begin = e.logical_offset; + } + ceph_assert(e.logical_end() > 0); + // -1 to exclude next potential shard + dirty_range_end = e.logical_end() - 1; + } else { + c->load_shared_blob(e.blob->shared_blob); + } + cb = new Blob(); + e.blob->last_encoded_id = n; + id_to_blob[n] = cb; + e.blob->dup(*cb); + // bump the extent 
refs on the copied blob's extents + for (auto p : blob.get_extents()) { + if (p.is_valid()) { + e.blob->shared_blob->get_ref(p.offset, p.length); + } + } + txc->write_shared_blob(e.blob->shared_blob); + dout(20) << __func__ << " new " << *cb << dendl; + } + + int skip_front, skip_back; + if (e.logical_offset < srcoff) { + skip_front = srcoff - e.logical_offset; + } else { + skip_front = 0; + } + if (e.logical_end() > end) { + skip_back = e.logical_end() - end; + } else { + skip_back = 0; + } + + Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff, + e.blob_offset + skip_front, e.length - skip_front - skip_back, cb); + newo->extent_map.extent_map.insert(*ne); + ne->blob->get_ref(c.get(), ne->blob_offset, ne->length); + // fixme: we may leave parts of new blob unreferenced that could + // be freed (relative to the shared_blob). + txc->statfs_delta.stored() += ne->length; + if (e.blob->get_blob().is_compressed()) { + txc->statfs_delta.compressed_original() += ne->length; + if (blob_duped) { + txc->statfs_delta.compressed() += + cb->get_blob().get_compressed_payload_length(); + } + } + dout(20) << __func__ << " dst " << *ne << dendl; + ++n; + } + if ((!inject_21040 && src_dirty) || + (inject_21040 && dirty_range_end > dirty_range_begin)) { + oldo->extent_map.dirty_range(dirty_range_begin, + dirty_range_end - dirty_range_begin); + txc->write_onode(oldo); + } + txc->write_onode(newo); + + if (dstoff + length > newo->onode.size) { + newo->onode.size = dstoff + length; + } + newo->extent_map.dirty_range(dstoff, length); +} +void BlueStore::ExtentMap::update(KeyValueDB::Transaction t, + bool force) +{ + auto cct = onode->c->store->cct; //used by dout + dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl; + if (onode->onode.extent_map_shards.empty()) { + if (inline_bl.length() == 0) { + unsigned n; + // we need to encode inline_bl to measure encoded length + bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n); + inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl); + ceph_assert(!never_happen); + size_t len = inline_bl.length(); + dout(20) << __func__ << " inline shard " << len << " bytes from " << n + << " extents" << dendl; + if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) { + request_reshard(0, OBJECT_MAX_SIZE); + return; + } + } + // will persist in the onode key. 
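+    // Nothing is written here: with no shards, the inline encoding travels
+    // with the onode's own value when the onode is persisted (via
+    // write_onode()), not in a separate PREFIX_OBJ extent-shard key.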
+ } else { + // pending shard update + struct dirty_shard_t { + Shard *shard; + bufferlist bl; + dirty_shard_t(Shard *s) : shard(s) {} + }; + vector<dirty_shard_t> encoded_shards; + // allocate slots for all shards in a single call instead of + // doing multiple allocations - one per each dirty shard + encoded_shards.reserve(shards.size()); + + auto p = shards.begin(); + auto prev_p = p; + while (p != shards.end()) { + ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset); + auto n = p; + ++n; + if (p->dirty) { + uint32_t endoff; + if (n == shards.end()) { + endoff = OBJECT_MAX_SIZE; + } else { + endoff = n->shard_info->offset; + } + encoded_shards.emplace_back(dirty_shard_t(&(*p))); + bufferlist& bl = encoded_shards.back().bl; + if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset, + bl, &p->extents)) { + if (force) { + derr << __func__ << " encode_some needs reshard" << dendl; + ceph_assert(!force); + } + } + size_t len = bl.length(); + + dout(20) << __func__ << " shard 0x" << std::hex + << p->shard_info->offset << std::dec << " is " << len + << " bytes (was " << p->shard_info->bytes << ") from " + << p->extents << " extents" << dendl; + + if (!force) { + if (len > cct->_conf->bluestore_extent_map_shard_max_size) { + // we are big; reshard ourselves + request_reshard(p->shard_info->offset, endoff); + } + // avoid resharding the trailing shard, even if it is small + else if (n != shards.end() && + len < g_conf()->bluestore_extent_map_shard_min_size) { + ceph_assert(endoff != OBJECT_MAX_SIZE); + if (p == shards.begin()) { + // we are the first shard, combine with next shard + request_reshard(p->shard_info->offset, endoff + 1); + } else { + // combine either with the previous shard or the next, + // whichever is smaller + if (prev_p->shard_info->bytes > n->shard_info->bytes) { + request_reshard(p->shard_info->offset, endoff + 1); + } else { + request_reshard(prev_p->shard_info->offset, endoff); + } + } + } + } + } + prev_p = p; + p = n; + } + if (needs_reshard()) { + return; + } + + // schedule DB update for dirty shards + string key; + for (auto& it : encoded_shards) { + dout(20) << __func__ << " encoding key for shard 0x" << std::hex + << it.shard->shard_info->offset << std::dec << dendl; + it.shard->dirty = false; + it.shard->shard_info->bytes = it.bl.length(); + generate_extent_shard_key_and_apply( + onode->key, + it.shard->shard_info->offset, + &key, + [&](const string& final_key) { + t->set(PREFIX_OBJ, final_key, it.bl); + } + ); + } + } +} + +bid_t BlueStore::ExtentMap::allocate_spanning_blob_id() +{ + if (spanning_blob_map.empty()) + return 0; + bid_t bid = spanning_blob_map.rbegin()->first + 1; + // bid is valid and available. 
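+  // (If the increment above wrapped bid_t around to a negative value,
+  // fall through to the randomized probe below to find a free id.)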
+ if (bid >= 0) + return bid; + // Find next unused bid; + bid = rand() % (numeric_limits<bid_t>::max() + 1); + const auto begin_bid = bid; + do { + if (!spanning_blob_map.count(bid)) + return bid; + else { + bid++; + if (bid < 0) bid = 0; + } + } while (bid != begin_bid); + auto cct = onode->c->store->cct; // used by dout + _dump_onode<0>(cct, *onode); + ceph_abort_msg("no available blob id"); +} + +void BlueStore::ExtentMap::reshard( + KeyValueDB *db, + KeyValueDB::Transaction t) +{ + auto cct = onode->c->store->cct; // used by dout + + dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << "," + << needs_reshard_end << ")" << std::dec + << " of " << onode->onode.extent_map_shards.size() + << " shards on " << onode->oid << dendl; + for (auto& p : spanning_blob_map) { + dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second + << dendl; + } + // determine shard index range + unsigned si_begin = 0, si_end = 0; + if (!shards.empty()) { + while (si_begin + 1 < shards.size() && + shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) { + ++si_begin; + } + needs_reshard_begin = shards[si_begin].shard_info->offset; + for (si_end = si_begin; si_end < shards.size(); ++si_end) { + if (shards[si_end].shard_info->offset >= needs_reshard_end) { + needs_reshard_end = shards[si_end].shard_info->offset; + break; + } + } + if (si_end == shards.size()) { + needs_reshard_end = OBJECT_MAX_SIZE; + } + dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")" + << " over 0x[" << std::hex << needs_reshard_begin << "," + << needs_reshard_end << ")" << std::dec << dendl; + } + + fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin)); + + // we may need to fault in a larger interval later must have all + // referring extents for spanning blobs loaded in order to have + // accurate use_tracker values. + uint32_t spanning_scan_begin = needs_reshard_begin; + uint32_t spanning_scan_end = needs_reshard_end; + + // remove old keys + string key; + for (unsigned i = si_begin; i < si_end; ++i) { + generate_extent_shard_key_and_apply( + onode->key, shards[i].shard_info->offset, &key, + [&](const string& final_key) { + t->rmkey(PREFIX_OBJ, final_key); + } + ); + } + + // calculate average extent size + unsigned bytes = 0; + unsigned extents = 0; + if (onode->onode.extent_map_shards.empty()) { + bytes = inline_bl.length(); + extents = extent_map.size(); + } else { + for (unsigned i = si_begin; i < si_end; ++i) { + bytes += shards[i].shard_info->bytes; + extents += shards[i].extents; + } + } + unsigned target = cct->_conf->bluestore_extent_map_shard_target_size; + unsigned slop = target * + cct->_conf->bluestore_extent_map_shard_target_size_slop; + unsigned extent_avg = bytes / std::max(1u, extents); + dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target + << ", slop " << slop << dendl; + + // reshard + unsigned estimate = 0; + unsigned offset = needs_reshard_begin; + vector<bluestore_onode_t::shard_info> new_shard_info; + unsigned max_blob_end = 0; + Extent dummy(needs_reshard_begin); + for (auto e = extent_map.lower_bound(dummy); + e != extent_map.end(); + ++e) { + if (e->logical_offset >= needs_reshard_end) { + break; + } + dout(30) << " extent " << *e << dendl; + + // disfavor shard boundaries that span a blob + bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset; + if (estimate && + estimate + extent_avg > target + (would_span ? 
slop : 0)) { + // new shard + if (offset == needs_reshard_begin) { + new_shard_info.emplace_back(bluestore_onode_t::shard_info()); + new_shard_info.back().offset = offset; + dout(20) << __func__ << " new shard 0x" << std::hex << offset + << std::dec << dendl; + } + offset = e->logical_offset; + new_shard_info.emplace_back(bluestore_onode_t::shard_info()); + new_shard_info.back().offset = offset; + dout(20) << __func__ << " new shard 0x" << std::hex << offset + << std::dec << dendl; + estimate = 0; + } + estimate += extent_avg; + unsigned bs = e->blob_start(); + if (bs < spanning_scan_begin) { + spanning_scan_begin = bs; + } + uint32_t be = e->blob_end(); + if (be > max_blob_end) { + max_blob_end = be; + } + if (be > spanning_scan_end) { + spanning_scan_end = be; + } + } + if (new_shard_info.empty() && (si_begin > 0 || + si_end < shards.size())) { + // we resharded a partial range; we must produce at least one output + // shard + new_shard_info.emplace_back(bluestore_onode_t::shard_info()); + new_shard_info.back().offset = needs_reshard_begin; + dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin + << std::dec << " (singleton degenerate case)" << dendl; + } + + auto& sv = onode->onode.extent_map_shards; + dout(20) << __func__ << " new " << new_shard_info << dendl; + dout(20) << __func__ << " old " << sv << dendl; + if (sv.empty()) { + // no old shards to keep + sv.swap(new_shard_info); + init_shards(true, true); + } else { + // splice in new shards + sv.erase(sv.begin() + si_begin, sv.begin() + si_end); + shards.erase(shards.begin() + si_begin, shards.begin() + si_end); + sv.insert( + sv.begin() + si_begin, + new_shard_info.begin(), + new_shard_info.end()); + shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard()); + si_end = si_begin + new_shard_info.size(); + + ceph_assert(sv.size() == shards.size()); + + // note that we need to update every shard_info of shards here, + // as sv might have been totally re-allocated above + for (unsigned i = 0; i < shards.size(); i++) { + shards[i].shard_info = &sv[i]; + } + + // mark newly added shards as dirty + for (unsigned i = si_begin; i < si_end; ++i) { + shards[i].loaded = true; + shards[i].dirty = true; + } + } + dout(20) << __func__ << " fin " << sv << dendl; + inline_bl.clear(); + + if (sv.empty()) { + // no more shards; unspan all previously spanning blobs + auto p = spanning_blob_map.begin(); + while (p != spanning_blob_map.end()) { + p->second->id = -1; + dout(30) << __func__ << " un-spanning " << *p->second << dendl; + p = spanning_blob_map.erase(p); + } + } else { + // identify new spanning blobs + dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex + << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl; + if (spanning_scan_begin < needs_reshard_begin) { + fault_range(db, spanning_scan_begin, + needs_reshard_begin - spanning_scan_begin); + } + if (spanning_scan_end > needs_reshard_end) { + fault_range(db, needs_reshard_end, + spanning_scan_end - needs_reshard_end); + } + auto sp = sv.begin() + si_begin; + auto esp = sv.end(); + unsigned shard_start = sp->offset; + unsigned shard_end; + ++sp; + if (sp == esp) { + shard_end = OBJECT_MAX_SIZE; + } else { + shard_end = sp->offset; + } + Extent dummy(needs_reshard_begin); + + bool was_too_many_blobs_check = false; + auto too_many_blobs_threshold = + g_conf()->bluestore_debug_too_many_blobs_threshold; + auto& dumped_onodes = onode->c->onode_space.cache->dumped_onodes; + 
decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oid_slot = nullptr; + decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oldest_slot = nullptr; + + for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) { + if (e->logical_offset >= needs_reshard_end) { + break; + } + dout(30) << " extent " << *e << dendl; + while (e->logical_offset >= shard_end) { + shard_start = shard_end; + ceph_assert(sp != esp); + ++sp; + if (sp == esp) { + shard_end = OBJECT_MAX_SIZE; + } else { + shard_end = sp->offset; + } + dout(30) << __func__ << " shard 0x" << std::hex << shard_start + << " to 0x" << shard_end << std::dec << dendl; + } + + if (e->blob_escapes_range(shard_start, shard_end - shard_start)) { + if (!e->blob->is_spanning()) { + // We have two options: (1) split the blob into pieces at the + // shard boundaries (and adjust extents accordingly), or (2) + // mark it spanning. We prefer to cut the blob if we can. Note that + // we may have to split it multiple times--potentially at every + // shard boundary. + bool must_span = false; + BlobRef b = e->blob; + if (b->can_split()) { + uint32_t bstart = e->blob_start(); + uint32_t bend = e->blob_end(); + for (const auto& sh : shards) { + if (bstart < sh.shard_info->offset && + bend > sh.shard_info->offset) { + uint32_t blob_offset = sh.shard_info->offset - bstart; + if (b->can_split_at(blob_offset)) { + dout(20) << __func__ << " splitting blob, bstart 0x" + << std::hex << bstart << " blob_offset 0x" + << blob_offset << std::dec << " " << *b << dendl; + b = split_blob(b, blob_offset, sh.shard_info->offset); + // switch b to the new right-hand side, in case it + // *also* has to get split. + bstart += blob_offset; + onode->c->store->logger->inc(l_bluestore_blob_split); + } else { + must_span = true; + break; + } + } + } + } else { + must_span = true; + } + if (must_span) { + auto bid = allocate_spanning_blob_id(); + b->id = bid; + spanning_blob_map[b->id] = b; + dout(20) << __func__ << " adding spanning " << *b << dendl; + if (!was_too_many_blobs_check && + too_many_blobs_threshold && + spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) { + + was_too_many_blobs_check = true; + for (size_t i = 0; i < dumped_onodes.size(); ++i) { + if (dumped_onodes[i].first == onode->oid) { + oid_slot = &dumped_onodes[i]; + break; + } + if (!oldest_slot || (oldest_slot && + dumped_onodes[i].second < oldest_slot->second)) { + oldest_slot = &dumped_onodes[i]; + } + } + } + } + } + } else { + if (e->blob->is_spanning()) { + spanning_blob_map.erase(e->blob->id); + e->blob->id = -1; + dout(30) << __func__ << " un-spanning " << *e->blob << dendl; + } + } + } + bool do_dump = (!oid_slot && was_too_many_blobs_check) || + (oid_slot && + (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60))); + if (do_dump) { + dout(0) << __func__ + << " spanning blob count exceeds threshold, " + << spanning_blob_map.size() << " spanning blobs" + << dendl; + _dump_onode<0>(cct, *onode); + if (oid_slot) { + oid_slot->second = mono_clock::now(); + } else { + ceph_assert(oldest_slot); + oldest_slot->first = onode->oid; + oldest_slot->second = mono_clock::now(); + } + } + } + + clear_needs_reshard(); +} + +bool BlueStore::ExtentMap::encode_some( + uint32_t offset, + uint32_t length, + bufferlist& bl, + unsigned *pn) +{ + Extent dummy(offset); + auto start = extent_map.lower_bound(dummy); + uint32_t end = offset + length; + + __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map + // serialization only. 
Hence there is no specific + // handling at ExtentMap level. + + unsigned n = 0; + size_t bound = 0; + bool must_reshard = false; + for (auto p = start; + p != extent_map.end() && p->logical_offset < end; + ++p, ++n) { + ceph_assert(p->logical_offset >= offset); + p->blob->last_encoded_id = -1; + if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) { + dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << " hit new spanning blob " << *p << dendl; + request_reshard(p->blob_start(), p->blob_end()); + must_reshard = true; + } + if (!must_reshard) { + denc_varint(0, bound); // blobid + denc_varint(0, bound); // logical_offset + denc_varint(0, bound); // len + denc_varint(0, bound); // blob_offset + + p->blob->bound_encode( + bound, + struct_v, + p->blob->shared_blob->get_sbid(), + false); + } + } + if (must_reshard) { + return true; + } + + denc(struct_v, bound); + denc_varint(0, bound); // number of extents + + { + auto app = bl.get_contiguous_appender(bound); + denc(struct_v, app); + denc_varint(n, app); + if (pn) { + *pn = n; + } + + n = 0; + uint64_t pos = 0; + uint64_t prev_len = 0; + for (auto p = start; + p != extent_map.end() && p->logical_offset < end; + ++p, ++n) { + unsigned blobid; + bool include_blob = false; + if (p->blob->is_spanning()) { + blobid = p->blob->id << BLOBID_SHIFT_BITS; + blobid |= BLOBID_FLAG_SPANNING; + } else if (p->blob->last_encoded_id < 0) { + p->blob->last_encoded_id = n + 1; // so it is always non-zero + include_blob = true; + blobid = 0; // the decoder will infer the id from n + } else { + blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS; + } + if (p->logical_offset == pos) { + blobid |= BLOBID_FLAG_CONTIGUOUS; + } + if (p->blob_offset == 0) { + blobid |= BLOBID_FLAG_ZEROOFFSET; + } + if (p->length == prev_len) { + blobid |= BLOBID_FLAG_SAMELENGTH; + } else { + prev_len = p->length; + } + denc_varint(blobid, app); + if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) { + denc_varint_lowz(p->logical_offset - pos, app); + } + if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) { + denc_varint_lowz(p->blob_offset, app); + } + if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) { + denc_varint_lowz(p->length, app); + } + pos = p->logical_end(); + if (include_blob) { + p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false); + } + } + } + /*derr << __func__ << bl << dendl; + derr << __func__ << ":"; + bl.hexdump(*_dout); + *_dout << dendl; + */ + return false; +} + +/////////////////// BlueStore::ExtentMap::DecoderExtent /////////// +void BlueStore::ExtentMap::ExtentDecoder::decode_extent( + Extent* le, + __u8 struct_v, + bptr_c_it_t& p, + Collection* c) +{ + uint64_t blobid; + denc_varint(blobid, p); + if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) { + uint64_t gap; + denc_varint_lowz(gap, p); + pos += gap; + } + le->logical_offset = pos; + if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) { + denc_varint_lowz(le->blob_offset, p); + } else { + le->blob_offset = 0; + } + if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) { + denc_varint_lowz(prev_len, p); + } + le->length = prev_len; + if (blobid & BLOBID_FLAG_SPANNING) { + consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS); + } else { + blobid >>= BLOBID_SHIFT_BITS; + if (blobid) { + consume_blobid(le, false, blobid - 1); + } else { + Blob *b = new Blob(); + uint64_t sbid = 0; + b->decode(p, struct_v, &sbid, false, c); + consume_blob(le, extent_pos, sbid, b); + } + } + pos += prev_len; + ++extent_pos; +} + +unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some( + const 
bufferlist& bl, Collection* c) +{ + __u8 struct_v; + uint32_t num; + + ceph_assert(bl.get_num_buffers() <= 1); + auto p = bl.front().begin_deep(); + denc(struct_v, p); + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level below. + ceph_assert(struct_v == 1 || struct_v == 2); + denc_varint(num, p); + + extent_pos = 0; + while (!p.end()) { + Extent* le = get_next_extent(); + decode_extent(le, struct_v, p, c); + add_extent(le); + } + ceph_assert(extent_pos == num); + return num; +} + +void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs( + bptr_c_it_t& p, Collection* c) +{ + __u8 struct_v; + denc(struct_v, p); + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. + ceph_assert(struct_v == 1 || struct_v == 2); + + unsigned n; + denc_varint(n, p); + while (n--) { + BlueStore::BlobRef b(new Blob()); + denc_varint(b->id, p); + uint64_t sbid = 0; + b->decode(p, struct_v, &sbid, true, c); + consume_spanning_blob(sbid, b); + } +} + +/////////////////// BlueStore::ExtentMap::DecoderExtentFull /////////// +void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid( + BlueStore::Extent* le, bool spanning, uint64_t blobid) { + ceph_assert(le); + if (spanning) { + le->assign_blob(extent_map.get_spanning_blob(blobid)); + } else { + ceph_assert(blobid < blobs.size()); + le->assign_blob(blobs[blobid]); + // we build ref_map dynamically for non-spanning blobs + le->blob->get_ref( + extent_map.onode->c, + le->blob_offset, + le->length); + } +} + +void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob( + BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) { + ceph_assert(le); + blobs.resize(extent_no + 1); + blobs[extent_no] = b; + extent_map.onode->c->open_shared_blob(sbid, b); + le->assign_blob(b); + le->blob->get_ref( + extent_map.onode->c, + le->blob_offset, + le->length); +} + +void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob( + uint64_t sbid, BlueStore::BlobRef b) { + extent_map.spanning_blob_map[b->id] = b; + extent_map.onode->c->open_shared_blob(sbid, b); +} + +BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent() +{ + return new Extent(); +} + +void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le) +{ + extent_map.extent_map.insert(*le); +} + +unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl) +{ + ExtentDecoderFull edecoder(*this); + unsigned n = edecoder.decode_some(bl, onode->c); + return n; +} + +void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p) +{ + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. + __u8 struct_v = 2; + + denc(struct_v, p); + denc_varint((uint32_t)0, p); + size_t key_size = 0; + denc_varint((uint32_t)0, key_size); + p += spanning_blob_map.size() * key_size; + for (const auto& i : spanning_blob_map) { + i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true); + } +} + +void BlueStore::ExtentMap::encode_spanning_blobs( + bufferlist::contiguous_appender& p) +{ + // Version 2 differs from v1 in blob's ref_map + // serialization only. Hence there is no specific + // handling at ExtentMap level. 
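+  // This is only a conservative size bound for the appender: one varint id
+  // per spanning blob plus each blob's own bound_encode() contribution; the
+  // actual encode_spanning_blobs() output may be smaller.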
+ __u8 struct_v = 2; + + denc(struct_v, p); + denc_varint(spanning_blob_map.size(), p); + for (auto& i : spanning_blob_map) { + denc_varint(i.second->id, p); + i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true); + } +} + +void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty) +{ + shards.resize(onode->onode.extent_map_shards.size()); + unsigned i = 0; + for (auto &s : onode->onode.extent_map_shards) { + shards[i].shard_info = &s; + shards[i].loaded = loaded; + shards[i].dirty = dirty; + ++i; + } +} + +void BlueStore::ExtentMap::fault_range( + KeyValueDB *db, + uint32_t offset, + uint32_t length) +{ + dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + auto start = seek_shard(offset); + auto last = seek_shard(offset + length); + + if (start < 0) + return; + + ceph_assert(last >= start); + string key; + while (start <= last) { + ceph_assert((size_t)start < shards.size()); + auto p = &shards[start]; + if (!p->loaded) { + dout(30) << __func__ << " opening shard 0x" << std::hex + << p->shard_info->offset << std::dec << dendl; + bufferlist v; + generate_extent_shard_key_and_apply( + onode->key, p->shard_info->offset, &key, + [&](const string& final_key) { + int r = db->get(PREFIX_OBJ, final_key, &v); + if (r < 0) { + derr << __func__ << " missing shard 0x" << std::hex + << p->shard_info->offset << std::dec << " for " << onode->oid + << dendl; + ceph_assert(r >= 0); + } + } + ); + p->extents = decode_some(v); + p->loaded = true; + dout(20) << __func__ << " open shard 0x" << std::hex + << p->shard_info->offset + << " for range 0x" << offset << "~" << length << std::dec + << " (" << v.length() << " bytes)" << dendl; + ceph_assert(p->dirty == false); + ceph_assert(v.length() == p->shard_info->bytes); + onode->c->store->logger->inc(l_bluestore_onode_shard_misses); + } else { + onode->c->store->logger->inc(l_bluestore_onode_shard_hits); + } + ++start; + } +} + +void BlueStore::ExtentMap::dirty_range( + uint32_t offset, + uint32_t length) +{ + dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + if (shards.empty()) { + dout(20) << __func__ << " mark inline shard dirty" << dendl; + inline_bl.clear(); + return; + } + auto start = seek_shard(offset); + if (length == 0) { + length = 1; + } + auto last = seek_shard(offset + length - 1); + if (start < 0) + return; + + ceph_assert(last >= start); + while (start <= last) { + ceph_assert((size_t)start < shards.size()); + auto p = &shards[start]; + if (!p->loaded) { + derr << __func__ << "on write 0x" << std::hex << offset + << "~" << length << " shard 0x" << p->shard_info->offset + << std::dec << " is not loaded, can't mark dirty" << dendl; + ceph_abort_msg("can't mark unloaded shard dirty"); + } + if (!p->dirty) { + dout(20) << __func__ << " mark shard 0x" << std::hex + << p->shard_info->offset << std::dec << " dirty" << dendl; + p->dirty = true; + } + ++start; + } +} + +BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find( + uint64_t offset) +{ + Extent dummy(offset); + return extent_map.find(dummy); +} + +BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent( + uint64_t offset) +{ + Extent dummy(offset); + auto fp = extent_map.lower_bound(dummy); + if (fp != extent_map.begin()) { + --fp; + if (fp->logical_end() <= offset) { + ++fp; + } + } + return fp; +} + +BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent( + uint64_t offset) const +{ + Extent dummy(offset); + auto fp = 
extent_map.lower_bound(dummy); + if (fp != extent_map.begin()) { + --fp; + if (fp->logical_end() <= offset) { + ++fp; + } + } + return fp; +} + +bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length) +{ + auto fp = seek_lextent(offset); + if (fp == extent_map.end() || fp->logical_offset >= offset + length) { + return false; + } + return true; +} + +int BlueStore::ExtentMap::compress_extent_map( + uint64_t offset, + uint64_t length) +{ + if (extent_map.empty()) + return 0; + int removed = 0; + auto p = seek_lextent(offset); + if (p != extent_map.begin()) { + --p; // start to the left of offset + } + // the caller should have just written to this region + ceph_assert(p != extent_map.end()); + + // identify the *next* shard + auto pshard = shards.begin(); + while (pshard != shards.end() && + p->logical_offset >= pshard->shard_info->offset) { + ++pshard; + } + uint64_t shard_end; + if (pshard != shards.end()) { + shard_end = pshard->shard_info->offset; + } else { + shard_end = OBJECT_MAX_SIZE; + } + + auto n = p; + for (++n; n != extent_map.end(); p = n++) { + if (n->logical_offset > offset + length) { + break; // stop after end + } + while (n != extent_map.end() && + p->logical_end() == n->logical_offset && + p->blob == n->blob && + p->blob_offset + p->length == n->blob_offset && + n->logical_offset < shard_end) { + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " next shard 0x" << shard_end << std::dec + << " merging " << *p << " and " << *n << dendl; + p->length += n->length; + rm(n++); + ++removed; + } + if (n == extent_map.end()) { + break; + } + if (n->logical_offset >= shard_end) { + ceph_assert(pshard != shards.end()); + ++pshard; + if (pshard != shards.end()) { + shard_end = pshard->shard_info->offset; + } else { + shard_end = OBJECT_MAX_SIZE; + } + } + } + if (removed) { + onode->c->store->logger->inc(l_bluestore_extent_compress, removed); + } + return removed; +} + +void BlueStore::ExtentMap::punch_hole( + CollectionRef &c, + uint64_t offset, + uint64_t length, + old_extent_map_t *old_extents) +{ + auto p = seek_lextent(offset); + uint64_t end = offset + length; + while (p != extent_map.end()) { + if (p->logical_offset >= end) { + break; + } + if (p->logical_offset < offset) { + if (p->logical_end() > end) { + // split and deref middle + uint64_t front = offset - p->logical_offset; + OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front, + length, p->blob); + old_extents->push_back(*oe); + add(end, + p->blob_offset + front + length, + p->length - front - length, + p->blob); + p->length = front; + break; + } else { + // deref tail + ceph_assert(p->logical_end() > offset); // else seek_lextent bug + uint64_t keep = offset - p->logical_offset; + OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep, + p->length - keep, p->blob); + old_extents->push_back(*oe); + p->length = keep; + ++p; + continue; + } + } + if (p->logical_offset + p->length <= end) { + // deref whole lextent + OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset, + p->length, p->blob); + old_extents->push_back(*oe); + rm(p++); + continue; + } + // deref head + uint64_t keep = p->logical_end() - end; + OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset, + p->length - keep, p->blob); + old_extents->push_back(*oe); + + add(end, p->blob_offset + p->length - keep, keep, p->blob); + rm(p); + break; + } +} + +BlueStore::Extent *BlueStore::ExtentMap::set_lextent( + CollectionRef &c, + uint64_t logical_offset, + 
uint64_t blob_offset, uint64_t length, BlobRef b,
+  old_extent_map_t *old_extents)
+{
+  // We need a completely initialized Blob to increment its ref counters.
+  ceph_assert(b->get_blob().get_logical_length() != 0);
+
+  // Do get_ref prior to punch_hole to keep a reused blob off the
+  // old_extents list when we overwrite the blob completely.
+  // This might happen during WAL overwrite.
+  b->get_ref(onode->c, blob_offset, length);
+
+  if (old_extents) {
+    punch_hole(c, logical_offset, length, old_extents);
+  }
+
+  Extent *le = new Extent(logical_offset, blob_offset, length, b);
+  extent_map.insert(*le);
+  if (spans_shard(logical_offset, length)) {
+    request_reshard(logical_offset, logical_offset + length);
+  }
+  return le;
+}
+
+BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
+  BlobRef lb,
+  uint32_t blob_offset,
+  uint32_t pos)
+{
+  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
+  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
+           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
+           << dendl;
+  BlobRef rb = onode->c->new_blob();
+  lb->split(onode->c, blob_offset, rb.get());
+
+  for (auto ep = seek_lextent(pos);
+       ep != extent_map.end() && ep->logical_offset < end_pos;
+       ++ep) {
+    if (ep->blob != lb) {
+      continue;
+    }
+    if (ep->logical_offset < pos) {
+      // split extent
+      size_t left = pos - ep->logical_offset;
+      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
+      extent_map.insert(*ne);
+      ep->length = left;
+      dout(30) << __func__ << "  split " << *ep << dendl;
+      dout(30) << __func__ << "  to " << *ne << dendl;
+    } else {
+      // switch blob
+      ceph_assert(ep->blob_offset >= blob_offset);
+
+      ep->blob = rb;
+      ep->blob_offset -= blob_offset;
+      dout(30) << __func__ << "  adjusted " << *ep << dendl;
+    }
+  }
+  return rb;
+}
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
+
+const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
+{
+  if (bluestore_onode_t::is_pgmeta_omap(flags)) {
+    return PREFIX_PGMETA_OMAP;
+  }
+  if (bluestore_onode_t::is_perpg_omap(flags)) {
+    return PREFIX_PERPG_OMAP;
+  }
+  if (bluestore_onode_t::is_perpool_omap(flags)) {
+    return PREFIX_PERPOOL_OMAP;
+  }
+  return PREFIX_OMAP;
+}
+
+// '-' < '.'
< '~' +void BlueStore::Onode::calc_omap_header( + uint8_t flags, + const Onode* o, + std::string* out) +{ + if (!bluestore_onode_t::is_pgmeta_omap(flags)) { + if (bluestore_onode_t::is_perpg_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); + } else if (bluestore_onode_t::is_perpool_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + } + } + _key_encode_u64(o->onode.nid, out); + out->push_back('-'); +} + +void BlueStore::Onode::calc_omap_key(uint8_t flags, + const Onode* o, + const std::string& key, + std::string* out) +{ + if (!bluestore_onode_t::is_pgmeta_omap(flags)) { + if (bluestore_onode_t::is_perpg_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); + } else if (bluestore_onode_t::is_perpool_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + } + } + _key_encode_u64(o->onode.nid, out); + out->push_back('.'); + out->append(key); +} + +void BlueStore::Onode::calc_omap_tail( + uint8_t flags, + const Onode* o, + std::string* out) +{ + if (!bluestore_onode_t::is_pgmeta_omap(flags)) { + if (bluestore_onode_t::is_perpg_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); + } else if (bluestore_onode_t::is_perpool_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + } + } + _key_encode_u64(o->onode.nid, out); + out->push_back('~'); +} + +void BlueStore::Onode::get() +{ + ++nref; + ++pin_nref; +} +void BlueStore::Onode::put() +{ + if (--pin_nref == 1) { + c->get_onode_cache()->maybe_unpin(this); + } + if (--nref == 0) { + delete this; + } +} + +void BlueStore::Onode::decode_raw( + BlueStore::Onode* on, + const bufferlist& v, + BlueStore::ExtentMap::ExtentDecoder& edecoder) +{ + on->exists = true; + auto p = v.front().begin_deep(); + on->onode.decode(p); + + // initialize extent_map + edecoder.decode_spanning_blobs(p, on->c); + if (on->onode.extent_map_shards.empty()) { + denc(on->extent_map.inline_bl, p); + edecoder.decode_some(on->extent_map.inline_bl, on->c); + } +} + +BlueStore::Onode* BlueStore::Onode::create_decode( + CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& v, + bool allow_empty) +{ + ceph_assert(v.length() || allow_empty); + Onode* on = new Onode(c.get(), oid, key); + + if (v.length()) { + ExtentMap::ExtentDecoderFull edecoder(on->extent_map); + decode_raw(on, v, edecoder); + + for (auto& i : on->onode.attrs) { + i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } + + // initialize extent_map + if (on->onode.extent_map_shards.empty()) { + on->extent_map.inline_bl.reassign_to_mempool( + mempool::mempool_bluestore_cache_data); + } else { + on->extent_map.init_shards(false, false); + } + } + return on; +} + +void BlueStore::Onode::flush() +{ + if (flushing_count.load()) { + ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl; + waiting_count++; + std::unique_lock l(flush_lock); + while (flushing_count.load()) { + flush_cond.wait(l); + } + waiting_count--; + } + ldout(c->store->cct, 20) << __func__ << " done" << dendl; +} + +void BlueStore::Onode::dump(Formatter* f) const +{ + onode.dump(f); + extent_map.dump(f); +} + +void BlueStore::Onode::rewrite_omap_key(const string& old, string *out) +{ + if (!onode.is_pgmeta_omap()) { + if (onode.is_perpg_omap()) { + _key_encode_u64(c->pool(), out); + _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out); + } else if (onode.is_perpool_omap()) { + _key_encode_u64(c->pool(), out); 
+ } + } + _key_encode_u64(onode.nid, out); + out->append(old.c_str() + out->length(), old.size() - out->length()); +} + +void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +{ + size_t pos = sizeof(uint64_t) + 1; + if (!onode.is_pgmeta_omap()) { + if (onode.is_perpg_omap()) { + pos += sizeof(uint64_t) + sizeof(uint32_t); + } else if (onode.is_perpool_omap()) { + pos += sizeof(uint64_t); + } + } + *user_key = key.substr(pos); +} + +// ======================================================= +// WriteContext + +/// Checks for writes to the same pextent within a blob +bool BlueStore::WriteContext::has_conflict( + BlobRef b, + uint64_t loffs, + uint64_t loffs_end, + uint64_t min_alloc_size) +{ + ceph_assert((loffs % min_alloc_size) == 0); + ceph_assert((loffs_end % min_alloc_size) == 0); + for (auto w : writes) { + if (b == w.b) { + auto loffs2 = p2align(w.logical_offset, min_alloc_size); + auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size); + if ((loffs <= loffs2 && loffs_end > loffs2) || + (loffs >= loffs2 && loffs < loffs2_end)) { + return true; + } + } + } + return false; +} + +// ======================================================= + +// DeferredBatch +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") " +#undef dout_context +#define dout_context cct + +void BlueStore::DeferredBatch::prepare_write( + CephContext *cct, + uint64_t seq, uint64_t offset, uint64_t length, + bufferlist::const_iterator& blp) +{ + _discard(cct, offset, length); + auto i = iomap.insert(make_pair(offset, deferred_io())); + ceph_assert(i.second); // this should be a new insertion + i.first->second.seq = seq; + blp.copy(length, i.first->second.bl); + i.first->second.bl.reassign_to_mempool( + mempool::mempool_bluestore_writing_deferred); + dout(20) << __func__ << " seq " << seq + << " 0x" << std::hex << offset << "~" << length + << " crc " << i.first->second.bl.crc32c(-1) + << std::dec << dendl; + seq_bytes[seq] += length; +#ifdef DEBUG_DEFERRED + _audit(cct); +#endif +} + +void BlueStore::DeferredBatch::_discard( + CephContext *cct, uint64_t offset, uint64_t length) +{ + generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + auto p = iomap.lower_bound(offset); + if (p != iomap.begin()) { + --p; + auto end = p->first + p->second.bl.length(); + if (end > offset) { + bufferlist head; + head.substr_of(p->second.bl, 0, offset - p->first); + dout(20) << __func__ << " keep head " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << " -> 0x" << head.length() << std::dec << dendl; + auto i = seq_bytes.find(p->second.seq); + ceph_assert(i != seq_bytes.end()); + if (end > offset + length) { + bufferlist tail; + tail.substr_of(p->second.bl, offset + length - p->first, + end - (offset + length)); + dout(20) << __func__ << " keep tail " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << " -> 0x" << tail.length() << std::dec << dendl; + auto &n = iomap[offset + length]; + n.bl.swap(tail); + n.seq = p->second.seq; + i->second -= length; + } else { + i->second -= end - offset; + } + ceph_assert(i->second >= 0); + p->second.bl.swap(head); + } + ++p; + } + while (p != iomap.end()) { + if (p->first >= offset + length) { + break; + } + auto i = seq_bytes.find(p->second.seq); + ceph_assert(i != seq_bytes.end()); + auto end = p->first + p->second.bl.length(); + if (end > offset + length) { + unsigned drop_front = offset + length - 
p->first; + unsigned keep_tail = end - (offset + length); + dout(20) << __func__ << " truncate front " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail + << " to 0x" << (offset + length) << "~" << keep_tail + << std::dec << dendl; + auto &s = iomap[offset + length]; + s.seq = p->second.seq; + s.bl.substr_of(p->second.bl, drop_front, keep_tail); + i->second -= drop_front; + } else { + dout(20) << __func__ << " drop " << p->second.seq + << " 0x" << std::hex << p->first << "~" << p->second.bl.length() + << std::dec << dendl; + i->second -= p->second.bl.length(); + } + ceph_assert(i->second >= 0); + p = iomap.erase(p); + } +} + +void BlueStore::DeferredBatch::_audit(CephContext *cct) +{ + map<uint64_t,int> sb; + for (auto p : seq_bytes) { + sb[p.first] = 0; // make sure we have the same set of keys + } + uint64_t pos = 0; + for (auto& p : iomap) { + ceph_assert(p.first >= pos); + sb[p.second.seq] += p.second.bl.length(); + pos = p.first + p.second.bl.length(); + } + ceph_assert(sb == seq_bytes); +} + + +// Collection + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") " + +BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid) + : CollectionImpl(store_->cct, cid), + store(store_), + cache(bc), + exists(true), + onode_space(oc), + commit_queue(nullptr) +{ +} + +bool BlueStore::Collection::flush_commit(Context *c) +{ + return osr->flush_commit(c); +} + +void BlueStore::Collection::flush() +{ + osr->flush(); +} + +void BlueStore::Collection::flush_all_but_last() +{ + osr->flush_all_but_last(); +} + +void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b) +{ + ceph_assert(!b->shared_blob); + const bluestore_blob_t& blob = b->get_blob(); + if (!blob.is_shared()) { + b->shared_blob = new SharedBlob(this); + return; + } + + b->shared_blob = shared_blob_set.lookup(sbid); + if (b->shared_blob) { + ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " had " << *b->shared_blob << dendl; + } else { + b->shared_blob = new SharedBlob(sbid, this); + shared_blob_set.add(this, b->shared_blob.get()); + ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " opened " << *b->shared_blob + << dendl; + } +} + +void BlueStore::Collection::load_shared_blob(SharedBlobRef sb) +{ + if (!sb->is_loaded()) { + + bufferlist v; + string key; + auto sbid = sb->get_sbid(); + get_shared_blob_key(sbid, &key); + int r = store->db->get(PREFIX_SHARED_BLOB, key, &v); + if (r < 0) { + lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " not found at key " + << pretty_binary_string(key) << dendl; + ceph_abort_msg("uh oh, missing shared_blob"); + } + + sb->loaded = true; + sb->persistent = new bluestore_shared_blob_t(sbid); + auto p = v.cbegin(); + decode(*(sb->persistent), p); + ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid + << std::dec << " loaded shared_blob " << *sb << dendl; + } +} + +void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b) +{ + ldout(store->cct, 10) << __func__ << " " << *b << dendl; + ceph_assert(!b->shared_blob->is_loaded()); + + // update blob + bluestore_blob_t& blob = b->dirty_blob(); + blob.set_flag(bluestore_blob_t::FLAG_SHARED); + + // update shared blob + b->shared_blob->loaded = true; + b->shared_blob->persistent = new 
bluestore_shared_blob_t(sbid); + shared_blob_set.add(this, b->shared_blob.get()); + for (auto p : blob.get_extents()) { + if (p.is_valid()) { + b->shared_blob->get_ref( + p.offset, + p.length); + } + } + ldout(store->cct, 20) << __func__ << " now " << *b << dendl; +} + +uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb) +{ + ldout(store->cct, 10) << __func__ << " " << *sb << dendl; + ceph_assert(sb->is_loaded()); + + uint64_t sbid = sb->get_sbid(); + shared_blob_set.remove(sb); + sb->loaded = false; + delete sb->persistent; + sb->sbid_unloaded = 0; + ldout(store->cct, 20) << __func__ << " now " << *sb << dendl; + return sbid; +} + +BlueStore::OnodeRef BlueStore::Collection::get_onode( + const ghobject_t& oid, + bool create, + bool is_createop) +{ + ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock)); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + if (!oid.match(cnode.bits, pgid.ps())) { + lderr(store->cct) << __func__ << " oid " << oid << " not part of " + << pgid << " bits " << cnode.bits << dendl; + ceph_abort(); + } + } + + OnodeRef o = onode_space.lookup(oid); + if (o) + return o; + + string key; + get_object_key(store->cct, oid, &key); + + ldout(store->cct, 20) << __func__ << " oid " << oid << " key " + << pretty_binary_string(key) << dendl; + + bufferlist v; + int r = -ENOENT; + Onode *on; + if (!is_createop) { + r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v); + ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl; + } + if (v.length() == 0) { + ceph_assert(r == -ENOENT); + if (!create) + return OnodeRef(); + } else { + ceph_assert(r >= 0); + } + + // new object, load onode if available + on = Onode::create_decode(this, oid, key, v, true); + o.reset(on); + return onode_space.add_onode(oid, o); +} + +void BlueStore::Collection::split_cache( + Collection *dest) +{ + ldout(store->cct, 10) << __func__ << " to " << dest << dendl; + + auto *ocache = get_onode_cache(); + auto *ocache_dest = dest->get_onode_cache(); + + // lock cache shards + std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock); + std::lock_guard l(ocache->lock, std::adopt_lock); + std::lock_guard l2(ocache_dest->lock, std::adopt_lock); + std::lock_guard l3(cache->lock, std::adopt_lock); + std::lock_guard l4(dest->cache->lock, std::adopt_lock); + + int destbits = dest->cnode.bits; + spg_t destpg; + bool is_pg = dest->cid.is_pg(&destpg); + ceph_assert(is_pg); + + auto p = onode_space.onode_map.begin(); + while (p != onode_space.onode_map.end()) { + OnodeRef o = p->second; + if (!p->second->oid.match(destbits, destpg.pgid.ps())) { + // onode does not belong to this child + ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid + << dendl; + ++p; + } else { + ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid + << dendl; + + // ensuring that nref is always >= 2 and hence onode is pinned + OnodeRef o_pin = o; + + p = onode_space.onode_map.erase(p); + dest->onode_space.onode_map[o->oid] = o; + if (o->cached) { + get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get()); + } + o->c = dest; + + // move over shared blobs and buffers. 
cover shared blobs from + // both extent map and spanning blob map (the full extent map + // may not be faulted in) + vector<SharedBlob*> sbvec; + for (auto& e : o->extent_map.extent_map) { + sbvec.push_back(e.blob->shared_blob.get()); + } + for (auto& b : o->extent_map.spanning_blob_map) { + sbvec.push_back(b.second->shared_blob.get()); + } + for (auto sb : sbvec) { + if (sb->coll == dest) { + ldout(store->cct, 20) << __func__ << " already moved " << *sb + << dendl; + continue; + } + ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl; + if (sb->get_sbid()) { + ldout(store->cct, 20) << __func__ + << " moving registration " << *sb << dendl; + shared_blob_set.remove(sb); + dest->shared_blob_set.add(dest, sb); + } + sb->coll = dest; + if (dest->cache != cache) { + for (auto& i : sb->bc.buffer_map) { + if (!i.second->is_writing()) { + ldout(store->cct, 20) << __func__ << " moving " << *i.second + << dendl; + dest->cache->_move(cache, i.second.get()); + } + } + } + } + } + } + dest->cache->_trim(); +} + +// ======================================================= + +// MempoolThread + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") " +#undef dout_context +#define dout_context store->cct + +void *BlueStore::MempoolThread::entry() +{ + std::unique_lock l{lock}; + + uint32_t prev_config_change = store->config_changed.load(); + uint64_t base = store->osd_memory_base; + double fragmentation = store->osd_memory_expected_fragmentation; + uint64_t target = store->osd_memory_target; + uint64_t min = store->osd_memory_cache_min; + uint64_t max = min; + + // When setting the maximum amount of memory to use for cache, first + // assume some base amount of memory for the OSD and then fudge in + // some overhead for fragmentation that scales with cache usage. 
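+  // Worked example with assumed (non-default) numbers: target = 4 GiB,
+  // fragmentation = 0.15, base = 768 MiB, min = 128 MiB gives
+  // ltarget ~= 3482 MiB > base + min, so max = ltarget - base ~= 2714 MiB.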
+ uint64_t ltarget = (1.0 - fragmentation) * target; + if (ltarget > base + min) { + max = ltarget - base; + } + + binned_kv_cache = store->db->get_priority_cache(); + binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ); + if (store->cache_autotune && binned_kv_cache != nullptr) { + pcm = std::make_shared<PriorityCache::Manager>( + store->cct, min, max, target, true, "bluestore-pricache"); + pcm->insert("kv", binned_kv_cache, true); + pcm->insert("meta", meta_cache, true); + pcm->insert("data", data_cache, true); + if (binned_kv_onode_cache != nullptr) { + pcm->insert("kv_onode", binned_kv_onode_cache, true); + } + } + + utime_t next_balance = ceph_clock_now(); + utime_t next_resize = ceph_clock_now(); + utime_t next_bin_rotation = ceph_clock_now(); + utime_t next_deferred_force_submit = ceph_clock_now(); + utime_t alloc_stats_dump_clock = ceph_clock_now(); + + bool interval_stats_trim = false; + while (!stop) { + // Update pcm cache settings if related configuration was changed + uint32_t cur_config_change = store->config_changed.load(); + if (cur_config_change != prev_config_change) { + _update_cache_settings(); + prev_config_change = cur_config_change; + } + + // define various intervals for background work + double age_bin_interval = store->cache_age_bin_interval; + double autotune_interval = store->cache_autotune_interval; + double resize_interval = store->osd_memory_cache_resize_interval; + double max_defer_interval = store->max_defer_interval; + double alloc_stats_dump_interval = + store->cct->_conf->bluestore_alloc_stats_dump_interval; + + // alloc stats dump + if (alloc_stats_dump_interval > 0 && + alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) { + store->_record_allocation_stats(); + alloc_stats_dump_clock = ceph_clock_now(); + } + // cache age binning + if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) { + if (binned_kv_cache != nullptr) { + binned_kv_cache->import_bins(store->kv_bins); + } + if (binned_kv_onode_cache != nullptr) { + binned_kv_onode_cache->import_bins(store->kv_onode_bins); + } + meta_cache->import_bins(store->meta_bins); + data_cache->import_bins(store->data_bins); + + if (pcm != nullptr) { + pcm->shift_bins(); + } + next_bin_rotation = ceph_clock_now(); + next_bin_rotation += age_bin_interval; + } + // cache balancing + if (autotune_interval > 0 && next_balance < ceph_clock_now()) { + if (binned_kv_cache != nullptr) { + binned_kv_cache->set_cache_ratio(store->cache_kv_ratio); + } + if (binned_kv_onode_cache != nullptr) { + binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio); + } + meta_cache->set_cache_ratio(store->cache_meta_ratio); + data_cache->set_cache_ratio(store->cache_data_ratio); + + // Log events at 5 instead of 20 when balance happens. 
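+      // Note: interval_stats_trim is only a logging hint; _resize_shards()
+      // (called later in this loop iteration) emits its cache usage summary
+      // at level 5 on a pass where a balance just happened and at level 20
+      // otherwise.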
+ interval_stats_trim = true; + + if (pcm != nullptr) { + pcm->balance(); + } + + next_balance = ceph_clock_now(); + next_balance += autotune_interval; + } + // memory resizing (ie autotuning) + if (resize_interval > 0 && next_resize < ceph_clock_now()) { + if (ceph_using_tcmalloc() && pcm != nullptr) { + pcm->tune_memory(); + } + next_resize = ceph_clock_now(); + next_resize += resize_interval; + } + // deferred force submit + if (max_defer_interval > 0 && + next_deferred_force_submit < ceph_clock_now()) { + if (store->get_deferred_last_submitted() + max_defer_interval < + ceph_clock_now()) { + store->deferred_try_submit(); + } + next_deferred_force_submit = ceph_clock_now(); + next_deferred_force_submit += max_defer_interval/3; + } + + // Now Resize the shards + _resize_shards(interval_stats_trim); + interval_stats_trim = false; + + store->_update_logger(); + auto wait = ceph::make_timespan( + store->cct->_conf->bluestore_cache_trim_interval); + cond.wait_for(l, wait); + } + // do final dump + store->_record_allocation_stats(); + stop = false; + pcm = nullptr; + return NULL; +} + +void BlueStore::MempoolThread::_resize_shards(bool interval_stats) +{ + size_t onode_shards = store->onode_cache_shards.size(); + size_t buffer_shards = store->buffer_cache_shards.size(); + int64_t kv_used = store->db->get_cache_usage(); + int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ); + int64_t meta_used = meta_cache->_get_used_bytes(); + int64_t data_used = data_cache->_get_used_bytes(); + + uint64_t cache_size = store->cache_size; + int64_t kv_alloc = + static_cast<int64_t>(store->cache_kv_ratio * cache_size); + int64_t kv_onode_alloc = + static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size); + int64_t meta_alloc = + static_cast<int64_t>(store->cache_meta_ratio * cache_size); + int64_t data_alloc = + static_cast<int64_t>(store->cache_data_ratio * cache_size); + + if (pcm != nullptr && binned_kv_cache != nullptr) { + cache_size = pcm->get_tuned_mem(); + kv_alloc = binned_kv_cache->get_committed_size(); + meta_alloc = meta_cache->get_committed_size(); + data_alloc = data_cache->get_committed_size(); + if (binned_kv_onode_cache != nullptr) { + kv_onode_alloc = binned_kv_onode_cache->get_committed_size(); + } + } + + if (interval_stats) { + dout(5) << __func__ << " cache_size: " << cache_size + << " kv_alloc: " << kv_alloc + << " kv_used: " << kv_used + << " kv_onode_alloc: " << kv_onode_alloc + << " kv_onode_used: " << kv_onode_used + << " meta_alloc: " << meta_alloc + << " meta_used: " << meta_used + << " data_alloc: " << data_alloc + << " data_used: " << data_used << dendl; + } else { + dout(20) << __func__ << " cache_size: " << cache_size + << " kv_alloc: " << kv_alloc + << " kv_used: " << kv_used + << " kv_onode_alloc: " << kv_onode_alloc + << " kv_onode_used: " << kv_onode_used + << " meta_alloc: " << meta_alloc + << " meta_used: " << meta_used + << " data_alloc: " << data_alloc + << " data_used: " << data_used << dendl; + } + + uint64_t max_shard_onodes = static_cast<uint64_t>( + (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode()); + uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards); + + dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes + << " max_shard_buffer: " << max_shard_buffer << dendl; + + for (auto i : store->onode_cache_shards) { + i->set_max(max_shard_onodes); + } + for (auto i : store->buffer_cache_shards) { + i->set_max(max_shard_buffer); + } +} + +void BlueStore::MempoolThread::_update_cache_settings() 
+{ + // Nothing to do if pcm is not used. + if (pcm == nullptr) { + return; + } + + uint64_t target = store->osd_memory_target; + uint64_t base = store->osd_memory_base; + uint64_t min = store->osd_memory_cache_min; + uint64_t max = min; + double fragmentation = store->osd_memory_expected_fragmentation; + + uint64_t ltarget = (1.0 - fragmentation) * target; + if (ltarget > base + min) { + max = ltarget - base; + } + + // set pcm cache levels + pcm->set_target_memory(target); + pcm->set_min_memory(min); + pcm->set_max_memory(max); + + dout(5) << __func__ << " updated pcm target: " << target + << " pcm min: " << min + << " pcm max: " << max + << dendl; +} + +// ======================================================= + +// OmapIteratorImpl + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") " + +BlueStore::OmapIteratorImpl::OmapIteratorImpl( + PerfCounters* _logger, CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it) + : logger(_logger), c(c), o(o), it(it) +{ + logger->inc(l_bluestore_omap_iterator_count); + std::shared_lock l(c->lock); + if (o->onode.has_omap()) { + o->get_omap_key(string(), &head); + o->get_omap_tail(&tail); + it->lower_bound(head); + } +} +BlueStore::OmapIteratorImpl::~OmapIteratorImpl() +{ + logger->dec(l_bluestore_omap_iterator_count); +} + +string BlueStore::OmapIteratorImpl::_stringify() const +{ + stringstream s; + s << " omap_iterator(cid = " << c->cid + <<", oid = " << o->oid << ")"; + return s.str(); +} + +int BlueStore::OmapIteratorImpl::seek_to_first() +{ + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + it->lower_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + c->store->log_latency( + __func__, + l_bluestore_omap_seek_to_first_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + return 0; +} + +int BlueStore::OmapIteratorImpl::upper_bound(const string& after) +{ + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + string key; + o->get_omap_key(after, &key); + ldout(c->store->cct,20) << __func__ << " after " << after << " key " + << pretty_binary_string(key) << dendl; + it->upper_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + c->store->log_latency_fn( + __func__, + l_bluestore_omap_upper_bound_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age, + [&] (const ceph::timespan& lat) { + return ", after = " + after + + _stringify(); + } + ); + return 0; +} + +int BlueStore::OmapIteratorImpl::lower_bound(const string& to) +{ + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + string key; + o->get_omap_key(to, &key); + ldout(c->store->cct,20) << __func__ << " to " << to << " key " + << pretty_binary_string(key) << dendl; + it->lower_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + c->store->log_latency_fn( + __func__, + l_bluestore_omap_lower_bound_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age, + [&] (const ceph::timespan& lat) { + return ", to = " + to + + _stringify(); + } + ); + return 0; +} + +bool BlueStore::OmapIteratorImpl::valid() +{ + std::shared_lock l(c->lock); + bool r = o->onode.has_omap() && it && it->valid() && + it->raw_key().second < tail; + if (it && it->valid()) { + ldout(c->store->cct,20) << __func__ << " is at " + << pretty_binary_string(it->raw_key().second) + << dendl; + } + return r; +} + +int 
BlueStore::OmapIteratorImpl::next() +{ + int r = -1; + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + if (o->onode.has_omap()) { + it->next(); + r = 0; + } + c->store->log_latency( + __func__, + l_bluestore_omap_next_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + return r; +} + +string BlueStore::OmapIteratorImpl::key() +{ + std::shared_lock l(c->lock); + ceph_assert(it->valid()); + string db_key = it->raw_key().second; + string user_key; + o->decode_omap_key(db_key, &user_key); + + return user_key; +} + +bufferlist BlueStore::OmapIteratorImpl::value() +{ + std::shared_lock l(c->lock); + ceph_assert(it->valid()); + return it->value(); +} + + +// ===================================== + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore(" << path << ") " +#undef dout_context +#define dout_context cct + + +static void aio_cb(void *priv, void *priv2) +{ + BlueStore *store = static_cast<BlueStore*>(priv); + BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2); + c->aio_finish(store); +} + +static void discard_cb(void *priv, void *priv2) +{ + BlueStore *store = static_cast<BlueStore*>(priv); + interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); + store->handle_discard(*tmp); +} + +void BlueStore::handle_discard(interval_set<uint64_t>& to_release) +{ + dout(10) << __func__ << dendl; + ceph_assert(alloc); + alloc->release(to_release); +} + +BlueStore::BlueStore(CephContext *cct, const string& path) + : BlueStore(cct, path, 0) {} + +BlueStore::BlueStore(CephContext *cct, + const string& path, + uint64_t _min_alloc_size) + : ObjectStore(cct, path), + throttle(cct), + finisher(cct, "commit_finisher", "cfin"), + kv_sync_thread(this), + kv_finalize_thread(this), +#ifdef HAVE_LIBZBD + zoned_cleaner_thread(this), +#endif + min_alloc_size(_min_alloc_size), + min_alloc_size_order(std::countr_zero(_min_alloc_size)), + mempool_thread(this) +{ + _init_logger(); + cct->_conf.add_observer(this); + set_cache_shards(1); +} + +BlueStore::~BlueStore() +{ + cct->_conf.remove_observer(this); + _shutdown_logger(); + ceph_assert(!mounted); + ceph_assert(db == NULL); + ceph_assert(bluefs == NULL); + ceph_assert(fsid_fd < 0); + ceph_assert(path_fd < 0); + for (auto i : onode_cache_shards) { + delete i; + } + for (auto i : buffer_cache_shards) { + delete i; + } + onode_cache_shards.clear(); + buffer_cache_shards.clear(); +} + +const char **BlueStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "bluestore_csum_type", + "bluestore_compression_mode", + "bluestore_compression_algorithm", + "bluestore_compression_min_blob_size", + "bluestore_compression_min_blob_size_ssd", + "bluestore_compression_min_blob_size_hdd", + "bluestore_compression_max_blob_size", + "bluestore_compression_max_blob_size_ssd", + "bluestore_compression_max_blob_size_hdd", + "bluestore_compression_required_ratio", + "bluestore_max_alloc_size", + "bluestore_prefer_deferred_size", + "bluestore_prefer_deferred_size_hdd", + "bluestore_prefer_deferred_size_ssd", + "bluestore_deferred_batch_ops", + "bluestore_deferred_batch_ops_hdd", + "bluestore_deferred_batch_ops_ssd", + "bluestore_throttle_bytes", + "bluestore_throttle_deferred_bytes", + "bluestore_throttle_cost_per_io_hdd", + "bluestore_throttle_cost_per_io_ssd", + "bluestore_throttle_cost_per_io", + "bluestore_max_blob_size", + "bluestore_max_blob_size_ssd", + "bluestore_max_blob_size_hdd", + "osd_memory_target", + "osd_memory_target_cgroup_limit_ratio", + 
"osd_memory_base", + "osd_memory_cache_min", + "osd_memory_expected_fragmentation", + "bluestore_cache_autotune", + "bluestore_cache_autotune_interval", + "bluestore_cache_age_bin_interval", + "bluestore_cache_kv_age_bins", + "bluestore_cache_kv_onode_age_bins", + "bluestore_cache_meta_age_bins", + "bluestore_cache_data_age_bins", + "bluestore_warn_on_legacy_statfs", + "bluestore_warn_on_no_per_pool_omap", + "bluestore_warn_on_no_per_pg_omap", + "bluestore_max_defer_interval", + NULL + }; + return KEYS; +} + +void BlueStore::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + if (changed.count("bluestore_warn_on_legacy_statfs")) { + _check_legacy_statfs_alert(); + } + if (changed.count("bluestore_warn_on_no_per_pool_omap") || + changed.count("bluestore_warn_on_no_per_pg_omap")) { + _check_no_per_pg_or_pool_omap_alert(); + } + + if (changed.count("bluestore_csum_type")) { + _set_csum(); + } + if (changed.count("bluestore_compression_mode") || + changed.count("bluestore_compression_algorithm") || + changed.count("bluestore_compression_min_blob_size") || + changed.count("bluestore_compression_max_blob_size")) { + if (bdev) { + _set_compression(); + } + } + if (changed.count("bluestore_max_blob_size") || + changed.count("bluestore_max_blob_size_ssd") || + changed.count("bluestore_max_blob_size_hdd")) { + if (bdev) { + // only after startup + _set_blob_size(); + } + } + if (changed.count("bluestore_prefer_deferred_size") || + changed.count("bluestore_prefer_deferred_size_hdd") || + changed.count("bluestore_prefer_deferred_size_ssd") || + changed.count("bluestore_max_alloc_size") || + changed.count("bluestore_deferred_batch_ops") || + changed.count("bluestore_deferred_batch_ops_hdd") || + changed.count("bluestore_deferred_batch_ops_ssd")) { + if (bdev) { + // only after startup + _set_alloc_sizes(); + } + } + if (changed.count("bluestore_throttle_cost_per_io") || + changed.count("bluestore_throttle_cost_per_io_hdd") || + changed.count("bluestore_throttle_cost_per_io_ssd")) { + if (bdev) { + _set_throttle_params(); + } + } + if (changed.count("bluestore_throttle_bytes") || + changed.count("bluestore_throttle_deferred_bytes") || + changed.count("bluestore_throttle_trace_rate")) { + throttle.reset_throttle(conf); + } + if (changed.count("bluestore_max_defer_interval")) { + if (bdev) { + _set_max_defer_interval(); + } + } + if (changed.count("osd_memory_target") || + changed.count("osd_memory_base") || + changed.count("osd_memory_cache_min") || + changed.count("osd_memory_expected_fragmentation")) { + _update_osd_memory_options(); + } +} + +void BlueStore::_set_compression() +{ + auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode); + if (m) { + _clear_compression_alert(); + comp_mode = *m; + } else { + derr << __func__ << " unrecognized value '" + << cct->_conf->bluestore_compression_mode + << "' for bluestore_compression_mode, reverting to 'none'" + << dendl; + comp_mode = Compressor::COMP_NONE; + string s("unknown mode: "); + s += cct->_conf->bluestore_compression_mode; + _set_compression_alert(true, s.c_str()); + } + + compressor = nullptr; + + if (cct->_conf->bluestore_compression_min_blob_size) { + comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size; + } else { + ceph_assert(bdev); + if (_use_rotational_settings()) { + comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd; + } else { + comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd; + } + } + + if 
(cct->_conf->bluestore_compression_max_blob_size) { + comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size; + } else { + ceph_assert(bdev); + if (_use_rotational_settings()) { + comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd; + } else { + comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd; + } + } + + auto& alg_name = cct->_conf->bluestore_compression_algorithm; + if (!alg_name.empty()) { + compressor = Compressor::create(cct, alg_name); + if (!compressor) { + derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor" + << dendl; + _set_compression_alert(false, alg_name.c_str()); + } + } + + dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode) + << " alg " << (compressor ? compressor->get_type_name() : "(none)") + << " min_blob " << comp_min_blob_size + << " max_blob " << comp_max_blob_size + << dendl; +} + +void BlueStore::_set_csum() +{ + csum_type = Checksummer::CSUM_NONE; + int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type); + if (t > Checksummer::CSUM_NONE) + csum_type = t; + + dout(10) << __func__ << " csum_type " + << Checksummer::get_csum_type_string(csum_type) + << dendl; +} + +void BlueStore::_set_throttle_params() +{ + if (cct->_conf->bluestore_throttle_cost_per_io) { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io; + } else { + ceph_assert(bdev); + if (_use_rotational_settings()) { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd; + } else { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd; + } + } + + dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io + << dendl; +} +void BlueStore::_set_blob_size() +{ + if (cct->_conf->bluestore_max_blob_size) { + max_blob_size = cct->_conf->bluestore_max_blob_size; + } else { + ceph_assert(bdev); + if (_use_rotational_settings()) { + max_blob_size = cct->_conf->bluestore_max_blob_size_hdd; + } else { + max_blob_size = cct->_conf->bluestore_max_blob_size_ssd; + } + } + dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size + << std::dec << dendl; +} + +void BlueStore::_update_osd_memory_options() +{ + osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target"); + osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base"); + osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation"); + osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min"); + config_changed++; + dout(10) << __func__ + << " osd_memory_target " << osd_memory_target + << " osd_memory_base " << osd_memory_base + << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation + << " osd_memory_cache_min " << osd_memory_cache_min + << dendl; +} + +int BlueStore::_set_cache_sizes() +{ + ceph_assert(bdev); + cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune"); + cache_autotune_interval = + cct->_conf.get_val<double>("bluestore_cache_autotune_interval"); + cache_age_bin_interval = + cct->_conf.get_val<double>("bluestore_cache_age_bin_interval"); + auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals) + { + std::string intervals_str = cct->_conf.get_val<std::string>(conf_name); + std::istringstream interval_stream(intervals_str); + std::copy( + std::istream_iterator<uint64_t>(interval_stream), + std::istream_iterator<uint64_t>(), + std::back_inserter(*intervals)); + }; + 
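+  // Illustrative only (the option value is hypothetical): a setting such as
+  // bluestore_cache_age_bins_kv = "1 2 6 24 120" is parsed by the lambda
+  // above into intervals == {1, 2, 6, 24, 120}; istream_iterator<uint64_t>
+  // extraction stops at the first token that is not an unsigned integer.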
_set_bin("bluestore_cache_age_bins_kv", &kv_bins); + _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins); + _set_bin("bluestore_cache_age_bins_meta", &meta_bins); + _set_bin("bluestore_cache_age_bins_data", &data_bins); + + osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target"); + osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base"); + osd_memory_expected_fragmentation = + cct->_conf.get_val<double>("osd_memory_expected_fragmentation"); + osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min"); + osd_memory_cache_resize_interval = + cct->_conf.get_val<double>("osd_memory_cache_resize_interval"); + + if (cct->_conf->bluestore_cache_size) { + cache_size = cct->_conf->bluestore_cache_size; + } else { + // choose global cache size based on backend type + if (_use_rotational_settings()) { + cache_size = cct->_conf->bluestore_cache_size_hdd; + } else { + cache_size = cct->_conf->bluestore_cache_size_ssd; + } + } + + cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio"); + if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) { + derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio + << ") must be in range [0,1.0]" << dendl; + return -EINVAL; + } + + cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio"); + if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) { + derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio + << ") must be in range [0,1.0]" << dendl; + return -EINVAL; + } + + cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio"); + if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) { + derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio + << ") must be in range [0,1.0]" << dendl; + return -EINVAL; + } + + if (cache_meta_ratio + cache_kv_ratio > 1.0) { + derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio + << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio + << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0" + << dendl; + return -EINVAL; + } + + cache_data_ratio = (double)1.0 - + (double)cache_meta_ratio - + (double)cache_kv_ratio - + (double)cache_kv_onode_ratio; + if (cache_data_ratio < 0) { + // deal with floating point imprecision + cache_data_ratio = 0; + } + + dout(1) << __func__ << " cache_size " << cache_size + << " meta " << cache_meta_ratio + << " kv " << cache_kv_ratio + << " data " << cache_data_ratio + << dendl; + return 0; +} + +int BlueStore::write_meta(const std::string& key, const std::string& value) +{ + bluestore_bdev_label_t label; + string p = path + "/block"; + int r = _read_bdev_label(cct, p, &label); + if (r < 0) { + return ObjectStore::write_meta(key, value); + } + label.meta[key] = value; + r = _write_bdev_label(cct, p, label); + ceph_assert(r == 0); + return ObjectStore::write_meta(key, value); +} + +int BlueStore::read_meta(const std::string& key, std::string *value) +{ + bluestore_bdev_label_t label; + string p = path + "/block"; + int r = _read_bdev_label(cct, p, &label); + if (r < 0) { + return ObjectStore::read_meta(key, value); + } + auto i = label.meta.find(key); + if (i == label.meta.end()) { + return ObjectStore::read_meta(key, value); + } + *value = i->second; + return 0; +} + +void BlueStore::_init_logger() +{ + PerfCountersBuilder b(cct, "bluestore", + l_bluestore_first, l_bluestore_last); + + // space utilization stats + //**************************************** + b.add_u64(l_bluestore_allocated, 
"allocated", + "Sum for allocated bytes", + "al_b", + PerfCountersBuilder::PRIO_CRITICAL, + unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_stored, "stored", + "Sum for stored bytes", + "st_b", + PerfCountersBuilder::PRIO_CRITICAL, + unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_fragmentation, "fragmentation_micros", + "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000"); + b.add_u64(l_bluestore_alloc_unit, "alloc_unit", + "allocation unit size in bytes", + "au_b", + PerfCountersBuilder::PRIO_CRITICAL, + unit_t(UNIT_BYTES)); + //**************************************** + + // Update op processing state latencies + //**************************************** + b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat", + "Average prepare state latency", + "sprl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat", + "Average aio_wait state latency", + "sawl", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat", + "Average io_done state latency", + "sidl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat", + "Average kv_queued state latency", + "skql", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat", + "Average kv_commiting state latency", + "skcl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat", + "Average kv_done state latency", + "skdl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat", + "Average finishing state latency", + "sfnl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat", + "Average done state latency", + "sdnl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat", + "Average deferred_queued state latency", + "sdql", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat", + "Average aio_wait state latency", + "sdal", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat", + "Average cleanup state latency", + "sdcl", PerfCountersBuilder::PRIO_USEFUL); + //**************************************** + + // Update Transaction stats + //**************************************** + b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat", + "Average submit throttle latency", + "th_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat", + "Average submit latency", + "s_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat", + "Average commit latency", + "c_l", PerfCountersBuilder::PRIO_CRITICAL); + b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed"); + //**************************************** + + // Read op stats + //**************************************** + b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat", + "Average read onode metadata latency", + "roml", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat", + "Average read I/O waiting latency", + "rwal", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_csum_lat, "csum_lat", + "Average checksum latency", 
+ "csml", PerfCountersBuilder::PRIO_USEFUL); + b.add_u64_counter(l_bluestore_read_eio, "read_eio", + "Read EIO errors propagated to high level callers"); + b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries", + "Read operations that required at least one retry due to failed checksum validation", + "rd_r", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_read_lat, "read_lat", + "Average read latency", + "r_l", PerfCountersBuilder::PRIO_CRITICAL); + //**************************************** + + // kv_thread latencies + //**************************************** + b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat", + "Average kv_thread flush latency", + "kfsl", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat", + "Average kv_thread commit latency", + "kcol", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat", + "Average kv_sync thread latency", + "kscl", PerfCountersBuilder::PRIO_INTERESTING); + b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat", + "Average kv_finalize thread latency", + "kfll", PerfCountersBuilder::PRIO_INTERESTING); + //**************************************** + + // write op stats + //**************************************** + b.add_u64_counter(l_bluestore_write_big, "write_big", + "Large aligned writes into fresh blobs"); + b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes", + "Large aligned writes into fresh blobs (bytes)", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs", + "Large aligned writes into fresh blobs (blobs)"); + b.add_u64_counter(l_bluestore_write_big_deferred, + "write_big_deferred", + "Big overwrites using deferred"); + + b.add_u64_counter(l_bluestore_write_small, "write_small", + "Small writes into existing or sparse small blobs"); + b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes", + "Small writes into existing or sparse small blobs (bytes)", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_write_small_unused, + "write_small_unused", + "Small writes into unused portion of existing blob"); + b.add_u64_counter(l_bluestore_write_small_pre_read, + "write_small_pre_read", + "Small writes that required we read some data (possibly " + "cached) to fill out the block"); + + b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes", + "Sum for write-op padded bytes", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops", + "Sum for write penalty read ops"); + b.add_u64_counter(l_bluestore_write_new, "write_new", + "Write into new blob"); + + b.add_u64_counter(l_bluestore_issued_deferred_writes, + "issued_deferred_writes", + "Total deferred writes issued"); + b.add_u64_counter(l_bluestore_issued_deferred_write_bytes, + "issued_deferred_write_bytes", + "Total bytes in issued deferred writes", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_submitted_deferred_writes, + "submitted_deferred_writes", + "Total deferred writes submitted to disk"); + b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes, + "submitted_deferred_write_bytes", + "Total bytes submitted to disk by deferred writes", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + + 
b.add_u64_counter(l_bluestore_write_big_skipped_blobs, + "write_big_skipped_blobs", + "Large aligned writes into fresh blobs skipped due to zero detection (blobs)"); + b.add_u64_counter(l_bluestore_write_big_skipped_bytes, + "write_big_skipped_bytes", + "Large aligned writes into fresh blobs skipped due to zero detection (bytes)"); + b.add_u64_counter(l_bluestore_write_small_skipped, + "write_small_skipped", + "Small writes into existing or sparse small blobs skipped due to zero detection"); + b.add_u64_counter(l_bluestore_write_small_skipped_bytes, + "write_small_skipped_bytes", + "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)"); + //**************************************** + + // compressions stats + //**************************************** + b.add_u64(l_bluestore_compressed, "compressed", + "Sum for stored compressed bytes", + "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated", + "Sum for bytes allocated for compressed data", + "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64(l_bluestore_compressed_original, "compressed_original", + "Sum for original bytes that were compressed", + "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_time_avg(l_bluestore_compress_lat, "compress_lat", + "Average compress latency", + "_cpl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat", + "Average decompress latency", + "dcpl", PerfCountersBuilder::PRIO_USEFUL); + b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count", + "Sum for beneficial compress ops"); + b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count", + "Sum for compress ops rejected due to low net gain of space"); + //**************************************** + + // onode cache stats + //**************************************** + b.add_u64(l_bluestore_onodes, "onodes", + "Number of onodes in cache"); + b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned", + "Number of pinned onodes in cache"); + b.add_u64_counter(l_bluestore_onode_hits, "onode_hits", + "Count of onode cache lookup hits", + "o_ht", PerfCountersBuilder::PRIO_USEFUL); + b.add_u64_counter(l_bluestore_onode_misses, "onode_misses", + "Count of onode cache lookup misses", + "o_ms", PerfCountersBuilder::PRIO_USEFUL); + b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits", + "Count of onode shard cache lookups hits"); + b.add_u64_counter(l_bluestore_onode_shard_misses, + "onode_shard_misses", + "Count of onode shard cache lookups misses"); + b.add_u64(l_bluestore_extents, "onode_extents", + "Number of extents in cache"); + b.add_u64(l_bluestore_blobs, "onode_blobs", + "Number of blobs in cache"); + //**************************************** + + // buffer cache stats + //**************************************** + b.add_u64(l_bluestore_buffers, "buffers", + "Number of buffers in cache"); + b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes", + "Number of buffer bytes in cache", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes", + "Sum for bytes of read hit in the cache", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes", + "Sum for bytes of read missed in the cache", + NULL, + PerfCountersBuilder::PRIO_DEBUGONLY, + unit_t(UNIT_BYTES)); + 
//**************************************** + + // internal stats + //**************************************** + b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard", + "Onode extent map reshard events"); + b.add_u64_counter(l_bluestore_blob_split, "blob_split", + "Sum for blob splitting due to resharding"); + b.add_u64_counter(l_bluestore_extent_compress, "extent_compress", + "Sum for extents that have been removed due to compression"); + b.add_u64_counter(l_bluestore_gc_merged, "gc_merged", + "Sum for extents that have been merged due to garbage " + "collection"); + //**************************************** + // misc + //**************************************** + b.add_u64_counter(l_bluestore_omap_iterator_count, "omap_iterator_count", + "Open omap iterators count"); + b.add_u64_counter(l_bluestore_omap_rmkeys_count, "omap_rmkeys_count", + "amount of omap keys removed via rmkeys"); + b.add_u64_counter(l_bluestore_omap_rmkey_ranges_count, "omap_rmkey_range_count", + "amount of omap key ranges removed via rmkeys"); + //**************************************** + // other client ops latencies + //**************************************** + b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat", + "Average omap iterator seek_to_first call latency", + "osfl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat", + "Average omap iterator upper_bound call latency", + "oubl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat", + "Average omap iterator lower_bound call latency", + "olbl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat", + "Average omap iterator next call latency", + "onxl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat", + "Average omap get_keys call latency", + "ogkl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat", + "Average omap get_values call latency", + "ogvl", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat", + "Average omap clear call latency"); + b.add_time_avg(l_bluestore_clist_lat, "clist_lat", + "Average collection listing latency", + "cl_l", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_remove_lat, "remove_lat", + "Average removal latency", + "rm_l", PerfCountersBuilder::PRIO_USEFUL); + b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat", + "Average truncate latency", + "tr_l", PerfCountersBuilder::PRIO_USEFUL); + //**************************************** + + // Resulting size axis configuration for op histograms, values are in bytes + PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{ + "Given size (bytes)", + PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale + 0, ///< Start at 0 + 4096, ///< Quantization unit + 13, ///< Enough to cover 4+M requests + }; + // Req size axis configuration for op histograms, values are in bytes + PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{ + "Request size (bytes)", + PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale + 0, ///< Start at 0 + 4096, ///< Quantization unit + 13, ///< Enough to cover 4+M requests + }; + b.add_u64_counter_histogram( + l_bluestore_allocate_hist, "allocate_histogram", + alloc_hist_x_axis_config, alloc_hist_y_axis_config, + "Histogram of requested block allocations vs. 
given ones"); + + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); +} + +int BlueStore::_reload_logger() +{ + struct store_statfs_t store_statfs; + int r = statfs(&store_statfs); + if (r >= 0) { + logger->set(l_bluestore_allocated, store_statfs.allocated); + logger->set(l_bluestore_stored, store_statfs.data_stored); + logger->set(l_bluestore_compressed, store_statfs.data_compressed); + logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated); + logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original); + } + return r; +} + +void BlueStore::_shutdown_logger() +{ + cct->get_perfcounters_collection()->remove(logger); + delete logger; +} + +int BlueStore::get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid) +{ + bluestore_bdev_label_t label; + int r = _read_bdev_label(cct, path, &label); + if (r < 0) + return r; + *fsid = label.osd_uuid; + return 0; +} + +int BlueStore::_open_path() +{ + // sanity check(s) + ceph_assert(path_fd < 0); + path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC)); + if (path_fd < 0) { + int r = -errno; + derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r) + << dendl; + return r; + } + return 0; +} + +void BlueStore::_close_path() +{ + VOID_TEMP_FAILURE_RETRY(::close(path_fd)); + path_fd = -1; +} + +int BlueStore::_write_bdev_label(CephContext *cct, + const string &path, bluestore_bdev_label_t label) +{ + dout(10) << __func__ << " path " << path << " label " << label << dendl; + bufferlist bl; + encode(label, bl); + uint32_t crc = bl.crc32c(-1); + encode(crc, bl); + ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE); + bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length()); + z.zero(); + bl.append(std::move(z)); + + int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT)); + if (fd < 0) { + fd = -errno; + derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd) + << dendl; + return fd; + } + bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX); + int r = bl.write_fd(fd); + if (r < 0) { + derr << __func__ << " failed to write to " << path + << ": " << cpp_strerror(r) << dendl; + goto out; + } + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " failed to fsync " << path + << ": " << cpp_strerror(r) << dendl; + } +out: + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; +} + +int BlueStore::_read_bdev_label(CephContext* cct, const string &path, + bluestore_bdev_label_t *label) +{ + dout(10) << __func__ << dendl; + int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC)); + if (fd < 0) { + fd = -errno; + derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd) + << dendl; + return fd; + } + bufferlist bl; + int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + if (r < 0) { + derr << __func__ << " failed to read from " << path + << ": " << cpp_strerror(r) << dendl; + return r; + } + + uint32_t crc, expected_crc; + auto p = bl.cbegin(); + try { + decode(*label, p); + bufferlist t; + t.substr_of(bl, 0, p.get_off()); + crc = t.crc32c(-1); + decode(expected_crc, p); + } + catch (ceph::buffer::error& e) { + derr << __func__ << " unable to decode label at offset " << p.get_off() + << ": " << e.what() + << dendl; + return -ENOENT; + } + if (crc != expected_crc) { + derr << __func__ << " bad crc on label, expected " << expected_crc + << " != actual " << crc << dendl; + return -EIO; + } + 
dout(10) << __func__ << " got " << *label << dendl; + return 0; +} + +int BlueStore::_check_or_set_bdev_label( + string path, uint64_t size, string desc, bool create) +{ + bluestore_bdev_label_t label; + if (create) { + label.osd_uuid = fsid; + label.size = size; + label.btime = ceph_clock_now(); + label.description = desc; + int r = _write_bdev_label(cct, path, label); + if (r < 0) + return r; + } else { + int r = _read_bdev_label(cct, path, &label); + if (r < 0) + return r; + if (cct->_conf->bluestore_debug_permit_any_bdev_label) { + dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid + << " and fsid " << fsid << " check bypassed" << dendl; + } else if (label.osd_uuid != fsid) { + derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid + << " does not match our fsid " << fsid << dendl; + return -EIO; + } + } + return 0; +} + +void BlueStore::_set_alloc_sizes(void) +{ + max_alloc_size = cct->_conf->bluestore_max_alloc_size; + +#ifdef HAVE_LIBZBD + ceph_assert(bdev); + if (bdev->is_smr()) { + prefer_deferred_size = 0; + } else +#endif + if (cct->_conf->bluestore_prefer_deferred_size) { + prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size; + } else { + if (_use_rotational_settings()) { + prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd; + } else { + prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd; + } + } + + if (cct->_conf->bluestore_deferred_batch_ops) { + deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops; + } else { + if (_use_rotational_settings()) { + deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd; + } else { + deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd; + } + } + + dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size + << std::dec << " order " << (int)min_alloc_size_order + << " max_alloc_size 0x" << std::hex << max_alloc_size + << " prefer_deferred_size 0x" << prefer_deferred_size + << std::dec + << " deferred_batch_ops " << deferred_batch_ops + << dendl; +} + +int BlueStore::_open_bdev(bool create) +{ + ceph_assert(bdev == NULL); + string p = path + "/block"; + bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this)); + int r = bdev->open(p); + if (r < 0) + goto fail; + + if (create && cct->_conf->bdev_enable_discard) { + interval_set<uint64_t> whole_device; + whole_device.insert(0, bdev->get_size()); + bdev->try_discard(whole_device, false); + } + + if (bdev->supported_bdev_label()) { + r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create); + if (r < 0) + goto fail_close; + } + + // initialize global block parameters + block_size = bdev->get_block_size(); + block_mask = ~(block_size - 1); + block_size_order = std::countr_zero(block_size); + ceph_assert(block_size == 1u << block_size_order); + _set_max_defer_interval(); + // and set cache_size based on device type + r = _set_cache_sizes(); + if (r < 0) { + goto fail_close; + } + // get block dev optimal io size + optimal_io_size = bdev->get_optimal_io_size(); + + return 0; + + fail_close: + bdev->close(); + fail: + delete bdev; + bdev = NULL; + return r; +} + +void BlueStore::_validate_bdev() +{ + ceph_assert(bdev); + uint64_t dev_size = bdev->get_size(); + ceph_assert(dev_size > _get_ondisk_reserved()); +} + +void BlueStore::_close_bdev() +{ + ceph_assert(bdev); + bdev->close(); + delete bdev; + bdev = NULL; +} + +int BlueStore::_open_fm(KeyValueDB::Transaction t, + bool read_only, + bool db_avail, + bool 
fm_restore) +{ + int r; + + dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl; + ceph_assert(fm == NULL); + // fm_restore means we are transitioning from null-fm to bitmap-fm + ceph_assert(!fm_restore || (freelist_type != "null")); + // fm restore must pass in a valid transaction + ceph_assert(!fm_restore || (t != nullptr)); + + // when function is called in repair mode (to_repair=true) we skip db->open()/create() + bool can_have_null_fm = !is_db_rotational() && + !read_only && + db_avail && + cct->_conf->bluestore_allocation_from_file && + !bdev->is_smr(); + + // When allocation-info is stored in a single file we set freelist_type to "null" + if (can_have_null_fm) { + freelist_type = "null"; + need_to_destage_allocation_file = true; + } + fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC); + ceph_assert(fm); + if (t) { + // create mode. initialize freespace + dout(20) << __func__ << " initializing freespace" << dendl; + { + bufferlist bl; + bl.append(freelist_type); + t->set(PREFIX_SUPER, "freelist_type", bl); + } + // being able to allocate in units less than bdev block size + // seems to be a bad idea. + ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size); + + uint64_t alloc_size = min_alloc_size; + if (bdev->is_smr() && freelist_type != "zoned") { + derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)" + << dendl; + return -EINVAL; + } + if (!bdev->is_smr() && freelist_type == "zoned") { + derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned" + << dendl; + return -EINVAL; + } + + fm->create(bdev->get_size(), alloc_size, + zone_size, first_sequential_zone, + t); + + // allocate superblock reserved space. note that we do not mark + // bluefs space as allocated in the freelist; we instead rely on + // bluefs doing that itself. + auto reserved = _get_ondisk_reserved(); + if (fm_restore) { + // we need to allocate the full space in restore case + // as later we will add free-space marked in the allocator file + fm->allocate(0, bdev->get_size(), t); + } else { + // allocate superblock reserved space. note that we do not mark + // bluefs space as allocated in the freelist; we instead rely on + // bluefs doing that itself. 
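+      // Illustrative contrast with the fm_restore branch above: in the
+      // regular create path only [0, reserved) from _get_ondisk_reserved()
+      // is marked allocated here, and (aside from the optional debug
+      // prefill below) the rest of the device stays free for the allocator.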
+ fm->allocate(0, reserved, t); + } + // debug code - not needed for NULL FM + if (cct->_conf->bluestore_debug_prefill > 0) { + uint64_t end = bdev->get_size() - reserved; + dout(1) << __func__ << " pre-fragmenting freespace, using " + << cct->_conf->bluestore_debug_prefill << " with max free extent " + << cct->_conf->bluestore_debug_prefragment_max << dendl; + uint64_t start = p2roundup(reserved, min_alloc_size); + uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size; + float r = cct->_conf->bluestore_debug_prefill; + r /= 1.0 - r; + bool stop = false; + + while (!stop && start < end) { + uint64_t l = (rand() % max_b + 1) * min_alloc_size; + if (start + l > end) { + l = end - start; + l = p2align(l, min_alloc_size); + } + ceph_assert(start + l <= end); + + uint64_t u = 1 + (uint64_t)(r * (double)l); + u = p2roundup(u, min_alloc_size); + if (start + l + u > end) { + u = end - (start + l); + // trim to align so we don't overflow again + u = p2align(u, min_alloc_size); + stop = true; + } + ceph_assert(start + l + u <= end); + + dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l + << " use 0x" << u << std::dec << dendl; + + if (u == 0) { + // break if u has been trimmed to nothing + break; + } + + fm->allocate(start + l, u, t); + start += l + u; + } + } + r = _write_out_fm_meta(0); + ceph_assert(r == 0); + } else { + if (can_have_null_fm) { + commit_to_null_manager(); + } + r = fm->init(db, read_only, + [&](const std::string& key, std::string* result) { + return read_meta(key, result); + }); + if (r < 0) { + derr << __func__ << " failed: " << cpp_strerror(r) << dendl; + delete fm; + fm = NULL; + return r; + } + } + // if space size tracked by free list manager is that higher than actual + // dev size one can hit out-of-space allocation which will result + // in data loss and/or assertions + // Probably user altered the device size somehow. + // The only fix for now is to redeploy OSD. 
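+  // Hypothetical example of the check below: fm->get_size() reports 4 TiB
+  // while bdev->get_size() reports only 2 TiB (e.g. the underlying device
+  // or partition was shrunk); since 4 TiB >= 2 TiB + min_alloc_size, the
+  // disk size mismatch alert is raised.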
+ if (fm->get_size() >= bdev->get_size() + min_alloc_size) { + ostringstream ss; + ss << "slow device size mismatch detected, " + << " fm size(" << fm->get_size() + << ") > slow device size(" << bdev->get_size() + << "), Please stop using this OSD as it might cause data loss."; + _set_disk_size_mismatch_alert(ss.str()); + } + return 0; +} + +void BlueStore::_close_fm() +{ + dout(10) << __func__ << dendl; + ceph_assert(fm); + fm->shutdown(); + delete fm; + fm = NULL; +} + +int BlueStore::_write_out_fm_meta(uint64_t target_size) +{ + int r = 0; + string p = path + "/block"; + + std::vector<std::pair<string, string>> fm_meta; + fm->get_meta(target_size, &fm_meta); + + for (auto& m : fm_meta) { + r = write_meta(m.first, m.second); + ceph_assert(r == 0); + } + return r; +} + +int BlueStore::_create_alloc() +{ + ceph_assert(alloc == NULL); + ceph_assert(shared_alloc.a == NULL); + ceph_assert(bdev->get_size()); + + uint64_t alloc_size = min_alloc_size; + + std::string allocator_type = cct->_conf->bluestore_allocator; + +#ifdef HAVE_LIBZBD + if (freelist_type == "zoned") { + allocator_type = "zoned"; + } +#endif + + alloc = Allocator::create( + cct, allocator_type, + bdev->get_size(), + alloc_size, + zone_size, + first_sequential_zone, + "block"); + if (!alloc) { + lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator" + << dendl; + return -EINVAL; + } + +#ifdef HAVE_LIBZBD + if (freelist_type == "zoned") { + Allocator *a = Allocator::create( + cct, cct->_conf->bluestore_allocator, + bdev->get_conventional_region_size(), + alloc_size, + 0, 0, + "zoned_block"); + if (!a) { + lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator + << " allocator" << dendl; + delete alloc; + return -EINVAL; + } + shared_alloc.set(a, alloc_size); + } else +#endif + { + // BlueFS will share the same allocator + shared_alloc.set(alloc, alloc_size); + } + + return 0; +} + +int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments) +{ + int r = _create_alloc(); + if (r < 0) { + return r; + } + ceph_assert(alloc != NULL); + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + auto a = dynamic_cast<ZonedAllocator*>(alloc); + ceph_assert(a); + auto f = dynamic_cast<ZonedFreelistManager*>(fm); + ceph_assert(f); + vector<uint64_t> wp = bdev->get_zones(); + vector<zone_state_t> zones = f->get_zone_states(db); + ceph_assert(wp.size() == zones.size()); + + // reconcile zone state + auto num_zones = bdev->get_size() / zone_size; + for (unsigned i = first_sequential_zone; i < num_zones; ++i) { + ceph_assert(wp[i] >= i * zone_size); + ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone + uint64_t p = wp[i] - i * zone_size; + if (zones[i].write_pointer > p) { + derr << __func__ << " zone 0x" << std::hex << i + << " bluestore write pointer 0x" << zones[i].write_pointer + << " > device write pointer 0x" << p + << std::dec << " -- VERY SUSPICIOUS!" 
<< dendl; + } else if (zones[i].write_pointer < p) { + // this is "normal" in that it can happen after any crash (if we have a + // write in flight but did not manage to commit the transaction) + auto delta = p - zones[i].write_pointer; + dout(1) << __func__ << " zone 0x" << std::hex << i + << " device write pointer 0x" << p + << " > bluestore pointer 0x" << zones[i].write_pointer + << ", advancing 0x" << delta << std::dec << dendl; + (*zone_adjustments)[zones[i].write_pointer] = delta; + zones[i].num_dead_bytes += delta; + zones[i].write_pointer = p; + } + } + + // start with conventional zone "free" (bluefs may adjust this when it starts up) + auto reserved = _get_ondisk_reserved(); + // for now we require a conventional zone + ceph_assert(bdev->get_conventional_region_size()); + ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region + shared_alloc.a->init_add_free( + reserved, + p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved); + + // init sequential zone based on the device's write pointers + a->init_from_zone_pointers(std::move(zones)); + dout(1) << __func__ + << " loaded zone pointers: " + << std::hex + << ", allocator type " << alloc->get_type() + << ", capacity 0x" << alloc->get_capacity() + << ", block size 0x" << alloc->get_block_size() + << ", free 0x" << alloc->get_free() + << ", fragmentation " << alloc->get_fragmentation() + << std::dec << dendl; + + return 0; + } +#endif + + uint64_t num = 0, bytes = 0; + utime_t start_time = ceph_clock_now(); + if (!fm->is_null_manager()) { + // This is the original path - loading allocation map from RocksDB and feeding into the allocator + dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl; + // initialize from freelist + fm->enumerate_reset(); + uint64_t offset, length; + while (fm->enumerate_next(db, &offset, &length)) { + alloc->init_add_free(offset, length); + ++num; + bytes += length; + } + fm->enumerate_reset(); + + utime_t duration = ceph_clock_now() - start_time; + dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" << + alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl; + } else { + // This is the new path reading the allocation map from a flat bluefs file and feeding them into the allocator + + if (!cct->_conf->bluestore_allocation_from_file) { + derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl; + derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl; + return -ENOTSUP; // Operation not supported + } + if (restore_allocator(alloc, &num, &bytes) == 0) { + dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl; + } else { + // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator + dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." 
<< dendl; + // if failed must recover from on-disk ONode internal state + if (read_allocation_from_drive_on_startup() != 0) { + derr << __func__ << "::NCB::Failed Recovery" << dendl; + derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl; + derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl; + return -ENOTRECOVERABLE; + } + } + } + dout(1) << __func__ + << " loaded " << byte_u_t(bytes) << " in " << num << " extents" + << std::hex + << ", allocator type " << alloc->get_type() + << ", capacity 0x" << alloc->get_capacity() + << ", block size 0x" << alloc->get_block_size() + << ", free 0x" << alloc->get_free() + << ", fragmentation " << alloc->get_fragmentation() + << std::dec << dendl; + + return 0; +} + +void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments) +{ + int r = 0; +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + if (zone_adjustments.empty()) { + return; + } + dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl; + auto f = dynamic_cast<ZonedFreelistManager*>(fm); + ceph_assert(f); + KeyValueDB::Transaction t = db->get_transaction(); + for (auto& i : zone_adjustments) { + // allocate AND release since this gap is now dead space + // note that the offset is imprecise, but only need to select the zone + f->allocate(i.first, i.second, t); + f->release(i.first, i.second, t); + } + r = db->submit_transaction_sync(t); + } else +#endif + if (fm->is_null_manager()) { + // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected + // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount() + // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort + // to recovery from RocksDB::ONodes + r = invalidate_allocation_file_on_bluefs(); + } + ceph_assert(r >= 0); +} + +void BlueStore::_close_alloc() +{ + ceph_assert(bdev); + bdev->discard_drain(); + + ceph_assert(alloc); + alloc->shutdown(); + delete alloc; + + ceph_assert(shared_alloc.a); + if (alloc != shared_alloc.a) { + shared_alloc.a->shutdown(); + delete shared_alloc.a; + } + + shared_alloc.reset(); + alloc = nullptr; +} + +int BlueStore::_open_fsid(bool create) +{ + ceph_assert(fsid_fd < 0); + int flags = O_RDWR|O_CLOEXEC; + if (create) + flags |= O_CREAT; + fsid_fd = ::openat(path_fd, "fsid", flags, 0644); + if (fsid_fd < 0) { + int err = -errno; + derr << __func__ << " " << cpp_strerror(err) << dendl; + return err; + } + return 0; +} + +int BlueStore::_read_fsid(uuid_d *uuid) +{ + char fsid_str[40]; + memset(fsid_str, 0, sizeof(fsid_str)); + int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) { + derr << __func__ << " failed: " << cpp_strerror(ret) << dendl; + return ret; + } + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) { + derr << __func__ << " unparsable uuid " << fsid_str << dendl; + return -EINVAL; + } + return 0; +} + +int BlueStore::_write_fsid() +{ + int r = ::ftruncate(fsid_fd, 0); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl; + return r; + } + string str = stringify(fsid) + "\n"; + r = safe_write(fsid_fd, str.c_str(), str.length()); + if (r < 0) { + derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl; + return r; + } + r = ::fsync(fsid_fd); + if (r < 0) { + r = 
-errno; + derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +void BlueStore::_close_fsid() +{ + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +} + +int BlueStore::_lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + derr << __func__ << " failed to lock " << path << "/fsid" + << " (is another ceph-osd still running?)" + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool BlueStore::is_rotational() +{ + if (bdev) { + return bdev->is_rotational(); + } + + bool rotational = true; + int r = _open_path(); + if (r < 0) + goto out; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + rotational = bdev->is_rotational(); + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + out: + return rotational; +} + +bool BlueStore::is_journal_rotational() +{ + if (!bluefs) { + dout(5) << __func__ << " bluefs disabled, default to store media type" + << dendl; + return is_rotational(); + } + dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl; + return bluefs->wal_is_rotational(); +} + +bool BlueStore::is_db_rotational() +{ + if (!bluefs) { + dout(5) << __func__ << " bluefs disabled, default to store media type" + << dendl; + return is_rotational(); + } + dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl; + return bluefs->db_is_rotational(); +} + +bool BlueStore::_use_rotational_settings() +{ + if (cct->_conf->bluestore_debug_enforce_settings == "hdd") { + return true; + } + if (cct->_conf->bluestore_debug_enforce_settings == "ssd") { + return false; + } + return bdev->is_rotational(); +} + +bool BlueStore::is_statfs_recoverable() const +{ + // abuse fm for now + return has_null_manager(); +} + +bool BlueStore::test_mount_in_use() +{ + // most error conditions mean the mount is not in use (e.g., because + // it doesn't exist). only if we fail to lock do we conclude it is + // in use. 
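+  // the fcntl(F_SETLK) probe below is non-blocking; closing the fsid fd on the way out
+  // releases any lock we briefly took, so this check leaves no lasting side effects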
+ bool ret = false; + int r = _open_path(); + if (r < 0) + return false; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _lock_fsid(); + if (r < 0) + ret = true; // if we can't lock, it is in use + _close_fsid(); + out_path: + _close_path(); + return ret; +} + +int BlueStore::_minimal_open_bluefs(bool create) +{ + int r; + bluefs = new BlueFS(cct); + + string bfn; + struct stat st; + + bfn = path + "/block.db"; + if (::stat(bfn.c_str(), &st) == 0) { + r = bluefs->add_block_device( + BlueFS::BDEV_DB, bfn, + create && cct->_conf->bdev_enable_discard, + SUPER_RESERVED); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) { + r = _check_or_set_bdev_label( + bfn, + bluefs->get_block_device_size(BlueFS::BDEV_DB), + "bluefs db", create); + if (r < 0) { + derr << __func__ + << " check block device(" << bfn << ") label returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; + bluefs_layout.dedicated_db = true; + } else { + r = -errno; + if (::lstat(bfn.c_str(), &st) == -1) { + r = 0; + bluefs_layout.shared_bdev = BlueFS::BDEV_DB; + } else { + derr << __func__ << " " << bfn << " symlink exists but target unusable: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + // shared device + bfn = path + "/block"; + // never trim here + r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false, + 0, // no need to provide valid 'reserved' for shared dev + &shared_alloc); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + bfn = path + "/block.wal"; + if (::stat(bfn.c_str(), &st) == 0) { + r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, + create && cct->_conf->bdev_enable_discard, + BDEV_LABEL_BLOCK_SIZE); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) { + r = _check_or_set_bdev_label( + bfn, + bluefs->get_block_device_size(BlueFS::BDEV_WAL), + "bluefs wal", create); + if (r < 0) { + derr << __func__ << " check block device(" << bfn + << ") label returned: " << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + bluefs_layout.dedicated_wal = true; + } else { + r = 0; + if (::lstat(bfn.c_str(), &st) != -1) { + r = -errno; + derr << __func__ << " " << bfn << " symlink exists but target unusable: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + return 0; + +free_bluefs: + ceph_assert(bluefs); + delete bluefs; + bluefs = NULL; + return r; +} + +int BlueStore::_open_bluefs(bool create, bool read_only) +{ + int r = _minimal_open_bluefs(create); + if (r < 0) { + return r; + } + BlueFSVolumeSelector* vselector = nullptr; + if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW || + cct->_conf->bluestore_volume_selection_policy == "use_some_extra_enforced" || + cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") { + + string options = cct->_conf->bluestore_rocksdb_options; + string options_annex = cct->_conf->bluestore_rocksdb_options_annex; + if (!options_annex.empty()) { + if (!options.empty() && + *options.rbegin() != ',') { + options += ','; + } + options += options_annex; + } + + rocksdb::Options rocks_opts; + r = RocksDBStore::ParseOptionsFromStringStatic( + cct, + options, + rocks_opts, + nullptr); + if (r < 0) { + 
return r; + } + if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") { + vselector = new FitToFastVolumeSelector( + bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, + bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, + bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100); + } else { + double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor; + vselector = + new RocksDBBlueFSVolumeSelector( + bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, + bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, + bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100, + 1024 * 1024 * 1024, //FIXME: set expected l0 size here + rocks_opts.max_bytes_for_level_base, + rocks_opts.max_bytes_for_level_multiplier, + reserved_factor, + cct->_conf->bluestore_volume_selection_reserved, + cct->_conf->bluestore_volume_selection_policy.find("use_some_extra") + == 0); + } + } + if (create) { + bluefs->mkfs(fsid, bluefs_layout); + } + bluefs->set_volume_selector(vselector); + r = bluefs->mount(); + if (r < 0) { + derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl; + } + ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0); + return r; +} + +void BlueStore::_close_bluefs() +{ + bluefs->umount(db_was_opened_read_only); + _minimal_close_bluefs(); +} + +void BlueStore::_minimal_close_bluefs() +{ + delete bluefs; + bluefs = NULL; +} + +int BlueStore::_is_bluefs(bool create, bool* ret) +{ + if (create) { + *ret = cct->_conf->bluestore_bluefs; + } else { + string s; + int r = read_meta("bluefs", &s); + if (r < 0) { + derr << __func__ << " unable to read 'bluefs' meta" << dendl; + return -EIO; + } + if (s == "1") { + *ret = true; + } else if (s == "0") { + *ret = false; + } else { + derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" + << dendl; + return -EIO; + } + } + return 0; +} + +/* +* opens both DB and dependant super_meta, FreelistManager and allocator +* in the proper order +*/ +int BlueStore::_open_db_and_around(bool read_only, bool to_repair) +{ + dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl; + { + string type; + int r = read_meta("type", &type); + if (r < 0) { + derr << __func__ << " failed to load os-type: " << cpp_strerror(r) + << dendl; + return r; + } + + if (type != "bluestore") { + derr << __func__ << " expected bluestore, but type is " << type << dendl; + return -EIO; + } + } + + // SMR devices may require a freelist adjustment, but that can only happen after + // the db is read-write. we'll stash pending changes here. + std::map<uint64_t, uint64_t> zone_adjustments; + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + + // GBH: can probably skip open_db step in REad-Only mode when operating in NULL-FM mode + // (might need to open if failed to restore from file) + + // open in read-only first to read FM list and init allocator + // as they might be needed for some BlueFS procedures + r = _open_db(false, false, true); + if (r < 0) + goto out_bdev; + + r = _open_super_meta(); + if (r < 0) { + goto out_db; + } + + r = _open_fm(nullptr, true, false); + if (r < 0) + goto out_db; + + r = _init_alloc(&zone_adjustments); + if (r < 0) + goto out_fm; + + // Re-open in the proper mode(s). 
+ + // Can't simply bypass second open for read-only mode as we need to + // load allocated extents from bluefs into allocator. + // And now it's time to do that + // + _close_db(); + r = _open_db(false, to_repair, read_only); + if (r < 0) { + goto out_alloc; + } + + if (!read_only) { + _post_init_alloc(zone_adjustments); + } + + // when function is called in repair mode (to_repair=true) we skip db->open()/create() + // we can't change bluestore allocation so no need to invlidate allocation-file + if (fm->is_null_manager() && !read_only && !to_repair) { + // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected + // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount() + // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort + // to recovery from RocksDB::ONodes + r = invalidate_allocation_file_on_bluefs(); + if (r != 0) { + derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl; + goto out_alloc; + } + } + + // when function is called in repair mode (to_repair=true) we skip db->open()/create() + if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file +#ifdef HAVE_LIBZBD + && !bdev->is_smr() +#endif + ) { + dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl; + commit_to_null_manager(); + need_to_destage_allocation_file = true; + dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl; + } + + return 0; + +out_alloc: + _close_alloc(); +out_fm: + _close_fm(); + out_db: + _close_db(); + out_bdev: + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + return r; +} + +void BlueStore::_close_db_and_around() +{ + if (db) { + _close_db(); + } + _close_around_db(); +} + +void BlueStore::_close_around_db() +{ + if (bluefs) { + _close_bluefs(); + } + _close_fm(); + _close_alloc(); + _close_bdev(); + _close_fsid(); + _close_path(); +} + +int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair) +{ + _kv_only = true; + int r = _open_db_and_around(false, to_repair); + if (r == 0) { + *pdb = db; + } else { + *pdb = nullptr; + } + return r; +} + +int BlueStore::close_db_environment() +{ + if (db) { + delete db; + db = nullptr; + } + _close_around_db(); + return 0; +} + +/* gets access to bluefs supporting RocksDB */ +BlueFS* BlueStore::get_bluefs() { + return bluefs; +} + +int BlueStore::_prepare_db_environment(bool create, bool read_only, + std::string* _fn, std::string* _kv_backend) +{ + int r; + ceph_assert(!db); + std::string& fn=*_fn; + std::string& kv_backend=*_kv_backend; + fn = path + "/db"; + std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator); + + if (create) { + kv_backend = cct->_conf->bluestore_kvbackend; + } else { + r = read_meta("kv_backend", &kv_backend); + if (r < 0) { + derr << __func__ << " unable to read 'kv_backend' meta" << dendl; + return -EIO; + } + } + dout(10) << __func__ << " kv_backend = " << kv_backend << dendl; + + bool do_bluefs; + r = _is_bluefs(create, &do_bluefs); + if (r < 0) { + return r; + } + dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl; + + map<string,string> kv_options; + // force separate wal dir for all new deployments. 
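+  // note: this option is dropped again below if an existing store turns out to have no db.wal directory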
+ kv_options["separate_wal_dir"] = 1; + rocksdb::Env *env = NULL; + if (do_bluefs) { + dout(10) << __func__ << " initializing bluefs" << dendl; + if (kv_backend != "rocksdb") { + derr << " backend must be rocksdb to use bluefs" << dendl; + return -EINVAL; + } + + r = _open_bluefs(create, read_only); + if (r < 0) { + return r; + } + + if (cct->_conf->bluestore_bluefs_env_mirror) { + rocksdb::Env* a = new BlueRocksEnv(bluefs); + rocksdb::Env* b = rocksdb::Env::Default(); + if (create) { + string cmd = "rm -rf " + path + "/db " + + path + "/db.slow " + + path + "/db.wal"; + int r = system(cmd.c_str()); + (void)r; + } + env = new rocksdb::EnvMirror(b, a, false, true); + } else { + env = new BlueRocksEnv(bluefs); + + // simplify the dir names, too, as "seen" by rocksdb + fn = "db"; + } + BlueFSVolumeSelector::paths paths; + bluefs->get_vselector_paths(fn, paths); + + { + ostringstream db_paths; + bool first = true; + for (auto& p : paths) { + if (!first) { + db_paths << " "; + } + first = false; + db_paths << p.first << "," << p.second; + + } + kv_options["db_paths"] = db_paths.str(); + dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl; + } + + if (create) { + for (auto& p : paths) { + env->CreateDir(p.first); + } + // Selectors don't provide wal path so far hence create explicitly + env->CreateDir(fn + ".wal"); + } else { + std::vector<std::string> res; + // check for dir presence + auto r = env->GetChildren(fn+".wal", &res); + if (r.IsNotFound()) { + kv_options.erase("separate_wal_dir"); + } + } + } else { + string walfn = path + "/db.wal"; + + if (create) { + int r = ::mkdir(fn.c_str(), 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r) + << dendl; + return r; + } + + // wal_dir, too! 
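+  // mirrors the bluefs case above, where a dedicated ".wal" directory is created explicitly as well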
+ r = ::mkdir(walfn.c_str(), 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << walfn + << ": " << cpp_strerror(r) + << dendl; + return r; + } + } else { + struct stat st; + r = ::stat(walfn.c_str(), &st); + if (r < 0 && errno == ENOENT) { + kv_options.erase("separate_wal_dir"); + } + } + } + + + db = KeyValueDB::create(cct, + kv_backend, + fn, + kv_options, + static_cast<void*>(env)); + if (!db) { + derr << __func__ << " error creating db" << dendl; + if (bluefs) { + _close_bluefs(); + } + // delete env manually here since we can't depend on db to do this + // under this case + delete env; + env = NULL; + return -EIO; + } + + FreelistManager::setup_merge_operators(db, freelist_type); + db->set_merge_operator(PREFIX_STAT, merge_op); + db->set_cache_size(cache_kv_ratio * cache_size); + return 0; +} + +int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only) +{ + int r; + ceph_assert(!(create && read_only)); + string options; + string options_annex; + stringstream err; + string kv_dir_fn; + string kv_backend; + std::string sharding_def; + // prevent write attempts to BlueFS in case we failed before BlueFS was opened + db_was_opened_read_only = true; + r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend); + if (r < 0) { + derr << __func__ << " failed to prepare db environment: " << err.str() << dendl; + return -EIO; + } + // if reached here then BlueFS is already opened + db_was_opened_read_only = read_only; + dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl; + if (kv_backend == "rocksdb") { + options = cct->_conf->bluestore_rocksdb_options; + options_annex = cct->_conf->bluestore_rocksdb_options_annex; + if (!options_annex.empty()) { + if (!options.empty() && + *options.rbegin() != ',') { + options += ','; + } + options += options_annex; + } + + if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) { + sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs"); + } + } + + db->init(options); + if (to_repair_db) + return 0; + if (create) { + r = db->create_and_open(err, sharding_def); + } else { + // we pass in cf list here, but it is only used if the db already has + // column families created. + r = read_only ? 
+ db->open_read_only(err, sharding_def) : + db->open(err, sharding_def); + } + if (r) { + derr << __func__ << " erroring opening db: " << err.str() << dendl; + _close_db(); + return -EIO; + } + dout(1) << __func__ << " opened " << kv_backend + << " path " << kv_dir_fn << " options " << options << dendl; + return 0; +} + +void BlueStore::_close_db() +{ + dout(10) << __func__ << ":read_only=" << db_was_opened_read_only + << " fm=" << fm + << " destage_alloc_file=" << need_to_destage_allocation_file + << " per_pool=" << per_pool_stat_collection + << " pool stats=" << osd_pools.size() + << dendl; + bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file; + if (do_destage && is_statfs_recoverable()) { + auto t = db->get_transaction(); + store_statfs_t s; + if (per_pool_stat_collection) { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE); + uint64_t pool_id; + for (it->upper_bound(string()); it->valid(); it->next()) { + int r = get_key_pool_stat(it->key(), &pool_id); + if (r >= 0) { + dout(10) << __func__ << " wiping statfs for: " << pool_id << dendl; + } else { + derr << __func__ << " wiping invalid statfs key: " << it->key() << dendl; + } + t->rmkey(PREFIX_STAT, it->key()); + } + + std::lock_guard l(vstatfs_lock); + for(auto &p : osd_pools) { + string key; + get_pool_stat_key(p.first, &key); + bufferlist bl; + if (!p.second.is_empty()) { + p.second.encode(bl); + p.second.publish(&s); + t->set(PREFIX_STAT, key, bl); + dout(10) << __func__ << " persisting: " + << p.first << "->" << s + << dendl; + } + } + } else { + bufferlist bl; + { + std::lock_guard l(vstatfs_lock); + vstatfs.encode(bl); + vstatfs.publish(&s); + } + t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl); + dout(10) << __func__ << "persisting: " << s << dendl; + } + int r = db->submit_transaction_sync(t); + dout(10) << __func__ << " statfs persisted." 
<< dendl; + ceph_assert(r >= 0); + } + ceph_assert(db); + delete db; + db = nullptr; + + if (do_destage && fm && fm->is_null_manager()) { + int ret = store_allocator(alloc); + if (ret != 0) { + derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl; + } + } + + if (bluefs) { + _close_bluefs(); + } +} + +void BlueStore::_dump_alloc_on_failure() +{ + auto dump_interval = + cct->_conf->bluestore_bluefs_alloc_failure_dump_interval; + if (dump_interval > 0 && + next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) { + shared_alloc.a->dump(); + next_dump_on_bluefs_alloc_failure = ceph_clock_now(); + next_dump_on_bluefs_alloc_failure += dump_interval; + } +} + +int BlueStore::_open_collections() +{ + if (!coll_map.empty()) { + // could be opened from another path + dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl; + return 0; + } + + dout(10) << __func__ << dendl; + collections_had_errors = false; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); + size_t load_cnt = 0; + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (cid.parse(it->key())) { + auto c = ceph::make_ref<Collection>( + this, + onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())], + buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())], + cid); + bufferlist bl = it->value(); + auto p = bl.cbegin(); + try { + decode(c->cnode, p); + } catch (ceph::buffer::error& e) { + derr << __func__ << " failed to decode cnode, key:" + << pretty_binary_string(it->key()) << dendl; + return -EIO; + } + dout(20) << __func__ << " opened " << cid << " " << c + << " " << c->cnode << dendl; + _osr_attach(c.get()); + coll_map[cid] = c; + load_cnt++; + } else { + derr << __func__ << " unrecognized collection " << it->key() << dendl; + collections_had_errors = true; + } + } + dout(10) << __func__ << " collections loaded: " << load_cnt + << dendl; + return 0; +} + +void BlueStore::_fsck_collections(int64_t* errors) +{ + if (collections_had_errors) { + dout(10) << __func__ << dendl; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (!cid.parse(it->key())) { + derr << __func__ << " unrecognized collection " << it->key() << dendl; + if (errors) { + (*errors)++; + } + } + } + } +} + +void BlueStore::_set_per_pool_omap() +{ + per_pool_omap = OMAP_BULK; + bufferlist bl; + db->get(PREFIX_SUPER, "per_pool_omap", &bl); + if (bl.length()) { + auto s = bl.to_str(); + if (s == stringify(OMAP_PER_POOL)) { + per_pool_omap = OMAP_PER_POOL; + } else if (s == stringify(OMAP_PER_PG)) { + per_pool_omap = OMAP_PER_PG; + } else { + ceph_assert(s == stringify(OMAP_BULK)); + } + dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl; + } else { + dout(10) << __func__ << " per_pool_omap not present" << dendl; + } + _check_no_per_pg_or_pool_omap_alert(); +} + +void BlueStore::_open_statfs() +{ + osd_pools.clear(); + vstatfs.reset(); + + bufferlist bl; + int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl); + if (r >= 0) { + per_pool_stat_collection = false; + if (size_t(bl.length()) >= sizeof(vstatfs.values)) { + auto it = bl.cbegin(); + vstatfs.decode(it); + dout(10) << __func__ << " store_statfs is found" << dendl; + } else { + dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl; + } + _check_legacy_statfs_alert(); + } else { + per_pool_stat_collection = true; + 
dout(10) << __func__ << " per-pool statfs is enabled" << dendl; + KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + + uint64_t pool_id; + int r = get_key_pool_stat(it->key(), &pool_id); + ceph_assert(r == 0); + + bufferlist bl; + bl = it->value(); + auto p = bl.cbegin(); + auto& st = osd_pools[pool_id]; + try { + st.decode(p); + vstatfs += st; + + dout(10) << __func__ << " pool " << std::hex << pool_id + << " statfs(hex) " << st + << std::dec << dendl; + } catch (ceph::buffer::error& e) { + derr << __func__ << " failed to decode pool stats, key:" + << pretty_binary_string(it->key()) << dendl; + } + } + } + dout(10) << __func__ << " statfs " << std::hex + << vstatfs << std::dec << dendl; + +} + +int BlueStore::_setup_block_symlink_or_file( + string name, + string epath, + uint64_t size, + bool create) +{ + dout(20) << __func__ << " name " << name << " path " << epath + << " size " << size << " create=" << (int)create << dendl; + int r = 0; + int flags = O_RDWR|O_CLOEXEC; + if (create) + flags |= O_CREAT; + if (epath.length()) { + r = ::symlinkat(epath.c_str(), path_fd, name.c_str()); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to create " << name << " symlink to " + << epath << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) { + int fd = ::openat(path_fd, epath.c_str(), flags, 0644); + if (fd < 0) { + r = -errno; + derr << __func__ << " failed to open " << epath << " file: " + << cpp_strerror(r) << dendl; + return r; + } + // write the Transport ID of the NVMe device + // a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0" + // where "0000:02:00.0" is the selector of a PCI device, see + // the first column of "lspci -mm -n -D" + // a transport id for tcp looks like: "trype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420" + string trid = epath.substr(strlen(SPDK_PREFIX)); + r = ::write(fd, trid.c_str(), trid.size()); + ceph_assert(r == static_cast<int>(trid.size())); + dout(1) << __func__ << " created " << name << " symlink to " + << epath << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + } + if (size) { + int fd = ::openat(path_fd, name.c_str(), flags, 0644); + if (fd >= 0) { + // block file is present + struct stat st; + int r = ::fstat(fd, &st); + if (r == 0 && + S_ISREG(st.st_mode) && // if it is a regular file + st.st_size == 0) { // and is 0 bytes + r = ::ftruncate(fd, size); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to resize " << name << " file to " + << size << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + + if (cct->_conf->bluestore_block_preallocate_file) { + r = ::ceph_posix_fallocate(fd, 0, size); + if (r > 0) { + derr << __func__ << " failed to prefallocate " << name << " file to " + << size << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return -r; + } + } + dout(1) << __func__ << " resized " << name << " file to " + << byte_u_t(size) << dendl; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } else { + int r = -errno; + if (r != -ENOENT) { + derr << __func__ << " failed to open " << name << " file: " + << cpp_strerror(r) << dendl; + return r; + } + } + } + return 0; +} + +int BlueStore::mkfs() +{ + dout(1) << __func__ << " path " << path << dendl; + int r; + uuid_d old_fsid; + uint64_t reserved; + if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) { + derr << __func__ << " 
osd_max_object_size " + << cct->_conf->osd_max_object_size << " > bluestore max " + << OBJECT_MAX_SIZE << dendl; + return -EINVAL; + } + + { + string done; + r = read_meta("mkfs_done", &done); + if (r == 0) { + dout(1) << __func__ << " already created" << dendl; + if (cct->_conf->bluestore_fsck_on_mkfs) { + r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep); + if (r < 0) { + derr << __func__ << " fsck found fatal error: " << cpp_strerror(r) + << dendl; + return r; + } + if (r > 0) { + derr << __func__ << " fsck found " << r << " errors" << dendl; + r = -EIO; + } + } + return r; // idempotent + } + } + + { + string type; + r = read_meta("type", &type); + if (r == 0) { + if (type != "bluestore") { + derr << __func__ << " expected bluestore, but type is " << type << dendl; + return -EIO; + } + } else { + r = write_meta("type", "bluestore"); + if (r < 0) + return r; + } + } + + r = _open_path(); + if (r < 0) + return r; + + r = _open_fsid(true); + if (r < 0) + goto out_path_fd; + + r = _lock_fsid(); + if (r < 0) + goto out_close_fsid; + + r = _read_fsid(&old_fsid); + if (r < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __func__ << " generated fsid " << fsid << dendl; + } else { + dout(1) << __func__ << " using provided fsid " << fsid << dendl; + } + // we'll write it later. + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __func__ << " on-disk fsid " << old_fsid + << " != provided " << fsid << dendl; + r = -EINVAL; + goto out_close_fsid; + } + fsid = old_fsid; + } + + r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path, + cct->_conf->bluestore_block_size, + cct->_conf->bluestore_block_create); + if (r < 0) + goto out_close_fsid; + if (cct->_conf->bluestore_bluefs) { + r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path, + cct->_conf->bluestore_block_wal_size, + cct->_conf->bluestore_block_wal_create); + if (r < 0) + goto out_close_fsid; + r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path, + cct->_conf->bluestore_block_db_size, + cct->_conf->bluestore_block_db_create); + if (r < 0) + goto out_close_fsid; + } + + r = _open_bdev(true); + if (r < 0) + goto out_close_fsid; + + // choose freelist manager +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + freelist_type = "zoned"; + zone_size = bdev->get_zone_size(); + first_sequential_zone = bdev->get_conventional_region_size() / zone_size; + bdev->reset_all_zones(); + } else +#endif + { + freelist_type = "bitmap"; + } + dout(10) << " freelist_type " << freelist_type << dendl; + + // choose min_alloc_size + dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size + << " block_size: 0x" << block_size << std::dec << dendl; + if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) { + dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size + << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl; + min_alloc_size = optimal_io_size; + } + else if (cct->_conf->bluestore_min_alloc_size) { + min_alloc_size = cct->_conf->bluestore_min_alloc_size; + } else { + ceph_assert(bdev); + if (_use_rotational_settings()) { + min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd; + } else { + min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd; + } + } + _validate_bdev(); + + // make sure min_alloc_size is power of 2 aligned. 
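+  // (std::has_single_bit() is the C++20 test for "exactly one bit set", i.e. a power of two)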
+ if (!std::has_single_bit(min_alloc_size)) { + derr << __func__ << " min_alloc_size 0x" + << std::hex << min_alloc_size << std::dec + << " is not power of 2 aligned!" + << dendl; + r = -EINVAL; + goto out_close_bdev; + } + + // make sure min_alloc_size is >= and aligned with block size + if (min_alloc_size % block_size != 0) { + derr << __func__ << " min_alloc_size 0x" + << std::hex << min_alloc_size + << " is less or not aligned with block_size: 0x" + << block_size << std::dec << dendl; + r = -EINVAL; + goto out_close_bdev; + } + + r = _create_alloc(); + if (r < 0) { + goto out_close_bdev; + } + + reserved = _get_ondisk_reserved(); + alloc->init_add_free(reserved, + p2align(bdev->get_size(), min_alloc_size) - reserved); +#ifdef HAVE_LIBZBD + if (bdev->is_smr() && alloc != shared_alloc.a) { + shared_alloc.a->init_add_free(reserved, + p2align(bdev->get_conventional_region_size(), + min_alloc_size) - reserved); + } +#endif + + r = _open_db(true); + if (r < 0) + goto out_close_alloc; + + { + KeyValueDB::Transaction t = db->get_transaction(); + r = _open_fm(t, false, true); + if (r < 0) + goto out_close_db; + { + bufferlist bl; + encode((uint64_t)0, bl); + t->set(PREFIX_SUPER, "nid_max", bl); + t->set(PREFIX_SUPER, "blobid_max", bl); + } + + { + bufferlist bl; + encode((uint64_t)min_alloc_size, bl); + t->set(PREFIX_SUPER, "min_alloc_size", bl); + } + { + bufferlist bl; + if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) { + bl.append(stringify(OMAP_BULK)); + } else { + bl.append(stringify(OMAP_PER_PG)); + } + t->set(PREFIX_SUPER, "per_pool_omap", bl); + } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + { + bufferlist bl; + encode((uint64_t)zone_size, bl); + t->set(PREFIX_SUPER, "zone_size", bl); + } + { + bufferlist bl; + encode((uint64_t)first_sequential_zone, bl); + t->set(PREFIX_SUPER, "first_sequential_zone", bl); + } + } +#endif + + ondisk_format = latest_ondisk_format; + _prepare_ondisk_format_super(t); + db->submit_transaction_sync(t); + } + + r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend); + if (r < 0) + goto out_close_fm; + + r = write_meta("bluefs", stringify(bluefs ? 
1 : 0)); + if (r < 0) + goto out_close_fm; + + if (fsid != old_fsid) { + r = _write_fsid(); + if (r < 0) { + derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl; + goto out_close_fm; + } + } + + out_close_fm: + _close_fm(); + out_close_db: + _close_db(); + out_close_alloc: + _close_alloc(); + out_close_bdev: + _close_bdev(); + out_close_fsid: + _close_fsid(); + out_path_fd: + _close_path(); + + if (r == 0 && + cct->_conf->bluestore_fsck_on_mkfs) { + int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep); + if (rc < 0) + return rc; + if (rc > 0) { + derr << __func__ << " fsck found " << rc << " errors" << dendl; + r = -EIO; + } + } + + if (r == 0) { + // indicate success by writing the 'mkfs_done' file + r = write_meta("mkfs_done", "yes"); + } + + if (r < 0) { + derr << __func__ << " failed, " << cpp_strerror(r) << dendl; + } else { + dout(0) << __func__ << " success" << dendl; + } + return r; +} + +int BlueStore::add_new_bluefs_device(int id, const string& dev_path) +{ + dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; + int r; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl; + r = _open_db_and_around(true); + if (r < 0) { + return r; + } + + if (id == BlueFS::BDEV_NEWWAL) { + string p = path + "/block.wal"; + r = _setup_block_symlink_or_file("block.wal", dev_path, + cct->_conf->bluestore_block_wal_size, + true); + ceph_assert(r == 0); + + r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p, + cct->_conf->bdev_enable_discard, + BDEV_LABEL_BLOCK_SIZE); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { + r = _check_or_set_bdev_label( + p, + bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL), + "bluefs wal", + true); + ceph_assert(r == 0); + } + + bluefs_layout.dedicated_wal = true; + } else if (id == BlueFS::BDEV_NEWDB) { + string p = path + "/block.db"; + r = _setup_block_symlink_or_file("block.db", dev_path, + cct->_conf->bluestore_block_db_size, + true); + ceph_assert(r == 0); + + r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p, + cct->_conf->bdev_enable_discard, + SUPER_RESERVED); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { + r = _check_or_set_bdev_label( + p, + bluefs->get_block_device_size(BlueFS::BDEV_NEWDB), + "bluefs db", + true); + ceph_assert(r == 0); + } + bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; + bluefs_layout.dedicated_db = true; + } + bluefs->umount(); + bluefs->mount(); + + r = bluefs->prepare_new_device(id, bluefs_layout); + ceph_assert(r == 0); + + if (r < 0) { + derr << __func__ << " failed, " << cpp_strerror(r) << dendl; + } else { + dout(0) << __func__ << " success" << dendl; + } + + _close_db_and_around(); + return r; +} + +int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source, + int id) +{ + dout(10) << __func__ << " id:" << id << dendl; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + + int r = _open_db_and_around(true); + if (r < 0) { + return r; + } + auto close_db = make_scope_guard([&] { + _close_db_and_around(); + }); + uint64_t used_space = 0; + for(auto src_id : 
devs_source) { + used_space += bluefs->get_used(src_id); + } + uint64_t target_free = bluefs->get_free(id); + if (target_free < used_space) { + derr << __func__ + << " can't migrate, free space at target: " << target_free + << " is less than required space: " << used_space + << dendl; + return -ENOSPC; + } + if (devs_source.count(BlueFS::BDEV_DB)) { + bluefs_layout.shared_bdev = BlueFS::BDEV_DB; + bluefs_layout.dedicated_db = false; + } + if (devs_source.count(BlueFS::BDEV_WAL)) { + bluefs_layout.dedicated_wal = false; + } + r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout); + if (r < 0) { + derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl; + return r; + } + + if (devs_source.count(BlueFS::BDEV_DB)) { + r = unlink(string(path + "/block.db").c_str()); + ceph_assert(r == 0); + } + if (devs_source.count(BlueFS::BDEV_WAL)) { + r = unlink(string(path + "/block.wal").c_str()); + ceph_assert(r == 0); + } + return r; +} + +int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source, + int id, + const string& dev_path) +{ + dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + + int r = _open_db_and_around(true); + if (r < 0) { + return r; + } + auto close_db = make_scope_guard([&] { + _close_db_and_around(); + }); + + string link_db; + string link_wal; + if (devs_source.count(BlueFS::BDEV_DB) && + bluefs_layout.shared_bdev != BlueFS::BDEV_DB) { + link_db = path + "/block.db"; + bluefs_layout.shared_bdev = BlueFS::BDEV_DB; + bluefs_layout.dedicated_db = false; + } + if (devs_source.count(BlueFS::BDEV_WAL)) { + link_wal = path + "/block.wal"; + bluefs_layout.dedicated_wal = false; + } + + size_t target_size = 0; + string target_name; + if (id == BlueFS::BDEV_NEWWAL) { + target_name = "block.wal"; + target_size = cct->_conf->bluestore_block_wal_size; + bluefs_layout.dedicated_wal = true; + + r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path, + cct->_conf->bdev_enable_discard, + BDEV_LABEL_BLOCK_SIZE); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { + r = _check_or_set_bdev_label( + dev_path, + bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL), + "bluefs wal", + true); + ceph_assert(r == 0); + } + } else if (id == BlueFS::BDEV_NEWDB) { + target_name = "block.db"; + target_size = cct->_conf->bluestore_block_db_size; + bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; + bluefs_layout.dedicated_db = true; + + r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path, + cct->_conf->bdev_enable_discard, + SUPER_RESERVED); + ceph_assert(r == 0); + + if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { + r = _check_or_set_bdev_label( + dev_path, + bluefs->get_block_device_size(BlueFS::BDEV_NEWDB), + "bluefs db", + true); + ceph_assert(r == 0); + } + } + + bluefs->umount(); + bluefs->mount(); + + r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout); + + if (r < 0) { + derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl; + return r; + } + + if (!link_db.empty()) { + r = unlink(link_db.c_str()); + ceph_assert(r == 0); + } + if (!link_wal.empty()) { + r = unlink(link_wal.c_str()); + ceph_assert(r == 0); + } + r = _setup_block_symlink_or_file( + target_name, + dev_path, + target_size, + true); + 
ceph_assert(r == 0); + dout(0) << __func__ << " success" << dendl; + + return r; +} + +string BlueStore::get_device_path(unsigned id) +{ + string res; + if (id < BlueFS::MAX_BDEV) { + switch (id) { + case BlueFS::BDEV_WAL: + res = path + "/block.wal"; + break; + case BlueFS::BDEV_DB: + if (id == bluefs_layout.shared_bdev) { + res = path + "/block"; + } else { + res = path + "/block.db"; + } + break; + case BlueFS::BDEV_SLOW: + res = path + "/block"; + break; + } + } + return res; +} + +int BlueStore::_set_bdev_label_size(const string& path, uint64_t size) +{ + bluestore_bdev_label_t label; + int r = _read_bdev_label(cct, path, &label); + if (r < 0) { + derr << "unable to read label for " << path << ": " + << cpp_strerror(r) << dendl; + } else { + label.size = size; + r = _write_bdev_label(cct, path, label); + if (r < 0) { + derr << "unable to write label for " << path << ": " + << cpp_strerror(r) << dendl; + } + } + return r; +} + +int BlueStore::expand_devices(ostream& out) +{ + int r = _open_db_and_around(true); + ceph_assert(r == 0); + bluefs->dump_block_extents(out); + out << "Expanding DB/WAL..." << std::endl; + for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) { + if (devid == bluefs_layout.shared_bdev ) { + continue; + } + uint64_t size = bluefs->get_block_device_size(devid); + if (size == 0) { + // no bdev + continue; + } + + out << devid + <<" : expanding " << " to 0x" << size << std::dec << std::endl; + string p = get_device_path(devid); + const char* path = p.c_str(); + if (path == nullptr) { + derr << devid + <<": can't find device path " << dendl; + continue; + } + if (bluefs->bdev_support_label(devid)) { + if (_set_bdev_label_size(p, size) >= 0) { + out << devid + << " : size label updated to " << size + << std::endl; + } + } + } + uint64_t size0 = fm->get_size(); + uint64_t size = bdev->get_size(); + if (size0 < size) { + out << bluefs_layout.shared_bdev + << " : expanding " << " from 0x" << std::hex + << size0 << " to 0x" << size << std::dec << std::endl; + _write_out_fm_meta(size); + if (bdev->supported_bdev_label()) { + if (_set_bdev_label_size(path, size) >= 0) { + out << bluefs_layout.shared_bdev + << " : size label updated to " << size + << std::endl; + } + } + + if (fm && fm->is_null_manager()) { + // we grow the allocation range, must reflect it in the allocation file + alloc->init_add_free(size0, size - size0); + need_to_destage_allocation_file = true; + } + _close_db_and_around(); + + // mount in read/write to sync expansion changes + r = _mount(); + ceph_assert(r == 0); + umount(); + } else { + _close_db_and_around(); + } + return r; +} + +int BlueStore::dump_bluefs_sizes(ostream& out) +{ + int r = _open_db_and_around(true); + ceph_assert(r == 0); + bluefs->dump_block_extents(out); + _close_db_and_around(); + return r; +} + +void BlueStore::set_cache_shards(unsigned num) +{ + dout(10) << __func__ << " " << num << dendl; + size_t oold = onode_cache_shards.size(); + size_t bold = buffer_cache_shards.size(); + ceph_assert(num >= oold && num >= bold); + onode_cache_shards.resize(num); + buffer_cache_shards.resize(num); + for (unsigned i = oold; i < num; ++i) { + onode_cache_shards[i] = + OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type, + logger); + } + for (unsigned i = bold; i < num; ++i) { + buffer_cache_shards[i] = + BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type, + logger); + } +} + +//--------------------------------------------- +bool BlueStore::has_null_manager() const +{ + return (fm && fm->is_null_manager()); +} + +int 
BlueStore::_mount() +{ + dout(5) << __func__ << "NCB:: path " << path << dendl; + + _kv_only = false; + if (cct->_conf->bluestore_fsck_on_mount) { + dout(5) << __func__ << "::NCB::calling fsck()" << dendl; + int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep); + if (rc < 0) + return rc; + if (rc > 0) { + derr << __func__ << " fsck found " << rc << " errors" << dendl; + return -EIO; + } + } + + if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) { + derr << __func__ << " osd_max_object_size " + << cct->_conf->osd_max_object_size << " > bluestore max " + << OBJECT_MAX_SIZE << dendl; + return -EINVAL; + } + + dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl; + int r = _open_db_and_around(false); + if (r < 0) { + return r; + } + auto close_db = make_scope_guard([&] { + if (!mounted) { + _close_db_and_around(); + } + }); + + r = _upgrade_super(); + if (r < 0) { + return r; + } + + // The recovery process for allocation-map needs to open collection early + r = _open_collections(); + if (r < 0) { + return r; + } + auto shutdown_cache = make_scope_guard([&] { + if (!mounted) { + _shutdown_cache(); + } + }); + + r = _reload_logger(); + if (r < 0) { + return r; + } + + _kv_start(); + auto stop_kv = make_scope_guard([&] { + if (!mounted) { + _kv_stop(); + } + }); + + r = _deferred_replay(); + if (r < 0) { + return r; + } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + _zoned_cleaner_start(); + } +#endif + + mempool_thread.init(); + + if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) && + cct->_conf->bluestore_fsck_quick_fix_on_mount == true) { + + auto was_per_pool_omap = per_pool_omap; + + dout(1) << __func__ << " quick-fix on mount" << dendl; + _fsck_on_open(FSCK_SHALLOW, true); + + //set again as hopefully it has been fixed + if (was_per_pool_omap != OMAP_PER_PG) { + _set_per_pool_omap(); + } + } + + mounted = true; + return 0; +} + +int BlueStore::umount() +{ + ceph_assert(_kv_only || mounted); + _osr_drain_all(); + + mounted = false; + + ceph_assert(alloc); + + if (!_kv_only) { + mempool_thread.shutdown(); +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + dout(20) << __func__ << " stopping zone cleaner thread" << dendl; + _zoned_cleaner_stop(); + } +#endif + dout(20) << __func__ << " stopping kv thread" << dendl; + _kv_stop(); + // skip cache cleanup step on fast shutdown + if (likely(!m_fast_shutdown)) { + _shutdown_cache(); + } + dout(20) << __func__ << " closing" << dendl; + } + _close_db_and_around(); + // disable fsck on fast-shutdown + if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) { + int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep); + if (rc < 0) + return rc; + if (rc > 0) { + derr << __func__ << " fsck found " << rc << " errors" << dendl; + return -EIO; + } + } + return 0; +} + +int BlueStore::cold_open() +{ + return _open_db_and_around(true); +} + +int BlueStore::cold_close() +{ + _close_db_and_around(); + return 0; +} + +// derr wrapper to limit enormous output and avoid log flooding. +// Of limited use where such output is expected for now +#define fsck_derr(err_cnt, threshold) \ + if (err_cnt <= threshold) { \ + bool need_skip_print = err_cnt == threshold; \ + derr + +#define fsck_dendl \ + dendl; \ + if (need_skip_print) \ + derr << "more error lines skipped..." 
<< dendl; \ + } + +int _fsck_sum_extents( + const PExtentVector& extents, + bool compressed, + store_statfs_t& expected_statfs) +{ + for (auto e : extents) { + if (!e.is_valid()) + continue; + expected_statfs.allocated += e.length; + if (compressed) { + expected_statfs.data_compressed_allocated += e.length; + } + } + return 0; +} + +int BlueStore::_fsck_check_extents( + std::string_view ctx_descr, + const PExtentVector& extents, + bool compressed, + mempool_dynamic_bitset &used_blocks, + uint64_t granularity, + BlueStoreRepairer* repairer, + store_statfs_t& expected_statfs, + FSCKDepth depth) +{ + dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl; + int errors = 0; + for (auto e : extents) { + if (!e.is_valid()) + continue; + expected_statfs.allocated += e.length; + if (compressed) { + expected_statfs.data_compressed_allocated += e.length; + } + if (depth != FSCK_SHALLOW) { + bool already = false; + apply_for_bitset_range( + e.offset, e.length, granularity, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + if (bs.test(pos)) { + if (repairer) { + repairer->note_misreference( + pos * min_alloc_size, min_alloc_size, !already); + } + if (!already) { + derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e + << " or a subset is already allocated (misreferenced)" << dendl; + ++errors; + already = true; + } + } + else + bs.set(pos); + }); + + if (e.end() > bdev->get_size()) { + derr << "fsck error: " << ctx_descr << ", extent " << e + << " past end of block device" << dendl; + ++errors; + } + } + } + return errors; +} + +void BlueStore::_fsck_check_statfs( + const store_statfs_t& expected_statfs, + const per_pool_statfs& expected_pool_statfs, + int64_t& errors, + int64_t& warnings, + BlueStoreRepairer* repairer) +{ + string key; + store_statfs_t actual_statfs; + store_statfs_t s; + { + // make a copy + per_pool_statfs my_expected_pool_statfs(expected_pool_statfs); + auto op = osd_pools.begin(); + while (op != osd_pools.end()) { + get_pool_stat_key(op->first, &key); + op->second.publish(&s); + auto it_expected = my_expected_pool_statfs.find(op->first); + if (it_expected == my_expected_pool_statfs.end()) { + auto op0 = op++; + if (op0->second.is_empty()) { + // It's OK to lack relevant empty statfs record + continue; + } + derr << __func__ << "::fsck error: " << std::hex + << "pool " << op0->first << " has got no statfs to match against: " + << s + << std::dec << dendl; + ++errors; + if (repairer) { + osd_pools.erase(op0); + repairer->remove_key(db, PREFIX_STAT, key); + } + } else { + if (!(s == it_expected->second)) { + derr << "fsck error: actual " << s + << " != expected " << it_expected->second + << " for pool " + << std::hex << op->first << std::dec << dendl; + ++errors; + if (repairer) { + // repair in-memory in a hope this would be flushed properly on shutdown + s = it_expected->second; + op->second = it_expected->second; + repairer->fix_statfs(db, key, it_expected->second); + } + } + actual_statfs.add(s); + my_expected_pool_statfs.erase(it_expected); + ++op; + } + } + // check stats that lack matching entities in osd_pools + for (auto &p : my_expected_pool_statfs) { + if (p.second.is_zero()) { + // It's OK to lack relevant empty statfs record + continue; + } + get_pool_stat_key(p.first, &key); + derr << __func__ << "::fsck error: " << std::hex + << "pool " << p.first << " has got no actual statfs: " + << std::dec << p.second + << dendl; + ++errors; + if (repairer) { + osd_pools[p.first] = p.second; + repairer->fix_statfs(db, key, 
p.second); + actual_statfs.add(p.second); + } + } + } + // process global statfs + if (repairer) { + if (!per_pool_stat_collection) { + // by virtue of running this method, we correct the top-level + // error of having global stats + repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY); + per_pool_stat_collection = true; + } + vstatfs = actual_statfs; + dout(20) << __func__ << " setting vstatfs to " << actual_statfs << dendl; + } else if (!per_pool_stat_collection) { + // check global stats only if fscking (not repairing) w/o per-pool stats + vstatfs.publish(&s); + if (!(s == expected_statfs)) { + derr << "fsck error: actual " << s + << " != expected " << expected_statfs << dendl; + ++errors; + } + } +} + +void BlueStore::_fsck_repair_shared_blobs( + BlueStoreRepairer& repairer, + shared_blob_2hash_tracker_t& sb_ref_counts, + sb_info_space_efficient_map_t& sb_info) +{ + auto sb_ref_mismatches = sb_ref_counts.count_non_zero(); + dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: " + << sb_ref_mismatches << dendl; + if (!sb_ref_mismatches) // not expected to succeed, just in case + return; + + + auto foreach_shared_blob = [&](std::function< + void (coll_t, + ghobject_t, + uint64_t, + const bluestore_blob_t&)> cb) { + auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + CollectionRef c; + spg_t pgid; + for (it->lower_bound(string()); it->valid(); it->next()) { + dout(30) << __func__ << " key " + << pretty_binary_string(it->key()) + << dendl; + if (is_extent_shard_key(it->key())) { + continue; + } + + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0) { + continue; + } + + if (!c || + oid.shard_id != pgid.shard || + oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || + !c->contains(oid)) { + c = nullptr; + for (auto& p : coll_map) { + if (p.second->contains(oid)) { + c = p.second; + break; + } + } + if (!c) { + continue; + } + } + dout(20) << __func__ + << " inspecting shared blob refs for col:" << c->cid + << " obj:" << oid + << dendl; + + OnodeRef o; + o.reset(Onode::create_decode(c, oid, it->key(), it->value())); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + + _dump_onode<30>(cct, *o); + + mempool::bluestore_fsck::set<BlobRef> passed_sbs; + for (auto& e : o->extent_map.extent_map) { + auto& b = e.blob->get_blob(); + if (b.is_shared() && passed_sbs.count(e.blob) == 0) { + auto sbid = e.blob->shared_blob->get_sbid(); + cb(c->cid, oid, sbid, b); + passed_sbs.emplace(e.blob); + } + } // for ... extent_map + } // for ... 
it->valid + } //if (it(PREFIX_OBJ)) + }; //foreach_shared_blob fn declaration + + mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map; + + // first iteration over objects to identify all the broken sbids + foreach_shared_blob( [&](coll_t cid, + ghobject_t oid, + uint64_t sbid, + const bluestore_blob_t& b) { + auto it = refs_map.lower_bound(sbid); + if(it != refs_map.end() && it->first == sbid) { + return; + } + for (auto& p : b.get_extents()) { + if (p.is_valid() && + !sb_ref_counts.test_all_zero_range(sbid, + p.offset, + p.length)) { + refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t()); + dout(20) << __func__ + << " broken shared blob found for col:" << cid + << " obj:" << oid + << " sbid 0x " << std::hex << sbid << std::dec + << dendl; + break; + } + } + }); + + // second iteration over objects to build new ref map for the broken sbids + foreach_shared_blob( [&](coll_t cid, + ghobject_t oid, + uint64_t sbid, + const bluestore_blob_t& b) { + auto it = refs_map.find(sbid); + if(it == refs_map.end()) { + return; + } + for (auto& p : b.get_extents()) { + if (p.is_valid()) { + it->second.get(p.offset, p.length); + break; + } + } + }); + + // update shared blob records + auto ref_it = refs_map.begin(); + while (ref_it != refs_map.end()) { + size_t cnt = 0; + const size_t max_transactions = 4096; + KeyValueDB::Transaction txn = db->get_transaction(); + for (cnt = 0; + cnt < max_transactions && ref_it != refs_map.end(); + ref_it++) { + auto sbid = ref_it->first; + dout(20) << __func__ << " repaired shared_blob 0x" + << std::hex << sbid << std::dec + << ref_it->second << dendl; + repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0); + cnt++; + } + if (cnt) { + db->submit_transaction_sync(txn); + cnt = 0; + } + } + // remove stray shared blob records + size_t cnt = 0; + const size_t max_transactions = 4096; + KeyValueDB::Transaction txn = db->get_transaction(); + sb_info.foreach_stray([&](const sb_info_t& sbi) { + auto sbid = sbi.get_sbid(); + dout(20) << __func__ << " removing stray shared_blob 0x" + << std::hex << sbid << std::dec + << dendl; + repairer.fix_shared_blob(txn, sbid, nullptr, 0); + cnt++; + if (cnt >= max_transactions) {} + db->submit_transaction_sync(txn); + txn = db->get_transaction(); + cnt = 0; + }); + if (cnt > 0) { + db->submit_transaction_sync(txn); + } + + // amount of repairs to report to be equal to previously + // determined error estimation, not the actual number of updated shared blobs + repairer.inc_repaired(sb_ref_mismatches); +} + +BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow( + BlueStore::FSCKDepth depth, + int64_t pool_id, + BlueStore::CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& value, + mempool::bluestore_fsck::list<string>* expecting_shards, + map<BlobRef, bluestore_blob_t::unused_t>* referenced, + const BlueStore::FSCK_ObjectCtx& ctx) +{ + auto& errors = ctx.errors; + auto& num_objects = ctx.num_objects; + auto& num_extents = ctx.num_extents; + auto& num_blobs = ctx.num_blobs; + auto& num_sharded_objects = ctx.num_sharded_objects; + auto& num_spanning_blobs = ctx.num_spanning_blobs; + auto used_blocks = ctx.used_blocks; + auto sb_info_lock = ctx.sb_info_lock; + auto& sb_info = ctx.sb_info; + auto& sb_ref_counts = ctx.sb_ref_counts; + auto repairer = ctx.repairer; + + store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ? 
+ &ctx.expected_pool_statfs[pool_id] : + &ctx.expected_store_statfs; + + map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices + + dout(10) << __func__ << " " << oid << dendl; + OnodeRef o; + o.reset(Onode::create_decode(c, oid, key, value)); + ++num_objects; + + num_spanning_blobs += o->extent_map.spanning_blob_map.size(); + + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + _dump_onode<30>(cct, *o); + // shards + if (!o->extent_map.shards.empty()) { + ++num_sharded_objects; + if (depth != FSCK_SHALLOW) { + ceph_assert(expecting_shards); + for (auto& s : o->extent_map.shards) { + dout(20) << __func__ << " shard " << *s.shard_info << dendl; + expecting_shards->push_back(string()); + get_extent_shard_key(o->key, s.shard_info->offset, + &expecting_shards->back()); + if (s.shard_info->offset >= o->onode.size) { + derr << "fsck error: " << oid << " shard 0x" << std::hex + << s.shard_info->offset << " past EOF at 0x" << o->onode.size + << std::dec << dendl; + ++errors; + } + } + } + } + + // lextents + uint64_t pos = 0; + mempool::bluestore_fsck::map<BlobRef, + bluestore_blob_use_tracker_t> ref_map; + for (auto& l : o->extent_map.extent_map) { + dout(20) << __func__ << " " << l << dendl; + if (l.logical_offset < pos) { + derr << "fsck error: " << oid << " lextent at 0x" + << std::hex << l.logical_offset + << " overlaps with the previous, which ends at 0x" << pos + << std::dec << dendl; + ++errors; + } + if (depth != FSCK_SHALLOW && + o->extent_map.spans_shard(l.logical_offset, l.length)) { + derr << "fsck error: " << oid << " lextent at 0x" + << std::hex << l.logical_offset << "~" << l.length + << " spans a shard boundary" + << std::dec << dendl; + ++errors; + } + pos = l.logical_offset + l.length; + res_statfs->data_stored += l.length; + ceph_assert(l.blob); + const bluestore_blob_t& blob = l.blob->get_blob(); + +#ifdef HAVE_LIBZBD + if (bdev->is_smr() && depth != FSCK_SHALLOW) { + for (auto& e : blob.get_extents()) { + if (e.is_valid()) { + uint32_t zone = e.offset / zone_size; + uint64_t offset = e.offset % zone_size; + auto p = zone_first_offsets.find(zone); + if (p == zone_first_offsets.end() || p->second > offset) { + // FIXME: use interator for guided insert? + zone_first_offsets[zone] = offset; + } + } + } + } +#endif + + auto& ref = ref_map[l.blob]; + if (ref.is_empty()) { + uint32_t min_release_size = blob.get_release_size(min_alloc_size); + uint32_t l = blob.get_logical_length(); + ref.init(l, min_release_size); + } + ref.get( + l.blob_offset, + l.length); + ++num_extents; + if (depth != FSCK_SHALLOW && + blob.has_unused()) { + ceph_assert(referenced); + auto p = referenced->find(l.blob); + bluestore_blob_t::unused_t* pu; + if (p == referenced->end()) { + pu = &(*referenced)[l.blob]; + } + else { + pu = &p->second; + } + uint64_t blob_len = blob.get_logical_length(); + ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0); + ceph_assert(l.blob_offset + l.length <= blob_len); + uint64_t chunk_size = blob_len / (sizeof(*pu) * 8); + uint64_t start = l.blob_offset / chunk_size; + uint64_t end = + round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size; + for (auto i = start; i < end; ++i) { + (*pu) |= (1u << i); + } + } + } //for (auto& l : o->extent_map.extent_map) + + for (auto& i : ref_map) { + ++num_blobs; + const bluestore_blob_t& blob = i.first->get_blob(); + bool equal = + depth == FSCK_SHALLOW ? 
true : + i.first->get_blob_use_tracker().equal(i.second); + if (!equal) { + derr << "fsck error: " << oid << " blob " << *i.first + << " doesn't match expected ref_map " << i.second << dendl; + ++errors; + } + if (blob.is_compressed()) { + res_statfs->data_compressed += blob.get_compressed_payload_length(); + res_statfs->data_compressed_original += + i.first->get_referenced_bytes(); + } + if (depth != FSCK_SHALLOW && repairer) { + for (auto e : blob.get_extents()) { + if (!e.is_valid()) + continue; + repairer->set_space_used(e.offset, e.length, c->cid, oid); + } + } + if (blob.is_shared()) { + if (i.first->shared_blob->get_sbid() > blobid_max) { + derr << "fsck error: " << oid << " blob " << blob + << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max " + << blobid_max << dendl; + ++errors; + } else if (i.first->shared_blob->get_sbid() == 0) { + derr << "fsck error: " << oid << " blob " << blob + << " marked as shared but has uninitialized sbid" + << dendl; + ++errors; + } + // the below lock is optional and provided in multithreading mode only + if (sb_info_lock) { + sb_info_lock->lock(); + } + auto sbid = i.first->shared_blob->get_sbid(); + sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid()); + ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID || + sbi.pool_id == oid.hobj.get_logical_pool()); + sbi.pool_id = oid.hobj.get_logical_pool(); + bool compressed = blob.is_compressed(); + for (auto e : blob.get_extents()) { + if (e.is_valid()) { + if (compressed) { + ceph_assert(sbi.allocated_chunks <= 0); + sbi.allocated_chunks -= (e.length >> min_alloc_size_order); + } else { + ceph_assert(sbi.allocated_chunks >= 0); + sbi.allocated_chunks += (e.length >> min_alloc_size_order); + } + sb_ref_counts.inc_range(sbid, e.offset, e.length, 1); + } + } + if (sb_info_lock) { + sb_info_lock->unlock(); + } + } else if (depth != FSCK_SHALLOW) { + ceph_assert(used_blocks); + string ctx_descr = " oid " + stringify(oid); + errors += _fsck_check_extents(ctx_descr, + blob.get_extents(), + blob.is_compressed(), + *used_blocks, + fm->get_alloc_size(), + repairer, + *res_statfs, + depth); + } else { + errors += _fsck_sum_extents( + blob.get_extents(), + blob.is_compressed(), + *res_statfs); + } + } // for (auto& i : ref_map) + + { + auto &sbm = o->extent_map.spanning_blob_map; + size_t broken = 0; + BlobRef first_broken; + for (auto it = sbm.begin(); it != sbm.end();) { + auto it1 = it++; + if (ref_map.count(it1->second) == 0) { + if (!broken) { + first_broken = it1->second; + ++errors; + derr << "fsck error:" << " stray spanning blob found:" << it1->first + << dendl; + } + broken++; + if (repairer) { + sbm.erase(it1); + } + } + } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr() && depth != FSCK_SHALLOW) { + for (auto& [zone, first_offset] : zone_first_offsets) { + auto p = (*ctx.zone_refs)[zone].find(oid); + if (p != (*ctx.zone_refs)[zone].end()) { + if (first_offset < p->second) { + dout(20) << " slightly wonky zone ref 0x" << std::hex << zone + << " offset 0x" << p->second + << " but first offset is 0x" << first_offset + << "; this can happen due to clone_range" + << dendl; + } else { + dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second + << " <= first offset 0x" << first_offset + << std::dec << dendl; + } + (*ctx.zone_refs)[zone].erase(p); + } else { + derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone + << " but there is no zone ref" << std::dec << dendl; + // FIXME: add repair + ++errors; + } + } + } +#endif + + if (broken) { 
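+ // 'broken' counts spanning blobs that no extent in this onode references
+ // (zombies). In repair mode they were already erased from the spanning blob
+ // map above; re-recording the onode below persists their removal.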
+ derr << "fsck error: " << oid << " - " << broken + << " zombie spanning blob(s) found, the first one: " + << *first_broken << dendl; + if(repairer) { + repairer->fix_spanning_blobs( + db, + [&](KeyValueDB::Transaction txn) { + _record_onode(o, txn); + }); + } + } + } + + if (o->onode.has_omap()) { + _fsck_check_object_omap(depth, o, ctx); + } + + return o; +} + +#include "common/WorkQueue.h" + +class ShallowFSCKThreadPool : public ThreadPool +{ +public: + ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) : + ThreadPool(cct_, nm, tn, n) { + } + void worker(ThreadPool::WorkThread* wt) override { + int next_wq = 0; + while (!_stop) { + next_wq %= work_queues.size(); + WorkQueue_ *wq = work_queues[next_wq++]; + + void* item = wq->_void_dequeue(); + if (item) { + processing++; + TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval); + wq->_void_process(item, tp_handle); + processing--; + } + } + } + template <size_t BatchLen> + struct FSCKWorkQueue : public ThreadPool::WorkQueue_ + { + struct Entry { + int64_t pool_id; + BlueStore::CollectionRef c; + ghobject_t oid; + string key; + bufferlist value; + }; + struct Batch { + std::atomic<size_t> running = { 0 }; + size_t entry_count = 0; + std::array<Entry, BatchLen> entries; + + int64_t errors = 0; + int64_t warnings = 0; + uint64_t num_objects = 0; + uint64_t num_extents = 0; + uint64_t num_blobs = 0; + uint64_t num_sharded_objects = 0; + uint64_t num_spanning_blobs = 0; + store_statfs_t expected_store_statfs; + BlueStore::per_pool_statfs expected_pool_statfs; + }; + + size_t batchCount; + BlueStore* store = nullptr; + + ceph::mutex* sb_info_lock = nullptr; + sb_info_space_efficient_map_t* sb_info = nullptr; + shared_blob_2hash_tracker_t* sb_ref_counts = nullptr; + BlueStoreRepairer* repairer = nullptr; + + Batch* batches = nullptr; + size_t last_batch_pos = 0; + bool batch_acquired = false; + + FSCKWorkQueue(std::string n, + size_t _batchCount, + BlueStore* _store, + ceph::mutex* _sb_info_lock, + sb_info_space_efficient_map_t& _sb_info, + shared_blob_2hash_tracker_t& _sb_ref_counts, + BlueStoreRepairer* _repairer) : + WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()), + batchCount(_batchCount), + store(_store), + sb_info_lock(_sb_info_lock), + sb_info(&_sb_info), + sb_ref_counts(&_sb_ref_counts), + repairer(_repairer) + { + batches = new Batch[batchCount]; + } + ~FSCKWorkQueue() { + delete[] batches; + } + + /// Remove all work items from the queue. + void _clear() override { + //do nothing + } + /// Check whether there is anything to do. + bool _empty() override { + ceph_assert(false); + } + + /// Get the next work item to process. + void* _void_dequeue() override { + size_t pos = rand() % batchCount; + size_t pos0 = pos; + do { + auto& batch = batches[pos]; + if (batch.running.fetch_add(1) == 0) { + if (batch.entry_count) { + return &batch; + } + } + batch.running--; + pos++; + pos %= batchCount; + } while (pos != pos0); + return nullptr; + } + /** @brief Process the work item. + * This function will be called several times in parallel + * and must therefore be thread-safe. 
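+ * Mutual exclusion per batch comes from _void_dequeue(), which hands out a
+ * batch only after atomically claiming it via Batch::running.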
*/ + void _void_process(void* item, TPHandle& handle) override { + Batch* batch = (Batch*)item; + + BlueStore::FSCK_ObjectCtx ctx( + batch->errors, + batch->warnings, + batch->num_objects, + batch->num_extents, + batch->num_blobs, + batch->num_sharded_objects, + batch->num_spanning_blobs, + nullptr, // used_blocks + nullptr, //used_omap_head + nullptr, + sb_info_lock, + *sb_info, + *sb_ref_counts, + batch->expected_store_statfs, + batch->expected_pool_statfs, + repairer); + + for (size_t i = 0; i < batch->entry_count; i++) { + auto& entry = batch->entries[i]; + + store->fsck_check_objects_shallow( + BlueStore::FSCK_SHALLOW, + entry.pool_id, + entry.c, + entry.oid, + entry.key, + entry.value, + nullptr, // expecting_shards - this will need a protection if passed + nullptr, // referenced + ctx); + } + batch->entry_count = 0; + batch->running--; + } + /** @brief Synchronously finish processing a work item. + * This function is called after _void_process with the global thread pool lock held, + * so at most one copy will execute simultaneously for a given thread pool. + * It can be used for non-thread-safe finalization. */ + void _void_process_finish(void*) override { + ceph_assert(false); + } + + bool queue( + int64_t pool_id, + BlueStore::CollectionRef c, + const ghobject_t& oid, + const string& key, + const bufferlist& value) { + bool res = false; + size_t pos0 = last_batch_pos; + if (!batch_acquired) { + do { + auto& batch = batches[last_batch_pos]; + if (batch.running.fetch_add(1) == 0) { + if (batch.entry_count < BatchLen) { + batch_acquired = true; + break; + } + } + batch.running.fetch_sub(1); + last_batch_pos++; + last_batch_pos %= batchCount; + } while (last_batch_pos != pos0); + } + if (batch_acquired) { + auto& batch = batches[last_batch_pos]; + ceph_assert(batch.running); + ceph_assert(batch.entry_count < BatchLen); + + auto& entry = batch.entries[batch.entry_count]; + entry.pool_id = pool_id; + entry.c = c; + entry.oid = oid; + entry.key = key; + entry.value = value; + + ++batch.entry_count; + if (batch.entry_count == BatchLen) { + batch_acquired = false; + batch.running.fetch_sub(1); + last_batch_pos++; + last_batch_pos %= batchCount; + } + res = true; + } + return res; + } + + void finalize(ThreadPool& tp, + BlueStore::FSCK_ObjectCtx& ctx) { + if (batch_acquired) { + auto& batch = batches[last_batch_pos]; + ceph_assert(batch.running); + batch.running.fetch_sub(1); + } + tp.stop(); + + for (size_t i = 0; i < batchCount; i++) { + auto& batch = batches[i]; + + //process leftovers if any + if (batch.entry_count) { + TPHandle tp_handle(store->cct, + nullptr, + timeout_interval, + suicide_interval); + ceph_assert(batch.running == 0); + + batch.running++; // just to be on-par with the regular call + _void_process(&batch, tp_handle); + } + ceph_assert(batch.entry_count == 0); + + ctx.errors += batch.errors; + ctx.warnings += batch.warnings; + ctx.num_objects += batch.num_objects; + ctx.num_extents += batch.num_extents; + ctx.num_blobs += batch.num_blobs; + ctx.num_sharded_objects += batch.num_sharded_objects; + ctx.num_spanning_blobs += batch.num_spanning_blobs; + + ctx.expected_store_statfs.add(batch.expected_store_statfs); + + for (auto it = batch.expected_pool_statfs.begin(); + it != batch.expected_pool_statfs.end(); + it++) { + ctx.expected_pool_statfs[it->first].add(it->second); + } + } + } + }; +}; + +void BlueStore::_fsck_check_object_omap(FSCKDepth depth, + OnodeRef& o, + const BlueStore::FSCK_ObjectCtx& ctx) +{ + auto& errors = ctx.errors; + auto& warnings = ctx.warnings; + 
auto repairer = ctx.repairer; + + ceph_assert(o->onode.has_omap()); + if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) { + if (per_pool_omap == OMAP_PER_POOL) { + fsck_derr(errors, MAX_FSCK_ERROR_LINES) + << "fsck error: " << o->oid + << " has omap that is not per-pool or pgmeta" + << fsck_dendl; + ++errors; + } else { + const char* w; + int64_t num; + if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) { + ++errors; + num = errors; + w = "error"; + } else { + ++warnings; + num = warnings; + w = "warning"; + } + fsck_derr(num, MAX_FSCK_ERROR_LINES) + << "fsck " << w << ": " << o->oid + << " has omap that is not per-pool or pgmeta" + << fsck_dendl; + } + } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) { + if (per_pool_omap == OMAP_PER_PG) { + fsck_derr(errors, MAX_FSCK_ERROR_LINES) + << "fsck error: " << o->oid + << " has omap that is not per-pg or pgmeta" + << fsck_dendl; + ++errors; + } else { + const char* w; + int64_t num; + if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) { + ++errors; + num = errors; + w = "error"; + } else { + ++warnings; + num = warnings; + w = "warning"; + } + fsck_derr(num, MAX_FSCK_ERROR_LINES) + << "fsck " << w << ": " << o->oid + << " has omap that is not per-pg or pgmeta" + << fsck_dendl; + } + } + if (repairer && + !o->onode.is_perpg_omap() && + !o->onode.is_pgmeta_omap()) { + dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl; + bufferlist header; + map<string, bufferlist> kv; + { + KeyValueDB::Transaction txn = db->get_transaction(); + uint64_t txn_cost = 0; + const string& prefix = Onode::calc_omap_prefix(o->onode.flags); + uint8_t new_flags = o->onode.flags | + bluestore_onode_t::FLAG_PERPOOL_OMAP | + bluestore_onode_t::FLAG_PERPG_OMAP; + const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags); + + KeyValueDB::Iterator it = db->get_iterator(prefix); + string head, tail; + o->get_omap_header(&head); + o->get_omap_tail(&tail); + it->lower_bound(head); + // head + if (it->valid() && it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + header = it->value(); + if (header.length()) { + string new_head; + Onode::calc_omap_header(new_flags, o.get(), &new_head); + txn->set(new_omap_prefix, new_head, header); + txn_cost += new_head.length() + header.length(); + } + it->next(); + } + // tail + { + string new_tail; + Onode::calc_omap_tail(new_flags, o.get(), &new_tail); + bufferlist empty; + txn->set(new_omap_prefix, new_tail, empty); + txn_cost += new_tail.length() + new_tail.length(); + } + // values + string final_key; + Onode::calc_omap_key(new_flags, o.get(), string(), &final_key); + size_t base_key_len = final_key.size(); + while (it->valid() && it->key() < tail) { + string user_key; + o->decode_omap_key(it->key(), &user_key); + dout(20) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + + final_key.resize(base_key_len); + final_key += user_key; + auto v = it->value(); + txn->set(new_omap_prefix, final_key, v); + txn_cost += final_key.length() + v.length(); + + // submit a portion if cost exceeds 16MB + if (txn_cost >= 16 * (1 << 20) ) { + db->submit_transaction_sync(txn); + txn = db->get_transaction(); + txn_cost = 0; + } + it->next(); + } + if (txn_cost > 0) { + db->submit_transaction_sync(txn); + } + } + // finalize: remove legacy data + { + KeyValueDB::Transaction txn = db->get_transaction(); + // remove old keys + const string& old_omap_prefix = o->get_omap_prefix(); + string old_head, old_tail; + 
o->get_omap_header(&old_head); + o->get_omap_tail(&old_tail); + txn->rm_range_keys(old_omap_prefix, old_head, old_tail); + txn->rmkey(old_omap_prefix, old_tail); + // set flag + o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP); + _record_onode(o, txn); + db->submit_transaction_sync(txn); + repairer->inc_repaired(); + repairer->request_compaction(); + } + } +} + +void BlueStore::_fsck_check_objects( + FSCKDepth depth, + BlueStore::FSCK_ObjectCtx& ctx) +{ + auto& errors = ctx.errors; + auto sb_info_lock = ctx.sb_info_lock; + auto& sb_info = ctx.sb_info; + auto& sb_ref_counts = ctx.sb_ref_counts; + auto repairer = ctx.repairer; + + uint64_t_btree_t used_nids; + + size_t processed_myself = 0; + + auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); + mempool::bluestore_fsck::list<string> expecting_shards; + if (it) { + const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads; + typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ; + std::unique_ptr<WQ> wq( + new WQ( + "FSCKWorkQueue", + (thread_count ? : 1) * 32, + this, + sb_info_lock, + sb_info, + sb_ref_counts, + repairer)); + + ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count); + + thread_pool.add_work_queue(wq.get()); + if (depth == FSCK_SHALLOW && thread_count > 0) { + //not the best place but let's check anyway + ceph_assert(sb_info_lock); + thread_pool.start(); + } + + // fill global if not overriden below + CollectionRef c; + int64_t pool_id = -1; + spg_t pgid; + for (it->lower_bound(string()); it->valid(); it->next()) { + dout(30) << __func__ << " key " + << pretty_binary_string(it->key()) << dendl; + if (is_extent_shard_key(it->key())) { + if (depth == FSCK_SHALLOW) { + continue; + } + while (!expecting_shards.empty() && + expecting_shards.front() < it->key()) { + derr << "fsck error: missing shard key " + << pretty_binary_string(expecting_shards.front()) + << dendl; + ++errors; + expecting_shards.pop_front(); + } + if (!expecting_shards.empty() && + expecting_shards.front() == it->key()) { + // all good + expecting_shards.pop_front(); + continue; + } + + uint32_t offset; + string okey; + get_key_extent_shard(it->key(), &okey, &offset); + derr << "fsck error: stray shard 0x" << std::hex << offset + << std::dec << dendl; + if (expecting_shards.empty()) { + derr << "fsck error: " << pretty_binary_string(it->key()) + << " is unexpected" << dendl; + ++errors; + continue; + } + while (expecting_shards.front() > it->key()) { + derr << "fsck error: saw " << pretty_binary_string(it->key()) + << dendl; + derr << "fsck error: exp " + << pretty_binary_string(expecting_shards.front()) << dendl; + ++errors; + expecting_shards.pop_front(); + if (expecting_shards.empty()) { + break; + } + } + continue; + } + + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0) { + derr << "fsck error: bad object key " + << pretty_binary_string(it->key()) << dendl; + ++errors; + continue; + } + if (!c || + oid.shard_id != pgid.shard || + oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || + !c->contains(oid)) { + c = nullptr; + for (auto& p : coll_map) { + if (p.second->contains(oid)) { + c = p.second; + break; + } + } + if (!c) { + derr << "fsck error: stray object " << oid + << " not owned by any collection" << dendl; + ++errors; + continue; + } + pool_id = c->cid.is_pg(&pgid) ? 
pgid.pool() : META_POOL_ID; + dout(20) << __func__ << " collection " << c->cid << " " << c->cnode + << dendl; + } + + if (depth != FSCK_SHALLOW && + !expecting_shards.empty()) { + for (auto& k : expecting_shards) { + derr << "fsck error: missing shard key " + << pretty_binary_string(k) << dendl; + } + ++errors; + expecting_shards.clear(); + } + + bool queued = false; + if (depth == FSCK_SHALLOW && thread_count > 0) { + queued = wq->queue( + pool_id, + c, + oid, + it->key(), + it->value()); + } + OnodeRef o; + map<BlobRef, bluestore_blob_t::unused_t> referenced; + + if (!queued) { + ++processed_myself; + o = fsck_check_objects_shallow( + depth, + pool_id, + c, + oid, + it->key(), + it->value(), + &expecting_shards, + &referenced, + ctx); + } + + if (depth != FSCK_SHALLOW) { + ceph_assert(o != nullptr); + if (o->onode.nid) { + if (o->onode.nid > nid_max) { + derr << "fsck error: " << oid << " nid " << o->onode.nid + << " > nid_max " << nid_max << dendl; + ++errors; + } + if (used_nids.count(o->onode.nid)) { + derr << "fsck error: " << oid << " nid " << o->onode.nid + << " already in use" << dendl; + ++errors; + continue; // go for next object + } + used_nids.insert(o->onode.nid); + } + for (auto& i : referenced) { + dout(20) << __func__ << " referenced 0x" << std::hex << i.second + << std::dec << " for " << *i.first << dendl; + const bluestore_blob_t& blob = i.first->get_blob(); + if (i.second & blob.unused) { + derr << "fsck error: " << oid << " blob claims unused 0x" + << std::hex << blob.unused + << " but extents reference 0x" << i.second << std::dec + << " on blob " << *i.first << dendl; + ++errors; + } + if (blob.has_csum()) { + uint64_t blob_len = blob.get_logical_length(); + uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8); + unsigned csum_count = blob.get_csum_count(); + unsigned csum_chunk_size = blob.get_csum_chunk_size(); + for (unsigned p = 0; p < csum_count; ++p) { + unsigned pos = p * csum_chunk_size; + unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit] + unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size; + unsigned mask = 1u << firstbit; + for (unsigned b = firstbit + 1; b <= lastbit; ++b) { + mask |= 1u << b; + } + if ((blob.unused & mask) == mask) { + // this csum chunk region is marked unused + if (blob.get_csum_item(p) != 0) { + derr << "fsck error: " << oid + << " blob claims csum chunk 0x" << std::hex << pos + << "~" << csum_chunk_size + << " is unused (mask 0x" << mask << " of unused 0x" + << blob.unused << ") but csum is non-zero 0x" + << blob.get_csum_item(p) << std::dec << " on blob " + << *i.first << dendl; + ++errors; + } + } + } + } + } + // omap + if (o->onode.has_omap()) { + ceph_assert(ctx.used_omap_head); + if (ctx.used_omap_head->count(o->onode.nid)) { + derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid + << " already in use" << dendl; + ++errors; + } else { + ctx.used_omap_head->insert(o->onode.nid); + } + } // if (o->onode.has_omap()) + if (depth == FSCK_DEEP) { + bufferlist bl; + uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap; + uint64_t offset = 0; + do { + uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block); + int r = _do_read(c.get(), o, offset, l, bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + if (r < 0) { + ++errors; + derr << "fsck error: " << oid << std::hex + << " error during read: " + << " " << offset << "~" << l + << " " << cpp_strerror(r) << std::dec + << dendl; + break; + } + offset += l; + } while (offset < o->onode.size); + } // deep + } 
//if (depth != FSCK_SHALLOW)
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ wq->finalize(thread_pool, ctx);
+ if (processed_myself) {
+ // maybe it needs more threads?
+ dout(0) << __func__ << " partial offload"
+ << ", done myself " << processed_myself
+ << " of " << ctx.num_objects
+ << " objects, threads " << thread_count
+ << dendl;
+ }
+ }
+ } // if (it)
+}
+/**
+An overview of the repair logic currently implemented in fsck,
+performed in two stages: detection (+preparation) and commit.
+Detection stage (in processing order):
+ (Issue -> Repair action to schedule)
+ - Detect undecodable keys for Shared Blobs -> Remove
+ - Detect undecodable records for Shared Blobs -> Remove
+ (might trigger missed Shared Blob detection below)
+ - Detect stray records for Shared Blobs -> Remove
+ - Detect misreferenced pextents -> Fix
+ Prepare Bloom-like filter to track cid/oid -> pextent
+ Prepare list of extents that are improperly referenced
+ Enumerate Onode records that might use 'misreferenced' pextents
+ (Bloom-like filter applied to reduce computation)
+ For each questionable Onode enumerate all blobs and identify broken ones
+ (i.e. blobs having 'misreferences')
+ Rewrite each broken blob's data by allocating new extents and
+ copying the data there
+ If the blob is shared - unshare it and mark the corresponding Shared Blob
+ for removal
+ Release previously allocated space
+ Update Extent Map
+ - Detect missed Shared Blobs -> Recreate
+ - Detect undecodable deferred transactions -> Remove
+ - Detect Freelist Manager's 'false free' entries -> Mark as used
+ - Detect Freelist Manager's leaked entries -> Mark as free
+ - Detect statfs inconsistency -> Update
+ Commit stage (separate DB commit per step):
+ - Apply leaked FM entries fix
+ - Apply 'false free' FM entries fix
+ - Apply 'Remove' actions
+ - Apply fix for misreferenced pextents
+ - Apply Shared Blob recreate
+ (can be merged with the step above if misreferences were detected)
+ - Apply StatFS update
+*/
+int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
+{
+ dout(5) << __func__
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << dendl;
+
+ // in deep mode we need R/W access to be able to replay deferred ops
+ const bool read_only = !(repair || depth == FSCK_DEEP);
+ int r = _open_db_and_around(read_only);
+ if (r < 0) {
+ return r;
+ }
+ auto close_db = make_scope_guard([&] {
+ _close_db_and_around();
+ });
+
+ if (!read_only) {
+ r = _upgrade_super();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ // NullFreelistManager needs to open collection early
+ r = _open_collections();
+ if (r < 0) {
+ return r;
+ }
+
+ mempool_thread.init();
+ auto stop_mempool = make_scope_guard([&] {
+ mempool_thread.shutdown();
+ _shutdown_cache();
+ });
+ // we need finisher and kv_{sync,finalize}_thread *just* for replay
+ // enable in repair or deep fsck modes only
+ if (!read_only) {
+ _kv_start();
+ r = _deferred_replay();
+ _kv_stop();
+ }
+
+ if (r < 0) {
+ return r;
+ }
+ return _fsck_on_open(depth, repair);
+}
+
+int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
+{
+ uint64_t sb_hash_size = uint64_t(
+ cct->_conf.get_val<Option::size_t>("osd_memory_target") *
+ cct->_conf.get_val<double>(
+ "bluestore_fsck_shared_blob_tracker_size"));
+
+ dout(1) << __func__
+ << " <<<START>>>"
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ?
" (shallow)" : " (regular)") + << " start sb_tracker_hash_size:" << sb_hash_size + << dendl; + int64_t errors = 0; + int64_t warnings = 0; + unsigned repaired = 0; + + uint64_t_btree_t used_omap_head; + uint64_t_btree_t used_sbids; + + mempool_dynamic_bitset used_blocks, bluefs_used_blocks; + KeyValueDB::Iterator it; + store_statfs_t expected_store_statfs; + per_pool_statfs expected_pool_statfs; + + sb_info_space_efficient_map_t sb_info; + shared_blob_2hash_tracker_t sb_ref_counts( + sb_hash_size, + min_alloc_size); + size_t sb_ref_mismatches = 0; + + /// map of oid -> (first_)offset for each zone + std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM! + + uint64_t num_objects = 0; + uint64_t num_extents = 0; + uint64_t num_blobs = 0; + uint64_t num_spanning_blobs = 0; + uint64_t num_shared_blobs = 0; + uint64_t num_sharded_objects = 0; + BlueStoreRepairer repairer; + + auto alloc_size = fm->get_alloc_size(); + + utime_t start = ceph_clock_now(); + + _fsck_collections(&errors); + used_blocks.resize(fm->get_alloc_units()); + + if (bluefs) { + interval_set<uint64_t> bluefs_extents; + + bluefs->foreach_block_extents( + bluefs_layout.shared_bdev, + [&](uint64_t start, uint32_t len) { + apply_for_bitset_range(start, len, alloc_size, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset& bs) { + ceph_assert(pos < bs.size()); + bs.set(pos); + } + ); + } + ); + } + + bluefs_used_blocks = used_blocks; + + apply_for_bitset_range( + 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + bs.set(pos); + } + ); + + + if (repair) { + repairer.init_space_usage_tracker( + bdev->get_size(), + min_alloc_size); + } + + if (bluefs) { + int r = bluefs->fsck(); + if (r < 0) { + return r; + } + if (r > 0) + errors += r; + } + + if (!per_pool_stat_collection) { + const char *w; + if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) { + w = "error"; + ++errors; + } else { + w = "warning"; + ++warnings; + } + derr << "fsck " << w << ": store not yet converted to per-pool stats" + << dendl; + } + if (per_pool_omap != OMAP_PER_PG) { + const char *w; + if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) { + w = "error"; + ++errors; + } else { + w = "warning"; + ++warnings; + } + derr << "fsck " << w << ": store not yet converted to per-pg omap" + << dendl; + } + + if (g_conf()->bluestore_debug_fsck_abort) { + dout(1) << __func__ << " debug abort" << dendl; + goto out_scan; + } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + auto a = dynamic_cast<ZonedAllocator*>(alloc); + ceph_assert(a); + auto f = dynamic_cast<ZonedFreelistManager*>(fm); + ceph_assert(f); + vector<uint64_t> wp = bdev->get_zones(); + vector<zone_state_t> zones = f->get_zone_states(db); + ceph_assert(wp.size() == zones.size()); + auto num_zones = bdev->get_size() / zone_size; + for (unsigned i = first_sequential_zone; i < num_zones; ++i) { + uint64_t p = wp[i] == (i + 1) * zone_size ? 
zone_size : wp[i] % zone_size; + if (zones[i].write_pointer > p && + zones[i].num_dead_bytes < zones[i].write_pointer) { + derr << "fsck error: zone 0x" << std::hex << i + << " bluestore write pointer 0x" << zones[i].write_pointer + << " > device write pointer 0x" << p + << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)" + << std::dec << dendl; + ++errors; + } + } + + if (depth != FSCK_SHALLOW) { + // load zone refs + zone_refs.resize(bdev->get_size() / zone_size); + it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + for (it->lower_bound(string()); + it->valid(); + it->next()) { + uint32_t zone = 0; + uint64_t offset = 0; + ghobject_t oid; + string key = it->key(); + int r = get_key_zone_offset_object(key, &zone, &offset, &oid); + if (r < 0) { + derr << "fsck error: invalid zone ref key " << pretty_binary_string(key) + << dendl; + if (repair) { + repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key); + } + ++errors; + continue; + } + dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset + << " -> " << std::dec << oid << dendl; + if (zone_refs[zone].count(oid)) { + derr << "fsck error: second zone ref in zone 0x" << std::hex << zone + << " offset 0x" << offset << std::dec << " for " << oid << dendl; + if (repair) { + repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key); + } + ++errors; + continue; + } + zone_refs[zone][oid] = offset; + } + } + } + } +#endif + + dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl; + it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + string key = it->key(); + uint64_t sbid; + if (get_key_shared_blob(key, &sbid) < 0) { + // Failed to parse the key. + // This gonna to be handled at the second stage + continue; + } + bluestore_shared_blob_t shared_blob(sbid); + bufferlist bl = it->value(); + auto blp = bl.cbegin(); + try { + decode(shared_blob, blp); + } + catch (ceph::buffer::error& e) { + // this gonna to be handled at the second stage + continue; + } + dout(20) << __func__ << " " << shared_blob << dendl; + auto& sbi = sb_info.add_maybe_stray(sbid); + + // primarily to silent the 'unused' warning + ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID); + + for (auto& r : shared_blob.ref_map.ref_map) { + sb_ref_counts.inc_range( + sbid, + r.first, + r.second.length, + -r.second.refs); + } + } + } // if (it) //checking shared_blobs (phase1) + + // walk PREFIX_OBJ + { + dout(1) << __func__ << " walking object keyspace" << dendl; + ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock"); + BlueStore::FSCK_ObjectCtx ctx( + errors, + warnings, + num_objects, + num_extents, + num_blobs, + num_sharded_objects, + num_spanning_blobs, + &used_blocks, + &used_omap_head, + &zone_refs, + //no need for the below lock when in non-shallow mode as + // there is no multithreading in this case + depth == FSCK_SHALLOW ? &sb_info_lock : nullptr, + sb_info, + sb_ref_counts, + expected_store_statfs, + expected_pool_statfs, + repair ? 
&repairer : nullptr); + + _fsck_check_objects(depth, ctx); + } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr() && depth != FSCK_SHALLOW) { + dout(1) << __func__ << " checking for leaked zone refs" << dendl; + for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) { + for (auto& [oid, offset] : zone_refs[zone]) { + derr << "fsck error: stray zone ref 0x" << std::hex << zone + << " offset 0x" << offset << " -> " << std::dec << oid << dendl; + // FIXME: add repair + ++errors; + } + } + } +#endif + + sb_ref_mismatches = sb_ref_counts.count_non_zero(); + if (sb_ref_mismatches != 0) { + derr << "fsck error:" << "*" << sb_ref_mismatches + << " shared blob references aren't matching, at least " + << sb_ref_mismatches << " found" << dendl; + errors += sb_ref_mismatches; + } + + if (depth != FSCK_SHALLOW && repair) { + _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info); + } + dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl; + it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + // FIXME minor: perhaps simplify for shallow mode? + // fill global if not overriden below + auto expected_statfs = &expected_store_statfs; + for (it->lower_bound(string()); it->valid(); it->next()) { + string key = it->key(); + uint64_t sbid; + if (get_key_shared_blob(key, &sbid)) { + derr << "fsck error: bad key '" << key + << "' in shared blob namespace" << dendl; + if (repair) { + repairer.remove_key(db, PREFIX_SHARED_BLOB, key); + } + ++errors; + continue; + } + auto p = sb_info.find(sbid); + if (p == sb_info.end()) { + if (sb_ref_mismatches > 0) { + // highly likely this has been already reported before, ignoring... + dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x" + << std::hex << sbid << std::dec << dendl; + } else { + derr<< "fsck error: found stray shared blob data for sbid 0x" + << std::hex << sbid << std::dec << dendl; + ++errors; + if (repair) { + repairer.remove_key(db, PREFIX_SHARED_BLOB, key); + } + } + } else { + ++num_shared_blobs; + sb_info_t& sbi = *p; + bluestore_shared_blob_t shared_blob(sbid); + bufferlist bl = it->value(); + auto blp = bl.cbegin(); + try { + decode(shared_blob, blp); + } + catch (ceph::buffer::error& e) { + ++errors; + + derr << "fsck error: failed to decode Shared Blob" + << pretty_binary_string(key) << dendl; + if (repair) { + dout(20) << __func__ << " undecodable Shared Blob, key:'" + << pretty_binary_string(key) + << "', removing" << dendl; + repairer.remove_key(db, PREFIX_SHARED_BLOB, key); + } + continue; + } + dout(20) << __func__ << " " << shared_blob << dendl; + PExtentVector extents; + for (auto& r : shared_blob.ref_map.ref_map) { + extents.emplace_back(bluestore_pextent_t(r.first, r.second.length)); + } + if (sbi.pool_id != sb_info_t::INVALID_POOL_ID && + (per_pool_stat_collection || repair)) { + expected_statfs = &expected_pool_statfs[sbi.pool_id]; + } + std::stringstream ss; + ss << "sbid 0x" << std::hex << sbid << std::dec; + errors += _fsck_check_extents(ss.str(), + extents, + sbi.allocated_chunks < 0, + used_blocks, + fm->get_alloc_size(), + repair ? 
&repairer : nullptr, + *expected_statfs, + depth); + } + } + } // if (it) /* checking shared_blobs (phase 2)*/ + + if (repair && repairer.preprocess_misreference(db)) { + + dout(1) << __func__ << " sorting out misreferenced extents" << dendl; + auto& misref_extents = repairer.get_misreferences(); + interval_set<uint64_t> to_release; + it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + // fill global if not overriden below + auto expected_statfs = &expected_store_statfs; + + CollectionRef c; + spg_t pgid; + KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn(); + bool bypass_rest = false; + for (it->lower_bound(string()); it->valid() && !bypass_rest; + it->next()) { + dout(30) << __func__ << " key " + << pretty_binary_string(it->key()) << dendl; + if (is_extent_shard_key(it->key())) { + continue; + } + + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0 || !repairer.is_used(oid)) { + continue; + } + + if (!c || + oid.shard_id != pgid.shard || + oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || + !c->contains(oid)) { + c = nullptr; + for (auto& p : coll_map) { + if (p.second->contains(oid)) { + c = p.second; + break; + } + } + if (!c) { + continue; + } + if (per_pool_stat_collection || repair) { + auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID; + expected_statfs = &expected_pool_statfs[pool_id]; + } + } + if (!repairer.is_used(c->cid)) { + continue; + } + + dout(20) << __func__ << " check misreference for col:" << c->cid + << " obj:" << oid << dendl; + + OnodeRef o; + o.reset(Onode::create_decode(c, oid, it->key(), it->value())); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + mempool::bluestore_fsck::set<BlobRef> blobs; + + for (auto& e : o->extent_map.extent_map) { + blobs.insert(e.blob); + } + bool need_onode_update = false; + bool first_dump = true; + for(auto b : blobs) { + bool broken_blob = false; + auto& pextents = b->dirty_blob().dirty_extents(); + for (auto& e : pextents) { + if (!e.is_valid()) { + continue; + } + // for the sake of simplicity and proper shared blob handling + // always rewrite the whole blob even when it's partially + // misreferenced. + if (misref_extents.intersects(e.offset, e.length)) { + if (first_dump) { + first_dump = false; + _dump_onode<10>(cct, *o); + } + broken_blob = true; + break; + } + } + if (!broken_blob) + continue; + bool compressed = b->get_blob().is_compressed(); + need_onode_update = true; + dout(10) << __func__ + << " fix misreferences in oid:" << oid + << " " << *b << dendl; + uint64_t b_off = 0; + PExtentVector pext_to_release; + pext_to_release.reserve(pextents.size()); + // rewriting all valid pextents + for (auto e = pextents.begin(); e != pextents.end(); + e++) { + auto b_off_cur = b_off; + b_off += e->length; + if (!e->is_valid()) { + continue; + } + PExtentVector exts; + dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl; + int64_t alloc_len = + alloc->allocate(e->length, min_alloc_size, + 0, 0, &exts); + if (alloc_len < 0 || alloc_len < (int64_t)e->length) { + derr << __func__ + << " failed to allocate 0x" << std::hex << e->length + << " allocated 0x " << (alloc_len < 0 ? 
0 : alloc_len) + << " min_alloc_size 0x" << min_alloc_size + << " available 0x " << alloc->get_free() + << std::dec << dendl; + if (alloc_len > 0) { + alloc->release(exts); + } + bypass_rest = true; + break; + } + expected_statfs->allocated += e->length; + if (compressed) { + expected_statfs->data_compressed_allocated += e->length; + } + + bufferlist bl; + IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio); + r = bdev->read(e->offset, e->length, &bl, &ioc, false); + if (r < 0) { + derr << __func__ << " failed to read from 0x" << std::hex << e->offset + <<"~" << e->length << std::dec << dendl; + ceph_abort_msg("read failed, wtf"); + } + pext_to_release.push_back(*e); + e = pextents.erase(e); + e = pextents.insert(e, exts.begin(), exts.end()); + b->get_blob().map_bl( + b_off_cur, bl, + [&](uint64_t offset, bufferlist& t) { + int r = bdev->write(offset, t, false); + ceph_assert(r == 0); + }); + e += exts.size() - 1; + for (auto& p : exts) { + fm->allocate(p.offset, p.length, txn); + } + } // for (auto e = pextents.begin(); e != pextents.end(); e++) { + + if (b->get_blob().is_shared()) { + b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED); + + auto sbid = b->shared_blob->get_sbid(); + auto sb_it = sb_info.find(sbid); + ceph_assert(sb_it != sb_info.end()); + sb_info_t& sbi = *sb_it; + + if (sbi.allocated_chunks < 0) { + // NB: it's crucial to use compressed_allocated_chunks from sb_info_t + // as we originally used that value while accumulating + // expected_statfs + expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order; + expected_statfs->data_compressed_allocated -= + uint64_t(-sbi.allocated_chunks) << min_alloc_size_order; + } else { + expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order; + } + sbi.allocated_chunks = 0; + repairer.fix_shared_blob(txn, sbid, nullptr, 0); + + // relying on blob's pextents to decide what to release. + for (auto& p : pext_to_release) { + to_release.union_insert(p.offset, p.length); + } + } else { + for (auto& p : pext_to_release) { + expected_statfs->allocated -= p.length; + if (compressed) { + expected_statfs->data_compressed_allocated -= p.length; + } + to_release.union_insert(p.offset, p.length); + } + } + if (bypass_rest) { + break; + } + } // for(auto b : blobs) + if (need_onode_update) { + o->extent_map.dirty_range(0, OBJECT_MAX_SIZE); + _record_onode(o, txn); + } + } // for (it->lower_bound(string()); it->valid(); it->next()) + + for (auto it = to_release.begin(); it != to_release.end(); ++it) { + dout(10) << __func__ << " release 0x" << std::hex << it.get_start() + << "~" << it.get_len() << std::dec << dendl; + fm->release(it.get_start(), it.get_len(), txn); + } + alloc->release(to_release); + to_release.clear(); + } // if (it) { + } //if (repair && repairer.preprocess_misreference()) { + sb_info.clear(); + sb_ref_counts.reset(); + + dout(1) << __func__ << " checking pool_statfs" << dendl; + _fsck_check_statfs(expected_store_statfs, expected_pool_statfs, + errors, warnings, repair ? 
&repairer : nullptr); + if (depth != FSCK_SHALLOW) { + dout(1) << __func__ << " checking for stray omap data " << dendl; + it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + uint64_t last_omap_head = 0; + for (it->lower_bound(string()); it->valid(); it->next()) { + uint64_t omap_head; + + _key_decode_u64(it->key().c_str(), &omap_head); + + if (used_omap_head.count(omap_head) == 0 && + omap_head != last_omap_head) { + pair<string,string> rk = it->raw_key(); + fsck_derr(errors, MAX_FSCK_ERROR_LINES) + << "fsck error: found stray omap data on omap_head " + << omap_head << " " << last_omap_head + << " prefix/key: " << url_escape(rk.first) + << " " << url_escape(rk.second) + << fsck_dendl; + ++errors; + last_omap_head = omap_head; + } + } + } + it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + uint64_t last_omap_head = 0; + for (it->lower_bound(string()); it->valid(); it->next()) { + uint64_t omap_head; + _key_decode_u64(it->key().c_str(), &omap_head); + if (used_omap_head.count(omap_head) == 0 && + omap_head != last_omap_head) { + pair<string,string> rk = it->raw_key(); + fsck_derr(errors, MAX_FSCK_ERROR_LINES) + << "fsck error: found stray (pgmeta) omap data on omap_head " + << omap_head << " " << last_omap_head + << " prefix/key: " << url_escape(rk.first) + << " " << url_escape(rk.second) + << fsck_dendl; + last_omap_head = omap_head; + ++errors; + } + } + } + it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + uint64_t last_omap_head = 0; + for (it->lower_bound(string()); it->valid(); it->next()) { + uint64_t pool; + uint64_t omap_head; + string k = it->key(); + const char *c = k.c_str(); + c = _key_decode_u64(c, &pool); + c = _key_decode_u64(c, &omap_head); + if (used_omap_head.count(omap_head) == 0 && + omap_head != last_omap_head) { + pair<string,string> rk = it->raw_key(); + fsck_derr(errors, MAX_FSCK_ERROR_LINES) + << "fsck error: found stray (per-pool) omap data on omap_head " + << omap_head << " " << last_omap_head + << " prefix/key: " << url_escape(rk.first) + << " " << url_escape(rk.second) + << fsck_dendl; + ++errors; + last_omap_head = omap_head; + } + } + } + it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + uint64_t last_omap_head = 0; + for (it->lower_bound(string()); it->valid(); it->next()) { + uint64_t pool; + uint32_t hash; + uint64_t omap_head; + string k = it->key(); + const char* c = k.c_str(); + c = _key_decode_u64(c, &pool); + c = _key_decode_u32(c, &hash); + c = _key_decode_u64(c, &omap_head); + if (used_omap_head.count(omap_head) == 0 && + omap_head != last_omap_head) { + fsck_derr(errors, MAX_FSCK_ERROR_LINES) + << "fsck error: found stray (per-pg) omap data on omap_head " + << " key " << pretty_binary_string(it->key()) + << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl; + ++errors; + last_omap_head = omap_head; + } + } + } + dout(1) << __func__ << " checking deferred events" << dendl; + it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + bluestore_deferred_transaction_t wt; + try { + decode(wt, p); + } catch (ceph::buffer::error& e) { + derr << "fsck error: failed to decode deferred txn " + << pretty_binary_string(it->key()) << dendl; + if (repair) { + dout(20) << __func__ << " undecodable deferred TXN record, key: '" + << 
pretty_binary_string(it->key()) + << "', removing" << dendl; + repairer.remove_key(db, PREFIX_DEFERRED, it->key()); + } + continue; + } + dout(20) << __func__ << " deferred " << wt.seq + << " ops " << wt.ops.size() + << " released 0x" << std::hex << wt.released << std::dec << dendl; + for (auto e = wt.released.begin(); e != wt.released.end(); ++e) { + apply_for_bitset_range( + e.get_start(), e.get_len(), alloc_size, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + bs.set(pos); + } + ); + } + } + } + + // skip freelist vs allocated compare when we have Null fm + if (!fm->is_null_manager()) { + dout(1) << __func__ << " checking freelist vs allocated" << dendl; +#ifdef HAVE_LIBZBD + if (freelist_type == "zoned") { + // verify per-zone state + // - verify no allocations beyond write pointer + // - verify num_dead_bytes count (neither allocated nor + // free space past the write pointer) + auto a = dynamic_cast<ZonedAllocator*>(alloc); + auto num_zones = bdev->get_size() / zone_size; + + // mark the free space past the write pointer + for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) { + auto wp = a->get_write_pointer(zone); + uint64_t offset = zone_size * zone + wp; + uint64_t length = zone_size - wp; + if (!length) { + continue; + } + bool intersects = false; + dout(10) << " marking zone 0x" << std::hex << zone + << " region after wp 0x" << offset << "~" << length + << std::dec << dendl; + apply_for_bitset_range( + offset, length, alloc_size, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + if (bs.test(pos)) { + derr << "fsck error: zone 0x" << std::hex << zone + << " has used space at 0x" << pos * alloc_size + << " beyond write pointer 0x" << wp + << std::dec << dendl; + intersects = true; + } else { + bs.set(pos); + } + } + ); + if (intersects) { + ++errors; + } + } + + used_blocks.flip(); + + // skip conventional zones + uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1; + pos = used_blocks.find_next(pos); + + uint64_t zone_dead = 0; + for (uint32_t zone = first_sequential_zone; + zone < num_zones; + ++zone, zone_dead = 0) { + while (pos != decltype(used_blocks)::npos && + (pos * min_alloc_size) / zone_size == zone) { + dout(40) << " zone 0x" << std::hex << zone + << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size + << std::dec << dendl; + zone_dead += min_alloc_size; + pos = used_blocks.find_next(pos); + } + dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead + << std::dec << dendl; + // cross-check dead bytes against zone state + if (a->get_dead_bytes(zone) != zone_dead) { + derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead + << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone) + << dendl; + ++errors; + // TODO: repair + } + } + used_blocks.flip(); + } else +#endif + { + fm->enumerate_reset(); + uint64_t offset, length; + while (fm->enumerate_next(db, &offset, &length)) { + bool intersects = false; + apply_for_bitset_range( + offset, length, alloc_size, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + if (bs.test(pos) && !bluefs_used_blocks.test(pos)) { + if (offset == SUPER_RESERVED && + length == min_alloc_size - SUPER_RESERVED) { + // this is due to the change just after luminous to min_alloc_size + // granularity allocations, and our baked in assumption at the top + // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used + // (vs luminous's 
round_up_to(SUPER_RESERVED,block_size)). harmless, + // since we will never allocate this region below min_alloc_size. + dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED" + << " and min_alloc_size, 0x" << std::hex << offset << "~" + << length << std::dec << dendl; + } else { + intersects = true; + if (repair) { + repairer.fix_false_free(db, fm, + pos * min_alloc_size, + min_alloc_size); + } + } + } else { + bs.set(pos); + } + } + ); + if (intersects) { + derr << "fsck error: free extent 0x" << std::hex << offset + << "~" << length << std::dec + << " intersects allocated blocks" << dendl; + ++errors; + } + } + fm->enumerate_reset(); + + // check for leaked extents + size_t count = used_blocks.count(); + if (used_blocks.size() != count) { + ceph_assert(used_blocks.size() > count); + used_blocks.flip(); + size_t start = used_blocks.find_first(); + while (start != decltype(used_blocks)::npos) { + size_t cur = start; + while (true) { + size_t next = used_blocks.find_next(cur); + if (next != cur + 1) { + ++errors; + derr << "fsck error: leaked extent 0x" << std::hex + << ((uint64_t)start * fm->get_alloc_size()) << "~" + << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec + << dendl; + if (repair) { + repairer.fix_leaked(db, + fm, + start * min_alloc_size, + (cur + 1 - start) * min_alloc_size); + } + start = next; + break; + } + cur = next; + } + } + used_blocks.flip(); + } + } + } + } + if (repair) { + if (per_pool_omap != OMAP_PER_PG) { + dout(5) << __func__ << " fixing per_pg_omap" << dendl; + repairer.fix_per_pool_omap(db, OMAP_PER_PG); + } + + dout(5) << __func__ << " applying repair results" << dendl; + repaired = repairer.apply(db); + dout(5) << __func__ << " repair applied" << dendl; + } + +out_scan: + dout(2) << __func__ << " " << num_objects << " objects, " + << num_sharded_objects << " of them sharded. " + << dendl; + dout(2) << __func__ << " " << num_extents << " extents to " + << num_blobs << " blobs, " + << num_spanning_blobs << " spanning, " + << num_shared_blobs << " shared." + << dendl; + + utime_t duration = ceph_clock_now() - start; + dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " + << warnings << " warnings, " + << repaired << " repaired, " + << (errors + warnings - (int)repaired) << " remaining in " + << duration << " seconds" << dendl; + + // In non-repair mode we should return error count only as + // it indicates if store status is OK. + // In repair mode both errors and warnings are taken into account + // since repaired counter relates to them both. + return repair ? 
errors + warnings - (int)repaired : errors; +} + +/// methods to inject various errors fsck can repair +void BlueStore::inject_broken_shared_blob_key(const string& key, + const bufferlist& bl) +{ + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + txn->set(PREFIX_SHARED_BLOB, key, bl); + db->submit_transaction_sync(txn); +}; + +void BlueStore::inject_no_shared_blob_key() +{ + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + ceph_assert(blobid_last > 0); + // kill the last used sbid, this can be broken due to blobid preallocation + // in rare cases, leaving as-is for the sake of simplicity + uint64_t sbid = blobid_last; + + string key; + dout(5) << __func__<< " " << sbid << dendl; + get_shared_blob_key(sbid, &key); + txn->rmkey(PREFIX_SHARED_BLOB, key); + db->submit_transaction_sync(txn); +}; + +void BlueStore::inject_stray_shared_blob_key(uint64_t sbid) +{ + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + + dout(5) << __func__ << " " << sbid << dendl; + + string key; + get_shared_blob_key(sbid, &key); + bluestore_shared_blob_t persistent(sbid); + persistent.ref_map.get(0xdead0000, min_alloc_size); + bufferlist bl; + encode(persistent, bl); + dout(20) << __func__ << " sbid " << sbid + << " takes " << bl.length() << " bytes, updating" + << dendl; + + txn->set(PREFIX_SHARED_BLOB, key, bl); + db->submit_transaction_sync(txn); +}; + + +void BlueStore::inject_leaked(uint64_t len) +{ + PExtentVector exts; + int64_t alloc_len = alloc->allocate(len, min_alloc_size, + min_alloc_size * 256, 0, &exts); + + if (fm->is_null_manager()) { + return; + } + + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + + ceph_assert(alloc_len >= (int64_t)len); + for (auto& p : exts) { + fm->allocate(p.offset, p.length, txn); + } + db->submit_transaction_sync(txn); +} + +void BlueStore::inject_false_free(coll_t cid, ghobject_t oid) +{ + ceph_assert(!fm->is_null_manager()); + + KeyValueDB::Transaction txn; + OnodeRef o; + CollectionRef c = _get_collection(cid); + ceph_assert(c); + { + std::unique_lock l{c->lock}; // just to avoid internal asserts + o = c->get_onode(oid, false); + ceph_assert(o); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + } + + bool injected = false; + txn = db->get_transaction(); + auto& em = o->extent_map.extent_map; + std::vector<const PExtentVector*> v; + if (em.size()) { + v.push_back(&em.begin()->blob->get_blob().get_extents()); + } + if (em.size() > 1) { + auto it = em.end(); + --it; + v.push_back(&(it->blob->get_blob().get_extents())); + } + for (auto pext : v) { + if (pext->size()) { + auto p = pext->begin(); + while (p != pext->end()) { + if (p->is_valid()) { + dout(20) << __func__ << " release 0x" << std::hex << p->offset + << "~" << p->length << std::dec << dendl; + fm->release(p->offset, p->length, txn); + injected = true; + break; + } + ++p; + } + } + } + ceph_assert(injected); + db->submit_transaction_sync(txn); +} + +void BlueStore::inject_legacy_omap() +{ + dout(1) << __func__ << dendl; + per_pool_omap = OMAP_BULK; + KeyValueDB::Transaction txn; + txn = db->get_transaction(); + txn->rmkey(PREFIX_SUPER, "per_pool_omap"); + db->submit_transaction_sync(txn); +} + +void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid) +{ + dout(1) << __func__ << " " + << cid << " " << oid + <<dendl; + KeyValueDB::Transaction txn; + OnodeRef o; + CollectionRef c = _get_collection(cid); + ceph_assert(c); + { + std::unique_lock l{ c->lock }; // just to avoid internal asserts + o = c->get_onode(oid, false); + ceph_assert(o); + } + 
o->onode.clear_flag(
+ bluestore_onode_t::FLAG_PERPG_OMAP |
+ bluestore_onode_t::FLAG_PERPOOL_OMAP |
+ bluestore_onode_t::FLAG_PGMETA_OMAP);
+ txn = db->get_transaction();
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_stray_omap(uint64_t head, const string& name)
+{
+ dout(1) << __func__ << dendl;
+ KeyValueDB::Transaction txn = db->get_transaction();
+
+ string key;
+ bufferlist bl;
+ _key_encode_u64(head, &key);
+ key.append(name);
+ txn->set(PREFIX_OMAP, key, bl);
+
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
+{
+ BlueStoreRepairer repairer;
+ repairer.fix_statfs(db, key, new_statfs);
+ repairer.apply(db);
+}
+
+void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ volatile_statfs v;
+ v = new_statfs;
+ bufferlist bl;
+ v.encode(bl);
+ t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset)
+{
+ OnodeRef o1;
+ CollectionRef c1 = _get_collection(cid1);
+ ceph_assert(c1);
+ {
+ std::unique_lock l{c1->lock}; // just to avoid internal asserts
+ o1 = c1->get_onode(oid1, false);
+ ceph_assert(o1);
+ o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ OnodeRef o2;
+ CollectionRef c2 = _get_collection(cid2);
+ ceph_assert(c2);
+ {
+ std::unique_lock l{c2->lock}; // just to avoid internal asserts
+ o2 = c2->get_onode(oid2, false);
+ ceph_assert(o2);
+ o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+ Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+ // require onode/extent layout to be the same (and simple)
+ // to make things easier
+ ceph_assert(o1->onode.extent_map_shards.empty());
+ ceph_assert(o2->onode.extent_map_shards.empty());
+ ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(e1.logical_offset == e2.logical_offset);
+ ceph_assert(e1.length == e2.length);
+ ceph_assert(e1.blob_offset == e2.blob_offset);
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ // along with the misreference error this will also create space leak errors
+ e2.blob->dirty_blob() = e1.blob->get_blob();
+ o2->extent_map.dirty_range(offset, e2.length);
+ o2->extent_map.update(txn, false);
+
+ _record_onode(o2, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
+ int16_t blob_id)
+{
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{ c->lock }; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ BlobRef b = c->new_blob();
+ b->id = blob_id;
+ o->extent_map.spanning_blob_map[blob_id] = b;
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
+{
+ ceph_assert(bluefs);
+
+ BlueFS::FileWriter* p_handle = nullptr;
+ auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
+ ceph_assert(ret == 0);
+
+ std::string s(new_size, '0'); // (count, fill char): new_size bytes of '0'
+ bufferlist bl;
+ bl.append(s);
+ p_handle->append(bl);
+
+
bluefs->fsync(p_handle); + bluefs->close_writer(p_handle); +} + +void BlueStore::collect_metadata(map<string,string> *pm) +{ + dout(10) << __func__ << dendl; + bdev->collect_metadata("bluestore_bdev_", pm); + if (bluefs) { + (*pm)["bluefs"] = "1"; + // this value is for backward compatibility only + (*pm)["bluefs_single_shared_device"] = \ + stringify((int)bluefs_layout.single_shared_device()); + (*pm)["bluefs_dedicated_db"] = \ + stringify((int)bluefs_layout.dedicated_db); + (*pm)["bluefs_dedicated_wal"] = \ + stringify((int)bluefs_layout.dedicated_wal); + bluefs->collect_metadata(pm, bluefs_layout.shared_bdev); + } else { + (*pm)["bluefs"] = "0"; + } + + // report numa mapping for underlying devices + int node = -1; + set<int> nodes; + set<string> failed; + int r = get_numa_node(&node, &nodes, &failed); + if (r >= 0) { + if (!failed.empty()) { + (*pm)["objectstore_numa_unknown_devices"] = stringify(failed); + } + if (!nodes.empty()) { + dout(1) << __func__ << " devices span numa nodes " << nodes << dendl; + (*pm)["objectstore_numa_nodes"] = stringify(nodes); + } + if (node >= 0) { + (*pm)["objectstore_numa_node"] = stringify(node); + } + } + (*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size); +} + +int BlueStore::get_numa_node( + int *final_node, + set<int> *out_nodes, + set<string> *out_failed) +{ + int node = -1; + set<string> devices; + get_devices(&devices); + set<int> nodes; + set<string> failed; + for (auto& devname : devices) { + int n; + BlkDev bdev(devname); + int r = bdev.get_numa_node(&n); + if (r < 0) { + dout(10) << __func__ << " bdev " << devname << " can't detect numa_node" + << dendl; + failed.insert(devname); + continue; + } + dout(10) << __func__ << " bdev " << devname << " on numa_node " << n + << dendl; + nodes.insert(n); + if (node < 0) { + node = n; + } + } + if (node >= 0 && nodes.size() == 1 && failed.empty()) { + *final_node = node; + } + if (out_nodes) { + *out_nodes = nodes; + } + if (out_failed) { + *out_failed = failed; + } + return 0; +} + +void BlueStore::prepare_for_fast_shutdown() +{ + m_fast_shutdown = true; +} + +int BlueStore::get_devices(set<string> *ls) +{ + if (bdev) { + bdev->get_devices(ls); + if (bluefs) { + bluefs->get_devices(ls); + } + return 0; + } + + // grumble, we haven't started up yet. + if (int r = _open_path(); r < 0) { + return r; + } + auto close_path = make_scope_guard([&] { + _close_path(); + }); + if (int r = _open_fsid(false); r < 0) { + return r; + } + auto close_fsid = make_scope_guard([&] { + _close_fsid(); + }); + if (int r = _read_fsid(&fsid); r < 0) { + return r; + } + if (int r = _lock_fsid(); r < 0) { + return r; + } + if (int r = _open_bdev(false); r < 0) { + return r; + } + auto close_bdev = make_scope_guard([&] { + _close_bdev(); + }); + if (int r = _minimal_open_bluefs(false); r < 0) { + return r; + } + bdev->get_devices(ls); + if (bluefs) { + bluefs->get_devices(ls); + } + _minimal_close_bluefs(); + return 0; +} + +void BlueStore::_get_statfs_overall(struct store_statfs_t *buf) +{ + buf->reset(); + + auto prefix = per_pool_omap == OMAP_BULK ? + PREFIX_OMAP : + per_pool_omap == OMAP_PER_POOL ? + PREFIX_PERPOOL_OMAP : + PREFIX_PERPG_OMAP; + buf->omap_allocated = + db->estimate_prefix_size(prefix, string()); + + uint64_t bfree = alloc->get_free(); + + if (bluefs) { + buf->internally_reserved = 0; + // include dedicated db, too, if that isn't the shared device. 
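+ // (when the DB shares the main device its capacity is already counted via
+ // bdev->get_size()/get_ebd_state() below, so only a dedicated DB is added here)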
+ if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) { + buf->total += bluefs->get_total(BlueFS::BDEV_DB); + } + // call any non-omap bluefs space "internal metadata" + buf->internal_metadata = + bluefs->get_used() + - buf->omap_allocated; + } + + ExtBlkDevState ebd_state; + int rc = bdev->get_ebd_state(ebd_state); + if (rc == 0) { + buf->total += ebd_state.get_physical_total(); + + // we are limited by both the size of the virtual device and the + // underlying physical device. + bfree = std::min(bfree, ebd_state.get_physical_avail()); + + buf->allocated = ebd_state.get_physical_total() - ebd_state.get_physical_avail();; + } else { + buf->total += bdev->get_size(); + } + buf->available = bfree; +} + +int BlueStore::statfs(struct store_statfs_t *buf, + osd_alert_list_t* alerts) +{ + if (alerts) { + alerts->clear(); + _log_alerts(*alerts); + } + _get_statfs_overall(buf); + { + std::lock_guard l(vstatfs_lock); + buf->allocated = vstatfs.allocated(); + buf->data_stored = vstatfs.stored(); + buf->data_compressed = vstatfs.compressed(); + buf->data_compressed_original = vstatfs.compressed_original(); + buf->data_compressed_allocated = vstatfs.compressed_allocated(); + } + + dout(20) << __func__ << " " << *buf << dendl; + return 0; +} + +int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, + bool *out_per_pool_omap) +{ + dout(20) << __func__ << " pool " << pool_id<< dendl; + + if (!per_pool_stat_collection) { + dout(20) << __func__ << " not supported in legacy mode " << dendl; + return -ENOTSUP; + } + buf->reset(); + + { + std::lock_guard l(vstatfs_lock); + osd_pools[pool_id].publish(buf); + } + + string key_prefix; + _key_encode_u64(pool_id, &key_prefix); + *out_per_pool_omap = per_pool_omap != OMAP_BULK; + // stop calls after db was closed + if (*out_per_pool_omap && db) { + auto prefix = per_pool_omap == OMAP_PER_POOL ? 
+ PREFIX_PERPOOL_OMAP : + PREFIX_PERPG_OMAP; + buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix); + } + + dout(10) << __func__ << *buf << dendl; + return 0; +} + +void BlueStore::_check_legacy_statfs_alert() +{ + string s; + if (!per_pool_stat_collection && + cct->_conf->bluestore_warn_on_legacy_statfs) { + s = "legacy statfs reporting detected, " + "suggest to run store repair to get consistent statistic reports"; + } + std::lock_guard l(qlock); + legacy_statfs_alert = s; +} + +void BlueStore::_check_no_per_pg_or_pool_omap_alert() +{ + string per_pg, per_pool; + if (per_pool_omap != OMAP_PER_PG) { + if (cct->_conf->bluestore_warn_on_no_per_pg_omap) { + per_pg = "legacy (not per-pg) omap detected, " + "suggest to run store repair to benefit from faster PG removal"; + } + if (per_pool_omap != OMAP_PER_POOL) { + if (cct->_conf->bluestore_warn_on_no_per_pool_omap) { + per_pool = "legacy (not per-pool) omap detected, " + "suggest to run store repair to benefit from per-pool omap usage statistics"; + } + } + } + std::lock_guard l(qlock); + no_per_pg_omap_alert = per_pg; + no_per_pool_omap_alert = per_pool; +} + +// --------------- +// cache + +BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid) +{ + std::shared_lock l(coll_lock); + ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return CollectionRef(); + return cp->second; +} + +BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid) +{ + std::shared_lock l(coll_lock); + + // FIXME: we must replace this with something more efficient + + for (auto& i : coll_map) { + spg_t spgid; + if (i.first.is_pg(&spgid) && + i.second->contains(oid)) { + return i.second; + } + } + return CollectionRef(); +} + +void BlueStore::_queue_reap_collection(CollectionRef& c) +{ + dout(10) << __func__ << " " << c << " " << c->cid << dendl; + // _reap_collections and this in the same thread, + // so no need a lock. + removed_collections.push_back(c); +} + +void BlueStore::_reap_collections() +{ + + list<CollectionRef> removed_colls; + { + // _queue_reap_collection and this in the same thread. + // So no need a lock. 
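+ // take the currently queued batch; collections we cannot reap yet are
+ // spliced back at the end for a later pass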
+ if (!removed_collections.empty()) + removed_colls.swap(removed_collections); + else + return; + } + + list<CollectionRef>::iterator p = removed_colls.begin(); + while (p != removed_colls.end()) { + CollectionRef c = *p; + dout(10) << __func__ << " " << c << " " << c->cid << dendl; + if (c->onode_space.map_any([&](Onode* o) { + ceph_assert(!o->exists); + if (o->flushing_count.load()) { + dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid + << " flush_txns " << o->flushing_count << dendl; + return true; + } + return false; + })) { + ++p; + continue; + } + c->onode_space.clear(); + p = removed_colls.erase(p); + dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl; + } + if (removed_colls.empty()) { + dout(10) << __func__ << " all reaped" << dendl; + } else { + removed_collections.splice(removed_collections.begin(), removed_colls); + } +} + +void BlueStore::_update_logger() +{ + uint64_t num_onodes = 0; + uint64_t num_pinned_onodes = 0; + uint64_t num_extents = 0; + uint64_t num_blobs = 0; + uint64_t num_buffers = 0; + uint64_t num_buffer_bytes = 0; + for (auto c : onode_cache_shards) { + c->add_stats(&num_onodes, &num_pinned_onodes); + } + for (auto c : buffer_cache_shards) { + c->add_stats(&num_extents, &num_blobs, + &num_buffers, &num_buffer_bytes); + } + logger->set(l_bluestore_onodes, num_onodes); + logger->set(l_bluestore_pinned_onodes, num_pinned_onodes); + logger->set(l_bluestore_extents, num_extents); + logger->set(l_bluestore_blobs, num_blobs); + logger->set(l_bluestore_buffers, num_buffers); + logger->set(l_bluestore_buffer_bytes, num_buffer_bytes); +} + +// --------------- +// read operations + +ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid) +{ + return _get_collection(cid); +} + +ObjectStore::CollectionHandle BlueStore::create_new_collection( + const coll_t& cid) +{ + std::unique_lock l{coll_lock}; + auto c = ceph::make_ref<Collection>( + this, + onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())], + buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())], + cid); + new_coll_map[cid] = c; + _osr_attach(c.get()); + return c; +} + +void BlueStore::set_collection_commit_queue( + const coll_t& cid, + ContextQueue *commit_queue) +{ + if (commit_queue) { + std::shared_lock l(coll_lock); + if (coll_map.count(cid)) { + coll_map[cid]->commit_queue = commit_queue; + } else if (new_coll_map.count(cid)) { + new_coll_map[cid]->commit_queue = commit_queue; + } + } +} + + +bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return false; + + bool r = true; + + { + std::shared_lock l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + r = false; + } + + return r; +} + +int BlueStore::stat( + CollectionHandle &c_, + const ghobject_t& oid, + struct stat *st, + bool allow_eio) +{ + Collection *c = static_cast<Collection *>(c_.get()); + if (!c->exists) + return -ENOENT; + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + + { + std::shared_lock l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return -ENOENT; + st->st_size = o->onode.size; + st->st_blksize = 4096; + st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; + st->st_nlink = 1; + } + + int r = 0; + if (_debug_mdata_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << 
dendl; + } + return r; +} +int BlueStore::set_collection_opts( + CollectionHandle& ch, + const pool_opts_t& opts) +{ + Collection *c = static_cast<Collection *>(ch.get()); + dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl; + if (!c->exists) + return -ENOENT; + std::unique_lock l{c->lock}; + c->pool_opts = opts; + return 0; +} + +int BlueStore::read( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags) +{ + auto start = mono_clock::now(); + Collection *c = static_cast<Collection *>(c_.get()); + const coll_t &cid = c->get_cid(); + dout(15) << __func__ << " " << cid << " " << oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + if (!c->exists) + return -ENOENT; + + bl.clear(); + int r; + { + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + OnodeRef o = c->get_onode(oid, false); + log_latency("get_onode@read", + l_bluestore_read_onode_meta_lat, + mono_clock::now() - start1, + cct->_conf->bluestore_log_op_age); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (offset == length && offset == 0) + length = o->onode.size; + + r = _do_read(c, o, offset, length, bl, op_flags); + if (r == -EIO) { + logger->inc(l_bluestore_read_eio); + } + } + + out: + if (r >= 0 && _debug_data_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */ + cct->_conf->bluestore_debug_random_read_err && + (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * + 100.0)) == 0) { + dout(0) << __func__ << ": inject random EIO" << dendl; + r = -EIO; + } + dout(10) << __func__ << " " << cid << " " << oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + log_latency(__func__, + l_bluestore_read_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + return r; +} + +void BlueStore::_read_cache( + OnodeRef& o, + uint64_t offset, + size_t length, + int read_cache_policy, + ready_regions_t& ready_regions, + blobs2read_t& blobs2read) +{ + // build blob-wise list to of stuff read (that isn't cached) + unsigned left = length; + uint64_t pos = offset; + auto lp = o->extent_map.seek_lextent(offset); + while (left > 0 && lp != o->extent_map.extent_map.end()) { + if (pos < lp->logical_offset) { + unsigned hole = lp->logical_offset - pos; + if (hole >= left) { + break; + } + dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole + << std::dec << dendl; + pos += hole; + left -= hole; + } + BlobRef& bptr = lp->blob; + unsigned l_off = pos - lp->logical_offset; + unsigned b_off = l_off + lp->blob_offset; + unsigned b_len = std::min(left, lp->length - l_off); + + ready_regions_t cache_res; + interval_set<uint32_t> cache_interval; + bptr->shared_blob->bc.read( + bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval, + read_cache_policy); + dout(20) << __func__ << " blob " << *bptr << std::hex + << " need 0x" << b_off << "~" << b_len + << " cache has 0x" << cache_interval + << std::dec << dendl; + + auto pc = cache_res.begin(); + uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size); + while (b_len > 0) { + unsigned l; + if (pc != cache_res.end() && + pc->first == b_off) { + l = pc->second.length(); + ready_regions[pos] = std::move(pc->second); + dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x" + << b_off << "~" << l << std::dec << dendl; + ++pc; + } else { + l = b_len; + 
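+ // nothing cached at b_off; read from disk up to the next cached
+ // region, or to the end of this span if none follows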
if (pc != cache_res.end()) { + ceph_assert(pc->first > b_off); + l = pc->first - b_off; + } + dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x" + << b_off << "~" << l << std::dec << dendl; + // merge regions + { + uint64_t r_off = b_off; + uint64_t r_len = l; + uint64_t front = r_off % chunk_size; + if (front) { + r_off -= front; + r_len += front; + } + unsigned tail = r_len % chunk_size; + if (tail) { + r_len += chunk_size - tail; + } + bool merged = false; + regions2read_t& r2r = blobs2read[bptr]; + if (r2r.size()) { + read_req_t& pre = r2r.back(); + if (r_off <= (pre.r_off + pre.r_len)) { + front += (r_off - pre.r_off); + pre.r_len += (r_off + r_len - pre.r_off - pre.r_len); + pre.regs.emplace_back(region_t(pos, b_off, l, front)); + merged = true; + } + } + if (!merged) { + read_req_t req(r_off, r_len); + req.regs.emplace_back(region_t(pos, b_off, l, front)); + r2r.emplace_back(std::move(req)); + } + } + } + pos += l; + b_off += l; + left -= l; + b_len -= l; + } + ++lp; + } +} + +int BlueStore::_prepare_read_ioc( + blobs2read_t& blobs2read, + vector<bufferlist>* compressed_blob_bls, + IOContext* ioc) +{ + for (auto& p : blobs2read) { + const BlobRef& bptr = p.first; + regions2read_t& r2r = p.second; + dout(20) << __func__ << " blob " << *bptr << " need " + << r2r << dendl; + if (bptr->get_blob().is_compressed()) { + // read the whole thing + if (compressed_blob_bls->empty()) { + // ensure we avoid any reallocation on subsequent blobs + compressed_blob_bls->reserve(blobs2read.size()); + } + compressed_blob_bls->push_back(bufferlist()); + bufferlist& bl = compressed_blob_bls->back(); + auto r = bptr->get_blob().map( + 0, bptr->get_blob().get_ondisk_length(), + [&](uint64_t offset, uint64_t length) { + int r = bdev->aio_read(offset, length, &bl, ioc); + if (r < 0) + return r; + return 0; + }); + if (r < 0) { + derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl; + if (r == -EIO) { + // propagate EIO to caller + return r; + } + ceph_assert(r == 0); + } + } else { + // read the pieces + for (auto& req : r2r) { + dout(20) << __func__ << " region 0x" << std::hex + << req.regs.front().logical_offset + << ": 0x" << req.regs.front().blob_xoffset + << " reading 0x" << req.r_off + << "~" << req.r_len << std::dec + << dendl; + + // read it + auto r = bptr->get_blob().map( + req.r_off, req.r_len, + [&](uint64_t offset, uint64_t length) { + int r = bdev->aio_read(offset, length, &req.bl, ioc); + if (r < 0) + return r; + return 0; + }); + if (r < 0) { + derr << __func__ << " bdev-read failed: " << cpp_strerror(r) + << dendl; + if (r == -EIO) { + // propagate EIO to caller + return r; + } + ceph_assert(r == 0); + } + ceph_assert(req.bl.length() == req.r_len); + } + } + } + return 0; +} + +int BlueStore::_generate_read_result_bl( + OnodeRef& o, + uint64_t offset, + size_t length, + ready_regions_t& ready_regions, + vector<bufferlist>& compressed_blob_bls, + blobs2read_t& blobs2read, + bool buffered, + bool* csum_error, + bufferlist& bl) +{ + // enumerate and decompress desired blobs + auto p = compressed_blob_bls.begin(); + blobs2read_t::iterator b2r_it = blobs2read.begin(); + while (b2r_it != blobs2read.end()) { + const BlobRef& bptr = b2r_it->first; + regions2read_t& r2r = b2r_it->second; + dout(20) << __func__ << " blob " << *bptr << " need " + << r2r << dendl; + if (bptr->get_blob().is_compressed()) { + ceph_assert(p != compressed_blob_bls.end()); + bufferlist& compressed_bl = *p++; + if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl, + 
r2r.front().regs.front().logical_offset) < 0) { + *csum_error = true; + return -EIO; + } + bufferlist raw_bl; + auto r = _decompress(compressed_bl, &raw_bl); + if (r < 0) + return r; + if (buffered) { + bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0, + raw_bl); + } + for (auto& req : r2r) { + for (auto& r : req.regs) { + ready_regions[r.logical_offset].substr_of( + raw_bl, r.blob_xoffset, r.length); + } + } + } else { + for (auto& req : r2r) { + if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl, + req.regs.front().logical_offset) < 0) { + *csum_error = true; + return -EIO; + } + if (buffered) { + bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), + req.r_off, req.bl); + } + + // prune and keep result + for (const auto& r : req.regs) { + ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length); + } + } + } + ++b2r_it; + } + + // generate a resulting buffer + auto pr = ready_regions.begin(); + auto pr_end = ready_regions.end(); + uint64_t pos = 0; + while (pos < length) { + if (pr != pr_end && pr->first == pos + offset) { + dout(30) << __func__ << " assemble 0x" << std::hex << pos + << ": data from 0x" << pr->first << "~" << pr->second.length() + << std::dec << dendl; + pos += pr->second.length(); + bl.claim_append(pr->second); + ++pr; + } else { + uint64_t l = length - pos; + if (pr != pr_end) { + ceph_assert(pr->first > pos + offset); + l = pr->first - (pos + offset); + } + dout(30) << __func__ << " assemble 0x" << std::hex << pos + << ": zeros for 0x" << (pos + offset) << "~" << l + << std::dec << dendl; + bl.append_zero(l); + pos += l; + } + } + ceph_assert(bl.length() == length); + ceph_assert(pos == length); + ceph_assert(pr == pr_end); + return 0; +} + +int BlueStore::_do_read( + Collection *c, + OnodeRef& o, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags, + uint64_t retry_count) +{ + FUNCTRACE(cct); + int r = 0; + int read_cache_policy = 0; // do not bypass clean or dirty cache + + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " size 0x" << o->onode.size << " (" << std::dec + << o->onode.size << ")" << dendl; + bl.clear(); + + if (offset >= o->onode.size) { + return r; + } + + // generally, don't buffer anything, unless the client explicitly requests + // it. + bool buffered = false; + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { + dout(20) << __func__ << " will do buffered read" << dendl; + buffered = true; + } else if (cct->_conf->bluestore_default_buffered_read && + (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { + dout(20) << __func__ << " defaulting to buffered read" << dendl; + buffered = true; + } + + if (offset + length > o->onode.size) { + length = o->onode.size - offset; + } + + auto start = mono_clock::now(); + o->extent_map.fault_range(db, offset, length); + log_latency(__func__, + l_bluestore_read_onode_meta_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + _dump_onode<30>(cct, *o); + + // for deep-scrub, we only read dirty cache and bypass clean cache in + // order to read underlying block device in case there are silent disk errors. 
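+ // (dirty buffers are still served from cache since they may not have
+ // reached the device yet)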
+ if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) { + dout(20) << __func__ << " will bypass cache and do direct read" << dendl; + read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE; + } + + // build blob-wise list to of stuff read (that isn't cached) + ready_regions_t ready_regions; + blobs2read_t blobs2read; + _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read); + + + // read raw blob data. + start = mono_clock::now(); // for the sake of simplicity + // measure the whole block below. + // The error isn't that much... + vector<bufferlist> compressed_blob_bls; + IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio); + r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc); + // we always issue aio for reading, so errors other than EIO are not allowed + if (r < 0) + return r; + + int64_t num_ios = blobs2read.size(); + if (ioc.has_pending_aios()) { + num_ios = ioc.get_num_ios(); + bdev->aio_submit(&ioc); + dout(20) << __func__ << " waiting for aio" << dendl; + ioc.aio_wait(); + r = ioc.get_return_value(); + if (r < 0) { + ceph_assert(r == -EIO); // no other errors allowed + return -EIO; + } + } + log_latency_fn(__func__, + l_bluestore_read_wait_aio_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age, + [&](auto lat) { return ", num_ios = " + stringify(num_ios); } + ); + + bool csum_error = false; + r = _generate_read_result_bl(o, offset, length, ready_regions, + compressed_blob_bls, blobs2read, + buffered && !ioc.skip_cache(), + &csum_error, bl); + if (csum_error) { + // Handles spurious read errors caused by a kernel bug. + // We sometimes get all-zero pages as a result of the read under + // high memory pressure. Retrying the failing read succeeds in most + // cases. + // See also: http://tracker.ceph.com/issues/22464 + if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { + return -EIO; + } + return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1); + } + r = bl.length(); + if (retry_count) { + logger->inc(l_bluestore_reads_with_retries); + dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length + << " failed " << std::dec << retry_count << " times before succeeding" << dendl; + stringstream s; + s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries); + _set_spurious_read_errors_alert(s.str()); + } + return r; +} + +int BlueStore::_verify_csum(OnodeRef& o, + const bluestore_blob_t* blob, uint64_t blob_xoffset, + const bufferlist& bl, + uint64_t logical_offset) const +{ + int bad; + uint64_t bad_csum; + auto start = mono_clock::now(); + int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum); + if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 && + (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) { + derr << __func__ << " injecting bluestore checksum verifcation error" << dendl; + bad = blob_xoffset; + r = -1; + bad_csum = 0xDEADBEEF; + } + if (r < 0) { + if (r == -1) { + PExtentVector pex; + blob->map( + bad, + blob->get_csum_chunk_size(), + [&](uint64_t offset, uint64_t length) { + pex.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + derr << __func__ << " bad " + << Checksummer::get_csum_type_string(blob->csum_type) + << "/0x" << std::hex << blob->get_csum_chunk_size() + << " checksum at blob offset 0x" << bad + << ", got 0x" << bad_csum << ", expected 0x" + << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec + << ", device location " << pex + << ", logical extent 0x" << 
std::hex + << (logical_offset + bad - blob_xoffset) << "~" + << blob->get_csum_chunk_size() << std::dec + << ", object " << o->oid + << dendl; + } else { + derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl; + } + } + log_latency(__func__, + l_bluestore_csum_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + if (cct->_conf->bluestore_ignore_data_csum) { + return 0; + } + return r; +} + +int BlueStore::_decompress(bufferlist& source, bufferlist* result) +{ + int r = 0; + auto start = mono_clock::now(); + auto i = source.cbegin(); + bluestore_compression_header_t chdr; + decode(chdr, i); + int alg = int(chdr.type); + CompressorRef cp = compressor; + if (!cp || (int)cp->get_type() != alg) { + cp = Compressor::create(cct, alg); + } + + if (!cp.get()) { + // if compressor isn't available - error, because cannot return + // decompressed data? + + const char* alg_name = Compressor::get_comp_alg_name(alg); + derr << __func__ << " can't load decompressor " << alg_name << dendl; + _set_compression_alert(false, alg_name); + r = -EIO; + } else { + r = cp->decompress(i, chdr.length, *result, chdr.compressor_message); + if (r < 0) { + derr << __func__ << " decompression failed with exit code " << r << dendl; + r = -EIO; + } + } + log_latency(__func__, + l_bluestore_decompress_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + return r; +} + +// this stores fiemap into interval_set, other variations +// use it internally +int BlueStore::_fiemap( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + interval_set<uint64_t>& destset) +{ + Collection *c = static_cast<Collection *>(c_.get()); + if (!c->exists) + return -ENOENT; + { + std::shared_lock l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + return -ENOENT; + } + _dump_onode<30>(cct, *o); + + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " size 0x" << o->onode.size << std::dec << dendl; + + boost::intrusive::set<Extent>::iterator ep, eend; + if (offset >= o->onode.size) + goto out; + + if (offset + length > o->onode.size) { + length = o->onode.size - offset; + } + + o->extent_map.fault_range(db, offset, length); + eend = o->extent_map.extent_map.end(); + ep = o->extent_map.seek_lextent(offset); + while (length > 0) { + dout(20) << __func__ << " offset " << offset << dendl; + if (ep != eend && ep->logical_offset + ep->length <= offset) { + ++ep; + continue; + } + + uint64_t x_len = length; + if (ep != eend && ep->logical_offset <= offset) { + uint64_t x_off = offset - ep->logical_offset; + x_len = std::min(x_len, ep->length - x_off); + dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~" + << x_len << std::dec << " blob " << ep->blob << dendl; + destset.insert(offset, x_len); + length -= x_len; + offset += x_len; + if (x_off + x_len == ep->length) + ++ep; + continue; + } + if (ep != eend && + ep->logical_offset > offset && + ep->logical_offset - offset < x_len) { + x_len = ep->logical_offset - offset; + } + offset += x_len; + length -= x_len; + } + } + + out: + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length + << " size = 0x(" << destset << ")" << std::dec << dendl; + return 0; +} + +int BlueStore::fiemap( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl) +{ + interval_set<uint64_t> m; + int r = _fiemap(c_, oid, offset, length, m); + if (r >= 0) { + encode(m, bl); + } + return r; +} + +int 
BlueStore::fiemap( + CollectionHandle &c_, + const ghobject_t& oid, + uint64_t offset, + size_t length, + map<uint64_t, uint64_t>& destmap) +{ + interval_set<uint64_t> m; + int r = _fiemap(c_, oid, offset, length, m); + if (r >= 0) { + destmap = std::move(m).detach(); + } + return r; +} + +int BlueStore::readv( + CollectionHandle &c_, + const ghobject_t& oid, + interval_set<uint64_t>& m, + bufferlist& bl, + uint32_t op_flags) +{ + auto start = mono_clock::now(); + Collection *c = static_cast<Collection *>(c_.get()); + const coll_t &cid = c->get_cid(); + dout(15) << __func__ << " " << cid << " " << oid + << " fiemap " << m + << dendl; + if (!c->exists) + return -ENOENT; + + bl.clear(); + int r; + { + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + OnodeRef o = c->get_onode(oid, false); + log_latency("get_onode@read", + l_bluestore_read_onode_meta_lat, + mono_clock::now() - start1, + cct->_conf->bluestore_log_op_age); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (m.empty()) { + r = 0; + goto out; + } + + r = _do_readv(c, o, m, bl, op_flags); + if (r == -EIO) { + logger->inc(l_bluestore_read_eio); + } + } + + out: + if (r >= 0 && _debug_data_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */ + cct->_conf->bluestore_debug_random_read_err && + (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * + 100.0)) == 0) { + dout(0) << __func__ << ": inject random EIO" << dendl; + r = -EIO; + } + dout(10) << __func__ << " " << cid << " " << oid + << " fiemap " << m << std::dec + << " = " << r << dendl; + log_latency(__func__, + l_bluestore_read_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + return r; +} + +int BlueStore::_do_readv( + Collection *c, + OnodeRef& o, + const interval_set<uint64_t>& m, + bufferlist& bl, + uint32_t op_flags, + uint64_t retry_count) +{ + FUNCTRACE(cct); + int r = 0; + int read_cache_policy = 0; // do not bypass clean or dirty cache + + dout(20) << __func__ << " fiemap " << m << std::hex + << " size 0x" << o->onode.size << " (" << std::dec + << o->onode.size << ")" << dendl; + + // generally, don't buffer anything, unless the client explicitly requests + // it. + bool buffered = false; + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { + dout(20) << __func__ << " will do buffered read" << dendl; + buffered = true; + } else if (cct->_conf->bluestore_default_buffered_read && + (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { + dout(20) << __func__ << " defaulting to buffered read" << dendl; + buffered = true; + } + // this method must be idempotent since we may call it several times + // before we finally read the expected result. + bl.clear(); + + // call fiemap first! 
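+ // the interval set must come from a prior fiemap() on this object, so
+ // every requested range is expected to lie within the object size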
+ ceph_assert(m.range_start() <= o->onode.size); + ceph_assert(m.range_end() <= o->onode.size); + auto start = mono_clock::now(); + o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start()); + log_latency(__func__, + l_bluestore_read_onode_meta_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + _dump_onode<30>(cct, *o); + + IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio); + vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results; + raw_results.reserve(m.num_intervals()); + int i = 0; + for (auto p = m.begin(); p != m.end(); p++, i++) { + raw_results.push_back({}); + _read_cache(o, p.get_start(), p.get_len(), read_cache_policy, + std::get<0>(raw_results[i]), std::get<2>(raw_results[i])); + r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc); + // we always issue aio for reading, so errors other than EIO are not allowed + if (r < 0) + return r; + } + + auto num_ios = m.size(); + if (ioc.has_pending_aios()) { + num_ios = ioc.get_num_ios(); + bdev->aio_submit(&ioc); + dout(20) << __func__ << " waiting for aio" << dendl; + ioc.aio_wait(); + r = ioc.get_return_value(); + if (r < 0) { + ceph_assert(r == -EIO); // no other errors allowed + return -EIO; + } + } + log_latency_fn(__func__, + l_bluestore_read_wait_aio_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age, + [&](auto lat) { return ", num_ios = " + stringify(num_ios); } + ); + + ceph_assert(raw_results.size() == (size_t)m.num_intervals()); + i = 0; + for (auto p = m.begin(); p != m.end(); p++, i++) { + bool csum_error = false; + bufferlist t; + r = _generate_read_result_bl(o, p.get_start(), p.get_len(), + std::get<0>(raw_results[i]), + std::get<1>(raw_results[i]), + std::get<2>(raw_results[i]), + buffered, &csum_error, t); + if (csum_error) { + // Handles spurious read errors caused by a kernel bug. + // We sometimes get all-zero pages as a result of the read under + // high memory pressure. Retrying the failing read succeeds in most + // cases. + // See also: http://tracker.ceph.com/issues/22464 + if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { + return -EIO; + } + return _do_readv(c, o, m, bl, op_flags, retry_count + 1); + } + bl.claim_append(t); + } + if (retry_count) { + logger->inc(l_bluestore_reads_with_retries); + dout(5) << __func__ << " read fiemap " << m + << " failed " << retry_count << " times before succeeding" + << dendl; + } + return bl.length(); +} + +int BlueStore::dump_onode(CollectionHandle &c_, + const ghobject_t& oid, + const string& section_name, + Formatter *f) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return -ENOENT; + + int r; + { + std::shared_lock l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + // FIXME minor: actually the next line isn't enough to + // load shared blobs. Leaving as is for now.. 
+ // + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + + _dump_onode<0>(cct, *o); + f->open_object_section(section_name.c_str()); + o->dump(f); + f->close_section(); + r = 0; + } + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " = " << r << dendl; + return r; +} + +int BlueStore::getattr( + CollectionHandle &c_, + const ghobject_t& oid, + const char *name, + bufferptr& value) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl; + if (!c->exists) + return -ENOENT; + + int r; + { + std::shared_lock l(c->lock); + mempool::bluestore_cache_meta::string k(name); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (!o->onode.attrs.count(k)) { + r = -ENODATA; + goto out; + } + value = o->onode.attrs[k]; + r = 0; + } + out: + if (r == 0 && _debug_mdata_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } + dout(10) << __func__ << " " << c->cid << " " << oid << " " << name + << " = " << r << dendl; + return r; +} + +int BlueStore::getattrs( + CollectionHandle &c_, + const ghobject_t& oid, + map<string,bufferptr,less<>>& aset) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + if (!c->exists) + return -ENOENT; + + int r; + { + std::shared_lock l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + for (auto& i : o->onode.attrs) { + aset.emplace(i.first.c_str(), i.second); + } + r = 0; + } + + out: + if (r == 0 && _debug_mdata_eio(oid)) { + r = -EIO; + derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl; + } + dout(10) << __func__ << " " << c->cid << " " << oid + << " = " << r << dendl; + return r; +} + +int BlueStore::list_collections(vector<coll_t>& ls) +{ + std::shared_lock l(coll_lock); + ls.reserve(coll_map.size()); + for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) + ls.push_back(p->first); + return 0; +} + +bool BlueStore::collection_exists(const coll_t& c) +{ + std::shared_lock l(coll_lock); + return coll_map.count(c); +} + +int BlueStore::collection_empty(CollectionHandle& ch, bool *empty) +{ + dout(15) << __func__ << " " << ch->cid << dendl; + vector<ghobject_t> ls; + ghobject_t next; + int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1, + &ls, &next); + if (r < 0) { + derr << __func__ << " collection_list returned: " << cpp_strerror(r) + << dendl; + return r; + } + *empty = ls.empty(); + dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl; + return 0; +} + +int BlueStore::collection_bits(CollectionHandle& ch) +{ + dout(15) << __func__ << " " << ch->cid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + std::shared_lock l(c->lock); + dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl; + return c->cnode.bits; +} + +int BlueStore::collection_list( + CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext) +{ + Collection *c = static_cast<Collection *>(c_.get()); + c->flush(); + dout(15) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max << dendl; + int r; + { + std::shared_lock l(c->lock); + r = _collection_list(c, start, end, max, false, ls, pnext); + } + + dout(10) << __func__ << " " << c->cid 
+ << " start " << start << " end " << end << " max " << max + << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; + return r; +} + +int BlueStore::collection_list_legacy( + CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext) +{ + Collection *c = static_cast<Collection *>(c_.get()); + c->flush(); + dout(15) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max << dendl; + int r; + { + std::shared_lock l(c->lock); + r = _collection_list(c, start, end, max, true, ls, pnext); + } + + dout(10) << __func__ << " " << c->cid + << " start " << start << " end " << end << " max " << max + << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl; + return r; +} + +int BlueStore::_collection_list( + Collection *c, const ghobject_t& start, const ghobject_t& end, int max, + bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext) +{ + + if (!c->exists) + return -ENOENT; + + ghobject_t static_next; + std::unique_ptr<CollectionListIterator> it; + ghobject_t coll_range_temp_start, coll_range_temp_end; + ghobject_t coll_range_start, coll_range_end; + ghobject_t pend; + bool temp; + + if (!pnext) + pnext = &static_next; + + auto log_latency = make_scope_guard( + [&, start_time = mono_clock::now(), func_name = __func__] { + log_latency_fn( + func_name, + l_bluestore_clist_lat, + mono_clock::now() - start_time, + cct->_conf->bluestore_log_collection_list_age, + [&](const ceph::timespan& lat) { + ostringstream ostr; + ostr << ", lat = " << timespan_str(lat) + << " cid =" << c->cid + << " start " << start << " end " << end + << " max " << max; + return ostr.str(); + }); + }); + + if (start.is_max() || start.hobj.is_max()) { + *pnext = ghobject_t::get_max(); + return 0; + } + get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start, + &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy); + dout(20) << __func__ + << " range " << coll_range_temp_start + << " to " << coll_range_temp_end + << " and " << coll_range_start + << " to " << coll_range_end + << " start " << start << dendl; + if (legacy) { + it = std::make_unique<SimpleCollectionListIterator>( + cct, db->get_iterator(PREFIX_OBJ)); + } else { + it = std::make_unique<SortedCollectionListIterator>( + db->get_iterator(PREFIX_OBJ)); + } + if (start == ghobject_t() || + start.hobj == hobject_t() || + start == c->cid.get_min_hobj()) { + it->upper_bound(coll_range_temp_start); + temp = true; + } else { + if (start.hobj.is_temp()) { + temp = true; + ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end); + } else { + temp = false; + ceph_assert(start >= coll_range_start && start < coll_range_end); + } + dout(20) << __func__ << " temp=" << (int)temp << dendl; + it->lower_bound(start); + } + if (end.hobj.is_max()) { + pend = temp ? coll_range_temp_end : coll_range_end; + } else { + if (end.hobj.is_temp()) { + if (temp) { + pend = end; + } else { + *pnext = ghobject_t::get_max(); + return 0; + } + } else { + pend = temp ? 
coll_range_temp_end : end; + } + } + dout(20) << __func__ << " pend " << pend << dendl; + while (true) { + if (!it->valid() || it->is_ge(pend)) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; + if (temp) { + if (end.hobj.is_temp()) { + if (it->valid() && it->is_lt(coll_range_temp_end)) { + *pnext = it->oid(); + return 0; + } + break; + } + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(coll_range_start); + if (end.hobj.is_max()) + pend = coll_range_end; + else + pend = end; + dout(30) << __func__ << " pend " << pend << dendl; + continue; + } + if (it->valid() && it->is_lt(coll_range_end)) { + *pnext = it->oid(); + return 0; + } + break; + } + dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl; + if (ls->size() >= (unsigned)max) { + dout(20) << __func__ << " reached max " << max << dendl; + *pnext = it->oid(); + return 0; + } + ls->push_back(it->oid()); + it->next(); + } + *pnext = ghobject_t::get_max(); + return 0; +} + +int BlueStore::omap_get( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + return _omap_get(c, oid, header, out); +} + +int BlueStore::_omap_get( + Collection *c, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map + ) +{ + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + std::shared_lock l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + r = _onode_omap_get(o, header, out); + out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::_onode_omap_get( + const OnodeRef &o, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map<string, bufferlist> *out /// < [out] Key to value map +) +{ + int r = 0; + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + o->flush(); + { + const string& prefix = o->get_omap_prefix(); + string head, tail; + o->get_omap_header(&head); + o->get_omap_tail(&tail); + KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail}); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + *header = it->value(); + } else if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + string user_key; + o->decode_omap_key(it->key(), &user_key); + dout(20) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + (*out)[user_key] = it->value(); + } + it->next(); + } + } +out: + return r; +} + +int BlueStore::omap_get_header( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << 
" oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + std::shared_lock l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + o->flush(); + { + string head; + o->get_omap_header(&head); + if (db->get(o->get_omap_prefix(), head, header) >= 0) { + dout(30) << __func__ << " got header" << dendl; + } else { + dout(30) << __func__ << " no header" << dendl; + } + } + out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::omap_get_keys( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set<string> *keys ///< [out] Keys defined on oid + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + auto start1 = mono_clock::now(); + std::shared_lock l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) + goto out; + o->flush(); + { + const string& prefix = o->get_omap_prefix(); + string head, tail; + o->get_omap_key(string(), &head); + o->get_omap_tail(&tail); + KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail}); + it->lower_bound(head); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } + string user_key; + o->decode_omap_key(it->key(), &user_key); + dout(20) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + keys->insert(user_key); + it->next(); + } + } + out: + c->store->log_latency( + __func__, + l_bluestore_omap_get_keys_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +int BlueStore::omap_get_values( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to get + map<string, bufferlist> *out ///< [out] Returned keys and values + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + std::shared_lock l(c->lock); + auto start1 = mono_clock::now(); + int r = 0; + string final_key; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) { + goto out; + } + o->flush(); + { + const string& prefix = o->get_omap_prefix(); + o->get_omap_key(string(), &final_key); + size_t base_key_len = final_key.size(); + for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { + final_key.resize(base_key_len); // keep prefix + final_key += *p; + bufferlist val; + if (db->get(prefix, final_key, &val) >= 0) { + dout(30) << __func__ << " got " << pretty_binary_string(final_key) + << " -> " << *p << dendl; + out->insert(make_pair(*p, val)); + } + } + } + out: + c->store->log_latency( + __func__, + l_bluestore_omap_get_values_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +#ifdef WITH_SEASTAR +int 
BlueStore::omap_get_values( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::optional<string> &start_after, ///< [in] Keys to get + map<string, bufferlist> *output ///< [out] Returned keys and values + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + std::shared_lock l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) { + goto out; + } + o->flush(); + { + ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid); + if (!iter) { + r = -ENOENT; + goto out; + } + iter->upper_bound(*start_after); + for (; iter->valid(); iter->next()) { + output->insert(make_pair(iter->key(), iter->value())); + } + } + +out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} +#endif + +int BlueStore::omap_check_keys( + CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set<string> &keys, ///< [in] Keys to check + set<string> *out ///< [out] Subset of keys defined on oid + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; + if (!c->exists) + return -ENOENT; + std::shared_lock l(c->lock); + int r = 0; + string final_key; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.has_omap()) { + goto out; + } + o->flush(); + { + const string& prefix = o->get_omap_prefix(); + o->get_omap_key(string(), &final_key); + size_t base_key_len = final_key.size(); + for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) { + final_key.resize(base_key_len); // keep prefix + final_key += *p; + bufferlist val; + if (db->get(prefix, final_key, &val) >= 0) { + dout(30) << __func__ << " have " << pretty_binary_string(final_key) + << " -> " << *p << dendl; + out->insert(*p); + } else { + dout(30) << __func__ << " miss " << pretty_binary_string(final_key) + << " -> " << *p << dendl; + } + } + } + out: + dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r + << dendl; + return r; +} + +ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( + CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + if (!c->exists) { + return ObjectMap::ObjectMapIterator(); + } + std::shared_lock l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return ObjectMap::ObjectMapIterator(); + } + o->flush(); + dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl; + auto bounds = KeyValueDB::IteratorBounds(); + if (o->onode.has_omap()) { + std::string lower_bound, upper_bound; + o->get_omap_key(string(), &lower_bound); + o->get_omap_tail(&upper_bound); + bounds.lower_bound = std::move(lower_bound); + bounds.upper_bound = std::move(upper_bound); + } + KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds)); + return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it)); +} + +// ----------------- +// write helpers + +uint64_t 
BlueStore::_get_ondisk_reserved() const { + ceph_assert(min_alloc_size); + return round_up_to( + std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size); +} + +void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t) +{ + dout(10) << __func__ << " ondisk_format " << ondisk_format + << " min_compat_ondisk_format " << min_compat_ondisk_format + << dendl; + ceph_assert(ondisk_format == latest_ondisk_format); + { + bufferlist bl; + encode(ondisk_format, bl); + t->set(PREFIX_SUPER, "ondisk_format", bl); + } + { + bufferlist bl; + encode(min_compat_ondisk_format, bl); + t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl); + } +} + +int BlueStore::_open_super_meta() +{ + // nid + { + nid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "nid_max", &bl); + auto p = bl.cbegin(); + try { + uint64_t v; + decode(v, p); + nid_max = v; + } catch (ceph::buffer::error& e) { + derr << __func__ << " unable to read nid_max" << dendl; + return -EIO; + } + dout(1) << __func__ << " old nid_max " << nid_max << dendl; + nid_last = nid_max.load(); + } + + // blobid + { + blobid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "blobid_max", &bl); + auto p = bl.cbegin(); + try { + uint64_t v; + decode(v, p); + blobid_max = v; + } catch (ceph::buffer::error& e) { + derr << __func__ << " unable to read blobid_max" << dendl; + return -EIO; + } + dout(1) << __func__ << " old blobid_max " << blobid_max << dendl; + blobid_last = blobid_max.load(); + } + + // freelist + { + bufferlist bl; + db->get(PREFIX_SUPER, "freelist_type", &bl); + if (bl.length()) { + freelist_type = std::string(bl.c_str(), bl.length()); + } else { + ceph_abort_msg("Not Support extent freelist manager"); + } + dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl; + } + // ondisk format + int32_t compat_ondisk_format = 0; + { + bufferlist bl; + int r = db->get(PREFIX_SUPER, "ondisk_format", &bl); + if (r < 0) { + // base case: kraken bluestore is v1 and readable by v1 + dout(20) << __func__ << " missing ondisk_format; assuming kraken" + << dendl; + ondisk_format = 1; + compat_ondisk_format = 1; + } else { + auto p = bl.cbegin(); + try { + decode(ondisk_format, p); + } catch (ceph::buffer::error& e) { + derr << __func__ << " unable to read ondisk_format" << dendl; + return -EIO; + } + bl.clear(); + { + r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl); + ceph_assert(!r); + auto p = bl.cbegin(); + try { + decode(compat_ondisk_format, p); + } catch (ceph::buffer::error& e) { + derr << __func__ << " unable to read compat_ondisk_format" << dendl; + return -EIO; + } + } + } + dout(1) << __func__ << " ondisk_format " << ondisk_format + << " compat_ondisk_format " << compat_ondisk_format + << dendl; + } + + if (latest_ondisk_format < compat_ondisk_format) { + derr << __func__ << " compat_ondisk_format is " + << compat_ondisk_format << " but we only understand version " + << latest_ondisk_format << dendl; + return -EPERM; + } + + { + bufferlist bl; + db->get(PREFIX_SUPER, "min_alloc_size", &bl); + auto p = bl.cbegin(); + try { + uint64_t val; + decode(val, p); + min_alloc_size = val; + min_alloc_size_order = std::countr_zero(val); + min_alloc_size_mask = min_alloc_size - 1; + + ceph_assert(min_alloc_size == 1u << min_alloc_size_order); + } catch (ceph::buffer::error& e) { + derr << __func__ << " unable to read min_alloc_size" << dendl; + return -EIO; + } + dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size + << std::dec << dendl; + logger->set(l_bluestore_alloc_unit, 
min_alloc_size); + } + + // smr fields + { + bufferlist bl; + int r = db->get(PREFIX_SUPER, "zone_size", &bl); + if (r >= 0) { + auto p = bl.cbegin(); + decode(zone_size, p); + dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl; + ceph_assert(bdev->is_smr()); + } else { + ceph_assert(!bdev->is_smr()); + } + } + { + bufferlist bl; + int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl); + if (r >= 0) { + auto p = bl.cbegin(); + decode(first_sequential_zone, p); + dout(1) << __func__ << " first_sequential_zone 0x" << std::hex + << first_sequential_zone << std::dec << dendl; + ceph_assert(bdev->is_smr()); + } else { + ceph_assert(!bdev->is_smr()); + } + } + + _set_per_pool_omap(); + + _open_statfs(); + _set_alloc_sizes(); + _set_throttle_params(); + + _set_csum(); + _set_compression(); + _set_blob_size(); + + _validate_bdev(); + return 0; +} + +int BlueStore::_upgrade_super() +{ + dout(1) << __func__ << " from " << ondisk_format << ", latest " + << latest_ondisk_format << dendl; + if (ondisk_format < latest_ondisk_format) { + ceph_assert(ondisk_format > 0); + ceph_assert(ondisk_format < latest_ondisk_format); + + KeyValueDB::Transaction t = db->get_transaction(); + if (ondisk_format == 1) { + // changes: + // - super: added ondisk_format + // - super: added min_readable_ondisk_format + // - super: added min_compat_ondisk_format + // - super: added min_alloc_size + // - super: removed min_min_alloc_size + { + bufferlist bl; + db->get(PREFIX_SUPER, "min_min_alloc_size", &bl); + auto p = bl.cbegin(); + try { + uint64_t val; + decode(val, p); + min_alloc_size = val; + } catch (ceph::buffer::error& e) { + derr << __func__ << " failed to read min_min_alloc_size" << dendl; + return -EIO; + } + t->set(PREFIX_SUPER, "min_alloc_size", bl); + t->rmkey(PREFIX_SUPER, "min_min_alloc_size"); + } + ondisk_format = 2; + } + if (ondisk_format == 2) { + // changes: + // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all* + // oondes are using the per-pool prefix until a repair is run; at that + // point the per_pool_omap=1 key will be set. 
+ // - super: added per_pool_omap key, which indicates that *all* objects + // are using the new prefix and key format + ondisk_format = 3; + } + if (ondisk_format == 3) { + // changes: + // - FreelistManager keeps meta within bdev label + int r = _write_out_fm_meta(0); + ceph_assert(r == 0); + ondisk_format = 4; + } + // This to be the last operation + _prepare_ondisk_format_super(t); + int r = db->submit_transaction_sync(t); + ceph_assert(r == 0); + } + // done + dout(1) << __func__ << " done" << dendl; + return 0; +} + +void BlueStore::_assign_nid(TransContext *txc, OnodeRef& o) +{ + if (o->onode.nid) { + ceph_assert(o->exists); + return; + } + uint64_t nid = ++nid_last; + dout(20) << __func__ << " " << nid << dendl; + o->onode.nid = nid; + txc->last_nid = nid; + o->exists = true; +} + +uint64_t BlueStore::_assign_blobid(TransContext *txc) +{ + uint64_t bid = ++blobid_last; + dout(20) << __func__ << " " << bid << dendl; + txc->last_blobid = bid; + return bid; +} + +void BlueStore::get_db_statistics(Formatter *f) +{ + db->get_statistics(f); +} + +BlueStore::TransContext *BlueStore::_txc_create( + Collection *c, OpSequencer *osr, + list<Context*> *on_commits, + TrackedOpRef osd_op) +{ + TransContext *txc = new TransContext(cct, c, osr, on_commits); + txc->t = db->get_transaction(); + +#ifdef WITH_BLKIN + if (osd_op && osd_op->pg_trace) { + txc->trace.init("TransContext", &trace_endpoint, + &osd_op->pg_trace); + txc->trace.event("txc create"); + txc->trace.keyval("txc seq", txc->seq); + } +#endif + + osr->queue_new(txc); + dout(20) << __func__ << " osr " << osr << " = " << txc + << " seq " << txc->seq << dendl; + return txc; +} + +void BlueStore::_txc_calc_cost(TransContext *txc) +{ + // one "io" for the kv commit + auto ios = 1 + txc->ioc.get_num_ios(); + auto cost = throttle_cost_per_io.load(); + txc->cost = ios * cost + txc->bytes; + txc->ios = ios; + dout(10) << __func__ << " " << txc << " cost " << txc->cost << " (" + << ios << " ios * " << cost << " + " << txc->bytes + << " bytes)" << dendl; +} + +void BlueStore::_txc_update_store_statfs(TransContext *txc) +{ + if (txc->statfs_delta.is_empty()) + return; + + logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated()); + logger->inc(l_bluestore_stored, txc->statfs_delta.stored()); + logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed()); + logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated()); + logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original()); + + if (per_pool_stat_collection) { + if (!is_statfs_recoverable()) { + bufferlist bl; + txc->statfs_delta.encode(bl); + string key; + get_pool_stat_key(txc->osd_pool_id, &key); + txc->t->merge(PREFIX_STAT, key, bl); + } + + std::lock_guard l(vstatfs_lock); + auto& stats = osd_pools[txc->osd_pool_id]; + stats += txc->statfs_delta; + + vstatfs += txc->statfs_delta; //non-persistent in this mode + + } else { + if (!is_statfs_recoverable()) { + bufferlist bl; + txc->statfs_delta.encode(bl); + txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl); + } + + std::lock_guard l(vstatfs_lock); + vstatfs += txc->statfs_delta; + } + txc->statfs_delta.reset(); +} + +void BlueStore::_txc_state_proc(TransContext *txc) +{ + while (true) { + dout(10) << __func__ << " txc " << txc + << " " << txc->get_state_name() << dendl; + switch (txc->get_state()) { + case TransContext::STATE_PREPARE: + throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat); + if (txc->ioc.has_pending_aios()) { + 
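+ // data aios were queued during prepare; submit them now and
+ // re-enter this state machine when they complete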
txc->set_state(TransContext::STATE_AIO_WAIT); +#ifdef WITH_BLKIN + if (txc->trace) { + txc->trace.keyval("pending aios", txc->ioc.num_pending.load()); + } +#endif + txc->had_ios = true; + _txc_aio_submit(txc); + return; + } + // ** fall-thru ** + + case TransContext::STATE_AIO_WAIT: + { + mono_clock::duration lat = throttle.log_state_latency( + *txc, logger, l_bluestore_state_aio_wait_lat); + if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) { + dout(0) << __func__ << " slow aio_wait, txc = " << txc + << ", latency = " << lat + << dendl; + } + } + + _txc_finish_io(txc); // may trigger blocked txc's too + return; + + case TransContext::STATE_IO_DONE: + ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io + if (txc->had_ios) { + ++txc->osr->txc_with_unstable_io; + } + throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat); + txc->set_state(TransContext::STATE_KV_QUEUED); + if (cct->_conf->bluestore_sync_submit_transaction) { + if (txc->last_nid >= nid_max || + txc->last_blobid >= blobid_max) { + dout(20) << __func__ + << " last_{nid,blobid} exceeds max, submit via kv thread" + << dendl; + } else if (txc->osr->kv_committing_serially) { + dout(20) << __func__ << " prior txc submitted via kv thread, us too" + << dendl; + // note: this is starvation-prone. once we have a txc in a busy + // sequencer that is committing serially it is possible to keep + // submitting new transactions fast enough that we get stuck doing + // so. the alternative is to block here... fixme? + } else if (txc->osr->txc_with_unstable_io) { + dout(20) << __func__ << " prior txc(s) with unstable ios " + << txc->osr->txc_with_unstable_io.load() << dendl; + } else if (cct->_conf->bluestore_debug_randomize_serial_transaction && + rand() % cct->_conf->bluestore_debug_randomize_serial_transaction + == 0) { + dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread" + << dendl; + } else { + _txc_apply_kv(txc, true); + } + } + { + std::lock_guard l(kv_lock); + kv_queue.push_back(txc); + if (!kv_sync_in_progress) { + kv_sync_in_progress = true; + kv_cond.notify_one(); + } + if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) { + kv_queue_unsubmitted.push_back(txc); + ++txc->osr->kv_committing_serially; + } + if (txc->had_ios) + kv_ios++; + kv_throttle_costs += txc->cost; + } + return; + case TransContext::STATE_KV_SUBMITTED: + _txc_committed_kv(txc); + // ** fall-thru ** + + case TransContext::STATE_KV_DONE: + throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat); + if (txc->deferred_txn) { + txc->set_state(TransContext::STATE_DEFERRED_QUEUED); + _deferred_queue(txc); + return; + } + txc->set_state(TransContext::STATE_FINISHING); + break; + + case TransContext::STATE_DEFERRED_CLEANUP: + throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat); + txc->set_state(TransContext::STATE_FINISHING); + // ** fall-thru ** + + case TransContext::STATE_FINISHING: + throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat); + _txc_finish(txc); + return; + + default: + derr << __func__ << " unexpected txc " << txc + << " state " << txc->get_state_name() << dendl; + ceph_abort_msg("unexpected txc state"); + return; + } + } +} + +void BlueStore::_txc_finish_io(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << dendl; + + /* + * we need to preserve the order of kv transactions, + * even though aio will complete in any order. 
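+ * a txc whose aio completes early therefore parks in STATE_IO_DONE here and
+ * is processed later, once every earlier txc on this sequencer has reached
+ * STATE_IO_DONE as well.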
+ */ + + OpSequencer *osr = txc->osr.get(); + std::lock_guard l(osr->qlock); + txc->set_state(TransContext::STATE_IO_DONE); + txc->ioc.release_running_aios(); + OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc); + while (p != osr->q.begin()) { + --p; + if (p->get_state() < TransContext::STATE_IO_DONE) { + dout(20) << __func__ << " " << txc << " blocked by " << &*p << " " + << p->get_state_name() << dendl; + return; + } + if (p->get_state() > TransContext::STATE_IO_DONE) { + ++p; + break; + } + } + do { + _txc_state_proc(&*p++); + } while (p != osr->q.end() && + p->get_state() == TransContext::STATE_IO_DONE); + + if (osr->kv_submitted_waiters) { + osr->qcond.notify_all(); + } +} + +void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " txc " << txc + << " onodes " << txc->onodes + << " shared_blobs " << txc->shared_blobs + << dendl; + + // finalize onodes + for (auto o : txc->onodes) { + _record_onode(o, t); + o->flushing_count++; + } + + // objects we modified but didn't affect the onode + auto p = txc->modified_objects.begin(); + while (p != txc->modified_objects.end()) { + if (txc->onodes.count(*p) == 0) { + (*p)->flushing_count++; + ++p; + } else { + // remove dups with onodes list to avoid problems in _txc_finish + p = txc->modified_objects.erase(p); + } + } + + // finalize shared_blobs + for (auto sb : txc->shared_blobs) { + string key; + auto sbid = sb->get_sbid(); + get_shared_blob_key(sbid, &key); + if (sb->persistent->empty()) { + dout(20) << __func__ << " shared_blob 0x" + << std::hex << sbid << std::dec + << " is empty" << dendl; + t->rmkey(PREFIX_SHARED_BLOB, key); + } else { + bufferlist bl; + encode(*(sb->persistent), bl); + dout(20) << __func__ << " shared_blob 0x" + << std::hex << sbid << std::dec + << " is " << bl.length() << " " << *sb << dendl; + t->set(PREFIX_SHARED_BLOB, key, bl); + } + } +} + +void BlueStore::BSPerfTracker::update_from_perfcounters( + PerfCounters &logger) +{ + os_commit_latency_ns.consume_next( + logger.get_tavg_ns( + l_bluestore_commit_lat)); + os_apply_latency_ns.consume_next( + logger.get_tavg_ns( + l_bluestore_commit_lat)); +} + +void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " txc " << txc << std::hex + << " allocated 0x" << txc->allocated + << " released 0x" << txc->released + << std::dec << dendl; + + if (!fm->is_null_manager()) + { + // We have to handle the case where we allocate *and* deallocate the + // same region in this transaction. The freelist doesn't like that. + // (Actually, the only thing that cares is the BitmapFreelistManager + // debug check. But that's important.) 
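+ // When the two sets overlap, work on trimmed copies with the common
+ // region removed from both, so the freelist never sees the same extent
+ // allocated and released within one transaction.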
+ interval_set<uint64_t> tmp_allocated, tmp_released; + interval_set<uint64_t> *pallocated = &txc->allocated; + interval_set<uint64_t> *preleased = &txc->released; + if (!txc->allocated.empty() && !txc->released.empty()) { + interval_set<uint64_t> overlap; + overlap.intersection_of(txc->allocated, txc->released); + if (!overlap.empty()) { + tmp_allocated = txc->allocated; + tmp_allocated.subtract(overlap); + tmp_released = txc->released; + tmp_released.subtract(overlap); + dout(20) << __func__ << " overlap 0x" << std::hex << overlap + << ", new allocated 0x" << tmp_allocated + << " released 0x" << tmp_released << std::dec + << dendl; + pallocated = &tmp_allocated; + preleased = &tmp_released; + } + } + + // update freelist with non-overlap sets + for (interval_set<uint64_t>::iterator p = pallocated->begin(); + p != pallocated->end(); + ++p) { + fm->allocate(p.get_start(), p.get_len(), t); + } + for (interval_set<uint64_t>::iterator p = preleased->begin(); + p != preleased->end(); + ++p) { + dout(20) << __func__ << " release 0x" << std::hex << p.get_start() + << "~" << p.get_len() << std::dec << dendl; + fm->release(p.get_start(), p.get_len(), t); + } + } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + for (auto& i : txc->old_zone_offset_refs) { + dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second + << " offset 0x" << i.second << std::dec + << " -> " << i.first.first->oid << dendl; + string key; + get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key); + txc->t->rmkey(PREFIX_ZONED_CL_INFO, key); + } + for (auto& i : txc->new_zone_offset_refs) { + // (zone, offset) -> oid + dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second + << " offset 0x" << i.second << std::dec + << " -> " << i.first.first->oid << dendl; + string key; + get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key); + bufferlist v; + txc->t->set(PREFIX_ZONED_CL_INFO, key, v); + } + } +#endif + + _txc_update_store_statfs(txc); +} + +void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction) +{ + ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED); + { +#if defined(WITH_LTTNG) + auto start = mono_clock::now(); +#endif + +#ifdef WITH_BLKIN + if (txc->trace) { + txc->trace.event("db async submit"); + } +#endif + + int r = cct->_conf->bluestore_debug_omit_kv_commit ? 
0 : db->submit_transaction(txc->t); + ceph_assert(r == 0); + txc->set_state(TransContext::STATE_KV_SUBMITTED); + if (txc->osr->kv_submitted_waiters) { + std::lock_guard l(txc->osr->qlock); + txc->osr->qcond.notify_all(); + } + +#if defined(WITH_LTTNG) + if (txc->tracing) { + tracepoint( + bluestore, + transaction_kv_submit_latency, + txc->osr->get_sequencer_id(), + txc->seq, + sync_submit_transaction, + ceph::to_seconds<double>(mono_clock::now() - start)); + } +#endif + } + + for (auto ls : { &txc->onodes, &txc->modified_objects }) { + for (auto& o : *ls) { + dout(20) << __func__ << " onode " << o << " had " << o->flushing_count + << dendl; + if (--o->flushing_count == 0 && o->waiting_count.load()) { + std::lock_guard l(o->flush_lock); + o->flush_cond.notify_all(); + } + } + } +} + +void BlueStore::_txc_committed_kv(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << dendl; + throttle.complete_kv(*txc); + { + std::lock_guard l(txc->osr->qlock); + txc->set_state(TransContext::STATE_KV_DONE); + if (txc->ch->commit_queue) { + txc->ch->commit_queue->queue(txc->oncommits); + } else { + finisher.queue(txc->oncommits); + } + } + throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat); + log_latency_fn( + __func__, + l_bluestore_commit_lat, + mono_clock::now() - txc->start, + cct->_conf->bluestore_log_op_age, + [&](auto lat) { + return ", txc = " + stringify(txc); + } + ); +} + +void BlueStore::_txc_finish(TransContext *txc) +{ + dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; + ceph_assert(txc->get_state() == TransContext::STATE_FINISHING); + + for (auto& sb : txc->shared_blobs_written) { + sb->finish_write(txc->seq); + } + txc->shared_blobs_written.clear(); + + while (!txc->removed_collections.empty()) { + _queue_reap_collection(txc->removed_collections.front()); + txc->removed_collections.pop_front(); + } + + OpSequencerRef osr = txc->osr; + bool empty = false; + bool submit_deferred = false; + OpSequencer::q_list_t releasing_txc; + { + std::lock_guard l(osr->qlock); + txc->set_state(TransContext::STATE_DONE); + bool notify = false; + while (!osr->q.empty()) { + TransContext *txc = &osr->q.front(); + dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() + << dendl; + if (txc->get_state() != TransContext::STATE_DONE) { + if (txc->get_state() == TransContext::STATE_PREPARE && + deferred_aggressive) { + // for _osr_drain_preceding() + notify = true; + } + if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED && + osr->q.size() > g_conf()->bluestore_max_deferred_txc) { + submit_deferred = true; + } + break; + } + + osr->q.pop_front(); + releasing_txc.push_back(*txc); + } + + if (osr->q.empty()) { + dout(20) << __func__ << " osr " << osr << " q now empty" << dendl; + empty = true; + } + + // only drain()/drain_preceding() need wakeup, + // other cases use kv_submitted_waiters + if (notify || empty) { + osr->qcond.notify_all(); + } + } + + while (!releasing_txc.empty()) { + // release to allocator only after all preceding txc's have also + // finished any deferred writes that potentially land in these + // blocks + auto txc = &releasing_txc.front(); + _txc_release_alloc(txc); + releasing_txc.pop_front(); + throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat); + throttle.complete(*txc); + delete txc; + } + + if (submit_deferred) { + // we're pinning memory; flush! we could be more fine-grained here but + // i'm not sure it's worth the bother. 
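+ // submit all pending deferred batches so their pinned buffers can drain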
+ deferred_try_submit(); + } + + if (empty && osr->zombie) { + std::lock_guard l(zombie_osr_lock); + if (zombie_osr_set.erase(osr->cid)) { + dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; + } else { + dout(10) << __func__ << " empty zombie osr " << osr << " already reaped" + << dendl; + } + } +} + +void BlueStore::_txc_release_alloc(TransContext *txc) +{ + bool discard_queued = false; + // it's expected we're called with lazy_release_lock already taken! + if (unlikely(cct->_conf->bluestore_debug_no_reuse_blocks)) { + goto out; + } + discard_queued = bdev->try_discard(txc->released); + // if async discard succeeded, will do alloc->release when discard callback + // else we should release here + if (!discard_queued) { + dout(10) << __func__ << "(sync) " << txc << " " << std::hex + << txc->released << std::dec << dendl; + alloc->release(txc->released); + } + +out: + txc->allocated.clear(); + txc->released.clear(); +} + +void BlueStore::_osr_attach(Collection *c) +{ + // note: caller has coll_lock + auto q = coll_map.find(c->cid); + if (q != coll_map.end()) { + c->osr = q->second->osr; + ldout(cct, 10) << __func__ << " " << c->cid + << " reusing osr " << c->osr << " from existing coll " + << q->second << dendl; + } else { + std::lock_guard l(zombie_osr_lock); + auto p = zombie_osr_set.find(c->cid); + if (p == zombie_osr_set.end()) { + c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid); + ldout(cct, 10) << __func__ << " " << c->cid + << " fresh osr " << c->osr << dendl; + } else { + c->osr = p->second; + zombie_osr_set.erase(p); + ldout(cct, 10) << __func__ << " " << c->cid + << " resurrecting zombie osr " << c->osr << dendl; + c->osr->zombie = false; + } + } +} + +void BlueStore::_osr_register_zombie(OpSequencer *osr) +{ + std::lock_guard l(zombie_osr_lock); + dout(10) << __func__ << " " << osr << " " << osr->cid << dendl; + osr->zombie = true; + auto i = zombie_osr_set.emplace(osr->cid, osr); + // this is either a new insertion or the same osr is already there + ceph_assert(i.second || i.first->second == osr); +} + +void BlueStore::_osr_drain_preceding(TransContext *txc) +{ + OpSequencer *osr = txc->osr.get(); + dout(10) << __func__ << " " << txc << " osr " << osr << dendl; + ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? + { + // submit anything pending + osr->deferred_lock.lock(); + if (osr->deferred_pending && !osr->deferred_running) { + _deferred_submit_unlock(osr); + } else { + osr->deferred_lock.unlock(); + } + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + if (!kv_sync_in_progress) { + kv_sync_in_progress = true; + kv_cond.notify_one(); + } + } + osr->drain_preceding(txc); + --deferred_aggressive; + dout(10) << __func__ << " " << osr << " done" << dendl; +} + +void BlueStore::_osr_drain(OpSequencer *osr) +{ + dout(10) << __func__ << " " << osr << dendl; + ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? 
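+ // while deferred_aggressive is set, deferred completions proactively wake
+ // the kv sync thread, so the drain below cannot stall waiting on them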
+ { + // submit anything pending + osr->deferred_lock.lock(); + if (osr->deferred_pending && !osr->deferred_running) { + _deferred_submit_unlock(osr); + } else { + osr->deferred_lock.unlock(); + } + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + if (!kv_sync_in_progress) { + kv_sync_in_progress = true; + kv_cond.notify_one(); + } + } + osr->drain(); + --deferred_aggressive; + dout(10) << __func__ << " " << osr << " done" << dendl; +} + +void BlueStore::_osr_drain_all() +{ + dout(10) << __func__ << dendl; + + set<OpSequencerRef> s; + vector<OpSequencerRef> zombies; + { + std::shared_lock l(coll_lock); + for (auto& i : coll_map) { + s.insert(i.second->osr); + } + } + { + std::lock_guard l(zombie_osr_lock); + for (auto& i : zombie_osr_set) { + s.insert(i.second); + zombies.push_back(i.second); + } + } + dout(20) << __func__ << " osr_set " << s << dendl; + + ++deferred_aggressive; + { + // submit anything pending + deferred_try_submit(); + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } + { + std::lock_guard l(kv_finalize_lock); + kv_finalize_cond.notify_one(); + } + for (auto osr : s) { + dout(20) << __func__ << " drain " << osr << dendl; + osr->drain(); + } + --deferred_aggressive; + + { + std::lock_guard l(zombie_osr_lock); + for (auto& osr : zombies) { + if (zombie_osr_set.erase(osr->cid)) { + dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; + ceph_assert(osr->q.empty()); + } else if (osr->zombie) { + dout(10) << __func__ << " empty zombie osr " << osr + << " already reaped" << dendl; + ceph_assert(osr->q.empty()); + } else { + dout(10) << __func__ << " empty zombie osr " << osr + << " resurrected" << dendl; + } + } + } + + dout(10) << __func__ << " done" << dendl; +} + + +void BlueStore::_kv_start() +{ + dout(10) << __func__ << dendl; + + finisher.start(); + kv_sync_thread.create("bstore_kv_sync"); + kv_finalize_thread.create("bstore_kv_final"); +} + +void BlueStore::_kv_stop() +{ + dout(10) << __func__ << dendl; + { + std::unique_lock l{kv_lock}; + while (!kv_sync_started) { + kv_cond.wait(l); + } + kv_stop = true; + kv_cond.notify_all(); + } + { + std::unique_lock l{kv_finalize_lock}; + while (!kv_finalize_started) { + kv_finalize_cond.wait(l); + } + kv_finalize_stop = true; + kv_finalize_cond.notify_all(); + } + kv_sync_thread.join(); + kv_finalize_thread.join(); + ceph_assert(removed_collections.empty()); + { + std::lock_guard l(kv_lock); + kv_stop = false; + } + { + std::lock_guard l(kv_finalize_lock); + kv_finalize_stop = false; + } + dout(10) << __func__ << " stopping finishers" << dendl; + finisher.wait_for_empty(); + finisher.stop(); + dout(10) << __func__ << " stopped" << dendl; +} + +void BlueStore::_kv_sync_thread() +{ + dout(10) << __func__ << " start" << dendl; + deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable + std::unique_lock l{kv_lock}; + ceph_assert(!kv_sync_started); + kv_sync_started = true; + kv_cond.notify_all(); + + auto t0 = mono_clock::now(); + timespan twait = ceph::make_timespan(0); + size_t kv_submitted = 0; + + while (true) { + auto period = cct->_conf->bluestore_kv_sync_util_logging_s; + auto observation_period = + ceph::make_timespan(period); + auto elapsed = mono_clock::now() - t0; + if (period && elapsed >= observation_period) { + dout(5) << __func__ << " utilization: idle " + << twait << " of " << elapsed + << ", submitted: " << kv_submitted + <<dendl; + t0 = mono_clock::now(); + twait = 
ceph::make_timespan(0); + kv_submitted = 0; + } + ceph_assert(kv_committing.empty()); + if (kv_queue.empty() && + ((deferred_done_queue.empty() && deferred_stable_queue.empty()) || + !deferred_aggressive)) { + if (kv_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + auto t = mono_clock::now(); + kv_sync_in_progress = false; + kv_cond.wait(l); + twait += mono_clock::now() - t; + + dout(20) << __func__ << " wake" << dendl; + } else { + deque<TransContext*> kv_submitting; + deque<DeferredBatch*> deferred_done, deferred_stable; + uint64_t aios = 0, costs = 0; + + dout(20) << __func__ << " committing " << kv_queue.size() + << " submitting " << kv_queue_unsubmitted.size() + << " deferred done " << deferred_done_queue.size() + << " stable " << deferred_stable_queue.size() + << dendl; + kv_committing.swap(kv_queue); + kv_submitting.swap(kv_queue_unsubmitted); + deferred_done.swap(deferred_done_queue); + deferred_stable.swap(deferred_stable_queue); + aios = kv_ios; + costs = kv_throttle_costs; + kv_ios = 0; + kv_throttle_costs = 0; + l.unlock(); + + dout(30) << __func__ << " committing " << kv_committing << dendl; + dout(30) << __func__ << " submitting " << kv_submitting << dendl; + dout(30) << __func__ << " deferred_done " << deferred_done << dendl; + dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl; + + auto start = mono_clock::now(); + + bool force_flush = false; + // if bluefs is sharing the same device as data (only), then we + // can rely on the bluefs commit to flush the device and make + // deferred aios stable. that means that if we do have done deferred + // txcs AND we are not on a single device, we need to force a flush. + if (bluefs && bluefs_layout.single_shared_device()) { + if (aios) { + force_flush = true; + } else if (kv_committing.empty() && deferred_stable.empty()) { + force_flush = true; // there's nothing else to commit! + } else if (deferred_aggressive) { + force_flush = true; + } + } else { + if (aios || !deferred_done.empty()) { + force_flush = true; + } else { + dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl; + } + } + + if (force_flush) { + dout(20) << __func__ << " num_aios=" << aios + << " force_flush=" << (int)force_flush + << ", flushing, deferred done->stable" << dendl; + // flush/barrier on block device + bdev->flush(); + + // if we flush then deferred done are now deferred stable + if (deferred_stable.empty()) { + deferred_stable.swap(deferred_done); + } else { + deferred_stable.insert(deferred_stable.end(), deferred_done.begin(), + deferred_done.end()); + deferred_done.clear(); + } + } + auto after_flush = mono_clock::now(); + + // we will use one final transaction to force a sync + KeyValueDB::Transaction synct = db->get_transaction(); + + // increase {nid,blobid}_max? note that this covers both the + // case where we are approaching the max and the case we passed + // it. in either case, we increase the max in the earlier txn + // we submit. + uint64_t new_nid_max = 0, new_blobid_max = 0; + if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) { + KeyValueDB::Transaction t = + kv_submitting.empty() ? synct : kv_submitting.front()->t; + new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc; + bufferlist bl; + encode(new_nid_max, bl); + t->set(PREFIX_SUPER, "nid_max", bl); + dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl; + } + if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) { + KeyValueDB::Transaction t = + kv_submitting.empty() ? 
synct : kv_submitting.front()->t; + new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc; + bufferlist bl; + encode(new_blobid_max, bl); + t->set(PREFIX_SUPER, "blobid_max", bl); + dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl; + } + + for (auto txc : kv_committing) { + throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat); + if (txc->get_state() == TransContext::STATE_KV_QUEUED) { + ++kv_submitted; + _txc_apply_kv(txc, false); + --txc->osr->kv_committing_serially; + } else { + ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED); + } + if (txc->had_ios) { + --txc->osr->txc_with_unstable_io; + } + } + + // release throttle *before* we commit. this allows new ops + // to be prepared and enter pipeline while we are waiting on + // the kv commit sync/flush. then hopefully on the next + // iteration there will already be ops awake. otherwise, we + // end up going to sleep, and then wake up when the very first + // transaction is ready for commit. + throttle.release_kv_throttle(costs); + + // cleanup sync deferred keys + for (auto b : deferred_stable) { + for (auto& txc : b->txcs) { + bluestore_deferred_transaction_t& wt = *txc.deferred_txn; + ceph_assert(wt.released.empty()); // only kraken did this + string key; + get_deferred_key(wt.seq, &key); + synct->rm_single_key(PREFIX_DEFERRED, key); + } + } + +#if defined(WITH_LTTNG) + auto sync_start = mono_clock::now(); +#endif + // submit synct synchronously (block and wait for it to commit) + int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct); + ceph_assert(r == 0); + +#ifdef WITH_BLKIN + for (auto txc : kv_committing) { + if (txc->trace) { + txc->trace.event("db sync submit"); + txc->trace.keyval("kv_committing size", kv_committing.size()); + } + } +#endif + + int committing_size = kv_committing.size(); + int deferred_size = deferred_stable.size(); + +#if defined(WITH_LTTNG) + double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start); + for (auto txc: kv_committing) { + if (txc->tracing) { + tracepoint( + bluestore, + transaction_kv_sync_latency, + txc->osr->get_sequencer_id(), + txc->seq, + kv_committing.size(), + deferred_done.size(), + deferred_stable.size(), + sync_latency); + } + } +#endif + + { + std::unique_lock m{kv_finalize_lock}; + if (kv_committing_to_finalize.empty()) { + kv_committing_to_finalize.swap(kv_committing); + } else { + kv_committing_to_finalize.insert( + kv_committing_to_finalize.end(), + kv_committing.begin(), + kv_committing.end()); + kv_committing.clear(); + } + if (deferred_stable_to_finalize.empty()) { + deferred_stable_to_finalize.swap(deferred_stable); + } else { + deferred_stable_to_finalize.insert( + deferred_stable_to_finalize.end(), + deferred_stable.begin(), + deferred_stable.end()); + deferred_stable.clear(); + } + if (!kv_finalize_in_progress) { + kv_finalize_in_progress = true; + kv_finalize_cond.notify_one(); + } + } + + if (new_nid_max) { + nid_max = new_nid_max; + dout(10) << __func__ << " nid_max now " << nid_max << dendl; + } + if (new_blobid_max) { + blobid_max = new_blobid_max; + dout(10) << __func__ << " blobid_max now " << blobid_max << dendl; + } + + { + auto finish = mono_clock::now(); + ceph::timespan dur_flush = after_flush - start; + ceph::timespan dur_kv = finish - after_flush; + ceph::timespan dur = finish - start; + dout(20) << __func__ << " committed " << committing_size + << " cleaned " << deferred_size + << " in " << dur + << " (" << dur_flush << " flush + 
" << dur_kv << " kv commit)" + << dendl; + log_latency("kv_flush", + l_bluestore_kv_flush_lat, + dur_flush, + cct->_conf->bluestore_log_op_age); + log_latency("kv_commit", + l_bluestore_kv_commit_lat, + dur_kv, + cct->_conf->bluestore_log_op_age); + log_latency("kv_sync", + l_bluestore_kv_sync_lat, + dur, + cct->_conf->bluestore_log_op_age); + } + + l.lock(); + // previously deferred "done" are now "stable" by virtue of this + // commit cycle. + deferred_stable_queue.swap(deferred_done); + } + } + dout(10) << __func__ << " finish" << dendl; + kv_sync_started = false; +} + +void BlueStore::_kv_finalize_thread() +{ + deque<TransContext*> kv_committed; + deque<DeferredBatch*> deferred_stable; + dout(10) << __func__ << " start" << dendl; + std::unique_lock l(kv_finalize_lock); + ceph_assert(!kv_finalize_started); + kv_finalize_started = true; + kv_finalize_cond.notify_all(); + while (true) { + ceph_assert(kv_committed.empty()); + ceph_assert(deferred_stable.empty()); + if (kv_committing_to_finalize.empty() && + deferred_stable_to_finalize.empty()) { + if (kv_finalize_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + kv_finalize_in_progress = false; + kv_finalize_cond.wait(l); + dout(20) << __func__ << " wake" << dendl; + } else { + kv_committed.swap(kv_committing_to_finalize); + deferred_stable.swap(deferred_stable_to_finalize); + l.unlock(); + dout(20) << __func__ << " kv_committed " << kv_committed << dendl; + dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl; + + auto start = mono_clock::now(); + + while (!kv_committed.empty()) { + TransContext *txc = kv_committed.front(); + ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED); + _txc_state_proc(txc); + kv_committed.pop_front(); + } + + for (auto b : deferred_stable) { + auto p = b->txcs.begin(); + while (p != b->txcs.end()) { + TransContext *txc = &*p; + p = b->txcs.erase(p); // unlink here because + _txc_state_proc(txc); // this may destroy txc + } + delete b; + } + deferred_stable.clear(); + + if (!deferred_aggressive) { + if (deferred_queue_size >= deferred_batch_ops.load() || + throttle.should_submit_deferred()) { + deferred_try_submit(); + } + } + + // this is as good a place as any ... 
+ _reap_collections(); + + logger->set(l_bluestore_fragmentation, + (uint64_t)(alloc->get_fragmentation() * 1000)); + + log_latency("kv_final", + l_bluestore_kv_final_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + + l.lock(); + } + } + dout(10) << __func__ << " finish" << dendl; + kv_finalize_started = false; +} + +#ifdef HAVE_LIBZBD +void BlueStore::_zoned_cleaner_start() +{ + dout(10) << __func__ << dendl; + zoned_cleaner_thread.create("bstore_zcleaner"); +} + +void BlueStore::_zoned_cleaner_stop() +{ + dout(10) << __func__ << dendl; + { + std::unique_lock l{zoned_cleaner_lock}; + while (!zoned_cleaner_started) { + zoned_cleaner_cond.wait(l); + } + zoned_cleaner_stop = true; + zoned_cleaner_cond.notify_all(); + } + zoned_cleaner_thread.join(); + { + std::lock_guard l{zoned_cleaner_lock}; + zoned_cleaner_stop = false; + } + dout(10) << __func__ << " done" << dendl; +} + +void BlueStore::_zoned_cleaner_thread() +{ + dout(10) << __func__ << " start" << dendl; + std::unique_lock l{zoned_cleaner_lock}; + ceph_assert(!zoned_cleaner_started); + zoned_cleaner_started = true; + zoned_cleaner_cond.notify_all(); + auto a = dynamic_cast<ZonedAllocator*>(alloc); + ceph_assert(a); + auto f = dynamic_cast<ZonedFreelistManager*>(fm); + ceph_assert(f); + while (true) { + // thresholds to trigger cleaning + // FIXME + float min_score = .05; // score: bytes saved / bytes moved + uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning + auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved); + if (zone_to_clean < 0) { + if (zoned_cleaner_stop) { + break; + } + auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval); + dout(20) << __func__ << " sleep for " << period << dendl; + zoned_cleaner_cond.wait_for(l, period); + dout(20) << __func__ << " wake" << dendl; + } else { + l.unlock(); + a->set_cleaning_zone(zone_to_clean); + _zoned_clean_zone(zone_to_clean, a, f); + a->clear_cleaning_zone(zone_to_clean); + l.lock(); + } + } + dout(10) << __func__ << " finish" << dendl; + zoned_cleaner_started = false; +} + +void BlueStore::_zoned_clean_zone( + uint64_t zone, + ZonedAllocator *a, + ZonedFreelistManager *f + ) +{ + dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl; + + KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO); + std::string zone_start; + get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start); + for (it->lower_bound(zone_start); it->valid(); it->next()) { + uint32_t z; + uint64_t offset; + ghobject_t oid; + string k = it->key(); + int r = get_key_zone_offset_object(k, &z, &offset, &oid); + if (r < 0) { + derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k) + << dendl; + continue; + } + if (zone != z) { + dout(10) << __func__ << " reached end of zone refs" << dendl; + break; + } + dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset + << std::dec << " " << oid << dendl; + _clean_some(oid, zone); + } + + if (a->get_live_bytes(zone) > 0) { + derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone) + << " live bytes" << std::dec << dendl; + // should we do something else here to avoid a live-lock in the event of a problem? 
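+ // for now just bail out; the zone is left as-is and remains a candidate
+ // for a later cleaning pass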
+ return; + } + + // make sure transactions flush/drain/commit (and data is all rewritten + // safely elsewhere) before we blow away the cleaned zone + _osr_drain_all(); + + // reset the device zone + dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl; + bdev->reset_zone(zone); + + // record that we can now write there + f->mark_zone_to_clean_free(zone, db); + bdev->flush(); + + // then allow ourselves to start allocating there + dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec + << dendl; + a->reset_zone(zone); +} + +void BlueStore::_clean_some(ghobject_t oid, uint32_t zone) +{ + dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec + << dendl; + + CollectionRef cref = _get_collection_by_oid(oid); + if (!cref) { + dout(10) << __func__ << " can't find collection for " << oid << dendl; + return; + } + Collection *c = cref.get(); + + // serialize io dispatch vs other transactions + std::lock_guard l(atomic_alloc_and_submit_lock); + std::unique_lock l2(c->lock); + + auto o = c->get_onode(oid, false); + if (!o) { + dout(10) << __func__ << " can't find " << oid << dendl; + return; + } + + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + _dump_onode<30>(cct, *o); + + // NOTE: This is a naive rewrite strategy. If any blobs are + // shared, they will be duplicated for each object that references + // them. That means any cloned/snapshotted objects will explode + // their utilization. This won't matter for RGW workloads, but + // for RBD and CephFS it is completely unacceptable, and it's + // entirely reasonable to have "archival" data workloads on SMR + // for CephFS and (possibly/probably) RBD. + // + // At some point we need to replace this with something more + // sophisticated that ensures that a shared blob gets moved once + // and all referencing objects get updated to point to the new + // location. 
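+ // collect the logical extents of this object that still reference the
+ // zone being cleaned; each such range is re-read and rewritten below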
+ + map<uint32_t, uint32_t> to_move; + for (auto& e : o->extent_map.extent_map) { + bool touches_zone = false; + for (auto& be : e.blob->get_blob().get_extents()) { + if (be.is_valid()) { + uint32_t z = be.offset / zone_size; + if (z == zone) { + touches_zone = true; + break; + } + } + } + if (touches_zone) { + to_move[e.logical_offset] = e.length; + } + } + if (to_move.empty()) { + dout(10) << __func__ << " no references to zone 0x" << std::hex << zone + << std::dec << " from " << oid << dendl; + return; + } + + dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move + << std::dec << dendl; + OpSequencer *osr = c->osr.get(); + TransContext *txc = _txc_create(c, osr, nullptr); + + spg_t pgid; + if (c->cid.is_pg(&pgid)) { + txc->osd_pool_id = pgid.pool(); + } + + for (auto& [offset, length] : to_move) { + bufferlist bl; + int r = _do_read(c, o, offset, length, bl, 0); + ceph_assert(r == (int)length); + + r = _do_write(txc, cref, o, offset, length, bl, 0); + ceph_assert(r >= 0); + } + txc->write_onode(o); + + _txc_write_nodes(txc, txc->t); + _txc_finalize_kv(txc, txc->t); + _txc_state_proc(txc); +} +#endif + +bluestore_deferred_op_t *BlueStore::_get_deferred_op( + TransContext *txc, uint64_t len) +{ + if (!txc->deferred_txn) { + txc->deferred_txn = new bluestore_deferred_transaction_t; + } + txc->deferred_txn->ops.push_back(bluestore_deferred_op_t()); + logger->inc(l_bluestore_issued_deferred_writes); + logger->inc(l_bluestore_issued_deferred_write_bytes, len); + return &txc->deferred_txn->ops.back(); +} + +void BlueStore::_deferred_queue(TransContext *txc) +{ + dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl; + + DeferredBatch *tmp; + txc->osr->deferred_lock.lock(); + { + if (!txc->osr->deferred_pending) { + tmp = new DeferredBatch(cct, txc->osr.get()); + } else { + tmp = txc->osr->deferred_pending; + } + } + + tmp->txcs.push_back(*txc); + bluestore_deferred_transaction_t& wt = *txc->deferred_txn; + for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) { + const auto& op = *opi; + ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE); + bufferlist::const_iterator p = op.data.begin(); + for (auto e : op.extents) { + tmp->prepare_write(cct, wt.seq, e.offset, e.length, p); + } + } + + { + ++deferred_queue_size; + txc->osr->deferred_pending = tmp; + // condition "tmp->txcs.size() == 1" mean deferred_pending was originally empty. + // So we should add osr into deferred_queue. 
+ if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) { + deferred_lock.lock(); + deferred_queue.push_back(*txc->osr); + deferred_lock.unlock(); + } + + if (deferred_aggressive && + !txc->osr->deferred_running) { + _deferred_submit_unlock(txc->osr.get()); + } else { + txc->osr->deferred_lock.unlock(); + } + } + } + +void BlueStore::deferred_try_submit() +{ + dout(20) << __func__ << " " << deferred_queue.size() << " osrs, " + << deferred_queue_size << " txcs" << dendl; + vector<OpSequencerRef> osrs; + + { + std::lock_guard l(deferred_lock); + osrs.reserve(deferred_queue.size()); + for (auto& osr : deferred_queue) { + osrs.push_back(&osr); + } + } + + for (auto& osr : osrs) { + osr->deferred_lock.lock(); + if (osr->deferred_pending) { + if (!osr->deferred_running) { + _deferred_submit_unlock(osr.get()); + } else { + osr->deferred_lock.unlock(); + dout(20) << __func__ << " osr " << osr << " already has running" + << dendl; + } + } else { + osr->deferred_lock.unlock(); + dout(20) << __func__ << " osr " << osr << " has no pending" << dendl; + } + } + + { + std::lock_guard l(deferred_lock); + deferred_last_submitted = ceph_clock_now(); + } +} + +void BlueStore::_deferred_submit_unlock(OpSequencer *osr) +{ + dout(10) << __func__ << " osr " << osr + << " " << osr->deferred_pending->iomap.size() << " ios pending " + << dendl; + ceph_assert(osr->deferred_pending); + ceph_assert(!osr->deferred_running); + + auto b = osr->deferred_pending; + deferred_queue_size -= b->seq_bytes.size(); + ceph_assert(deferred_queue_size >= 0); + + osr->deferred_running = osr->deferred_pending; + osr->deferred_pending = nullptr; + + osr->deferred_lock.unlock(); + + for (auto& txc : b->txcs) { + throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat); + } + uint64_t start = 0, pos = 0; + bufferlist bl; + auto i = b->iomap.begin(); + while (true) { + if (i == b->iomap.end() || i->first != pos) { + if (bl.length()) { + dout(20) << __func__ << " write 0x" << std::hex + << start << "~" << bl.length() + << " crc " << bl.crc32c(-1) << std::dec << dendl; + if (!g_conf()->bluestore_debug_omit_block_device_write) { + logger->inc(l_bluestore_submitted_deferred_writes); + logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length()); + int r = bdev->aio_write(start, bl, &b->ioc, false); + ceph_assert(r == 0); + } + } + if (i == b->iomap.end()) { + break; + } + start = 0; + pos = i->first; + bl.clear(); + } + dout(20) << __func__ << " seq " << i->second.seq << " 0x" + << std::hex << pos << "~" << i->second.bl.length() << std::dec + << dendl; + if (!bl.length()) { + start = pos; + } + pos += i->second.bl.length(); + bl.claim_append(i->second.bl); + ++i; + } + + bdev->aio_submit(&b->ioc); +} + +struct C_DeferredTrySubmit : public Context { + BlueStore *store; + C_DeferredTrySubmit(BlueStore *s) : store(s) {} + void finish(int r) { + store->deferred_try_submit(); + } +}; + +void BlueStore::_deferred_aio_finish(OpSequencer *osr) +{ + dout(10) << __func__ << " osr " << osr << dendl; + ceph_assert(osr->deferred_running); + DeferredBatch *b = osr->deferred_running; + + { + osr->deferred_lock.lock(); + ceph_assert(osr->deferred_running == b); + osr->deferred_running = nullptr; + if (!osr->deferred_pending) { + dout(20) << __func__ << " dequeueing" << dendl; + { + deferred_lock.lock(); + auto q = deferred_queue.iterator_to(*osr); + deferred_queue.erase(q); + deferred_lock.unlock(); + } + osr->deferred_lock.unlock(); + } else { + osr->deferred_lock.unlock(); + if (deferred_aggressive) { + dout(20) << 
__func__ << " queuing async deferred_try_submit" << dendl; + finisher.queue(new C_DeferredTrySubmit(this)); + } else { + dout(20) << __func__ << " leaving queued, more pending" << dendl; + } + } + } + + { + uint64_t costs = 0; + { + for (auto& i : b->txcs) { + TransContext *txc = &i; + throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat); + txc->set_state(TransContext::STATE_DEFERRED_CLEANUP); + costs += txc->cost; + } + } + throttle.release_deferred_throttle(costs); + } + + { + std::lock_guard l(kv_lock); + deferred_done_queue.emplace_back(b); + + // in the normal case, do not bother waking up the kv thread; it will + // catch us on the next commit anyway. + if (deferred_aggressive && !kv_sync_in_progress) { + kv_sync_in_progress = true; + kv_cond.notify_one(); + } + } +} + +int BlueStore::_deferred_replay() +{ + dout(10) << __func__ << " start" << dendl; + int count = 0; + int r = 0; + interval_set<uint64_t> bluefs_extents; + if (bluefs) { + bluefs->foreach_block_extents( + bluefs_layout.shared_bdev, + [&] (uint64_t start, uint32_t len) { + bluefs_extents.insert(start, len); + } + ); + } + CollectionRef ch = _get_collection(coll_t::meta()); + bool fake_ch = false; + if (!ch) { + // hmm, replaying initial mkfs? + ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get()); + fake_ch = true; + } + OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get()); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED); + for (it->lower_bound(string()); it->valid(); it->next(), ++count) { + dout(20) << __func__ << " replay " << pretty_binary_string(it->key()) + << dendl; + bluestore_deferred_transaction_t *deferred_txn = + new bluestore_deferred_transaction_t; + bufferlist bl = it->value(); + auto p = bl.cbegin(); + try { + decode(*deferred_txn, p); + } catch (ceph::buffer::error& e) { + derr << __func__ << " failed to decode deferred txn " + << pretty_binary_string(it->key()) << dendl; + delete deferred_txn; + r = -EIO; + goto out; + } + bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents); + if (has_some) { + TransContext *txc = _txc_create(ch.get(), osr, nullptr); + txc->deferred_txn = deferred_txn; + txc->set_state(TransContext::STATE_KV_DONE); + _txc_state_proc(txc); + } else { + delete deferred_txn; + } + } + out: + dout(20) << __func__ << " draining osr" << dendl; + _osr_register_zombie(osr); + _osr_drain_all(); + if (fake_ch) { + new_coll_map.clear(); + } + dout(10) << __func__ << " completed " << count << " events" << dendl; + return r; +} + +bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn, + interval_set<uint64_t>& bluefs_extents) +{ + bool has_some = false; + dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl; + auto it = deferred_txn->ops.begin(); + while (it != deferred_txn->ops.end()) { + // We process a pair of _data_/_extents_ (here: it->data/it->extents) + // by eliminating _extents_ that belong to bluefs, removing relevant parts of _data_ + // example: + // +------------+---------------+---------------+---------------+ + // | data | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff | + // | extent | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 | + // | in bluefs? 
| no | yes | no |
+ // +------------+---------------+---------------+---------------+
+ // result:
+ // +------------+---------------+---------------+
+ // | data | aaaaaaaabbbbb | ddddeeeeeefff |
+ // | extent | 40000 - 44000 | 58000 - 60000 |
+ // +------------+---------------+---------------+
+ PExtentVector new_extents;
+ ceph::buffer::list new_data;
+ uint32_t data_offset = 0; // this tracks location of extent 'e' inside it->data
+ dout(30) << __func__ << " input extents: " << it->extents << dendl;
+ for (auto& e: it->extents) {
+ interval_set<uint64_t> region;
+ region.insert(e.offset, e.length);
+
+ auto mi = bluefs_extents.lower_bound(e.offset);
+ if (mi != bluefs_extents.begin()) {
+ --mi;
+ if (mi.get_end() <= e.offset) {
+ ++mi;
+ }
+ }
+ while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
+ // The interval_set does not like (asserts) when we erase an interval that does not exist.
+ // Hence we implement (region-mi) by ((region+mi)-mi).
+ region.union_insert(mi.get_start(), mi.get_len());
+ region.erase(mi.get_start(), mi.get_len());
+ ++mi;
+ }
+ // 'region' is now a subset of e, without parts used by bluefs
+ // we trim corresponding parts from it->data (actually constructing new_data / new_extents)
+ for (auto ki = region.begin(); ki != region.end(); ki++) {
+ ceph::buffer::list chunk;
+ // A chunk from it->data; data_offset is an offset where 'e' was located;
+ // 'ki.get_start() - e.offset' is an offset of ki inside 'e'.
+ chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len());
+ new_data.claim_append(chunk);
+ new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len()));
+ }
+ data_offset += e.length;
+ }
+ dout(30) << __func__ << " output extents: " << new_extents << dendl;
+ if (it->data.length() != new_data.length()) {
+ dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl;
+ }
+ if (new_extents.size() == 0) {
+ it = deferred_txn->ops.erase(it);
+ } else {
+ has_some = true;
+ std::swap(it->extents, new_extents);
+ std::swap(it->data, new_data);
+ ++it;
+ }
+ }
+ return has_some;
+}
+
+// ---------------------------
+// transactions
+
+int BlueStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ FUNCTRACE(cct);
+ list<Context *> on_applied, on_commit, on_applied_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &on_applied, &on_commit, &on_applied_sync);
+
+ auto start = mono_clock::now();
+
+ Collection *c = static_cast<Collection*>(ch.get());
+ OpSequencer *osr = c->osr.get();
+ dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
+
+ // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
+ // submission to happen atomically because if I/O submission happens in a
+ // different order than I/O allocation, we end up issuing non-sequential
+ // writes to the drive. This is a temporary solution until ZONE APPEND
+ // support matures in the kernel. 
For more information please see: + // https://www.usenix.org/conference/vault20/presentation/bjorling + if (bdev->is_smr()) { + atomic_alloc_and_submit_lock.lock(); + } + + // prepare + TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr, + &on_commit, op); + + for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) { + txc->bytes += (*p).get_num_bytes(); + _txc_add_transaction(txc, &(*p)); + } + _txc_calc_cost(txc); + + _txc_write_nodes(txc, txc->t); + + // journal deferred items + if (txc->deferred_txn) { + txc->deferred_txn->seq = ++deferred_seq; + bufferlist bl; + encode(*txc->deferred_txn, bl); + string key; + get_deferred_key(txc->deferred_txn->seq, &key); + txc->t->set(PREFIX_DEFERRED, key, bl); + } + + _txc_finalize_kv(txc, txc->t); + +#ifdef WITH_BLKIN + if (txc->trace) { + txc->trace.event("txc encode finished"); + } +#endif + + if (handle) + handle->suspend_tp_timeout(); + + auto tstart = mono_clock::now(); + + if (!throttle.try_start_transaction( + *db, + *txc, + tstart)) { + // ensure we do not block here because of deferred writes + dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive" + << dendl; + ++deferred_aggressive; + deferred_try_submit(); + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + if (!kv_sync_in_progress) { + kv_sync_in_progress = true; + kv_cond.notify_one(); + } + } + throttle.finish_start_transaction(*db, *txc, tstart); + --deferred_aggressive; + } + auto tend = mono_clock::now(); + + if (handle) + handle->reset_tp_timeout(); + + logger->inc(l_bluestore_txc); + + // execute (start) + _txc_state_proc(txc); + + if (bdev->is_smr()) { + atomic_alloc_and_submit_lock.unlock(); + } + + // we're immediately readable (unlike FileStore) + for (auto c : on_applied_sync) { + c->complete(0); + } + if (!on_applied.empty()) { + if (c->commit_queue) { + c->commit_queue->queue(on_applied); + } else { + finisher.queue(on_applied); + } + } + +#ifdef WITH_BLKIN + if (txc->trace) { + txc->trace.event("txc applied"); + } +#endif + + log_latency("submit_transact", + l_bluestore_submit_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age); + log_latency("throttle_transact", + l_bluestore_throttle_lat, + tend - tstart, + cct->_conf->bluestore_log_op_age); + return 0; +} + +void BlueStore::_txc_aio_submit(TransContext *txc) +{ + dout(10) << __func__ << " txc " << txc << dendl; + bdev->aio_submit(&txc->ioc); +} + +void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) +{ + Transaction::iterator i = t->begin(); + + _dump_transaction<30>(cct, t); + + vector<CollectionRef> cvec(i.colls.size()); + unsigned j = 0; + for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end(); + ++p, ++j) { + cvec[j] = _get_collection(*p); + } + + vector<OnodeRef> ovec(i.objects.size()); + + for (int pos = 0; i.have_op(); ++pos) { + Transaction::Op *op = i.decode_op(); + int r = 0; + + // no coll or obj + if (op->op == Transaction::OP_NOP) + continue; + + + // collection operations + CollectionRef &c = cvec[op->cid]; + + // initialize osd_pool_id and do a smoke test that all collections belong + // to the same pool + spg_t pgid; + if (!!c ? 
c->cid.is_pg(&pgid) : false) { + ceph_assert(txc->osd_pool_id == META_POOL_ID || + txc->osd_pool_id == pgid.pool()); + txc->osd_pool_id = pgid.pool(); + } + + switch (op->op) { + case Transaction::OP_RMCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + r = _remove_collection(txc, cid, &c); + if (!r) + continue; + } + break; + + case Transaction::OP_MKCOLL: + { + ceph_assert(!c); + const coll_t &cid = i.get_cid(op->cid); + r = _create_collection(txc, cid, op->split_bits, &c); + if (!r) + continue; + } + break; + + case Transaction::OP_SPLIT_COLLECTION: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_SPLIT_COLLECTION2: + { + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem); + if (!r) + continue; + } + break; + + case Transaction::OP_MERGE_COLLECTION: + { + uint32_t bits = op->split_bits; + r = _merge_collection(txc, &c, cvec[op->dest_cid], bits); + if (!r) + continue; + } + break; + + case Transaction::OP_COLL_HINT: + { + uint32_t type = op->hint; + bufferlist hint; + i.decode_bl(hint); + auto hiter = hint.cbegin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + decode(pg_num, hiter); + decode(num_objs, hiter); + dout(10) << __func__ << " collection hint objects is a no-op, " + << " pg_num " << pg_num << " num_objects " << num_objs + << dendl; + } else { + // Ignore the hint + dout(10) << __func__ << " unknown collection hint " << type << dendl; + } + continue; + } + break; + + case Transaction::OP_COLL_SETATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RMATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RENAME: + ceph_abort_msg("not implemented"); + break; + } + if (r < 0) { + derr << __func__ << " error " << cpp_strerror(r) + << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + _dump_transaction<0>(cct, t); + ceph_abort_msg("unexpected error"); + } + + // these operations implicity create the object + bool create = false; + if (op->op == Transaction::OP_TOUCH || + op->op == Transaction::OP_CREATE || + op->op == Transaction::OP_WRITE || + op->op == Transaction::OP_ZERO) { + create = true; + } + + // object operations + std::unique_lock l(c->lock); + OnodeRef &o = ovec[op->oid]; + if (!o) { + ghobject_t oid = i.get_oid(op->oid); + o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE); + } + if (!create && (!o || !o->exists)) { + dout(10) << __func__ << " op " << op->op << " got ENOENT on " + << i.get_oid(op->oid) << dendl; + r = -ENOENT; + goto endop; + } + + switch (op->op) { + case Transaction::OP_CREATE: + case Transaction::OP_TOUCH: + r = _touch(txc, c, o); + break; + + case Transaction::OP_WRITE: + { + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(txc, c, o, off, len, bl, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(txc, c, o, off, len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + uint64_t off = op->off; + r = _truncate(txc, c, o, off); + } + break; + + case Transaction::OP_REMOVE: + { + r = _remove(txc, c, o); + } + break; + + case Transaction::OP_SETATTR: + { + string name = i.decode_string(); + bufferptr bp; + i.decode_bp(bp); + r = _setattr(txc, c, o, name, bp); + } + break; + + 
case Transaction::OP_SETATTRS: + { + map<string, bufferptr> aset; + i.decode_attrset(aset); + r = _setattrs(txc, c, o, aset); + } + break; + + case Transaction::OP_RMATTR: + { + string name = i.decode_string(); + r = _rmattr(txc, c, o, name); + } + break; + + case Transaction::OP_RMATTRS: + { + r = _rmattrs(txc, c, o); + } + break; + + case Transaction::OP_CLONE: + { + OnodeRef& no = ovec[op->dest_oid]; + if (!no) { + const ghobject_t& noid = i.get_oid(op->dest_oid); + no = c->get_onode(noid, true); + } + r = _clone(txc, c, o, no); + } + break; + + case Transaction::OP_CLONERANGE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_CLONERANGE2: + { + OnodeRef& no = ovec[op->dest_oid]; + if (!no) { + const ghobject_t& noid = i.get_oid(op->dest_oid); + no = c->get_onode(noid, true); + } + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + r = _clone_range(txc, c, o, no, srcoff, len, dstoff); + } + break; + + case Transaction::OP_COLL_ADD: + ceph_abort_msg("not implemented"); + break; + + case Transaction::OP_COLL_REMOVE: + ceph_abort_msg("not implemented"); + break; + + case Transaction::OP_COLL_MOVE: + ceph_abort_msg("deprecated"); + break; + + case Transaction::OP_COLL_MOVE_RENAME: + case Transaction::OP_TRY_RENAME: + { + ceph_assert(op->cid == op->dest_cid); + const ghobject_t& noid = i.get_oid(op->dest_oid); + OnodeRef& no = ovec[op->dest_oid]; + if (!no) { + no = c->get_onode(noid, false); + } + r = _rename(txc, c, o, no, noid); + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + r = _omap_clear(txc, c, o); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + bufferlist aset_bl; + i.decode_attrset_bl(&aset_bl); + r = _omap_setkeys(txc, c, o, aset_bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + bufferlist keys_bl; + i.decode_keyset_bl(&keys_bl); + r = _omap_rmkeys(txc, c, o, keys_bl); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkey_range(txc, c, o, first, last); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(txc, c, o, bl); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + r = _set_alloc_hint(txc, c, o, + op->expected_object_size, + op->expected_write_size, + op->hint); + } + break; + + default: + derr << __func__ << " bad op " << op->op << dendl; + ceph_abort(); + } + + endop: + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD || + op->op == Transaction::OP_SETATTR || + op->op == Transaction::OP_SETATTRS || + op->op == Transaction::OP_RMATTR || + op->op == Transaction::OP_OMAP_SETKEYS || + op->op == Transaction::OP_OMAP_RMKEYS || + op->op == Transaction::OP_OMAP_RMKEYRANGE || + op->op == Transaction::OP_OMAP_SETHEADER)) + // -ENOENT is usually okay + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. 
+ msg = "ENOSPC from bluestore, misconfigured cluster"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + derr << __func__ << " error " << cpp_strerror(r) + << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" + << dendl; + derr << msg << dendl; + _dump_transaction<0>(cct, t); + ceph_abort_msg("unexpected error"); + } + } + } +} + + + +// ----------------- +// write operations + +int BlueStore::_touch(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + _assign_nid(txc, o); + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +void BlueStore::_pad_zeros( + bufferlist *bl, uint64_t *offset, + uint64_t chunk_size) +{ + auto length = bl->length(); + dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length + << " chunk_size 0x" << chunk_size << std::dec << dendl; + dout(40) << "before:\n"; + bl->hexdump(*_dout); + *_dout << dendl; + // front + size_t front_pad = *offset % chunk_size; + size_t back_pad = 0; + size_t pad_count = 0; + if (front_pad) { + size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length); + bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size); + z.zero(0, front_pad, false); + pad_count += front_pad; + bl->begin().copy(front_copy, z.c_str() + front_pad); + if (front_copy + front_pad < chunk_size) { + back_pad = chunk_size - (length + front_pad); + z.zero(front_pad + length, back_pad, false); + pad_count += back_pad; + } + bufferlist old, t; + old.swap(*bl); + t.substr_of(old, front_copy, length - front_copy); + bl->append(z); + bl->claim_append(t); + *offset -= front_pad; + length += pad_count; + } + + // back + uint64_t end = *offset + length; + unsigned back_copy = end % chunk_size; + if (back_copy) { + ceph_assert(back_pad == 0); + back_pad = chunk_size - back_copy; + ceph_assert(back_copy <= length); + bufferptr tail(chunk_size); + bl->begin(length - back_copy).copy(back_copy, tail.c_str()); + tail.zero(back_copy, back_pad, false); + bufferlist old; + old.swap(*bl); + bl->substr_of(old, 0, length - back_copy); + bl->append(tail); + length += back_pad; + pad_count += back_pad; + } + dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x" + << back_pad << " on front/back, now 0x" << *offset << "~" + << length << std::dec << dendl; + dout(40) << "after:\n"; + bl->hexdump(*_dout); + *_dout << dendl; + if (pad_count) + logger->inc(l_bluestore_write_pad_bytes, pad_count); + ceph_assert(bl->length() == length); +} + +void BlueStore::_do_write_small( + TransContext *txc, + CollectionRef &c, + OnodeRef& o, + uint64_t offset, uint64_t length, + bufferlist::iterator& blp, + WriteContext *wctx) +{ + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + ceph_assert(length < min_alloc_size); + + uint64_t end_offs = offset + length; + + logger->inc(l_bluestore_write_small); + logger->inc(l_bluestore_write_small_bytes, length); + + bufferlist bl; + blp.copy(length, bl); + + auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); + auto min_off = offset >= max_bsize ? 
offset - max_bsize : 0; + uint32_t alloc_len = min_alloc_size; + auto offset0 = p2align<uint64_t>(offset, alloc_len); + + bool any_change; + + // search suitable extent in both forward and reverse direction in + // [offset - target_max_blob_size, offset + target_max_blob_size] range + // then check if blob can be reused via can_reuse_blob func or apply + // direct/deferred write (the latter for extents including or higher + // than 'offset' only). + o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off); + +#ifdef HAVE_LIBZBD + // On zoned devices, the first goal is to support non-overwrite workloads, + // such as RGW, with large, aligned objects. Therefore, for user writes + // _do_write_small should not trigger. OSDs, however, write and update a tiny + // amount of metadata, such as OSD maps, to disk. For those cases, we + // temporarily just pad them to min_alloc_size and write them to a new place + // on every update. + if (bdev->is_smr()) { + uint64_t b_off = p2phase<uint64_t>(offset, alloc_len); + uint64_t b_off0 = b_off; + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + + // Zero detection -- small block + if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) { + BlobRef b = c->new_blob(); + _pad_zeros(&bl, &b_off0, min_alloc_size); + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true); + } else { // if (bl.is_zero()) + dout(20) << __func__ << " skip small zero block " << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + logger->inc(l_bluestore_write_small_skipped); + logger->inc(l_bluestore_write_small_skipped_bytes, length); + } + + return; + } +#endif + + // Look for an existing mutable blob we can use. + auto begin = o->extent_map.extent_map.begin(); + auto end = o->extent_map.extent_map.end(); + auto ep = o->extent_map.seek_lextent(offset); + if (ep != begin) { + --ep; + if (ep->blob_end() <= offset) { + ++ep; + } + } + auto prev_ep = end; + if (ep != begin) { + prev_ep = ep; + --prev_ep; + } + + boost::container::flat_set<const bluestore_blob_t*> inspected_blobs; + // We don't want to have more blobs than min alloc units fit + // into 2 max blobs + size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1; + bool above_blob_threshold = false; + + inspected_blobs.reserve(blob_threshold); + + uint64_t max_off = 0; + auto start_ep = ep; + auto end_ep = ep; // exclusively + do { + any_change = false; + + if (ep != end && ep->logical_offset < offset + max_bsize) { + BlobRef b = ep->blob; + if (!above_blob_threshold) { + inspected_blobs.insert(&b->get_blob()); + above_blob_threshold = inspected_blobs.size() >= blob_threshold; + } + max_off = ep->logical_end(); + auto bstart = ep->blob_start(); + + dout(20) << __func__ << " considering " << *b + << " bstart 0x" << std::hex << bstart << std::dec << dendl; + if (bstart >= end_offs) { + dout(20) << __func__ << " ignoring distant " << *b << dendl; + } else if (!b->get_blob().is_mutable()) { + dout(20) << __func__ << " ignoring immutable " << *b << dendl; + } else if (ep->logical_offset % min_alloc_size != + ep->blob_offset % min_alloc_size) { + dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl; + } else { + uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); + // can we pad our head/tail out with zeros? 
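+ // p2phase(x, a) is the distance from x back to the previous a-aligned
+ // boundary (i.e. x % a); p2nphase(x, a) is the distance forward to the
+ // next boundary, or 0 if x is already aligned. Illustrative values (not
+ // taken from the code): with chunk_size 0x1000, offset 0x1a00 yields
+ // head_pad 0xa00 and end_offs 0x2300 yields tail_pad 0xd00, so the
+ // padded range becomes the chunk-aligned 0x1000~0x2000.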
+ uint64_t head_pad, tail_pad; + head_pad = p2phase(offset, chunk_size); + tail_pad = p2nphase(end_offs, chunk_size); + if (head_pad || tail_pad) { + o->extent_map.fault_range(db, offset - head_pad, + end_offs - offset + head_pad + tail_pad); + } + if (head_pad && + o->extent_map.has_any_lextents(offset - head_pad, head_pad)) { + head_pad = 0; + } + if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) { + tail_pad = 0; + } + + uint64_t b_off = offset - head_pad - bstart; + uint64_t b_len = length + head_pad + tail_pad; + + // direct write into unused blocks of an existing mutable blob? + if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && + b->get_blob().get_ondisk_length() >= b_off + b_len && + b->get_blob().is_unused(b_off, b_len) && + b->get_blob().is_allocated(b_off, b_len)) { + _apply_padding(head_pad, tail_pad, bl); + + dout(20) << __func__ << " write to unused 0x" << std::hex + << b_off << "~" << b_len + << " pad 0x" << head_pad << " + 0x" << tail_pad + << std::dec << " of mutable " << *b << dendl; + _buffer_cache_write(txc, b, b_off, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + if (!g_conf()->bluestore_debug_omit_block_device_write) { + if (b_len < prefer_deferred_size) { + dout(20) << __func__ << " deferring small 0x" << std::hex + << b_len << std::dec << " unused write via deferred" << dendl; + bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + b->get_blob().map( + b_off, b_len, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + op->data = bl; + } else { + b->get_blob().map_bl( + b_off, bl, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, + &txc->ioc, wctx->buffered); + }); + } + } + b->dirty_blob().calc_csum(b_off, bl); + dout(20) << __func__ << " lex old " << *ep << dendl; + Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, + b, + &wctx->old_extents); + b->dirty_blob().mark_used(le->blob_offset, le->length); + + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + logger->inc(l_bluestore_write_small_unused); + return; + } + // read some data to fill out the chunk? + uint64_t head_read = p2phase(b_off, chunk_size); + uint64_t tail_read = p2nphase(b_off + b_len, chunk_size); + if ((head_read || tail_read) && + (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) && + head_read + tail_read < min_alloc_size) { + b_off -= head_read; + b_len += head_read + tail_read; + + } else { + head_read = tail_read = 0; + } + + // chunk-aligned deferred overwrite? 
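+ // If, after folding in head_read/tail_read, [b_off, b_off + b_len) is
+ // chunk-aligned, lies within the blob's on-disk length, and is fully
+ // allocated, the overwrite is handled as a read-modify-write: read back
+ // the head/tail chunks (zero-filling any short read), then submit the
+ // whole aligned buffer as a single deferred (WAL) write below.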
+ if (b->get_blob().get_ondisk_length() >= b_off + b_len && + b_off % chunk_size == 0 && + b_len % chunk_size == 0 && + b->get_blob().is_allocated(b_off, b_len)) { + + _apply_padding(head_pad, tail_pad, bl); + + dout(20) << __func__ << " reading head 0x" << std::hex << head_read + << " and tail 0x" << tail_read << std::dec << dendl; + if (head_read) { + bufferlist head_bl; + int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read, + head_bl, 0); + ceph_assert(r >= 0 && r <= (int)head_read); + size_t zlen = head_read - r; + if (zlen) { + head_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + head_bl.claim_append(bl); + bl.swap(head_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + if (tail_read) { + bufferlist tail_bl; + int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read, + tail_bl, 0); + ceph_assert(r >= 0 && r <= (int)tail_read); + size_t zlen = tail_read - r; + if (zlen) { + tail_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + bl.claim_append(tail_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + logger->inc(l_bluestore_write_small_pre_read); + + _buffer_cache_write(txc, b, b_off, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + b->dirty_blob().calc_csum(b_off, bl); + + if (!g_conf()->bluestore_debug_omit_block_device_write) { + bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + int r = b->get_blob().map( + b_off, b_len, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + ceph_assert(r == 0); + op->data = std::move(bl); + dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" + << b_len << std::dec << " of mutable " << *b + << " at " << op->extents << dendl; + } + + Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, + b, &wctx->old_extents); + b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + return; + } + // try to reuse blob if we can + if (b->can_reuse_blob(min_alloc_size, + max_bsize, + offset0 - bstart, + &alloc_len)) { + ceph_assert(alloc_len == min_alloc_size); // expecting data always + // fit into reused blob + // Need to check for pending writes desiring to + // reuse the same pextent. The rationale is that during GC two chunks + // from garbage blobs(compressed?) can share logical space within the same + // AU. That's in turn might be caused by unaligned len in clone_range2. + // Hence the second write will fail in an attempt to reuse blob at + // do_alloc_write(). 
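+ // wctx->has_conflict() (checked next) reports whether another write
+ // already queued in this WriteContext targets the same
+ // min_alloc_size-aligned span of this blob; if so we skip the reuse and
+ // fall through to allocating a new blob further below.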
+ if (!wctx->has_conflict(b, + offset0, + offset0 + alloc_len, + min_alloc_size)) { + + // we can't reuse pad_head/pad_tail since they might be truncated + // due to existent extents + uint64_t b_off = offset - bstart; + uint64_t b_off0 = b_off; + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + + // Zero detection -- small block + if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) { + _pad_zeros(&bl, &b_off0, chunk_size); + + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + false, false); + logger->inc(l_bluestore_write_small_unused); + } else { // if (bl.is_zero()) + dout(20) << __func__ << " skip small zero block " << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + logger->inc(l_bluestore_write_small_skipped); + logger->inc(l_bluestore_write_small_skipped_bytes, length); + } + + return; + } + } + } + ++ep; + end_ep = ep; + any_change = true; + } // if (ep != end && ep->logical_offset < offset + max_bsize) + + // check extent for reuse in reverse order + if (prev_ep != end && prev_ep->logical_offset >= min_off) { + BlobRef b = prev_ep->blob; + if (!above_blob_threshold) { + inspected_blobs.insert(&b->get_blob()); + above_blob_threshold = inspected_blobs.size() >= blob_threshold; + } + start_ep = prev_ep; + auto bstart = prev_ep->blob_start(); + dout(20) << __func__ << " considering " << *b + << " bstart 0x" << std::hex << bstart << std::dec << dendl; + if (b->can_reuse_blob(min_alloc_size, + max_bsize, + offset0 - bstart, + &alloc_len)) { + ceph_assert(alloc_len == min_alloc_size); // expecting data always + // fit into reused blob + // Need to check for pending writes desiring to + // reuse the same pextent. The rationale is that during GC two chunks + // from garbage blobs(compressed?) can share logical space within the same + // AU. That's in turn might be caused by unaligned len in clone_range2. + // Hence the second write will fail in an attempt to reuse blob at + // do_alloc_write(). 
+ if (!wctx->has_conflict(b, + offset0, + offset0 + alloc_len, + min_alloc_size)) { + + uint64_t b_off = offset - bstart; + uint64_t b_off0 = b_off; + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + + // Zero detection -- small block + if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) { + uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); + _pad_zeros(&bl, &b_off0, chunk_size); + + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + false, false); + logger->inc(l_bluestore_write_small_unused); + } else { // if (bl.is_zero()) + dout(20) << __func__ << " skip small zero block " << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + logger->inc(l_bluestore_write_small_skipped); + logger->inc(l_bluestore_write_small_skipped_bytes, length); + } + + return; + } + } + if (prev_ep != begin) { + --prev_ep; + any_change = true; + } else { + prev_ep = end; // to avoid useless first extent re-check + } + } // if (prev_ep != end && prev_ep->logical_offset >= min_off) + } while (any_change); + + if (above_blob_threshold) { + dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size() + << " " << std::hex << min_off << "~" << max_off << std::dec + << dendl; + ceph_assert(start_ep != end_ep); + for (auto ep = start_ep; ep != end_ep; ++ep) { + dout(20) << __func__ << " inserting for GC " + << std::hex << ep->logical_offset << "~" << ep->length + << std::dec << dendl; + + wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length); + } + // insert newly written extent to GC + wctx->extents_to_gc.union_insert(offset, length); + dout(20) << __func__ << " inserting (last) for GC " + << std::hex << offset << "~" << length + << std::dec << dendl; + } + uint64_t b_off = p2phase<uint64_t>(offset, alloc_len); + uint64_t b_off0 = b_off; + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + + // Zero detection -- small block + if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) { + // new blob. + BlobRef b = c->new_blob(); + _pad_zeros(&bl, &b_off0, block_size); + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity + // doesn't match disk one only + true); + } else { // if (bl.is_zero()) + dout(20) << __func__ << " skip small zero block " << std::hex + << " (0x" << b_off0 << "~" << bl.length() << ")" + << " (0x" << b_off << "~" << length << ")" + << std::dec << dendl; + logger->inc(l_bluestore_write_small_skipped); + logger->inc(l_bluestore_write_small_skipped_bytes, length); + } + + return; +} + +bool BlueStore::BigDeferredWriteContext::can_defer( + BlueStore::extent_map_t::iterator ep, + uint64_t prefer_deferred_size, + uint64_t block_size, + uint64_t offset, + uint64_t l) +{ + bool res = false; + auto& blob = ep->blob->get_blob(); + if (offset >= ep->blob_start() && + blob.is_mutable()) { + off = offset; + b_off = offset - ep->blob_start(); + uint64_t chunk_size = blob.get_chunk_size(block_size); + uint64_t ondisk = blob.get_ondisk_length(); + used = std::min(l, ondisk - b_off); + + // will read some data to fill out the chunk? 
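+ // Illustrative numbers (not from the code): with chunk_size 0x1000,
+ // b_off 0x1a00 and used 0x2400 give head_read 0xa00 and tail_read 0x200;
+ // b_off is pulled back to 0x1000 and the aligned length
+ // (head_read + used + tail_read) becomes 0x3000, which is what gets
+ // compared against prefer_deferred_size below.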
+ head_read = p2phase<uint64_t>(b_off, chunk_size); + tail_read = p2nphase<uint64_t>(b_off + used, chunk_size); + b_off -= head_read; + + ceph_assert(b_off % chunk_size == 0); + ceph_assert(blob_aligned_len() % chunk_size == 0); + + res = blob_aligned_len() < prefer_deferred_size && + blob_aligned_len() <= ondisk && + blob.is_allocated(b_off, blob_aligned_len()); + if (res) { + blob_ref = ep->blob; + blob_start = ep->blob_start(); + } + } + return res; +} + +bool BlueStore::BigDeferredWriteContext::apply_defer() +{ + int r = blob_ref->get_blob().map( + b_off, blob_aligned_len(), + [&](const bluestore_pextent_t& pext, + uint64_t offset, + uint64_t length) { + // apply deferred if overwrite breaks blob continuity only. + // if it totally overlaps some pextent - fallback to regular write + if (pext.offset < offset || + pext.end() > offset + length) { + res_extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + } + return -1; + }); + return r >= 0; +} + +void BlueStore::_do_write_big_apply_deferred( + TransContext* txc, + CollectionRef& c, + OnodeRef& o, + BlueStore::BigDeferredWriteContext& dctx, + bufferlist::iterator& blp, + WriteContext* wctx) +{ + bufferlist bl; + dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read + << " and tail 0x" << dctx.tail_read << std::dec << dendl; + if (dctx.head_read) { + int r = _do_read(c.get(), o, + dctx.off - dctx.head_read, + dctx.head_read, + bl, + 0); + ceph_assert(r >= 0 && r <= (int)dctx.head_read); + size_t zlen = dctx.head_read - r; + if (zlen) { + bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + logger->inc(l_bluestore_write_penalty_read_ops); + } + blp.copy(dctx.used, bl); + + if (dctx.tail_read) { + bufferlist tail_bl; + int r = _do_read(c.get(), o, + dctx.off + dctx.used, dctx.tail_read, + tail_bl, 0); + ceph_assert(r >= 0 && r <= (int)dctx.tail_read); + size_t zlen = dctx.tail_read - r; + if (zlen) { + tail_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + bl.claim_append(tail_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + auto& b0 = dctx.blob_ref; + _buffer_cache_write(txc, b0, dctx.b_off, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + b0->dirty_blob().calc_csum(dctx.b_off, bl); + + Extent* le = o->extent_map.set_lextent(c, dctx.off, + dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents); + + // in fact this is a no-op for big writes but left here to maintain + // uniformity and avoid missing after some refactor. 
+ b0->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + + if (!g_conf()->bluestore_debug_omit_block_device_write) { + bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + op->extents.swap(dctx.res_extents); + op->data = std::move(bl); + } +} + +void BlueStore::_do_write_big( + TransContext *txc, + CollectionRef &c, + OnodeRef& o, + uint64_t offset, uint64_t length, + bufferlist::iterator& blp, + WriteContext *wctx) +{ + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length + << " target_blob_size 0x" << wctx->target_blob_size << std::dec + << " compress " << (int)wctx->compress + << dendl; + logger->inc(l_bluestore_write_big); + logger->inc(l_bluestore_write_big_bytes, length); + auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); + uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load(); + while (length > 0) { + bool new_blob = false; + BlobRef b; + uint32_t b_off = 0; + uint32_t l = 0; + + //attempting to reuse existing blob + if (!wctx->compress) { + // enforce target blob alignment with max_bsize + l = max_bsize - p2phase(offset, max_bsize); + l = std::min(uint64_t(l), length); + + auto end = o->extent_map.extent_map.end(); + + dout(20) << __func__ << " may be defer: 0x" << std::hex + << offset << "~" << l + << std::dec << dendl; + + if (prefer_deferred_size_snapshot && + l <= prefer_deferred_size_snapshot * 2) { + // Single write that spans two adjusted existing blobs can result + // in up to two deferred blocks of 'prefer_deferred_size' + // So we're trying to minimize the amount of resulting blobs + // and preserve 2 blobs rather than inserting one more in between + // E.g. write 0x10000~20000 over existing blobs + // (0x0~20000 and 0x20000~20000) is better (from subsequent reading + // performance point of view) to result in two deferred writes to + // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000 + + // look for an existing mutable blob we can write into + auto ep = o->extent_map.seek_lextent(offset); + auto ep_next = end; + BigDeferredWriteContext head_info, tail_info; + + bool will_defer = ep != end ? + head_info.can_defer(ep, + prefer_deferred_size_snapshot, + block_size, + offset, + l) : + false; + auto offset_next = offset + head_info.used; + auto remaining = l - head_info.used; + if (will_defer && remaining) { + will_defer = false; + if (remaining <= prefer_deferred_size_snapshot) { + ep_next = o->extent_map.seek_lextent(offset_next); + // check if we can defer remaining totally + will_defer = ep_next == end ? 
+ false : + tail_info.can_defer(ep_next, + prefer_deferred_size_snapshot, + block_size, + offset_next, + remaining); + will_defer = will_defer && remaining == tail_info.used; + } + } + if (will_defer) { + dout(20) << __func__ << " " << *(head_info.blob_ref) + << " deferring big " << std::hex + << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")" + << std::dec << " write via deferred" + << dendl; + if (remaining) { + dout(20) << __func__ << " " << *(tail_info.blob_ref) + << " deferring big " << std::hex + << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")" + << std::dec << " write via deferred" + << dendl; + } + + will_defer = head_info.apply_defer(); + if (!will_defer) { + dout(20) << __func__ + << " deferring big fell back, head isn't continuous" + << dendl; + } else if (remaining) { + will_defer = tail_info.apply_defer(); + if (!will_defer) { + dout(20) << __func__ + << " deferring big fell back, tail isn't continuous" + << dendl; + } + } + } + if (will_defer) { + _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx); + if (remaining) { + _do_write_big_apply_deferred(txc, c, o, tail_info, + blp, wctx); + } + dout(20) << __func__ << " defer big: 0x" << std::hex + << offset << "~" << l + << std::dec << dendl; + offset += l; + length -= l; + logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1); + logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1); + continue; + } + } + dout(20) << __func__ << " lookup for blocks to reuse..." << dendl; + + o->extent_map.punch_hole(c, offset, l, &wctx->old_extents); + + // seek again as punch_hole could invalidate ep + auto ep = o->extent_map.seek_lextent(offset); + auto begin = o->extent_map.extent_map.begin(); + auto prev_ep = end; + if (ep != begin) { + prev_ep = ep; + --prev_ep; + } + + auto min_off = offset >= max_bsize ? offset - max_bsize : 0; + // search suitable extent in both forward and reverse direction in + // [offset - target_max_blob_size, offset + target_max_blob_size] range + // then check if blob can be reused via can_reuse_blob func. + bool any_change; + do { + any_change = false; + if (ep != end && ep->logical_offset < offset + max_bsize) { + dout(20) << __func__ << " considering " << *ep + << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl; + + if (offset >= ep->blob_start() && + ep->blob->can_reuse_blob(min_alloc_size, max_bsize, + offset - ep->blob_start(), + &l)) { + b = ep->blob; + b_off = offset - ep->blob_start(); + prev_ep = end; // to avoid check below + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; + } else { + ++ep; + any_change = true; + } + } + + if (prev_ep != end && prev_ep->logical_offset >= min_off) { + dout(20) << __func__ << " considering rev " << *prev_ep + << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl; + if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize, + offset - prev_ep->blob_start(), + &l)) { + b = prev_ep->blob; + b_off = offset - prev_ep->blob_start(); + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; + } else if (prev_ep != begin) { + --prev_ep; + any_change = true; + } else { + prev_ep = end; // to avoid useless first extent re-check + } + } + } while (b == nullptr && any_change); + } else { + // trying to utilize as longer chunk as permitted in case of compression. 
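+ // For compressed writes there is no blob-reuse search: the data is
+ // carved into pieces of at most max_bsize, a hole is punched for each
+ // piece, and _do_alloc_write() later compresses every piece
+ // independently. Illustrative example (not from the code): a 1 MiB
+ // write with a 64 KiB target blob size becomes 16 candidate blobs.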
+ l = std::min(max_bsize, length); + o->extent_map.punch_hole(c, offset, l, &wctx->old_extents); + } // if (!wctx->compress) + + if (b == nullptr) { + b = c->new_blob(); + b_off = 0; + new_blob = true; + } + bufferlist t; + blp.copy(l, t); + + // Zero detection -- big block + if (!cct->_conf->bluestore_zero_block_detection || !t.is_zero()) { + wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob); + + dout(20) << __func__ << " schedule write big: 0x" + << std::hex << offset << "~" << l << std::dec + << (new_blob ? " new " : " reuse ") + << *b << dendl; + + logger->inc(l_bluestore_write_big_blobs); + } else { // if (!t.is_zero()) + dout(20) << __func__ << " skip big zero block " << std::hex + << " (0x" << b_off << "~" << t.length() << ")" + << " (0x" << b_off << "~" << l << ")" + << std::dec << dendl; + logger->inc(l_bluestore_write_big_skipped_blobs); + logger->inc(l_bluestore_write_big_skipped_bytes, l); + } + + offset += l; + length -= l; + } +} + +int BlueStore::_do_alloc_write( + TransContext *txc, + CollectionRef coll, + OnodeRef& o, + WriteContext *wctx) +{ + dout(20) << __func__ << " txc " << txc + << " " << wctx->writes.size() << " blobs" + << dendl; + if (wctx->writes.empty()) { + return 0; + } + + CompressorRef c; + double crr = 0; + if (wctx->compress) { + c = select_option( + "compression_algorithm", + compressor, + [&]() { + string val; + if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) { + CompressorRef cp = compressor; + if (!cp || cp->get_type_name() != val) { + cp = Compressor::create(cct, val); + if (!cp) { + if (_set_compression_alert(false, val.c_str())) { + derr << __func__ << " unable to initialize " << val.c_str() + << " compressor" << dendl; + } + } + } + return std::optional<CompressorRef>(cp); + } + return std::optional<CompressorRef>(); + } + ); + + crr = select_option( + "compression_required_ratio", + cct->_conf->bluestore_compression_required_ratio, + [&]() { + double val; + if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) { + return std::optional<double>(val); + } + return std::optional<double>(); + } + ); + } + + // checksum + int64_t csum = csum_type.load(); + csum = select_option( + "csum_type", + csum, + [&]() { + int64_t val; + if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) { + return std::optional<int64_t>(val); + } + return std::optional<int64_t>(); + } + ); + + // compress (as needed) and calc needed space + uint64_t need = 0; + uint64_t data_size = 0; + // 'need' is amount of space that must be provided by allocator. + // 'data_size' is a size of data that will be transferred to disk. + // Note that data_size is always <= need. This comes from: + // - write to blob was unaligned, and there is free space + // - data has been compressed + // + // We make one decision and apply it to all blobs. + // All blobs will be deferred or none will. + // We assume that allocator does its best to provide contiguous space, + // and the condition is : (data_size < deferred). 
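+ // Illustrative example (numbers are not from the configuration): with
+ // min_alloc_size 0x1000 and a required ratio of 0.875, a 0x10000 blob
+ // must compress (after rounding up to min_alloc_size) to at most 0xe000
+ // to be kept; if it compresses to 0x9c00 it is padded to 0xa000 and adds
+ // 0xa000 to 'need', while a rejected or incompressible blob adds its
+ // full 0x10000. Whether the whole batch takes the deferred path is then
+ // decided once, by comparing the summed data_size against
+ // prefer_deferred_size.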
+ + auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); + for (auto& wi : wctx->writes) { + if (c && wi.blob_length > min_alloc_size) { + auto start = mono_clock::now(); + + // compress + ceph_assert(wi.b_off == 0); + ceph_assert(wi.blob_length == wi.bl.length()); + + // FIXME: memory alignment here is bad + bufferlist t; + std::optional<int32_t> compressor_message; + int r = c->compress(wi.bl, t, compressor_message); + uint64_t want_len_raw = wi.blob_length * crr; + uint64_t want_len = p2roundup(want_len_raw, min_alloc_size); + bool rejected = false; + uint64_t compressed_len = t.length(); + // do an approximate (fast) estimation for resulting blob size + // that doesn't take header overhead into account + uint64_t result_len = p2roundup(compressed_len, min_alloc_size); + if (r == 0 && result_len <= want_len && result_len < wi.blob_length) { + bluestore_compression_header_t chdr; + chdr.type = c->get_type(); + chdr.length = t.length(); + chdr.compressor_message = compressor_message; + encode(chdr, wi.compressed_bl); + wi.compressed_bl.claim_append(t); + + compressed_len = wi.compressed_bl.length(); + result_len = p2roundup(compressed_len, min_alloc_size); + if (result_len <= want_len && result_len < wi.blob_length) { + // Cool. We compressed at least as much as we were hoping to. + // pad out to min_alloc_size + wi.compressed_bl.append_zero(result_len - compressed_len); + wi.compressed_len = compressed_len; + wi.compressed = true; + logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len); + dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length + << " -> 0x" << compressed_len << " => 0x" << result_len + << " with " << c->get_type() + << std::dec << dendl; + txc->statfs_delta.compressed() += compressed_len; + txc->statfs_delta.compressed_original() += wi.blob_length; + txc->statfs_delta.compressed_allocated() += result_len; + logger->inc(l_bluestore_compress_success_count); + need += result_len; + data_size += result_len; + } else { + rejected = true; + } + } else if (r != 0) { + dout(5) << __func__ << std::hex << " 0x" << wi.blob_length + << " bytes compressed using " << c->get_type_name() + << std::dec + << " failed with errcode = " << r + << ", leaving uncompressed" + << dendl; + logger->inc(l_bluestore_compress_rejected_count); + need += wi.blob_length; + data_size += wi.bl.length(); + } else { + rejected = true; + } + + if (rejected) { + dout(20) << __func__ << std::hex << " 0x" << wi.blob_length + << " compressed to 0x" << compressed_len << " -> 0x" << result_len + << " with " << c->get_type() + << ", which is more than required 0x" << want_len_raw + << " -> 0x" << want_len + << ", leaving uncompressed" + << std::dec << dendl; + logger->inc(l_bluestore_compress_rejected_count); + need += wi.blob_length; + data_size += wi.bl.length(); + } + log_latency("compress@_do_alloc_write", + l_bluestore_compress_lat, + mono_clock::now() - start, + cct->_conf->bluestore_log_op_age ); + } else { + need += wi.blob_length; + data_size += wi.bl.length(); + } + } + PExtentVector prealloc; + prealloc.reserve(2 * wctx->writes.size()); + int64_t prealloc_left = 0; + prealloc_left = alloc->allocate( + need, min_alloc_size, need, + 0, &prealloc); + if (prealloc_left < 0 || prealloc_left < (int64_t)need) { + derr << __func__ << " failed to allocate 0x" << std::hex << need + << " allocated 0x " << (prealloc_left < 0 ? 
0 : prealloc_left) + << " min_alloc_size 0x" << min_alloc_size + << " available 0x " << alloc->get_free() + << std::dec << dendl; + if (prealloc.size()) { + alloc->release(prealloc); + } + return -ENOSPC; + } + _collect_allocation_stats(need, min_alloc_size, prealloc); + + dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size + << " prealloc " << prealloc << dendl; + auto prealloc_pos = prealloc.begin(); + ceph_assert(prealloc_pos != prealloc.end()); + + for (auto& wi : wctx->writes) { + bluestore_blob_t& dblob = wi.b->dirty_blob(); + uint64_t b_off = wi.b_off; + bufferlist *l = &wi.bl; + uint64_t final_length = wi.blob_length; + uint64_t csum_length = wi.blob_length; + if (wi.compressed) { + final_length = wi.compressed_bl.length(); + csum_length = final_length; + unsigned csum_order = std::countr_zero(csum_length); + l = &wi.compressed_bl; + dblob.set_compressed(wi.blob_length, wi.compressed_len); + if (csum != Checksummer::CSUM_NONE) { + dout(20) << __func__ + << " initialize csum setting for compressed blob " << *wi.b + << " csum_type " << Checksummer::get_csum_type_string(csum) + << " csum_order " << csum_order + << " csum_length 0x" << std::hex << csum_length + << " blob_length 0x" << wi.blob_length + << " compressed_length 0x" << wi.compressed_len << std::dec + << dendl; + dblob.init_csum(csum, csum_order, csum_length); + } + } else if (wi.new_blob) { + unsigned csum_order; + // initialize newly created blob only + ceph_assert(dblob.is_mutable()); + if (l->length() != wi.blob_length) { + // hrm, maybe we could do better here, but let's not bother. + dout(20) << __func__ << " forcing csum_order to block_size_order " + << block_size_order << dendl; + csum_order = block_size_order; + } else { + csum_order = std::min<unsigned>(wctx->csum_order, std::countr_zero(l->length())); + } + // try to align blob with max_blob_size to improve + // its reuse ratio, e.g. 
in case of reverse write + uint32_t suggested_boff = + (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize; + if ((suggested_boff % (1 << csum_order)) == 0 && + suggested_boff + final_length <= max_bsize && + suggested_boff > b_off) { + dout(20) << __func__ << " forcing blob_offset to 0x" + << std::hex << suggested_boff << std::dec << dendl; + ceph_assert(suggested_boff >= b_off); + csum_length += suggested_boff - b_off; + b_off = suggested_boff; + } + if (csum != Checksummer::CSUM_NONE) { + dout(20) << __func__ + << " initialize csum setting for new blob " << *wi.b + << " csum_type " << Checksummer::get_csum_type_string(csum) + << " csum_order " << csum_order + << " csum_length 0x" << std::hex << csum_length << std::dec + << dendl; + dblob.init_csum(csum, csum_order, csum_length); + } + } + + PExtentVector extents; + int64_t left = final_length; + auto prefer_deferred_size_snapshot = prefer_deferred_size.load(); + while (left > 0) { + ceph_assert(prealloc_left > 0); + if (prealloc_pos->length <= left) { + prealloc_left -= prealloc_pos->length; + left -= prealloc_pos->length; + txc->statfs_delta.allocated() += prealloc_pos->length; + extents.push_back(*prealloc_pos); + ++prealloc_pos; + } else { + extents.emplace_back(prealloc_pos->offset, left); + prealloc_pos->offset += left; + prealloc_pos->length -= left; + prealloc_left -= left; + txc->statfs_delta.allocated() += left; + left = 0; + break; + } + } + for (auto& p : extents) { + txc->allocated.insert(p.offset, p.length); + } + dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents); + + dout(20) << __func__ << " blob " << *wi.b << dendl; + if (dblob.has_csum()) { + dblob.calc_csum(b_off, *l); + } + + if (wi.mark_unused) { + ceph_assert(!dblob.is_compressed()); + auto b_end = b_off + wi.bl.length(); + if (b_off) { + dblob.add_unused(0, b_off); + } + uint64_t llen = dblob.get_logical_length(); + if (b_end < llen) { + dblob.add_unused(b_end, llen - b_end); + } + } + + Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset, + b_off + (wi.b_off0 - wi.b_off), + wi.length0, + wi.b, + nullptr); + wi.b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + _buffer_cache_write(txc, wi.b, b_off, wi.bl, + wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); + + // queue io + if (!g_conf()->bluestore_debug_omit_block_device_write) { + if (data_size < prefer_deferred_size_snapshot) { + dout(20) << __func__ << " deferring 0x" << std::hex + << l->length() << std::dec << " write via deferred" << dendl; + bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + int r = wi.b->get_blob().map( + b_off, l->length(), + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + ceph_assert(r == 0); + op->data = *l; + } else { + wi.b->get_blob().map_bl( + b_off, *l, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, &txc->ioc, false); + }); + logger->inc(l_bluestore_write_new); + } + } + } + ceph_assert(prealloc_pos == prealloc.end()); + ceph_assert(prealloc_left == 0); + return 0; +} + +void BlueStore::_wctx_finish( + TransContext *txc, + CollectionRef& c, + OnodeRef& o, + WriteContext *wctx, + set<SharedBlob*> *maybe_unshared_blobs) +{ +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + for (auto& w : wctx->writes) { + for (auto& e : w.b->get_blob().get_extents()) { + if (!e.is_valid()) { + continue; + } + uint32_t zone = e.offset / zone_size; + if (!o->onode.zone_offset_refs.count(zone)) { + uint64_t zoff = e.offset % zone_size; + dout(20) << __func__ << " add ref zone 0x" << std::hex << zone + << " offset 0x" << zoff << std::dec << dendl; + txc->note_write_zone_offset(o, zone, zoff); + } + } + } + } + set<uint32_t> zones_with_releases; +#endif + + auto oep = wctx->old_extents.begin(); + while (oep != wctx->old_extents.end()) { + auto &lo = *oep; + oep = wctx->old_extents.erase(oep); + dout(20) << __func__ << " lex_old " << lo.e << dendl; + BlobRef b = lo.e.blob; + const bluestore_blob_t& blob = b->get_blob(); + if (blob.is_compressed()) { + if (lo.blob_empty) { + txc->statfs_delta.compressed() -= blob.get_compressed_payload_length(); + } + txc->statfs_delta.compressed_original() -= lo.e.length; + } + auto& r = lo.r; + txc->statfs_delta.stored() -= lo.e.length; + if (!r.empty()) { + dout(20) << __func__ << " blob " << *b << " release " << r << dendl; + if (blob.is_shared()) { + PExtentVector final; + c->load_shared_blob(b->shared_blob); + bool unshare = false; + bool* unshare_ptr = + !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare; + for (auto e : r) { + b->shared_blob->put_ref( + e.offset, e.length, &final, + unshare_ptr); +#ifdef HAVE_LIBZBD + // we also drop zone ref for shared blob extents + if (bdev->is_smr() && e.is_valid()) { + zones_with_releases.insert(e.offset / zone_size); + } +#endif + } + if (unshare) { + ceph_assert(maybe_unshared_blobs); + maybe_unshared_blobs->insert(b->shared_blob.get()); + } + dout(20) << __func__ << " shared_blob release " << final + << " from " << *b->shared_blob << dendl; + txc->write_shared_blob(b->shared_blob); + r.clear(); + r.swap(final); + } + } + // we can't invalidate our logical extents as we drop them because + // other lextents (either in our onode or others) may still + // reference them. but we can throw out anything that is no + // longer allocated. Note that this will leave behind edge bits + // that are no longer referenced but not deallocated (until they + // age out of the cache naturally). 
+ b->discard_unallocated(c.get()); + for (auto e : r) { + dout(20) << __func__ << " release " << e << dendl; + txc->released.insert(e.offset, e.length); + txc->statfs_delta.allocated() -= e.length; + if (blob.is_compressed()) { + txc->statfs_delta.compressed_allocated() -= e.length; + } +#ifdef HAVE_LIBZBD + if (bdev->is_smr() && e.is_valid()) { + zones_with_releases.insert(e.offset / zone_size); + } +#endif + } + + if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) { + dout(20) << __func__ << " spanning_blob_map removing empty " << *b + << dendl; + o->extent_map.spanning_blob_map.erase(b->id); + } + delete &lo; + } + +#ifdef HAVE_LIBZBD + if (!zones_with_releases.empty()) { + // we need to fault the entire extent range in here to determinte if we've dropped + // all refs to a zone. + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + for (auto& b : o->extent_map.extent_map) { + for (auto& e : b.blob->get_blob().get_extents()) { + if (e.is_valid()) { + zones_with_releases.erase(e.offset / zone_size); + } + } + } + for (auto zone : zones_with_releases) { + auto p = o->onode.zone_offset_refs.find(zone); + if (p != o->onode.zone_offset_refs.end()) { + dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone + << " offset 0x" << p->second << std::dec << dendl; + txc->note_release_zone_offset(o, zone, p->second); + } + } + } +#endif +} + +void BlueStore::_do_write_data( + TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, + uint64_t length, + bufferlist& bl, + WriteContext *wctx) +{ + uint64_t end = offset + length; + bufferlist::iterator p = bl.begin(); + + if (offset / min_alloc_size == (end - 1) / min_alloc_size && + (length != min_alloc_size)) { + // we fall within the same block + _do_write_small(txc, c, o, offset, length, p, wctx); + } else { + uint64_t head_offset, head_length; + uint64_t middle_offset, middle_length; + uint64_t tail_offset, tail_length; + + head_offset = offset; + head_length = p2nphase(offset, min_alloc_size); + + tail_offset = p2align(end, min_alloc_size); + tail_length = p2phase(end, min_alloc_size); + + middle_offset = head_offset + head_length; + middle_length = length - head_length - tail_length; + + if (head_length) { + _do_write_small(txc, c, o, head_offset, head_length, p, wctx); + } + + _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx); + + if (tail_length) { + _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx); + } + } +} + +void BlueStore::_choose_write_options( + CollectionRef& c, + OnodeRef& o, + uint32_t fadvise_flags, + WriteContext *wctx) +{ + if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { + dout(20) << __func__ << " will do buffered write" << dendl; + wctx->buffered = true; + } else if (cct->_conf->bluestore_default_buffered_write && + (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { + dout(20) << __func__ << " defaulting to buffered write" << dendl; + wctx->buffered = true; + } + + // apply basic csum block size + wctx->csum_order = block_size_order; + + // compression parameters + unsigned alloc_hints = o->onode.alloc_hint_flags; + auto cm = select_option( + "compression_mode", + comp_mode.load(), + [&]() { + string val; + if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) { + return std::optional<Compressor::CompressionMode>( + Compressor::get_comp_mode_type(val)); + } + return std::optional<Compressor::CompressionMode>(); + } + ); + + wctx->compress = (cm != Compressor::COMP_NONE) && + ((cm == Compressor::COMP_FORCE) 
|| + (cm == Compressor::COMP_AGGRESSIVE && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) || + (cm == Compressor::COMP_PASSIVE && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE))); + + if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 && + (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) && + (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) { + + dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl; + + if (o->onode.expected_write_size) { + wctx->csum_order = std::max(min_alloc_size_order, + (uint8_t)std::countr_zero(o->onode.expected_write_size)); + } else { + wctx->csum_order = min_alloc_size_order; + } + + if (wctx->compress) { + wctx->target_blob_size = select_option( + "compression_max_blob_size", + comp_max_blob_size.load(), + [&]() { + int64_t val; + if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) { + return std::optional<uint64_t>((uint64_t)val); + } + return std::optional<uint64_t>(); + } + ); + } + } else { + if (wctx->compress) { + wctx->target_blob_size = select_option( + "compression_min_blob_size", + comp_min_blob_size.load(), + [&]() { + int64_t val; + if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) { + return std::optional<uint64_t>((uint64_t)val); + } + return std::optional<uint64_t>(); + } + ); + } + } + + uint64_t max_bsize = max_blob_size.load(); + if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) { + wctx->target_blob_size = max_bsize; + } + + // set the min blob size floor at 2x the min_alloc_size, or else we + // won't be able to allocate a smaller extent for the compressed + // data. + if (wctx->compress && + wctx->target_blob_size < min_alloc_size * 2) { + wctx->target_blob_size = min_alloc_size * 2; + } + + dout(20) << __func__ << " prefer csum_order " << wctx->csum_order + << " target_blob_size 0x" << std::hex << wctx->target_blob_size + << " compress=" << (int)wctx->compress + << " buffered=" << (int)wctx->buffered + << std::dec << dendl; +} + +int BlueStore::_do_gc( + TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const WriteContext& wctx, + uint64_t *dirty_start, + uint64_t *dirty_end) +{ + + bool dirty_range_updated = false; + WriteContext wctx_gc; + wctx_gc.fork(wctx); // make a clone for garbage collection + + auto & extents_to_collect = wctx.extents_to_gc; + for (auto it = extents_to_collect.begin(); + it != extents_to_collect.end(); + ++it) { + bufferlist bl; + auto offset = (*it).first; + auto length = (*it).second; + dout(20) << __func__ << " processing " << std::hex + << offset << "~" << length << std::dec + << dendl; + int r = _do_read(c.get(), o, offset, length, bl, 0); + ceph_assert(r == (int)length); + + _do_write_data(txc, c, o, offset, length, bl, &wctx_gc); + logger->inc(l_bluestore_gc_merged, length); + + if (*dirty_start > offset) { + *dirty_start = offset; + dirty_range_updated = true; + } + + if (*dirty_end < offset + length) { + *dirty_end = offset + length; + dirty_range_updated = true; + } + } + if (dirty_range_updated) { + o->extent_map.fault_range(db, *dirty_start, *dirty_end); + } + + dout(30) << __func__ << " alloc write" << dendl; + int r = _do_alloc_write(txc, c, o, &wctx_gc); + if (r < 0) { + derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) + << dendl; + return r; + } + + _wctx_finish(txc, c, o, &wctx_gc); + return 0; +} + +int BlueStore::_do_write( + TransContext 
*txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, + uint64_t length, + bufferlist& bl, + uint32_t fadvise_flags) +{ + int r = 0; + + dout(20) << __func__ + << " " << o->oid + << " 0x" << std::hex << offset << "~" << length + << " - have 0x" << o->onode.size + << " (" << std::dec << o->onode.size << ")" + << " bytes" << std::hex + << " fadvise_flags 0x" << fadvise_flags + << " alloc_hint 0x" << o->onode.alloc_hint_flags + << " expected_object_size " << o->onode.expected_object_size + << " expected_write_size " << o->onode.expected_write_size + << std::dec + << dendl; + _dump_onode<30>(cct, *o); + + if (length == 0) { + return 0; + } + + uint64_t end = offset + length; + + GarbageCollector gc(c->store->cct); + int64_t benefit = 0; + auto dirty_start = offset; + auto dirty_end = end; + + WriteContext wctx; + _choose_write_options(c, o, fadvise_flags, &wctx); + o->extent_map.fault_range(db, offset, length); + _do_write_data(txc, c, o, offset, length, bl, &wctx); + r = _do_alloc_write(txc, c, o, &wctx); + if (r < 0) { + derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) + << dendl; + goto out; + } + + if (wctx.extents_to_gc.empty() || + wctx.extents_to_gc.range_start() > offset || + wctx.extents_to_gc.range_end() < offset + length) { + benefit = gc.estimate(offset, + length, + o->extent_map, + wctx.old_extents, + min_alloc_size); + } + + // NB: _wctx_finish() will empty old_extents + // so we must do gc estimation before that + _wctx_finish(txc, c, o, &wctx); + if (end > o->onode.size) { + dout(20) << __func__ << " extending size to 0x" << std::hex << end + << std::dec << dendl; + o->onode.size = end; + } + + if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) { + wctx.extents_to_gc.union_of(gc.get_extents_to_collect()); + dout(20) << __func__ + << " perform garbage collection for compressed extents, " + << "expected benefit = " << benefit << " AUs" << dendl; + } + if (!wctx.extents_to_gc.empty()) { + dout(20) << __func__ << " perform garbage collection" << dendl; + + r = _do_gc(txc, c, o, + wctx, + &dirty_start, &dirty_end); + if (r < 0) { + derr << __func__ << " _do_gc failed with " << cpp_strerror(r) + << dendl; + goto out; + } + dout(20)<<__func__<<" gc range is " << std::hex << dirty_start + << "~" << dirty_end - dirty_start << std::dec << dendl; + } + o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start); + o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start); + + r = 0; + + out: + return r; +} + +int BlueStore::_write(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length, + bufferlist& bl, + uint32_t fadvise_flags) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + int r = 0; + if (offset + length >= OBJECT_MAX_SIZE) { + r = -E2BIG; + } else { + _assign_nid(txc, o); + r = _do_write(txc, c, o, offset, length, bl, fadvise_flags); + txc->write_onode(o); + } + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + int r = 0; + if (offset + length >= OBJECT_MAX_SIZE) { + r = -E2BIG; + } else { + _assign_nid(txc, o); + r = _do_zero(txc, c, o, offset, length); + } + 
dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_do_zero(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset, size_t length) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << dendl; + int r = 0; + + _dump_onode<30>(cct, *o); + + WriteContext wctx; + o->extent_map.fault_range(db, offset, length); + o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); + o->extent_map.dirty_range(offset, length); + _wctx_finish(txc, c, o, &wctx); + + if (length > 0 && offset + length > o->onode.size) { + o->onode.size = offset + length; + dout(20) << __func__ << " extending size to " << offset + length + << dendl; + } + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << "~" << length << std::dec + << " = " << r << dendl; + return r; +} + +void BlueStore::_do_truncate( + TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset, + set<SharedBlob*> *maybe_unshared_blobs) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << std::dec << dendl; + + _dump_onode<30>(cct, *o); + + if (offset == o->onode.size) + return; + + WriteContext wctx; + if (offset < o->onode.size) { + uint64_t length = o->onode.size - offset; + o->extent_map.fault_range(db, offset, length); + o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); + o->extent_map.dirty_range(offset, length); + + _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs); + + // if we have shards past EOF, ask for a reshard + if (!o->onode.extent_map_shards.empty() && + o->onode.extent_map_shards.back().offset >= offset) { + dout(10) << __func__ << " request reshard past EOF" << dendl; + if (offset) { + o->extent_map.request_reshard(offset - 1, offset + length); + } else { + o->extent_map.request_reshard(0, length); + } + } + } + + o->onode.size = offset; + + txc->write_onode(o); +} + +int BlueStore::_truncate(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t offset) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << std::dec + << dendl; + + auto start_time = mono_clock::now(); + int r = 0; + if (offset >= OBJECT_MAX_SIZE) { + r = -E2BIG; + } else { + _do_truncate(txc, c, o, offset); + } + log_latency_fn( + __func__, + l_bluestore_truncate_lat, + mono_clock::now() - start_time, + cct->_conf->bluestore_log_op_age, + [&](const ceph::timespan& lat) { + ostringstream ostr; + ostr << ", lat = " << timespan_str(lat) + << " cid =" << c->cid + << " oid =" << o->oid; + return ostr.str(); + } + ); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " 0x" << std::hex << offset << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_do_remove( + TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + set<SharedBlob*> maybe_unshared_blobs; + bool is_gen = !o->oid.is_no_gen(); + _do_truncate(txc, c, o, 0, is_gen ? 
&maybe_unshared_blobs : nullptr); + if (o->onode.has_omap()) { + o->flush(); + _do_omap_clear(txc, o); + } + o->exists = false; + string key; + for (auto &s : o->extent_map.shards) { + dout(20) << __func__ << " removing shard 0x" << std::hex + << s.shard_info->offset << std::dec << dendl; + generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key, + [&](const string& final_key) { + txc->t->rmkey(PREFIX_OBJ, final_key); + } + ); + } + txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size()); + txc->note_removed_object(o); + o->extent_map.clear(); + o->onode = bluestore_onode_t(); + _debug_obj_on_delete(o->oid); + + if (!is_gen || maybe_unshared_blobs.empty()) { + return 0; + } + + // see if we can unshare blobs still referenced by the head + dout(10) << __func__ << " gen and maybe_unshared_blobs " + << maybe_unshared_blobs << dendl; + ghobject_t nogen = o->oid; + nogen.generation = ghobject_t::NO_GEN; + OnodeRef h = c->get_onode(nogen, false); + + if (!h || !h->exists) { + return 0; + } + + dout(20) << __func__ << " checking for unshareable blobs on " << h + << " " << h->oid << dendl; + map<SharedBlob*,bluestore_extent_ref_map_t> expect; + for (auto& e : h->extent_map.extent_map) { + const bluestore_blob_t& b = e.blob->get_blob(); + SharedBlob *sb = e.blob->shared_blob.get(); + if (b.is_shared() && + sb->loaded && + maybe_unshared_blobs.count(sb)) { + if (b.is_compressed()) { + expect[sb].get(0, b.get_ondisk_length()); + } else { + b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) { + expect[sb].get(off, len); + return 0; + }); + } + } + } + + vector<SharedBlob*> unshared_blobs; + unshared_blobs.reserve(maybe_unshared_blobs.size()); + for (auto& p : expect) { + dout(20) << " ? " << *p.first << " vs " << p.second << dendl; + if (p.first->persistent->ref_map == p.second) { + SharedBlob *sb = p.first; + dout(20) << __func__ << " unsharing " << *sb << dendl; + unshared_blobs.push_back(sb); + txc->unshare_blob(sb); + uint64_t sbid = c->make_blob_unshared(sb); + string key; + get_shared_blob_key(sbid, &key); + txc->t->rmkey(PREFIX_SHARED_BLOB, key); + } + } + + if (unshared_blobs.empty()) { + return 0; + } + + for (auto& e : h->extent_map.extent_map) { + const bluestore_blob_t& b = e.blob->get_blob(); + SharedBlob *sb = e.blob->shared_blob.get(); + if (b.is_shared() && + std::find(unshared_blobs.begin(), unshared_blobs.end(), + sb) != unshared_blobs.end()) { + dout(20) << __func__ << " unsharing " << e << dendl; + bluestore_blob_t& blob = e.blob->dirty_blob(); + blob.clear_flag(bluestore_blob_t::FLAG_SHARED); + h->extent_map.dirty_range(e.logical_offset, 1); + } + } + txc->write_onode(h); + + return 0; +} + +int BlueStore::_remove(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " onode " << o.get() + << " txc "<< txc << dendl; + auto start_time = mono_clock::now(); + int r = _do_remove(txc, c, o); + + log_latency_fn( + __func__, + l_bluestore_remove_lat, + mono_clock::now() - start_time, + cct->_conf->bluestore_log_op_age, + [&](const ceph::timespan& lat) { + ostringstream ostr; + ostr << ", lat = " << timespan_str(lat) + << " cid =" << c->cid + << " oid =" << o->oid; + return ostr.str(); + } + ); + + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_setattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name, + bufferptr& val) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " (" << 
val.length() << " bytes)" + << dendl; + int r = 0; + if (val.is_partial()) { + auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), + val.length()); + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } else { + auto& b = o->onode.attrs[name.c_str()] = val; + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " (" << val.length() << " bytes)" + << " = " << r << dendl; + return r; +} + +int BlueStore::_setattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const map<string,bufferptr>& aset) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << aset.size() << " keys" + << dendl; + int r = 0; + for (map<string,bufferptr>::const_iterator p = aset.begin(); + p != aset.end(); ++p) { + if (p->second.is_partial()) { + auto& b = o->onode.attrs[p->first.c_str()] = + bufferptr(p->second.c_str(), p->second.length()); + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } else { + auto& b = o->onode.attrs[p->first.c_str()] = p->second; + b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); + } + } + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << aset.size() << " keys" + << " = " << r << dendl; + return r; +} + + +int BlueStore::_rmattr(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& name) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " " << name << dendl; + int r = 0; + auto it = o->onode.attrs.find(name.c_str()); + if (it == o->onode.attrs.end()) + goto out; + + o->onode.attrs.erase(it); + txc->write_onode(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " " << name << " = " << r << dendl; + return r; +} + +int BlueStore::_rmattrs(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + + if (o->onode.attrs.empty()) + goto out; + + o->onode.attrs.clear(); + txc->write_onode(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o) +{ + const string& omap_prefix = o->get_omap_prefix(); + string prefix, tail; + o->get_omap_header(&prefix); + o->get_omap_tail(&tail); + txc->t->rm_range_keys(omap_prefix, prefix, tail); + txc->t->rmkey(omap_prefix, tail); + o->onode.clear_omap_flag(); + dout(20) << __func__ << " remove range start: " + << pretty_binary_string(prefix) << " end: " + << pretty_binary_string(tail) << dendl; +} + +int BlueStore::_omap_clear(TransContext *txc, + CollectionRef& c, + OnodeRef& o) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + auto t0 = mono_clock::now(); + + int r = 0; + if (o->onode.has_omap()) { + o->flush(); + _do_omap_clear(txc, o); + txc->write_onode(o); + } + logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0); + + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_setkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist &bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r; + auto p = bl.cbegin(); + __u32 num; + if (!o->onode.has_omap()) { + if (o->oid.is_pgmeta()) { + o->onode.set_omap_flags_pgmeta(); + } else { + o->onode.set_omap_flags(per_pool_omap == OMAP_BULK); + } + txc->write_onode(o); + + const string& prefix = 
o->get_omap_prefix(); + string key_tail; + bufferlist tail; + o->get_omap_tail(&key_tail); + txc->t->set(prefix, key_tail, tail); + } else { + txc->note_modified_object(o); + } + const string& prefix = o->get_omap_prefix(); + string final_key; + o->get_omap_key(string(), &final_key); + size_t base_key_len = final_key.size(); + decode(num, p); + while (num--) { + string key; + bufferlist value; + decode(key, p); + decode(value, p); + final_key.resize(base_key_len); // keep prefix + final_key += key; + dout(20) << __func__ << " " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->set(prefix, final_key, value); + } + r = 0; + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_setheader(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r; + string key; + if (!o->onode.has_omap()) { + if (o->oid.is_pgmeta()) { + o->onode.set_omap_flags_pgmeta(); + } else { + o->onode.set_omap_flags(per_pool_omap == OMAP_BULK); + } + txc->write_onode(o); + + const string& prefix = o->get_omap_prefix(); + string key_tail; + bufferlist tail; + o->get_omap_tail(&key_tail); + txc->t->set(prefix, key_tail, tail); + } else { + txc->note_modified_object(o); + } + const string& prefix = o->get_omap_prefix(); + o->get_omap_header(&key); + txc->t->set(prefix, key, bl); + r = 0; + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_rmkeys(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + bufferlist& bl) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + int r = 0; + auto p = bl.cbegin(); + __u32 num; + string final_key; + if (!o->onode.has_omap()) { + goto out; + } + { + const string& prefix = o->get_omap_prefix(); + o->get_omap_key(string(), &final_key); + size_t base_key_len = final_key.size(); + decode(num, p); + logger->inc(l_bluestore_omap_rmkeys_count, num); + while (num--) { + string key; + decode(key, p); + final_key.resize(base_key_len); // keep prefix + final_key += key; + dout(20) << __func__ << " rm " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->rmkey(prefix, final_key); + } + } + txc->note_modified_object(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_omap_rmkey_range(TransContext *txc, + CollectionRef& c, + OnodeRef& o, + const string& first, const string& last) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl; + string key_first, key_last; + int r = 0; + if (!o->onode.has_omap()) { + goto out; + } + { + const string& prefix = o->get_omap_prefix(); + o->flush(); + o->get_omap_key(first, &key_first); + o->get_omap_key(last, &key_last); + logger->inc(l_bluestore_omap_rmkey_ranges_count); + txc->t->rm_range_keys(prefix, key_first, key_last); + dout(20) << __func__ << " remove range start: " + << pretty_binary_string(key_first) << " end: " + << pretty_binary_string(key_last) << dendl; + } + txc->note_modified_object(o); + + out: + return r; +} + +int BlueStore::_set_alloc_hint( + TransContext *txc, + CollectionRef& c, + OnodeRef& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) +{ + dout(15) << __func__ << " " << c->cid << " " << o->oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " flags " << 
ceph_osd_alloc_hint_flag_string(flags) + << dendl; + int r = 0; + o->onode.expected_object_size = expected_object_size; + o->onode.expected_write_size = expected_write_size; + o->onode.alloc_hint_flags = flags; + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << o->oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " flags " << ceph_osd_alloc_hint_flag_string(flags) + << " = " << r << dendl; + return r; +} + +int BlueStore::_clone(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << dendl; + int r = 0; + if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) { + derr << __func__ << " mismatched hash on " << oldo->oid + << " and " << newo->oid << dendl; + return -EINVAL; + } + + _assign_nid(txc, newo); + + // clone data + oldo->flush(); + _do_truncate(txc, c, newo, 0); + if (cct->_conf->bluestore_clone_cow) { + _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0); + } else { + bufferlist bl; + r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + } + + // clone attrs + newo->onode.attrs = oldo->onode.attrs; + + // clone omap + if (newo->onode.has_omap()) { + dout(20) << __func__ << " clearing old omap data" << dendl; + newo->flush(); + _do_omap_clear(txc, newo); + } + if (oldo->onode.has_omap()) { + dout(20) << __func__ << " copying omap data" << dendl; + if (newo->oid.is_pgmeta()) { + newo->onode.set_omap_flags_pgmeta(); + } else { + newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK); + } + // check if prefix for omap key is exactly the same size for both objects + // otherwise rewrite_omap_key will corrupt data + ceph_assert(oldo->onode.flags == newo->onode.flags); + const string& prefix = newo->get_omap_prefix(); + string head, tail; + oldo->get_omap_header(&head); + oldo->get_omap_tail(&tail); + KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail}); + it->lower_bound(head); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + dout(30) << __func__ << " got header/data " + << pretty_binary_string(it->key()) << dendl; + string key; + newo->rewrite_omap_key(it->key(), &key); + txc->t->set(prefix, key, it->value()); + } + it->next(); + } + string new_tail; + bufferlist new_tail_value; + newo->get_omap_tail(&new_tail); + txc->t->set(prefix, new_tail, new_tail_value); + } + + txc->write_onode(newo); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " = " << r << dendl; + return r; +} + +int BlueStore::_do_clone_range( + TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, + uint64_t length, + uint64_t dstoff) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid + << " 0x" << std::hex << srcoff << "~" << length << " -> " + << " 0x" << dstoff << "~" << length << std::dec << dendl; + oldo->extent_map.fault_range(db, srcoff, length); + newo->extent_map.fault_range(db, dstoff, length); + _dump_onode<30>(cct, *oldo); + _dump_onode<30>(cct, *newo); + + oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff); + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + // duplicate the refs for the shared region. 
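+    // roughly: on zoned (SMR) devices every onode keeps zone_offset_refs,
+    // a map of zone number -> one in-zone offset it references; the matching
+    // PREFIX_ZONED_CL_INFO keys written via note_write_zone_offset() let the
+    // per-zone cleaner find objects that still pin live data in a zone.  the
+    // clone target shares the source's blobs, so it has to register its own
+    // ref for each zone those shared extents land in.  e.g. with
+    // zone_size = 0x10000000 (256 MiB), an extent at offset 0x148000000
+    // falls in zone 0x14 at in-zone offset 0x8000000.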
+ Extent dummy(dstoff); + for (auto e = newo->extent_map.extent_map.lower_bound(dummy); + e != newo->extent_map.extent_map.end(); + ++e) { + if (e->logical_offset >= dstoff + length) { + break; + } + for (auto& ex : e->blob->get_blob().get_extents()) { + // note that we may introduce a new extent reference that is + // earlier than the first zone ref. we allow this since it is + // a lot of work to avoid and has marginal impact on cleaning + // performance. + if (!ex.is_valid()) { + continue; + } + uint32_t zone = ex.offset / zone_size; + if (!newo->onode.zone_offset_refs.count(zone)) { + uint64_t zoff = ex.offset % zone_size; + dout(20) << __func__ << " add ref zone 0x" << std::hex << zone + << " offset 0x" << zoff << std::dec + << " -> " << newo->oid << dendl; + txc->note_write_zone_offset(newo, zone, zoff); + } + } + } + } +#endif + + _dump_onode<30>(cct, *oldo); + _dump_onode<30>(cct, *newo); + return 0; +} + +int BlueStore::_clone_range(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + uint64_t srcoff, uint64_t length, uint64_t dstoff) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " from 0x" << std::hex << srcoff << "~" << length + << " to offset 0x" << dstoff << std::dec << dendl; + int r = 0; + + if (srcoff + length >= OBJECT_MAX_SIZE || + dstoff + length >= OBJECT_MAX_SIZE) { + r = -E2BIG; + goto out; + } + if (srcoff + length > oldo->onode.size) { + r = -EINVAL; + goto out; + } + + _assign_nid(txc, newo); + + if (length > 0) { + if (cct->_conf->bluestore_clone_cow) { + _do_zero(txc, c, newo, dstoff, length); + _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff); + } else { + bufferlist bl; + r = _do_read(c.get(), oldo, srcoff, length, bl, 0); + if (r < 0) + goto out; + r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0); + if (r < 0) + goto out; + } + } + + txc->write_onode(newo); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << newo->oid << " from 0x" << std::hex << srcoff << "~" << length + << " to offset 0x" << dstoff << std::dec + << " = " << r << dendl; + return r; +} + +int BlueStore::_rename(TransContext *txc, + CollectionRef& c, + OnodeRef& oldo, + OnodeRef& newo, + const ghobject_t& new_oid) +{ + dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> " + << new_oid << dendl; + int r; + ghobject_t old_oid = oldo->oid; + mempool::bluestore_cache_meta::string new_okey; + + if (newo) { + if (newo->exists) { + r = -EEXIST; + goto out; + } + ceph_assert(txc->onodes.count(newo) == 0); + } + + txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size()); + + // rewrite shards + { + oldo->extent_map.fault_range(db, 0, oldo->onode.size); + get_object_key(cct, new_oid, &new_okey); + string key; + for (auto &s : oldo->extent_map.shards) { + generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key, + [&](const string& final_key) { + txc->t->rmkey(PREFIX_OBJ, final_key); + } + ); + s.dirty = true; + } + } + + newo = oldo; + txc->write_onode(newo); + + // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty + // Onode in the old slot + c->onode_space.rename(oldo, old_oid, new_oid, new_okey); + r = 0; + + // hold a ref to new Onode in old name position, to ensure we don't drop + // it from the cache before this txc commits (or else someone may come along + // and read newo's metadata via the old name). 
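+  // (after onode_space.rename() above, oldo refers to that fresh empty
+  // placeholder under the old key, so this is what pins it until commit.)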
+ txc->note_modified_object(oldo); + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + // adjust zone refs + for (auto& [zone, offset] : newo->onode.zone_offset_refs) { + dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone + << " offset 0x" << offset << std::dec + << " -> " << oldo->oid << dendl; + string key; + get_zone_offset_object_key(zone, offset, oldo->oid, &key); + txc->t->rmkey(PREFIX_ZONED_CL_INFO, key); + + dout(20) << __func__ << " add ref zone 0x" << std::hex << zone + << " offset 0x" << offset << std::dec + << " -> " << newo->oid << dendl; + get_zone_offset_object_key(zone, offset, newo->oid, &key); + bufferlist v; + txc->t->set(PREFIX_ZONED_CL_INFO, key, v); + } + } +#endif + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; +} + +// collections + +int BlueStore::_create_collection( + TransContext *txc, + const coll_t &cid, + unsigned bits, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << " bits " << bits << dendl; + int r; + bufferlist bl; + + { + std::unique_lock l(coll_lock); + if (*c) { + r = -EEXIST; + goto out; + } + auto p = new_coll_map.find(cid); + ceph_assert(p != new_coll_map.end()); + *c = p->second; + (*c)->cnode.bits = bits; + coll_map[cid] = *c; + new_coll_map.erase(p); + } + encode((*c)->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(cid), bl); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl; + return r; +} + +int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid, + CollectionRef *c) +{ + dout(15) << __func__ << " " << cid << dendl; + int r; + + (*c)->flush_all_but_last(); + { + std::unique_lock l(coll_lock); + if (!*c) { + r = -ENOENT; + goto out; + } + size_t nonexistent_count = 0; + ceph_assert((*c)->exists); + if ((*c)->onode_space.map_any([&](Onode* o) { + if (o->exists) { + dout(1) << __func__ << " " << o->oid << " " << o + << " exists in onode_map" << dendl; + return true; + } + ++nonexistent_count; + return false; + })) { + r = -ENOTEMPTY; + goto out; + } + vector<ghobject_t> ls; + ghobject_t next; + // Enumerate onodes in db, up to nonexistent_count + 1 + // then check if all of them are marked as non-existent. + // Bypass the check if (next != ghobject_t::get_max()) + r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(), + nonexistent_count + 1, false, &ls, &next); + if (r >= 0) { + // If true mean collecton has more objects than nonexistent_count, + // so bypass check. + bool exists = (!next.is_max()); + for (auto it = ls.begin(); !exists && it < ls.end(); ++it) { + dout(10) << __func__ << " oid " << *it << dendl; + auto onode = (*c)->onode_space.lookup(*it); + exists = !onode || onode->exists; + if (exists) { + dout(1) << __func__ << " " << *it + << " exists in db, " + << (!onode ? 
"not present in ram" : "present in ram") + << dendl; + } + } + if (!exists) { + _do_remove_collection(txc, c); + r = 0; + } else { + dout(10) << __func__ << " " << cid + << " is non-empty" << dendl; + r = -ENOTEMPTY; + } + } + } +out: + dout(10) << __func__ << " " << cid << " = " << r << dendl; + return r; +} + +void BlueStore::_do_remove_collection(TransContext *txc, + CollectionRef *c) +{ + coll_map.erase((*c)->cid); + txc->removed_collections.push_back(*c); + (*c)->exists = false; + _osr_register_zombie((*c)->osr.get()); + txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid)); + c->reset(); +} + +int BlueStore::_split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem) +{ + dout(15) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << dendl; + std::unique_lock l(c->lock); + std::unique_lock l2(d->lock); + int r; + + // flush all previous deferred writes on this sequencer. this is a bit + // heavyweight, but we need to make sure all deferred writes complete + // before we split as the new collection's sequencer may need to order + // this after those writes, and we don't bother with the complexity of + // moving those TransContexts over to the new osr. + _osr_drain_preceding(txc); + + // move any cached items (onodes and referenced shared blobs) that will + // belong to the child collection post-split. leave everything else behind. + // this may include things that don't strictly belong to the now-smaller + // parent split, but the OSD will always send us a split for every new + // child. + + spg_t pgid, dest_pgid; + bool is_pg = c->cid.is_pg(&pgid); + ceph_assert(is_pg); + is_pg = d->cid.is_pg(&dest_pgid); + ceph_assert(is_pg); + + // the destination should initially be empty. + ceph_assert(d->onode_space.empty()); + ceph_assert(d->shared_blob_set.empty()); + ceph_assert(d->cnode.bits == bits); + + c->split_cache(d.get()); + + // adjust bits. note that this will be redundant for all but the first + // split call for this parent (first child). + c->cnode.bits = bits; + ceph_assert(d->cnode.bits == bits); + r = 0; + + bufferlist bl; + encode(c->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(c->cid), bl); + + dout(10) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +int BlueStore::_merge_collection( + TransContext *txc, + CollectionRef *c, + CollectionRef& d, + unsigned bits) +{ + dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid + << " bits " << bits << dendl; + std::unique_lock l((*c)->lock); + std::unique_lock l2(d->lock); + int r; + + coll_t cid = (*c)->cid; + + // flush all previous deferred writes on the source collection to ensure + // that all deferred writes complete before we merge as the target collection's + // sequencer may need to order new ops after those writes. + + _osr_drain((*c)->osr.get()); + + // move any cached items (onodes and referenced shared blobs) that will + // belong to the child collection post-split. leave everything else behind. + // this may include things that don't strictly belong to the now-smaller + // parent split, but the OSD will always send us a split for every new + // child. + + spg_t pgid, dest_pgid; + bool is_pg = cid.is_pg(&pgid); + ceph_assert(is_pg); + is_pg = d->cid.is_pg(&dest_pgid); + ceph_assert(is_pg); + + // adjust bits. note that this will be redundant for all but the first + // merge call for the parent/target. 
+ d->cnode.bits = bits; + + // behavior depends on target (d) bits, so this after that is updated. + (*c)->split_cache(d.get()); + + // remove source collection + { + std::unique_lock l3(coll_lock); + _do_remove_collection(txc, c); + } + + r = 0; + + bufferlist bl; + encode(d->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(d->cid), bl); + + dout(10) << __func__ << " " << cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; +} + +void BlueStore::log_latency( + const char* name, + int idx, + const ceph::timespan& l, + double lat_threshold, + const char* info) const +{ + logger->tinc(idx, l); + if (lat_threshold > 0.0 && + l >= make_timespan(lat_threshold)) { + dout(0) << __func__ << " slow operation observed for " << name + << ", latency = " << l + << info + << dendl; + } +} + +void BlueStore::log_latency_fn( + const char* name, + int idx, + const ceph::timespan& l, + double lat_threshold, + std::function<string (const ceph::timespan& lat)> fn) const +{ + logger->tinc(idx, l); + if (lat_threshold > 0.0 && + l >= make_timespan(lat_threshold)) { + dout(0) << __func__ << " slow operation observed for " << name + << ", latency = " << l + << fn(l) + << dendl; + } +} + +#if defined(WITH_LTTNG) +void BlueStore::BlueStoreThrottle::emit_initial_tracepoint( + KeyValueDB &db, + TransContext &txc, + mono_clock::time_point start_throttle_acquire) +{ + pending_kv_ios += txc.ios; + if (txc.deferred_txn) { + pending_deferred_ios += txc.ios; + } + + uint64_t started = 0; + uint64_t completed = 0; + if (should_trace(&started, &completed)) { + txc.tracing = true; + uint64_t rocksdb_base_level, + rocksdb_estimate_pending_compaction_bytes, + rocksdb_cur_size_all_mem_tables, + rocksdb_compaction_pending, + rocksdb_mem_table_flush_pending, + rocksdb_num_running_compactions, + rocksdb_num_running_flushes, + rocksdb_actual_delayed_write_rate; + db.get_property( + "rocksdb.base-level", + &rocksdb_base_level); + db.get_property( + "rocksdb.estimate-pending-compaction-bytes", + &rocksdb_estimate_pending_compaction_bytes); + db.get_property( + "rocksdb.cur-size-all-mem-tables", + &rocksdb_cur_size_all_mem_tables); + db.get_property( + "rocksdb.compaction-pending", + &rocksdb_compaction_pending); + db.get_property( + "rocksdb.mem-table-flush-pending", + &rocksdb_mem_table_flush_pending); + db.get_property( + "rocksdb.num-running-compactions", + &rocksdb_num_running_compactions); + db.get_property( + "rocksdb.num-running-flushes", + &rocksdb_num_running_flushes); + db.get_property( + "rocksdb.actual-delayed-write-rate", + &rocksdb_actual_delayed_write_rate); + + + tracepoint( + bluestore, + transaction_initial_state, + txc.osr->get_sequencer_id(), + txc.seq, + throttle_bytes.get_current(), + throttle_deferred_bytes.get_current(), + pending_kv_ios, + pending_deferred_ios, + started, + completed, + ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire)); + + tracepoint( + bluestore, + transaction_initial_state_rocksdb, + txc.osr->get_sequencer_id(), + txc.seq, + rocksdb_base_level, + rocksdb_estimate_pending_compaction_bytes, + rocksdb_cur_size_all_mem_tables, + rocksdb_compaction_pending, + rocksdb_mem_table_flush_pending, + rocksdb_num_running_compactions, + rocksdb_num_running_flushes, + rocksdb_actual_delayed_write_rate); + } +} +#endif + +mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency( + TransContext &txc, PerfCounters *logger, int state) +{ + mono_clock::time_point now = mono_clock::now(); + mono_clock::duration lat = now - txc.last_stamp; + 
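+  // lat is the time this txc spent in the state that just ended (last_stamp
+  // is advanced below); charge it to the matching l_bluestore_state_*_lat
+  // counter and, when tracing, emit a per-state duration tracepoint.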
logger->tinc(state, lat); +#if defined(WITH_LTTNG) + if (txc.tracing && + state >= l_bluestore_state_prepare_lat && + state <= l_bluestore_state_done_lat) { + OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state)); + tracepoint( + bluestore, + transaction_state_duration, + txc.osr->get_sequencer_id(), + txc.seq, + state, + ceph::to_seconds<double>(lat)); + } +#endif + txc.last_stamp = now; + return lat; +} + +bool BlueStore::BlueStoreThrottle::try_start_transaction( + KeyValueDB &db, + TransContext &txc, + mono_clock::time_point start_throttle_acquire) +{ + throttle_bytes.get(txc.cost); + + if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) { + emit_initial_tracepoint(db, txc, start_throttle_acquire); + return true; + } else { + return false; + } +} + +void BlueStore::BlueStoreThrottle::finish_start_transaction( + KeyValueDB &db, + TransContext &txc, + mono_clock::time_point start_throttle_acquire) +{ + ceph_assert(txc.deferred_txn); + throttle_deferred_bytes.get(txc.cost); + emit_initial_tracepoint(db, txc, start_throttle_acquire); +} + +#if defined(WITH_LTTNG) +void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc) +{ + pending_kv_ios -= 1; + ios_completed_since_last_traced++; + if (txc.tracing) { + tracepoint( + bluestore, + transaction_commit_latency, + txc.osr->get_sequencer_id(), + txc.seq, + ceph::to_seconds<double>(mono_clock::now() - txc.start)); + } +} +#endif + +#if defined(WITH_LTTNG) +void BlueStore::BlueStoreThrottle::complete(TransContext &txc) +{ + if (txc.deferred_txn) { + pending_deferred_ios -= 1; + } + if (txc.tracing) { + mono_clock::time_point now = mono_clock::now(); + mono_clock::duration lat = now - txc.start; + tracepoint( + bluestore, + transaction_total_duration, + txc.osr->get_sequencer_id(), + txc.seq, + ceph::to_seconds<double>(lat)); + } +} +#endif + +const string prefix_onode = "o"; +const string prefix_onode_shard = "x"; +const string prefix_other = "Z"; +//Itrerates through the db and collects the stats +void BlueStore::generate_db_histogram(Formatter *f) +{ + //globals + uint64_t num_onodes = 0; + uint64_t num_shards = 0; + uint64_t num_super = 0; + uint64_t num_coll = 0; + uint64_t num_omap = 0; + uint64_t num_pgmeta_omap = 0; + uint64_t num_deferred = 0; + uint64_t num_alloc = 0; + uint64_t num_stat = 0; + uint64_t num_others = 0; + uint64_t num_shared_shards = 0; + size_t max_key_size =0, max_value_size = 0; + uint64_t total_key_size = 0, total_value_size = 0; + size_t key_size = 0, value_size = 0; + KeyValueHistogram hist; + + auto start = coarse_mono_clock::now(); + + KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator(); + iter->seek_to_first(); + while (iter->valid()) { + dout(30) << __func__ << " Key: " << iter->key() << dendl; + key_size = iter->key_size(); + value_size = iter->value_size(); + hist.value_hist[hist.get_value_slab(value_size)]++; + max_key_size = std::max(max_key_size, key_size); + max_value_size = std::max(max_value_size, value_size); + total_key_size += key_size; + total_value_size += value_size; + + pair<string,string> key(iter->raw_key()); + + if (key.first == PREFIX_SUPER) { + hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size); + num_super++; + } else if (key.first == PREFIX_STAT) { + hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size); + num_stat++; + } else if (key.first == PREFIX_COLL) { + hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size); + num_coll++; + } else if (key.first == PREFIX_OBJ) { 
+ if (key.second.back() == ONODE_KEY_SUFFIX) { + hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size); + num_onodes++; + } else { + hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size); + num_shards++; + } + } else if (key.first == PREFIX_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size); + num_omap++; + } else if (key.first == PREFIX_PERPOOL_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size); + num_omap++; + } else if (key.first == PREFIX_PERPG_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size); + num_omap++; + } else if (key.first == PREFIX_PGMETA_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size); + num_pgmeta_omap++; + } else if (key.first == PREFIX_DEFERRED) { + hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size); + num_deferred++; + } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size); + num_alloc++; + } else if (key.first == PREFIX_SHARED_BLOB) { + hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size); + num_shared_shards++; + } else { + hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size); + num_others++; + } + iter->next(); + } + + ceph::timespan duration = coarse_mono_clock::now() - start; + f->open_object_section("rocksdb_key_value_stats"); + f->dump_unsigned("num_onodes", num_onodes); + f->dump_unsigned("num_shards", num_shards); + f->dump_unsigned("num_super", num_super); + f->dump_unsigned("num_coll", num_coll); + f->dump_unsigned("num_omap", num_omap); + f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap); + f->dump_unsigned("num_deferred", num_deferred); + f->dump_unsigned("num_alloc", num_alloc); + f->dump_unsigned("num_stat", num_stat); + f->dump_unsigned("num_shared_shards", num_shared_shards); + f->dump_unsigned("num_others", num_others); + f->dump_unsigned("max_key_size", max_key_size); + f->dump_unsigned("max_value_size", max_value_size); + f->dump_unsigned("total_key_size", total_key_size); + f->dump_unsigned("total_value_size", total_value_size); + f->close_section(); + + hist.dump(f); + + dout(20) << __func__ << " finished in " << duration << " seconds" << dendl; + +} + +void BlueStore::_shutdown_cache() +{ + dout(10) << __func__ << dendl; + for (auto i : buffer_cache_shards) { + i->flush(); + ceph_assert(i->empty()); + } + for (auto& p : coll_map) { + p.second->onode_space.clear(); + if (!p.second->shared_blob_set.empty()) { + derr << __func__ << " stray shared blobs on " << p.first << dendl; + p.second->shared_blob_set.dump<0>(cct); + } + ceph_assert(p.second->onode_space.empty()); + ceph_assert(p.second->shared_blob_set.empty()); + } + coll_map.clear(); + for (auto i : onode_cache_shards) { + ceph_assert(i->empty()); + } +} + +// For external caller. +// We use a best-effort policy instead, e.g., +// we don't care if there are still some pinned onodes/data in the cache +// after this command is completed. 
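+// Unlike _shutdown_cache() above, which asserts that every shard ends up
+// empty, this only asks each onode and buffer cache shard to flush and
+// always returns 0.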
+int BlueStore::flush_cache(ostream *os) +{ + dout(10) << __func__ << dendl; + for (auto i : onode_cache_shards) { + i->flush(); + } + for (auto i : buffer_cache_shards) { + i->flush(); + } + + return 0; +} + +void BlueStore::_apply_padding(uint64_t head_pad, + uint64_t tail_pad, + bufferlist& padded) +{ + if (head_pad) { + padded.prepend_zero(head_pad); + } + if (tail_pad) { + padded.append_zero(tail_pad); + } + if (head_pad || tail_pad) { + dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad + << " tail 0x" << tail_pad << std::dec << dendl; + logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad); + } +} + +void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn) +{ + // finalize extent_map shards + o->extent_map.update(txn, false); + if (o->extent_map.needs_reshard()) { + o->extent_map.reshard(db, txn); + o->extent_map.update(txn, true); + if (o->extent_map.needs_reshard()) { + dout(20) << __func__ << " warning: still wants reshard, check options?" + << dendl; + o->extent_map.clear_needs_reshard(); + } + logger->inc(l_bluestore_onode_reshard); + } + + // bound encode + size_t bound = 0; + denc(o->onode, bound); + o->extent_map.bound_encode_spanning_blobs(bound); + if (o->onode.extent_map_shards.empty()) { + denc(o->extent_map.inline_bl, bound); + } + + // encode + bufferlist bl; + unsigned onode_part, blob_part, extent_part; + { + auto p = bl.get_contiguous_appender(bound, true); + denc(o->onode, p); + onode_part = p.get_logical_offset(); + o->extent_map.encode_spanning_blobs(p); + blob_part = p.get_logical_offset() - onode_part; + if (o->onode.extent_map_shards.empty()) { + denc(o->extent_map.inline_bl, p); + } + extent_part = p.get_logical_offset() - onode_part - blob_part; + } + + dout(20) << __func__ << " onode " << o->oid << " is " << bl.length() + << " (" << onode_part << " bytes onode + " + << blob_part << " bytes spanning blobs + " + << extent_part << " bytes inline extents)" + << dendl; + + + txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl); +} + +void BlueStore::_log_alerts(osd_alert_list_t& alerts) +{ + std::lock_guard l(qlock); + size_t used = bluefs && bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ? 
+ bluefs->get_used(BlueFS::BDEV_SLOW) : 0; + if (used > 0) { + auto db_used = bluefs->get_used(BlueFS::BDEV_DB); + auto db_total = bluefs->get_total(BlueFS::BDEV_DB); + ostringstream ss; + ss << "spilled over " << byte_u_t(used) + << " metadata from 'db' device (" << byte_u_t(db_used) + << " used of " << byte_u_t(db_total) << ") to slow device"; + spillover_alert = ss.str(); + } else if (!spillover_alert.empty()){ + spillover_alert.clear(); + } + + if (!spurious_read_errors_alert.empty() && + cct->_conf->bluestore_warn_on_spurious_read_errors) { + alerts.emplace( + "BLUESTORE_SPURIOUS_READ_ERRORS", + spurious_read_errors_alert); + } + if (!disk_size_mismatch_alert.empty()) { + alerts.emplace( + "BLUESTORE_DISK_SIZE_MISMATCH", + disk_size_mismatch_alert); + } + if (!legacy_statfs_alert.empty()) { + alerts.emplace( + "BLUESTORE_LEGACY_STATFS", + legacy_statfs_alert); + } + if (!spillover_alert.empty() && + cct->_conf->bluestore_warn_on_bluefs_spillover) { + alerts.emplace( + "BLUEFS_SPILLOVER", + spillover_alert); + } + if (!no_per_pg_omap_alert.empty()) { + alerts.emplace( + "BLUESTORE_NO_PER_PG_OMAP", + no_per_pg_omap_alert); + } + if (!no_per_pool_omap_alert.empty()) { + alerts.emplace( + "BLUESTORE_NO_PER_POOL_OMAP", + no_per_pool_omap_alert); + } + string s0(failed_cmode); + + if (!failed_compressors.empty()) { + if (!s0.empty()) { + s0 += ", "; + } + s0 += "unable to load:"; + bool first = true; + for (auto& s : failed_compressors) { + if (first) { + first = false; + } else { + s0 += ", "; + } + s0 += s; + } + alerts.emplace( + "BLUESTORE_NO_COMPRESSION", + s0); + } +} + +void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size, + const PExtentVector& extents) +{ + alloc_stats_count++; + alloc_stats_fragments += extents.size(); + alloc_stats_size += need; + + for (auto& e : extents) { + logger->hinc(l_bluestore_allocate_hist, e.length, need); + } +} + +void BlueStore::_record_allocation_stats() +{ + // don't care about data consistency, + // fields can be partially modified while making the tuple + auto t0 = std::make_tuple( + alloc_stats_count.exchange(0), + alloc_stats_fragments.exchange(0), + alloc_stats_size.exchange(0)); + + dout(0) << " allocation stats probe " + << probe_count << ":" + << " cnt: " << std::get<0>(t0) + << " frags: " << std::get<1>(t0) + << " size: " << std::get<2>(t0) + << dendl; + + + // + // Keep the history for probes from the power-of-two sequence: + // -1, -2, -4, -8, -16 + // + size_t base = 1; + for (auto& t : alloc_stats_history) { + dout(0) << " probe -" + << base + (probe_count % base) << ": " + << std::get<0>(t) + << ", " << std::get<1>(t) + << ", " << std::get<2>(t) + << dendl; + base <<= 1; + } + dout(0) << "------------" << dendl; + + ++ probe_count; + + for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) { + if ((probe_count % (1 << i)) == 0) { + alloc_stats_history[i] = alloc_stats_history[i - 1]; + } + } + alloc_stats_history[0].swap(t0); +} + +// =========================================== +// BlueStoreRepairer + +size_t BlueStoreRepairer::StoreSpaceTracker::filter_out( + const interval_set<uint64_t>& extents) +{ + ceph_assert(granularity); // initialized + // can't call for the second time + ceph_assert(!was_filtered_out); + ceph_assert(collections_bfs.size() == objects_bfs.size()); + + uint64_t prev_pos = 0; + uint64_t npos = collections_bfs.size(); + + bloom_vector collections_reduced; + bloom_vector objects_reduced; + + for (auto e : extents) { + if (e.second == 0) { + continue; + } + uint64_t pos = max(e.first 
/ granularity, prev_pos); + uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity; + while (pos != npos && pos < end_pos) { + ceph_assert( collections_bfs[pos].element_count() == + objects_bfs[pos].element_count()); + if (collections_bfs[pos].element_count()) { + collections_reduced.push_back(std::move(collections_bfs[pos])); + objects_reduced.push_back(std::move(objects_bfs[pos])); + } + ++pos; + } + prev_pos = end_pos; + } + collections_reduced.swap(collections_bfs); + objects_reduced.swap(objects_bfs); + was_filtered_out = true; + return collections_bfs.size(); +} + +bool BlueStoreRepairer::remove_key(KeyValueDB *db, + const string& prefix, + const string& key) +{ + std::lock_guard l(lock); + if (!remove_key_txn) { + remove_key_txn = db->get_transaction(); + } + ++to_repair_cnt; + remove_key_txn->rmkey(prefix, key); + + return true; +} + +void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val) +{ + std::lock_guard l(lock); // possibly redundant + ceph_assert(fix_per_pool_omap_txn == nullptr); + fix_per_pool_omap_txn = db->get_transaction(); + ++to_repair_cnt; + bufferlist bl; + bl.append(stringify(val)); + fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl); +} + +bool BlueStoreRepairer::fix_shared_blob( + KeyValueDB::Transaction txn, + uint64_t sbid, + bluestore_extent_ref_map_t* ref_map, + size_t repaired) +{ + string key; + get_shared_blob_key(sbid, &key); + if (ref_map) { + bluestore_shared_blob_t persistent(sbid, std::move(*ref_map)); + bufferlist bl; + encode(persistent, bl); + txn->set(PREFIX_SHARED_BLOB, key, bl); + } else { + txn->rmkey(PREFIX_SHARED_BLOB, key); + } + to_repair_cnt += repaired; + return true; +} + +bool BlueStoreRepairer::fix_statfs(KeyValueDB *db, + const string& key, + const store_statfs_t& new_statfs) +{ + std::lock_guard l(lock); + if (!fix_statfs_txn) { + fix_statfs_txn = db->get_transaction(); + } + BlueStore::volatile_statfs vstatfs; + vstatfs = new_statfs; + bufferlist bl; + vstatfs.encode(bl); + ++to_repair_cnt; + fix_statfs_txn->set(PREFIX_STAT, key, bl); + return true; +} + +bool BlueStoreRepairer::fix_leaked(KeyValueDB *db, + FreelistManager* fm, + uint64_t offset, uint64_t len) +{ + std::lock_guard l(lock); + ceph_assert(!fm->is_null_manager()); + + if (!fix_fm_leaked_txn) { + fix_fm_leaked_txn = db->get_transaction(); + } + ++to_repair_cnt; + fm->release(offset, len, fix_fm_leaked_txn); + return true; +} +bool BlueStoreRepairer::fix_false_free(KeyValueDB *db, + FreelistManager* fm, + uint64_t offset, uint64_t len) +{ + std::lock_guard l(lock); + ceph_assert(!fm->is_null_manager()); + + if (!fix_fm_false_free_txn) { + fix_fm_false_free_txn = db->get_transaction(); + } + ++to_repair_cnt; + fm->allocate(offset, len, fix_fm_false_free_txn); + return true; +} + +bool BlueStoreRepairer::fix_spanning_blobs( + KeyValueDB* db, + std::function<void(KeyValueDB::Transaction)> f) +{ + std::lock_guard l(lock); + if (!fix_onode_txn) { + fix_onode_txn = db->get_transaction(); + } + f(fix_onode_txn); + ++to_repair_cnt; + return true; +} + +bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db) +{ + //NB: not for use in multithreading mode!!! + if (misreferenced_extents.size()) { + size_t n = space_usage_tracker.filter_out(misreferenced_extents); + ceph_assert(n > 0); + if (!fix_misreferences_txn) { + fix_misreferences_txn = db->get_transaction(); + } + return true; + } + return false; +} + +unsigned BlueStoreRepairer::apply(KeyValueDB* db) +{ + //NB: not for use in multithreading mode!!! 
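+  // each fix_*() helper above queued its repairs into its own KV transaction;
+  // submit whichever of those exist synchronously, drop them, run a compaction
+  // if one was requested, and report how many individual repairs were applied.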
+ if (fix_per_pool_omap_txn) { + auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0; + ceph_assert(ok); + fix_per_pool_omap_txn = nullptr; + } + if (fix_fm_leaked_txn) { + auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0; + ceph_assert(ok); + fix_fm_leaked_txn = nullptr; + } + if (fix_fm_false_free_txn) { + auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0; + ceph_assert(ok); + fix_fm_false_free_txn = nullptr; + } + if (remove_key_txn) { + auto ok = db->submit_transaction_sync(remove_key_txn) == 0; + ceph_assert(ok); + remove_key_txn = nullptr; + } + if (fix_misreferences_txn) { + auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0; + ceph_assert(ok); + fix_misreferences_txn = nullptr; + } + if (fix_onode_txn) { + auto ok = db->submit_transaction_sync(fix_onode_txn) == 0; + ceph_assert(ok); + fix_onode_txn = nullptr; + } + if (fix_shared_blob_txn) { + auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0; + ceph_assert(ok); + fix_shared_blob_txn = nullptr; + } + if (fix_statfs_txn) { + auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0; + ceph_assert(ok); + fix_statfs_txn = nullptr; + } + if (need_compact) { + db->compact(); + need_compact = false; + } + unsigned repaired = to_repair_cnt; + to_repair_cnt = 0; + return repaired; +} + +// ======================================================= +// RocksDBBlueFSVolumeSelector + +uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) { + ceph_assert(h != nullptr); + uint64_t hint = reinterpret_cast<uint64_t>(h); + uint8_t res; + switch (hint) { + case LEVEL_SLOW: + res = BlueFS::BDEV_SLOW; + if (db_avail4slow > 0) { + // considering statically available db space vs. + // - observed maximums on DB dev for DB/WAL/UNSORTED data + // - observed maximum spillovers + uint64_t max_db_use = 0; // max db usage we potentially observed + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST); + // this could go to db hence using it in the estimation + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST); + + auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; + uint64_t avail = min( + db_avail4slow, + max_db_use < db_total ? db_total - max_db_use : 0); + + // considering current DB dev usage for SLOW data + if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) { + res = BlueFS::BDEV_DB; + } + } + break; + case LEVEL_LOG: + case LEVEL_WAL: + res = BlueFS::BDEV_WAL; + break; + case LEVEL_DB: + default: + res = BlueFS::BDEV_DB; + break; + } + return res; +} + +void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const +{ + auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST]; + res.emplace_back(base, db_size); + auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST]; + if (slow_size == 0) { + slow_size = db_size; + } + res.emplace_back(base + ".slow", slow_size); +} + +void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const { + uint8_t res = LEVEL_DB; + if (dirname.length() > 5) { + // the "db.slow" and "db.wal" directory names are hard-coded at + // match up with bluestore. the slow device is always the second + // one (when a dedicated block.db device is present and used at + // bdev 0). the wal device is always last. 
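+    // e.g. "db" -> LEVEL_DB, "db.slow" -> LEVEL_SLOW, "db.wal" -> LEVEL_WAL;
+    // the level is handed back to select_prefer_bdev() as the placement hint.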
+ if (boost::algorithm::ends_with(dirname, ".slow")) { + res = LEVEL_SLOW; + } + else if (boost::algorithm::ends_with(dirname, ".wal")) { + res = LEVEL_WAL; + } + } + return reinterpret_cast<void*>(res); +} + +void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { + auto max_x = per_level_per_dev_usage.get_max_x(); + auto max_y = per_level_per_dev_usage.get_max_y(); + + sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl; + constexpr std::array<const char*, 8> names{ { + "DEV/LEV", + "WAL", + "DB", + "SLOW", + "*", + "*", + "REAL", + "FILES", + } }; + const size_t width = 12; + for (size_t i = 0; i < names.size(); ++i) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << names[i]; + } + sout << std::endl; + for (size_t l = 0; l < max_y; l++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + switch (l + LEVEL_FIRST) { + case LEVEL_LOG: + sout << "LOG"; break; + case LEVEL_WAL: + sout << "WAL"; break; + case LEVEL_DB: + sout << "DB"; break; + case LEVEL_SLOW: + sout << "SLOW"; break; + case LEVEL_MAX: + sout << "TOTAL"; break; + } + for (size_t d = 0; d < max_x; d++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l))); + } + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(per_level_files[l]) << std::endl; + } + ceph_assert(max_x == per_level_per_dev_max.get_max_x()); + ceph_assert(max_y == per_level_per_dev_max.get_max_y()); + sout << "MAXIMUMS:" << std::endl; + for (size_t l = 0; l < max_y; l++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + switch (l + LEVEL_FIRST) { + case LEVEL_LOG: + sout << "LOG"; break; + case LEVEL_WAL: + sout << "WAL"; break; + case LEVEL_DB: + sout << "DB"; break; + case LEVEL_SLOW: + sout << "SLOW"; break; + case LEVEL_MAX: + sout << "TOTAL"; break; + } + for (size_t d = 0; d < max_x - 1; d++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l))); + } + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l))); + sout << std::endl; + } + string sizes[] = { + ">> SIZE <<", + stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])), + stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])), + stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])), + }; + for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << sizes[i]; + } + sout << std::endl; +} + +BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const { + RocksDBBlueFSVolumeSelector* ns = + new RocksDBBlueFSVolumeSelector(0, 0, 0, + 0, 0, 0, + 0, 0, false); + return ns; +} + +bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) { + RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other); + ceph_assert(o); + bool equal = true; + for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) { + for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) { + equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y)); + } + } + for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) { + equal &= (per_level_files[t] == o->per_level_files[t]); + } + return equal; +} + +// ======================================================= + 
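+// The block below is an illustrative, self-contained sketch (wrapped in
+// "#if 0", never built) of the headroom check that
+// RocksDBBlueFSVolumeSelector::select_prefer_bdev() performs for LEVEL_SLOW
+// data above: slow-level files may be steered to the fast DB device only
+// while both the static budget (db_avail4slow) and the DB device's remaining
+// headroom (total size minus the observed worst-case LOG/WAL/DB usage,
+// including past spillover) exceed what SLOW data already occupies there.
+// All names in the sketch are local to it and are not part of BlueStore.
+#if 0
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+
+struct spill_inputs {
+  uint64_t db_total;        // capacity of the dedicated DB device
+  uint64_t max_log_use;     // observed per-level maxima on the DB device
+  uint64_t max_wal_use;
+  uint64_t max_db_use;
+  uint64_t max_db_spill;    // DB-level data that has spilled to the slow device
+  uint64_t db_avail4slow;   // static budget for SLOW data on the DB device
+  uint64_t slow_on_db_now;  // SLOW-level bytes currently on the DB device
+};
+
+// return true if a new LEVEL_SLOW allocation may be placed on the DB device
+static bool prefer_db_for_slow(const spill_inputs& in)
+{
+  uint64_t worst_case = in.max_log_use + in.max_wal_use +
+                        in.max_db_use + in.max_db_spill;
+  uint64_t headroom = worst_case < in.db_total ? in.db_total - worst_case : 0;
+  uint64_t avail = std::min(in.db_avail4slow, headroom);
+  return avail > in.slow_on_db_now;
+}
+
+int main()
+{
+  // 60 GiB DB device, 10 GiB budget for SLOW data, modest observed maxima:
+  // worst_case = 23 GiB, headroom = 37 GiB, avail = 10 GiB > 4 GiB -> DB dev.
+  spill_inputs in{60ULL << 30, 1ULL << 30, 2ULL << 30, 20ULL << 30,
+                  0, 10ULL << 30, 4ULL << 30};
+  std::cout << (prefer_db_for_slow(in) ? "place on DB dev" : "use slow dev")
+            << std::endl;
+  return 0;
+}
+#endif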
+//================================================================================================================
+// BlueStore is committing all allocation information (alloc/release) into RocksDB before the client Write is performed.
+// This causes a delay in the write path and adds significant load to the CPU/Memory/Disk.
+// The reason for the RocksDB updates is that it allows Ceph to survive any failure without losing the allocation state.
+//
+// We changed the code to skip the RocksDB updates at allocation time and instead perform a full destage of the allocator object
+// with all the OSD allocation state in a single step during umount().
+// This change leads to a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes the system
+// to losing allocation info in failure cases where we don't call umount.
+// We added code to perform a full allocation-map rebuild from the information stored inside the ONodes, which is used in failure cases.
+// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from a flat file
+// where we store the allocation-map during umount().
+//================================================================================================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
+
+static const std::string allocator_dir = "ALLOCATOR_NCB_DIR";
+static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
+static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
+static uint32_t s_serial = 0x01;
+
+#if 1
+#define CEPHTOH_32 le32toh
+#define CEPHTOH_64 le64toh
+#define HTOCEPH_32 htole32
+#define HTOCEPH_64 htole64
+#else
+// help debug the encode/decode by forcing alien format
+#define CEPHTOH_32 be32toh
+#define CEPHTOH_64 be64toh
+#define HTOCEPH_32 htobe32
+#define HTOCEPH_64 htobe64
+#endif
+
+// 48-byte header for the on-disk allocator image
+const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
+struct allocator_image_header {
+  uint32_t format_version;  // 0x00
+  uint32_t valid_signature; // 0x04
+  utime_t timestamp;        // 0x08
+  uint32_t serial;          // 0x10
+  uint32_t pad[0x7];        // 0x14
+
+  allocator_image_header() {
+    memset((char*)this, 0, sizeof(allocator_image_header));
+  }
+
+  // create header in CEPH format
+  allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
+    this->format_version = format_version;
+    this->timestamp = timestamp;
+    this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
+    this->serial = serial;
+    memset(this->pad, 0, sizeof(this->pad));
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
+    out << "format_version = " << header.format_version << std::endl;
+    out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
+    out << "timestamp = " << header.timestamp << std::endl;
+    out << "serial = " << header.serial << std::endl;
+    for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
+      if (header.pad[i]) {
+        out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
+      }
+    }
+    return out;
+  }
+
+  DENC(allocator_image_header, v, p) {
+    denc(v.format_version, p);
+    denc(v.valid_signature, p);
+    denc(v.timestamp.tv.tv_sec, p);
+    denc(v.timestamp.tv.tv_nsec, p);
+    denc(v.serial, p);
+    for (auto& pad: v.pad) {
+      denc(pad, p);
+    }
+  }
+
+
+  int verify(CephContext* cct, const std::string &path) {
+    if (valid_signature == 
ALLOCATOR_IMAGE_VALID_SIGNATURE) { + for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) { + if (this->pad[i]) { + derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl; + return -1; + } + } + return 0; + } + else { + derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl; + return -1; + } + } +}; +WRITE_CLASS_DENC(allocator_image_header) + +// 56 Bytes trailer for on-disk alloator image +struct allocator_image_trailer { + extent_t null_extent; // 0x00 + + uint32_t format_version; // 0x10 + uint32_t valid_signature; // 0x14 + + utime_t timestamp; // 0x18 + + uint32_t serial; // 0x20 + uint32_t pad; // 0x24 + uint64_t entries_count; // 0x28 + uint64_t allocation_size; // 0x30 + + // trailer is created in CEPH format + allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) { + memset((char*)&(this->null_extent), 0, sizeof(this->null_extent)); + this->format_version = format_version; + this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE; + this->timestamp = timestamp; + this->serial = serial; + this->pad = 0; + this->entries_count = entries_count; + this->allocation_size = allocation_size; + } + + allocator_image_trailer() { + memset((char*)this, 0, sizeof(allocator_image_trailer)); + } + + friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) { + if (trailer.null_extent.offset || trailer.null_extent.length) { + out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl; + out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl; + } + out << "format_version = " << trailer.format_version << std::endl; + out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl; + out << "timestamp = " << trailer.timestamp << std::endl; + out << "serial = " << trailer.serial << std::endl; + if (trailer.pad) { + out << "trailer.pad= " << trailer.pad << std::endl; + } + out << "entries_count = " << trailer.entries_count << std::endl; + out << "allocation_size = " << trailer.allocation_size << std::endl; + return out; + } + + int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) { + if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) { + + // trailer must starts with null extents (both fields set to zero) [no need to convert formats for zero) + if (null_extent.offset || null_extent.length) { + derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl; + return -1; + } + + if (serial != p_header->serial) { + derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl; + return -1; + } + + if (format_version != p_header->format_version) { + derr << "Illegal trailer: header->format_version(" << p_header->format_version + << ") != trailer->format_version(" << format_version << ")" << dendl; + return -1; + } + + if (timestamp != p_header->timestamp) { + derr << "Illegal trailer: header->timestamp(" << p_header->timestamp + << ") != trailer->timestamp(" << timestamp << ")" << dendl; + return -1; + } + + if (this->entries_count != entries_count) { + derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count(" + << this->entries_count << ")" << dendl; + return -1; + } + + if 
(this->allocation_size != allocation_size) { + derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size(" + << this->allocation_size << ")" << dendl; + return -1; + } + + if (pad) { + derr << "Illegal Trailer - pad="<< pad << dendl; + return -1; + } + + // if arrived here -> trailer is valid !! + return 0; + } else { + derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl; + return -1; + } + } + + DENC(allocator_image_trailer, v, p) { + denc(v.null_extent.offset, p); + denc(v.null_extent.length, p); + denc(v.format_version, p); + denc(v.valid_signature, p); + denc(v.timestamp.tv.tv_sec, p); + denc(v.timestamp.tv.tv_nsec, p); + denc(v.serial, p); + denc(v.pad, p); + denc(v.entries_count, p); + denc(v.allocation_size, p); + } +}; +WRITE_CLASS_DENC(allocator_image_trailer) + + +//------------------------------------------------------------------------------------- +// invalidate old allocation file if exists so will go directly to recovery after failure +// we can safely ignore non-existing file +int BlueStore::invalidate_allocation_file_on_bluefs() +{ + // mark that allocation-file was invalidated and we should destage a new copy whne closing db + need_to_destage_allocation_file = true; + dout(10) << __func__ << " need_to_destage_allocation_file was set" << dendl; + + BlueFS::FileWriter *p_handle = nullptr; + if (!bluefs->dir_exists(allocator_dir)) { + dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl; + // nothing to do -> return + return 0; + } + + int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr); + if (ret != 0) { + dout(5) << __func__ << " allocator_file(" << allocator_file << ") doesn't exist" << dendl; + // nothing to do -> return + return 0; + } + + + ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true); + if (ret != 0) { + derr << __func__ << "::NCB:: Failed open_for_write with error-code " + << ret << dendl; + return -1; + } + + dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl; + ret = bluefs->truncate(p_handle, 0); + if (ret != 0) { + derr << __func__ << "::NCB:: Failed truncaste with error-code " + << ret << dendl; + bluefs->close_writer(p_handle); + return -1; + } + + bluefs->fsync(p_handle); + bluefs->close_writer(p_handle); + + return 0; +} + +//----------------------------------------------------------------------------------- +int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries) +{ + *p_num_entries = 0; + auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) { + (*p_num_entries)++; + }; + src_alloc->foreach(count_entries); + + dout(5) << "count num_entries=" << *p_num_entries << dendl; + + // add 16K extra entries in case new allocation happened + (*p_num_entries) += 16*1024; + unique_ptr<extent_t[]> arr; + try { + arr = make_unique<extent_t[]>(*p_num_entries); + } catch (std::bad_alloc&) { + derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl; + return -1; + } + + uint64_t idx = 0; + auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) { + if (extent_length > 0) { + if (idx < *p_num_entries) { + arr[idx] = {extent_offset, extent_length}; + } + idx++; + } + else { + derr << "zero length extent!!! 
offset=" << extent_offset << ", index=" << idx << dendl; + } + }; + src_alloc->foreach(copy_entries); + + dout(5) << "copy num_entries=" << idx << dendl; + if (idx > *p_num_entries) { + derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl; + ceph_assert(idx <= *p_num_entries); + } + + *p_num_entries = idx; + + for (idx = 0; idx < *p_num_entries; idx++) { + const extent_t *p_extent = &arr[idx]; + dest_alloc->init_add_free(p_extent->offset, p_extent->length); + } + + return 0; +} + +//----------------------------------------------------------------------------------- +static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc) +{ + std::ptrdiff_t length = p_curr - buffer; + p_handle->append(buffer, length); + + crc = ceph_crc32c(crc, (const uint8_t*)buffer, length); + uint32_t encoded_crc = HTOCEPH_32(crc); + p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc)); + + return crc; +} + +const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data +// write the allocator to a flat bluefs file - 4K extents at a time +//----------------------------------------------------------------------------------- +int BlueStore::store_allocator(Allocator* src_allocator) +{ + // when storing allocations to file we must be sure there is no background compactions + // the easiest way to achieve it is to make sure db is closed + ceph_assert(db == nullptr); + utime_t start_time = ceph_clock_now(); + int ret = 0; + + // create dir if doesn't exist already + if (!bluefs->dir_exists(allocator_dir) ) { + ret = bluefs->mkdir(allocator_dir); + if (ret != 0) { + derr << "Failed mkdir with error-code " << ret << dendl; + return -1; + } + } + bluefs->compact_log(); + // reuse previous file-allocation if exists + ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr); + bool overwrite_file = (ret == 0); + BlueFS::FileWriter *p_handle = nullptr; + ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file); + if (ret != 0) { + derr << __func__ << "Failed open_for_write with error-code " << ret << dendl; + return -1; + } + + uint64_t file_size = p_handle->file->fnode.size; + uint64_t allocated = p_handle->file->fnode.get_allocated(); + dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl; + + bluefs->sync_metadata(false); + unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator)); + if (!allocator) { + bluefs->close_writer(p_handle); + return -1; + } + + // store all extents (except for the bluefs extents we removed) in a single flat file + utime_t timestamp = ceph_clock_now(); + uint32_t crc = -1; + { + allocator_image_header header(timestamp, s_format_version, s_serial); + bufferlist header_bl; + encode(header, header_bl); + crc = header_bl.crc32c(crc); + encode(crc, header_bl); + p_handle->append(header_bl); + } + + crc = -1; // reset crc + extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB + extent_t *p_curr = buffer; + const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER; + uint64_t extent_count = 0; + uint64_t allocation_size = 0; + auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) { + if (extent_length == 0) { + derr << __func__ << "" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl; + ret = -1; + return; + } + p_curr->offset = HTOCEPH_64(extent_offset); + p_curr->length = HTOCEPH_64(extent_length); + extent_count++; + 
allocation_size += extent_length; + p_curr++; + + if (p_curr == p_end) { + crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc); + p_curr = buffer; // recycle the buffer + } + }; + allocator->foreach(iterated_allocation); + // if got null extent -> fail the operation + if (ret != 0) { + derr << "Illegal extent, fail store operation" << dendl; + derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl; + bluefs->truncate(p_handle, 0); + bluefs->close_writer(p_handle); + return -1; + } + + // if we got any leftovers -> add crc and append to file + if (p_curr > buffer) { + crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc); + } + + { + allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size); + bufferlist trailer_bl; + encode(trailer, trailer_bl); + uint32_t crc = -1; + crc = trailer_bl.crc32c(crc); + encode(crc, trailer_bl); + p_handle->append(trailer_bl); + } + + bluefs->fsync(p_handle); + bluefs->truncate(p_handle, p_handle->pos); + bluefs->fsync(p_handle); + + utime_t duration = ceph_clock_now() - start_time; + dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl; + dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl; + + bluefs->close_writer(p_handle); + need_to_destage_allocation_file = false; + return 0; +} + +//----------------------------------------------------------------------------------- +Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) { + // create allocator + uint64_t alloc_size = min_alloc_size; + Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size, + zone_size, first_sequential_zone, + "recovery"); + if (alloc) { + return alloc; + } else { + derr << "Failed Allocator Creation" << dendl; + return nullptr; + } +} + +//----------------------------------------------------------------------------------- +size_t calc_allocator_image_header_size() +{ + utime_t timestamp = ceph_clock_now(); + allocator_image_header header(timestamp, s_format_version, s_serial); + bufferlist header_bl; + encode(header, header_bl); + uint32_t crc = -1; + crc = header_bl.crc32c(crc); + encode(crc, header_bl); + + return header_bl.length(); +} + +//----------------------------------------------------------------------------------- +int calc_allocator_image_trailer_size() +{ + utime_t timestamp = ceph_clock_now(); + uint64_t extent_count = -1; + uint64_t allocation_size = -1; + uint32_t crc = -1; + bufferlist trailer_bl; + allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size); + + encode(trailer, trailer_bl); + crc = trailer_bl.crc32c(crc); + encode(crc, trailer_bl); + return trailer_bl.length(); +} + +//----------------------------------------------------------------------------------- +int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes) +{ + if (cct->_conf->bluestore_debug_inject_allocation_from_file_failure > 0) { + boost::mt11213b rng(time(NULL)); + boost::uniform_real<> ur(0, 1); + if (ur(rng) < cct->_conf->bluestore_debug_inject_allocation_from_file_failure) { + derr << __func__ << " failure injected." 
<< dendl; + return -1; + } + } + utime_t start_time = ceph_clock_now(); + BlueFS::FileReader *p_temp_handle = nullptr; + int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false); + if (ret != 0) { + dout(1) << "Failed open_for_read with error-code " << ret << dendl; + return -1; + } + unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle); + uint64_t read_alloc_size = 0; + uint64_t file_size = p_handle->file->fnode.size; + dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl; + + // make sure we were able to store a valid copy + if (file_size == 0) { + dout(1) << "No Valid allocation info on disk (empty file)" << dendl; + return -1; + } + + // first read the header + size_t offset = 0; + allocator_image_header header; + int header_size = calc_allocator_image_header_size(); + { + bufferlist header_bl,temp_bl; + int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr); + if (read_bytes != header_size) { + derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl; + return -1; + } + + offset += read_bytes; + + header_bl.claim_append(temp_bl); + auto p = header_bl.cbegin(); + decode(header, p); + if (header.verify(cct, path) != 0 ) { + derr << "header = \n" << header << dendl; + return -1; + } + + uint32_t crc_calc = -1, crc; + crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos + decode(crc, p); + if (crc != crc_calc) { + derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl; + derr << "header = \n" << header << dendl; + return -1; + } + + // increment version for next store + s_serial = header.serial + 1; + } + + // then read the payload (extents list) using a recycled buffer + extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB + uint32_t crc = -1; + int trailer_size = calc_allocator_image_trailer_size(); + uint64_t extent_count = 0; + uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc)); + while (extents_bytes_left) { + int req_bytes = std::min(extents_bytes_left, static_cast<uint64_t>(sizeof(buffer))); + int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer); + if (read_bytes != req_bytes) { + derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl; + return -1; + } + + offset += read_bytes; + extents_bytes_left -= read_bytes; + + const unsigned num_extent_in_buffer = read_bytes/sizeof(extent_t); + const extent_t *p_end = buffer + num_extent_in_buffer; + for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) { + uint64_t offset = CEPHTOH_64(p_ext->offset); + uint64_t length = CEPHTOH_64(p_ext->length); + read_alloc_size += length; + + if (length > 0) { + allocator->init_add_free(offset, length); + extent_count ++; + } else { + derr << "extent with zero length at idx=" << extent_count << dendl; + return -1; + } + } + + uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes); + read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc); + if (read_bytes == sizeof(crc) ) { + crc = CEPHTOH_32(crc); + if (crc != calc_crc) { + derr << "data crc mismatch!!! 
crc=" << crc << ", calc_crc=" << calc_crc << dendl; + derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl; + return -1; + } + + offset += read_bytes; + if (extents_bytes_left) { + extents_bytes_left -= read_bytes; + } + } else { + derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl; + return -1; + } + + } + + // finally, read the trailer and verify it is in good shape and that we got all the extents + { + bufferlist trailer_bl,temp_bl; + int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr); + if (read_bytes != trailer_size) { + derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl; + return -1; + } + offset += read_bytes; + + trailer_bl.claim_append(temp_bl); + uint32_t crc_calc = -1; + uint32_t crc; + allocator_image_trailer trailer; + auto p = trailer_bl.cbegin(); + decode(trailer, p); + if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) { + derr << "trailer=\n" << trailer << dendl; + return -1; + } + + crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos + decode(crc, p); + if (crc != crc_calc) { + derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl; + derr << "trailer=\n" << trailer << dendl; + return -1; + } + } + + utime_t duration = ceph_clock_now() - start_time; + dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= " + << read_alloc_size << ", file_size=" << file_size << dendl; + dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl; + *num = extent_count; + *bytes = read_alloc_size; + return 0; +} + +//----------------------------------------------------------------------------------- +int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes) +{ + utime_t start = ceph_clock_now(); + auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size())); + int ret = __restore_allocator(temp_allocator.get(), num, bytes); + if (ret != 0) { + return ret; + } + + uint64_t num_entries = 0; + dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl; + copy_allocator(temp_allocator.get(), dest_allocator, &num_entries); + utime_t duration = ceph_clock_now() - start; + dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl; + return ret; +} + +//----------------------------------------------------------------------------------- +void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length) +{ + dout(30) << __func__ << " 0x" << std::hex + << offset << "~" << length + << " " << min_alloc_size_mask + << dendl; + ceph_assert((offset & min_alloc_size_mask) == 0); + ceph_assert((length & min_alloc_size_mask) == 0); + sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order); +} + +void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning, + uint64_t extent_no, + uint64_t sbid, + BlobRef b) +{ + [[maybe_unused]] auto cct = store.cct; + ceph_assert(per_pool_statfs); + ceph_assert(oid != ghobject_t()); + + auto &blob = b->get_blob(); + if(spanning) { + dout(20) << __func__ << " " << spanning << " " << b->id << dendl; + ceph_assert(b->id >= 0); + spanning_blobs[b->id] = b; + ++stats.spanning_blob_count; + } else { + dout(20) << __func__ << " " << spanning 
<< " " << extent_no << dendl; + blobs[extent_no] = b; + } + bool compressed = blob.is_compressed(); + if (!blob.is_shared()) { + for (auto& pe : blob.get_extents()) { + if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) { + ++stats.skipped_illegal_extent; + continue; + } + store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length); + + per_pool_statfs->allocated() += pe.length; + if (compressed) { + per_pool_statfs->compressed_allocated() += pe.length; + } + } + if (compressed) { + per_pool_statfs->compressed() += + blob.get_compressed_payload_length(); + ++stats.compressed_blob_count; + } + } else { + auto it = sb_info.find(sbid); + if (it == sb_info.end()) { + derr << __func__ << " shared blob not found:" << sbid + << dendl; + } + auto &sbi = *it; + auto pool_id = oid.hobj.get_logical_pool(); + if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) { + sbi.pool_id = pool_id; + size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order; + per_pool_statfs->allocated() += alloc_delta; + if (compressed) { + per_pool_statfs->compressed_allocated() += alloc_delta; + ++stats.compressed_blob_count; + } + } + if (compressed) { + per_pool_statfs->compressed() += + blob.get_compressed_payload_length(); + } + } +} + +void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le, + bool spanning, + uint64_t blobid) +{ + [[maybe_unused]] auto cct = store.cct; + dout(20) << __func__ << " " << spanning << " " << blobid << dendl; + auto &map = spanning ? spanning_blobs : blobs; + auto it = map.find(blobid); + ceph_assert(it != map.end()); + per_pool_statfs->stored() += le->length; + if (it->second->get_blob().is_compressed()) { + per_pool_statfs->compressed_original() += le->length; + } +} + +void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le, + uint64_t extent_no, + uint64_t sbid, + BlobRef b) +{ + _consume_new_blob(false, extent_no, sbid, b); + per_pool_statfs->stored() += le->length; + if (b->get_blob().is_compressed()) { + per_pool_statfs->compressed_original() += le->length; + } +} + +void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid, + BlobRef b) +{ + _consume_new_blob(true, 0/*doesn't matter*/, sbid, b); +} + +void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid, + volatile_statfs* _per_pool_statfs) +{ + oid = _oid; + per_pool_statfs = _per_pool_statfs; + blob_map_t empty; + blob_map_t empty2; + std::swap(blobs, empty); + std::swap(spanning_blobs, empty2); +} + +int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats) +{ + sb_info_space_efficient_map_t sb_info; + // iterate over all shared blobs + auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE); + if (!it) { + derr << "failed getting shared blob's iterator" << dendl; + return -ENOENT; + } + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + const auto& key = it->key(); + dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl; + uint64_t sbid = 0; + if (get_key_shared_blob(key, &sbid) != 0) { + derr << __func__ << " bad shared blob key '" << pretty_binary_string(key) + << "'" << dendl; + } + bluestore_shared_blob_t shared_blob(sbid); + bufferlist bl = it->value(); + auto blp = bl.cbegin(); + try { + decode(shared_blob, blp); + } + catch (ceph::buffer::error& e) { + derr << __func__ << " failed to decode Shared Blob" + << pretty_binary_string(key) << dendl; + continue; + } + dout(20) << __func__ << " " << shared_blob << dendl; + uint64_t allocated = 0; + for (auto& r : 
shared_blob.ref_map.ref_map) {
+        ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
+        set_allocation_in_simple_bmap(sbmap, r.first, r.second.length);
+        allocated += r.second.length;
+      }
+      auto &sbi = sb_info.add_or_adopt(sbid);
+      ceph_assert(p2phase(allocated, min_alloc_size) == 0);
+      sbi.allocated_chunks += (allocated >> min_alloc_size_order);
+      ++stats.shared_blob_count;
+    }
+  }
+
+  it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+  if (!it) {
+    derr << "failed getting onode iterator" << dendl;
+    return -ENOENT;
+  }
+
+  uint64_t kv_count = 0;
+  uint64_t count_interval = 1'000'000;
+  ExtentDecoderPartial edecoder(*this,
+                                stats,
+                                *sbmap,
+                                sb_info,
+                                min_alloc_size_order);
+
+  // iterate over all ONodes stored in RocksDB
+  for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
+    // trace an event after every million processed objects (typically every 5-10 seconds)
+    if (kv_count && (kv_count % count_interval == 0) ) {
+      dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+    }
+
+    auto key = it->key();
+    auto okey = key;
+    dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+    ghobject_t oid;
+    if (!is_extent_shard_key(it->key())) {
+      int r = get_key_object(okey, &oid);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode key = "
+             << pretty_binary_string(okey) << dendl;
+        return -EIO;
+      }
+      edecoder.reset(oid,
+                     &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+      Onode dummy_on(cct);
+      Onode::decode_raw(&dummy_on,
+                        it->value(),
+                        edecoder);
+      ++stats.onode_count;
+    } else {
+      uint32_t offset;
+      int r = get_key_extent_shard(key, &okey, &offset);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode extent key = "
+             << pretty_binary_string(key) << dendl;
+        return -EIO;
+      }
+      r = get_key_object(okey, &oid);
+      if (r != 0) {
+        derr << __func__
+             << " failed to decode onode key = " << pretty_binary_string(okey)
+             << " from extent key = " << pretty_binary_string(key)
+             << dendl;
+        return -EIO;
+      }
+      ceph_assert(oid == edecoder.get_oid());
+      edecoder.decode_some(it->value(), nullptr);
+      ++stats.shard_count;
+    }
+  }
+
+  std::lock_guard l(vstatfs_lock);
+  store_statfs_t s;
+  osd_pools.clear();
+  for (auto& p : stats.actual_pool_vstatfs) {
+    if (per_pool_stat_collection) {
+      osd_pools[p.first] = p.second;
+    }
+    stats.actual_store_vstatfs += p.second;
+    p.second.publish(&s);
+    dout(5) << __func__ << " recovered pool "
+            << std::hex
+            << p.first << "->" << s
+            << std::dec
+            << " per-pool:" << per_pool_stat_collection
+            << dendl;
+  }
+  vstatfs = stats.actual_store_vstatfs;
+  vstatfs.publish(&s);
+  dout(5) << __func__ << " recovered " << s
+          << dendl;
+  return 0;
+}
+
+//---------------------------------------------------------
+int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
+{
+  // first set the space used by the superblock
+  auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
+  set_allocation_in_simple_bmap(sbmap, 0, super_length);
+  stats.extent_count++;
+
+  // then set all space taken by Objects
+  int ret = read_allocation_from_onodes(sbmap, stats);
+  if (ret < 0) {
+    derr << "failed read_allocation_from_onodes()" << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+//-----------------------------------------------------------------------------------
+static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
+{
+  int alloc_size_shift = std::countr_zero(alloc_size);
+  uint64_t offset = 
0; + extent_t ext = sbmap->get_next_clr_extent(offset); + while (ext.length != 0) { + dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift); + offset = ext.offset + ext.length; + ext = sbmap->get_next_clr_extent(offset); + } +} + +//--------------------------------------------------------- +int BlueStore::read_allocation_from_drive_on_startup() +{ + int ret = 0; + + ret = _open_collections(); + if (ret < 0) { + return ret; + } + auto shutdown_cache = make_scope_guard([&] { + _shutdown_cache(); + }); + + utime_t start = ceph_clock_now(); + read_alloc_stats_t stats = {}; + SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size)); + ret = reconstruct_allocations(&sbmap, stats); + if (ret != 0) { + return ret; + } + + copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size); + + utime_t duration = ceph_clock_now() - start; + dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl; + return ret; +} + + + + +// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one +// Not meant to be run by customers +#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION + +#include <stdlib.h> +#include <algorithm> +//--------------------------------------------------------- +int cmpfunc (const void * a, const void * b) +{ + if ( ((extent_t*)a)->offset > ((extent_t*)b)->offset ) { + return 1; + } + else if( ((extent_t*)a)->offset < ((extent_t*)b)->offset ) { + return -1; + } + else { + return 0; + } +} + +// compare the allocator built from Onodes with the system allocator (CF-B) +//--------------------------------------------------------- +int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target) +{ + uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3); + uint64_t extent_count = allocation_size/sizeof(extent_t); + dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl; + + unique_ptr<extent_t[]> arr1; + unique_ptr<extent_t[]> arr2; + try { + arr1 = make_unique<extent_t[]>(extent_count); + arr2 = make_unique<extent_t[]>(extent_count); + } catch (std::bad_alloc&) { + derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl; + return -1; + } + + // copy the extents from the allocators into simple array and then compare them + uint64_t size1 = 0, size2 = 0; + uint64_t idx1 = 0, idx2 = 0; + auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) { + size1 += length; + if (idx1 < extent_count) { + arr1[idx1++] = {offset, length}; + } + else if (idx1 == extent_count) { + derr << "(2)compare_allocators:: spillover" << dendl; + idx1 ++; + } + + }; + + auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) { + size2 += length; + if (idx2 < extent_count) { + arr2[idx2++] = {offset, length}; + } + else if (idx2 == extent_count) { + derr << "(2)compare_allocators:: spillover" << dendl; + idx2 ++; + } + }; + + alloc1->foreach(iterated_mapper1); + alloc2->foreach(iterated_mapper2); + + qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc); + qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc); + + if (idx1 == idx2) { + idx1 = idx2 = std::min(idx1, extent_count); + if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) { + return 0; + } + derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl; + for 
(uint64_t i = 0; i < idx1; i++) { + if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) != 0) { + derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl; + derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl; + return -1; + } + } + return 0; + } else { + derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl; + return -1; + } +} + +//--------------------------------------------------------- +int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats) +{ + // then add space used by bluefs to store rocksdb + unsigned extent_count = 0; + if (bluefs) { + bluefs->foreach_block_extents( + bluefs_layout.shared_bdev, + [&](uint64_t start, uint32_t len) { + allocator->init_rm_free(start, len); + stats.extent_count++; + } + ); + } + + dout(5) << "bluefs extent_count=" << extent_count << dendl; + return 0; +} + +//--------------------------------------------------------- +int BlueStore::read_allocation_from_drive_for_bluestore_tool() +{ + dout(5) << __func__ << dendl; + int ret = 0; + uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target"); + ret = _open_db_and_around(true, false); + if (ret < 0) { + return ret; + } + + ret = _open_collections(); + if (ret < 0) { + _close_db_and_around(); + return ret; + } + + utime_t duration; + read_alloc_stats_t stats = {}; + utime_t start = ceph_clock_now(); + + auto shutdown_cache = make_scope_guard([&] { + dout(1) << "Allocation Recovery was completed in " << duration + << " seconds; insert_count=" << stats.insert_count + << "; extent_count=" << stats.extent_count << dendl; + _shutdown_cache(); + _close_db_and_around(); + }); + + { + auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size())); + //reconstruct allocations into a temp simple-bitmap and copy into allocator + { + SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size)); + ret = reconstruct_allocations(&sbmap, stats); + if (ret != 0) { + return ret; + } + copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size); + } + + // add allocation space used by the bluefs itself + ret = add_existing_bluefs_allocation(allocator.get(), stats); + if (ret < 0) { + return ret; + } + + duration = ceph_clock_now() - start; + stats.insert_count = 0; + auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) { + stats.insert_count++; + }; + allocator->foreach(count_entries); + ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target); + if (ret == 0) { + dout(5) << "Allocator drive - file integrity check OK" << dendl; + } else { + derr << "FAILURE. 
Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
+    }
+  }
+
+  dout(1) << stats << dendl;
+  return ret;
+}
+
+//---------------------------------------------------------
+Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
+{
+  uint64_t bdev_size = bdev->get_size();
+  Allocator* allocator = create_bitmap_allocator(bdev_size);
+  if (allocator) {
+    dout(5) << "bitmap-allocator=" << allocator << dendl;
+  } else {
+    derr << "****failed create_bitmap_allocator()" << dendl;
+    return nullptr;
+  }
+
+  uint64_t num_entries = 0;
+  copy_allocator(src_allocator, allocator, &num_entries);
+
+  // BlueFS stores its internal allocations outside RocksDB (FM), so we should not destage them to the allocator-file
+  // hide the bluefs allocations during allocator-destage since they are tracked elsewhere
+  {
+    bluefs->foreach_block_extents(
+      bluefs_layout.shared_bdev,
+      [&] (uint64_t start, uint32_t len) {
+        allocator->init_add_free(start, len);
+      }
+    );
+  }
+
+  return allocator;
+}
+
+//---------------------------------------------------------
+static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
+{
+  dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
+  KeyValueDB::Transaction t = db->get_transaction();
+  t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
+  db->submit_transaction_sync(t);
+}
+
+//---------------------------------------------------------
+void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
+{
+  unsigned max_txn = 1024;
+  dout(5) << "max_transaction_submit=" << max_txn << dendl;
+  uint64_t size = 0, idx = 0;
+  KeyValueDB::Transaction txn = db->get_transaction();
+  auto iterated_insert = [&](uint64_t offset, uint64_t length) {
+    size += length;
+    real_fm->release(offset, length, txn);
+    if ((++idx % max_txn) == 0) {
+      db->submit_transaction_sync(txn);
+      txn = db->get_transaction();
+    }
+  };
+  allocator->foreach(iterated_insert);
+  if (idx % max_txn != 0) {
+    db->submit_transaction_sync(txn);
+  }
+  dout(5) << "size=" << size << ", num extents=" << idx << dendl;
+}
+
+//---------------------------------------------------------
+Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
+{
+  dout(5) << "real_fm->enumerate_next" << dendl;
+  Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
+  if (allocator2) {
+    dout(5) << "bitmap-allocator=" << allocator2 << dendl;
+  } else {
+    return nullptr;
+  }
+
+  uint64_t size2 = 0, idx2 = 0;
+  real_fm->enumerate_reset();
+  uint64_t offset, length;
+  while (real_fm->enumerate_next(db, &offset, &length)) {
+    allocator2->init_add_free(offset, length);
+    ++idx2;
+    size2 += length;
+  }
+  real_fm->enumerate_reset();
+
+  dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
+  return allocator2;
+}
+
+//---------------------------------------------------------
+// close the active fm and open it in a new mode like makefs()
+// but make sure to mark the full device space as allocated
+// later we will mark all extents from the allocator as free
+int BlueStore::reset_fm_for_restore()
+{
+  dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
+  fm->shutdown();
+  delete fm;
+  fm = nullptr;
+  freelist_type = "bitmap";
+  KeyValueDB::Transaction t = db->get_transaction();
+  // call _open_fm() with fm_restore set to TRUE
+  // this will mark the full device space as allocated (and not just the reserved space)
+  _open_fm(t, true, true, true);
+  if (fm == nullptr) {
+    derr 
<< "Failed _open_fm()" << dendl; + return -1; + } + db->submit_transaction_sync(t); + ceph_assert(!fm->is_null_manager()); + dout(5) << "fm was reactivated in full mode" << dendl; + return 0; +} + + +//--------------------------------------------------------- +// create a temp allocator filled with allocation state from the fm +// and compare it to the base allocator passed in +int BlueStore::verify_rocksdb_allocations(Allocator *allocator) +{ + dout(5) << "verify that alloc content is identical to FM" << dendl; + // initialize from freelist + Allocator* temp_allocator = initialize_allocator_from_freelist(fm); + if (temp_allocator == nullptr) { + return -1; + } + + uint64_t insert_count = 0; + auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) { + insert_count++; + }; + temp_allocator->foreach(count_entries); + uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target"); + int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target); + + delete temp_allocator; + + if (ret == 0) { + dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl; + return 0; + } else { + derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl; + return -1; + } +} + +//--------------------------------------------------------- +int BlueStore::db_cleanup(int ret) +{ + _shutdown_cache(); + _close_db_and_around(); + return ret; +} + +//--------------------------------------------------------- +// convert back the system from null-allocator to using rocksdb to store allocation +int BlueStore::push_allocation_to_rocksdb() +{ + if (cct->_conf->bluestore_allocation_from_file) { + derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl; + derr << "please change default to false in ceph.conf file>" << dendl; + return -1; + } + + dout(5) << "calling open_db_and_around() in read/write mode" << dendl; + int ret = _open_db_and_around(false); + if (ret < 0) { + return ret; + } + + if (!fm->is_null_manager()) { + derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl; + return db_cleanup(0); + } + + // start by creating a clone copy of the shared-allocator + unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc)); + if (!allocator) { + return db_cleanup(-1); + } + + // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start + clear_allocation_objects_from_rocksdb(db, cct, path); + + // then open fm in new mode with the full devie marked as alloctaed + if (reset_fm_for_restore() != 0) { + return db_cleanup(-1); + } + + // push the free-space from the allocator (shared-alloc without bfs) to rocksdb + copy_allocator_content_to_fm(allocator.get(), fm); + + // compare the allocator info with the info stored in the fm/rocksdb + if (verify_rocksdb_allocations(allocator.get()) == 0) { + // all is good -> we can commit to rocksdb allocator + commit_to_real_manager(); + } else { + return db_cleanup(-1); + } + + // can't be too paranoid :-) + dout(5) << "Running full scale verification..." 
<< dendl;
+  // close db/fm/allocator and start fresh
+  db_cleanup(0);
+  dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
+  ret = _open_db_and_around(true);
+  if (ret < 0) {
+    return db_cleanup(ret);
+  }
+  ceph_assert(!fm->is_null_manager());
+  ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
+
+  return db_cleanup(ret);
+}
+
+#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+
+//-------------------------------------------------------------------------------------
+int BlueStore::commit_freelist_type()
+{
+  // When freelist_type is set to "bitmap" we store the allocation info in RocksDB
+  // When the allocation-info is stored in a single file we set freelist_type to "null"
+  // This will direct the startup code to read the allocation from file and not from RocksDB
+  KeyValueDB::Transaction t = db->get_transaction();
+  if (t == nullptr) {
+    derr << "db->get_transaction() failed!!!" << dendl;
+    return -1;
+  }
+
+  bufferlist bl;
+  bl.append(freelist_type);
+  t->set(PREFIX_SUPER, "freelist_type", bl);
+
+  int ret = db->submit_transaction_sync(t);
+  if (ret != 0) {
+    derr << "Failed db->submit_transaction_sync(t)" << dendl;
+  }
+  return ret;
+}
+
+//-------------------------------------------------------------------------------------
+int BlueStore::commit_to_null_manager()
+{
+  dout(5) << __func__ << " Set FreelistManager to NULL FM..." << dendl;
+  fm->set_null_manager();
+  freelist_type = "null";
+#if 1
+  return commit_freelist_type();
+#else
+  // should check how long this step takes on a big configuration as deletes are expensive
+  if (commit_freelist_type() == 0) {
+    // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
+    clear_allocation_objects_from_rocksdb(db, cct, path);
+  }
+#endif
+}
+
+
+//-------------------------------------------------------------------------------------
+int BlueStore::commit_to_real_manager()
+{
+  dout(5) << "Set FreelistManager to Real FM..." << dendl;
+  ceph_assert(!fm->is_null_manager());
+  freelist_type = "bitmap";
+  int ret = commit_freelist_type();
+  if (ret == 0) {
+    // remove the allocation_file
+    invalidate_allocation_file_on_bluefs();
+    ret = bluefs->unlink(allocator_dir, allocator_file);
+    bluefs->sync_metadata(false);
+    if (ret == 0) {
+      dout(5) << "Removed the allocation file successfully" << dendl;
+    }
+    else {
+      derr << "Remove allocation file failed, ret_code=" << ret << dendl;
+    }
+  }
+
+  return ret;
+}
+
+//================================================================================================================
+//================================================================================================================
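+//-------------------------------------------------------------------------------------
+// For reference: a rough sketch of the allocator image file layout, as produced by
+// store_allocator() and read back by __restore_allocator() above (the encode/decode
+// calls in those functions are authoritative):
+//
+//   allocator_image_header                  + crc32c over the encoded header
+//   batch of up to 4K extent_t records      + crc32c over all extent bytes so far
+//   ...                                       (one batch per 64KB buffer flush; the crc
+//                                              is carried across batches)
+//   allocator_image_trailer                 + crc32c over the encoded trailer
+//
+// Each extent_t holds {offset, length} converted with HTOCEPH_64(); a zero-length
+// extent is treated as corruption and aborts both the store and the restore paths.
+//-------------------------------------------------------------------------------------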